diff options
Diffstat (limited to 'fs')
303 files changed, 8156 insertions, 5024 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index cf62b05..7d6c213 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -84,7 +84,7 @@ static const match_table_t tokens = { static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) { - char *options; + char *options, *tmp_options; substring_t args[MAX_OPT_ARGS]; char *p; int option = 0; @@ -102,9 +102,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) if (!opts) return 0; - options = kstrdup(opts, GFP_KERNEL); - if (!options) + tmp_options = kstrdup(opts, GFP_KERNEL); + if (!tmp_options) { + ret = -ENOMEM; goto fail_option_alloc; + } + options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -159,8 +162,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) break; case Opt_cache: s = match_strdup(&args[0]); - if (!s) - goto fail_option_alloc; + if (!s) { + ret = -ENOMEM; + P9_DPRINTK(P9_DEBUG_ERROR, + "problem allocating copy of cache arg\n"); + goto free_and_return; + } if (strcmp(s, "loose") == 0) v9ses->cache = CACHE_LOOSE; @@ -173,8 +180,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_access: s = match_strdup(&args[0]); - if (!s) - goto fail_option_alloc; + if (!s) { + ret = -ENOMEM; + P9_DPRINTK(P9_DEBUG_ERROR, + "problem allocating copy of access arg\n"); + goto free_and_return; + } v9ses->flags &= ~V9FS_ACCESS_MASK; if (strcmp(s, "user") == 0) @@ -194,13 +205,11 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) continue; } } - kfree(options); - return ret; +free_and_return: + kfree(tmp_options); fail_option_alloc: - P9_DPRINTK(P9_DEBUG_ERROR, - "failed to allocate copy of option argument\n"); - return -ENOMEM; + return ret; } /** diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 3a7560e..ed83583 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -60,3 +60,4 @@ void v9fs_dentry_release(struct dentry *); int v9fs_uflags2omode(int uflags, int extended); ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); +void v9fs_blank_wstat(struct p9_wstat *wstat); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 3902bf4..74a0461 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -257,6 +257,23 @@ v9fs_file_write(struct file *filp, const char __user * data, return total; } +static int v9fs_file_fsync(struct file *filp, struct dentry *dentry, + int datasync) +{ + struct p9_fid *fid; + struct p9_wstat wstat; + int retval; + + P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp, + dentry, datasync); + + fid = filp->private_data; + v9fs_blank_wstat(&wstat); + + retval = p9_client_wstat(fid, &wstat); + return retval; +} + static const struct file_operations v9fs_cached_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -266,6 +283,7 @@ static const struct file_operations v9fs_cached_file_operations = { .release = v9fs_dir_release, .lock = v9fs_file_lock, .mmap = generic_file_readonly_mmap, + .fsync = v9fs_file_fsync, }; const struct file_operations v9fs_file_operations = { @@ -276,4 +294,5 @@ const struct file_operations v9fs_file_operations = { .release = v9fs_dir_release, .lock = v9fs_file_lock, .mmap = generic_file_readonly_mmap, + .fsync = v9fs_file_fsync, }; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 18f74ec..a407fa3 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -176,7 +176,7 @@ int v9fs_uflags2omode(int uflags, int extended) * */ -static void +void v9fs_blank_wstat(struct p9_wstat *wstat) { wstat->type = ~0; @@ -1001,44 +1001,6 @@ done: } /** - * v9fs_vfs_readlink - read a symlink's location - * @dentry: dentry for symlink - * @buffer: buffer to load symlink location into - * @buflen: length of buffer - * - */ - -static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer, - int buflen) -{ - int retval; - int ret; - char *link = __getname(); - - if (unlikely(!link)) - return -ENOMEM; - - if (buflen > PATH_MAX) - buflen = PATH_MAX; - - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, - dentry); - - retval = v9fs_readlink(dentry, link, buflen); - - if (retval > 0) { - if ((ret = copy_to_user(buffer, link, retval)) != 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "problem copying to user: %d\n", ret); - retval = ret; - } - } - - __putname(link); - return retval; -} - -/** * v9fs_vfs_follow_link - follow a symlink path * @dentry: dentry for symlink * @nd: nameidata @@ -1230,7 +1192,6 @@ static const struct inode_operations v9fs_dir_inode_operations_ext = { .rmdir = v9fs_vfs_rmdir, .mknod = v9fs_vfs_mknod, .rename = v9fs_vfs_rename, - .readlink = v9fs_vfs_readlink, .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, }; @@ -1253,7 +1214,7 @@ static const struct inode_operations v9fs_file_inode_operations = { }; static const struct inode_operations v9fs_symlink_inode_operations = { - .readlink = v9fs_vfs_readlink, + .readlink = generic_readlink, .follow_link = v9fs_vfs_follow_link, .put_link = v9fs_vfs_put_link, .getattr = v9fs_vfs_getattr, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 14a8644..69357c0 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -188,7 +188,8 @@ static void v9fs_kill_super(struct super_block *s) P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); - v9fs_dentry_release(s->s_root); /* clunk root */ + if (s->s_root) + v9fs_dentry_release(s->s_root); /* clunk root */ kill_anon_super(s); @@ -6,10 +6,6 @@ menu "File systems" if BLOCK -config FS_JOURNAL_INFO - bool - default n - source "fs/ext2/Kconfig" source "fs/ext3/Kconfig" source "fs/ext4/Kconfig" diff --git a/fs/affs/affs.h b/fs/affs/affs.h index e511dc6..0e40caa 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -106,8 +106,8 @@ struct affs_sb_info { u32 s_last_bmap; struct buffer_head *s_bmap_bh; char *s_prefix; /* Prefix for volumes and assigns. */ - int s_prefix_len; /* Length of prefix. */ char s_volume[32]; /* Volume prefix for absolute symlinks. */ + spinlock_t symlink_lock; /* protects the previous two */ }; #define SF_INTL 0x0001 /* International filesystem. */ diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 960d336..d70bbba 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -341,10 +341,13 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) p = (char *)AFFS_HEAD(bh)->table; lc = '/'; if (*symname == '/') { + struct affs_sb_info *sbi = AFFS_SB(sb); while (*symname == '/') symname++; - while (AFFS_SB(sb)->s_volume[i]) /* Cannot overflow */ - *p++ = AFFS_SB(sb)->s_volume[i++]; + spin_lock(&sbi->symlink_lock); + while (sbi->s_volume[i]) /* Cannot overflow */ + *p++ = sbi->s_volume[i++]; + spin_unlock(&sbi->symlink_lock); } while (i < maxlen && (c = *symname++)) { if (c == '.' && lc == '/' && *symname == '.' && symname[1] == '/') { diff --git a/fs/affs/super.c b/fs/affs/super.c index 104fdcb..d41e967 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -203,7 +203,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s switch (token) { case Opt_bs: if (match_int(&args[0], &n)) - return -EINVAL; + return 0; if (n != 512 && n != 1024 && n != 2048 && n != 4096) { printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); @@ -213,7 +213,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s break; case Opt_mode: if (match_octal(&args[0], &option)) - return 1; + return 0; *mode = option & 0777; *mount_opts |= SF_SETMODE; break; @@ -221,8 +221,6 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s *mount_opts |= SF_MUFS; break; case Opt_prefix: - /* Free any previous prefix */ - kfree(*prefix); *prefix = match_strdup(&args[0]); if (!*prefix) return 0; @@ -233,21 +231,21 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s break; case Opt_reserved: if (match_int(&args[0], reserved)) - return 1; + return 0; break; case Opt_root: if (match_int(&args[0], root)) - return 1; + return 0; break; case Opt_setgid: if (match_int(&args[0], &option)) - return 1; + return 0; *gid = option; *mount_opts |= SF_SETGID; break; case Opt_setuid: if (match_int(&args[0], &option)) - return -EINVAL; + return 0; *uid = option; *mount_opts |= SF_SETUID; break; @@ -311,11 +309,14 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; sb->s_fs_info = sbi; mutex_init(&sbi->s_bmlock); + spin_lock_init(&sbi->symlink_lock); if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, &blocksize,&sbi->s_prefix, sbi->s_volume, &mount_flags)) { printk(KERN_ERR "AFFS: Error parsing options\n"); + kfree(sbi->s_prefix); + kfree(sbi); return -EINVAL; } /* N.B. after this point s_prefix must be released */ @@ -516,14 +517,18 @@ affs_remount(struct super_block *sb, int *flags, char *data) unsigned long mount_flags; int res = 0; char *new_opts = kstrdup(data, GFP_KERNEL); + char volume[32]; + char *prefix = NULL; pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data); *flags |= MS_NODIRATIME; + memcpy(volume, sbi->s_volume, 32); if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block, - &blocksize, &sbi->s_prefix, sbi->s_volume, + &blocksize, &prefix, volume, &mount_flags)) { + kfree(prefix); kfree(new_opts); return -EINVAL; } @@ -534,6 +539,14 @@ affs_remount(struct super_block *sb, int *flags, char *data) sbi->s_mode = mode; sbi->s_uid = uid; sbi->s_gid = gid; + /* protect against readers */ + spin_lock(&sbi->symlink_lock); + if (prefix) { + kfree(sbi->s_prefix); + sbi->s_prefix = prefix; + } + memcpy(sbi->s_volume, volume, 32); + spin_unlock(&sbi->symlink_lock); if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { unlock_kernel(); diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index 4178253..ee00f08 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -20,7 +20,6 @@ static int affs_symlink_readpage(struct file *file, struct page *page) int i, j; char c; char lc; - char *pf; pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino); @@ -32,11 +31,15 @@ static int affs_symlink_readpage(struct file *file, struct page *page) j = 0; lf = (struct slink_front *)bh->b_data; lc = 0; - pf = AFFS_SB(inode->i_sb)->s_prefix ? AFFS_SB(inode->i_sb)->s_prefix : "/"; if (strchr(lf->symname,':')) { /* Handle assign or volume name */ + struct affs_sb_info *sbi = AFFS_SB(inode->i_sb); + char *pf; + spin_lock(&sbi->symlink_lock); + pf = sbi->s_prefix ? sbi->s_prefix : "/"; while (i < 1023 && (c = pf[i])) link[i++] = c; + spin_unlock(&sbi->symlink_lock); while (i < 1023 && lf->symname[j] != ':') link[i++] = lf->symname[j++]; if (i < 1023) diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 2ca7a7c..9f0bf13 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -35,14 +35,13 @@ static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags, mnt); } -static int anon_inodefs_delete_dentry(struct dentry *dentry) +/* + * anon_inodefs_dname() is called from d_path(). + */ +static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) { - /* - * We faked vfs to believe the dentry was hashed when we created it. - * Now we restore the flag so that dput() will work correctly. - */ - dentry->d_flags |= DCACHE_UNHASHED; - return 1; + return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s", + dentry->d_name.name); } static struct file_system_type anon_inode_fs_type = { @@ -51,7 +50,7 @@ static struct file_system_type anon_inode_fs_type = { .kill_sb = kill_anon_super, }; static const struct dentry_operations anon_inodefs_dentry_operations = { - .d_delete = anon_inodefs_delete_dentry, + .d_dname = anon_inodefs_dname, }; /* @@ -88,7 +87,7 @@ struct file *anon_inode_getfile(const char *name, void *priv, int flags) { struct qstr this; - struct dentry *dentry; + struct path path; struct file *file; int error; @@ -106,10 +105,11 @@ struct file *anon_inode_getfile(const char *name, this.name = name; this.len = strlen(name); this.hash = 0; - dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this); - if (!dentry) + path.dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this); + if (!path.dentry) goto err_module; + path.mnt = mntget(anon_inode_mnt); /* * We know the anon_inode inode count is always greater than zero, * so we can avoid doing an igrab() and we can use an open-coded @@ -117,27 +117,24 @@ struct file *anon_inode_getfile(const char *name, */ atomic_inc(&anon_inode_inode->i_count); - dentry->d_op = &anon_inodefs_dentry_operations; - /* Do not publish this dentry inside the global dentry hash table */ - dentry->d_flags &= ~DCACHE_UNHASHED; - d_instantiate(dentry, anon_inode_inode); + path.dentry->d_op = &anon_inodefs_dentry_operations; + d_instantiate(path.dentry, anon_inode_inode); error = -ENFILE; - file = alloc_file(anon_inode_mnt, dentry, - FMODE_READ | FMODE_WRITE, fops); + file = alloc_file(&path, OPEN_FMODE(flags), fops); if (!file) goto err_dput; file->f_mapping = anon_inode_inode->i_mapping; file->f_pos = 0; - file->f_flags = O_RDWR | (flags & O_NONBLOCK); + file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); file->f_version = 0; file->private_data = priv; return file; err_dput: - dput(dentry); + path_put(&path); err_module: module_put(fops->owner); return ERR_PTR(error); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 33baf27..34ddda8 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -873,6 +873,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); unacquire_priv_sbp: + kfree(befs_sb->mount_opts.iocharset); kfree(sb->s_fs_info); unacquire_none: diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 6f60336..8f3d9fd 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -353,35 +353,35 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) struct inode *inode; unsigned i, imap_len; struct bfs_sb_info *info; - long ret = -EINVAL; + int ret = -EINVAL; unsigned long i_sblock, i_eblock, i_eoff, s_size; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; + mutex_init(&info->bfs_lock); s->s_fs_info = info; sb_set_blocksize(s, BFS_BSIZE); - bh = sb_bread(s, 0); - if(!bh) + info->si_sbh = sb_bread(s, 0); + if (!info->si_sbh) goto out; - bfs_sb = (struct bfs_super_block *)bh->b_data; + bfs_sb = (struct bfs_super_block *)info->si_sbh->b_data; if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { if (!silent) printf("No BFS filesystem on %s (magic=%08x)\n", s->s_id, le32_to_cpu(bfs_sb->s_magic)); - goto out; + goto out1; } if (BFS_UNCLEAN(bfs_sb, s) && !silent) printf("%s is unclean, continuing\n", s->s_id); s->s_magic = BFS_MAGIC; - info->si_sbh = bh; if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { printf("Superblock is corrupted\n"); - goto out; + goto out1; } info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / @@ -390,7 +390,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) imap_len = (info->si_lasti / 8) + 1; info->si_imap = kzalloc(imap_len, GFP_KERNEL); if (!info->si_imap) - goto out; + goto out1; for (i = 0; i < BFS_ROOT_INO; i++) set_bit(i, info->si_imap); @@ -398,15 +398,13 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) inode = bfs_iget(s, BFS_ROOT_INO); if (IS_ERR(inode)) { ret = PTR_ERR(inode); - kfree(info->si_imap); - goto out; + goto out2; } s->s_root = d_alloc_root(inode); if (!s->s_root) { iput(inode); ret = -ENOMEM; - kfree(info->si_imap); - goto out; + goto out2; } info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS; @@ -419,10 +417,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) bh = sb_bread(s, info->si_blocks - 1); if (!bh) { printf("Last block not available: %lu\n", info->si_blocks - 1); - iput(inode); ret = -EIO; - kfree(info->si_imap); - goto out; + goto out3; } brelse(bh); @@ -459,11 +455,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) printf("Inode 0x%08x corrupted\n", i); brelse(bh); - s->s_root = NULL; - kfree(info->si_imap); - kfree(info); - s->s_fs_info = NULL; - return -EIO; + ret = -EIO; + goto out3; } if (!di->i_ino) { @@ -483,11 +476,17 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) s->s_dirt = 1; } dump_imap("read_super", s); - mutex_init(&info->bfs_lock); return 0; +out3: + dput(s->s_root); + s->s_root = NULL; +out2: + kfree(info->si_imap); +out1: + brelse(info->si_sbh); out: - brelse(bh); + mutex_destroy(&info->bfs_lock); kfree(info); s->s_fs_info = NULL; return ret; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index b639dcf..fdd3970 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -32,7 +32,7 @@ static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); static int load_aout_library(struct file*); -static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); +static int aout_core_dump(struct coredump_params *cprm); static struct linux_binfmt aout_format = { .module = THIS_MODULE, @@ -89,8 +89,9 @@ if (file->f_op->llseek) { \ * dumping of the process results in another error.. */ -static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) +static int aout_core_dump(struct coredump_params *cprm) { + struct file *file = cprm->file; mm_segment_t fs; int has_dumped = 0; unsigned long dump_start, dump_size; @@ -108,16 +109,16 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u current->flags |= PF_DUMPCORE; strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); dump.u_ar0 = offsetof(struct user, regs); - dump.signal = signr; - aout_dump_thread(regs, &dump); + dump.signal = cprm->signr; + aout_dump_thread(cprm->regs, &dump); /* If the size of the dump file exceeds the rlimit, then see what would happen if we wrote the stack, but not the data area. */ - if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit) + if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit) dump.u_dsize = 0; /* Make sure we have enough room to write the stack and data areas. */ - if ((dump.u_ssize + 1) * PAGE_SIZE > limit) + if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit) dump.u_ssize = 0; /* make sure we actually have a data and stack area to dump */ @@ -263,6 +264,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) #else set_personality(PER_LINUX); #endif + setup_new_exec(bprm); current->mm->end_code = ex.a_text + (current->mm->start_code = N_TXTADDR(ex)); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 97b6e9e..fd5b2ea 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -45,7 +45,7 @@ static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, * don't even try. */ #ifdef CONFIG_ELF_CORE -static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); +static int elf_core_dump(struct coredump_params *cprm); #else #define elf_core_dump NULL #endif @@ -662,27 +662,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') goto out_free_interp; - /* - * The early SET_PERSONALITY here is so that the lookup - * for the interpreter happens in the namespace of the - * to-be-execed image. SET_PERSONALITY can select an - * alternate root. - * - * However, SET_PERSONALITY is NOT allowed to switch - * this task into the new images's memory mapping - * policy - that is, TASK_SIZE must still evaluate to - * that which is appropriate to the execing application. - * This is because exit_mmap() needs to have TASK_SIZE - * evaluate to the size of the old image. - * - * So if (say) a 64-bit application is execing a 32-bit - * application it is the architecture's responsibility - * to defer changing the value of TASK_SIZE until the - * switch really is going to happen - do this in - * flush_thread(). - akpm - */ - SET_PERSONALITY(loc->elf_ex); - interpreter = open_exec(elf_interpreter); retval = PTR_ERR(interpreter); if (IS_ERR(interpreter)) @@ -730,9 +709,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* Verify the interpreter has a valid arch */ if (!elf_check_arch(&loc->interp_elf_ex)) goto out_free_dentry; - } else { - /* Executables without an interpreter also need a personality */ - SET_PERSONALITY(loc->elf_ex); } /* Flush all traces of the currently running executable */ @@ -752,7 +728,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) current->flags |= PF_RANDOMIZE; - arch_pick_mmap_layout(current->mm); + + setup_new_exec(bprm); /* Do this so that we can load the interpreter, if need be. We will change some of these later */ @@ -1272,8 +1249,9 @@ static int writenote(struct memelfnote *men, struct file *file, } #undef DUMP_WRITE -#define DUMP_WRITE(addr, nr) \ - if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ +#define DUMP_WRITE(addr, nr) \ + if ((size += (nr)) > cprm->limit || \ + !dump_write(cprm->file, (addr), (nr))) \ goto end_coredump; static void fill_elf_header(struct elfhdr *elf, int segs, @@ -1901,7 +1879,7 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, * and then they are actually written out. If we run out of core limit * we just truncate. */ -static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) +static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; mm_segment_t fs; @@ -1947,7 +1925,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un * notes. This also sets up the file header. */ if (!fill_note_info(elf, segs + 1, /* including notes section */ - &info, signr, regs)) + &info, cprm->signr, cprm->regs)) goto cleanup; has_dumped = 1; @@ -2009,14 +1987,14 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un #endif /* write out the notes section */ - if (!write_note_info(&info, file, &foffset)) + if (!write_note_info(&info, cprm->file, &foffset)) goto end_coredump; - if (elf_coredump_extra_notes_write(file, &foffset)) + if (elf_coredump_extra_notes_write(cprm->file, &foffset)) goto end_coredump; /* Align to page */ - if (!dump_seek(file, dataoff - foffset)) + if (!dump_seek(cprm->file, dataoff - foffset)) goto end_coredump; for (vma = first_vma(current, gate_vma); vma != NULL; @@ -2033,12 +2011,13 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un page = get_dump_page(addr); if (page) { void *kaddr = kmap(page); - stop = ((size += PAGE_SIZE) > limit) || - !dump_write(file, kaddr, PAGE_SIZE); + stop = ((size += PAGE_SIZE) > cprm->limit) || + !dump_write(cprm->file, kaddr, + PAGE_SIZE); kunmap(page); page_cache_release(page); } else - stop = !dump_seek(file, PAGE_SIZE); + stop = !dump_seek(cprm->file, PAGE_SIZE); if (stop) goto end_coredump; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 7b05538..18d7729 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -76,7 +76,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, struct file *, struct mm_struct *); #ifdef CONFIG_ELF_CORE -static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *, unsigned long limit); +static int elf_fdpic_core_dump(struct coredump_params *cprm); #endif static struct linux_binfmt elf_fdpic_format = { @@ -171,6 +171,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, #ifdef ELF_FDPIC_PLAT_INIT unsigned long dynaddr; #endif +#ifndef CONFIG_MMU + unsigned long stack_prot; +#endif struct file *interpreter = NULL; /* to shut gcc up */ char *interpreter_name = NULL; int executable_stack; @@ -316,6 +319,11 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, * defunct, deceased, etc. after this point we have to exit via * error_kill */ set_personality(PER_LINUX_FDPIC); + if (elf_read_implies_exec(&exec_params.hdr, executable_stack)) + current->personality |= READ_IMPLIES_EXEC; + + setup_new_exec(bprm); + set_binfmt(&elf_fdpic_format); current->mm->start_code = 0; @@ -377,9 +385,13 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, if (stack_size < PAGE_SIZE * 2) stack_size = PAGE_SIZE * 2; + stack_prot = PROT_READ | PROT_WRITE; + if (executable_stack == EXSTACK_ENABLE_X || + (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC)) + stack_prot |= PROT_EXEC; + down_write(¤t->mm->mmap_sem); - current->mm->start_brk = do_mmap(NULL, 0, stack_size, - PROT_READ | PROT_WRITE | PROT_EXEC, + current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot, MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED | MAP_GROWSDOWN, 0); @@ -1326,8 +1338,9 @@ static int writenote(struct memelfnote *men, struct file *file) #undef DUMP_WRITE #undef DUMP_SEEK -#define DUMP_WRITE(addr, nr) \ - if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ +#define DUMP_WRITE(addr, nr) \ + if ((size += (nr)) > cprm->limit || \ + !dump_write(cprm->file, (addr), (nr))) \ goto end_coredump; static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) @@ -1582,8 +1595,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size, * and then they are actually written out. If we run out of core limit * we just truncate. */ -static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, - struct file *file, unsigned long limit) +static int elf_fdpic_core_dump(struct coredump_params *cprm) { #define NUM_NOTES 6 int has_dumped = 0; @@ -1642,7 +1654,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, goto cleanup; #endif - if (signr) { + if (cprm->signr) { struct core_thread *ct; struct elf_thread_status *tmp; @@ -1661,14 +1673,14 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, int sz; tmp = list_entry(t, struct elf_thread_status, list); - sz = elf_dump_thread_status(signr, tmp); + sz = elf_dump_thread_status(cprm->signr, tmp); thread_status_size += sz; } } /* now collect the dump for the current */ - fill_prstatus(prstatus, current, signr); - elf_core_copy_regs(&prstatus->pr_reg, regs); + fill_prstatus(prstatus, current, cprm->signr); + elf_core_copy_regs(&prstatus->pr_reg, cprm->regs); segs = current->mm->map_count; #ifdef ELF_CORE_EXTRA_PHDRS @@ -1703,7 +1715,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, /* Try to dump the FPU. */ if ((prstatus->pr_fpvalid = - elf_core_copy_task_fpregs(current, regs, fpu))) + elf_core_copy_task_fpregs(current, cprm->regs, fpu))) fill_note(notes + numnote++, "CORE", NT_PRFPREG, sizeof(*fpu), fpu); #ifdef ELF_CORE_COPY_XFPREGS @@ -1774,7 +1786,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, /* write out the notes section */ for (i = 0; i < numnote; i++) - if (!writenote(notes + i, file)) + if (!writenote(notes + i, cprm->file)) goto end_coredump; /* write out the thread status notes section */ @@ -1783,25 +1795,26 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, list_entry(t, struct elf_thread_status, list); for (i = 0; i < tmp->num_notes; i++) - if (!writenote(&tmp->notes[i], file)) + if (!writenote(&tmp->notes[i], cprm->file)) goto end_coredump; } - if (!dump_seek(file, dataoff)) + if (!dump_seek(cprm->file, dataoff)) goto end_coredump; - if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0) + if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit, + mm_flags) < 0) goto end_coredump; #ifdef ELF_CORE_WRITE_EXTRA_DATA ELF_CORE_WRITE_EXTRA_DATA; #endif - if (file->f_pos != offset) { + if (cprm->file->f_pos != offset) { /* Sanity check */ printk(KERN_WARNING "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n", - file->f_pos, offset); + cprm->file->f_pos, offset); } end_coredump: diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index a279665..42c6b4a 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -87,7 +87,7 @@ static int load_flat_shared_library(int id, struct lib_info *p); #endif static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); -static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); +static int flat_core_dump(struct coredump_params *cprm); static struct linux_binfmt flat_format = { .module = THIS_MODULE, @@ -102,10 +102,10 @@ static struct linux_binfmt flat_format = { * Currently only a stub-function. */ -static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) +static int flat_core_dump(struct coredump_params *cprm) { printk("Process %s:%d received signr %d and should have core dumped\n", - current->comm, current->pid, (int) signr); + current->comm, current->pid, (int) cprm->signr); return(1); } @@ -519,6 +519,7 @@ static int load_flat_file(struct linux_binprm * bprm, /* OK, This is the point of no return */ set_personality(PER_LINUX_32BIT); + setup_new_exec(bprm); } /* diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index eff74b9..cc8560f 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -43,7 +43,7 @@ static int load_som_library(struct file *); * don't even try. */ #if 0 -static int som_core_dump(long signr, struct pt_regs *regs, unsigned long limit); +static int som_core_dump(struct coredump_params *cprm); #else #define som_core_dump NULL #endif @@ -227,6 +227,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) /* OK, This is the point of no return */ current->flags &= ~PF_FORKNOEXEC; current->personality = PER_HPUX; + setup_new_exec(bprm); /* Set the task size for HP-UX processes such that * the gateway page is outside the address space. diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 49a34e7..a16f29e 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -61,7 +61,7 @@ static inline unsigned int vecs_to_idx(unsigned int nr) static inline int use_bip_pool(unsigned int idx) { - if (idx == BIOVEC_NR_POOLS) + if (idx == BIOVEC_MAX_IDX) return 1; return 0; @@ -95,6 +95,7 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, /* Use mempool if lower order alloc failed or max vecs were requested */ if (bip == NULL) { + idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */ bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); if (unlikely(bip == NULL)) { @@ -78,7 +78,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) i = 0; while (i < bio_slab_nr) { - struct bio_slab *bslab = &bio_slabs[i]; + bslab = &bio_slabs[i]; if (!bslab->slab && entry == -1) entry = i; @@ -542,13 +542,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (page == prev->bv_page && offset == prev->bv_offset + prev->bv_len) { + unsigned int prev_bv_len = prev->bv_len; prev->bv_len += len; if (q->merge_bvec_fn) { struct bvec_merge_data bvm = { + /* prev_bvec is already charged in + bi_size, discharge it in order to + simulate merging updated prev_bvec + as new bvec. */ .bi_bdev = bio->bi_bdev, .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size, + .bi_size = bio->bi_size - prev_bv_len, .bi_rw = bio->bi_rw, }; diff --git a/fs/block_dev.c b/fs/block_dev.c index 73d6a73..d11d028 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -246,7 +246,8 @@ struct super_block *freeze_bdev(struct block_device *bdev) if (!sb) goto out; if (sb->s_flags & MS_RDONLY) { - deactivate_locked_super(sb); + sb->s_frozen = SB_FREEZE_TRANS; + up_write(&sb->s_umount); mutex_unlock(&bdev->bd_fsfreeze_mutex); return sb; } @@ -307,7 +308,7 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) BUG_ON(sb->s_bdev != bdev); down_write(&sb->s_umount); if (sb->s_flags & MS_RDONLY) - goto out_deactivate; + goto out_unfrozen; if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); @@ -321,11 +322,11 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) } } +out_unfrozen: sb->s_frozen = SB_UNFROZEN; smp_wmb(); wake_up(&sb->s_wait_unfrozen); -out_deactivate: if (sb) deactivate_locked_super(sb); out_unlock: diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 402afe0..7bb3c02 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -4,7 +4,6 @@ config BTRFS_FS select LIBCRC32C select ZLIB_INFLATE select ZLIB_DEFLATE - select FS_JOURNAL_INFO help Btrfs is a new filesystem with extents, writable snapshotting, support for multiple devices and many more features. diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 3616042..6df6d6e 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -73,13 +73,13 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) return acl; } -static int btrfs_xattr_get_acl(struct inode *inode, int type, - void *value, size_t size) +static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) { struct posix_acl *acl; int ret = 0; - acl = btrfs_get_acl(inode, type); + acl = btrfs_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); @@ -94,7 +94,8 @@ static int btrfs_xattr_get_acl(struct inode *inode, int type, /* * Needs to be called with fs_mutex held */ -static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +static int btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, int type) { int ret, size = 0; const char *name; @@ -111,12 +112,14 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) switch (type) { case ACL_TYPE_ACCESS: mode = inode->i_mode; - ret = posix_acl_equiv_mode(acl, &mode); - if (ret < 0) - return ret; - ret = 0; - inode->i_mode = mode; name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + inode->i_mode = mode; + } + ret = 0; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) @@ -140,8 +143,7 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) goto out; } - ret = __btrfs_setxattr(inode, name, value, size, 0); - + ret = __btrfs_setxattr(trans, inode, name, value, size, 0); out: kfree(value); @@ -151,10 +153,10 @@ out: return ret; } -static int btrfs_xattr_set_acl(struct inode *inode, int type, - const void *value, size_t size) +static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { - int ret = 0; + int ret; struct posix_acl *acl = NULL; if (value) { @@ -167,38 +169,13 @@ static int btrfs_xattr_set_acl(struct inode *inode, int type, } } - ret = btrfs_set_acl(inode, acl, type); + ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); posix_acl_release(acl); return ret; } - -static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name, - void *value, size_t size) -{ - return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name, - void *value, size_t size) -{ - return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - -static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - int btrfs_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; @@ -221,7 +198,8 @@ int btrfs_check_acl(struct inode *inode, int mask) * stuff has been fixed to work with that. If the locking stuff changes, we * need to re-evaluate the acl locking stuff. */ -int btrfs_init_acl(struct inode *inode, struct inode *dir) +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; int ret = 0; @@ -246,7 +224,8 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) mode_t mode; if (S_ISDIR(inode->i_mode)) { - ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT); + ret = btrfs_set_acl(trans, inode, acl, + ACL_TYPE_DEFAULT); if (ret) goto failed; } @@ -261,10 +240,11 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) inode->i_mode = mode; if (ret > 0) { /* we need an acl */ - ret = btrfs_set_acl(inode, clone, + ret = btrfs_set_acl(trans, inode, clone, ACL_TYPE_ACCESS); } } + posix_acl_release(clone); } failed: posix_acl_release(acl); @@ -294,7 +274,7 @@ int btrfs_acl_chmod(struct inode *inode) ret = posix_acl_chmod_masq(clone, inode->i_mode); if (!ret) - ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS); + ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS); posix_acl_release(clone); @@ -303,14 +283,16 @@ int btrfs_acl_chmod(struct inode *inode) struct xattr_handler btrfs_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, - .get = btrfs_xattr_acl_default_get, - .set = btrfs_xattr_acl_default_set, + .flags = ACL_TYPE_DEFAULT, + .get = btrfs_xattr_acl_get, + .set = btrfs_xattr_acl_set, }; struct xattr_handler btrfs_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, - .get = btrfs_xattr_acl_access_get, - .set = btrfs_xattr_acl_access_set, + .flags = ACL_TYPE_ACCESS, + .get = btrfs_xattr_acl_get, + .set = btrfs_xattr_acl_set, }; #else /* CONFIG_BTRFS_FS_POSIX_ACL */ @@ -320,7 +302,8 @@ int btrfs_acl_chmod(struct inode *inode) return 0; } -int btrfs_init_acl(struct inode *inode, struct inode *dir) +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) { return 0; } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index f6783a4..3f1f50d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -44,9 +44,6 @@ struct btrfs_inode { */ struct extent_io_tree io_failure_tree; - /* held while inesrting or deleting extents from files */ - struct mutex extent_mutex; - /* held while logging the inode in tree-log.c */ struct mutex log_mutex; @@ -166,7 +163,7 @@ static inline struct btrfs_inode *BTRFS_I(struct inode *inode) static inline void btrfs_i_size_write(struct inode *inode, u64 size) { - inode->i_size = size; + i_size_write(inode, size); BTRFS_I(inode)->disk_i_size = size; } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ec96f3a..c4bc570 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -37,6 +37,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *src_buf); static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); +static int setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr); + struct btrfs_path *btrfs_alloc_path(void) { @@ -451,9 +456,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, extent_buffer_get(cow); spin_unlock(&root->node_lock); - btrfs_free_extent(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, - level, 0); + btrfs_free_tree_block(trans, root, buf->start, buf->len, + parent_start, root->root_key.objectid, level); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -468,9 +472,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - btrfs_free_extent(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, - level, 0); + btrfs_free_tree_block(trans, root, buf->start, buf->len, + parent_start, root->root_key.objectid, level); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -1030,8 +1033,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); - ret = btrfs_free_extent(trans, root, mid->start, mid->len, - 0, root->root_key.objectid, level, 1); + ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, + 0, root->root_key.objectid, level); /* once for the root ptr */ free_extent_buffer(mid); return ret; @@ -1095,10 +1098,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, 1); if (wret) ret = wret; - wret = btrfs_free_extent(trans, root, bytenr, - blocksize, 0, - root->root_key.objectid, - level, 0); + wret = btrfs_free_tree_block(trans, root, + bytenr, blocksize, 0, + root->root_key.objectid, + level); if (wret) ret = wret; } else { @@ -1143,9 +1146,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, wret = del_ptr(trans, root, path, level + 1, pslot); if (wret) ret = wret; - wret = btrfs_free_extent(trans, root, bytenr, blocksize, - 0, root->root_key.objectid, - level, 0); + wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, + 0, root->root_key.objectid, level); if (wret) ret = wret; } else { @@ -2997,75 +2999,85 @@ again: return ret; } -/* - * This function splits a single item into two items, - * giving 'new_key' to the new item and splitting the - * old one at split_offset (from the start of the item). - * - * The path may be released by this operation. After - * the split, the path is pointing to the old item. The - * new item is going to be in the same node as the old one. - * - * Note, the item being split must be smaller enough to live alone on - * a tree block with room for one extra struct btrfs_item - * - * This allows us to split the item in place, keeping a lock on the - * leaf the entire time. - */ -int btrfs_split_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *new_key, - unsigned long split_offset) +static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int ins_len) { - u32 item_size; + struct btrfs_key key; struct extent_buffer *leaf; - struct btrfs_key orig_key; - struct btrfs_item *item; - struct btrfs_item *new_item; - int ret = 0; - int slot; - u32 nritems; - u32 orig_offset; - struct btrfs_disk_key disk_key; - char *buf; + struct btrfs_file_extent_item *fi; + u64 extent_len = 0; + u32 item_size; + int ret; leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]); - if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item)) - goto split; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_EXTENT_CSUM_KEY); + + if (btrfs_leaf_free_space(root, leaf) >= ins_len) + return 0; item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_len = btrfs_file_extent_num_bytes(leaf, fi); + } btrfs_release_path(root, path); - path->search_for_split = 1; path->keep_locks = 1; - - ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1); + path->search_for_split = 1; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); path->search_for_split = 0; + if (ret < 0) + goto err; + ret = -EAGAIN; + leaf = path->nodes[0]; /* if our item isn't there or got smaller, return now */ - if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0], - path->slots[0])) { - path->keep_locks = 0; - return -EAGAIN; + if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) + goto err; + + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) + goto err; } btrfs_set_path_blocking(path); - ret = split_leaf(trans, root, &orig_key, path, - sizeof(struct btrfs_item), 1); - path->keep_locks = 0; + ret = split_leaf(trans, root, &key, path, ins_len, 1); BUG_ON(ret); + path->keep_locks = 0; btrfs_unlock_up_safe(path, 1); + return 0; +err: + path->keep_locks = 0; + return ret; +} + +static noinline int split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + struct btrfs_item *new_item; + int slot; + char *buf; + u32 nritems; + u32 item_size; + u32 orig_offset; + struct btrfs_disk_key disk_key; + leaf = path->nodes[0]; BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); -split: - /* - * make sure any changes to the path from split_leaf leave it - * in a blocking state - */ btrfs_set_path_blocking(path); item = btrfs_item_nr(leaf, path->slots[0]); @@ -3073,19 +3085,19 @@ split: item_size = btrfs_item_size(leaf, item); buf = kmalloc(item_size, GFP_NOFS); + if (!buf) + return -ENOMEM; + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, path->slots[0]), item_size); - slot = path->slots[0] + 1; - leaf = path->nodes[0]; + slot = path->slots[0] + 1; nritems = btrfs_header_nritems(leaf); - if (slot != nritems) { /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); - + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); } btrfs_cpu_key_to_disk(&disk_key, new_key); @@ -3113,16 +3125,81 @@ split: item_size - split_offset); btrfs_mark_buffer_dirty(leaf); - ret = 0; - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); - BUG(); - } + BUG_ON(btrfs_leaf_free_space(root, leaf) < 0); kfree(buf); + return 0; +} + +/* + * This function splits a single item into two items, + * giving 'new_key' to the new item and splitting the + * old one at split_offset (from the start of the item). + * + * The path may be released by this operation. After + * the split, the path is pointing to the old item. The + * new item is going to be in the same node as the old one. + * + * Note, the item being split must be smaller enough to live alone on + * a tree block with room for one extra struct btrfs_item + * + * This allows us to split the item in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + int ret; + ret = setup_leaf_for_split(trans, root, path, + sizeof(struct btrfs_item)); + if (ret) + return ret; + + ret = split_item(trans, root, path, new_key, split_offset); return ret; } /* + * This function duplicate a item, giving 'new_key' to the new item. + * It guarantees both items live in the same tree leaf and the new item + * is contiguous with the original item. + * + * This allows us to split file extent in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key) +{ + struct extent_buffer *leaf; + int ret; + u32 item_size; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) + return ret; + + path->slots[0]++; + ret = setup_items_for_insert(trans, root, path, new_key, &item_size, + item_size, item_size + + sizeof(struct btrfs_item), 1); + BUG_ON(ret); + + leaf = path->nodes[0]; + memcpy_extent_buffer(leaf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + btrfs_item_ptr_offset(leaf, path->slots[0] - 1), + item_size); + return 0; +} + +/* * make the item pointed to by the path smaller. new_size indicates * how small to make it, and from_end tells us if we just chop bytes * off the end of the item or if we shift the item to chop bytes off @@ -3714,8 +3791,8 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - ret = btrfs_free_extent(trans, root, leaf->start, leaf->len, - 0, root->root_key.objectid, 0, 0); + ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, + 0, root->root_key.objectid, 0); return ret; } /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 444b3e9..2aa8ec6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -310,6 +310,9 @@ struct btrfs_header { #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) - \ sizeof(struct btrfs_file_extent_item)) +#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) -\ + sizeof(struct btrfs_dir_item)) /* @@ -859,8 +862,9 @@ struct btrfs_fs_info { struct mutex ordered_operations_mutex; struct rw_semaphore extent_commit_sem; - struct rw_semaphore subvol_sem; + struct rw_semaphore cleanup_work_sem; + struct rw_semaphore subvol_sem; struct srcu_struct subvol_srcu; struct list_head trans_list; @@ -868,6 +872,9 @@ struct btrfs_fs_info { struct list_head dead_roots; struct list_head caching_block_groups; + spinlock_t delayed_iput_lock; + struct list_head delayed_iputs; + atomic_t nr_async_submits; atomic_t async_submit_draining; atomic_t nr_async_bios; @@ -1034,12 +1041,12 @@ struct btrfs_root { int ref_cows; int track_dirty; int in_radix; + int clean_orphans; u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; int defrag_running; - int defrag_level; char *name; int in_sysfs; @@ -1154,6 +1161,7 @@ struct btrfs_root { #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) #define BTRFS_MOUNT_NOSSD (1 << 9) #define BTRFS_MOUNT_DISCARD (1 << 10) +#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1975,6 +1983,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size); +int btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize, + u64 parent, u64 root_objectid, int level); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -2089,6 +2101,10 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_key *new_key, unsigned long split_offset); +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key); int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); @@ -2196,9 +2212,10 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_dir_item *di); int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *name, - u16 name_len, const void *data, u16 data_len, - u64 dir); + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len); struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, @@ -2292,7 +2309,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct inode *inode, u64 new_size, u32 min_type); -int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); @@ -2332,6 +2349,8 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t size); int btrfs_invalidate_inodes(struct btrfs_root *root); +void btrfs_add_delayed_iput(struct inode *inode); +void btrfs_run_delayed_iputs(struct btrfs_root *root); extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ @@ -2345,12 +2364,9 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); int btrfs_check_file(struct btrfs_root *root, struct inode *inode); extern const struct file_operations btrfs_file_operations; -int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 locked_end, - u64 inline_limit, u64 *hint_block, int drop_cache); +int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, + u64 start, u64 end, u64 *hint_byte, int drop_cache); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); @@ -2380,7 +2396,8 @@ int btrfs_check_acl(struct inode *inode, int mask); #else #define btrfs_check_acl NULL #endif -int btrfs_init_acl(struct inode *inode, struct inode *dir); +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir); int btrfs_acl_chmod(struct inode *inode); /* relocation.c */ diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index f3a6075..e9103b3 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -68,12 +68,12 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle * into the tree */ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *name, - u16 name_len, const void *data, u16 data_len, - u64 dir) + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len) { int ret = 0; - struct btrfs_path *path; struct btrfs_dir_item *dir_item; unsigned long name_ptr, data_ptr; struct btrfs_key key, location; @@ -81,15 +81,11 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; u32 data_size; - key.objectid = dir; + BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); + + key.objectid = objectid; btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); key.offset = btrfs_name_hash(name, name_len); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - if (name_len + data_len + sizeof(struct btrfs_dir_item) > - BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item)) - return -ENOSPC; data_size = sizeof(*dir_item) + name_len + data_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, @@ -117,7 +113,6 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, data, data_ptr, data_len); btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 02b6afb..2b59201 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -892,6 +892,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->stripesize = stripesize; root->ref_cows = 0; root->track_dirty = 0; + root->in_radix = 0; + root->clean_orphans = 0; root->fs_info = fs_info; root->objectid = objectid; @@ -928,7 +930,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->defrag_trans_start = fs_info->generation; init_completion(&root->kobj_unregister); root->defrag_running = 0; - root->defrag_level = 0; root->root_key.objectid = objectid; root->anon_super.s_root = NULL; root->anon_super.s_dev = 0; @@ -980,12 +981,12 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, while (1) { ret = find_first_extent_bit(&log_root_tree->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY); + 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); if (ret) break; - clear_extent_dirty(&log_root_tree->dirty_log_pages, - start, end, GFP_NOFS); + clear_extent_bits(&log_root_tree->dirty_log_pages, start, end, + EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); } eb = fs_info->log_root_tree->node; @@ -1210,8 +1211,10 @@ again: ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, root); - if (ret == 0) + if (ret == 0) { root->in_radix = 1; + root->clean_orphans = 1; + } spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); if (ret) { @@ -1225,10 +1228,6 @@ again: ret = btrfs_find_dead_roots(fs_info->tree_root, root->root_key.objectid); WARN_ON(ret); - - if (!(fs_info->sb->s_flags & MS_RDONLY)) - btrfs_orphan_cleanup(root); - return root; fail: free_fs_root(root); @@ -1477,6 +1476,7 @@ static int cleaner_kthread(void *arg) if (!(root->fs_info->sb->s_flags & MS_RDONLY) && mutex_trylock(&root->fs_info->cleaner_mutex)) { + btrfs_run_delayed_iputs(root); btrfs_clean_old_snapshots(root); mutex_unlock(&root->fs_info->cleaner_mutex); } @@ -1606,6 +1606,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); INIT_LIST_HEAD(&fs_info->ordered_operations); @@ -1614,6 +1615,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->new_trans_lock); spin_lock_init(&fs_info->ref_cache_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->delayed_iput_lock); init_completion(&fs_info->kobj_unregister); fs_info->tree_root = tree_root; @@ -1689,6 +1691,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); init_rwsem(&fs_info->extent_commit_sem); + init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); @@ -1979,7 +1982,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!(sb->s_flags & MS_RDONLY)) { ret = btrfs_recover_relocation(tree_root); - BUG_ON(ret); + if (ret < 0) { + printk(KERN_WARNING + "btrfs: failed to recover relocation\n"); + err = -EINVAL; + goto fail_trans_kthread; + } } location.objectid = BTRFS_FS_TREE_OBJECTID; @@ -1990,6 +1998,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!fs_info->fs_root) goto fail_trans_kthread; + if (!(sb->s_flags & MS_RDONLY)) { + down_read(&fs_info->cleanup_work_sem); + btrfs_orphan_cleanup(fs_info->fs_root); + up_read(&fs_info->cleanup_work_sem); + } + return tree_root; fail_trans_kthread: @@ -2386,8 +2400,14 @@ int btrfs_commit_super(struct btrfs_root *root) int ret; mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(root); btrfs_clean_old_snapshots(root); mutex_unlock(&root->fs_info->cleaner_mutex); + + /* wait until ongoing cleanup work done */ + down_write(&root->fs_info->cleanup_work_sem); + up_write(&root->fs_info->cleanup_work_sem); + trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 94627c4..559f724 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -83,6 +83,17 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) return (cache->flags & bits) == bits; } +void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +{ + atomic_inc(&cache->count); +} + +void btrfs_put_block_group(struct btrfs_block_group_cache *cache) +{ + if (atomic_dec_and_test(&cache->count)) + kfree(cache); +} + /* * this adds the block group to the fs_info rb tree for the block group * cache @@ -156,7 +167,7 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, } } if (ret) - atomic_inc(&ret->count); + btrfs_get_block_group(ret); spin_unlock(&info->block_group_cache_lock); return ret; @@ -195,6 +206,14 @@ static int exclude_super_stripes(struct btrfs_root *root, int stripe_len; int i, nr, ret; + if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { + stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; + cache->bytes_super += stripe_len; + ret = add_excluded_extent(root, cache->key.objectid, + stripe_len); + BUG_ON(ret); + } + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); ret = btrfs_rmap_block(&root->fs_info->mapping_tree, @@ -255,7 +274,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, if (ret) break; - if (extent_start == start) { + if (extent_start <= start) { start = extent_end + 1; } else if (extent_start > start && extent_start < end) { size = extent_start - start; @@ -399,6 +418,8 @@ err: put_caching_control(caching_ctl); atomic_dec(&block_group->space_info->caching_threads); + btrfs_put_block_group(block_group); + return 0; } @@ -439,6 +460,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache) up_write(&fs_info->extent_commit_sem); atomic_inc(&cache->space_info->caching_threads); + btrfs_get_block_group(cache); tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", cache->key.objectid); @@ -478,12 +500,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( return cache; } -void btrfs_put_block_group(struct btrfs_block_group_cache *cache) -{ - if (atomic_dec_and_test(&cache->count)) - kfree(cache); -} - static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, u64 flags) { @@ -2574,7 +2590,7 @@ next_block_group(struct btrfs_root *root, if (node) { cache = rb_entry(node, struct btrfs_block_group_cache, cache_node); - atomic_inc(&cache->count); + btrfs_get_block_group(cache); } else cache = NULL; spin_unlock(&root->fs_info->block_group_cache_lock); @@ -2880,9 +2896,9 @@ static noinline void flush_delalloc_async(struct btrfs_work *work) root = async->root; info = async->info; - btrfs_start_delalloc_inodes(root); + btrfs_start_delalloc_inodes(root, 0); wake_up(&info->flush_wait); - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_ordered_extents(root, 0, 0); spin_lock(&info->lock); info->flushing = 0; @@ -2956,8 +2972,8 @@ static void flush_delalloc(struct btrfs_root *root, return; flush: - btrfs_start_delalloc_inodes(root); - btrfs_wait_ordered_extents(root, 0); + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0, 0); spin_lock(&info->lock); info->flushing = 0; @@ -3454,14 +3470,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, else old_val -= num_bytes; btrfs_set_super_bytes_used(&info->super_copy, old_val); - - /* block accounting for root item */ - old_val = btrfs_root_used(&root->root_item); - if (alloc) - old_val += num_bytes; - else - old_val -= num_bytes; - btrfs_set_root_used(&root->root_item, old_val); spin_unlock(&info->delalloc_lock); while (total) { @@ -4049,6 +4057,21 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, return ret; } +int btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize, + u64 parent, u64 root_objectid, int level) +{ + u64 used; + spin_lock(&root->node_lock); + used = btrfs_root_used(&root->root_item) - blocksize; + btrfs_set_root_used(&root->root_item, used); + spin_unlock(&root->node_lock); + + return btrfs_free_extent(trans, root, bytenr, blocksize, + parent, root_objectid, level, 0); +} + static u64 stripe_align(struct btrfs_root *root, u64 val) { u64 mask = ((u64)root->stripesize - 1); @@ -4212,7 +4235,7 @@ search: u64 offset; int cached; - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); search_start = block_group->key.objectid; have_block_group: @@ -4300,7 +4323,7 @@ have_block_group: btrfs_put_block_group(block_group); block_group = last_ptr->block_group; - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); spin_unlock(&last_ptr->lock); spin_unlock(&last_ptr->refill_lock); @@ -4578,7 +4601,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, { int ret; u64 search_start = 0; - struct btrfs_fs_info *info = root->fs_info; data = btrfs_get_alloc_profile(root, data); again: @@ -4586,17 +4608,9 @@ again: * the only place that sets empty_size is btrfs_realloc_node, which * is not called recursively on allocations */ - if (empty_size || root->ref_cows) { - if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, - BTRFS_BLOCK_GROUP_METADATA | - (info->metadata_alloc_profile & - info->avail_metadata_alloc_bits), 0); - } + if (empty_size || root->ref_cows) ret = do_chunk_alloc(trans, root->fs_info->extent_root, num_bytes + 2 * 1024 * 1024, data, 0); - } WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, @@ -4897,6 +4911,14 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, extent_op); BUG_ON(ret); } + + if (root_objectid == root->root_key.objectid) { + u64 used; + spin_lock(&root->node_lock); + used = btrfs_root_used(&root->root_item) + num_bytes; + btrfs_set_root_used(&root->root_item, used); + spin_unlock(&root->node_lock); + } return ret; } @@ -4919,8 +4941,16 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, btrfs_set_buffer_uptodate(buf); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { - set_extent_dirty(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + /* + * we allow two log transactions at a time, use different + * EXENT bit to differentiate dirty pages. + */ + if (root->log_transid % 2 == 0) + set_extent_dirty(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + else + set_extent_new(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); } else { set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); @@ -5372,10 +5402,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, int ret; while (level >= 0) { - if (path->slots[level] >= - btrfs_header_nritems(path->nodes[level])) - break; - ret = walk_down_proc(trans, root, path, wc, lookup_info); if (ret > 0) break; @@ -5383,6 +5409,10 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, if (level == 0) break; + if (path->slots[level] >= + btrfs_header_nritems(path->nodes[level])) + break; + ret = do_walk_down(trans, root, path, wc, &lookup_info); if (ret > 0) { path->slots[level]++; @@ -7373,9 +7403,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) wait_block_group_cache_done(block_group); btrfs_remove_free_space_cache(block_group); - - WARN_ON(atomic_read(&block_group->count) != 1); - kfree(block_group); + btrfs_put_block_group(block_group); spin_lock(&info->block_group_cache_lock); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 96577e8..b177ed3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3165,10 +3165,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, spin_unlock(&tree->buffer_lock); goto free_eb; } - spin_unlock(&tree->buffer_lock); - /* add one reference for the tree */ atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); return eb; free_eb: diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 46bea0f..428fcac 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -155,20 +155,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return NULL; } -/* - * look for an offset in the tree, and if it can't be found, return - * the first offset we can find smaller than 'offset'. - */ -static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) -{ - struct rb_node *prev; - struct rb_node *ret; - ret = __tree_search(root, offset, &prev, NULL); - if (!ret) - return prev; - return ret; -} - /* check to see if two extent_map structs are adjacent and safe to merge */ static int mergable_maps(struct extent_map *prev, struct extent_map *next) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 77f7593..6ed434a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -179,18 +179,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } flags = em->flags; if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { - if (em->start <= start && - (!testend || em->start + em->len >= start + len)) { + if (testend && em->start + em->len >= start + len) { free_extent_map(em); write_unlock(&em_tree->lock); break; } - if (start < em->start) { - len = em->start - start; - } else { + start = em->start + em->len; + if (testend) len = start + len - (em->start + em->len); - start = em->start + em->len; - } free_extent_map(em); write_unlock(&em_tree->lock); continue; @@ -265,324 +261,253 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, * If an extent intersects the range but is not entirely inside the range * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. - * - * inline_limit is used to tell this code which offsets in the file to keep - * if they contain inline extents. */ -noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 locked_end, - u64 inline_limit, u64 *hint_byte, int drop_cache) +int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, + u64 start, u64 end, u64 *hint_byte, int drop_cache) { - u64 extent_end = 0; - u64 search_start = start; - u64 ram_bytes = 0; - u64 disk_bytenr = 0; - u64 orig_locked_end = locked_end; - u8 compression; - u8 encryption; - u16 other_encoding = 0; + struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_buffer *leaf; - struct btrfs_file_extent_item *extent; + struct btrfs_file_extent_item *fi; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_file_extent_item old; - int keep; - int slot; - int bookend; - int found_type = 0; - int found_extent; - int found_inline; + struct btrfs_key new_key; + u64 search_start = start; + u64 disk_bytenr = 0; + u64 num_bytes = 0; + u64 extent_offset = 0; + u64 extent_end = 0; + int del_nr = 0; + int del_slot = 0; + int extent_type; int recow; int ret; - inline_limit = 0; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); path = btrfs_alloc_path(); if (!path) return -ENOMEM; + while (1) { recow = 0; - btrfs_release_path(root, path); ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, search_start, -1); if (ret < 0) - goto out; - if (ret > 0) { - if (path->slots[0] == 0) { - ret = 0; - goto out; - } - path->slots[0]--; + break; + if (ret > 0 && path->slots[0] > 0 && search_start == start) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == inode->i_ino && + key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; } + ret = 0; next_slot: - keep = 0; - bookend = 0; - found_extent = 0; - found_inline = 0; - compression = 0; - encryption = 0; - extent = NULL; leaf = path->nodes[0]; - slot = path->slots[0]; - ret = 0; - btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY && - key.offset >= end) { - goto out; - } - if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || - key.objectid != inode->i_ino) { - goto out; - } - if (recow) { - search_start = max(key.offset, start); - continue; - } - if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, extent); - compression = btrfs_file_extent_compression(leaf, - extent); - encryption = btrfs_file_extent_encryption(leaf, - extent); - other_encoding = btrfs_file_extent_other_encoding(leaf, - extent); - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { - extent_end = - btrfs_file_extent_disk_bytenr(leaf, - extent); - if (extent_end) - *hint_byte = extent_end; - - extent_end = key.offset + - btrfs_file_extent_num_bytes(leaf, extent); - ram_bytes = btrfs_file_extent_ram_bytes(leaf, - extent); - found_extent = 1; - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - found_inline = 1; - extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, extent); + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + BUG_ON(del_nr > 0); + ret = btrfs_next_leaf(root, path); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; } + leaf = path->nodes[0]; + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid > inode->i_ino || + key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf, fi); } else { + WARN_ON(1); extent_end = search_start; } - /* we found nothing we can drop */ - if ((!found_extent && !found_inline) || - search_start >= extent_end) { - int nextret; - u32 nritems; - nritems = btrfs_header_nritems(leaf); - if (slot >= nritems - 1) { - nextret = btrfs_next_leaf(root, path); - if (nextret) - goto out; - recow = 1; - } else { - path->slots[0]++; - } + if (extent_end <= search_start) { + path->slots[0]++; goto next_slot; } - if (end <= extent_end && start >= key.offset && found_inline) - *hint_byte = EXTENT_MAP_INLINE; - - if (found_extent) { - read_extent_buffer(leaf, &old, (unsigned long)extent, - sizeof(old)); - } - - if (end < extent_end && end >= key.offset) { - bookend = 1; - if (found_inline && start <= key.offset) - keep = 1; + search_start = max(key.offset, start); + if (recow) { + btrfs_release_path(root, path); + continue; } - if (bookend && found_extent) { - if (locked_end < extent_end) { - ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - locked_end, extent_end - 1, - GFP_NOFS); - if (!ret) { - btrfs_release_path(root, path); - lock_extent(&BTRFS_I(inode)->io_tree, - locked_end, extent_end - 1, - GFP_NOFS); - locked_end = extent_end; - continue; - } - locked_end = extent_end; + /* + * | - range to drop - | + * | -------- extent -------- | + */ + if (start > key.offset && end < extent_end) { + BUG_ON(del_nr > 0); + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = start; + ret = btrfs_duplicate_item(trans, root, path, + &new_key); + if (ret == -EAGAIN) { + btrfs_release_path(root, path); + continue; } - disk_bytenr = le64_to_cpu(old.disk_bytenr); - if (disk_bytenr != 0) { + if (ret < 0) + break; + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + extent_offset += start - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - start); + btrfs_mark_buffer_dirty(leaf); + + if (disk_bytenr > 0) { ret = btrfs_inc_extent_ref(trans, root, - disk_bytenr, - le64_to_cpu(old.disk_num_bytes), 0, - root->root_key.objectid, - key.objectid, key.offset - - le64_to_cpu(old.offset)); + disk_bytenr, num_bytes, 0, + root->root_key.objectid, + new_key.objectid, + start - extent_offset); BUG_ON(ret); + *hint_byte = disk_bytenr; } + key.offset = start; } + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (start <= key.offset && end < extent_end) { + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); - if (found_inline) { - u64 mask = root->sectorsize - 1; - search_start = (extent_end + mask) & ~mask; - } else - search_start = extent_end; - - /* truncate existing extent */ - if (start > key.offset) { - u64 new_num; - u64 old_num; - keep = 1; - WARN_ON(start & (root->sectorsize - 1)); - if (found_extent) { - new_num = start - key.offset; - old_num = btrfs_file_extent_num_bytes(leaf, - extent); - *hint_byte = - btrfs_file_extent_disk_bytenr(leaf, - extent); - if (btrfs_file_extent_disk_bytenr(leaf, - extent)) { - inode_sub_bytes(inode, old_num - - new_num); - } - btrfs_set_file_extent_num_bytes(leaf, - extent, new_num); - btrfs_mark_buffer_dirty(leaf); - } else if (key.offset < inline_limit && - (end > extent_end) && - (inline_limit < extent_end)) { - u32 new_size; - new_size = btrfs_file_extent_calc_inline_size( - inline_limit - key.offset); - inode_sub_bytes(inode, extent_end - - inline_limit); - btrfs_set_file_extent_ram_bytes(leaf, extent, - new_size); - if (!compression && !encryption) { - btrfs_truncate_item(trans, root, path, - new_size, 1); - } + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = end; + btrfs_set_item_key_safe(trans, root, path, &new_key); + + extent_offset += end - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - end); + btrfs_mark_buffer_dirty(leaf); + if (disk_bytenr > 0) { + inode_sub_bytes(inode, end - key.offset); + *hint_byte = disk_bytenr; } + break; } - /* delete the entire extent */ - if (!keep) { - if (found_inline) - inode_sub_bytes(inode, extent_end - - key.offset); - ret = btrfs_del_item(trans, root, path); - /* TODO update progress marker and return */ - BUG_ON(ret); - extent = NULL; - btrfs_release_path(root, path); - /* the extent will be freed later */ - } - if (bookend && found_inline && start <= key.offset) { - u32 new_size; - new_size = btrfs_file_extent_calc_inline_size( - extent_end - end); - inode_sub_bytes(inode, end - key.offset); - btrfs_set_file_extent_ram_bytes(leaf, extent, - new_size); - if (!compression && !encryption) - ret = btrfs_truncate_item(trans, root, path, - new_size, 0); - BUG_ON(ret); - } - /* create bookend, splitting the extent in two */ - if (bookend && found_extent) { - struct btrfs_key ins; - ins.objectid = inode->i_ino; - ins.offset = end; - btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); - btrfs_release_path(root, path); - path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, root, path, &ins, - sizeof(*extent)); - BUG_ON(ret); + search_start = extent_end; + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (start > key.offset && end >= extent_end) { + BUG_ON(del_nr > 0); + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); - leaf = path->nodes[0]; - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - write_extent_buffer(leaf, &old, - (unsigned long)extent, sizeof(old)); - - btrfs_set_file_extent_compression(leaf, extent, - compression); - btrfs_set_file_extent_encryption(leaf, extent, - encryption); - btrfs_set_file_extent_other_encoding(leaf, extent, - other_encoding); - btrfs_set_file_extent_offset(leaf, extent, - le64_to_cpu(old.offset) + end - key.offset); - WARN_ON(le64_to_cpu(old.num_bytes) < - (extent_end - end)); - btrfs_set_file_extent_num_bytes(leaf, extent, - extent_end - end); + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + btrfs_mark_buffer_dirty(leaf); + if (disk_bytenr > 0) { + inode_sub_bytes(inode, extent_end - start); + *hint_byte = disk_bytenr; + } + if (end == extent_end) + break; - /* - * set the ram bytes to the size of the full extent - * before splitting. This is a worst case flag, - * but its the best we can do because we don't know - * how splitting affects compression - */ - btrfs_set_file_extent_ram_bytes(leaf, extent, - ram_bytes); - btrfs_set_file_extent_type(leaf, extent, found_type); - - btrfs_unlock_up_safe(path, 1); - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_set_lock_blocking(path->nodes[0]); - - path->leave_spinning = 0; - btrfs_release_path(root, path); - if (disk_bytenr != 0) - inode_add_bytes(inode, extent_end - end); + path->slots[0]++; + goto next_slot; } - if (found_extent && !keep) { - u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr); + /* + * | ---- range to drop ----- | + * | ------ extent ------ | + */ + if (start <= key.offset && end >= extent_end) { + if (del_nr == 0) { + del_slot = path->slots[0]; + del_nr = 1; + } else { + BUG_ON(del_slot + del_nr != path->slots[0]); + del_nr++; + } - if (old_disk_bytenr != 0) { + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { inode_sub_bytes(inode, - le64_to_cpu(old.num_bytes)); + extent_end - key.offset); + extent_end = ALIGN(extent_end, + root->sectorsize); + } else if (disk_bytenr > 0) { ret = btrfs_free_extent(trans, root, - old_disk_bytenr, - le64_to_cpu(old.disk_num_bytes), - 0, root->root_key.objectid, + disk_bytenr, num_bytes, 0, + root->root_key.objectid, key.objectid, key.offset - - le64_to_cpu(old.offset)); + extent_offset); BUG_ON(ret); - *hint_byte = old_disk_bytenr; + inode_sub_bytes(inode, + extent_end - key.offset); + *hint_byte = disk_bytenr; } - } - if (search_start >= end) { - ret = 0; - goto out; + if (end == extent_end) + break; + + if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { + path->slots[0]++; + goto next_slot; + } + + ret = btrfs_del_items(trans, root, path, del_slot, + del_nr); + BUG_ON(ret); + + del_nr = 0; + del_slot = 0; + + btrfs_release_path(root, path); + continue; } + + BUG_ON(1); } -out: - btrfs_free_path(path); - if (locked_end > orig_locked_end) { - unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end, - locked_end - 1, GFP_NOFS); + + if (del_nr > 0) { + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); } + + btrfs_free_path(path); return ret; } static int extent_mergeable(struct extent_buffer *leaf, int slot, - u64 objectid, u64 bytenr, u64 *start, u64 *end) + u64 objectid, u64 bytenr, u64 orig_offset, + u64 *start, u64 *end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; @@ -598,6 +523,7 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || + btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset || btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) @@ -620,23 +546,24 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, * two or three. */ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, u64 start, u64 end) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_buffer *leaf; struct btrfs_path *path; struct btrfs_file_extent_item *fi; struct btrfs_key key; + struct btrfs_key new_key; u64 bytenr; u64 num_bytes; u64 extent_end; u64 orig_offset; u64 other_start; u64 other_end; - u64 split = start; - u64 locked_end = end; - int extent_type; - int split_end = 1; + u64 split; + int del_nr = 0; + int del_slot = 0; + int recow; int ret; btrfs_drop_extent_cache(inode, start, end - 1, 0); @@ -644,12 +571,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); again: + recow = 0; + split = start; key.objectid = inode->i_ino; key.type = BTRFS_EXTENT_DATA_KEY; - if (split == start) - key.offset = split; - else - key.offset = split - 1; + key.offset = split; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0 && path->slots[0] > 0) @@ -661,159 +587,158 @@ again: key.type != BTRFS_EXTENT_DATA_KEY); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(leaf, fi); - BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC); + BUG_ON(btrfs_file_extent_type(leaf, fi) != + BTRFS_FILE_EXTENT_PREALLOC); extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); BUG_ON(key.offset > start || extent_end < end); bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); + memcpy(&new_key, &key, sizeof(new_key)); - if (key.offset == start) - split = end; - - if (key.offset == start && extent_end == end) { - int del_nr = 0; - int del_slot = 0; - other_start = end; - other_end = 0; - if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, - bytenr, &other_start, &other_end)) { - extent_end = other_end; - del_slot = path->slots[0] + 1; - del_nr++; - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - inode->i_ino, orig_offset); - BUG_ON(ret); - } + if (start == key.offset && end < extent_end) { other_start = 0; other_end = start; - if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino, - bytenr, &other_start, &other_end)) { - key.offset = other_start; - del_slot = path->slots[0]; - del_nr++; - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - inode->i_ino, orig_offset); - BUG_ON(ret); - } - split_end = 0; - if (del_nr == 0) { - btrfs_set_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_REG); - goto done; - } - - fi = btrfs_item_ptr(leaf, del_slot - 1, - struct btrfs_file_extent_item); - btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_end - key.offset); - btrfs_mark_buffer_dirty(leaf); - - ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - BUG_ON(ret); - goto release; - } else if (split == start) { - if (locked_end < extent_end) { - ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - locked_end, extent_end - 1, GFP_NOFS); - if (!ret) { - btrfs_release_path(root, path); - lock_extent(&BTRFS_I(inode)->io_tree, - locked_end, extent_end - 1, GFP_NOFS); - locked_end = extent_end; - goto again; - } - locked_end = extent_end; + if (extent_mergeable(leaf, path->slots[0] - 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { + new_key.offset = end; + btrfs_set_item_key_safe(trans, root, path, &new_key); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - end); + btrfs_set_file_extent_offset(leaf, fi, + end - orig_offset); + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + end - other_start); + btrfs_mark_buffer_dirty(leaf); + goto out; } - btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); - } else { - BUG_ON(key.offset != start); - key.offset = split; - btrfs_set_file_extent_offset(leaf, fi, key.offset - - orig_offset); - btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); - btrfs_set_item_key_safe(trans, root, path, &key); - extent_end = split; } - if (extent_end == end) { - split_end = 0; - extent_type = BTRFS_FILE_EXTENT_REG; - } - if (extent_end == end && split == start) { + if (start > key.offset && end == extent_end) { other_start = end; other_end = 0; - if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, - bytenr, &other_start, &other_end)) { - path->slots[0]++; + if (extent_mergeable(leaf, path->slots[0] + 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - key.offset = split; - btrfs_set_item_key_safe(trans, root, path, &key); - btrfs_set_file_extent_offset(leaf, fi, key.offset - - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, - other_end - split); - goto done; - } - } - if (extent_end == end && split == end) { - other_start = 0; - other_end = start; - if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino, - bytenr, &other_start, &other_end)) { - path->slots[0]--; + start - key.offset); + path->slots[0]++; + new_key.offset = start; + btrfs_set_item_key_safe(trans, root, path, &new_key); + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - - other_start); - goto done; + btrfs_set_file_extent_num_bytes(leaf, fi, + other_end - start); + btrfs_set_file_extent_offset(leaf, fi, + start - orig_offset); + btrfs_mark_buffer_dirty(leaf); + goto out; } } - btrfs_mark_buffer_dirty(leaf); + while (start > key.offset || end < extent_end) { + if (key.offset == start) + split = end; - ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, - root->root_key.objectid, - inode->i_ino, orig_offset); - BUG_ON(ret); - btrfs_release_path(root, path); + new_key.offset = split; + ret = btrfs_duplicate_item(trans, root, path, &new_key); + if (ret == -EAGAIN) { + btrfs_release_path(root, path); + goto again; + } + BUG_ON(ret < 0); - key.offset = start; - ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi)); - BUG_ON(ret); + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + split - key.offset); - leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_set_file_extent_type(leaf, fi, extent_type); - btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); - btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset); - btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); - btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); - btrfs_set_file_extent_compression(leaf, fi, 0); - btrfs_set_file_extent_encryption(leaf, fi, 0); - btrfs_set_file_extent_other_encoding(leaf, fi, 0); -done: - btrfs_mark_buffer_dirty(leaf); - -release: - btrfs_release_path(root, path); - if (split_end && split == start) { - split = end; - goto again; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - split); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, + root->root_key.objectid, + inode->i_ino, orig_offset); + BUG_ON(ret); + + if (split == start) { + key.offset = start; + } else { + BUG_ON(start != key.offset); + path->slots[0]--; + extent_end = end; + } + recow = 1; + } + + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(root, path); + goto again; + } + extent_end = other_end; + del_slot = path->slots[0] + 1; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + 0, root->root_key.objectid, + inode->i_ino, orig_offset); + BUG_ON(ret); + } + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(root, path); + goto again; + } + key.offset = other_start; + del_slot = path->slots[0]; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + 0, root->root_key.objectid, + inode->i_ino, orig_offset); + BUG_ON(ret); } - if (locked_end > end) { - unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, - GFP_NOFS); + if (del_nr == 0) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_mark_buffer_dirty(leaf); + } else { + fi = btrfs_item_ptr(leaf, del_slot - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - key.offset); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); } +out: btrfs_free_path(path); return 0; } @@ -1210,7 +1135,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) } mutex_lock(&dentry->d_inode->i_mutex); out: - return ret > 0 ? EIO : ret; + return ret > 0 ? -EIO : ret; } static const struct vm_operations_struct btrfs_file_vm_ops = { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b3ad168..4deb280 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -88,13 +88,14 @@ static noinline int cow_file_range(struct inode *inode, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); -static int btrfs_init_inode_security(struct inode *inode, struct inode *dir) +static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) { int err; - err = btrfs_init_acl(inode, dir); + err = btrfs_init_acl(trans, inode, dir); if (!err) - err = btrfs_xattr_security_init(inode, dir); + err = btrfs_xattr_security_init(trans, inode, dir); return err; } @@ -188,8 +189,18 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); + /* + * we're an inline extent, so nobody can + * extend the file past i_size without locking + * a page we already have locked. + * + * We must do any isize and inode updates + * before we unlock the pages. Otherwise we + * could end up racing with unlink. + */ BTRFS_I(inode)->disk_i_size = inode->i_size; btrfs_update_inode(trans, root, inode); + return 0; fail: btrfs_free_path(path); @@ -230,8 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, return 1; } - ret = btrfs_drop_extents(trans, root, inode, start, - aligned_end, aligned_end, start, + ret = btrfs_drop_extents(trans, inode, start, aligned_end, &hint_byte, 1); BUG_ON(ret); @@ -416,7 +426,6 @@ again: start, end, total_compressed, pages); } - btrfs_end_transaction(trans, root); if (ret == 0) { /* * inline extent creation worked, we don't need @@ -430,9 +439,11 @@ again: EXTENT_CLEAR_DELALLOC | EXTENT_CLEAR_ACCOUNTING | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); - ret = 0; + + btrfs_end_transaction(trans, root); goto free_pages_out; } + btrfs_end_transaction(trans, root); } if (will_compress) { @@ -472,7 +483,8 @@ again: nr_pages_ret = 0; /* flag the file so we don't compress in the future */ - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + if (!btrfs_test_opt(root, FORCE_COMPRESS)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } if (will_compress) { *num_added += 1; @@ -543,7 +555,6 @@ static noinline int submit_compressed_extents(struct inode *inode, if (list_empty(&async_cow->extents)) return 0; - trans = btrfs_join_transaction(root, 1); while (!list_empty(&async_cow->extents)) { async_extent = list_entry(async_cow->extents.next, @@ -590,19 +601,15 @@ retry: lock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1, GFP_NOFS); - /* - * here we're doing allocation and writeback of the - * compressed pages - */ - btrfs_drop_extent_cache(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, 0); + trans = btrfs_join_transaction(root, 1); ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, 0, alloc_hint, (u64)-1, &ins, 1); + btrfs_end_transaction(trans, root); + if (ret) { int i; for (i = 0; i < async_extent->nr_pages; i++) { @@ -618,6 +625,14 @@ retry: goto retry; } + /* + * here we're doing allocation and writeback of the + * compressed pages + */ + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + em = alloc_extent_map(GFP_NOFS); em->start = async_extent->start; em->len = async_extent->ram_size; @@ -649,8 +664,6 @@ retry: BTRFS_ORDERED_COMPRESSED); BUG_ON(ret); - btrfs_end_transaction(trans, root); - /* * clear dirty, set writeback and unlock the pages. */ @@ -672,13 +685,11 @@ retry: async_extent->nr_pages); BUG_ON(ret); - trans = btrfs_join_transaction(root, 1); alloc_hint = ins.objectid + ins.offset; kfree(async_extent); cond_resched(); } - btrfs_end_transaction(trans, root); return 0; } @@ -742,6 +753,7 @@ static noinline int cow_file_range(struct inode *inode, EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); + *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; @@ -1596,7 +1608,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_pos, u64 disk_bytenr, u64 disk_num_bytes, u64 num_bytes, u64 ram_bytes, - u64 locked_end, u8 compression, u8 encryption, u16 other_encoding, int extent_type) { @@ -1622,9 +1633,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others. */ - ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, locked_end, - file_pos, &hint, 0); + ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, + &hint, 0); BUG_ON(ret); ins.objectid = inode->i_ino; @@ -1671,24 +1681,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * before we start the transaction. It limits the amount of btree * reads required while inside the transaction. */ -static noinline void reada_csum(struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_ordered_extent *ordered_extent) -{ - struct btrfs_ordered_sum *sum; - u64 bytenr; - - sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum, - list); - bytenr = sum->sums[0].bytenr; - - /* - * we don't care about the results, the point of this search is - * just to get the btree leaves into ram - */ - btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0); -} - /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. @@ -1699,7 +1691,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) struct btrfs_trans_handle *trans; struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_path *path; int compressed = 0; int ret; @@ -1707,46 +1698,32 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) if (!ret) return 0; - /* - * before we join the transaction, try to do some of our IO. - * This will limit the amount of IO that we have to do with - * the transaction running. We're unlikely to need to do any - * IO if the file extents are new, the disk_i_size checks - * covers the most common case. - */ - if (start < BTRFS_I(inode)->disk_i_size) { - path = btrfs_alloc_path(); - if (path) { - ret = btrfs_lookup_file_extent(NULL, root, path, - inode->i_ino, - start, 0); - ordered_extent = btrfs_lookup_ordered_extent(inode, - start); - if (!list_empty(&ordered_extent->list)) { - btrfs_release_path(root, path); - reada_csum(root, path, ordered_extent); - } - btrfs_free_path(path); + ordered_extent = btrfs_lookup_ordered_extent(inode, start); + BUG_ON(!ordered_extent); + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + BUG_ON(!list_empty(&ordered_extent->list)); + ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); + if (!ret) { + trans = btrfs_join_transaction(root, 1); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + btrfs_end_transaction(trans, root); } + goto out; } - trans = btrfs_join_transaction(root, 1); - - if (!ordered_extent) - ordered_extent = btrfs_lookup_ordered_extent(inode, start); - BUG_ON(!ordered_extent); - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) - goto nocow; - lock_extent(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, GFP_NOFS); + trans = btrfs_join_transaction(root, 1); + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compressed = 1; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compressed); - ret = btrfs_mark_extent_written(trans, root, inode, + ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len); @@ -1758,8 +1735,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, - ordered_extent->file_offset + - ordered_extent->len, compressed, 0, 0, BTRFS_FILE_EXTENT_REG); unpin_extent_cache(&BTRFS_I(inode)->extent_tree, @@ -1770,22 +1745,20 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) unlock_extent(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, GFP_NOFS); -nocow: add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); - mutex_lock(&BTRFS_I(inode)->extent_mutex); - btrfs_ordered_update_i_size(inode, ordered_extent); - btrfs_update_inode(trans, root, inode); - btrfs_remove_ordered_extent(inode, ordered_extent); - mutex_unlock(&BTRFS_I(inode)->extent_mutex); - + /* this also removes the ordered extent from the tree */ + btrfs_ordered_update_i_size(inode, 0, ordered_extent); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + btrfs_end_transaction(trans, root); +out: /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); - btrfs_end_transaction(trans, root); return 0; } @@ -2008,6 +1981,54 @@ zeroit: return -EIO; } +struct delayed_iput { + struct list_head list; + struct inode *inode; +}; + +void btrfs_add_delayed_iput(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct delayed_iput *delayed; + + if (atomic_add_unless(&inode->i_count, -1, 1)) + return; + + delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); + delayed->inode = inode; + + spin_lock(&fs_info->delayed_iput_lock); + list_add_tail(&delayed->list, &fs_info->delayed_iputs); + spin_unlock(&fs_info->delayed_iput_lock); +} + +void btrfs_run_delayed_iputs(struct btrfs_root *root) +{ + LIST_HEAD(list); + struct btrfs_fs_info *fs_info = root->fs_info; + struct delayed_iput *delayed; + int empty; + + spin_lock(&fs_info->delayed_iput_lock); + empty = list_empty(&fs_info->delayed_iputs); + spin_unlock(&fs_info->delayed_iput_lock); + if (empty) + return; + + down_read(&root->fs_info->cleanup_work_sem); + spin_lock(&fs_info->delayed_iput_lock); + list_splice_init(&fs_info->delayed_iputs, &list); + spin_unlock(&fs_info->delayed_iput_lock); + + while (!list_empty(&list)) { + delayed = list_entry(list.next, struct delayed_iput, list); + list_del(&delayed->list); + iput(delayed->inode); + kfree(delayed); + } + up_read(&root->fs_info->cleanup_work_sem); +} + /* * This creates an orphan entry for the given inode in case something goes * wrong in the middle of an unlink/truncate. @@ -2080,16 +2101,17 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) struct inode *inode; int ret = 0, nr_unlink = 0, nr_truncate = 0; - path = btrfs_alloc_path(); - if (!path) + if (!xchg(&root->clean_orphans, 0)) return; + + path = btrfs_alloc_path(); + BUG_ON(!path); path->reada = -1; key.objectid = BTRFS_ORPHAN_OBJECTID; btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); key.offset = (u64)-1; - while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { @@ -2834,37 +2856,40 @@ out: * min_type is the minimum key type to truncate down to. If set to 0, this * will kill all the items on this inode, including the INODE_ITEM_KEY. */ -noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode, - u64 new_size, u32 min_type) +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + u64 new_size, u32 min_type) { - int ret; struct btrfs_path *path; - struct btrfs_key key; - struct btrfs_key found_key; - u32 found_type = (u8)-1; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; u64 extent_start = 0; u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; + u64 mask = root->sectorsize - 1; + u32 found_type = (u8)-1; int found_extent; int del_item; int pending_del_nr = 0; int pending_del_slot = 0; int extent_type = -1; int encoding; - u64 mask = root->sectorsize - 1; + int ret; + int err = 0; + + BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); if (root->ref_cows) btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); + path = btrfs_alloc_path(); BUG_ON(!path); path->reada = -1; - /* FIXME, add redo link to tree so we don't leak on crash */ key.objectid = inode->i_ino; key.offset = (u64)-1; key.type = (u8)-1; @@ -2872,17 +2897,17 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, search_again: path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto error; + if (ret < 0) { + err = ret; + goto out; + } if (ret > 0) { /* there are no items in the tree for us to truncate, we're * done */ - if (path->slots[0] == 0) { - ret = 0; - goto error; - } + if (path->slots[0] == 0) + goto out; path->slots[0]--; } @@ -2917,28 +2942,17 @@ search_again: } item_end--; } - if (item_end < new_size) { - if (found_type == BTRFS_DIR_ITEM_KEY) - found_type = BTRFS_INODE_ITEM_KEY; - else if (found_type == BTRFS_EXTENT_ITEM_KEY) - found_type = BTRFS_EXTENT_DATA_KEY; - else if (found_type == BTRFS_EXTENT_DATA_KEY) - found_type = BTRFS_XATTR_ITEM_KEY; - else if (found_type == BTRFS_XATTR_ITEM_KEY) - found_type = BTRFS_INODE_REF_KEY; - else if (found_type) - found_type--; - else + if (found_type > min_type) { + del_item = 1; + } else { + if (item_end < new_size) break; - btrfs_set_key_type(&key, found_type); - goto next; + if (found_key.offset >= new_size) + del_item = 1; + else + del_item = 0; } - if (found_key.offset >= new_size) - del_item = 1; - else - del_item = 0; found_extent = 0; - /* FIXME, shrink the extent if the ref count is only 1 */ if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; @@ -3025,42 +3039,36 @@ delete: inode->i_ino, extent_offset); BUG_ON(ret); } -next: - if (path->slots[0] == 0) { - if (pending_del_nr) - goto del_pending; - btrfs_release_path(root, path); - if (found_type == BTRFS_INODE_ITEM_KEY) - break; - goto search_again; - } - path->slots[0]--; - if (pending_del_nr && - path->slots[0] + 1 != pending_del_slot) { - struct btrfs_key debug; -del_pending: - btrfs_item_key_to_cpu(path->nodes[0], &debug, - pending_del_slot); - ret = btrfs_del_items(trans, root, path, - pending_del_slot, - pending_del_nr); - BUG_ON(ret); - pending_del_nr = 0; + if (found_type == BTRFS_INODE_ITEM_KEY) + break; + + if (path->slots[0] == 0 || + path->slots[0] != pending_del_slot) { + if (root->ref_cows) { + err = -EAGAIN; + goto out; + } + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + BUG_ON(ret); + pending_del_nr = 0; + } btrfs_release_path(root, path); - if (found_type == BTRFS_INODE_ITEM_KEY) - break; goto search_again; + } else { + path->slots[0]--; } } - ret = 0; -error: +out: if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); } btrfs_free_path(path); - return ret; + return err; } /* @@ -3180,10 +3188,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) if (size <= hole_start) return 0; - err = btrfs_truncate_page(inode->i_mapping, inode->i_size); - if (err) - return err; - while (1) { struct btrfs_ordered_extent *ordered; btrfs_wait_ordered_range(inode, hole_start, @@ -3196,9 +3200,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) btrfs_put_ordered_extent(ordered); } - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - cur_offset = hole_start; while (1) { em = btrfs_get_extent(inode, NULL, 0, cur_offset, @@ -3206,40 +3207,120 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) BUG_ON(IS_ERR(em) || !em); last_byte = min(extent_map_end(em), block_end); last_byte = (last_byte + mask) & ~mask; - if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { u64 hint_byte = 0; hole_size = last_byte - cur_offset; - err = btrfs_drop_extents(trans, root, inode, - cur_offset, - cur_offset + hole_size, - block_end, - cur_offset, &hint_byte, 1); - if (err) - break; - err = btrfs_reserve_metadata_space(root, 1); + err = btrfs_reserve_metadata_space(root, 2); if (err) break; + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + err = btrfs_drop_extents(trans, inode, cur_offset, + cur_offset + hole_size, + &hint_byte, 1); + BUG_ON(err); + err = btrfs_insert_file_extent(trans, root, inode->i_ino, cur_offset, 0, 0, hole_size, 0, hole_size, 0, 0, 0); + BUG_ON(err); + btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0); - btrfs_unreserve_metadata_space(root, 1); + + btrfs_end_transaction(trans, root); + btrfs_unreserve_metadata_space(root, 2); } free_extent_map(em); cur_offset = last_byte; - if (err || cur_offset >= block_end) + if (cur_offset >= block_end) break; } - btrfs_end_transaction(trans, root); unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); return err; } +static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + unsigned long nr; + int ret; + + if (attr->ia_size == inode->i_size) + return 0; + + if (attr->ia_size > inode->i_size) { + unsigned long limit; + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (attr->ia_size > inode->i_sb->s_maxbytes) + return -EFBIG; + if (limit != RLIM_INFINITY && attr->ia_size > limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + } + + ret = btrfs_reserve_metadata_space(root, 1); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_orphan_add(trans, inode); + BUG_ON(ret); + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_unreserve_metadata_space(root, 1); + btrfs_btree_balance_dirty(root, nr); + + if (attr->ia_size > inode->i_size) { + ret = btrfs_cont_expand(inode, attr->ia_size); + if (ret) { + btrfs_truncate(inode); + return ret; + } + + i_size_write(inode, attr->ia_size); + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + if (inode->i_nlink > 0) { + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + } + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + return 0; + } + + /* + * We're truncating a file that used to have good data down to + * zero. Make sure it gets into the ordered flush list so that + * any new writes get down to disk quickly. + */ + if (attr->ia_size == 0) + BTRFS_I(inode)->ordered_data_close = 1; + + /* we don't support swapfiles, so vmtruncate shouldn't fail */ + ret = vmtruncate(inode, attr->ia_size); + BUG_ON(ret); + + return 0; +} + static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; @@ -3250,23 +3331,14 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - if (attr->ia_size > inode->i_size) { - err = btrfs_cont_expand(inode, attr->ia_size); - if (err) - return err; - } else if (inode->i_size > 0 && - attr->ia_size == 0) { - - /* we're truncating a file that used to have good - * data down to zero. Make sure it gets into - * the ordered flush list so that any new writes - * get down to disk quickly. - */ - BTRFS_I(inode)->ordered_data_close = 1; - } + err = btrfs_setattr_size(inode, attr); + if (err) + return err; } + attr->ia_valid &= ~ATTR_SIZE; - err = inode_setattr(inode, attr); + if (attr->ia_valid) + err = inode_setattr(inode, attr); if (!err && ((attr->ia_valid & ATTR_MODE))) err = btrfs_acl_chmod(inode); @@ -3287,36 +3359,43 @@ void btrfs_delete_inode(struct inode *inode) } btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (root->fs_info->log_root_recovering) { + BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); + goto no_delete; + } + if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0); goto no_delete; } btrfs_i_size_write(inode, 0); - trans = btrfs_join_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0); - if (ret) { - btrfs_orphan_del(NULL, inode); - goto no_delete_lock; - } + while (1) { + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); - btrfs_orphan_del(trans, inode); + if (ret != -EAGAIN) + break; - nr = trans->blocks_used; - clear_inode(inode); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + trans = NULL; + btrfs_btree_balance_dirty(root, nr); + } - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - return; + if (ret == 0) { + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + } -no_delete_lock: nr = trans->blocks_used; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); no_delete: clear_inode(inode); + return; } /* @@ -3569,7 +3648,6 @@ static noinline void init_btrfs_i(struct inode *inode) INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); - mutex_init(&BTRFS_I(inode)->extent_mutex); mutex_init(&BTRFS_I(inode)->log_mutex); } @@ -3695,6 +3773,13 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) } srcu_read_unlock(&root->fs_info->subvol_srcu, index); + if (root != sub_root) { + down_read(&root->fs_info->cleanup_work_sem); + if (!(inode->i_sb->s_flags & MS_RDONLY)) + btrfs_orphan_cleanup(sub_root); + up_read(&root->fs_info->cleanup_work_sem); + } + return inode; } @@ -3869,7 +3954,11 @@ skip: /* Reached end of directory/root. Bump pos past the last item. */ if (key_type == BTRFS_DIR_INDEX_KEY) - filp->f_pos = INT_LIMIT(off_t); + /* + * 32-bit glibc will use getdents64, but then strtol - + * so the last number we can serve is this. + */ + filp->f_pos = 0x7fffffff; else filp->f_pos++; nopos: @@ -4219,7 +4308,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(inode, dir); + err = btrfs_init_inode_security(trans, inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -4290,7 +4379,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(inode, dir); + err = btrfs_init_inode_security(trans, inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -4336,6 +4425,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink == 0) return -ENOENT; + /* do not allow sys_link's with other subvols of the same device */ + if (root->objectid != BTRFS_I(inode)->root->objectid) + return -EPERM; + /* * 1 item for inode ref * 2 items for dir items @@ -4423,7 +4516,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) drop_on_err = 1; - err = btrfs_init_inode_security(inode, dir); + err = btrfs_init_inode_security(trans, inode, dir); if (err) goto out_fail; @@ -5074,17 +5167,20 @@ static void btrfs_truncate(struct inode *inode) unsigned long nr; u64 mask = root->sectorsize - 1; - if (!S_ISREG(inode->i_mode)) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + if (!S_ISREG(inode->i_mode)) { + WARN_ON(1); return; + } ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); if (ret) return; + btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); /* * setattr is responsible for setting the ordered_data_close flag, @@ -5106,21 +5202,32 @@ static void btrfs_truncate(struct inode *inode) if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) btrfs_add_ordered_operation(trans, root, inode); - btrfs_set_trans_block_group(trans, inode); - btrfs_i_size_write(inode, inode->i_size); + while (1) { + ret = btrfs_truncate_inode_items(trans, root, inode, + inode->i_size, + BTRFS_EXTENT_DATA_KEY); + if (ret != -EAGAIN) + break; - ret = btrfs_orphan_add(trans, inode); - if (ret) - goto out; - /* FIXME, add redo link to tree so we don't leak on crash */ - ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, - BTRFS_EXTENT_DATA_KEY); - btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); - ret = btrfs_orphan_del(trans, inode); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + } + + if (ret == 0 && inode->i_nlink > 0) { + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + } + + ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); -out: nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); BUG_ON(ret); @@ -5217,9 +5324,9 @@ void btrfs_destroy_inode(struct inode *inode) spin_lock(&root->list_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { - printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" - " list\n", inode->i_ino); - dump_stack(); + printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", + inode->i_ino); + list_del_init(&BTRFS_I(inode)->i_orphan); } spin_unlock(&root->list_lock); @@ -5476,7 +5583,7 @@ out_fail: * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -int btrfs_start_delalloc_inodes(struct btrfs_root *root) +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { struct list_head *head = &root->fs_info->delalloc_inodes; struct btrfs_inode *binode; @@ -5495,7 +5602,10 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root) spin_unlock(&root->fs_info->delalloc_lock); if (inode) { filemap_flush(inode->i_mapping); - iput(inode); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); } cond_resched(); spin_lock(&root->fs_info->delalloc_lock); @@ -5569,7 +5679,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(inode, dir); + err = btrfs_init_inode_security(trans, inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -5641,57 +5751,77 @@ out_fail: return err; } -static int prealloc_file_range(struct btrfs_trans_handle *trans, - struct inode *inode, u64 start, u64 end, - u64 locked_end, u64 alloc_hint, int mode) +static int prealloc_file_range(struct inode *inode, u64 start, u64 end, + u64 alloc_hint, int mode, loff_t actual_len) { + struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 alloc_size; u64 cur_offset = start; u64 num_bytes = end - start; int ret = 0; + u64 i_size; while (num_bytes > 0) { alloc_size = min(num_bytes, root->fs_info->max_extent); - ret = btrfs_reserve_metadata_space(root, 1); - if (ret) - goto out; + trans = btrfs_start_transaction(root, 1); ret = btrfs_reserve_extent(trans, root, alloc_size, root->sectorsize, 0, alloc_hint, (u64)-1, &ins, 1); if (ret) { WARN_ON(1); - goto out; + goto stop_trans; } + + ret = btrfs_reserve_metadata_space(root, 3); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, + ins.offset); + goto stop_trans; + } + ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, - ins.offset, locked_end, - 0, 0, 0, + ins.offset, 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); BUG_ON(ret); btrfs_drop_extent_cache(inode, cur_offset, cur_offset + ins.offset -1, 0); + num_bytes -= ins.offset; cur_offset += ins.offset; alloc_hint = ins.objectid + ins.offset; - btrfs_unreserve_metadata_space(root, 1); - } -out: - if (cur_offset > start) { + inode->i_ctime = CURRENT_TIME; BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && - cur_offset > i_size_read(inode)) - btrfs_i_size_write(inode, cur_offset); + (actual_len > inode->i_size) && + (cur_offset > inode->i_size)) { + + if (cur_offset > actual_len) + i_size = actual_len; + else + i_size = cur_offset; + i_size_write(inode, i_size); + btrfs_ordered_update_i_size(inode, i_size, NULL); + } + ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); + + btrfs_end_transaction(trans, root); + btrfs_unreserve_metadata_space(root, 3); } + return ret; +stop_trans: + btrfs_end_transaction(trans, root); return ret; + } static long btrfs_fallocate(struct inode *inode, int mode, @@ -5705,8 +5835,6 @@ static long btrfs_fallocate(struct inode *inode, int mode, u64 locked_end; u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; - struct btrfs_trans_handle *trans; - struct btrfs_root *root; int ret; alloc_start = offset & ~mask; @@ -5725,9 +5853,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } - root = BTRFS_I(inode)->root; - - ret = btrfs_check_data_free_space(root, inode, + ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, alloc_end - alloc_start); if (ret) goto out; @@ -5736,12 +5862,6 @@ static long btrfs_fallocate(struct inode *inode, int mode, while (1) { struct btrfs_ordered_extent *ordered; - trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); - if (!trans) { - ret = -EIO; - goto out_free; - } - /* the extent lock is ordered inside the running * transaction */ @@ -5755,8 +5875,6 @@ static long btrfs_fallocate(struct inode *inode, int mode, btrfs_put_ordered_extent(ordered); unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, GFP_NOFS); - btrfs_end_transaction(trans, BTRFS_I(inode)->root); - /* * we can't wait on the range with the transaction * running or with the extent lock held @@ -5777,10 +5895,12 @@ static long btrfs_fallocate(struct inode *inode, int mode, BUG_ON(IS_ERR(em) || !em); last_byte = min(extent_map_end(em), alloc_end); last_byte = (last_byte + mask) & ~mask; - if (em->block_start == EXTENT_MAP_HOLE) { - ret = prealloc_file_range(trans, inode, cur_offset, - last_byte, locked_end + 1, - alloc_hint, mode); + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = prealloc_file_range(inode, + cur_offset, last_byte, + alloc_hint, mode, offset+len); if (ret < 0) { free_extent_map(em); break; @@ -5799,9 +5919,8 @@ static long btrfs_fallocate(struct inode *inode, int mode, unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, GFP_NOFS); - btrfs_end_transaction(trans, BTRFS_I(inode)->root); -out_free: - btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start); + btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, + alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cdbb054..645a179 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -237,7 +237,6 @@ static noinline int create_subvol(struct btrfs_root *root, u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; - unsigned long nr = 1; /* * 1 - inode item @@ -290,7 +289,7 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_set_root_generation(&root_item, trans->transid); btrfs_set_root_level(&root_item, 0); btrfs_set_root_refs(&root_item, 1); - btrfs_set_root_used(&root_item, 0); + btrfs_set_root_used(&root_item, leaf->len); btrfs_set_root_last_snapshot(&root_item, 0); memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); @@ -342,24 +341,21 @@ static noinline int create_subvol(struct btrfs_root *root, d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: - nr = trans->blocks_used; err = btrfs_commit_transaction(trans, root); if (err && !ret) ret = err; btrfs_unreserve_metadata_space(root, 6); - btrfs_btree_balance_dirty(root, nr); return ret; } static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen) { + struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; struct btrfs_trans_handle *trans; - int ret = 0; - int err; - unsigned long nr = 0; + int ret; if (!root->ref_cows) return -EINVAL; @@ -372,20 +368,20 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, */ ret = btrfs_reserve_metadata_space(root, 6); if (ret) - goto fail_unlock; + goto fail; pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) { ret = -ENOMEM; btrfs_unreserve_metadata_space(root, 6); - goto fail_unlock; + goto fail; } pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); if (!pending_snapshot->name) { ret = -ENOMEM; kfree(pending_snapshot); btrfs_unreserve_metadata_space(root, 6); - goto fail_unlock; + goto fail; } memcpy(pending_snapshot->name, name, namelen); pending_snapshot->name[namelen] = '\0'; @@ -395,10 +391,19 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, pending_snapshot->root = root; list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); - err = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + btrfs_unreserve_metadata_space(root, 6); -fail_unlock: - btrfs_btree_balance_dirty(root, nr); + inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto fail; + } + BUG_ON(!inode); + d_instantiate(dentry, inode); + ret = 0; +fail: return ret; } @@ -1027,8 +1032,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, BUG_ON(!trans); /* punch hole in destination first */ - btrfs_drop_extents(trans, root, inode, off, off + len, - off + len, 0, &hint_byte, 1); + btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1); /* clone data */ key.objectid = src->i_ino; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 5799bc4..5c2a9e7 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -291,16 +291,16 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) /* * remove an ordered extent from the tree. No references are dropped - * but, anyone waiting on this extent is woken up. + * and you must wake_up entry->wait. You must hold the tree mutex + * while you call this function. */ -int btrfs_remove_ordered_extent(struct inode *inode, +static int __btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; tree = &BTRFS_I(inode)->ordered_tree; - mutex_lock(&tree->mutex); node = &entry->rb_node; rb_erase(node, &tree->tree); tree->last = NULL; @@ -326,16 +326,34 @@ int btrfs_remove_ordered_extent(struct inode *inode, } spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + return 0; +} + +/* + * remove an ordered extent from the tree. No references are dropped + * but any waiters are woken. + */ +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) +{ + struct btrfs_ordered_inode_tree *tree; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + ret = __btrfs_remove_ordered_extent(inode, entry); mutex_unlock(&tree->mutex); wake_up(&entry->wait); - return 0; + + return ret; } /* * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) +int btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput) { struct list_head splice; struct list_head *cur; @@ -372,7 +390,10 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) if (inode) { btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); - iput(inode); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); } else { btrfs_put_ordered_extent(ordered); } @@ -430,7 +451,7 @@ again: btrfs_wait_ordered_range(inode, 0, (u64)-1); else filemap_flush(inode->i_mapping); - iput(inode); + btrfs_add_delayed_iput(inode); } cond_resched(); @@ -589,7 +610,7 @@ out: * After an extent is done, call this to conditionally update the on disk * i_size. i_size is updated to cover any fully written part of the file. */ -int btrfs_ordered_update_i_size(struct inode *inode, +int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered) { struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; @@ -597,18 +618,32 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 disk_i_size; u64 new_i_size; u64 i_size_test; + u64 i_size = i_size_read(inode); struct rb_node *node; + struct rb_node *prev = NULL; struct btrfs_ordered_extent *test; + int ret = 1; + + if (ordered) + offset = entry_end(ordered); + else + offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); mutex_lock(&tree->mutex); disk_i_size = BTRFS_I(inode)->disk_i_size; + /* truncate file */ + if (disk_i_size > i_size) { + BTRFS_I(inode)->disk_i_size = i_size; + ret = 0; + goto out; + } + /* * if the disk i_size is already at the inode->i_size, or * this ordered extent is inside the disk i_size, we're done */ - if (disk_i_size >= inode->i_size || - ordered->file_offset + ordered->len <= disk_i_size) { + if (disk_i_size == i_size || offset <= disk_i_size) { goto out; } @@ -616,8 +651,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, * we can't update the disk_isize if there are delalloc bytes * between disk_i_size and this ordered extent */ - if (test_range_bit(io_tree, disk_i_size, - ordered->file_offset + ordered->len - 1, + if (test_range_bit(io_tree, disk_i_size, offset - 1, EXTENT_DELALLOC, 0, NULL)) { goto out; } @@ -626,20 +660,32 @@ int btrfs_ordered_update_i_size(struct inode *inode, * if we find an ordered extent then we can't update disk i_size * yet */ - node = &ordered->rb_node; - while (1) { - node = rb_prev(node); - if (!node) - break; + if (ordered) { + node = rb_prev(&ordered->rb_node); + } else { + prev = tree_search(tree, offset); + /* + * we insert file extents without involving ordered struct, + * so there should be no ordered struct cover this offset + */ + if (prev) { + test = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + BUG_ON(offset_in_entry(test, offset)); + } + node = prev; + } + while (node) { test = rb_entry(node, struct btrfs_ordered_extent, rb_node); if (test->file_offset + test->len <= disk_i_size) break; - if (test->file_offset >= inode->i_size) + if (test->file_offset >= i_size) break; if (test->file_offset >= disk_i_size) goto out; + node = rb_prev(node); } - new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode)); + new_i_size = min_t(u64, offset, i_size); /* * at this point, we know we can safely update i_size to at least @@ -647,7 +693,14 @@ int btrfs_ordered_update_i_size(struct inode *inode, * walk forward and see if ios from higher up in the file have * finished. */ - node = rb_next(&ordered->rb_node); + if (ordered) { + node = rb_next(&ordered->rb_node); + } else { + if (prev) + node = rb_next(prev); + else + node = rb_first(&tree->tree); + } i_size_test = 0; if (node) { /* @@ -655,10 +708,10 @@ int btrfs_ordered_update_i_size(struct inode *inode, * between our ordered extent and the next one. */ test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (test->file_offset > entry_end(ordered)) + if (test->file_offset > offset) i_size_test = test->file_offset; } else { - i_size_test = i_size_read(inode); + i_size_test = i_size; } /* @@ -667,15 +720,25 @@ int btrfs_ordered_update_i_size(struct inode *inode, * are no delalloc bytes in this area, it is safe to update * disk_i_size to the end of the region. */ - if (i_size_test > entry_end(ordered) && - !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, - EXTENT_DELALLOC, 0, NULL)) { - new_i_size = min_t(u64, i_size_test, i_size_read(inode)); + if (i_size_test > offset && + !test_range_bit(io_tree, offset, i_size_test - 1, + EXTENT_DELALLOC, 0, NULL)) { + new_i_size = min_t(u64, i_size_test, i_size); } BTRFS_I(inode)->disk_i_size = new_i_size; + ret = 0; out: + /* + * we need to remove the ordered extent with the tree lock held + * so that other people calling this function don't find our fully + * processed ordered entry and skip updating the i_size + */ + if (ordered) + __btrfs_remove_ordered_extent(inode, ordered); mutex_unlock(&tree->mutex); - return 0; + if (ordered) + wake_up(&ordered->wait); + return ret; } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f82e874..1fe1282 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -150,12 +150,13 @@ void btrfs_start_ordered_extent(struct inode *inode, int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); -int btrfs_ordered_update_i_size(struct inode *inode, +int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); +int btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput); #endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index cfcc93c..ab7ab53 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1561,6 +1561,20 @@ static int invalidate_extent_cache(struct btrfs_root *root, return 0; } +static void put_inodes(struct list_head *list) +{ + struct inodevec *ivec; + while (!list_empty(list)) { + ivec = list_entry(list->next, struct inodevec, list); + list_del(&ivec->list); + while (ivec->nr > 0) { + ivec->nr--; + iput(ivec->inode[ivec->nr]); + } + kfree(ivec); + } +} + static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key) @@ -1723,6 +1737,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_btree_balance_dirty(root, nr); + /* + * put inodes outside transaction, otherwise we may deadlock. + */ + put_inodes(&inode_list); + if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); } @@ -1752,19 +1771,7 @@ out: btrfs_btree_balance_dirty(root, nr); - /* - * put inodes while we aren't holding the tree locks - */ - while (!list_empty(&inode_list)) { - struct inodevec *ivec; - ivec = list_entry(inode_list.next, struct inodevec, list); - list_del(&ivec->list); - while (ivec->nr > 0) { - ivec->nr--; - iput(ivec->inode[ivec->nr]); - } - kfree(ivec); - } + put_inodes(&inode_list); if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); @@ -3274,8 +3281,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) return -ENOMEM; path = btrfs_alloc_path(); - if (!path) + if (!path) { + kfree(cluster); return -ENOMEM; + } rc->extents_found = 0; rc->extents_skipped = 0; @@ -3534,8 +3543,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) (unsigned long long)rc->block_group->key.objectid, (unsigned long long)rc->block_group->flags); - btrfs_start_delalloc_inodes(fs_info->tree_root); - btrfs_wait_ordered_extents(fs_info->tree_root, 0); + btrfs_start_delalloc_inodes(fs_info->tree_root, 0); + btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); while (1) { rc->extents_found = 0; @@ -3755,6 +3764,8 @@ out: BTRFS_DATA_RELOC_TREE_OBJECTID); if (IS_ERR(fs_root)) err = PTR_ERR(fs_root); + else + btrfs_orphan_cleanup(fs_root); } return err; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 752a546..8a1ea6e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,7 +66,8 @@ enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, - Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, + Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, + Opt_flushoncommit, Opt_discard, Opt_err, }; @@ -82,6 +83,7 @@ static match_table_t tokens = { {Opt_alloc_start, "alloc_start=%s"}, {Opt_thread_pool, "thread_pool=%d"}, {Opt_compress, "compress"}, + {Opt_compress_force, "compress-force"}, {Opt_ssd, "ssd"}, {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd, "nossd"}, @@ -128,6 +130,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *num; int intarg; + int ret = 0; if (!options) return 0; @@ -172,6 +175,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: use compression\n"); btrfs_set_opt(info->mount_opt, COMPRESS); break; + case Opt_compress_force: + printk(KERN_INFO "btrfs: forcing compression\n"); + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); + btrfs_set_opt(info->mount_opt, COMPRESS); + break; case Opt_ssd: printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); btrfs_set_opt(info->mount_opt, SSD); @@ -262,12 +270,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_discard: btrfs_set_opt(info->mount_opt, DISCARD); break; + case Opt_err: + printk(KERN_INFO "btrfs: unrecognized mount option " + "'%s'\n", p); + ret = -EINVAL; + goto out; default: break; } } +out: kfree(options); - return 0; + return ret; } /* @@ -405,8 +419,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_start_delalloc_inodes(root); - btrfs_wait_ordered_extents(root, 0); + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0, 0); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); @@ -450,6 +464,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",notreelog"); if (btrfs_test_opt(root, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); + if (btrfs_test_opt(root, DISCARD)) + seq_puts(seq, ",discard"); if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) seq_puts(seq, ",noacl"); return 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c207e8c..b2acc79 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -333,6 +333,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); + if (throttle) + btrfs_run_delayed_iputs(root); + return 0; } @@ -354,7 +357,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, * those extents are sent to disk but does not wait on them */ int btrfs_write_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages) + struct extent_io_tree *dirty_pages, int mark) { int ret; int err = 0; @@ -367,7 +370,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, while (1) { ret = find_first_extent_bit(dirty_pages, start, &start, &end, - EXTENT_DIRTY); + mark); if (ret) break; while (start <= end) { @@ -413,7 +416,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, * on all the pages and clear them from the dirty pages state tree */ int btrfs_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages) + struct extent_io_tree *dirty_pages, int mark) { int ret; int err = 0; @@ -425,12 +428,12 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, unsigned long index; while (1) { - ret = find_first_extent_bit(dirty_pages, 0, &start, &end, - EXTENT_DIRTY); + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); if (ret) break; - clear_extent_dirty(dirty_pages, start, end, GFP_NOFS); + clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); while (start <= end) { index = start >> PAGE_CACHE_SHIFT; start = (u64)(index + 1) << PAGE_CACHE_SHIFT; @@ -460,13 +463,13 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, * those extents are on disk for transaction or log commit */ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages) + struct extent_io_tree *dirty_pages, int mark) { int ret; int ret2; - ret = btrfs_write_marked_extents(root, dirty_pages); - ret2 = btrfs_wait_marked_extents(root, dirty_pages); + ret = btrfs_write_marked_extents(root, dirty_pages, mark); + ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); return ret || ret2; } @@ -479,7 +482,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, return filemap_write_and_wait(btree_inode->i_mapping); } return btrfs_write_and_wait_marked_extents(root, - &trans->transaction->dirty_pages); + &trans->transaction->dirty_pages, + EXTENT_DIRTY); } /* @@ -497,13 +501,16 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, { int ret; u64 old_root_bytenr; + u64 old_root_used; struct btrfs_root *tree_root = root->fs_info->tree_root; + old_root_used = btrfs_root_used(&root->root_item); btrfs_write_dirty_block_groups(trans, root); while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); - if (old_root_bytenr == root->node->start) + if (old_root_bytenr == root->node->start && + old_root_used == btrfs_root_used(&root->root_item)) break; btrfs_set_root_node(&root->root_item, root->node); @@ -512,6 +519,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, &root->root_item); BUG_ON(ret); + old_root_used = btrfs_root_used(&root->root_item); ret = btrfs_write_dirty_block_groups(trans, root); BUG_ON(ret); } @@ -795,7 +803,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, memcpy(&pending->root_key, &key, sizeof(key)); fail: kfree(new_root_item); - btrfs_unreserve_metadata_space(root, 6); return ret; } @@ -807,7 +814,6 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, u64 index = 0; struct btrfs_trans_handle *trans; struct inode *parent_inode; - struct inode *inode; struct btrfs_root *parent_root; parent_inode = pending->dentry->d_parent->d_inode; @@ -839,8 +845,6 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, BUG_ON(ret); - inode = btrfs_lookup_dentry(parent_inode, pending->dentry); - d_instantiate(pending->dentry, inode); fail: btrfs_end_transaction(trans, fs_info->fs_root); return ret; @@ -994,11 +998,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); if (flush_on_commit) { - btrfs_start_delalloc_inodes(root); - ret = btrfs_wait_ordered_extents(root, 0); + btrfs_start_delalloc_inodes(root, 1); + ret = btrfs_wait_ordered_extents(root, 0, 1); BUG_ON(ret); } else if (snap_pending) { - ret = btrfs_wait_ordered_extents(root, 1); + ret = btrfs_wait_ordered_extents(root, 0, 1); BUG_ON(ret); } @@ -1116,6 +1120,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, current->journal_info = NULL; kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (current != root->fs_info->transaction_kthread) + btrfs_run_delayed_iputs(root); + return ret; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d4e3e7a..93c7ccb 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -107,10 +107,10 @@ void btrfs_throttle(struct btrfs_root *root); int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages); + struct extent_io_tree *dirty_pages, int mark); int btrfs_write_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages); + struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages); + struct extent_io_tree *dirty_pages, int mark); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 741666a..4a9434b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -542,8 +542,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ - ret = btrfs_drop_extents(trans, root, inode, - start, extent_end, extent_end, start, &alloc_hint, 1); + ret = btrfs_drop_extents(trans, inode, start, extent_end, + &alloc_hint, 1); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || @@ -930,6 +930,17 @@ out_nowrite: return 0; } +static int insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + int ret; + ret = btrfs_find_orphan_item(root, offset); + if (ret > 0) + ret = btrfs_insert_orphan_item(trans, root, offset); + return ret; +} + + /* * There are a few corners where the link count of the file can't * be properly maintained during replay. So, instead of adding @@ -997,9 +1008,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, } BTRFS_I(inode)->index_cnt = (u64)-1; - if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) { - ret = replay_dir_deletes(trans, root, NULL, path, - inode->i_ino, 1); + if (inode->i_nlink == 0) { + if (S_ISDIR(inode->i_mode)) { + ret = replay_dir_deletes(trans, root, NULL, path, + inode->i_ino, 1); + BUG_ON(ret); + } + ret = insert_orphan_item(trans, root, inode->i_ino); BUG_ON(ret); } btrfs_free_path(path); @@ -1587,7 +1602,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, /* inode keys are done during the first stage */ if (key.type == BTRFS_INODE_ITEM_KEY && wc->stage == LOG_WALK_REPLAY_INODES) { - struct inode *inode; struct btrfs_inode_item *inode_item; u32 mode; @@ -1603,31 +1617,16 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, eb, i, &key); BUG_ON(ret); - /* for regular files, truncate away - * extents past the new EOF + /* for regular files, make sure corresponding + * orhpan item exist. extents past the new EOF + * will be truncated later by orphan cleanup. */ if (S_ISREG(mode)) { - inode = read_one_inode(root, - key.objectid); - BUG_ON(!inode); - - ret = btrfs_truncate_inode_items(wc->trans, - root, inode, inode->i_size, - BTRFS_EXTENT_DATA_KEY); + ret = insert_orphan_item(wc->trans, root, + key.objectid); BUG_ON(ret); - - /* if the nlink count is zero here, the iput - * will free the inode. We bump it to make - * sure it doesn't get freed until the link - * count fixup is done - */ - if (inode->i_nlink == 0) { - btrfs_inc_nlink(inode); - btrfs_update_inode(wc->trans, - root, inode); - } - iput(inode); } + ret = link_to_fixup_dir(wc->trans, root, path, key.objectid); BUG_ON(ret); @@ -1977,10 +1976,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, { int index1; int index2; + int mark; int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; - u64 log_transid = 0; + unsigned long log_transid = 0; mutex_lock(&root->log_mutex); index1 = root->log_transid % 2; @@ -2014,24 +2014,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } + log_transid = root->log_transid; + if (log_transid % 2 == 0) + mark = EXTENT_DIRTY; + else + mark = EXTENT_NEW; + /* we start IO on all the marked extents here, but we don't actually * wait for them until later. */ - ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); + ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); BUG_ON(ret); btrfs_set_root_node(&log->root_item, log->node); root->log_batch = 0; - log_transid = root->log_transid; root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; smp_mb(); /* - * log tree has been flushed to disk, new modifications of - * the log will be written to new positions. so it's safe to - * allow log writers to go in. + * IO has been started, blocks of the log tree have WRITTEN flag set + * in their headers. new modifications of the log will be written to + * new positions. so it's safe to allow log writers to go in. */ mutex_unlock(&root->log_mutex); @@ -2052,7 +2057,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { - btrfs_wait_marked_extents(log, &log->dirty_log_pages); + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2072,16 +2077,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * check the full commit flag again */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { - btrfs_wait_marked_extents(log, &log->dirty_log_pages); + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; } ret = btrfs_write_and_wait_marked_extents(log_root_tree, - &log_root_tree->dirty_log_pages); + &log_root_tree->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); BUG_ON(ret); - btrfs_wait_marked_extents(log, &log->dirty_log_pages); + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_set_super_log_root(&root->fs_info->super_for_commit, log_root_tree->node->start); @@ -2147,12 +2153,12 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY); + 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); if (ret) break; - clear_extent_dirty(&log->dirty_log_pages, - start, end, GFP_NOFS); + clear_extent_bits(&log->dirty_log_pages, start, end, + EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); } if (log->log_transid > 0) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7eda483..41ecbb2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1135,7 +1135,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->avail_metadata_alloc_bits; if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && - root->fs_info->fs_devices->rw_devices <= 4) { + root->fs_info->fs_devices->num_devices <= 4) { printk(KERN_ERR "btrfs: unable to go below four devices " "on raid10\n"); ret = -EINVAL; @@ -1143,7 +1143,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && - root->fs_info->fs_devices->rw_devices <= 2) { + root->fs_info->fs_devices->num_devices <= 2) { printk(KERN_ERR "btrfs: unable to go below two " "devices on raid1\n"); ret = -EINVAL; @@ -1434,8 +1434,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) return -EINVAL; bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); - if (!bdev) - return -EIO; + if (IS_ERR(bdev)) + return PTR_ERR(bdev); if (root->fs_info->fs_devices->seeding) { seeding_dev = 1; @@ -2209,7 +2209,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size = 10 * calc_size; min_stripe_size = 64 * 1024 * 1024; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - max_chunk_size = 4 * calc_size; + max_chunk_size = 256 * 1024 * 1024; min_stripe_size = 32 * 1024 * 1024; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { calc_size = 8 * 1024 * 1024; @@ -2538,6 +2538,11 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) if (!em) return 1; + if (btrfs_test_opt(root, DEGRADED)) { + free_extent_map(em); + return 0; + } + map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { if (!map->stripes[i].dev->writeable) { @@ -2649,8 +2654,10 @@ again: em = lookup_extent_mapping(em_tree, logical, *length); read_unlock(&em_tree->lock); - if (!em && unplug_page) + if (!em && unplug_page) { + kfree(multi); return 0; + } if (!em) { printk(KERN_CRIT "unable to find logical %llu len %llu\n", diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index b6dd596..193b58f 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -85,22 +85,23 @@ out: return ret; } -int __btrfs_setxattr(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int do_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) { struct btrfs_dir_item *di; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; struct btrfs_path *path; - int ret = 0, mod = 0; + size_t name_len = strlen(name); + int ret = 0; + + if (name_len + size > BTRFS_MAX_XATTR_SIZE(root)) + return -ENOSPC; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - trans = btrfs_join_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - /* first lets see if we already have this xattr */ di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name, strlen(name), -1); @@ -118,15 +119,12 @@ int __btrfs_setxattr(struct inode *inode, const char *name, } ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) - goto out; + BUG_ON(ret); btrfs_release_path(root, path); /* if we don't have a value then we are removing the xattr */ - if (!value) { - mod = 1; + if (!value) goto out; - } } else { btrfs_release_path(root, path); @@ -138,20 +136,45 @@ int __btrfs_setxattr(struct inode *inode, const char *name, } /* ok we have to create a completely new xattr */ - ret = btrfs_insert_xattr_item(trans, root, name, strlen(name), - value, size, inode->i_ino); + ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino, + name, name_len, value, size); + BUG_ON(ret); +out: + btrfs_free_path(path); + return ret; +} + +int __btrfs_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (trans) + return do_setxattr(trans, inode, name, value, size, flags); + + ret = btrfs_reserve_metadata_space(root, 2); if (ret) - goto out; - mod = 1; + return ret; -out: - if (mod) { - inode->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, root, inode); + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; } + btrfs_set_trans_block_group(trans, inode); - btrfs_end_transaction(trans, root); - btrfs_free_path(path); + ret = do_setxattr(trans, inode, name, value, size, flags); + if (ret) + goto out; + + inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); +out: + btrfs_end_transaction_throttle(trans, root); + btrfs_unreserve_metadata_space(root, 2); return ret; } @@ -314,7 +337,9 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (size == 0) value = ""; /* empty EA, do not remove */ - return __btrfs_setxattr(dentry->d_inode, name, value, size, flags); + + return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size, + flags); } int btrfs_removexattr(struct dentry *dentry, const char *name) @@ -329,10 +354,13 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) if (!btrfs_is_valid_xattr(name)) return -EOPNOTSUPP; - return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); + + return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, + XATTR_REPLACE); } -int btrfs_xattr_security_init(struct inode *inode, struct inode *dir) +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) { int err; size_t len; @@ -354,7 +382,7 @@ int btrfs_xattr_security_init(struct inode *inode, struct inode *dir) } else { strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); - err = __btrfs_setxattr(inode, name, value, len, 0); + err = __btrfs_setxattr(trans, inode, name, value, len, 0); kfree(name); } diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index c71e9c3..721efa0 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -27,15 +27,16 @@ extern struct xattr_handler *btrfs_xattr_handlers[]; extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); -extern int __btrfs_setxattr(struct inode *inode, const char *name, - const void *value, size_t size, int flags); - +extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags); extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size); extern int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); extern int btrfs_removexattr(struct dentry *dentry, const char *name); -extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir); +extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir); #endif /* __XATTR__ */ diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index 3797e00..2906077 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -84,7 +84,7 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) { struct cachefiles_object *fsdef; - struct nameidata nd; + struct path path; struct kstatfs stats; struct dentry *graveyard, *cachedir, *root; const struct cred *saved_cred; @@ -114,15 +114,12 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) _debug("- fsdef %p", fsdef); /* look up the directory at the root of the cache */ - memset(&nd, 0, sizeof(nd)); - - ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd); + ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path); if (ret < 0) goto error_open_root; - cache->mnt = mntget(nd.path.mnt); - root = dget(nd.path.dentry); - path_put(&nd.path); + cache->mnt = path.mnt; + root = path.dentry; /* check parameters */ ret = -EOPNOTSUPP; diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 14ac480..eeb4986 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -348,7 +348,17 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, dir = dget_parent(object->dentry); mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - ret = cachefiles_bury_object(cache, dir, object->dentry); + + /* we need to check that our parent is _still_ our parent - it may have + * been renamed */ + if (dir == object->dentry->d_parent) { + ret = cachefiles_bury_object(cache, dir, object->dentry); + } else { + /* it got moved, presumably by cachefilesd culling it, so it's + * no longer in the key path and we can ignore it */ + mutex_unlock(&dir->d_inode->i_mutex); + ret = 0; + } dput(dir); _leave(" = %d", ret); diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index a6c8c6f..1d83325 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -11,7 +11,6 @@ #include <linux/mount.h> #include <linux/file.h> -#include <linux/ima.h> #include "internal.h" /* @@ -923,7 +922,6 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) if (IS_ERR(file)) { ret = PTR_ERR(file); } else { - ima_counts_get(file); ret = -EIO; if (file->f_op->write) { pos = (loff_t) page->index << PAGE_SHIFT; diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 094ea65..49503d2 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,7 @@ +Version 1.62 +------------ +Add sockopt=TCP_NODELAY mount option. + Version 1.61 ------------ Fix append problem to Samba servers (files opened with O_APPEND could @@ -5,7 +9,9 @@ have duplicated data). Fix oops in cifs_lookup. Workaround problem mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session. Disable use of server inode numbers when server only partially supports them (e.g. for one server querying inode numbers on -FindFirst fails but QPathInfo queries works). +FindFirst fails but QPathInfo queries works). Fix oops with dfs in +cifs_put_smb_ses. Fix mmap to work on directio mounts (needed +for OpenOffice when on forcedirectio mount e.g.) Version 1.60 ------------- diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index fea9e89..b44ce0a 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -269,7 +269,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, int err; mntget(newmnt); - err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags, mntlist); + err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist); switch (err) { case 0: path_put(&nd->path); @@ -371,7 +371,6 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) if (IS_ERR(mnt)) goto out_err; - nd->path.mnt->mnt_flags |= MNT_SHRINKABLE; rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list); out: diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 29f1da7..8c6a036 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -758,7 +758,7 @@ const struct file_operations cifs_file_ops = { }; const struct file_operations cifs_file_direct_ops = { - /* no mmap, no aio, no readv - + /* no aio, no readv - BB reevaluate whether they can be done with directio, no cache */ .read = cifs_user_read, .write = cifs_user_write, @@ -767,6 +767,7 @@ const struct file_operations cifs_file_direct_ops = { .lock = cifs_lock, .fsync = cifs_fsync, .flush = cifs_flush, + .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, #ifdef CONFIG_CIFS_POSIX .unlocked_ioctl = cifs_ioctl, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index ac2b24c..78c1b86 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.61" +#define CIFS_VERSION "1.62" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 4b35f7e..ed751bb 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -149,6 +149,7 @@ struct TCP_Server_Info { bool svlocal:1; /* local server or remote */ bool noblocksnd; /* use blocking sendmsg */ bool noautotune; /* do not autotune send buf sizes */ + bool tcp_nodelay; atomic_t inFlight; /* number of requests on the wire to server */ #ifdef CONFIG_CIFS_STATS2 atomic_t inSend; /* requests trying to send */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 63ea83f..2e9e09c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -98,7 +98,7 @@ struct smb_vol { bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ unsigned int rsize; unsigned int wsize; - unsigned int sockopt; + bool sockopt_tcp_nodelay:1; unsigned short int port; char *prepath; }; @@ -1142,9 +1142,11 @@ cifs_parse_mount_options(char *options, const char *devname, simple_strtoul(value, &value, 0); } } else if (strnicmp(data, "sockopt", 5) == 0) { - if (value && *value) { - vol->sockopt = - simple_strtoul(value, &value, 0); + if (!value || !*value) { + cERROR(1, ("no socket option specified")); + continue; + } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) { + vol->sockopt_tcp_nodelay = 1; } } else if (strnicmp(data, "netbiosname", 4) == 0) { if (!value || !*value || (*value == ' ')) { @@ -1514,6 +1516,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) tcp_ses->noblocksnd = volume_info->noblocksnd; tcp_ses->noautotune = volume_info->noautotune; + tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay; atomic_set(&tcp_ses->inFlight, 0); init_waitqueue_head(&tcp_ses->response_q); init_waitqueue_head(&tcp_ses->request_q); @@ -1764,6 +1767,7 @@ static int ipv4_connect(struct TCP_Server_Info *server) { int rc = 0; + int val; bool connected = false; __be16 orig_port = 0; struct socket *socket = server->ssocket; @@ -1845,6 +1849,14 @@ ipv4_connect(struct TCP_Server_Info *server) socket->sk->sk_rcvbuf = 140 * 1024; } + if (server->tcp_nodelay) { + val = 1; + rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof(val)); + if (rc) + cFYI(1, ("set TCP_NODELAY socket option error %d", rc)); + } + cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx", socket->sk->sk_sndbuf, socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo)); @@ -1916,6 +1928,7 @@ static int ipv6_connect(struct TCP_Server_Info *server) { int rc = 0; + int val; bool connected = false; __be16 orig_port = 0; struct socket *socket = server->ssocket; @@ -1987,6 +2000,15 @@ ipv6_connect(struct TCP_Server_Info *server) */ socket->sk->sk_rcvtimeo = 7 * HZ; socket->sk->sk_sndtimeo = 5 * HZ; + + if (server->tcp_nodelay) { + val = 1; + rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof(val)); + if (rc) + cFYI(1, ("set TCP_NODELAY socket option error %d", rc)); + } + server->ssocket = socket; return rc; @@ -2287,12 +2309,12 @@ int cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, char *mount_data_global, const char *devname) { - int rc = 0; + int rc; int xid; struct smb_vol *volume_info; - struct cifsSesInfo *pSesInfo = NULL; - struct cifsTconInfo *tcon = NULL; - struct TCP_Server_Info *srvTcp = NULL; + struct cifsSesInfo *pSesInfo; + struct cifsTconInfo *tcon; + struct TCP_Server_Info *srvTcp; char *full_path; char *mount_data = mount_data_global; #ifdef CONFIG_CIFS_DFS_UPCALL @@ -2301,6 +2323,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, int referral_walks_count = 0; try_mount_again: #endif + rc = 0; + tcon = NULL; + pSesInfo = NULL; + srvTcp = NULL; full_path = NULL; xid = GetXid(); @@ -2597,6 +2623,7 @@ remote_path_check: cleanup_volume_info(&volume_info); referral_walks_count++; + FreeXid(xid); goto try_mount_again; } #else /* No DFS support, return error on mount */ diff --git a/fs/cifs/export.c b/fs/cifs/export.c index 75949d6..6177f7c 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -24,7 +24,7 @@ */ /* - * See Documentation/filesystems/Exporting + * See Documentation/filesystems/nfs/Exporting * and examples in fs/exportfs * * Since cifs is a network file system, an "fsid" must be included for diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index cf18ee7..e3fda97 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1762,8 +1762,18 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) CIFS_MOUNT_MAP_SPECIAL_CHR); } - if (!rc) + if (!rc) { rc = inode_setattr(inode, attrs); + + /* force revalidate when any of these times are set since some + of the fs types (eg ext3, fat) do not have fine enough + time granularity to match protocol, and we do not have a + a way (yet) to query the server fs's time granularity (and + whether it rounds times down). + */ + if (!rc && (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME))) + cifsInode->time = 0; + } out: kfree(args); kfree(full_path); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index f84062f..c343b14 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -77,6 +77,11 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name, cFYI(1, ("For %s", name->name)); + if (parent->d_op && parent->d_op->d_hash) + parent->d_op->d_hash(parent, name); + else + name->hash = full_name_hash(name->name, name->len); + dentry = d_lookup(parent, name); if (dentry) { /* FIXME: check for inode number changes? */ @@ -666,12 +671,11 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst, min(len, max_len), nlt, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + pqst->len -= nls_nullsize(nlt); } else { pqst->name = filename; pqst->len = len; } - pqst->hash = full_name_hash(pqst->name, pqst->len); -/* cFYI(1, ("filldir on %s",pqst->name)); */ return rc; } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 7085a62..aaa9c1c 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -223,9 +223,9 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses, /* null user mount */ *bcc_ptr = 0; *(bcc_ptr+1) = 0; - } else { /* 300 should be long enough for any conceivable user name */ + } else { bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName, - 300, nls_cp); + MAX_USERNAME_SIZE, nls_cp); } bcc_ptr += 2 * bytes_ret; bcc_ptr += 2; /* account for null termination */ @@ -246,11 +246,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses, /* copy user */ if (ses->userName == NULL) { /* BB what about null user mounts - check that we do this BB */ - } else { /* 300 should be long enough for any conceivable user name */ - strncpy(bcc_ptr, ses->userName, 300); + } else { + strncpy(bcc_ptr, ses->userName, MAX_USERNAME_SIZE); } - /* BB improve check for overflow */ - bcc_ptr += strnlen(ses->userName, 300); + bcc_ptr += strnlen(ses->userName, MAX_USERNAME_SIZE); *bcc_ptr = 0; bcc_ptr++; /* account for null termination */ diff --git a/fs/compat.c b/fs/compat.c index 6c19040..00d90c2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -38,8 +38,6 @@ #include <linux/dirent.h> #include <linux/fsnotify.h> #include <linux/highuid.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> #include <linux/nfsd/syscall.h> #include <linux/personality.h> #include <linux/rwsem.h> diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 14cbc83..0ca9ec4 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -301,6 +301,12 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, u32 data; void __user *dxferp; int err; + int interface_id; + + if (get_user(interface_id, &sgio32->interface_id)) + return -EFAULT; + if (interface_id != 'S') + return sys_ioctl(fd, cmd, (unsigned long)sgio32); if (get_user(iovec_count, &sgio32->iovec_count)) return -EFAULT; @@ -936,6 +942,7 @@ COMPATIBLE_IOCTL(TCSETSF) COMPATIBLE_IOCTL(TIOCLINUX) COMPATIBLE_IOCTL(TIOCSBRK) COMPATIBLE_IOCTL(TIOCCBRK) +COMPATIBLE_IOCTL(TIOCGSID) COMPATIBLE_IOCTL(TIOCGICOUNT) /* Little t */ COMPATIBLE_IOCTL(TIOCGETD) @@ -1005,6 +1012,9 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) #endif +/* Big V (don't complain on serial console) */ +IGNORE_IOCTL(VT_OPENQRY) +IGNORE_IOCTL(VT_GETMODE) /* Little p (/dev/rtc, /dev/envctrl, etc.) */ COMPATIBLE_IOCTL(RTC_AIE_ON) COMPATIBLE_IOCTL(RTC_AIE_OFF) @@ -1035,6 +1045,8 @@ COMPATIBLE_IOCTL(FIOQSIZE) #ifdef CONFIG_BLOCK /* loop */ IGNORE_IOCTL(LOOP_CLR_FD) +/* md calls this on random blockdevs */ +IGNORE_IOCTL(RAID_VERSION) /* SG stuff */ COMPATIBLE_IOCTL(SG_SET_TIMEOUT) COMPATIBLE_IOCTL(SG_GET_TIMEOUT) @@ -1600,8 +1612,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd, case KDSKBMETA: case KDSKBLED: case KDSETLED: - /* SG stuff */ - case SG_SET_TRANSFORM: /* AUTOFS */ case AUTOFS_IOC_READY: case AUTOFS_IOC_FAIL: diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index c8afa6b..32a5f46 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -121,8 +121,10 @@ static int get_target(const char *symname, struct path *path, ret = -ENOENT; path_put(path); } - } else + } else { ret = -EPERM; + path_put(path); + } } return ret; diff --git a/fs/dcache.c b/fs/dcache.c index a100fa3..953173a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -978,6 +978,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name) q.hash = full_name_hash(q.name, q.len); return d_alloc(parent, &q); } +EXPORT_SYMBOL(d_alloc_name); /* the caller must hold dcache_lock */ static void __d_instantiate(struct dentry *dentry, struct inode *inode) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b486169..274ac86 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -160,15 +160,8 @@ static int debugfs_create_by_name(const char *name, mode_t mode, * block. A pointer to that is in the struct vfsmount that we * have around. */ - if (!parent) { - if (debugfs_mount && debugfs_mount->mnt_sb) { - parent = debugfs_mount->mnt_sb->s_root; - } - } - if (!parent) { - pr_debug("debugfs: Ah! can not find a parent!\n"); - return -EFAULT; - } + if (!parent) + parent = debugfs_mount->mnt_sb->s_root; *dentry = NULL; mutex_lock(&parent->d_inode->i_mutex); diff --git a/fs/direct-io.c b/fs/direct-io.c index 4012885..e82adc2 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1206,7 +1206,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, * NOTE: filesystems with their own locking have to handle this * on their own. */ - if (dio->flags & DIO_LOCKING) { + if (flags & DIO_LOCKING) { if (unlikely((rw & WRITE) && retval < 0)) { loff_t isize = i_size_read(inode); if (end > isize) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index fbb6e5e..7cb0a59 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1748,7 +1748,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, char *cipher_name, size_t *key_size) { char dummy_key[ECRYPTFS_MAX_KEY_BYTES]; - char *full_alg_name; + char *full_alg_name = NULL; int rc; *key_tfm = NULL; @@ -1763,7 +1763,6 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, if (rc) goto out; *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC); - kfree(full_alg_name); if (IS_ERR(*key_tfm)) { rc = PTR_ERR(*key_tfm); printk(KERN_ERR "Unable to allocate crypto cipher with name " @@ -1786,6 +1785,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, goto out; } out: + kfree(full_alg_name); return rc; } diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 2dda5ad..8f006a0 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -62,7 +62,7 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) struct inode *lower_inode = ecryptfs_inode_to_lower(dentry->d_inode); - fsstack_copy_attr_all(dentry->d_inode, lower_inode, NULL); + fsstack_copy_attr_all(dentry->d_inode, lower_inode); } out: return rc; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 9e94405..678172b 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -158,7 +158,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) struct dentry *ecryptfs_dentry = file->f_path.dentry; /* Private value of ecryptfs_dentry allocated in * ecryptfs_lookup() */ - struct dentry *lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); + struct dentry *lower_dentry; struct ecryptfs_file_info *file_info; mount_crypt_stat = &ecryptfs_superblock_to_private( @@ -191,13 +191,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) | ECRYPTFS_ENCRYPTED); } mutex_unlock(&crypt_stat->cs_mutex); - if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) - && !(file->f_flags & O_RDONLY)) { - rc = -EPERM; - printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " - "file must hence be opened RO\n", __func__); - goto out; - } if (!ecryptfs_inode_to_private(inode)->lower_file) { rc = ecryptfs_init_persistent_file(ecryptfs_dentry); if (rc) { @@ -208,6 +201,13 @@ static int ecryptfs_open(struct inode *inode, struct file *file) goto out; } } + if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) + && !(file->f_flags & O_RDONLY)) { + rc = -EPERM; + printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " + "file must hence be opened RO\n", __func__); + goto out; + } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { @@ -299,7 +299,6 @@ static int ecryptfs_ioctl(struct inode *inode, struct file *file, const struct file_operations ecryptfs_dir_fops = { .readdir = ecryptfs_readdir, .ioctl = ecryptfs_ioctl, - .mmap = generic_file_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 056fed62..4a430ab 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -282,7 +282,8 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, goto out; } rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, - ecryptfs_dir_inode->i_sb, 1); + ecryptfs_dir_inode->i_sb, + ECRYPTFS_INTERPOSE_FLAG_D_ADD); if (rc) { printk(KERN_ERR "%s: Error interposing; rc = [%d]\n", __func__, rc); @@ -463,9 +464,6 @@ out_lock: unlock_dir(lower_dir_dentry); dput(lower_new_dentry); dput(lower_old_dentry); - d_drop(lower_old_dentry); - d_drop(new_dentry); - d_drop(old_dentry); return rc; } @@ -614,6 +612,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_new_dentry; struct dentry *lower_old_dir_dentry; struct dentry *lower_new_dir_dentry; + struct dentry *trap = NULL; lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); @@ -621,14 +620,24 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, dget(lower_new_dentry); lower_old_dir_dentry = dget_parent(lower_old_dentry); lower_new_dir_dentry = dget_parent(lower_new_dentry); - lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + /* source should not be ancestor of target */ + if (trap == lower_old_dentry) { + rc = -EINVAL; + goto out_lock; + } + /* target should not be ancestor of source */ + if (trap == lower_new_dentry) { + rc = -ENOTEMPTY; + goto out_lock; + } rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, lower_new_dir_dentry->d_inode, lower_new_dentry); if (rc) goto out_lock; - fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode, NULL); + fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); if (new_dir != old_dir) - fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode, NULL); + fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); out_lock: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_new_dentry->d_parent); @@ -715,31 +724,31 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) /* Released in ecryptfs_put_link(); only release here on error */ buf = kmalloc(len, GFP_KERNEL); if (!buf) { - rc = -ENOMEM; + buf = ERR_PTR(-ENOMEM); goto out; } old_fs = get_fs(); set_fs(get_ds()); rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); set_fs(old_fs); - if (rc < 0) - goto out_free; - else + if (rc < 0) { + kfree(buf); + buf = ERR_PTR(rc); + } else buf[rc] = '\0'; - rc = 0; - nd_set_link(nd, buf); - goto out; -out_free: - kfree(buf); out: - return ERR_PTR(rc); + nd_set_link(nd, buf); + return NULL; } static void ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) { - /* Free the char* */ - kfree(nd_get_link(nd)); + char *buf = nd_get_link(nd); + if (!IS_ERR(buf)) { + /* Free the char* */ + kfree(buf); + } } /** @@ -772,18 +781,23 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat, } /** - * ecryptfs_truncate + * truncate_upper * @dentry: The ecryptfs layer dentry - * @new_length: The length to expand the file to + * @ia: Address of the ecryptfs inode's attributes + * @lower_ia: Address of the lower inode's attributes * * Function to handle truncations modifying the size of the file. Note * that the file sizes are interpolated. When expanding, we are simply - * writing strings of 0's out. When truncating, we need to modify the - * underlying file size according to the page index interpolations. + * writing strings of 0's out. When truncating, we truncate the upper + * inode and update the lower_ia according to the page index + * interpolations. If ATTR_SIZE is set in lower_ia->ia_valid upon return, + * the caller must use lower_ia in a call to notify_change() to perform + * the truncation of the lower inode. * * Returns zero on success; non-zero otherwise */ -int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) +static int truncate_upper(struct dentry *dentry, struct iattr *ia, + struct iattr *lower_ia) { int rc = 0; struct inode *inode = dentry->d_inode; @@ -794,8 +808,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) loff_t lower_size_before_truncate; loff_t lower_size_after_truncate; - if (unlikely((new_length == i_size))) + if (unlikely((ia->ia_size == i_size))) { + lower_ia->ia_valid &= ~ATTR_SIZE; goto out; + } crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; /* Set up a fake ecryptfs file, this is used to interface with * the file in the underlying filesystem so that the @@ -815,28 +831,30 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) &fake_ecryptfs_file, ecryptfs_inode_to_private(dentry->d_inode)->lower_file); /* Switch on growing or shrinking file */ - if (new_length > i_size) { + if (ia->ia_size > i_size) { char zero[] = { 0x00 }; + lower_ia->ia_valid &= ~ATTR_SIZE; /* Write a single 0 at the last position of the file; * this triggers code that will fill in 0's throughout * the intermediate portion of the previous end of the * file and the new and of the file */ rc = ecryptfs_write(&fake_ecryptfs_file, zero, - (new_length - 1), 1); - } else { /* new_length < i_size_read(inode) */ - /* We're chopping off all the pages down do the page - * in which new_length is located. Fill in the end of - * that page from (new_length & ~PAGE_CACHE_MASK) to + (ia->ia_size - 1), 1); + } else { /* ia->ia_size < i_size_read(inode) */ + /* We're chopping off all the pages down to the page + * in which ia->ia_size is located. Fill in the end of + * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to * PAGE_CACHE_SIZE with zeros. */ size_t num_zeros = (PAGE_CACHE_SIZE - - (new_length & ~PAGE_CACHE_MASK)); + - (ia->ia_size & ~PAGE_CACHE_MASK)); if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = vmtruncate(inode, new_length); + rc = vmtruncate(inode, ia->ia_size); if (rc) goto out_free; - rc = vmtruncate(lower_dentry->d_inode, new_length); + lower_ia->ia_size = ia->ia_size; + lower_ia->ia_valid |= ATTR_SIZE; goto out_free; } if (num_zeros) { @@ -848,7 +866,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) goto out_free; } rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt, - new_length, num_zeros); + ia->ia_size, num_zeros); kfree(zeros_virt); if (rc) { printk(KERN_ERR "Error attempting to zero out " @@ -857,7 +875,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) goto out_free; } } - vmtruncate(inode, new_length); + vmtruncate(inode, ia->ia_size); rc = ecryptfs_write_inode_size_to_metadata(inode); if (rc) { printk(KERN_ERR "Problem with " @@ -870,10 +888,12 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) lower_size_before_truncate = upper_size_to_lower_size(crypt_stat, i_size); lower_size_after_truncate = - upper_size_to_lower_size(crypt_stat, new_length); - if (lower_size_after_truncate < lower_size_before_truncate) - vmtruncate(lower_dentry->d_inode, - lower_size_after_truncate); + upper_size_to_lower_size(crypt_stat, ia->ia_size); + if (lower_size_after_truncate < lower_size_before_truncate) { + lower_ia->ia_size = lower_size_after_truncate; + lower_ia->ia_valid |= ATTR_SIZE; + } else + lower_ia->ia_valid &= ~ATTR_SIZE; } out_free: if (ecryptfs_file_to_private(&fake_ecryptfs_file)) @@ -883,6 +903,33 @@ out: return rc; } +/** + * ecryptfs_truncate + * @dentry: The ecryptfs layer dentry + * @new_length: The length to expand the file to + * + * Simple function that handles the truncation of an eCryptfs inode and + * its corresponding lower inode. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) +{ + struct iattr ia = { .ia_valid = ATTR_SIZE, .ia_size = new_length }; + struct iattr lower_ia = { .ia_valid = 0 }; + int rc; + + rc = truncate_upper(dentry, &ia, &lower_ia); + if (!rc && lower_ia.ia_valid & ATTR_SIZE) { + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = notify_change(lower_dentry, &lower_ia); + mutex_unlock(&lower_dentry->d_inode->i_mutex); + } + return rc; +} + static int ecryptfs_permission(struct inode *inode, int mask) { @@ -905,6 +952,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) { int rc = 0; struct dentry *lower_dentry; + struct iattr lower_ia; struct inode *inode; struct inode *lower_inode; struct ecryptfs_crypt_stat *crypt_stat; @@ -943,15 +991,11 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) } } mutex_unlock(&crypt_stat->cs_mutex); + memcpy(&lower_ia, ia, sizeof(lower_ia)); + if (ia->ia_valid & ATTR_FILE) + lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file); if (ia->ia_valid & ATTR_SIZE) { - ecryptfs_printk(KERN_DEBUG, - "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n", - ia->ia_valid, ATTR_SIZE); - rc = ecryptfs_truncate(dentry, ia->ia_size); - /* ecryptfs_truncate handles resizing of the lower file */ - ia->ia_valid &= ~ATTR_SIZE; - ecryptfs_printk(KERN_DEBUG, "ia->ia_valid = [%x]\n", - ia->ia_valid); + rc = truncate_upper(dentry, ia, &lower_ia); if (rc < 0) goto out; } @@ -960,14 +1004,29 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) * mode change is for clearing setuid/setgid bits. Allow lower fs * to interpret this in its own way. */ - if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) - ia->ia_valid &= ~ATTR_MODE; + if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) + lower_ia.ia_valid &= ~ATTR_MODE; mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = notify_change(lower_dentry, ia); + rc = notify_change(lower_dentry, &lower_ia); mutex_unlock(&lower_dentry->d_inode->i_mutex); out: - fsstack_copy_attr_all(inode, lower_inode, NULL); + fsstack_copy_attr_all(inode, lower_inode); + return rc; +} + +int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct kstat lower_stat; + int rc; + + rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry), + ecryptfs_dentry_to_lower(dentry), &lower_stat); + if (!rc) { + generic_fillattr(dentry->d_inode, stat); + stat->blocks = lower_stat.blocks; + } return rc; } @@ -1100,6 +1159,7 @@ const struct inode_operations ecryptfs_dir_iops = { const struct inode_operations ecryptfs_main_iops = { .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, + .getattr = ecryptfs_getattr, .setxattr = ecryptfs_setxattr, .getxattr = ecryptfs_getxattr, .listxattr = ecryptfs_listxattr, diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index c6ac85d..ea2f921 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -35,7 +35,6 @@ #include <linux/key.h> #include <linux/parser.h> #include <linux/fs_stack.h> -#include <linux/ima.h> #include "ecryptfs_kernel.h" /** @@ -119,7 +118,6 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) const struct cred *cred = current_cred(); struct ecryptfs_inode_info *inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); - int opened_lower_file = 0; int rc = 0; mutex_lock(&inode_info->lower_file_mutex); @@ -136,12 +134,9 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) "for lower_dentry [0x%p] and lower_mnt [0x%p]; " "rc = [%d]\n", lower_dentry, lower_mnt, rc); inode_info->lower_file = NULL; - } else - opened_lower_file = 1; + } } mutex_unlock(&inode_info->lower_file_mutex); - if (opened_lower_file) - ima_counts_get(inode_info->lower_file); return rc; } @@ -194,7 +189,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, init_special_inode(inode, lower_inode->i_mode, lower_inode->i_rdev); dentry->d_op = &ecryptfs_dops; - fsstack_copy_attr_all(inode, lower_inode, NULL); + fsstack_copy_attr_all(inode, lower_inode); /* This size will be overwritten for real files w/ headers and * other metadata */ fsstack_copy_inode_size(inode, lower_inode); @@ -590,8 +585,8 @@ out: * with as much information as it can before needing * the lower filesystem. * ecryptfs_read_super(): this accesses the lower filesystem and uses - * ecryptfs_interpolate to perform most of the linking - * ecryptfs_interpolate(): links the lower filesystem into ecryptfs + * ecryptfs_interpose to perform most of the linking + * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) */ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, diff --git a/fs/eventfd.c b/fs/eventfd.c index 8b47e42..7758cc3 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -135,26 +135,71 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait) return events; } -static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) +static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +{ + *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + ctx->count -= *cnt; +} + +/** + * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. + * @ctx: [in] Pointer to eventfd context. + * @wait: [in] Wait queue to be removed. + * @cnt: [out] Pointer to the 64bit conter value. + * + * Returns zero if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked. + * + * This is used to atomically remove a wait queue entry from the eventfd wait + * queue head, and read/reset the counter value. + */ +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, + __u64 *cnt) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->wqh.lock, flags); + eventfd_ctx_do_read(ctx, cnt); + __remove_wait_queue(&ctx->wqh, wait); + if (*cnt != 0 && waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLOUT); + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return *cnt != 0 ? 0 : -EAGAIN; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); + +/** + * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero. + * @ctx: [in] Pointer to eventfd context. + * @no_wait: [in] Different from zero if the operation should not block. + * @cnt: [out] Pointer to the 64bit conter value. + * + * Returns zero if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked but @no_wait was nonzero. + * -ERESTARTSYS : A signal interrupted the wait operation. + * + * If @no_wait is zero, the function might sleep until the eventfd internal + * counter becomes greater than zero. + */ +ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt) { - struct eventfd_ctx *ctx = file->private_data; ssize_t res; - __u64 ucnt = 0; DECLARE_WAITQUEUE(wait, current); - if (count < sizeof(ucnt)) - return -EINVAL; spin_lock_irq(&ctx->wqh.lock); + *cnt = 0; res = -EAGAIN; if (ctx->count > 0) - res = sizeof(ucnt); - else if (!(file->f_flags & O_NONBLOCK)) { + res = 0; + else if (!no_wait) { __add_wait_queue(&ctx->wqh, &wait); - for (res = 0;;) { + for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (ctx->count > 0) { - res = sizeof(ucnt); + res = 0; break; } if (signal_pending(current)) { @@ -168,18 +213,32 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } - if (likely(res > 0)) { - ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; - ctx->count -= ucnt; + if (likely(res == 0)) { + eventfd_ctx_do_read(ctx, cnt); if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, POLLOUT); } spin_unlock_irq(&ctx->wqh.lock); - if (res > 0 && put_user(ucnt, (__u64 __user *) buf)) - return -EFAULT; return res; } +EXPORT_SYMBOL_GPL(eventfd_ctx_read); + +static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct eventfd_ctx *ctx = file->private_data; + ssize_t res; + __u64 cnt; + + if (count < sizeof(cnt)) + return -EINVAL; + res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt); + if (res < 0) + return res; + + return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt); +} static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) @@ -339,7 +398,7 @@ struct file *eventfd_file_create(unsigned int count, int flags) ctx->flags = flags; file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, - flags & EFD_SHARED_FCNTL_FLAGS); + O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS)); if (IS_ERR(file)) eventfd_free_ctx(ctx); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 366c503..bd056a5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1206,7 +1206,7 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) * a file structure and a free file descriptor. */ error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, - flags & O_CLOEXEC); + O_RDWR | (flags & O_CLOEXEC)); if (error < 0) ep_free(ep); @@ -571,6 +571,9 @@ int setup_arg_pages(struct linux_binprm *bprm, struct vm_area_struct *prev = NULL; unsigned long vm_flags; unsigned long stack_base; + unsigned long stack_size; + unsigned long stack_expand; + unsigned long rlim_stack; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size to 1GB */ @@ -627,10 +630,23 @@ int setup_arg_pages(struct linux_binprm *bprm, goto out_unlock; } + stack_expand = EXTRA_STACK_VM_PAGES * PAGE_SIZE; + stack_size = vma->vm_end - vma->vm_start; + /* + * Align this down to a page boundary as expand_stack + * will align it up. + */ + rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK; #ifdef CONFIG_STACK_GROWSUP - stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE; + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_start + rlim_stack; + else + stack_base = vma->vm_end + stack_expand; #else - stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE; + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_end - rlim_stack; + else + stack_base = vma->vm_start - stack_expand; #endif ret = expand_stack(vma, stack_base); if (ret) @@ -826,7 +842,9 @@ static int de_thread(struct task_struct *tsk) attach_pid(tsk, PIDTYPE_PID, task_pid(leader)); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); + list_replace_rcu(&leader->tasks, &tsk->tasks); + list_replace_init(&leader->sibling, &tsk->sibling); tsk->group_leader = tsk; leader->group_leader = tsk; @@ -939,9 +957,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) int flush_old_exec(struct linux_binprm * bprm) { - char * name; - int i, ch, retval; - char tcomm[sizeof(current->comm)]; + int retval; /* * Make sure we have a private signal table and that @@ -962,6 +978,25 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ + current->flags &= ~PF_RANDOMIZE; + flush_thread(); + current->personality &= ~bprm->per_clear; + + return 0; + +out: + return retval; +} +EXPORT_SYMBOL(flush_old_exec); + +void setup_new_exec(struct linux_binprm * bprm) +{ + int i, ch; + char * name; + char tcomm[sizeof(current->comm)]; + + arch_pick_mmap_layout(current->mm); + /* This is the point of no return */ current->sas_ss_sp = current->sas_ss_size = 0; @@ -983,9 +1018,6 @@ int flush_old_exec(struct linux_binprm * bprm) tcomm[i] = '\0'; set_task_comm(current, tcomm); - current->flags &= ~PF_RANDOMIZE; - flush_thread(); - /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc @@ -1001,8 +1033,6 @@ int flush_old_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); } - current->personality &= ~bprm->per_clear; - /* * Flush performance counters when crossing a * security domain: @@ -1017,14 +1047,8 @@ int flush_old_exec(struct linux_binprm * bprm) flush_signal_handlers(current, 0); flush_old_files(current->files); - - return 0; - -out: - return retval; } - -EXPORT_SYMBOL(flush_old_exec); +EXPORT_SYMBOL(setup_new_exec); /* * Prepare credentials and lock ->cred_guard_mutex. @@ -1761,17 +1785,20 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) struct mm_struct *mm = current->mm; struct linux_binfmt * binfmt; struct inode * inode; - struct file * file; const struct cred *old_cred; struct cred *cred; int retval = 0; int flag = 0; int ispipe = 0; - unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; char **helper_argv = NULL; int helper_argc = 0; int dump_count = 0; static atomic_t core_dump_count = ATOMIC_INIT(0); + struct coredump_params cprm = { + .signr = signr, + .regs = regs, + .limit = current->signal->rlim[RLIMIT_CORE].rlim_cur, + }; audit_core_dumps(signr); @@ -1827,15 +1854,15 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) ispipe = format_corename(corename, signr); unlock_kernel(); - if ((!ispipe) && (core_limit < binfmt->min_coredump)) + if ((!ispipe) && (cprm.limit < binfmt->min_coredump)) goto fail_unlock; if (ispipe) { - if (core_limit == 0) { + if (cprm.limit == 0) { /* * Normally core limits are irrelevant to pipes, since * we're not writing to the file system, but we use - * core_limit of 0 here as a speacial value. Any + * cprm.limit of 0 here as a speacial value. Any * non-zero limit gets set to RLIM_INFINITY below, but * a limit of 0 skips the dump. This is a consistent * way to catch recursive crashes. We can still crash @@ -1868,25 +1895,25 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) goto fail_dropcount; } - core_limit = RLIM_INFINITY; + cprm.limit = RLIM_INFINITY; /* SIGPIPE can happen, but it's just never processed */ if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, - &file)) { + &cprm.file)) { printk(KERN_INFO "Core dump to %s pipe failed\n", corename); goto fail_dropcount; } } else - file = filp_open(corename, + cprm.file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600); - if (IS_ERR(file)) + if (IS_ERR(cprm.file)) goto fail_dropcount; - inode = file->f_path.dentry->d_inode; + inode = cprm.file->f_path.dentry->d_inode; if (inode->i_nlink > 1) goto close_fail; /* multiple links - don't dump */ - if (!ispipe && d_unhashed(file->f_path.dentry)) + if (!ispipe && d_unhashed(cprm.file->f_path.dentry)) goto close_fail; /* AK: actually i see no reason to not allow this for named pipes etc., @@ -1899,21 +1926,22 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) */ if (inode->i_uid != current_fsuid()) goto close_fail; - if (!file->f_op) + if (!cprm.file->f_op) goto close_fail; - if (!file->f_op->write) + if (!cprm.file->f_op->write) goto close_fail; - if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0) + if (!ispipe && + do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0) goto close_fail; - retval = binfmt->core_dump(signr, regs, file, core_limit); + retval = binfmt->core_dump(&cprm); if (retval) current->signal->group_exit_code |= 0x80; close_fail: if (ispipe && core_pipe_limit) - wait_for_dump_helpers(file); - filp_close(file, NULL); + wait_for_dump_helpers(cprm.file); + filp_close(cprm.file, NULL); fail_dropcount: if (dump_count) atomic_dec(&core_dump_count); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 698a863..2afbceb 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -738,13 +738,28 @@ static int exofs_write_begin_export(struct file *file, fsdata); } +static int exofs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + /* According to comment in simple_write_end i_mutex is held */ + loff_t i_size = inode->i_size; + int ret; + + ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata); + if (i_size != inode->i_size) + mark_inode_dirty(inode); + return ret; +} + const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, .writepage = exofs_writepage, .writepages = exofs_writepages, .write_begin = exofs_write_begin_export, - .write_end = simple_write_end, + .write_end = exofs_write_end, }; /****************************************************************************** diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h index 423033a..c52e988 100644 --- a/fs/exofs/pnfs.h +++ b/fs/exofs/pnfs.h @@ -15,13 +15,7 @@ #ifndef __EXOFS_PNFS_H__ #define __EXOFS_PNFS_H__ -#if defined(CONFIG_PNFS) - - -/* FIXME: move this file to: linux/exportfs/pnfs_osd_xdr.h */ -#include "../nfs/objlayout/pnfs_osd_xdr.h" - -#else /* defined(CONFIG_PNFS) */ +#if ! defined(__PNFS_OSD_XDR_H__) enum pnfs_iomode { IOMODE_READ = 1, @@ -46,6 +40,6 @@ struct pnfs_osd_data_map { u32 odm_raid_algorithm; }; -#endif /* else defined(CONFIG_PNFS) */ +#endif /* ! defined(__PNFS_OSD_XDR_H__) */ #endif /* __EXOFS_PNFS_H__ */ diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 197c7db..e9e1759 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -6,7 +6,7 @@ * and for mapping back from file handles to dentries. * * For details on why we do all the strange and hairy things in here - * take a look at Documentation/filesystems/Exporting. + * take a look at Documentation/filesystems/nfs/Exporting. */ #include <linux/exportfs.h> #include <linux/fs.h> diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index a63d442..a99e543 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -339,12 +339,12 @@ ext2_acl_chmod(struct inode *inode) * Extended attribut handlers */ static size_t -ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_size) memcpy(list, POSIX_ACL_XATTR_ACCESS, size); @@ -352,12 +352,12 @@ ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size, } static size_t -ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_size) memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); @@ -365,15 +365,18 @@ ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size, } static int -ext2_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { struct posix_acl *acl; int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) return -EOPNOTSUPP; - acl = ext2_get_acl(inode, type); + acl = ext2_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -385,33 +388,17 @@ ext2_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) } static int -ext2_xattr_get_acl_access(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int -ext2_xattr_get_acl_default(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int -ext2_xattr_set_acl(struct inode *inode, int type, const void *value, - size_t size) +ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { struct posix_acl *acl; int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) + if (!is_owner_or_cap(dentry->d_inode)) return -EPERM; if (value) { @@ -426,41 +413,25 @@ ext2_xattr_set_acl(struct inode *inode, int type, const void *value, } else acl = NULL; - error = ext2_set_acl(inode, type, acl); + error = ext2_set_acl(dentry->d_inode, type, acl); release_and_out: posix_acl_release(acl); return error; } -static int -ext2_xattr_set_acl_access(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int -ext2_xattr_set_acl_default(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - struct xattr_handler ext2_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, .list = ext2_xattr_list_acl_access, - .get = ext2_xattr_get_acl_access, - .set = ext2_xattr_set_acl_access, + .get = ext2_xattr_get_acl, + .set = ext2_xattr_set_acl, }; struct xattr_handler ext2_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, .list = ext2_xattr_list_acl_default, - .get = ext2_xattr_get_acl_default, - .set = ext2_xattr_set_acl_default, + .get = ext2_xattr_get_acl, + .set = ext2_xattr_set_acl, }; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 7913531..904f006 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -60,6 +60,7 @@ #include <linux/mbcache.h> #include <linux/quotaops.h> #include <linux/rwsem.h> +#include <linux/security.h> #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -249,8 +250,9 @@ cleanup: * used / required on success. */ static int -ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) +ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; struct ext2_xattr_entry *entry; char *end; @@ -300,9 +302,10 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", ext2_xattr_handler(entry->e_name_index); if (handler) { - size_t size = handler->list(inode, buffer, rest, + size_t size = handler->list(dentry, buffer, rest, entry->e_name, - entry->e_name_len); + entry->e_name_len, + handler->flags); if (buffer) { if (size > rest) { error = -ERANGE; @@ -330,7 +333,7 @@ cleanup: ssize_t ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) { - return ext2_xattr_list(dentry->d_inode, buffer, size); + return ext2_xattr_list(dentry, buffer, size); } /* diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 70c0dbd..c815584 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -11,8 +11,8 @@ #include "xattr.h" static size_t -ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const int prefix_len = XATTR_SECURITY_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -26,22 +26,22 @@ ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size, } static int -ext2_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext2_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_get(inode, EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, buffer, size); } static int -ext2_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext2_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index e8219f8..2a26d71 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -13,8 +13,8 @@ #include "xattr.h" static size_t -ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const int prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -31,22 +31,22 @@ ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, } static int -ext2_xattr_trusted_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext2_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_get(inode, EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, buffer, size); } static int -ext2_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext2_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 92495d2..3f6caf3 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -12,13 +12,13 @@ #include "xattr.h" static size_t -ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return 0; if (list && total_len <= list_size) { @@ -30,27 +30,28 @@ ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size, } static int -ext2_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext2_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, buffer, size); + return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_USER, + name, buffer, size); } static int -ext2_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext2_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, - value, size, flags); + return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_USER, + name, value, size, flags); } struct xattr_handler ext2_xattr_user_handler = { diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index c9b0df3..82ba341 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -366,12 +366,12 @@ out: * Extended attribute handlers */ static size_t -ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) +ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_len) memcpy(list, POSIX_ACL_XATTR_ACCESS, size); @@ -379,12 +379,12 @@ ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, } static size_t -ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) +ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_len) memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); @@ -392,15 +392,18 @@ ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, } static int -ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { struct posix_acl *acl; int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) return -EOPNOTSUPP; - acl = ext3_get_acl(inode, type); + acl = ext3_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -412,31 +415,16 @@ ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) } static int -ext3_xattr_get_acl_access(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int -ext3_xattr_get_acl_default(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int -ext3_xattr_set_acl(struct inode *inode, int type, const void *value, - size_t size) +ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; handle_t *handle; struct posix_acl *acl; int error, retries = 0; + if (strcmp(name, "") != 0) + return -EINVAL; if (!test_opt(inode->i_sb, POSIX_ACL)) return -EOPNOTSUPP; if (!is_owner_or_cap(inode)) @@ -468,34 +456,18 @@ release_and_out: return error; } -static int -ext3_xattr_set_acl_access(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int -ext3_xattr_set_acl_default(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext3_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - struct xattr_handler ext3_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, .list = ext3_xattr_list_acl_access, - .get = ext3_xattr_get_acl_access, - .set = ext3_xattr_set_acl_access, + .get = ext3_xattr_get_acl, + .set = ext3_xattr_set_acl, }; struct xattr_handler ext3_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, .list = ext3_xattr_list_acl_default, - .get = ext3_xattr_get_acl_default, - .set = ext3_xattr_set_acl_default, + .get = ext3_xattr_get_acl, + .set = ext3_xattr_set_acl, }; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index ad14227..455e6e6 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -970,7 +970,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock, if (max_blocks > DIO_MAX_BLOCKS) max_blocks = DIO_MAX_BLOCKS; handle = ext3_journal_start(inode, DIO_CREDITS + - 2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb)); + EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -3146,8 +3146,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+ - EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; @@ -3239,7 +3239,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode) #ifdef CONFIG_QUOTA /* We know that structure was already allocated during vfs_dq_init so * we will be updating only the data blocks + inodes */ - ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb); + ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); #endif return ret; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index aad6400..7b0e44f7 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1699,7 +1699,7 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1733,7 +1733,7 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1769,7 +1769,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1920,7 +1920,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) struct ext3_iloc iloc; int err = 0, rc; - lock_super(sb); + mutex_lock(&EXT3_SB(sb)->s_orphan_lock); if (!list_empty(&EXT3_I(inode)->i_orphan)) goto out_unlock; @@ -1929,9 +1929,13 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) /* @@@ FIXME: Observation from aviro: * I think I can trigger J_ASSERT in ext3_orphan_add(). We block - * here (on lock_super()), so race with ext3_link() which might bump + * here (on s_orphan_lock), so race with ext3_link() which might bump * ->i_nlink. For, say it, character device. Not a regular file, * not a directory, not a symlink and ->i_nlink > 0. + * + * tytso, 4/25/2009: I'm not sure how that could happen; + * shouldn't the fs core protect us from these sort of + * unlink()/link() races? */ J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); @@ -1968,7 +1972,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out_unlock: - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_orphan_lock); ext3_std_error(inode->i_sb, err); return err; } @@ -1986,11 +1990,9 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode) struct ext3_iloc iloc; int err = 0; - lock_super(inode->i_sb); - if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); - return 0; - } + mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock); + if (list_empty(&ei->i_orphan)) + goto out; ino_next = NEXT_ORPHAN(inode); prev = ei->i_orphan.prev; @@ -2040,7 +2042,7 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode) out_err: ext3_std_error(inode->i_sb, err); out: - unlock_super(inode->i_sb); + mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock); return err; out_brelse: @@ -2175,7 +2177,7 @@ static int ext3_symlink (struct inode * dir, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 5f83b61..54351ac 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -209,7 +209,7 @@ static int setup_new_group_blocks(struct super_block *sb, if (IS_ERR(handle)) return PTR_ERR(handle); - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { err = -EBUSY; goto exit_journal; @@ -324,7 +324,7 @@ exit_bh: brelse(bh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; @@ -662,11 +662,12 @@ exit_free: * important part is that the new block and inode counts are in the backup * superblocks, and the location of the new group metadata in the GDT backups. * - * We do not need lock_super() for this, because these blocks are not - * otherwise touched by the filesystem code when it is mounted. We don't - * need to worry about last changing from sbi->s_groups_count, because the - * worst that can happen is that we do not copy the full number of backups - * at this time. The resize which changed s_groups_count will backup again. + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. */ static void update_backups(struct super_block *sb, int blk_off, char *data, int size) @@ -825,7 +826,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) goto exit_put; } - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { ext3_warning(sb, __func__, "multiple resizers run on filesystem!"); @@ -856,7 +857,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) /* * OK, now we've set up the new group. Time to make it active. * - * Current kernels don't lock all allocations via lock_super(), + * We do not lock all allocations via s_resize_lock * so we have to be safe wrt. concurrent accesses the group * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. @@ -900,12 +901,12 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold lock_super + * * Writers of s_groups_count *must* hold s_resize_lock * AND * * Writers must perform a smp_wmb() after updating all dependent * data and before modifying the groups count * - * * Readers must hold lock_super() over the access + * * Readers must hold s_resize_lock over the access * OR * * Readers must perform an smp_rmb() after reading the groups count * and before reading any dependent data. @@ -936,7 +937,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) ext3_journal_dirty_metadata(handle, sbi->s_sbh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; if (!err) { @@ -973,7 +974,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after - * taking lock_super() below. */ + * taking the s_resize_lock below. */ o_blocks_count = le32_to_cpu(es->s_blocks_count); o_groups_count = EXT3_SB(sb)->s_groups_count; @@ -1045,11 +1046,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, goto exit_put; } - lock_super(sb); + mutex_lock(&EXT3_SB(sb)->s_resize_lock); if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { ext3_warning(sb, __func__, "multiple resizers run on filesystem!"); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_journal_stop(handle); err = -EBUSY; goto exit_put; @@ -1059,13 +1060,13 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, EXT3_SB(sb)->s_sbh))) { ext3_warning(sb, __func__, "error %d on journal write access", err); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_journal_stop(handle); goto exit_put; } es->s_blocks_count = cpu_to_le32(o_blocks_count + add); ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, o_blocks_count + add); ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7ad1e8c..afa2b56 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1928,6 +1928,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sb->dq_op = &ext3_quota_operations; #endif INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); + mutex_init(&sbi->s_resize_lock); sb->s_root = NULL; @@ -2014,14 +2016,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) } ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock - * in numerous places. Here we just pop the lock - it's relatively - * harmless, because we are now ready to accept write_super() requests, - * and aviro says that's the only reason for hanging onto the - * superblock lock. - */ + EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; ext3_orphan_cleanup(sb, es); EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; @@ -2403,13 +2398,11 @@ static void ext3_mark_recovery_complete(struct super_block * sb, if (journal_flush(journal) < 0) goto out; - lock_super(sb); if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ext3_commit_super(sb, es, 1); } - unlock_super(sb); out: journal_unlock_updates(journal); @@ -2601,13 +2594,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) (sbi->s_mount_state & EXT3_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); - /* - * We have to unlock super so that we can wait for - * transactions. - */ - unlock_super(sb); ext3_mark_recovery_complete(sb, es); - lock_super(sb); } else { __le32 ret; if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 387d92d..66895cc 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -99,7 +99,7 @@ static struct buffer_head *ext3_xattr_cache_find(struct inode *, struct mb_cache_entry **); static void ext3_xattr_rehash(struct ext3_xattr_header *, struct ext3_xattr_entry *); -static int ext3_xattr_list(struct inode *inode, char *buffer, +static int ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size); static struct mb_cache *ext3_xattr_cache; @@ -147,7 +147,7 @@ ext3_xattr_handler(int name_index) ssize_t ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) { - return ext3_xattr_list(dentry->d_inode, buffer, size); + return ext3_xattr_list(dentry, buffer, size); } static int @@ -332,7 +332,7 @@ ext3_xattr_get(struct inode *inode, int name_index, const char *name, } static int -ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry, +ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry, char *buffer, size_t buffer_size) { size_t rest = buffer_size; @@ -342,9 +342,10 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry, ext3_xattr_handler(entry->e_name_index); if (handler) { - size_t size = handler->list(inode, buffer, rest, + size_t size = handler->list(dentry, buffer, rest, entry->e_name, - entry->e_name_len); + entry->e_name_len, + handler->flags); if (buffer) { if (size > rest) return -ERANGE; @@ -357,8 +358,9 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry, } static int -ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) +ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; int error; @@ -383,7 +385,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) goto cleanup; } ext3_xattr_cache_insert(bh); - error = ext3_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); + error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: brelse(bh); @@ -392,8 +394,9 @@ cleanup: } static int -ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) +ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct ext3_xattr_ibody_header *header; struct ext3_inode *raw_inode; struct ext3_iloc iloc; @@ -411,7 +414,7 @@ ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) error = ext3_xattr_check_names(IFIRST(header), end); if (error) goto cleanup; - error = ext3_xattr_list_entries(inode, IFIRST(header), + error = ext3_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); cleanup: @@ -430,12 +433,12 @@ cleanup: * used / required on success. */ static int -ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) +ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { int i_error, b_error; - down_read(&EXT3_I(inode)->xattr_sem); - i_error = ext3_xattr_ibody_list(inode, buffer, buffer_size); + down_read(&EXT3_I(dentry->d_inode)->xattr_sem); + i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size); if (i_error < 0) { b_error = 0; } else { @@ -443,11 +446,11 @@ ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) buffer += i_error; buffer_size -= i_error; } - b_error = ext3_xattr_block_list(inode, buffer, buffer_size); + b_error = ext3_xattr_block_list(dentry, buffer, buffer_size); if (b_error < 0) i_error = 0; } - up_read(&EXT3_I(inode)->xattr_sem); + up_read(&EXT3_I(dentry->d_inode)->xattr_sem); return i_error + b_error; } diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 37b8109..4743487 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -12,8 +12,8 @@ #include "xattr.h" static size_t -ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -28,23 +28,23 @@ ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size, } static int -ext3_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext3_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_get(inode, EXT3_XATTR_INDEX_SECURITY, name, - buffer, size); + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + name, buffer, size); } static int -ext3_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext3_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_set(inode, EXT3_XATTR_INDEX_SECURITY, name, - value, size, flags); + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + name, value, size, flags); } int diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index c7c41a4..e556284 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -14,8 +14,8 @@ #include "xattr.h" static size_t -ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -32,22 +32,22 @@ ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, } static int -ext3_xattr_trusted_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext3_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name, - buffer, size); + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, + name, buffer, size); } static int -ext3_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext3_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_set(inode, EXT3_XATTR_INDEX_TRUSTED, name, + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 430fe63..3bcfe9e 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -13,13 +13,13 @@ #include "xattr.h" static size_t -ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return 0; if (list && total_len <= list_size) { @@ -31,26 +31,27 @@ ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size, } static int -ext3_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size); + return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_USER, + name, buffer, size); } static int -ext3_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext3_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, name, - value, size, flags); + return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_USER, + name, value, size, flags); } struct xattr_handler ext3_xattr_user_handler = { diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index e5f6774..9ed1bb1 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -2,7 +2,6 @@ config EXT4_FS tristate "The Extended 4 (ext4) filesystem" select JBD2 select CRC16 - select FS_JOURNAL_INFO help This is the next generation of the ext3 filesystem. @@ -29,6 +28,7 @@ config EXT4_FS config EXT4_USE_FOR_EXT23 bool "Use ext4 for ext2/ext3 file systems" + depends on EXT4_FS depends on EXT3_FS=n || EXT2_FS=n default y help diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 0df88b2..8a2a29d 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -364,12 +364,12 @@ out: * Extended attribute handlers */ static size_t -ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) +ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_len) memcpy(list, POSIX_ACL_XATTR_ACCESS, size); @@ -377,12 +377,12 @@ ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, } static size_t -ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) +ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (!test_opt(dentry->d_sb, POSIX_ACL)) return 0; if (list && size <= list_len) memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); @@ -390,15 +390,18 @@ ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, } static int -ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { struct posix_acl *acl; int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) + if (strcmp(name, "") != 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, POSIX_ACL)) return -EOPNOTSUPP; - acl = ext4_get_acl(inode, type); + acl = ext4_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -410,31 +413,16 @@ ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) } static int -ext4_xattr_get_acl_access(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int -ext4_xattr_get_acl_default(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int -ext4_xattr_set_acl(struct inode *inode, int type, const void *value, - size_t size) +ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; handle_t *handle; struct posix_acl *acl; int error, retries = 0; + if (strcmp(name, "") != 0) + return -EINVAL; if (!test_opt(inode->i_sb, POSIX_ACL)) return -EOPNOTSUPP; if (!is_owner_or_cap(inode)) @@ -466,34 +454,18 @@ release_and_out: return error; } -static int -ext4_xattr_set_acl_access(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int -ext4_xattr_set_acl_default(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - struct xattr_handler ext4_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, .list = ext4_xattr_list_acl_access, - .get = ext4_xattr_get_acl_access, - .set = ext4_xattr_set_acl_access, + .get = ext4_xattr_get_acl, + .set = ext4_xattr_set_acl, }; struct xattr_handler ext4_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, .list = ext4_xattr_list_acl_default, - .get = ext4_xattr_get_acl_default, - .set = ext4_xattr_set_acl_default, + .get = ext4_xattr_get_acl, + .set = ext4_xattr_set_acl, }; diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 4df8621..a60ab9a 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -16,7 +16,6 @@ #include <linux/module.h> #include <linux/swap.h> #include <linux/pagemap.h> -#include <linux/version.h> #include <linux/blkdev.h> #include <linux/mutex.h> #include "ext4.h" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ab31e65..874d169 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -361,14 +361,11 @@ struct ext4_new_group_data { so set the magic i_delalloc_reserve_flag after taking the inode allocation semaphore for */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 - /* Call ext4_da_update_reserve_space() after successfully - allocating the blocks */ -#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 /* caller is from the direct IO path, request to creation of an unitialized extents if not allocated, split the uninitialized extent if blocks has been preallocated already*/ -#define EXT4_GET_BLOCKS_DIO 0x0010 -#define EXT4_GET_BLOCKS_CONVERT 0x0020 +#define EXT4_GET_BLOCKS_DIO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) /* Convert extent to initialized after direct IO complete */ @@ -699,11 +696,17 @@ struct ext4_inode_info { unsigned int i_reserved_meta_blocks; unsigned int i_allocated_meta_blocks; unsigned short i_delalloc_reserved_flag; + sector_t i_da_metadata_calc_last_lblock; + int i_da_metadata_calc_len; /* on-disk additional length */ __u16 i_extra_isize; spinlock_t i_block_reservation_lock; +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif /* completed async DIOs that might need unwritten extents handling */ struct list_head i_aio_dio_complete_list; @@ -1435,8 +1438,10 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); -extern qsize_t ext4_get_reserved_space(struct inode *inode); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int flush_aio_dio_completed_IO(struct inode *inode); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 2ca6864..bdb6ce7 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -225,7 +225,8 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); } -extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + sector_t lblocks); extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3a7928f..765a482 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -296,29 +296,44 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) * to allocate @blocks * Worse case is one block per extent */ -int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) +int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock) { - int lcap, icap, rcap, leafs, idxs, num; - int newextents = blocks; - - rcap = ext4_ext_space_root_idx(inode, 0); - lcap = ext4_ext_space_block(inode, 0); - icap = ext4_ext_space_block_idx(inode, 0); + struct ext4_inode_info *ei = EXT4_I(inode); + int idxs, num = 0; - /* number of new leaf blocks needed */ - num = leafs = (newextents + lcap - 1) / lcap; + idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent_idx)); /* - * Worse case, we need separate index block(s) - * to link all new leaf blocks + * If the new delayed allocation block is contiguous with the + * previous da block, it can share index blocks with the + * previous block, so we only need to allocate a new index + * block every idxs leaf blocks. At ldxs**2 blocks, we need + * an additional index block, and at ldxs**3 blocks, yet + * another index blocks. */ - idxs = (leafs + icap - 1) / icap; - do { - num += idxs; - idxs = (idxs + icap - 1) / icap; - } while (idxs > rcap); + if (ei->i_da_metadata_calc_len && + ei->i_da_metadata_calc_last_lblock+1 == lblock) { + if ((ei->i_da_metadata_calc_len % idxs) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { + num++; + ei->i_da_metadata_calc_len = 0; + } else + ei->i_da_metadata_calc_len++; + ei->i_da_metadata_calc_last_lblock++; + return num; + } - return num; + /* + * In the worst case we need a new set of index blocks at + * every level of the inode's extent tree. + */ + ei->i_da_metadata_calc_len = 1; + ei->i_da_metadata_calc_last_lblock = lblock; + return ext_depth(inode) + 1; } static int @@ -3023,6 +3038,14 @@ out: return err; } +static void unmap_underlying_metadata_blocks(struct block_device *bdev, + sector_t block, int count) +{ + int i; + for (i = 0; i < count; i++) + unmap_underlying_metadata(bdev, block + i); +} + static int ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int max_blocks, @@ -3098,6 +3121,30 @@ out: } else allocated = ret; set_buffer_new(bh_result); + /* + * if we allocated more blocks than requested + * we need to make sure we unmap the extra block + * allocated. The actual needed block will get + * unmapped later when we find the buffer_head marked + * new. + */ + if (allocated > max_blocks) { + unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, + newblock + max_blocks, + allocated - max_blocks); + allocated = max_blocks; + } + + /* + * If we have done fallocate with the offset that is already + * delayed allocated, we would have block reservation + * and quota reservation done in the delayed write path. + * But fallocate would have already updated quota and block + * count for this offset. So cancel these reservation + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_da_update_reserve_space(inode, allocated, 0); + map_out: set_buffer_mapped(bh_result); out1: @@ -3190,7 +3237,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * this situation is possible, though, _during_ tree modification; * this is why assert can't be put in ext4_ext_find_extent() */ - BUG_ON(path[depth].p_ext == NULL && depth != 0); + if (path[depth].p_ext == NULL && depth != 0) { + ext4_error(inode->i_sb, __func__, "bad extent address " + "inode: %lu, iblock: %d, depth: %d", + inode->i_ino, iblock, depth); + err = -EIO; + goto out2; + } eh = path[depth].p_hdr; ex = path[depth].p_ext; @@ -3327,9 +3380,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); + if (allocated > max_blocks) + allocated = max_blocks; set_buffer_new(bh_result); /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_da_update_reserve_space(inode, allocated, 1); + + /* * Cache the extent and update transaction to commit on fdatasync only * when it is _not_ an uninitialized extent. */ diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 0b22497..98bd140 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -88,9 +88,21 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) return ext4_force_commit(inode->i_sb); commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; - if (jbd2_log_start_commit(journal, commit_tid)) + if (jbd2_log_start_commit(journal, commit_tid)) { + /* + * When the journal is on a different device than the + * fs data disk, we need to issue the barrier in + * writeback mode. (In ordered mode, the jbd2 layer + * will take care of issuing the barrier. In + * data=journal, all of the data blocks are written to + * the journal device.) + */ + if (ext4_should_writeback_data(inode) && + (journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); jbd2_log_wait_commit(journal, commit_tid); - else if (journal->j_flags & JBD2_BARRIER) + } else if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, NULL); return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5352db1..e119524 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1003,92 +1003,120 @@ out: return err; } -qsize_t ext4_get_reserved_space(struct inode *inode) +#ifdef CONFIG_QUOTA +qsize_t *ext4_get_reserved_space(struct inode *inode) { - unsigned long long total; - - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - total = EXT4_I(inode)->i_reserved_data_blocks + - EXT4_I(inode)->i_reserved_meta_blocks; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - - return (total << inode->i_blkbits); + return &EXT4_I(inode)->i_reserved_quota; } +#endif + /* * Calculate the number of metadata blocks need to reserve - * to allocate @blocks for non extent file based file + * to allocate a new block at @lblocks for non extent file based file */ -static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) +static int ext4_indirect_calc_metadata_amount(struct inode *inode, + sector_t lblock) { - int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ind_blks, dind_blks, tind_blks; - - /* number of new indirect blocks needed */ - ind_blks = (blocks + icap - 1) / icap; + struct ext4_inode_info *ei = EXT4_I(inode); + int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1; + int blk_bits; - dind_blks = (ind_blks + icap - 1) / icap; + if (lblock < EXT4_NDIR_BLOCKS) + return 0; - tind_blks = 1; + lblock -= EXT4_NDIR_BLOCKS; - return ind_blks + dind_blks + tind_blks; + if (ei->i_da_metadata_calc_len && + (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { + ei->i_da_metadata_calc_len++; + return 0; + } + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; + ei->i_da_metadata_calc_len = 1; + blk_bits = roundup_pow_of_two(lblock + 1); + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; } /* * Calculate the number of metadata blocks need to reserve - * to allocate given number of blocks + * to allocate a block located at @lblock */ -static int ext4_calc_metadata_amount(struct inode *inode, int blocks) +static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) { - if (!blocks) - return 0; - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) - return ext4_ext_calc_metadata_amount(inode, blocks); + return ext4_ext_calc_metadata_amount(inode, lblock); - return ext4_indirect_calc_metadata_amount(inode, blocks); + return ext4_indirect_calc_metadata_amount(inode, lblock); } -static void ext4_da_update_reserve_space(struct inode *inode, int used) +/* + * Called with i_data_sem down, which is important since we can call + * ext4_discard_preallocations() from here. + */ +void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int total, mdb, mdb_free; - - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - /* recalculate the number of metablocks still need to be reserved */ - total = EXT4_I(inode)->i_reserved_data_blocks - used; - mdb = ext4_calc_metadata_amount(inode, total); - - /* figure out how many metablocks to release */ - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; - - if (mdb_free) { - /* Account for allocated meta_blocks */ - mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; - - /* update fs dirty blocks counter */ + struct ext4_inode_info *ei = EXT4_I(inode); + int mdb_free = 0, allocated_meta_blocks = 0; + + spin_lock(&ei->i_block_reservation_lock); + if (unlikely(used > ei->i_reserved_data_blocks)) { + ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " + "with only %d reserved data blocks\n", + __func__, inode->i_ino, used, + ei->i_reserved_data_blocks); + WARN_ON(1); + used = ei->i_reserved_data_blocks; + } + + /* Update per-inode reservations */ + ei->i_reserved_data_blocks -= used; + used += ei->i_allocated_meta_blocks; + ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; + allocated_meta_blocks = ei->i_allocated_meta_blocks; + ei->i_allocated_meta_blocks = 0; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, used); + + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. + */ + mdb_free = ei->i_reserved_meta_blocks; + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); - EXT4_I(inode)->i_allocated_meta_blocks = 0; - EXT4_I(inode)->i_reserved_meta_blocks = mdb; } - - /* update per-inode reservations */ - BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); - EXT4_I(inode)->i_reserved_data_blocks -= used; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - /* - * free those over-booking quota for metadata blocks - */ - if (mdb_free) - vfs_dq_release_reservation_block(inode, mdb_free); + /* Update quota subsystem */ + if (quota_claim) { + vfs_dq_claim_block(inode, used); + if (mdb_free) + vfs_dq_release_reservation_block(inode, mdb_free); + } else { + /* + * We did fallocate with an offset that is already delayed + * allocated. So on delayed allocated writeback we should + * not update the quota for allocated blocks. But then + * converting an fallocate region to initialized region would + * have caused a metadata allocation. So claim quota for + * that + */ + if (allocated_meta_blocks) + vfs_dq_claim_block(inode, allocated_meta_blocks); + vfs_dq_release_reservation_block(inode, mdb_free + used); + } /* * If we have done all the pending block allocations and if * there aren't any writers on the inode, we can discard the * inode's preallocations. */ - if (!total && (atomic_read(&inode->i_writecount) == 0)) + if ((ei->i_reserved_data_blocks == 0) && + (atomic_read(&inode->i_writecount) == 0)) ext4_discard_preallocations(inode); } @@ -1280,18 +1308,20 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, */ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; } - } + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. We don't + * support fallocate for non extent files. So we can update + * reserve space here. + */ + if ((retval > 0) && + (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) + ext4_da_update_reserve_space(inode, retval, 1); + } if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) EXT4_I(inode)->i_delalloc_reserved_flag = 0; - /* - * Update reserved blocks/metadata blocks after successful - * block allocation which had been deferred till now. - */ - if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)) - ext4_da_update_reserve_space(inode, retval); - up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && buffer_mapped(bh)) { int ret = check_block_validity(inode, "file system " @@ -1797,11 +1827,15 @@ static int ext4_journalled_write_end(struct file *file, return ret ? ret : copied; } -static int ext4_da_reserve_space(struct inode *inode, int nrblocks) +/* + * Reserve a single block located at lblock + */ +static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) { int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned long md_needed, mdblocks, total = 0; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long md_needed, md_reserved; /* * recalculate the amount of metadata blocks to reserve @@ -1809,86 +1843,78 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) * worse case is one extent per block */ repeat: - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; - mdblocks = ext4_calc_metadata_amount(inode, total); - BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); - - md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; - total = md_needed + nrblocks; + spin_lock(&ei->i_block_reservation_lock); + md_reserved = ei->i_reserved_meta_blocks; + md_needed = ext4_calc_metadata_amount(inode, lblock); + spin_unlock(&ei->i_block_reservation_lock); /* * Make quota reservation here to prevent quota overflow * later. Real quota accounting is done at pages writeout * time. */ - if (vfs_dq_reserve_block(inode, total)) { - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + if (vfs_dq_reserve_block(inode, md_needed + 1)) return -EDQUOT; - } - if (ext4_claim_free_blocks(sbi, total)) { - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - vfs_dq_release_reservation_block(inode, total); + if (ext4_claim_free_blocks(sbi, md_needed + 1)) { + vfs_dq_release_reservation_block(inode, md_needed + 1); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; } return -ENOSPC; } - EXT4_I(inode)->i_reserved_data_blocks += nrblocks; - EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks++; + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); return 0; /* success */ } static void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int total, mdb, mdb_free, release; + struct ext4_inode_info *ei = EXT4_I(inode); if (!to_free) return; /* Nothing to release, exit */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - if (!EXT4_I(inode)->i_reserved_data_blocks) { + if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* - * if there is no reserved blocks, but we try to free some - * then the counter is messed up somewhere. - * but since this function is called from invalidate - * page, it's harmless to return without any action + * if there aren't enough reserved blocks, then the + * counter is messed up somewhere. Since this + * function is called from invalidate page, it's + * harmless to return without any action. */ - printk(KERN_INFO "ext4 delalloc try to release %d reserved " - "blocks for inode %lu, but there is no reserved " - "data blocks\n", to_free, inode->i_ino); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return; + ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " + "ino %lu, to_free %d with only %d reserved " + "data blocks\n", inode->i_ino, to_free, + ei->i_reserved_data_blocks); + WARN_ON(1); + to_free = ei->i_reserved_data_blocks; } + ei->i_reserved_data_blocks -= to_free; - /* recalculate the number of metablocks still need to be reserved */ - total = EXT4_I(inode)->i_reserved_data_blocks - to_free; - mdb = ext4_calc_metadata_amount(inode, total); - - /* figure out how many metablocks to release */ - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; - - release = to_free + mdb_free; - - /* update fs dirty blocks counter for truncate case */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. + */ + to_free += ei->i_reserved_meta_blocks; + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + } - /* update per-inode reservations */ - BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); - EXT4_I(inode)->i_reserved_data_blocks -= to_free; + /* update fs dirty blocks counter */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - EXT4_I(inode)->i_reserved_meta_blocks = mdb; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - vfs_dq_release_reservation_block(inode, release); + vfs_dq_release_reservation_block(inode, to_free); } static void ext4_da_page_release_reservation(struct page *page, @@ -2193,10 +2219,10 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * variables are updated after the blocks have been allocated. */ new.b_state = 0; - get_blocks_flags = (EXT4_GET_BLOCKS_CREATE | - EXT4_GET_BLOCKS_DELALLOC_RESERVE); + get_blocks_flags = EXT4_GET_BLOCKS_CREATE; if (mpd->b_state & (1 << BH_Delay)) - get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE; + get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, &new, get_blocks_flags); if (blks < 0) { @@ -2494,7 +2520,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - ret = ext4_da_reserve_space(inode, 1); + ret = ext4_da_reserve_space(inode, iblock); if (ret) /* not enough space to reserve */ return ret; @@ -2968,8 +2994,7 @@ retry: out_writepages: if (!no_nrwrite_index_update) wbc->no_nrwrite_index_update = 0; - if (wbc->nr_to_write > nr_to_writebump) - wbc->nr_to_write -= nr_to_writebump; + wbc->nr_to_write -= nr_to_writebump; wbc->range_start = range_start; trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); return ret; @@ -2994,11 +3019,18 @@ static int ext4_nonda_switch(struct super_block *sb) if (2 * free_blocks < 3 * dirty_blocks || free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { /* - * free block count is less that 150% of dirty blocks - * or free blocks is less that watermark + * free block count is less than 150% of dirty blocks + * or free blocks is less than watermark */ return 1; } + /* + * Even if we don't switch but are nearing capacity, + * start pushing delalloc when 1/2 of free blocks are dirty. + */ + if (free_blocks < 2 * dirty_blocks) + writeback_inodes_sb_if_idle(sb); + return 0; } @@ -3006,7 +3038,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - int ret, retries = 0; + int ret, retries = 0, quota_retries = 0; struct page *page; pgoff_t index; unsigned from, to; @@ -3065,6 +3097,22 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; + + if ((ret == -EDQUOT) && + EXT4_I(inode)->i_reserved_meta_blocks && + (quota_retries++ < 3)) { + /* + * Since we often over-estimate the number of meta + * data blocks required, we may sometimes get a + * spurios out of quota error even though there would + * be enough space once we write the data blocks and + * find out how many meta data blocks were _really_ + * required. So try forcing the inode write to see if + * that helps. + */ + write_inode_now(inode, (quota_retries == 3)); + goto retry; + } out: return ret; } @@ -4794,6 +4842,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(raw_inode); ei->i_disksize = inode->i_size; +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b1fd3da..d34afad 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2755,12 +2755,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); - else { - percpu_counter_sub(&sbi->s_dirtyblocks_counter, - ac->ac_b_ex.fe_len); - /* convert reserved quota blocks to real quota blocks */ - vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len); - } if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 0ca8110..436521c 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -17,7 +17,6 @@ #include <linux/proc_fs.h> #include <linux/pagemap.h> #include <linux/seq_file.h> -#include <linux/version.h> #include <linux/blkdev.h> #include <linux/mutex.h> #include "ext4_jbd2.h" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 827bde1..735c20d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -702,8 +702,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; @@ -1014,7 +1018,9 @@ static const struct dquot_operations ext4_quota_operations = { .reserve_space = dquot_reserve_space, .claim_space = dquot_claim_space, .release_rsv = dquot_release_reserved_space, +#ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, +#endif .alloc_inode = dquot_alloc_inode, .free_space = dquot_free_space, .free_inode = dquot_free_inode, @@ -2169,9 +2175,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, struct super_block *sb = sbi->s_buddy_cache->i_sb; return snprintf(buf, PAGE_SIZE, "%llu\n", - sbi->s_kbytes_written + + (unsigned long long)(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1)); + EXT4_SB(sb)->s_sectors_written_start) >> 1))); } static ssize_t inode_readahead_blks_store(struct ext4_attr *a, @@ -4000,6 +4006,7 @@ static inline void unregister_as_ext2(void) { unregister_filesystem(&ext2_fs_type); } +MODULE_ALIAS("ext2"); #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } @@ -4026,6 +4033,7 @@ static inline void unregister_as_ext3(void) { unregister_filesystem(&ext3_fs_type); } +MODULE_ALIAS("ext3"); #else static inline void register_as_ext3(void) { } static inline void unregister_as_ext3(void) { } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 910bf9a..f3a2f7e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -92,7 +92,7 @@ static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct mb_cache_entry **); static void ext4_xattr_rehash(struct ext4_xattr_header *, struct ext4_xattr_entry *); -static int ext4_xattr_list(struct inode *inode, char *buffer, +static int ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size); static struct mb_cache *ext4_xattr_cache; @@ -140,7 +140,7 @@ ext4_xattr_handler(int name_index) ssize_t ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) { - return ext4_xattr_list(dentry->d_inode, buffer, size); + return ext4_xattr_list(dentry, buffer, size); } static int @@ -325,7 +325,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, } static int -ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry, +ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, char *buffer, size_t buffer_size) { size_t rest = buffer_size; @@ -335,9 +335,10 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry, ext4_xattr_handler(entry->e_name_index); if (handler) { - size_t size = handler->list(inode, buffer, rest, + size_t size = handler->list(dentry, buffer, rest, entry->e_name, - entry->e_name_len); + entry->e_name_len, + handler->flags); if (buffer) { if (size > rest) return -ERANGE; @@ -350,8 +351,9 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry, } static int -ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) +ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; int error; @@ -376,7 +378,7 @@ ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) goto cleanup; } ext4_xattr_cache_insert(bh); - error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); + error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: brelse(bh); @@ -385,8 +387,9 @@ cleanup: } static int -ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) +ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; @@ -404,7 +407,7 @@ ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) error = ext4_xattr_check_names(IFIRST(header), end); if (error) goto cleanup; - error = ext4_xattr_list_entries(inode, IFIRST(header), + error = ext4_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); cleanup: @@ -423,12 +426,12 @@ cleanup: * used / required on success. */ static int -ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) +ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { int i_error, b_error; - down_read(&EXT4_I(inode)->xattr_sem); - i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size); + down_read(&EXT4_I(dentry->d_inode)->xattr_sem); + i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size); if (i_error < 0) { b_error = 0; } else { @@ -436,11 +439,11 @@ ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) buffer += i_error; buffer_size -= i_error; } - b_error = ext4_xattr_block_list(inode, buffer, buffer_size); + b_error = ext4_xattr_block_list(dentry, buffer, buffer_size); if (b_error < 0) i_error = 0; } - up_read(&EXT4_I(inode)->xattr_sem); + up_read(&EXT4_I(dentry->d_inode)->xattr_sem); return i_error + b_error; } @@ -1329,6 +1332,8 @@ retry: goto cleanup; kfree(b_entry_name); kfree(buffer); + b_entry_name = NULL; + buffer = NULL; brelse(is->iloc.bh); kfree(is); kfree(bs); diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index ca5f89f..983c253 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -12,8 +12,8 @@ #include "xattr.h" static size_t -ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; const size_t total_len = prefix_len + name_len + 1; @@ -28,23 +28,23 @@ ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size, } static int -ext4_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext4_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name, - buffer, size); + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + name, buffer, size); } static int -ext4_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext4_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name, - value, size, flags); + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + name, value, size, flags); } int diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index ac1a52c..15b50ed 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -14,8 +14,8 @@ #include "xattr.h" static size_t -ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -32,23 +32,23 @@ ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, } static int -ext4_xattr_trusted_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name, - buffer, size); + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + name, buffer, size); } static int -ext4_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext4_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name, - value, size, flags); + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + name, value, size, flags); } struct xattr_handler ext4_xattr_trusted_handler = { diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index d91aa61..c4ce057 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -13,13 +13,13 @@ #include "xattr.h" static size_t -ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return 0; if (list && total_len <= list_size) { @@ -31,26 +31,27 @@ ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, } static int -ext4_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext4_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size); + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER, + name, buffer, size); } static int -ext4_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext4_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - if (!test_opt(inode->i_sb, XATTR_USER)) + if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name, - value, size, flags); + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER, + name, value, size, flags); } struct xattr_handler ext4_xattr_user_handler = { diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 7db0979..e6efdfa 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -44,7 +44,8 @@ struct fat_mount_options { nocase:1, /* Does this need case conversion? 0=need case conversion*/ usefree:1, /* Use free_clusters for FAT32 */ tz_utc:1, /* Filesystem timestamps are in UTC */ - rodir:1; /* allow ATTR_RO for directory */ + rodir:1, /* allow ATTR_RO for directory */ + discard:1; /* Issue discard requests on deletions */ }; #define FAT_HASH_BITS 8 diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index a810377..81184d3 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -566,16 +566,21 @@ int fat_free_clusters(struct inode *inode, int cluster) goto error; } - /* - * Issue discard for the sectors we no longer care about, - * batching contiguous clusters into one request - */ - if (cluster != fatent.entry + 1) { - int nr_clus = fatent.entry - first_cl + 1; - - sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl), - nr_clus * sbi->sec_per_clus); - first_cl = cluster; + if (sbi->options.discard) { + /* + * Issue discard for the sectors we no longer + * care about, batching contiguous clusters + * into one request + */ + if (cluster != fatent.entry + 1) { + int nr_clus = fatent.entry - first_cl + 1; + + sb_issue_discard(sb, + fat_clus_to_blknr(sbi, first_cl), + nr_clus * sbi->sec_per_clus); + + first_cl = cluster; + } } ops->ent_put(&fatent, FAT_ENT_FREE); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 76b7961..14da530 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -858,6 +858,8 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, ",errors=panic"); else seq_puts(m, ",errors=remount-ro"); + if (opts->discard) + seq_puts(m, ",discard"); return 0; } @@ -871,7 +873,7 @@ enum { Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, - Opt_err_panic, Opt_err_ro, Opt_err, + Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err, }; static const match_table_t fat_tokens = { @@ -899,6 +901,7 @@ static const match_table_t fat_tokens = { {Opt_err_cont, "errors=continue"}, {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, + {Opt_discard, "discard"}, {Opt_obsolate, "conv=binary"}, {Opt_obsolate, "conv=text"}, {Opt_obsolate, "conv=auto"}, @@ -1136,6 +1139,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, case Opt_rodir: opts->rodir = 1; break; + case Opt_discard: + opts->discard = 1; + break; /* obsolete mount options */ case Opt_obsolate: @@ -618,60 +618,90 @@ static DEFINE_RWLOCK(fasync_lock); static struct kmem_cache *fasync_cache __read_mostly; /* - * fasync_helper() is used by almost all character device drivers - * to set up the fasync queue. It returns negative on error, 0 if it did - * no changes and positive if it added/deleted the entry. + * Remove a fasync entry. If successfully removed, return + * positive and clear the FASYNC flag. If no entry exists, + * do nothing and return 0. + * + * NOTE! It is very important that the FASYNC flag always + * match the state "is the filp on a fasync list". + * + * We always take the 'filp->f_lock', in since fasync_lock + * needs to be irq-safe. */ -int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) +static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *fa, **fp; - struct fasync_struct *new = NULL; int result = 0; - if (on) { - new = kmem_cache_alloc(fasync_cache, GFP_KERNEL); - if (!new) - return -ENOMEM; + spin_lock(&filp->f_lock); + write_lock_irq(&fasync_lock); + for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { + if (fa->fa_file != filp) + continue; + *fp = fa->fa_next; + kmem_cache_free(fasync_cache, fa); + filp->f_flags &= ~FASYNC; + result = 1; + break; } + write_unlock_irq(&fasync_lock); + spin_unlock(&filp->f_lock); + return result; +} + +/* + * Add a fasync entry. Return negative on error, positive if + * added, and zero if did nothing but change an existing one. + * + * NOTE! It is very important that the FASYNC flag always + * match the state "is the filp on a fasync list". + */ +static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) +{ + struct fasync_struct *new, *fa, **fp; + int result = 0; + + new = kmem_cache_alloc(fasync_cache, GFP_KERNEL); + if (!new) + return -ENOMEM; - /* - * We need to take f_lock first since it's not an IRQ-safe - * lock. - */ spin_lock(&filp->f_lock); write_lock_irq(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { - if (fa->fa_file == filp) { - if(on) { - fa->fa_fd = fd; - kmem_cache_free(fasync_cache, new); - } else { - *fp = fa->fa_next; - kmem_cache_free(fasync_cache, fa); - result = 1; - } - goto out; - } + if (fa->fa_file != filp) + continue; + fa->fa_fd = fd; + kmem_cache_free(fasync_cache, new); + goto out; } - if (on) { - new->magic = FASYNC_MAGIC; - new->fa_file = filp; - new->fa_fd = fd; - new->fa_next = *fapp; - *fapp = new; - result = 1; - } + new->magic = FASYNC_MAGIC; + new->fa_file = filp; + new->fa_fd = fd; + new->fa_next = *fapp; + *fapp = new; + result = 1; + filp->f_flags |= FASYNC; + out: - if (on) - filp->f_flags |= FASYNC; - else - filp->f_flags &= ~FASYNC; write_unlock_irq(&fasync_lock); spin_unlock(&filp->f_lock); return result; } +/* + * fasync_helper() is used by almost all character device drivers + * to set up the fasync queue, and for regular files by the file + * lease code. It returns negative on error, 0 if it did no changes + * and positive if it added/deleted the entry. + */ +int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) +{ + if (!on) + return fasync_remove_entry(filp, fapp); + return fasync_add_entry(fd, filp, fapp); +} + EXPORT_SYMBOL(fasync_helper); void __kill_fasync(struct fasync_struct *fa, int sig, int band) diff --git a/fs/file_table.c b/fs/file_table.c index 4bef4c0..b98404b 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -21,9 +21,12 @@ #include <linux/fsnotify.h> #include <linux/sysctl.h> #include <linux/percpu_counter.h> +#include <linux/ima.h> #include <asm/atomic.h> +#include "internal.h" + /* sysctl tunables... */ struct files_stat_struct files_stat = { .max_files = NR_FILE @@ -147,8 +150,6 @@ fail: return NULL; } -EXPORT_SYMBOL(get_empty_filp); - /** * alloc_file - allocate and initialize a 'struct file' * @mnt: the vfsmount on which the file will reside @@ -164,8 +165,8 @@ EXPORT_SYMBOL(get_empty_filp); * If all the callers of init_file() are eliminated, its * code should be moved into this function. */ -struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry, - fmode_t mode, const struct file_operations *fop) +struct file *alloc_file(struct path *path, fmode_t mode, + const struct file_operations *fop) { struct file *file; @@ -173,35 +174,8 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry, if (!file) return NULL; - init_file(file, mnt, dentry, mode, fop); - return file; -} -EXPORT_SYMBOL(alloc_file); - -/** - * init_file - initialize a 'struct file' - * @file: the already allocated 'struct file' to initialized - * @mnt: the vfsmount on which the file resides - * @dentry: the dentry representing this file - * @mode: the mode the file is opened with - * @fop: the 'struct file_operations' for this file - * - * Use this instead of setting the members directly. Doing so - * avoids making mistakes like forgetting the mntget() or - * forgetting to take a write on the mnt. - * - * Note: This is a crappy interface. It is here to make - * merging with the existing users of get_empty_filp() - * who have complex failure logic easier. All users - * of this should be moving to alloc_file(). - */ -int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, - fmode_t mode, const struct file_operations *fop) -{ - int error = 0; - file->f_path.dentry = dentry; - file->f_path.mnt = mntget(mnt); - file->f_mapping = dentry->d_inode->i_mapping; + file->f_path = *path; + file->f_mapping = path->dentry->d_inode->i_mapping; file->f_mode = mode; file->f_op = fop; @@ -211,14 +185,14 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, * visible. We do this for consistency, and so * that we can do debugging checks at __fput() */ - if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { + if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { file_take_write(file); - error = mnt_clone_write(mnt); - WARN_ON(error); + WARN_ON(mnt_clone_write(path->mnt)); } - return error; + ima_counts_get(file); + return file; } -EXPORT_SYMBOL(init_file); +EXPORT_SYMBOL(alloc_file); void fput(struct file *file) { @@ -279,6 +253,7 @@ void __fput(struct file *file) if (file->f_op && file->f_op->release) file->f_op->release(inode, file); security_file_free(file); + ima_file_free(file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) cdev_put(inode->i_cdev); fops_put(file->f_op); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 49bc1b8..1a7c42c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -242,6 +242,7 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, /** * bdi_start_writeback - start writeback * @bdi: the backing device to write from + * @sb: write inodes from this super_block * @nr_pages: the number of pages to write * * Description: @@ -1187,6 +1188,23 @@ void writeback_inodes_sb(struct super_block *sb) EXPORT_SYMBOL(writeback_inodes_sb); /** + * writeback_inodes_sb_if_idle - start writeback if none underway + * @sb: the superblock + * + * Invoke writeback_inodes_sb if no writeback is currently underway. + * Returns 1 if writeback was started, 0 if not. + */ +int writeback_inodes_sb_if_idle(struct super_block *sb) +{ + if (!writeback_in_progress(sb->s_bdi)) { + writeback_inodes_sb(sb); + return 1; + } else + return 0; +} +EXPORT_SYMBOL(writeback_inodes_sb_if_idle); + +/** * sync_inodes_sb - sync sb inode pages * @sb: the superblock * diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c18913a..a9f5e13 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -828,6 +828,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (!page) break; + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + pagefault_disable(); tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); pagefault_enable(); diff --git a/fs/generic_acl.c b/fs/generic_acl.c index e0b53aa..5545803 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -1,62 +1,58 @@ /* - * fs/generic_acl.c - * * (C) 2005 Andreas Gruenbacher <agruen@suse.de> * * This file is released under the GPL. + * + * Generic ACL support for in-memory filesystems. */ #include <linux/sched.h> #include <linux/fs.h> #include <linux/generic_acl.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> -/** - * generic_acl_list - Generic xattr_handler->list() operation - * @ops: Filesystem specific getacl and setacl callbacks - */ -size_t -generic_acl_list(struct inode *inode, struct generic_acl_operations *ops, - int type, char *list, size_t list_size) + +static size_t +generic_acl_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { struct posix_acl *acl; - const char *name; + const char *xname; size_t size; - acl = ops->getacl(inode, type); + acl = get_cached_acl(dentry->d_inode, type); if (!acl) return 0; posix_acl_release(acl); - switch(type) { - case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; - break; - - case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; - break; - - default: - return 0; + switch (type) { + case ACL_TYPE_ACCESS: + xname = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + xname = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return 0; } - size = strlen(name) + 1; + size = strlen(xname) + 1; if (list && size <= list_size) - memcpy(list, name, size); + memcpy(list, xname, size); return size; } -/** - * generic_acl_get - Generic xattr_handler->get() operation - * @ops: Filesystem specific getacl and setacl callbacks - */ -int -generic_acl_get(struct inode *inode, struct generic_acl_operations *ops, - int type, void *buffer, size_t size) +static int +generic_acl_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { struct posix_acl *acl; int error; - acl = ops->getacl(inode, type); + if (strcmp(name, "") != 0) + return -EINVAL; + + acl = get_cached_acl(dentry->d_inode, type); if (!acl) return -ENODATA; error = posix_acl_to_xattr(acl, buffer, size); @@ -65,17 +61,16 @@ generic_acl_get(struct inode *inode, struct generic_acl_operations *ops, return error; } -/** - * generic_acl_set - Generic xattr_handler->set() operation - * @ops: Filesystem specific getacl and setacl callbacks - */ -int -generic_acl_set(struct inode *inode, struct generic_acl_operations *ops, - int type, const void *value, size_t size) +static int +generic_acl_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; struct posix_acl *acl = NULL; int error; + if (strcmp(name, "") != 0) + return -EINVAL; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; if (!is_owner_or_cap(inode)) @@ -91,28 +86,27 @@ generic_acl_set(struct inode *inode, struct generic_acl_operations *ops, error = posix_acl_valid(acl); if (error) goto failed; - switch(type) { - case ACL_TYPE_ACCESS: - mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - if (error < 0) - goto failed; - inode->i_mode = mode; - if (error == 0) { - posix_acl_release(acl); - acl = NULL; - } - break; - - case ACL_TYPE_DEFAULT: - if (!S_ISDIR(inode->i_mode)) { - error = -EINVAL; - goto failed; - } - break; + switch (type) { + case ACL_TYPE_ACCESS: + mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + goto failed; + inode->i_mode = mode; + if (error == 0) { + posix_acl_release(acl); + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + error = -EINVAL; + goto failed; + } + break; } } - ops->setacl(inode, type, acl); + set_cached_acl(inode, type, acl); error = 0; failed: posix_acl_release(acl); @@ -121,14 +115,12 @@ failed: /** * generic_acl_init - Take care of acl inheritance at @inode create time - * @ops: Filesystem specific getacl and setacl callbacks * * Files created inside a directory with a default ACL inherit the * directory's default ACL. */ int -generic_acl_init(struct inode *inode, struct inode *dir, - struct generic_acl_operations *ops) +generic_acl_init(struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; mode_t mode = inode->i_mode; @@ -136,7 +128,7 @@ generic_acl_init(struct inode *inode, struct inode *dir, inode->i_mode = mode & ~current_umask(); if (!S_ISLNK(inode->i_mode)) - acl = ops->getacl(dir, ACL_TYPE_DEFAULT); + acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); if (acl) { struct posix_acl *clone; @@ -145,7 +137,7 @@ generic_acl_init(struct inode *inode, struct inode *dir, error = -ENOMEM; if (!clone) goto cleanup; - ops->setacl(inode, ACL_TYPE_DEFAULT, clone); + set_cached_acl(inode, ACL_TYPE_DEFAULT, clone); posix_acl_release(clone); } clone = posix_acl_clone(acl, GFP_KERNEL); @@ -156,7 +148,7 @@ generic_acl_init(struct inode *inode, struct inode *dir, if (error >= 0) { inode->i_mode = mode; if (error > 0) - ops->setacl(inode, ACL_TYPE_ACCESS, clone); + set_cached_acl(inode, ACL_TYPE_ACCESS, clone); } posix_acl_release(clone); } @@ -169,20 +161,19 @@ cleanup: /** * generic_acl_chmod - change the access acl of @inode upon chmod() - * @ops: FIlesystem specific getacl and setacl callbacks * * A chmod also changes the permissions of the owner, group/mask, and * other ACL entries. */ int -generic_acl_chmod(struct inode *inode, struct generic_acl_operations *ops) +generic_acl_chmod(struct inode *inode) { struct posix_acl *acl, *clone; int error = 0; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - acl = ops->getacl(inode, ACL_TYPE_ACCESS); + acl = get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { clone = posix_acl_clone(acl, GFP_KERNEL); posix_acl_release(acl); @@ -190,8 +181,37 @@ generic_acl_chmod(struct inode *inode, struct generic_acl_operations *ops) return -ENOMEM; error = posix_acl_chmod_masq(clone, inode->i_mode); if (!error) - ops->setacl(inode, ACL_TYPE_ACCESS, clone); + set_cached_acl(inode, ACL_TYPE_ACCESS, clone); posix_acl_release(clone); } return error; } + +int +generic_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS); + + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + return -EAGAIN; +} + +struct xattr_handler generic_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = generic_acl_list, + .get = generic_acl_get, + .set = generic_acl_set, +}; + +struct xattr_handler generic_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = generic_acl_list, + .get = generic_acl_get, + .set = generic_acl_set, +}; diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index b192c66..4dcddf8 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -10,7 +10,6 @@ config GFS2_FS select SLOW_WORK select QUOTA select QUOTACTL - select FS_JOURNAL_INFO help A cluster filesystem. diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 3eb1ea8..87ee309 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -126,7 +126,7 @@ static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) error = posix_acl_to_xattr(acl, data, len); if (error < 0) goto out; - error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, data, len, 0); + error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); if (!error) set_cached_acl(inode, type, acl); out: @@ -232,9 +232,10 @@ static int gfs2_acl_type(const char *name) return -EINVAL; } -static int gfs2_xattr_system_get(struct inode *inode, const char *name, - void *buffer, size_t size) +static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int xtype) { + struct inode *inode = dentry->d_inode; struct posix_acl *acl; int type; int error; @@ -255,9 +256,11 @@ static int gfs2_xattr_system_get(struct inode *inode, const char *name, return error; } -static int gfs2_xattr_system_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + int xtype) { + struct inode *inode = dentry->d_inode; struct gfs2_sbd *sdp = GFS2_SB(inode); struct posix_acl *acl = NULL; int error = 0, type; @@ -319,7 +322,7 @@ static int gfs2_xattr_system_set(struct inode *inode, const char *name, } set_acl: - error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, 0); + error = __gfs2_xattr_set(inode, name, value, size, 0, GFS2_EATYPE_SYS); if (!error) { if (acl) set_cached_acl(inode, type, acl); @@ -334,6 +337,7 @@ out: struct xattr_handler gfs2_xattr_system_handler = { .prefix = XATTR_SYSTEM_PREFIX, + .flags = GFS2_EATYPE_SYS, .get = gfs2_xattr_system_get, .set = gfs2_xattr_system_set, }; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 6d47379..583e823 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -541,7 +541,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, *ptr++ = cpu_to_be64(bn++); break; } - } while (state != ALLOC_DATA); + } while ((state != ALLOC_DATA) || !dblock); ip->i_height = height; gfs2_add_inode_blocks(&ip->i_inode, alloced); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4eb308a..a6abbae8 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -569,6 +569,40 @@ static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) return ret; } +/** + * gfs2_file_aio_write - Perform a write to a file + * @iocb: The io context + * @iov: The data to write + * @nr_segs: Number of @iov segments + * @pos: The file position + * + * We have to do a lock/unlock here to refresh the inode size for + * O_APPEND writes, otherwise we can land up writing at the wrong + * offset. There is still a race, but provided the app is using its + * own file locking, this will make O_APPEND work as expected. + * + */ + +static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + + if (file->f_flags & O_APPEND) { + struct dentry *dentry = file->f_dentry; + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_holder gh; + int ret; + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + return ret; + gfs2_glock_dq_uninit(&gh); + } + + return generic_file_aio_write(iocb, iov, nr_segs, pos); +} + #ifdef CONFIG_GFS2_FS_LOCKING_DLM /** @@ -711,7 +745,7 @@ const struct file_operations gfs2_file_fops = { .read = do_sync_read, .aio_read = generic_file_aio_read, .write = do_sync_write, - .aio_write = generic_file_aio_write, + .aio_write = gfs2_file_aio_write, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, @@ -741,7 +775,7 @@ const struct file_operations gfs2_file_fops_nolock = { .read = do_sync_read, .aio_read = generic_file_aio_read, .write = do_sync_write, - .aio_write = generic_file_aio_write, + .aio_write = gfs2_file_aio_write, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f455a03..f426633 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -769,6 +769,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, if (!gl) return -ENOMEM; + atomic_inc(&sdp->sd_glock_disposal); gl->gl_flags = 0; gl->gl_name = name; atomic_set(&gl->gl_ref, 1); @@ -1538,6 +1539,9 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) up_write(&gfs2_umount_flush_sem); msleep(10); } + flush_workqueue(glock_workqueue); + wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); + gfs2_dump_lockstate(sdp); } void gfs2_glock_finish_truncate(struct gfs2_inode *ip) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 13f0bd2..c0262fa 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -123,7 +123,7 @@ struct lm_lockops { int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); void (*lm_unmount) (struct gfs2_sbd *sdp); void (*lm_withdraw) (struct gfs2_sbd *sdp); - void (*lm_put_lock) (struct kmem_cache *cachep, void *gl); + void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); unsigned int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, unsigned int flags); void (*lm_cancel) (struct gfs2_glock *gl); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 4792200..bc0ad15 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -544,6 +544,8 @@ struct gfs2_sbd { struct gfs2_holder sd_live_gh; struct gfs2_glock *sd_rename_gl; struct gfs2_glock *sd_trans_gl; + wait_queue_head_t sd_glock_wait; + atomic_t sd_glock_disposal; /* Inode Stuff */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 26ba2a4..6e220f4 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -125,7 +125,7 @@ static struct inode *gfs2_iget_skip(struct super_block *sb, * directory entry when gfs2_inode_lookup() is invoked. Part of the code * segment inside gfs2_inode_lookup code needs to get moved around. * - * Clean up I_LOCK and I_NEW as well. + * Clears I_NEW as well. **/ void gfs2_set_iop(struct inode *inode) @@ -801,7 +801,8 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) return err; } - err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0); + err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0, + GFS2_EATYPE_SECURITY); kfree(value); kfree(name); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 46df988..0e5e0e7 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -21,6 +21,7 @@ static void gdlm_ast(void *arg) { struct gfs2_glock *gl = arg; unsigned ret = gl->gl_state; + struct gfs2_sbd *sdp = gl->gl_sbd; BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); @@ -30,6 +31,8 @@ static void gdlm_ast(void *arg) switch (gl->gl_lksb.sb_status) { case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ kmem_cache_free(gfs2_glock_cachep, gl); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + wake_up(&sdp->sd_glock_wait); return; case -DLM_ECANCEL: /* Cancel while getting lock */ ret |= LM_OUT_CANCELED; @@ -164,14 +167,16 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl, return LM_OUT_ASYNC; } -static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr) +static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) { - struct gfs2_glock *gl = ptr; - struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; int error; if (gl->gl_lksb.sb_lkid == 0) { kmem_cache_free(cachep, gl); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + wake_up(&sdp->sd_glock_wait); return; } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index cb8d7a9..6f68a5f 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -121,7 +121,7 @@ struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp) if (aspace) { mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS); aspace->i_mapping->a_ops = &aspace_aops; - aspace->i_size = ~0ULL; + aspace->i_size = MAX_LFS_FILESIZE; ip = GFS2_I(aspace); clear_bit(GIF_USER, &ip->i_flags); insert_inode_hash(aspace); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index edfee24..a86ed63 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -82,6 +82,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) gfs2_tune_init(&sdp->sd_tune); + init_waitqueue_head(&sdp->sd_glock_wait); + atomic_set(&sdp->sd_glock_disposal, 0); spin_lock_init(&sdp->sd_statfs_spin); spin_lock_init(&sdp->sd_rindex_spin); @@ -723,7 +725,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) goto fail; } - error = -EINVAL; + error = -EUSERS; if (!gfs2_jindex_size(sdp)) { fs_err(sdp, "no journals!\n"); goto fail_jindex; @@ -983,9 +985,17 @@ static const match_table_t nolock_tokens = { { Opt_err, NULL }, }; +static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + kmem_cache_free(cachep, gl); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + wake_up(&sdp->sd_glock_wait); +} + static const struct lm_lockops nolock_ops = { .lm_proto_name = "lock_nolock", - .lm_put_lock = kmem_cache_free, + .lm_put_lock = nolock_put_lock, .lm_tokens = &nolock_tokens, }; diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 247436c..84350e1 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -748,7 +748,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; - int alloc_required; + int alloc_required = 0; unsigned int x; int error; @@ -867,7 +867,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock; } - alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); + if (nip == NULL) + alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); + error = alloc_required; if (error < 0) goto out_gunlock; error = 0; @@ -1086,7 +1088,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) error = vfs_follow_link(nd, buf); if (buf != array) kfree(buf); - } + } else + path_put(&nd->path); return ERR_PTR(error); } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 0608f49..503b842 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -591,11 +591,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip) u64 rgrp_count = ip->i_disksize; int error; - if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { - gfs2_consist_inode(ip); - return -EIO; - } - + do_div(rgrp_count, sizeof(struct gfs2_rindex)); clear_rgrpdi(sdp); file_ra_state_init(&ra_state, inode->i_mapping); @@ -915,7 +911,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) { BUG_ON(ip->i_alloc != NULL); - ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_KERNEL); + ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS); return ip->i_alloc; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index c282ad4..b9dd3da 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -21,6 +21,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/time.h> +#include <linux/wait.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 912f5cb..c2ebdf2c 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -567,18 +567,17 @@ out: /** * gfs2_xattr_get - Get a GFS2 extended attribute * @inode: The inode - * @type: The type of extended attribute * @name: The name of the extended attribute * @buffer: The buffer to write the result into * @size: The size of the buffer + * @type: The type of extended attribute * * Returns: actual size of data on success, -errno on error */ - -int gfs2_xattr_get(struct inode *inode, int type, const char *name, - void *buffer, size_t size) +static int gfs2_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { - struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); struct gfs2_ea_location el; int error; @@ -1119,7 +1118,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) /** * gfs2_xattr_remove - Remove a GFS2 extended attribute - * @inode: The inode + * @ip: The inode * @type: The type of the extended attribute * @name: The name of the extended attribute * @@ -1130,9 +1129,8 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) * Returns: 0, or errno on failure */ -static int gfs2_xattr_remove(struct inode *inode, int type, const char *name) +static int gfs2_xattr_remove(struct gfs2_inode *ip, int type, const char *name) { - struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_ea_location el; int error; @@ -1156,24 +1154,24 @@ static int gfs2_xattr_remove(struct inode *inode, int type, const char *name) } /** - * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute - * @inode: The inode - * @type: The type of the extended attribute + * __gfs2_xattr_set - Set (or remove) a GFS2 extended attribute + * @ip: The inode * @name: The name of the extended attribute * @value: The value of the extended attribute (NULL for remove) * @size: The size of the @value argument * @flags: Create or Replace + * @type: The type of the extended attribute * * See gfs2_xattr_remove() for details of the removal of xattrs. * * Returns: 0 or errno on failure */ -int gfs2_xattr_set(struct inode *inode, int type, const char *name, - const void *value, size_t size, int flags) +int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags, int type) { - struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_ea_location el; unsigned int namel = strlen(name); int error; @@ -1184,7 +1182,7 @@ int gfs2_xattr_set(struct inode *inode, int type, const char *name, return -ERANGE; if (value == NULL) - return gfs2_xattr_remove(inode, type, name); + return gfs2_xattr_remove(ip, type, name); if (ea_check_size(sdp, namel, size)) return -ERANGE; @@ -1224,6 +1222,13 @@ int gfs2_xattr_set(struct inode *inode, int type, const char *name, return error; } +static int gfs2_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + return __gfs2_xattr_set(dentry->d_inode, name, value, + size, flags, type); +} + static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, char *data) { @@ -1291,6 +1296,7 @@ fail: int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_ea_location el; struct buffer_head *dibh; int error; @@ -1300,16 +1306,17 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) return error; if (GFS2_EA_IS_STUFFED(el.el_ea)) { - error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); - if (error) - return error; - - gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1); - memcpy(GFS2_EA2DATA(el.el_ea), data, - GFS2_EA_DATA_LEN(el.el_ea)); - } else + error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0); + if (error == 0) { + gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1); + memcpy(GFS2_EA2DATA(el.el_ea), data, + GFS2_EA_DATA_LEN(el.el_ea)); + } + } else { error = ea_acl_chmod_unstuffed(ip, el.el_ea, data); + } + brelse(el.el_bh); if (error) return error; @@ -1322,8 +1329,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) brelse(dibh); } - gfs2_trans_end(GFS2_SB(&ip->i_inode)); - + gfs2_trans_end(sdp); return error; } @@ -1529,40 +1535,18 @@ out_alloc: return error; } -static int gfs2_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size); -} - -static int gfs2_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags); -} - -static int gfs2_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size); -} - -static int gfs2_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags); -} - static struct xattr_handler gfs2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, - .get = gfs2_xattr_user_get, - .set = gfs2_xattr_user_set, + .flags = GFS2_EATYPE_USR, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, }; static struct xattr_handler gfs2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .get = gfs2_xattr_security_get, - .set = gfs2_xattr_security_set, + .flags = GFS2_EATYPE_SECURITY, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, }; struct xattr_handler *gfs2_xattr_handlers[] = { diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h index 8d6ae58..d392f83 100644 --- a/fs/gfs2/xattr.h +++ b/fs/gfs2/xattr.h @@ -53,10 +53,9 @@ struct gfs2_ea_location { struct gfs2_ea_header *el_prev; }; -extern int gfs2_xattr_get(struct inode *inode, int type, const char *name, - void *buffer, size_t size); -extern int gfs2_xattr_set(struct inode *inode, int type, const char *name, - const void *value, size_t size, int flags); +extern int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, int type); extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); extern int gfs2_ea_dealloc(struct gfs2_inode *ip); diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index a5089a6..7239efc 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -646,22 +646,27 @@ static const struct super_operations hppfs_sbops = { static int hppfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct dentry *proc_dentry; - - proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer, buflen); } static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct dentry *proc_dentry; - - proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); } +static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + + if (proc_dentry->d_inode->i_op->put_link) + proc_dentry->d_inode->i_op->put_link(proc_dentry, nd, cookie); +} + static const struct inode_operations hppfs_dir_iops = { .lookup = hppfs_lookup, }; @@ -669,6 +674,7 @@ static const struct inode_operations hppfs_dir_iops = { static const struct inode_operations hppfs_link_iops = { .readlink = hppfs_readlink, .follow_link = hppfs_follow_link, + .put_link = hppfs_put_link, }; static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 87a1258..a0bbd3d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -30,7 +30,6 @@ #include <linux/dnotify.h> #include <linux/statfs.h> #include <linux/security.h> -#include <linux/ima.h> #include <linux/magic.h> #include <asm/uaccess.h> @@ -922,7 +921,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, int error = -ENOMEM; struct file *file; struct inode *inode; - struct dentry *dentry, *root; + struct path path; + struct dentry *root; struct qstr quick_string; *user = NULL; @@ -944,10 +944,11 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, quick_string.name = name; quick_string.len = strlen(quick_string.name); quick_string.hash = 0; - dentry = d_alloc(root, &quick_string); - if (!dentry) + path.dentry = d_alloc(root, &quick_string); + if (!path.dentry) goto out_shm_unlock; + path.mnt = mntget(hugetlbfs_vfsmount); error = -ENOSPC; inode = hugetlbfs_get_inode(root->d_sb, current_fsuid(), current_fsgid(), S_IFREG | S_IRWXUGO, 0); @@ -960,24 +961,22 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, acctflag)) goto out_inode; - d_instantiate(dentry, inode); + d_instantiate(path.dentry, inode); inode->i_size = size; inode->i_nlink = 0; error = -ENFILE; - file = alloc_file(hugetlbfs_vfsmount, dentry, - FMODE_WRITE | FMODE_READ, + file = alloc_file(&path, FMODE_WRITE | FMODE_READ, &hugetlbfs_file_operations); if (!file) goto out_dentry; /* inode is already attached */ - ima_counts_get(file); return file; out_inode: iput(inode); out_dentry: - dput(dentry); + path_put(&path); out_shm_unlock: if (*user) { user_shm_unlock(size, *user); @@ -113,7 +113,7 @@ static void wake_up_inode(struct inode *inode) * Prevent speculative execution through spin_unlock(&inode_lock); */ smp_mb(); - wake_up_bit(&inode->i_state, __I_LOCK); + wake_up_bit(&inode->i_state, __I_NEW); } /** @@ -690,17 +690,17 @@ void unlock_new_inode(struct inode *inode) } #endif /* - * This is special! We do not need the spinlock when clearing I_LOCK, + * This is special! We do not need the spinlock when clearing I_NEW, * because we're guaranteed that nobody else tries to do anything about * the state of the inode when it is locked, as we just created it (so - * there can be no old holders that haven't tested I_LOCK). + * there can be no old holders that haven't tested I_NEW). * However we must emit the memory barrier so that other CPUs reliably - * see the clearing of I_LOCK after the other inode initialisation has + * see the clearing of I_NEW after the other inode initialisation has * completed. */ smp_mb(); - WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW)); - inode->i_state &= ~(I_LOCK|I_NEW); + WARN_ON(!(inode->i_state & I_NEW)); + inode->i_state &= ~I_NEW; wake_up_inode(inode); } EXPORT_SYMBOL(unlock_new_inode); @@ -731,7 +731,7 @@ static struct inode *get_new_inode(struct super_block *sb, goto set_failed; __inode_add_to_lists(sb, head, inode); - inode->i_state = I_LOCK|I_NEW; + inode->i_state = I_NEW; spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the @@ -778,7 +778,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb, if (!old) { inode->i_ino = ino; __inode_add_to_lists(sb, head, inode); - inode->i_state = I_LOCK|I_NEW; + inode->i_state = I_NEW; spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the @@ -1083,7 +1083,7 @@ int insert_inode_locked(struct inode *inode) ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); - inode->i_state |= I_LOCK|I_NEW; + inode->i_state |= I_NEW; while (1) { struct hlist_node *node; struct inode *old = NULL; @@ -1120,7 +1120,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, struct super_block *sb = inode->i_sb; struct hlist_head *head = inode_hashtable + hash(sb, hashval); - inode->i_state |= I_LOCK|I_NEW; + inode->i_state |= I_NEW; while (1) { struct hlist_node *node; @@ -1510,7 +1510,7 @@ EXPORT_SYMBOL(inode_wait); * until the deletion _might_ have completed. Callers are responsible * to recheck inode state. * - * It doesn't matter if I_LOCK is not set initially, a call to + * It doesn't matter if I_NEW is not set initially, a call to * wake_up_inode() after removing from the hash list will DTRT. * * This is called with inode_lock held. @@ -1518,8 +1518,8 @@ EXPORT_SYMBOL(inode_wait); static void __wait_on_freeing_inode(struct inode *inode) { wait_queue_head_t *wq; - DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK); - wq = bit_waitqueue(&inode->i_state, __I_LOCK); + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); spin_unlock(&inode_lock); schedule(); diff --git a/fs/internal.h b/fs/internal.h index 515175b..e96a166 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -79,8 +79,16 @@ extern void chroot_fs_refs(struct path *, struct path *); * file_table.c */ extern void mark_files_ro(struct super_block *); +extern struct file *get_empty_filp(void); /* * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); + +/* + * open.c + */ +struct nameidata; +extern struct file *nameidata_to_filp(struct nameidata *); +extern void release_open_intent(struct nameidata *); diff --git a/fs/isofs/export.c b/fs/isofs/export.c index e81a305..ed752cb 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -9,7 +9,7 @@ * * The following files are helpful: * - * Documentation/filesystems/Exporting + * Documentation/filesystems/nfs/Exporting * fs/exportfs/expfs.c. */ diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig index a840898..4e28bee 100644 --- a/fs/jbd/Kconfig +++ b/fs/jbd/Kconfig @@ -1,6 +1,5 @@ config JBD tristate - select FS_JOURNAL_INFO help This is a generic journalling layer for block devices. It is currently used by the ext3 file system, but it could also be diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 4160afa..bd224ee 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1913,7 +1913,7 @@ static void __init jbd_create_debugfs_entry(void) { jbd_debugfs_dir = debugfs_create_dir("jbd", NULL); if (jbd_debugfs_dir) - jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO, + jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR, jbd_debugfs_dir, &journal_enable_debug); } diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig index 0f7d1ce..f32f346 100644 --- a/fs/jbd2/Kconfig +++ b/fs/jbd2/Kconfig @@ -1,7 +1,6 @@ config JBD2 tristate select CRC32 - select FS_JOURNAL_INFO help This is a generic journaling layer for block devices that support both 32-bit and 64-bit block numbers. It is currently used by diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index ca0f5eb..8868493 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -22,6 +22,7 @@ #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/slab.h> +#include <linux/blkdev.h> #include <trace/events/jbd2.h> /* @@ -515,6 +516,20 @@ int jbd2_cleanup_journal_tail(journal_t *journal) journal->j_tail_sequence = first_tid; journal->j_tail = blocknr; spin_unlock(&journal->j_state_lock); + + /* + * If there is an external journal, we need to make sure that + * any data blocks that were recently written out --- perhaps + * by jbd2_log_do_checkpoint() --- are flushed out before we + * drop the transactions from the external journal. It's + * unlikely this will be necessary, especially with a + * appropriately sized journal, but we need this to guarantee + * correctness. Fortunately jbd2_cleanup_journal_tail() + * doesn't get called all that often. + */ + if ((journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(journal->j_fs_dev, NULL); if (!(journal->j_flags & JBD2_ABORT)) jbd2_journal_update_superblock(journal, 1); return 0; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6a10238..1bc74b6 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -259,6 +259,7 @@ static int journal_submit_data_buffers(journal_t *journal, ret = err; spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); + commit_transaction->t_flushed_data_blocks = 1; jinode->i_flags &= ~JI_COMMIT_RUNNING; wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } @@ -708,8 +709,17 @@ start_journal_io: } } - /* Done it all: now write the commit record asynchronously. */ + /* + * If the journal is not located on the file system device, + * then we must flush the file system device before we issue + * the commit record + */ + if (commit_transaction->t_flushed_data_blocks && + (journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(journal->j_fs_dev, NULL); + /* Done it all: now write the commit record asynchronously. */ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { err = journal_submit_commit_record(journal, commit_transaction, @@ -720,13 +730,6 @@ start_journal_io: blkdev_issue_flush(journal->j_dev, NULL); } - /* - * This is the right place to wait for data buffers both for ASYNC - * and !ASYNC commit. If commit is ASYNC, we need to wait only after - * the commit block went to disk (which happens above). If commit is - * SYNC, we need to wait for data buffers before we start writing - * commit block, which happens below in such setting. - */ err = journal_finish_inode_data_buffers(journal, commit_transaction); if (err) { printk(KERN_WARNING diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b7ca3a9..ac0d027 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -814,7 +814,7 @@ static journal_t * journal_init_common (void) journal_t *journal; int err; - journal = kzalloc(sizeof(*journal), GFP_KERNEL|__GFP_NOFAIL); + journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) goto fail; @@ -2115,7 +2115,8 @@ static void __init jbd2_create_debugfs_entry(void) { jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL); if (jbd2_debugfs_dir) - jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, S_IRUGO, + jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, + S_IRUGO | S_IWUSR, jbd2_debugfs_dir, &jbd2_journal_enable_debug); } diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 7edb62e..7cdc319 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -350,8 +350,8 @@ int jffs2_acl_chmod(struct inode *inode) return rc; } -static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t jffs2_acl_access_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) { const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS); @@ -360,8 +360,8 @@ static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t return retlen; } -static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t jffs2_acl_default_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) { const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT); @@ -370,12 +370,16 @@ static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_ return retlen; } -static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_t size) +static int jffs2_acl_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { struct posix_acl *acl; int rc; - acl = jffs2_get_acl(inode, type); + if (name[0] != '\0') + return -EINVAL; + + acl = jffs2_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (!acl) @@ -386,26 +390,15 @@ static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_ return rc; } -static int jffs2_acl_access_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) -{ - if (name[0] != '\0') - return -EINVAL; - return jffs2_acl_getxattr(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int jffs2_acl_default_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) -{ - if (name[0] != '\0') - return -EINVAL; - return jffs2_acl_getxattr(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value, size_t size) +static int jffs2_acl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { struct posix_acl *acl; int rc; - if (!is_owner_or_cap(inode)) + if (name[0] != '\0') + return -EINVAL; + if (!is_owner_or_cap(dentry->d_inode)) return -EPERM; if (value) { @@ -420,38 +413,24 @@ static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value, } else { acl = NULL; } - rc = jffs2_set_acl(inode, type, acl); + rc = jffs2_set_acl(dentry->d_inode, type, acl); out: posix_acl_release(acl); return rc; } -static int jffs2_acl_access_setxattr(struct inode *inode, const char *name, - const void *buffer, size_t size, int flags) -{ - if (name[0] != '\0') - return -EINVAL; - return jffs2_acl_setxattr(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int jffs2_acl_default_setxattr(struct inode *inode, const char *name, - const void *buffer, size_t size, int flags) -{ - if (name[0] != '\0') - return -EINVAL; - return jffs2_acl_setxattr(inode, ACL_TYPE_DEFAULT, buffer, size); -} - struct xattr_handler jffs2_acl_access_xattr_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_DEFAULT, .list = jffs2_acl_access_listxattr, - .get = jffs2_acl_access_getxattr, - .set = jffs2_acl_access_setxattr, + .get = jffs2_acl_getxattr, + .set = jffs2_acl_setxattr, }; struct xattr_handler jffs2_acl_default_xattr_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, .list = jffs2_acl_default_listxattr, - .get = jffs2_acl_default_getxattr, - .set = jffs2_acl_default_setxattr, + .get = jffs2_acl_getxattr, + .set = jffs2_acl_setxattr, }; diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index 02c39c6..eaccee0 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -44,26 +44,28 @@ int jffs2_init_security(struct inode *inode, struct inode *dir) } /* ---- XATTR Handler for "security.*" ----------------- */ -static int jffs2_security_getxattr(struct inode *inode, const char *name, - void *buffer, size_t size) +static int jffs2_security_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size); + return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + name, buffer, size); } -static int jffs2_security_setxattr(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +static int jffs2_security_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); + return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + name, buffer, size, flags); } -static size_t jffs2_security_listxattr(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t jffs2_security_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) { size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1; diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 4b10788..9e75c62c 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -990,9 +990,11 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) if (!xhandle) continue; if (buffer) { - rc = xhandle->list(inode, buffer+len, size-len, xd->xname, xd->name_len); + rc = xhandle->list(dentry, buffer+len, size-len, + xd->xname, xd->name_len, xd->flags); } else { - rc = xhandle->list(inode, NULL, 0, xd->xname, xd->name_len); + rc = xhandle->list(dentry, NULL, 0, xd->xname, + xd->name_len, xd->flags); } if (rc < 0) goto out; diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 8ec5765..3e5a5e3 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -16,24 +16,26 @@ #include <linux/mtd/mtd.h> #include "nodelist.h" -static int jffs2_trusted_getxattr(struct inode *inode, const char *name, - void *buffer, size_t size) +static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size); + return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + name, buffer, size); } -static int jffs2_trusted_setxattr(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); + return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + name, buffer, size, flags); } -static size_t jffs2_trusted_listxattr(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) { size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1; diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index 8bbeab9..8544af6 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -16,24 +16,26 @@ #include <linux/mtd/mtd.h> #include "nodelist.h" -static int jffs2_user_getxattr(struct inode *inode, const char *name, - void *buffer, size_t size) +static int jffs2_user_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size); + return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_USER, + name, buffer, size); } -static int jffs2_user_setxattr(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +static int jffs2_user_setxattr(struct dentry *dentry, const char *name, + const void *buffer, size_t size, int flags, int type) { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size, flags); + return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_USER, + name, buffer, size, flags); } -static size_t jffs2_user_listxattr(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t jffs2_user_listxattr(struct dentry *dentry, char *list, + size_t list_size, const char *name, size_t name_len, int type) { size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1; diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index f26e4d0..d945ea7 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1292,7 +1292,7 @@ int txCommit(tid_t tid, /* transaction identifier */ */ /* * I believe this code is no longer needed. Splitting I_LOCK - * into two bits, I_LOCK and I_SYNC should prevent this + * into two bits, I_NEW and I_SYNC should prevent this * deadlock as well. But since I don't have a JFS testload * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done. * Joern diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2234c73..d929a82 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -524,7 +524,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) * Page cache is indexed by long. * I would use MAX_LFS_FILESIZE, but it's only half as big */ - sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, sb->s_maxbytes); + sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, (u64)sb->s_maxbytes); #endif sb->s_time_gran = 1; return 0; @@ -848,7 +848,6 @@ EXPORT_SYMBOL(simple_write_end); EXPORT_SYMBOL(simple_dir_inode_operations); EXPORT_SYMBOL(simple_dir_operations); EXPORT_SYMBOL(simple_empty); -EXPORT_SYMBOL(d_alloc_name); EXPORT_SYMBOL(simple_fill_super); EXPORT_SYMBOL(simple_getattr); EXPORT_SYMBOL(simple_link); diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index bd173a6..a7966ee 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -11,10 +11,6 @@ #include <linux/time.h> #include <linux/slab.h> #include <linux/smp_lock.h> -#include <linux/in.h> -#include <linux/sunrpc/svc.h> -#include <linux/sunrpc/clnt.h> -#include <linux/nfsd/nfsd.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index e1d28dd..56c9519 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -11,10 +11,6 @@ #include <linux/time.h> #include <linux/slab.h> #include <linux/smp_lock.h> -#include <linux/in.h> -#include <linux/sunrpc/svc.h> -#include <linux/sunrpc/clnt.h> -#include <linux/nfsd/nfsd.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> @@ -35,7 +35,7 @@ #include <linux/fs_struct.h> #include <asm/uaccess.h> -#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) +#include "internal.h" /* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) @@ -108,8 +108,6 @@ * any extra contention... */ -static int __link_path_walk(const char *name, struct nameidata *nd); - /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. @@ -234,6 +232,7 @@ int generic_permission(struct inode *inode, int mask, /* * Searching includes executable on directories, else just read. */ + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) if (capable(CAP_DAC_READ_SEARCH)) return 0; @@ -414,36 +413,55 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd) } /* - * Internal lookup() using the new generic dcache. - * SMP-safe + * force_reval_path - force revalidation of a dentry + * + * In some situations the path walking code will trust dentries without + * revalidating them. This causes problems for filesystems that depend on + * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set + * (which indicates that it's possible for the dentry to go stale), force + * a d_revalidate call before proceeding. + * + * Returns 0 if the revalidation was successful. If the revalidation fails, + * either return the error returned by d_revalidate or -ESTALE if the + * revalidation it just returned 0. If d_revalidate returns 0, we attempt to + * invalidate the dentry. It's up to the caller to handle putting references + * to the path if necessary. */ -static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd) +static int +force_reval_path(struct path *path, struct nameidata *nd) { - struct dentry * dentry = __d_lookup(parent, name); + int status; + struct dentry *dentry = path->dentry; - /* lockess __d_lookup may fail due to concurrent d_move() - * in some unrelated directory, so try with d_lookup + /* + * only check on filesystems where it's possible for the dentry to + * become stale. It's assumed that if this flag is set then the + * d_revalidate op will also be defined. */ - if (!dentry) - dentry = d_lookup(parent, name); + if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) + return 0; - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) - dentry = do_revalidate(dentry, nd); + status = dentry->d_op->d_revalidate(dentry, nd); + if (status > 0) + return 0; - return dentry; + if (!status) { + d_invalidate(dentry); + status = -ESTALE; + } + return status; } /* - * Short-cut version of permission(), for calling by - * path_walk(), when dcache lock is held. Combines parts - * of permission() and generic_permission(), and tests ONLY for - * MAY_EXEC permission. + * Short-cut version of permission(), for calling on directories + * during pathname resolution. Combines parts of permission() + * and generic_permission(), and tests ONLY for MAY_EXEC permission. * * If appropriate, check DAC only. If not appropriate, or - * short-cut DAC fails, then call permission() to do more + * short-cut DAC fails, then call ->permission() to do more * complete permission check. */ -static int exec_permission_lite(struct inode *inode) +static int exec_permission(struct inode *inode) { int ret; @@ -465,99 +483,6 @@ ok: return security_inode_permission(inode, MAY_EXEC); } -/* - * This is called when everything else fails, and we actually have - * to go to the low-level filesystem to find out what we should do.. - * - * We get the directory semaphore, and after getting that we also - * make sure that nobody added the entry to the dcache in the meantime.. - * SMP-safe - */ -static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd) -{ - struct dentry * result; - struct inode *dir = parent->d_inode; - - mutex_lock(&dir->i_mutex); - /* - * First re-do the cached lookup just in case it was created - * while we waited for the directory semaphore.. - * - * FIXME! This could use version numbering or similar to - * avoid unnecessary cache lookups. - * - * The "dcache_lock" is purely to protect the RCU list walker - * from concurrent renames at this point (we mustn't get false - * negatives from the RCU list walk here, unlike the optimistic - * fast walk). - * - * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup - */ - result = d_lookup(parent, name); - if (!result) { - struct dentry *dentry; - - /* Don't create child dentry for a dead directory. */ - result = ERR_PTR(-ENOENT); - if (IS_DEADDIR(dir)) - goto out_unlock; - - dentry = d_alloc(parent, name); - result = ERR_PTR(-ENOMEM); - if (dentry) { - result = dir->i_op->lookup(dir, dentry, nd); - if (result) - dput(dentry); - else - result = dentry; - } -out_unlock: - mutex_unlock(&dir->i_mutex); - return result; - } - - /* - * Uhhuh! Nasty case: the cache was re-populated while - * we waited on the semaphore. Need to revalidate. - */ - mutex_unlock(&dir->i_mutex); - if (result->d_op && result->d_op->d_revalidate) { - result = do_revalidate(result, nd); - if (!result) - result = ERR_PTR(-ENOENT); - } - return result; -} - -/* - * Wrapper to retry pathname resolution whenever the underlying - * file system returns an ESTALE. - * - * Retry the whole path once, forcing real lookup requests - * instead of relying on the dcache. - */ -static __always_inline int link_path_walk(const char *name, struct nameidata *nd) -{ - struct path save = nd->path; - int result; - - /* make sure the stuff we saved doesn't go away */ - path_get(&save); - - result = __link_path_walk(name, nd); - if (result == -ESTALE) { - /* nd->path had been dropped */ - nd->path = save; - path_get(&nd->path); - nd->flags |= LOOKUP_REVAL; - result = __link_path_walk(name, nd); - } - - path_put(&save); - - return result; -} - static __always_inline void set_root(struct nameidata *nd) { if (!nd->root.mnt) { @@ -569,6 +494,8 @@ static __always_inline void set_root(struct nameidata *nd) } } +static int link_path_walk(const char *, struct nameidata *); + static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) { int res = 0; @@ -634,6 +561,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata dget(dentry); } mntget(path->mnt); + nd->last_type = LAST_BIND; cookie = dentry->d_inode->i_op->follow_link(dentry, nd); error = PTR_ERR(cookie); if (!IS_ERR(cookie)) { @@ -641,11 +569,14 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata error = 0; if (s) error = __vfs_follow_link(nd, s); + else if (nd->last_type == LAST_BIND) { + error = force_reval_path(&nd->path, nd); + if (error) + path_put(&nd->path); + } if (dentry->d_inode->i_op->put_link) dentry->d_inode->i_op->put_link(dentry, nd, cookie); } - path_put(path); - return error; } @@ -672,6 +603,7 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd) current->total_link_count++; nd->depth++; err = __do_follow_link(path, nd); + path_put(path); current->link_count--; nd->depth--; return err; @@ -797,8 +729,19 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, struct path *path) { struct vfsmount *mnt = nd->path.mnt; - struct dentry *dentry = __d_lookup(nd->path.dentry, name); + struct dentry *dentry, *parent; + struct inode *dir; + /* + * See if the low-level filesystem might want + * to use its own hash.. + */ + if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { + int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name); + if (err < 0) + return err; + } + dentry = __d_lookup(nd->path.dentry, name); if (!dentry) goto need_lookup; if (dentry->d_op && dentry->d_op->d_revalidate) @@ -810,7 +753,59 @@ done: return 0; need_lookup: - dentry = real_lookup(nd->path.dentry, name, nd); + parent = nd->path.dentry; + dir = parent->d_inode; + + mutex_lock(&dir->i_mutex); + /* + * First re-do the cached lookup just in case it was created + * while we waited for the directory semaphore.. + * + * FIXME! This could use version numbering or similar to + * avoid unnecessary cache lookups. + * + * The "dcache_lock" is purely to protect the RCU list walker + * from concurrent renames at this point (we mustn't get false + * negatives from the RCU list walk here, unlike the optimistic + * fast walk). + * + * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup + */ + dentry = d_lookup(parent, name); + if (!dentry) { + struct dentry *new; + + /* Don't create child dentry for a dead directory. */ + dentry = ERR_PTR(-ENOENT); + if (IS_DEADDIR(dir)) + goto out_unlock; + + new = d_alloc(parent, name); + dentry = ERR_PTR(-ENOMEM); + if (new) { + dentry = dir->i_op->lookup(dir, new, nd); + if (dentry) + dput(new); + else + dentry = new; + } +out_unlock: + mutex_unlock(&dir->i_mutex); + if (IS_ERR(dentry)) + goto fail; + goto done; + } + + /* + * Uhhuh! Nasty case: the cache was re-populated while + * we waited on the semaphore. Need to revalidate. + */ + mutex_unlock(&dir->i_mutex); + if (dentry->d_op && dentry->d_op->d_revalidate) { + dentry = do_revalidate(dentry, nd); + if (!dentry) + dentry = ERR_PTR(-ENOENT); + } if (IS_ERR(dentry)) goto fail; goto done; @@ -828,6 +823,17 @@ fail: } /* + * This is a temporary kludge to deal with "automount" symlinks; proper + * solution is to trigger them on follow_mount(), so that do_lookup() + * would DTRT. To be killed before 2.6.34-final. + */ +static inline int follow_on_final(struct inode *inode, unsigned lookup_flags) +{ + return inode && unlikely(inode->i_op->follow_link) && + ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode)); +} + +/* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. @@ -835,7 +841,7 @@ fail: * Returns 0 and nd will have valid dentry and mnt on success. * Returns error and drops reference to input namei data on failure. */ -static int __link_path_walk(const char *name, struct nameidata *nd) +static int link_path_walk(const char *name, struct nameidata *nd) { struct path next; struct inode *inode; @@ -858,7 +864,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd) unsigned int c; nd->flags |= LOOKUP_CONTINUE; - err = exec_permission_lite(inode); + err = exec_permission(inode); if (err) break; @@ -898,16 +904,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd) case 1: continue; } - /* - * See if the low-level filesystem might want - * to use its own hash.. - */ - if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { - err = nd->path.dentry->d_op->d_hash(nd->path.dentry, - &this); - if (err < 0) - break; - } /* This does the actual lookups.. */ err = do_lookup(nd, &this, &next); if (err) @@ -953,18 +949,11 @@ last_component: case 1: goto return_reval; } - if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { - err = nd->path.dentry->d_op->d_hash(nd->path.dentry, - &this); - if (err < 0) - break; - } err = do_lookup(nd, &this, &next); if (err) break; inode = next.dentry->d_inode; - if ((lookup_flags & LOOKUP_FOLLOW) - && inode && inode->i_op->follow_link) { + if (follow_on_final(inode, lookup_flags)) { err = do_follow_link(&next, nd); if (err) goto return_err; @@ -1017,8 +1006,27 @@ return_err: static int path_walk(const char *name, struct nameidata *nd) { + struct path save = nd->path; + int result; + current->total_link_count = 0; - return link_path_walk(name, nd); + + /* make sure the stuff we saved doesn't go away */ + path_get(&save); + + result = link_path_walk(name, nd); + if (result == -ESTALE) { + /* nd->path had been dropped */ + current->total_link_count = 0; + nd->path = save; + path_get(&nd->path); + nd->flags |= LOOKUP_REVAL; + result = link_path_walk(name, nd); + } + + path_put(&save); + + return result; } static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) @@ -1141,36 +1149,6 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, return retval; } -/** - * path_lookup_open - lookup a file path with open intent - * @dfd: the directory to use as base, or AT_FDCWD - * @name: pointer to file name - * @lookup_flags: lookup intent flags - * @nd: pointer to nameidata - * @open_flags: open intent flags - */ -static int path_lookup_open(int dfd, const char *name, - unsigned int lookup_flags, struct nameidata *nd, int open_flags) -{ - struct file *filp = get_empty_filp(); - int err; - - if (filp == NULL) - return -ENFILE; - nd->intent.open.file = filp; - nd->intent.open.flags = open_flags; - nd->intent.open.create_mode = 0; - err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd); - if (IS_ERR(nd->intent.open.file)) { - if (err == 0) { - err = PTR_ERR(nd->intent.open.file); - path_put(&nd->path); - } - } else if (err != 0) - release_open_intent(nd); - return err; -} - static struct dentry *__lookup_hash(struct qstr *name, struct dentry *base, struct nameidata *nd) { @@ -1191,7 +1169,17 @@ static struct dentry *__lookup_hash(struct qstr *name, goto out; } - dentry = cached_lookup(base, name, nd); + dentry = __d_lookup(base, name); + + /* lockess __d_lookup may fail due to concurrent d_move() + * in some unrelated directory, so try with d_lookup + */ + if (!dentry) + dentry = d_lookup(base, name); + + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) + dentry = do_revalidate(dentry, nd); + if (!dentry) { struct dentry *new; @@ -1223,7 +1211,7 @@ static struct dentry *lookup_hash(struct nameidata *nd) { int err; - err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC); + err = exec_permission(nd->path.dentry->d_inode); if (err) return ERR_PTR(err); return __lookup_hash(&nd->last, nd->path.dentry, nd); @@ -1273,7 +1261,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) if (err) return ERR_PTR(err); - err = inode_permission(base->d_inode, MAY_EXEC); + err = exec_permission(base->d_inode); if (err) return ERR_PTR(err); return __lookup_hash(&this, base, NULL); @@ -1511,69 +1499,45 @@ int may_open(struct path *path, int acc_mode, int flag) if (error) return error; - error = ima_path_check(path, acc_mode ? - acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : - ACC_MODE(flag) & (MAY_READ | MAY_WRITE), - IMA_COUNT_UPDATE); - - if (error) - return error; /* * An append-only file must be opened in append mode for writing. */ if (IS_APPEND(inode)) { - error = -EPERM; if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) - goto err_out; + return -EPERM; if (flag & O_TRUNC) - goto err_out; + return -EPERM; } /* O_NOATIME can only be set by the owner or superuser */ - if (flag & O_NOATIME) - if (!is_owner_or_cap(inode)) { - error = -EPERM; - goto err_out; - } + if (flag & O_NOATIME && !is_owner_or_cap(inode)) + return -EPERM; /* * Ensure there are no outstanding leases on the file. */ - error = break_lease(inode, flag); - if (error) - goto err_out; - - if (flag & O_TRUNC) { - error = get_write_access(inode); - if (error) - goto err_out; - - /* - * Refuse to truncate files with mandatory locks held on them. - */ - error = locks_verify_locked(inode); - if (!error) - error = security_path_truncate(path, 0, - ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); - if (!error) { - vfs_dq_init(inode); - - error = do_truncate(dentry, 0, - ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, - NULL); - } - put_write_access(inode); - if (error) - goto err_out; - } else - if (flag & FMODE_WRITE) - vfs_dq_init(inode); + return break_lease(inode, flag); +} - return 0; -err_out: - ima_counts_put(path, acc_mode ? - acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : - ACC_MODE(flag) & (MAY_READ | MAY_WRITE)); +static int handle_truncate(struct path *path) +{ + struct inode *inode = path->dentry->d_inode; + int error = get_write_access(inode); + if (error) + return error; + /* + * Refuse to truncate files with mandatory locks held on them. + */ + error = locks_verify_locked(inode); + if (!error) + error = security_path_truncate(path, 0, + ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); + if (!error) { + error = do_truncate(path->dentry, 0, + ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, + NULL); + } + put_write_access(inode); return error; } @@ -1628,7 +1592,7 @@ static inline int open_to_namei_flags(int flag) return flag; } -static int open_will_write_to_fs(int flag, struct inode *inode) +static int open_will_truncate(int flag, struct inode *inode) { /* * We'll never write to the fs underlying @@ -1653,8 +1617,9 @@ struct file *do_filp_open(int dfd, const char *pathname, struct path path; struct dentry *dir; int count = 0; - int will_write; + int will_truncate; int flag = open_to_namei_flags(open_flag); + int force_reval = 0; /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only @@ -1666,7 +1631,7 @@ struct file *do_filp_open(int dfd, const char *pathname, open_flag |= O_DSYNC; if (!acc_mode) - acc_mode = MAY_OPEN | ACC_MODE(flag); + acc_mode = MAY_OPEN | ACC_MODE(open_flag); /* O_TRUNC implies we need access checks for write permissions */ if (flag & O_TRUNC) @@ -1681,8 +1646,23 @@ struct file *do_filp_open(int dfd, const char *pathname, * The simplest case - just a plain lookup. */ if (!(flag & O_CREAT)) { - error = path_lookup_open(dfd, pathname, lookup_flags(flag), - &nd, flag); + filp = get_empty_filp(); + + if (filp == NULL) + return ERR_PTR(-ENFILE); + nd.intent.open.file = filp; + filp->f_flags = open_flag; + nd.intent.open.flags = flag; + nd.intent.open.create_mode = 0; + error = do_path_lookup(dfd, pathname, + lookup_flags(flag)|LOOKUP_OPEN, &nd); + if (IS_ERR(nd.intent.open.file)) { + if (error == 0) { + error = PTR_ERR(nd.intent.open.file); + path_put(&nd.path); + } + } else if (error) + release_open_intent(&nd); if (error) return ERR_PTR(error); goto ok; @@ -1691,9 +1671,12 @@ struct file *do_filp_open(int dfd, const char *pathname, /* * Create - we need to know the parent. */ +reval: error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); if (error) return ERR_PTR(error); + if (force_reval) + nd.flags |= LOOKUP_REVAL; error = path_walk(pathname, &nd); if (error) { if (nd.root.mnt) @@ -1717,6 +1700,7 @@ struct file *do_filp_open(int dfd, const char *pathname, if (filp == NULL) goto exit_parent; nd.intent.open.file = filp; + filp->f_flags = open_flag; nd.intent.open.flags = flag; nd.intent.open.create_mode = mode; dir = nd.path.dentry; @@ -1757,14 +1741,17 @@ do_last: mnt_drop_write(nd.path.mnt); goto exit; } - filp = nameidata_to_filp(&nd, open_flag); - if (IS_ERR(filp)) - ima_counts_put(&nd.path, - acc_mode & (MAY_READ | MAY_WRITE | - MAY_EXEC)); + filp = nameidata_to_filp(&nd); mnt_drop_write(nd.path.mnt); if (nd.root.mnt) path_put(&nd.root); + if (!IS_ERR(filp)) { + error = ima_file_check(filp, acc_mode); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } return filp; } @@ -1792,7 +1779,7 @@ do_last: path_to_nameidata(&path, &nd); error = -EISDIR; - if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)) + if (S_ISDIR(path.dentry->d_inode->i_mode)) goto exit; ok: /* @@ -1805,28 +1792,44 @@ ok: * be avoided. Taking this mnt write here * ensures that (2) can not occur. */ - will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode); - if (will_write) { + will_truncate = open_will_truncate(flag, nd.path.dentry->d_inode); + if (will_truncate) { error = mnt_want_write(nd.path.mnt); if (error) goto exit; } error = may_open(&nd.path, acc_mode, flag); if (error) { - if (will_write) + if (will_truncate) mnt_drop_write(nd.path.mnt); goto exit; } - filp = nameidata_to_filp(&nd, open_flag); - if (IS_ERR(filp)) - ima_counts_put(&nd.path, - acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); + filp = nameidata_to_filp(&nd); + if (!IS_ERR(filp)) { + error = ima_file_check(filp, acc_mode); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + if (!IS_ERR(filp)) { + if (acc_mode & MAY_WRITE) + vfs_dq_init(nd.path.dentry->d_inode); + + if (will_truncate) { + error = handle_truncate(&nd.path); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + } /* * It is now safe to drop the mnt write * because the filp has had a write taken * on its behalf. */ - if (will_write) + if (will_truncate) mnt_drop_write(nd.path.mnt); if (nd.root.mnt) path_put(&nd.root); @@ -1864,6 +1867,7 @@ do_link: if (error) goto exit_dput; error = __do_follow_link(&path, &nd); + path_put(&path); if (error) { /* Does someone understand code flow here? Or it is only * me so stupid? Anathema to whoever designed this non-sense @@ -1872,6 +1876,10 @@ do_link: release_open_intent(&nd); if (nd.root.mnt) path_put(&nd.root); + if (error == -ESTALE && !force_reval) { + force_reval = 1; + goto reval; + } return ERR_PTR(error); } nd.flags &= ~LOOKUP_PARENT; diff --git a/fs/namespace.c b/fs/namespace.c index 7d70d63..c768f73 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -965,10 +965,12 @@ EXPORT_SYMBOL(may_umount_tree); int may_umount(struct vfsmount *mnt) { int ret = 1; + down_read(&namespace_sem); spin_lock(&vfsmount_lock); if (propagate_mount_busy(mnt, 2)) ret = 0; spin_unlock(&vfsmount_lock); + up_read(&namespace_sem); return ret; } @@ -1352,12 +1354,12 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, if (err) goto out_cleanup_ids; + spin_lock(&vfsmount_lock); + if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); } - - spin_lock(&vfsmount_lock); if (parent_path) { detach_mnt(source_mnt, parent_path); attach_mnt(source_mnt, path); @@ -1534,8 +1536,12 @@ static int do_remount(struct path *path, int flags, int mnt_flags, err = change_mount_flags(path->mnt, flags); else err = do_remount_sb(sb, flags, data, 0); - if (!err) + if (!err) { + spin_lock(&vfsmount_lock); + mnt_flags |= path->mnt->mnt_flags & MNT_PNODE_MASK; path->mnt->mnt_flags = mnt_flags; + spin_unlock(&vfsmount_lock); + } up_write(&sb->s_umount); if (!err) { security_sb_post_remount(path->mnt, flags, data); @@ -1665,6 +1671,8 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, { int err; + mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD); + down_write(&namespace_sem); /* Something was mounted here while we slept */ while (d_mountpoint(path->dentry) && diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 2a77bc2..59e5673 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -90,7 +90,7 @@ config ROOT_NFS If you want your system to mount its root file system via NFS, choose Y here. This is common practice for managing systems without local permanent storage. For details, read - <file:Documentation/filesystems/nfsroot.txt>. + <file:Documentation/filesystems/nfs/nfsroot.txt>. Most people say N here. diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2c5ace4..3c7f03b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1615,6 +1615,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; new_dentry = dentry; + rehash = NULL; new_inode = NULL; } } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e1d415e..0d28982 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -342,6 +342,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->res.fattr = &data->fattr; data->res.eof = 0; data->res.count = bytes; + nfs_fattr_init(&data->fattr); msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; @@ -575,6 +576,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); NFS_PROTO(data->inode)->commit_setup(data, &msg); @@ -766,6 +768,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->res.fattr = &data->fattr; data->res.count = bytes; data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); task_setup_data.task = &data->task; task_setup_data.callback_data = data; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6b89132..63f2071 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -486,6 +486,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp) { dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); + if (gfp & __GFP_WAIT) + nfs_wb_page(page->mapping->host, page); /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) return 0; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index fa58800..237874f 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -354,12 +354,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode) */ int nfs_fscache_release_page(struct page *page, gfp_t gfp) { - struct nfs_inode *nfsi = NFS_I(page->mapping->host); - struct fscache_cookie *cookie = nfsi->fscache; - - BUG_ON(!cookie); - if (PageFsCache(page)) { + struct nfs_inode *nfsi = NFS_I(page->mapping->host); + struct fscache_cookie *cookie = nfsi->fscache; + + BUG_ON(!cookie); dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", cookie, page, nfsi); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index faa0918..f141bde 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1261,8 +1261,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { + umode_t newmode = inode->i_mode & S_IFMT; + newmode |= fattr->mode & S_IALLUGO; + inode->i_mode = newmode; invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - inode->i_mode = fattr->mode; } } else if (server->caps & NFS_CAP_MODE) invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 0adefc4..59047f8 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -120,7 +120,7 @@ static struct { { .status = MNT3ERR_INVAL, .errno = -EINVAL, }, { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, }, { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, }, - { .status = MNT3ERR_SERVERFAULT, .errno = -ESERVERFAULT, }, + { .status = MNT3ERR_SERVERFAULT, .errno = -EREMOTEIO, }, }; struct mountres { diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 5e078b2..7bc2da8 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -699,7 +699,7 @@ static struct { { NFSERR_BAD_COOKIE, -EBADCOOKIE }, { NFSERR_NOTSUPP, -ENOTSUPP }, { NFSERR_TOOSMALL, -ETOOSMALL }, - { NFSERR_SERVERFAULT, -ESERVERFAULT }, + { NFSERR_SERVERFAULT, -EREMOTEIO }, { NFSERR_BADTYPE, -EBADTYPE }, { NFSERR_JUKEBOX, -EJUKEBOX }, { -1, -EIO } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 7e57b04..0c6fda3 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -108,6 +108,10 @@ enum { NFS_OWNER_RECLAIM_NOGRACE }; +#define NFS_LOCK_NEW 0 +#define NFS_LOCK_RECLAIM 1 +#define NFS_LOCK_EXPIRED 2 + /* * struct nfs4_state maintains the client-side state for a given * (state_owner,inode) tuple (OPEN) or state_owner (LOCK). @@ -142,6 +146,7 @@ enum { NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */ NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ + NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ }; struct nfs4_state { @@ -273,6 +278,7 @@ extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); extern void nfs4_schedule_state_recovery(struct nfs_client *); extern void nfs4_schedule_state_manager(struct nfs_client *); extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); +extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); @@ -282,6 +288,7 @@ extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); +extern void nfs_release_seqid(struct nfs_seqid *seqid); extern void nfs_free_seqid(struct nfs_seqid *seqid); extern const nfs4_stateid zero_stateid; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9f5f11e..375f0fa 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -64,6 +64,7 @@ struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); +static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); @@ -248,19 +249,15 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, if (state == NULL) break; nfs4_state_mark_reclaim_nograce(clp, state); - case -NFS4ERR_STALE_CLIENTID: + goto do_state_recovery; case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: - nfs4_schedule_state_recovery(clp); - ret = nfs4_wait_clnt_recover(clp); - if (ret == 0) - exception->retry = 1; -#if !defined(CONFIG_NFS_V4_1) - break; -#else /* !defined(CONFIG_NFS_V4_1) */ - if (!nfs4_has_session(server->nfs_client)) + if (state == NULL) break; - /* FALLTHROUGH */ + nfs4_state_mark_reclaim_reboot(clp, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_EXPIRED: + goto do_state_recovery; +#if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: @@ -273,7 +270,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, nfs4_schedule_state_recovery(clp); exception->retry = 1; break; -#endif /* !defined(CONFIG_NFS_V4_1) */ +#endif /* defined(CONFIG_NFS_V4_1) */ case -NFS4ERR_FILE_OPEN: if (exception->timeout > HZ) { /* We have retried a decent amount, time to @@ -292,6 +289,12 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, } /* We failed to handle the error */ return nfs4_map_errors(ret); +do_state_recovery: + nfs4_schedule_state_recovery(clp); + ret = nfs4_wait_clnt_recover(clp); + if (ret == 0) + exception->retry = 1; + return ret; } @@ -341,6 +344,27 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) free_slotid, tbl->highest_used_slotid); } +/* + * Signal state manager thread if session is drained + */ +static void nfs41_check_drain_session_complete(struct nfs4_session *ses) +{ + struct rpc_task *task; + + if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { + task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); + if (task) + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + return; + } + + if (ses->fc_slot_table.highest_used_slotid != -1) + return; + + dprintk("%s COMPLETE: Session Drained\n", __func__); + complete(&ses->complete); +} + static void nfs41_sequence_free_slot(const struct nfs_client *clp, struct nfs4_sequence_res *res) { @@ -356,15 +380,7 @@ static void nfs41_sequence_free_slot(const struct nfs_client *clp, spin_lock(&tbl->slot_tbl_lock); nfs4_free_slot(tbl, res->sr_slotid); - - /* Signal state manager thread if session is drained */ - if (test_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) { - if (tbl->highest_used_slotid == -1) { - dprintk("%s COMPLETE: Session Drained\n", __func__); - complete(&clp->cl_session->complete); - } - } else - rpc_wake_up_next(&tbl->slot_tbl_waitq); + nfs41_check_drain_session_complete(clp->cl_session); spin_unlock(&tbl->slot_tbl_lock); res->sr_slotid = NFS4_MAX_SLOT_TABLE; } @@ -421,7 +437,7 @@ out: * Note: must be called with under the slot_tbl_lock. */ static u8 -nfs4_find_slot(struct nfs4_slot_table *tbl, struct rpc_task *task) +nfs4_find_slot(struct nfs4_slot_table *tbl) { int slotid; u8 ret_id = NFS4_MAX_SLOT_TABLE; @@ -463,7 +479,8 @@ static int nfs41_setup_sequence(struct nfs4_session *session, tbl = &session->fc_slot_table; spin_lock(&tbl->slot_tbl_lock); - if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state)) { + if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && + !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { /* * The state manager will wait until the slot table is empty. * Schedule the reset thread @@ -474,7 +491,15 @@ static int nfs41_setup_sequence(struct nfs4_session *session, return -EAGAIN; } - slotid = nfs4_find_slot(tbl, task); + if (!rpc_queue_empty(&tbl->slot_tbl_waitq) && + !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + dprintk("%s enforce FIFO order\n", __func__); + return -EAGAIN; + } + + slotid = nfs4_find_slot(tbl); if (slotid == NFS4_MAX_SLOT_TABLE) { rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); spin_unlock(&tbl->slot_tbl_lock); @@ -483,6 +508,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session, } spin_unlock(&tbl->slot_tbl_lock); + rpc_task_set_priority(task, RPC_PRIORITY_NORMAL); slot = tbl->slots + slotid; args->sa_session = session; args->sa_slotid = slotid; @@ -545,6 +571,12 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) rpc_call_start(task); } +static void nfs41_call_priv_sync_prepare(struct rpc_task *task, void *calldata) +{ + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + nfs41_call_sync_prepare(task, calldata); +} + static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) { struct nfs41_call_sync_data *data = calldata; @@ -557,12 +589,18 @@ struct rpc_call_ops nfs41_call_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; +struct rpc_call_ops nfs41_call_priv_sync_ops = { + .rpc_call_prepare = nfs41_call_priv_sync_prepare, + .rpc_call_done = nfs41_call_sync_done, +}; + static int nfs4_call_sync_sequence(struct nfs_client *clp, struct rpc_clnt *clnt, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, - int cache_reply) + int cache_reply, + int privileged) { int ret; struct rpc_task *task; @@ -580,6 +618,8 @@ static int nfs4_call_sync_sequence(struct nfs_client *clp, }; res->sr_slotid = NFS4_MAX_SLOT_TABLE; + if (privileged) + task_setup.callback_ops = &nfs41_call_priv_sync_ops; task = rpc_run_task(&task_setup); if (IS_ERR(task)) ret = PTR_ERR(task); @@ -597,7 +637,7 @@ int _nfs4_call_sync_session(struct nfs_server *server, int cache_reply) { return nfs4_call_sync_sequence(server->nfs_client, server->client, - msg, args, res, cache_reply); + msg, args, res, cache_reply, 0); } #endif /* CONFIG_NFS_V4_1 */ @@ -1035,7 +1075,7 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod memset(&opendata->o_res, 0, sizeof(opendata->o_res)); memset(&opendata->c_res, 0, sizeof(opendata->c_res)); nfs4_init_opendata_res(opendata); - ret = _nfs4_proc_open(opendata); + ret = _nfs4_recover_proc_open(opendata); if (ret != 0) return ret; newstate = nfs4_opendata_to_nfs4_state(opendata); @@ -1326,6 +1366,12 @@ out_no_action: } +static void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata) +{ + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + nfs4_open_prepare(task, calldata); +} + static void nfs4_open_done(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; @@ -1384,10 +1430,13 @@ static const struct rpc_call_ops nfs4_open_ops = { .rpc_release = nfs4_open_release, }; -/* - * Note: On error, nfs4_proc_open will free the struct nfs4_opendata - */ -static int _nfs4_proc_open(struct nfs4_opendata *data) +static const struct rpc_call_ops nfs4_recover_open_ops = { + .rpc_call_prepare = nfs4_recover_open_prepare, + .rpc_call_done = nfs4_open_done, + .rpc_release = nfs4_open_release, +}; + +static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) { struct inode *dir = data->dir->d_inode; struct nfs_server *server = NFS_SERVER(dir); @@ -1414,21 +1463,57 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) data->rpc_done = 0; data->rpc_status = 0; data->cancelled = 0; + if (isrecover) + task_setup_data.callback_ops = &nfs4_recover_open_ops; task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - status = nfs4_wait_for_completion_rpc_task(task); - if (status != 0) { - data->cancelled = 1; - smp_wmb(); - } else - status = data->rpc_status; - rpc_put_task(task); + if (IS_ERR(task)) + return PTR_ERR(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) { + data->cancelled = 1; + smp_wmb(); + } else + status = data->rpc_status; + rpc_put_task(task); + + return status; +} + +static int _nfs4_recover_proc_open(struct nfs4_opendata *data) +{ + struct inode *dir = data->dir->d_inode; + struct nfs_openres *o_res = &data->o_res; + int status; + + status = nfs4_run_open_task(data, 1); if (status != 0 || !data->rpc_done) return status; - if (o_res->fh.size == 0) - _nfs4_proc_lookup(dir, o_arg->name, &o_res->fh, o_res->f_attr); + nfs_refresh_inode(dir, o_res->dir_attr); + + if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { + status = _nfs4_proc_open_confirm(data); + if (status != 0) + return status; + } + + return status; +} + +/* + * Note: On error, nfs4_proc_open will free the struct nfs4_opendata + */ +static int _nfs4_proc_open(struct nfs4_opendata *data) +{ + struct inode *dir = data->dir->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_openargs *o_arg = &data->o_arg; + struct nfs_openres *o_res = &data->o_res; + int status; + + status = nfs4_run_open_task(data, 0); + if (status != 0 || !data->rpc_done) + return status; if (o_arg->open_flags & O_CREAT) { update_changeattr(dir, &o_res->cinfo); @@ -1575,6 +1660,8 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in status = PTR_ERR(state); if (IS_ERR(state)) goto err_opendata_put; + if ((opendata->o_res.rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) != 0) + set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); nfs4_opendata_put(opendata); nfs4_put_state_owner(sp); *res = state; @@ -1752,11 +1839,10 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (calldata->arg.fmode == 0) break; default: - if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { - nfs_restart_rpc(task, server->nfs_client); - return; - } + if (nfs4_async_handle_error(task, server, state) == -EAGAIN) + rpc_restart_call_prepare(task); } + nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, calldata->res.fattr); } @@ -1848,8 +1934,6 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) calldata->state = state; calldata->arg.fh = NFS_FH(state->inode); calldata->arg.stateid = &state->open_stateid; - if (nfs4_has_session(server->nfs_client)) - memset(calldata->arg.stateid->data, 0, 4); /* clear seqid */ /* Serialization for the sequence id */ calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); if (calldata->arg.seqid == NULL) @@ -3342,15 +3426,14 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, if (state == NULL) break; nfs4_state_mark_reclaim_nograce(clp, state); - case -NFS4ERR_STALE_CLIENTID: + goto do_state_recovery; case -NFS4ERR_STALE_STATEID: + if (state == NULL) + break; + nfs4_state_mark_reclaim_reboot(clp, state); + case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: - rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); - nfs4_schedule_state_recovery(clp); - if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) - rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); - task->tk_status = 0; - return -EAGAIN; + goto do_state_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -3378,6 +3461,13 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, } task->tk_status = nfs4_map_errors(task->tk_status); return 0; +do_state_recovery: + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); + nfs4_schedule_state_recovery(clp); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) + rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); + task->tk_status = 0; + return -EAGAIN; } static int @@ -3941,6 +4031,12 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); } +static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata) +{ + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + nfs4_lock_prepare(task, calldata); +} + static void nfs4_lock_done(struct rpc_task *task, void *calldata) { struct nfs4_lockdata *data = calldata; @@ -3996,7 +4092,35 @@ static const struct rpc_call_ops nfs4_lock_ops = { .rpc_release = nfs4_lock_release, }; -static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int reclaim) +static const struct rpc_call_ops nfs4_recover_lock_ops = { + .rpc_call_prepare = nfs4_recover_lock_prepare, + .rpc_call_done = nfs4_lock_done, + .rpc_release = nfs4_lock_release, +}; + +static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state *state = lsp->ls_state; + + switch (error) { + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_EXPIRED: + if (new_lock_owner != 0 || + (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + nfs4_state_mark_reclaim_nograce(clp, state); + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + break; + case -NFS4ERR_STALE_STATEID: + if (new_lock_owner != 0 || + (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + nfs4_state_mark_reclaim_reboot(clp, state); + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + }; +} + +static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type) { struct nfs4_lockdata *data; struct rpc_task *task; @@ -4020,8 +4144,11 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f return -ENOMEM; if (IS_SETLKW(cmd)) data->arg.block = 1; - if (reclaim != 0) - data->arg.reclaim = 1; + if (recovery_type > NFS_LOCK_NEW) { + if (recovery_type == NFS_LOCK_RECLAIM) + data->arg.reclaim = NFS_LOCK_RECLAIM; + task_setup_data.callback_ops = &nfs4_recover_lock_ops; + } msg.rpc_argp = &data->arg, msg.rpc_resp = &data->res, task_setup_data.callback_data = data; @@ -4031,6 +4158,9 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f ret = nfs4_wait_for_completion_rpc_task(task); if (ret == 0) { ret = data->rpc_status; + if (ret) + nfs4_handle_setlk_error(data->server, data->lsp, + data->arg.new_lock_owner, ret); } else data->cancelled = 1; rpc_put_task(task); @@ -4048,7 +4178,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request /* Cache the lock if possible... */ if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; - err = _nfs4_do_setlk(state, F_SETLK, request, 1); + err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -4068,7 +4198,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request do { if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; - err = _nfs4_do_setlk(state, F_SETLK, request, 0); + err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED); switch (err) { default: goto out; @@ -4086,8 +4216,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock { struct nfs_inode *nfsi = NFS_I(state->inode); unsigned char fl_flags = request->fl_flags; - int status; + int status = -ENOLCK; + if ((fl_flags & FL_POSIX) && + !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) + goto out; /* Is this a delegated open? */ status = nfs4_set_lock_state(state, request); if (status != 0) @@ -4104,7 +4237,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock status = do_vfs_lock(request->fl_file, request); goto out_unlock; } - status = _nfs4_do_setlk(state, cmd, request, 0); + status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); if (status != 0) goto out_unlock; /* Note: we always want to sleep here! */ @@ -4187,7 +4320,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) if (err != 0) goto out; do { - err = _nfs4_do_setlk(state, F_SETLK, fl, 0); + err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); switch (err) { default: printk(KERN_ERR "%s: unhandled error %d.\n", @@ -4395,11 +4528,12 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task, (struct nfs4_get_lease_time_data *)calldata; dprintk("--> %s\n", __func__); + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); /* just setup sequence, do not trigger session recovery since we're invoked within one */ ret = nfs41_setup_sequence(data->clp->cl_session, - &data->args->la_seq_args, - &data->res->lr_seq_res, 0, task); + &data->args->la_seq_args, + &data->res->lr_seq_res, 0, task); BUG_ON(ret == -EAGAIN); rpc_call_start(task); @@ -4619,7 +4753,7 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) tbl = &session->fc_slot_table; tbl->highest_used_slotid = -1; spin_lock_init(&tbl->slot_tbl_lock); - rpc_init_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); + rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); tbl = &session->bc_slot_table; tbl->highest_used_slotid = -1; @@ -4838,14 +4972,22 @@ int nfs4_init_session(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs4_session *session; + unsigned int rsize, wsize; int ret; if (!nfs4_has_session(clp)) return 0; + rsize = server->rsize; + if (rsize == 0) + rsize = NFS_MAX_FILE_IO_SIZE; + wsize = server->wsize; + if (wsize == 0) + wsize = NFS_MAX_FILE_IO_SIZE; + session = clp->cl_session; - session->fc_attrs.max_rqst_sz = server->wsize + nfs41_maxwrite_overhead; - session->fc_attrs.max_resp_sz = server->rsize + nfs41_maxread_overhead; + session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; + session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; ret = nfs4_recover_expired_lease(server); if (!ret) @@ -4871,7 +5013,7 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) args.sa_cache_this = 0; return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args, - &res, 0); + &res, args.sa_cache_this, 1); } void nfs41_sequence_call_done(struct rpc_task *task, void *data) @@ -4953,6 +5095,7 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) { struct nfs4_reclaim_complete_data *calldata = data; + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, &calldata->res.seq_res, 0, task)) return; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e76427e..c1e2733 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -135,16 +135,30 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp) return status; } -static void nfs41_end_drain_session(struct nfs_client *clp, - struct nfs4_session *ses) +static void nfs4_end_drain_session(struct nfs_client *clp) { - if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) - rpc_wake_up(&ses->fc_slot_table.slot_tbl_waitq); + struct nfs4_session *ses = clp->cl_session; + int max_slots; + + if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) { + spin_lock(&ses->fc_slot_table.slot_tbl_lock); + max_slots = ses->fc_slot_table.max_slots; + while (max_slots--) { + struct rpc_task *task; + + task = rpc_wake_up_next(&ses->fc_slot_table. + slot_tbl_waitq); + if (!task) + break; + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); + } + spin_unlock(&ses->fc_slot_table.slot_tbl_lock); + } } -static int nfs41_begin_drain_session(struct nfs_client *clp, - struct nfs4_session *ses) +static int nfs4_begin_drain_session(struct nfs_client *clp) { + struct nfs4_session *ses = clp->cl_session; struct nfs4_slot_table *tbl = &ses->fc_slot_table; spin_lock(&tbl->slot_tbl_lock); @@ -162,16 +176,13 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) { int status; - status = nfs41_begin_drain_session(clp, clp->cl_session); - if (status != 0) - goto out; + nfs4_begin_drain_session(clp); status = nfs4_proc_exchange_id(clp, cred); if (status != 0) goto out; status = nfs4_proc_create_session(clp); if (status != 0) goto out; - nfs41_end_drain_session(clp, clp->cl_session); nfs41_setup_state_renewal(clp); nfs_mark_client_ready(clp, NFS_CS_READY); out: @@ -755,16 +766,21 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) return new; } -void nfs_free_seqid(struct nfs_seqid *seqid) +void nfs_release_seqid(struct nfs_seqid *seqid) { if (!list_empty(&seqid->list)) { struct rpc_sequence *sequence = seqid->sequence->sequence; spin_lock(&sequence->lock); - list_del(&seqid->list); + list_del_init(&seqid->list); spin_unlock(&sequence->lock); rpc_wake_up(&sequence->wait); } +} + +void nfs_free_seqid(struct nfs_seqid *seqid) +{ + nfs_release_seqid(seqid); kfree(seqid); } @@ -885,7 +901,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp) nfs4_schedule_state_manager(clp); } -static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) +int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) { set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); @@ -1257,13 +1273,9 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) static int nfs4_reset_session(struct nfs_client *clp) { - struct nfs4_session *ses = clp->cl_session; int status; - status = nfs41_begin_drain_session(clp, ses); - if (status != 0) - return status; - + nfs4_begin_drain_session(clp); status = nfs4_proc_destroy_session(clp->cl_session); if (status && status != -NFS4ERR_BADSESSION && status != -NFS4ERR_DEADSESSION) { @@ -1279,19 +1291,17 @@ static int nfs4_reset_session(struct nfs_client *clp) out: /* * Let the state manager reestablish state - * without waking other tasks yet. */ - if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) { - /* Wake up the next rpc task */ - nfs41_end_drain_session(clp, ses); - if (status == 0) - nfs41_setup_state_renewal(clp); - } + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && + status == 0) + nfs41_setup_state_renewal(clp); + return status; } #else /* CONFIG_NFS_V4_1 */ static int nfs4_reset_session(struct nfs_client *clp) { return 0; } +static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } #endif /* CONFIG_NFS_V4_1 */ /* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors @@ -1382,6 +1392,7 @@ static void nfs4_state_manager(struct nfs_client *clp) goto out_error; } + nfs4_end_drain_session(clp); if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { nfs_client_return_marked_delegations(clp); continue; @@ -1398,6 +1409,7 @@ static void nfs4_state_manager(struct nfs_client *clp) out_error: printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s" " with error %d\n", clp->cl_hostname, -status); + nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e437fd6..5cd5184 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4631,7 +4631,7 @@ static int decode_sequence(struct xdr_stream *xdr, * If the server returns different values for sessionID, slotID or * sequence number, the server is looney tunes. */ - status = -ESERVERFAULT; + status = -EREMOTEIO; if (memcmp(id.data, res->sr_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { @@ -5774,7 +5774,7 @@ static struct { { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, { NFS4ERR_NOTSUPP, -ENOTSUPP }, { NFS4ERR_TOOSMALL, -ETOOSMALL }, - { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, + { NFS4ERR_SERVERFAULT, -EREMOTEIO }, { NFS4ERR_BADTYPE, -EBADTYPE }, { NFS4ERR_LOCKED, -EAGAIN }, { NFS4ERR_SYMLINK, -ELOOP }, @@ -5801,7 +5801,7 @@ nfs4_stat_to_errno(int stat) } if (stat <= 10000 || stat > 10100) { /* The server is looney tunes. */ - return -ESERVERFAULT; + return -EREMOTEIO; } /* If we cannot translate the error, the recovery routines should * handle it. diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index e297593..a12c45b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -176,6 +176,12 @@ void nfs_release_request(struct nfs_page *req) kref_put(&req->wb_kref, nfs_free_request); } +static int nfs_wait_bit_uninterruptible(void *word) +{ + io_schedule(); + return 0; +} + /** * nfs_wait_on_request - Wait for a request to complete. * @req: request to wait upon. @@ -186,14 +192,9 @@ void nfs_release_request(struct nfs_page *req) int nfs_wait_on_request(struct nfs_page *req) { - int ret = 0; - - if (!test_bit(PG_BUSY, &req->wb_flags)) - goto out; - ret = out_of_line_wait_on_bit(&req->wb_flags, PG_BUSY, - nfs_wait_bit_killable, TASK_KILLABLE); -out: - return ret; + return wait_on_bit(&req->wb_flags, PG_BUSY, + nfs_wait_bit_uninterruptible, + TASK_UNINTERRUPTIBLE); } /** diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ce907ef..f1afee4 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -243,6 +243,7 @@ static int nfs_show_stats(struct seq_file *, struct vfsmount *); static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static void nfs_put_super(struct super_block *); static void nfs_kill_super(struct super_block *); static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); @@ -266,6 +267,7 @@ static const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, + .put_super = nfs_put_super, .statfs = nfs_statfs, .clear_inode = nfs_clear_inode, .umount_begin = nfs_umount_begin, @@ -335,6 +337,7 @@ static const struct super_operations nfs4_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, + .put_super = nfs_put_super, .statfs = nfs_statfs, .clear_inode = nfs4_clear_inode, .umount_begin = nfs_umount_begin, @@ -2258,6 +2261,17 @@ error_splat_super: } /* + * Ensure that we unregister the bdi before kill_anon_super + * releases the device name + */ +static void nfs_put_super(struct super_block *s) +{ + struct nfs_server *server = NFS_SB(s); + + bdi_unregister(&server->backing_dev_info); +} + +/* * Destroy an NFS2/3 superblock */ static void nfs_kill_super(struct super_block *s) @@ -2265,7 +2279,6 @@ static void nfs_kill_super(struct super_block *s) struct nfs_server *server = NFS_SB(s); kill_anon_super(s); - bdi_unregister(&server->backing_dev_info); nfs_fscache_release_super_cookie(s); nfs_free_server(server); } diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index 70e1fbb..ad4d2e7 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -15,8 +15,10 @@ #include "callback.h" +#ifdef CONFIG_NFS_V4 static const int nfs_set_port_min = 0; static const int nfs_set_port_max = 65535; +#endif static struct ctl_table_header *nfs_callback_sysctl_table; static ctl_table nfs_cb_sysctls[] = { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d171696..d63d964 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1233,7 +1233,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -void nfs_commitdata_release(void *data) +static void nfs_commitdata_release(void *data) { struct nfs_write_data *wdata = data; @@ -1541,6 +1541,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) break; } ret = nfs_wait_on_request(req); + nfs_release_request(req); if (ret < 0) goto out; } @@ -1597,8 +1598,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, struct nfs_page *req; int ret; - if (PageFsCache(page)) - nfs_fscache_release_page(page, GFP_KERNEL); + nfs_fscache_release_page(page, GFP_KERNEL); req = nfs_find_and_lock_request(page); ret = PTR_ERR(req); diff --git a/fs/nfsctl.c b/fs/nfsctl.c index 8f9a205..d3854d9 100644 --- a/fs/nfsctl.c +++ b/fs/nfsctl.c @@ -7,8 +7,6 @@ #include <linux/types.h> #include <linux/file.h> #include <linux/fs.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> #include <linux/nfsd/syscall.h> #include <linux/cred.h> #include <linux/sched.h> diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 36fcabbf..79717a4 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -1,15 +1,7 @@ -/* - * linux/fs/nfsd/auth.c - * - * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> - */ +/* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/types.h> #include <linux/sched.h> -#include <linux/sunrpc/svc.h> -#include <linux/sunrpc/svcauth.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/export.h> +#include "nfsd.h" #include "auth.h" int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h new file mode 100644 index 0000000..d892be6 --- /dev/null +++ b/fs/nfsd/cache.h @@ -0,0 +1,83 @@ +/* + * Request reply cache. This was heavily inspired by the + * implementation in 4.3BSD/4.4BSD. + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + */ + +#ifndef NFSCACHE_H +#define NFSCACHE_H + +#include <linux/sunrpc/svc.h> + +/* + * Representation of a reply cache entry. + */ +struct svc_cacherep { + struct hlist_node c_hash; + struct list_head c_lru; + + unsigned char c_state, /* unused, inprog, done */ + c_type, /* status, buffer */ + c_secure : 1; /* req came from port < 1024 */ + struct sockaddr_in c_addr; + __be32 c_xid; + u32 c_prot; + u32 c_proc; + u32 c_vers; + unsigned long c_timestamp; + union { + struct kvec u_vec; + __be32 u_status; + } c_u; +}; + +#define c_replvec c_u.u_vec +#define c_replstat c_u.u_status + +/* cache entry states */ +enum { + RC_UNUSED, + RC_INPROG, + RC_DONE +}; + +/* return values */ +enum { + RC_DROPIT, + RC_REPLY, + RC_DOIT, + RC_INTR +}; + +/* + * Cache types. + * We may want to add more types one day, e.g. for diropres and + * attrstat replies. Using cache entries with fixed length instead + * of buffer pointers may be more efficient. + */ +enum { + RC_NOCACHE, + RC_REPLSTAT, + RC_REPLBUFF, +}; + +/* + * If requests are retransmitted within this interval, they're dropped. + */ +#define RC_DELAY (HZ/5) + +int nfsd_reply_cache_init(void); +void nfsd_reply_cache_shutdown(void); +int nfsd_cache_lookup(struct svc_rqst *, int); +void nfsd_cache_update(struct svc_rqst *, int, __be32 *); + +#ifdef CONFIG_NFSD_V4 +void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp); +#else /* CONFIG_NFSD_V4 */ +static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) +{ +} +#endif /* CONFIG_NFSD_V4 */ + +#endif /* NFSCACHE_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c1c9e03..a0c4016 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1,7 +1,5 @@ #define MSNFS /* HACK HACK */ /* - * linux/fs/nfsd/export.c - * * NFS exporting and validation. * * We maintain a list of clients, each of which has a list of @@ -14,29 +12,16 @@ * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de> */ -#include <linux/unistd.h> -#include <linux/slab.h> -#include <linux/stat.h> -#include <linux/in.h> -#include <linux/seq_file.h> -#include <linux/syscalls.h> -#include <linux/rwsem.h> -#include <linux/dcache.h> #include <linux/namei.h> -#include <linux/mount.h> -#include <linux/hash.h> #include <linux/module.h> #include <linux/exportfs.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/nfsfh.h> #include <linux/nfsd/syscall.h> -#include <linux/lockd/bind.h> -#include <linux/sunrpc/msg_prot.h> -#include <linux/sunrpc/gss_api.h> #include <net/ipv6.h> +#include "nfsd.h" +#include "nfsfh.h" + #define NFSDDBG_FACILITY NFSDDBG_EXPORT typedef struct auth_domain svc_client; @@ -369,16 +354,25 @@ static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); -static int check_export(struct inode *inode, int flags, unsigned char *uuid) +static int check_export(struct inode *inode, int *flags, unsigned char *uuid) { - /* We currently export only dirs and regular files. - * This is what umountd does. + /* + * We currently export only dirs, regular files, and (for v4 + * pseudoroot) symlinks. */ if (!S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode) && !S_ISREG(inode->i_mode)) return -ENOTDIR; + /* + * Mountd should never pass down a writeable V4ROOT export, but, + * just to make sure: + */ + if (*flags & NFSEXP_V4ROOT) + *flags |= NFSEXP_READONLY; + /* There are two requirements on a filesystem to be exportable. * 1: We must be able to identify the filesystem from a number. * either a device number (so FS_REQUIRES_DEV needed) @@ -387,7 +381,7 @@ static int check_export(struct inode *inode, int flags, unsigned char *uuid) * This means that s_export_op must be set. */ if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && - !(flags & NFSEXP_FSID) && + !(*flags & NFSEXP_FSID) && uuid == NULL) { dprintk("exp_export: export of non-dev fs without fsid\n"); return -EINVAL; @@ -602,7 +596,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) goto out4; } - err = check_export(exp.ex_path.dentry->d_inode, exp.ex_flags, + err = check_export(exp.ex_path.dentry->d_inode, &exp.ex_flags, exp.ex_uuid); if (err) goto out4; @@ -1041,7 +1035,7 @@ exp_export(struct nfsctl_export *nxp) goto finish; } - err = check_export(path.dentry->d_inode, nxp->ex_flags, NULL); + err = check_export(path.dentry->d_inode, &nxp->ex_flags, NULL); if (err) goto finish; err = -ENOMEM; @@ -1320,6 +1314,15 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) return exp; } +static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp) +{ + u32 fsidv[2]; + + mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); + + return rqst_exp_find(rqstp, FSID_NUM, fsidv); +} + /* * Called when we need the filehandle for the root of the pseudofs, * for a given NFSv4 client. The root is defined to be the @@ -1330,11 +1333,8 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) { struct svc_export *exp; __be32 rv; - u32 fsidv[2]; - - mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); - exp = rqst_exp_find(rqstp, FSID_NUM, fsidv); + exp = find_fsidzero_export(rqstp); if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); @@ -1425,6 +1425,7 @@ static struct flags { { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, + { NFSEXP_V4ROOT, {"v4root", ""}}, #ifdef MSNFS { NFSEXP_MSNFS, {"msnfs", ""}}, #endif diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index b2786a5..0c6d816 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -1,6 +1,4 @@ /* - * linux/fs/nfsd/lockd.c - * * This file contains all the stubs needed when communicating with lockd. * This level of indirection is necessary so we can run nfsd+lockd without * requiring the nfs client to be compiled in/loaded, and vice versa. @@ -8,14 +6,10 @@ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> */ -#include <linux/types.h> -#include <linux/fs.h> #include <linux/file.h> -#include <linux/mount.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> #include <linux/lockd/bind.h> +#include "nfsd.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_LOCKD diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 4e3219e..f20589d 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -1,19 +1,15 @@ /* - * linux/fs/nfsd/nfs2acl.c - * * Process version 2 NFSACL requests. * * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de> */ -#include <linux/sunrpc/svc.h> -#include <linux/nfs.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> -#include <linux/nfsd/xdr.h> -#include <linux/nfsd/xdr3.h> -#include <linux/posix_acl.h> +#include "nfsd.h" +/* FIXME: nfsacl.h is a broken header */ #include <linux/nfsacl.h> +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC #define RETURN_STATUS(st) { resp->status = (st); return (st); } @@ -217,6 +213,16 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p, * XDR encode functions */ +/* + * There must be an encoding function for void results so svc_process + * will work properly. + */ +int +nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy) +{ + return xdr_ressize_check(rqstp, p); +} + /* GETACL */ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_getaclres *resp) @@ -308,7 +314,6 @@ static int nfsaclsvc_release_access(struct svc_rqst *rqstp, __be32 *p, } #define nfsaclsvc_decode_voidargs NULL -#define nfsaclsvc_encode_voidres NULL #define nfsaclsvc_release_void NULL #define nfsd3_fhandleargs nfsd_fhandle #define nfsd3_attrstatres nfsd_attrstat @@ -346,5 +351,5 @@ struct svc_version nfsd_acl_version2 = { .vs_proc = nfsd_acl_procedures2, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS3_SVC_XDRSIZE, - .vs_hidden = 1, + .vs_hidden = 0, }; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 9981dbb..e0c4846 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -1,18 +1,15 @@ /* - * linux/fs/nfsd/nfs3acl.c - * * Process version 3 NFSACL requests. * * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de> */ -#include <linux/sunrpc/svc.h> -#include <linux/nfs3.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> -#include <linux/nfsd/xdr3.h> -#include <linux/posix_acl.h> +#include "nfsd.h" +/* FIXME: nfsacl.h is a broken header */ #include <linux/nfsacl.h> +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" #define RETURN_STATUS(st) { resp->status = (st); return (st); } @@ -264,6 +261,6 @@ struct svc_version nfsd_acl_version3 = { .vs_proc = nfsd_acl_procedures3, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS3_SVC_XDRSIZE, - .vs_hidden = 1, + .vs_hidden = 0, }; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index a713c41..3d68f45 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -1,30 +1,16 @@ /* - * linux/fs/nfsd/nfs3proc.c - * * Process version 3 NFS requests. * * Copyright (C) 1996, 1997, 1998 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/linkage.h> -#include <linux/time.h> -#include <linux/errno.h> #include <linux/fs.h> #include <linux/ext2_fs.h> -#include <linux/stat.h> -#include <linux/fcntl.h> -#include <linux/net.h> -#include <linux/in.h> -#include <linux/unistd.h> -#include <linux/slab.h> -#include <linux/major.h> #include <linux/magic.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> -#include <linux/nfsd/xdr3.h> -#include <linux/nfs3.h> +#include "cache.h" +#include "xdr3.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index d0a2ce1..2a533a0 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -1,6 +1,4 @@ /* - * linux/fs/nfsd/nfs3xdr.c - * * XDR support for nfsd/protocol version 3. * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> @@ -8,19 +6,8 @@ * 2003-08-09 Jamie Lokier: Use htonl() for nanoseconds, not htons()! */ -#include <linux/types.h> -#include <linux/time.h> -#include <linux/nfs3.h> -#include <linux/list.h> -#include <linux/spinlock.h> -#include <linux/dcache.h> #include <linux/namei.h> -#include <linux/mm.h> -#include <linux/vfs.h> -#include <linux/sunrpc/xdr.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/xdr3.h> +#include "xdr3.h" #include "auth.h" #define NFSDDBG_FACILITY NFSDDBG_XDR diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 725d02f..8815068 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -1,6 +1,4 @@ /* - * fs/nfs4acl/acl.c - * * Common NFSv4 ACL handling code. * * Copyright (c) 2002, 2003 The Regents of the University of Michigan. @@ -36,15 +34,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/list.h> -#include <linux/types.h> -#include <linux/fs.h> -#include <linux/module.h> #include <linux/nfs_fs.h> -#include <linux/posix_acl.h> -#include <linux/nfs4.h> #include <linux/nfs4_acl.h> @@ -389,7 +379,7 @@ sort_pacl(struct posix_acl *pacl) sort_pacl_range(pacl, 1, i-1); BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ); - j = i++; + j = ++i; while (pacl->a_entries[j].e_tag == ACL_GROUP) j++; sort_pacl_range(pacl, i, j-1); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 24e8d78..c6eed2a 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1,6 +1,4 @@ /* - * linux/fs/nfsd/nfs4callback.c - * * Copyright (c) 2001 The Regents of the University of Michigan. * All rights reserved. * @@ -33,22 +31,9 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include <linux/module.h> -#include <linux/list.h> -#include <linux/inet.h> -#include <linux/errno.h> -#include <linux/delay.h> -#include <linux/sched.h> -#include <linux/kthread.h> -#include <linux/sunrpc/xdr.h> -#include <linux/sunrpc/svc.h> #include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/svcsock.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/state.h> -#include <linux/sunrpc/sched.h> -#include <linux/nfs4.h> -#include <linux/sunrpc/xprtsock.h> +#include "nfsd.h" +#include "state.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index ba2c199..6e2983b 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -1,6 +1,4 @@ /* - * fs/nfsd/nfs4idmap.c - * * Mapping of UID/GIDs to name and vice versa. * * Copyright (c) 2002, 2003 The Regents of the University of @@ -35,22 +33,9 @@ */ #include <linux/module.h> -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/sunrpc/clnt.h> -#include <linux/nfs.h> -#include <linux/nfs4.h> -#include <linux/nfs_fs.h> -#include <linux/nfs_page.h> -#include <linux/sunrpc/cache.h> #include <linux/nfsd_idmap.h> -#include <linux/list.h> -#include <linux/time.h> #include <linux/seq_file.h> -#include <linux/sunrpc/svcauth.h> +#include <linux/sched.h> /* * Cache entry diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index bebc0c2..37514c4 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1,6 +1,4 @@ /* - * fs/nfsd/nfs4proc.c - * * Server-side procedures for NFSv4. * * Copyright (c) 2002 The Regents of the University of Michigan. @@ -34,20 +32,11 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ - -#include <linux/param.h> -#include <linux/major.h> -#include <linux/slab.h> #include <linux/file.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> -#include <linux/nfs4.h> -#include <linux/nfsd/state.h> -#include <linux/nfsd/xdr4.h> -#include <linux/nfs4_acl.h> -#include <linux/sunrpc/gss_api.h> +#include "cache.h" +#include "xdr4.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -170,7 +159,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs accmode |= NFSD_MAY_READ; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC); - if (open->op_share_deny & NFS4_SHARE_DENY_WRITE) + if (open->op_share_deny & NFS4_SHARE_DENY_READ) accmode |= NFSD_MAY_WRITE; status = fh_verify(rqstp, current_fh, S_IFREG, accmode); diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index b534840..5a754f7 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -1,6 +1,4 @@ /* -* linux/fs/nfsd/nfs4recover.c -* * Copyright (c) 2004 The Regents of the University of Michigan. * All rights reserved. * @@ -33,20 +31,14 @@ * */ -#include <linux/err.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfs4.h> -#include <linux/nfsd/state.h> -#include <linux/nfsd/xdr4.h> -#include <linux/param.h> #include <linux/file.h> #include <linux/namei.h> -#include <asm/uaccess.h> -#include <linux/scatterlist.h> #include <linux/crypto.h> #include <linux/sched.h> -#include <linux/mount.h> + +#include "nfsd.h" +#include "state.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2153f9bd..f19ed86 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1,6 +1,4 @@ /* -* linux/fs/nfsd/nfs4state.c -* * Copyright (c) 2001 The Regents of the University of Michigan. * All rights reserved. * @@ -34,28 +32,14 @@ * */ -#include <linux/param.h> -#include <linux/major.h> -#include <linux/slab.h> - -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> #include <linux/file.h> -#include <linux/mount.h> -#include <linux/workqueue.h> #include <linux/smp_lock.h> -#include <linux/kthread.h> -#include <linux/nfs4.h> -#include <linux/nfsd/state.h> -#include <linux/nfsd/xdr4.h> #include <linux/namei.h> #include <linux/swap.h> -#include <linux/mutex.h> -#include <linux/lockd/bind.h> -#include <linux/module.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/clnt.h> +#include "xdr4.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -477,13 +461,14 @@ static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan) /* * fchan holds the client values on input, and the server values on output + * sv_max_mesg is the maximum payload plus one page for overhead. */ static int init_forechannel_attrs(struct svc_rqst *rqstp, struct nfsd4_channel_attrs *session_fchan, struct nfsd4_channel_attrs *fchan) { int status = 0; - __u32 maxcount = svc_max_payload(rqstp); + __u32 maxcount = nfsd_serv->sv_max_mesg; /* headerpadsz set to zero in encode routine */ @@ -523,6 +508,15 @@ free_session_slots(struct nfsd4_session *ses) kfree(ses->se_slots[i]); } +/* + * We don't actually need to cache the rpc and session headers, so we + * can allocate a little less for each slot: + */ +static inline int slot_bytes(struct nfsd4_channel_attrs *ca) +{ + return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; +} + static int alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_create_session *cses) @@ -554,7 +548,7 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, memcpy(new, &tmp, sizeof(*new)); /* allocate each struct nfsd4_slot and data cache in one piece */ - cachesize = new->se_fchannel.maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; + cachesize = slot_bytes(&new->se_fchannel); for (i = 0; i < new->se_fchannel.maxreqs; i++) { sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL); if (!sp) @@ -628,10 +622,12 @@ void free_session(struct kref *kref) { struct nfsd4_session *ses; + int mem; ses = container_of(kref, struct nfsd4_session, se_ref); spin_lock(&nfsd_drc_lock); - nfsd_drc_mem_used -= ses->se_fchannel.maxreqs * NFSD_SLOT_CACHE_SIZE; + mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel); + nfsd_drc_mem_used -= mem; spin_unlock(&nfsd_drc_lock); free_session_slots(ses); kfree(ses); @@ -2404,11 +2400,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); - dprintk("NFSD: delegation stateid=(%08x/%08x/%08x/%08x)\n\n", - dp->dl_stateid.si_boot, - dp->dl_stateid.si_stateownerid, - dp->dl_stateid.si_fileid, - dp->dl_stateid.si_generation); + dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", + STATEID_VAL(&dp->dl_stateid)); out: if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && flag == NFS4_OPEN_DELEGATE_NONE @@ -2498,9 +2491,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs_ok; - dprintk("nfs4_process_open2: stateid=(%08x/%08x/%08x/%08x)\n", - stp->st_stateid.si_boot, stp->st_stateid.si_stateownerid, - stp->st_stateid.si_fileid, stp->st_stateid.si_generation); + dprintk("%s: stateid=" STATEID_FMT "\n", __func__, + STATEID_VAL(&stp->st_stateid)); out: if (fp) put_nfs4_file(fp); @@ -2666,9 +2658,8 @@ STALE_STATEID(stateid_t *stateid) { if (time_after((unsigned long)boot_time, (unsigned long)stateid->si_boot)) { - dprintk("NFSD: stale stateid (%08x/%08x/%08x/%08x)!\n", - stateid->si_boot, stateid->si_stateownerid, - stateid->si_fileid, stateid->si_generation); + dprintk("NFSD: stale stateid " STATEID_FMT "!\n", + STATEID_VAL(stateid)); return 1; } return 0; @@ -2680,9 +2671,8 @@ EXPIRED_STATEID(stateid_t *stateid) if (time_before((unsigned long)boot_time, ((unsigned long)stateid->si_boot)) && time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) { - dprintk("NFSD: expired stateid (%08x/%08x/%08x/%08x)!\n", - stateid->si_boot, stateid->si_stateownerid, - stateid->si_fileid, stateid->si_generation); + dprintk("NFSD: expired stateid " STATEID_FMT "!\n", + STATEID_VAL(stateid)); return 1; } return 0; @@ -2696,9 +2686,8 @@ stateid_error_map(stateid_t *stateid) if (EXPIRED_STATEID(stateid)) return nfserr_expired; - dprintk("NFSD: bad stateid (%08x/%08x/%08x/%08x)!\n", - stateid->si_boot, stateid->si_stateownerid, - stateid->si_fileid, stateid->si_generation); + dprintk("NFSD: bad stateid " STATEID_FMT "!\n", + STATEID_VAL(stateid)); return nfserr_bad_stateid; } @@ -2884,10 +2873,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, struct svc_fh *current_fh = &cstate->current_fh; __be32 status; - dprintk("NFSD: preprocess_seqid_op: seqid=%d " - "stateid = (%08x/%08x/%08x/%08x)\n", seqid, - stateid->si_boot, stateid->si_stateownerid, stateid->si_fileid, - stateid->si_generation); + dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, + seqid, STATEID_VAL(stateid)); *stpp = NULL; *sopp = NULL; @@ -3019,12 +3006,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, sop->so_confirmed = 1; update_stateid(&stp->st_stateid); memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); - dprintk("NFSD: nfsd4_open_confirm: success, seqid=%d " - "stateid=(%08x/%08x/%08x/%08x)\n", oc->oc_seqid, - stp->st_stateid.si_boot, - stp->st_stateid.si_stateownerid, - stp->st_stateid.si_fileid, - stp->st_stateid.si_generation); + dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", + __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); nfsd4_create_clid_dir(sop->so_client); out: @@ -3283,9 +3266,8 @@ find_delegation_stateid(struct inode *ino, stateid_t *stid) struct nfs4_file *fp; struct nfs4_delegation *dl; - dprintk("NFSD:find_delegation_stateid stateid=(%08x/%08x/%08x/%08x)\n", - stid->si_boot, stid->si_stateownerid, - stid->si_fileid, stid->si_generation); + dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__, + STATEID_VAL(stid)); fp = find_file(ino); if (!fp) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 0fbd50c..a8587e9 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -40,24 +40,16 @@ * at the end of nfs4svc_decode_compoundargs. */ -#include <linux/param.h> -#include <linux/smp.h> -#include <linux/fs.h> #include <linux/namei.h> -#include <linux/vfs.h> +#include <linux/statfs.h> #include <linux/utsname.h> -#include <linux/sunrpc/xdr.h> -#include <linux/sunrpc/svc.h> -#include <linux/sunrpc/clnt.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/state.h> -#include <linux/nfsd/xdr4.h> #include <linux/nfsd_idmap.h> -#include <linux/nfs4.h> #include <linux/nfs4_acl.h> -#include <linux/sunrpc/gss_api.h> #include <linux/sunrpc/svcauth_gss.h> +#include "xdr4.h" +#include "vfs.h" + #define NFSDDBG_FACILITY NFSDDBG_XDR /* @@ -2204,11 +2196,14 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, * we will not follow the cross mount and will fill the attribtutes * directly from the mountpoint dentry. */ - if (d_mountpoint(dentry) && !attributes_need_mount(cd->rd_bmval)) - ignore_crossmnt = 1; - else if (d_mountpoint(dentry)) { + if (nfsd_mountpoint(dentry, exp)) { int err; + if (!(exp->ex_flags & NFSEXP_V4ROOT) + && !attributes_need_mount(cd->rd_bmval)) { + ignore_crossmnt = 1; + goto out_encode; + } /* * Why the heck aren't we just using nfsd_lookup?? * Different "."/".." handling? Something else? @@ -2224,6 +2219,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, goto out_put; } +out_encode: nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, cd->rd_rqstp, ignore_crossmnt); out_put: diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 4638635..da08560 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -1,6 +1,4 @@ /* - * linux/fs/nfsd/nfscache.c - * * Request reply cache. This is currently a global cache, but this may * change in the future and be a per-client cache. * @@ -10,16 +8,8 @@ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/kernel.h> -#include <linux/time.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/spinlock.h> -#include <linux/list.h> - -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> +#include "nfsd.h" +#include "cache.h" /* Size of reply cache. Common values are: * 4.3BSD: 128 diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 5c01fc1..2604c3e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1,46 +1,20 @@ /* - * linux/fs/nfsd/nfsctl.c - * * Syscall interface to knfsd. * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/module.h> - -#include <linux/linkage.h> -#include <linux/time.h> -#include <linux/errno.h> -#include <linux/fs.h> #include <linux/namei.h> -#include <linux/fcntl.h> -#include <linux/net.h> -#include <linux/in.h> -#include <linux/syscalls.h> -#include <linux/unistd.h> -#include <linux/slab.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/pagemap.h> -#include <linux/init.h> -#include <linux/inet.h> -#include <linux/string.h> #include <linux/ctype.h> -#include <linux/nfs.h> #include <linux/nfsd_idmap.h> -#include <linux/lockd/bind.h> -#include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> -#include <linux/nfsd/xdr.h> #include <linux/nfsd/syscall.h> #include <linux/lockd/lockd.h> #include <linux/sunrpc/clnt.h> -#include <asm/uaccess.h> -#include <net/ipv6.h> +#include "nfsd.h" +#include "cache.h" /* * We have a single directory with 9 nodes in it. @@ -55,6 +29,7 @@ enum { NFSD_Getfd, NFSD_Getfs, NFSD_List, + NFSD_Export_features, NFSD_Fh, NFSD_FO_UnlockIP, NFSD_FO_UnlockFS, @@ -173,6 +148,24 @@ static const struct file_operations exports_operations = { .owner = THIS_MODULE, }; +static int export_features_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%x 0x%x\n", NFSEXP_ALLFLAGS, NFSEXP_SECINFO_FLAGS); + return 0; +} + +static int export_features_open(struct inode *inode, struct file *file) +{ + return single_open(file, export_features_show, NULL); +} + +static struct file_operations export_features_operations = { + .open = export_features_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); @@ -1330,6 +1323,8 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, + [NFSD_Export_features] = {"export_features", + &export_features_operations, S_IRUGO}, [NFSD_FO_UnlockIP] = {"unlock_ip", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_FO_UnlockFS] = {"unlock_filesystem", diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h new file mode 100644 index 0000000..e942a1a --- /dev/null +++ b/fs/nfsd/nfsd.h @@ -0,0 +1,338 @@ +/* + * Hodge-podge collection of knfsd-related stuff. + * I will sort this out later. + * + * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> + */ + +#ifndef LINUX_NFSD_NFSD_H +#define LINUX_NFSD_NFSD_H + +#include <linux/types.h> +#include <linux/mount.h> + +#include <linux/nfsd/debug.h> +#include <linux/nfsd/export.h> +#include <linux/nfsd/stats.h> +/* + * nfsd version + */ +#define NFSD_SUPPORTED_MINOR_VERSION 1 + +struct readdir_cd { + __be32 err; /* 0, nfserr, or nfserr_eof */ +}; + + +extern struct svc_program nfsd_program; +extern struct svc_version nfsd_version2, nfsd_version3, + nfsd_version4; +extern u32 nfsd_supported_minorversion; +extern struct mutex nfsd_mutex; +extern struct svc_serv *nfsd_serv; +extern spinlock_t nfsd_drc_lock; +extern unsigned int nfsd_drc_max_mem; +extern unsigned int nfsd_drc_mem_used; + +extern const struct seq_operations nfs_exports_op; + +/* + * Function prototypes. + */ +int nfsd_svc(unsigned short port, int nrservs); +int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); + +int nfsd_nrthreads(void); +int nfsd_nrpools(void); +int nfsd_get_nrthreads(int n, int *); +int nfsd_set_nrthreads(int n, int *); + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +#ifdef CONFIG_NFSD_V2_ACL +extern struct svc_version nfsd_acl_version2; +#else +#define nfsd_acl_version2 NULL +#endif +#ifdef CONFIG_NFSD_V3_ACL +extern struct svc_version nfsd_acl_version3; +#else +#define nfsd_acl_version3 NULL +#endif +#endif + +enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL }; +int nfsd_vers(int vers, enum vers_op change); +int nfsd_minorversion(u32 minorversion, enum vers_op change); +void nfsd_reset_versions(void); +int nfsd_create_serv(void); + +extern int nfsd_max_blksize; + +static inline int nfsd_v4client(struct svc_rqst *rq) +{ + return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4; +} + +/* + * NFSv4 State + */ +#ifdef CONFIG_NFSD_V4 +extern unsigned int max_delegations; +int nfs4_state_init(void); +void nfsd4_free_slabs(void); +int nfs4_state_start(void); +void nfs4_state_shutdown(void); +time_t nfs4_lease_time(void); +void nfs4_reset_lease(time_t leasetime); +int nfs4_reset_recoverydir(char *recdir); +#else +static inline int nfs4_state_init(void) { return 0; } +static inline void nfsd4_free_slabs(void) { } +static inline int nfs4_state_start(void) { return 0; } +static inline void nfs4_state_shutdown(void) { } +static inline time_t nfs4_lease_time(void) { return 0; } +static inline void nfs4_reset_lease(time_t leasetime) { } +static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } +#endif + +/* + * lockd binding + */ +void nfsd_lockd_init(void); +void nfsd_lockd_shutdown(void); + + +/* + * These macros provide pre-xdr'ed values for faster operation. + */ +#define nfs_ok cpu_to_be32(NFS_OK) +#define nfserr_perm cpu_to_be32(NFSERR_PERM) +#define nfserr_noent cpu_to_be32(NFSERR_NOENT) +#define nfserr_io cpu_to_be32(NFSERR_IO) +#define nfserr_nxio cpu_to_be32(NFSERR_NXIO) +#define nfserr_eagain cpu_to_be32(NFSERR_EAGAIN) +#define nfserr_acces cpu_to_be32(NFSERR_ACCES) +#define nfserr_exist cpu_to_be32(NFSERR_EXIST) +#define nfserr_xdev cpu_to_be32(NFSERR_XDEV) +#define nfserr_nodev cpu_to_be32(NFSERR_NODEV) +#define nfserr_notdir cpu_to_be32(NFSERR_NOTDIR) +#define nfserr_isdir cpu_to_be32(NFSERR_ISDIR) +#define nfserr_inval cpu_to_be32(NFSERR_INVAL) +#define nfserr_fbig cpu_to_be32(NFSERR_FBIG) +#define nfserr_nospc cpu_to_be32(NFSERR_NOSPC) +#define nfserr_rofs cpu_to_be32(NFSERR_ROFS) +#define nfserr_mlink cpu_to_be32(NFSERR_MLINK) +#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP) +#define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG) +#define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY) +#define nfserr_dquot cpu_to_be32(NFSERR_DQUOT) +#define nfserr_stale cpu_to_be32(NFSERR_STALE) +#define nfserr_remote cpu_to_be32(NFSERR_REMOTE) +#define nfserr_wflush cpu_to_be32(NFSERR_WFLUSH) +#define nfserr_badhandle cpu_to_be32(NFSERR_BADHANDLE) +#define nfserr_notsync cpu_to_be32(NFSERR_NOT_SYNC) +#define nfserr_badcookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_notsupp cpu_to_be32(NFSERR_NOTSUPP) +#define nfserr_toosmall cpu_to_be32(NFSERR_TOOSMALL) +#define nfserr_serverfault cpu_to_be32(NFSERR_SERVERFAULT) +#define nfserr_badtype cpu_to_be32(NFSERR_BADTYPE) +#define nfserr_jukebox cpu_to_be32(NFSERR_JUKEBOX) +#define nfserr_denied cpu_to_be32(NFSERR_DENIED) +#define nfserr_deadlock cpu_to_be32(NFSERR_DEADLOCK) +#define nfserr_expired cpu_to_be32(NFSERR_EXPIRED) +#define nfserr_bad_cookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_same cpu_to_be32(NFSERR_SAME) +#define nfserr_clid_inuse cpu_to_be32(NFSERR_CLID_INUSE) +#define nfserr_stale_clientid cpu_to_be32(NFSERR_STALE_CLIENTID) +#define nfserr_resource cpu_to_be32(NFSERR_RESOURCE) +#define nfserr_moved cpu_to_be32(NFSERR_MOVED) +#define nfserr_nofilehandle cpu_to_be32(NFSERR_NOFILEHANDLE) +#define nfserr_minor_vers_mismatch cpu_to_be32(NFSERR_MINOR_VERS_MISMATCH) +#define nfserr_share_denied cpu_to_be32(NFSERR_SHARE_DENIED) +#define nfserr_stale_stateid cpu_to_be32(NFSERR_STALE_STATEID) +#define nfserr_old_stateid cpu_to_be32(NFSERR_OLD_STATEID) +#define nfserr_bad_stateid cpu_to_be32(NFSERR_BAD_STATEID) +#define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID) +#define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK) +#define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME) +#define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH) +#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) +#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) +#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) +#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) +#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) +#define nfserr_grace cpu_to_be32(NFSERR_GRACE) +#define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE) +#define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD) +#define nfserr_badname cpu_to_be32(NFSERR_BADNAME) +#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) +#define nfserr_locked cpu_to_be32(NFSERR_LOCKED) +#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) +#define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE) +#define nfserr_badlayout cpu_to_be32(NFS4ERR_BADLAYOUT) +#define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST) +#define nfserr_badsession cpu_to_be32(NFS4ERR_BADSESSION) +#define nfserr_badslot cpu_to_be32(NFS4ERR_BADSLOT) +#define nfserr_complete_already cpu_to_be32(NFS4ERR_COMPLETE_ALREADY) +#define nfserr_conn_not_bound_to_session cpu_to_be32(NFS4ERR_CONN_NOT_BOUND_TO_SESSION) +#define nfserr_deleg_already_wanted cpu_to_be32(NFS4ERR_DELEG_ALREADY_WANTED) +#define nfserr_back_chan_busy cpu_to_be32(NFS4ERR_BACK_CHAN_BUSY) +#define nfserr_layouttrylater cpu_to_be32(NFS4ERR_LAYOUTTRYLATER) +#define nfserr_layoutunavailable cpu_to_be32(NFS4ERR_LAYOUTUNAVAILABLE) +#define nfserr_nomatching_layout cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT) +#define nfserr_recallconflict cpu_to_be32(NFS4ERR_RECALLCONFLICT) +#define nfserr_unknown_layouttype cpu_to_be32(NFS4ERR_UNKNOWN_LAYOUTTYPE) +#define nfserr_seq_misordered cpu_to_be32(NFS4ERR_SEQ_MISORDERED) +#define nfserr_sequence_pos cpu_to_be32(NFS4ERR_SEQUENCE_POS) +#define nfserr_req_too_big cpu_to_be32(NFS4ERR_REQ_TOO_BIG) +#define nfserr_rep_too_big cpu_to_be32(NFS4ERR_REP_TOO_BIG) +#define nfserr_rep_too_big_to_cache cpu_to_be32(NFS4ERR_REP_TOO_BIG_TO_CACHE) +#define nfserr_retry_uncached_rep cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP) +#define nfserr_unsafe_compound cpu_to_be32(NFS4ERR_UNSAFE_COMPOUND) +#define nfserr_too_many_ops cpu_to_be32(NFS4ERR_TOO_MANY_OPS) +#define nfserr_op_not_in_session cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION) +#define nfserr_hash_alg_unsupp cpu_to_be32(NFS4ERR_HASH_ALG_UNSUPP) +#define nfserr_clientid_busy cpu_to_be32(NFS4ERR_CLIENTID_BUSY) +#define nfserr_pnfs_io_hole cpu_to_be32(NFS4ERR_PNFS_IO_HOLE) +#define nfserr_seq_false_retry cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY) +#define nfserr_bad_high_slot cpu_to_be32(NFS4ERR_BAD_HIGH_SLOT) +#define nfserr_deadsession cpu_to_be32(NFS4ERR_DEADSESSION) +#define nfserr_encr_alg_unsupp cpu_to_be32(NFS4ERR_ENCR_ALG_UNSUPP) +#define nfserr_pnfs_no_layout cpu_to_be32(NFS4ERR_PNFS_NO_LAYOUT) +#define nfserr_not_only_op cpu_to_be32(NFS4ERR_NOT_ONLY_OP) +#define nfserr_wrong_cred cpu_to_be32(NFS4ERR_WRONG_CRED) +#define nfserr_wrong_type cpu_to_be32(NFS4ERR_WRONG_TYPE) +#define nfserr_dirdeleg_unavail cpu_to_be32(NFS4ERR_DIRDELEG_UNAVAIL) +#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG) +#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT) +#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED) + +/* error codes for internal use */ +/* if a request fails due to kmalloc failure, it gets dropped. + * Client should resend eventually + */ +#define nfserr_dropit cpu_to_be32(30000) +/* end-of-file indicator in readdir */ +#define nfserr_eof cpu_to_be32(30001) +/* replay detected */ +#define nfserr_replay_me cpu_to_be32(11001) +/* nfs41 replay detected */ +#define nfserr_replay_cache cpu_to_be32(11002) + +/* Check for dir entries '.' and '..' */ +#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) + +/* + * Time of server startup + */ +extern struct timeval nfssvc_boot; + +#ifdef CONFIG_NFSD_V4 + +/* before processing a COMPOUND operation, we have to check that there + * is enough space in the buffer for XDR encode to succeed. otherwise, + * we might process an operation with side effects, and be unable to + * tell the client that the operation succeeded. + * + * COMPOUND_SLACK_SPACE - this is the minimum bytes of buffer space + * needed to encode an "ordinary" _successful_ operation. (GETATTR, + * READ, READDIR, and READLINK have their own buffer checks.) if we + * fall below this level, we fail the next operation with NFS4ERR_RESOURCE. + * + * COMPOUND_ERR_SLACK_SPACE - this is the minimum bytes of buffer space + * needed to encode an operation which has failed with NFS4ERR_RESOURCE. + * care is taken to ensure that we never fall below this level for any + * reason. + */ +#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ +#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ + +#define NFSD_LEASE_TIME (nfs4_lease_time()) +#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ + +/* + * The following attributes are currently not supported by the NFSv4 server: + * ARCHIVE (deprecated anyway) + * HIDDEN (unlikely to be supported any time soon) + * MIMETYPE (unlikely to be supported any time soon) + * QUOTA_* (will be supported in a forthcoming patch) + * SYSTEM (unlikely to be supported any time soon) + * TIME_BACKUP (unlikely to be supported any time soon) + * TIME_CREATE (unlikely to be supported any time soon) + */ +#define NFSD4_SUPPORTED_ATTRS_WORD0 \ +(FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \ + | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \ + | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \ + | FATTR4_WORD0_UNIQUE_HANDLES | FATTR4_WORD0_LEASE_TIME | FATTR4_WORD0_RDATTR_ERROR \ + | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_CANSETTIME | FATTR4_WORD0_CASE_INSENSITIVE \ + | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED \ + | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \ + | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_HOMOGENEOUS \ + | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \ + | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_ACL) + +#define NFSD4_SUPPORTED_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \ + | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \ + | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \ + | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \ + | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \ + | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID) + +#define NFSD4_SUPPORTED_ATTRS_WORD2 0 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + NFSD4_SUPPORTED_ATTRS_WORD1 + +#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ + (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) + +static inline u32 nfsd_suppattrs0(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 + : NFSD4_SUPPORTED_ATTRS_WORD0; +} + +static inline u32 nfsd_suppattrs1(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1 + : NFSD4_SUPPORTED_ATTRS_WORD1; +} + +static inline u32 nfsd_suppattrs2(u32 minorversion) +{ + return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2 + : NFSD4_SUPPORTED_ATTRS_WORD2; +} + +/* These will return ERR_INVAL if specified in GETATTR or READDIR. */ +#define NFSD_WRITEONLY_ATTRS_WORD1 \ +(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) + +/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */ +#define NFSD_WRITEABLE_ATTRS_WORD0 \ +(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL ) +#define NFSD_WRITEABLE_ATTRS_WORD1 \ +(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) +#define NFSD_WRITEABLE_ATTRS_WORD2 0 + +#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ + NFSD_WRITEABLE_ATTRS_WORD0 +/* + * we currently store the exclusive create verifier in the v_{a,m}time + * attributes so the client can't set these at create time using EXCLUSIVE4_1 + */ +#define NFSD_SUPPATTR_EXCLCREAT_WORD1 \ + (NFSD_WRITEABLE_ATTRS_WORD1 & \ + ~(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)) +#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ + NFSD_WRITEABLE_ATTRS_WORD2 + +#endif /* CONFIG_NFSD_V4 */ + +#endif /* LINUX_NFSD_NFSD_H */ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 01965b2..55c8e63 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -1,6 +1,4 @@ /* - * linux/fs/nfsd/nfsfh.c - * * NFS server file handle treatment. * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> @@ -9,19 +7,11 @@ * ... and again Southern-Winter 2001 to support export_operations */ -#include <linux/slab.h> -#include <linux/fs.h> -#include <linux/unistd.h> -#include <linux/string.h> -#include <linux/stat.h> -#include <linux/dcache.h> #include <linux/exportfs.h> -#include <linux/mount.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcauth_gss.h> -#include <linux/nfsd/nfsd.h> +#include "nfsd.h" +#include "vfs.h" #include "auth.h" #define NFSDDBG_FACILITY NFSDDBG_FH @@ -96,8 +86,10 @@ nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, struct svc_export *exp) { + int flags = nfsexp_flags(rqstp, exp); + /* Check if the request originated from a secure port. */ - if (!rqstp->rq_secure && EX_SECURE(exp)) { + if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) { RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk(KERN_WARNING "nfsd: request from insecure port %s!\n", @@ -109,6 +101,36 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, return nfserrno(nfsd_setuser(rqstp, exp)); } +static inline __be32 check_pseudo_root(struct svc_rqst *rqstp, + struct dentry *dentry, struct svc_export *exp) +{ + if (!(exp->ex_flags & NFSEXP_V4ROOT)) + return nfs_ok; + /* + * v2/v3 clients have no need for the V4ROOT export--they use + * the mount protocl instead; also, further V4ROOT checks may be + * in v4-specific code, in which case v2/v3 clients could bypass + * them. + */ + if (!nfsd_v4client(rqstp)) + return nfserr_stale; + /* + * We're exposing only the directories and symlinks that have to be + * traversed on the way to real exports: + */ + if (unlikely(!S_ISDIR(dentry->d_inode->i_mode) && + !S_ISLNK(dentry->d_inode->i_mode))) + return nfserr_stale; + /* + * A pseudoroot export gives permission to access only one + * single directory; the kernel has to make another upcall + * before granting access to anything else under it: + */ + if (unlikely(dentry != exp->ex_path.dentry)) + return nfserr_stale; + return nfs_ok; +} + /* * Use the given filehandle to look up the corresponding export and * dentry. On success, the results are used to set fh_export and @@ -232,14 +254,6 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) goto out; } - if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) { - error = nfsd_setuser_and_check_port(rqstp, exp); - if (error) { - dput(dentry); - goto out; - } - } - if (S_ISDIR(dentry->d_inode->i_mode) && (dentry->d_flags & DCACHE_DISCONNECTED)) { printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n", @@ -294,28 +308,32 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) error = nfsd_set_fh_dentry(rqstp, fhp); if (error) goto out; - dentry = fhp->fh_dentry; - exp = fhp->fh_export; - } else { - /* - * just rechecking permissions - * (e.g. nfsproc_create calls fh_verify, then nfsd_create - * does as well) - */ - dprintk("nfsd: fh_verify - just checking\n"); - dentry = fhp->fh_dentry; - exp = fhp->fh_export; - /* - * Set user creds for this exportpoint; necessary even - * in the "just checking" case because this may be a - * filehandle that was created by fh_compose, and that - * is about to be used in another nfsv4 compound - * operation. - */ - error = nfsd_setuser_and_check_port(rqstp, exp); - if (error) - goto out; } + dentry = fhp->fh_dentry; + exp = fhp->fh_export; + /* + * We still have to do all these permission checks, even when + * fh_dentry is already set: + * - fh_verify may be called multiple times with different + * "access" arguments (e.g. nfsd_proc_create calls + * fh_verify(...,NFSD_MAY_EXEC) first, then later (in + * nfsd_create) calls fh_verify(...,NFSD_MAY_CREATE). + * - in the NFSv4 case, the filehandle may have been filled + * in by fh_compose, and given a dentry, but further + * compound operations performed with that filehandle + * still need permissions checks. In the worst case, a + * mountpoint crossing may have changed the export + * options, and we may now need to use a different uid + * (for example, if different id-squashing options are in + * effect on the new filesystem). + */ + error = check_pseudo_root(rqstp, dentry, exp); + if (error) + goto out; + + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) + goto out; error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type); if (error) diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h new file mode 100644 index 0000000..cdfb8c6 --- /dev/null +++ b/fs/nfsd/nfsfh.h @@ -0,0 +1,208 @@ +/* Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ + +#ifndef _LINUX_NFSD_FH_INT_H +#define _LINUX_NFSD_FH_INT_H + +#include <linux/nfsd/nfsfh.h> + +enum nfsd_fsid { + FSID_DEV = 0, + FSID_NUM, + FSID_MAJOR_MINOR, + FSID_ENCODE_DEV, + FSID_UUID4_INUM, + FSID_UUID8, + FSID_UUID16, + FSID_UUID16_INUM, +}; + +enum fsid_source { + FSIDSOURCE_DEV, + FSIDSOURCE_FSID, + FSIDSOURCE_UUID, +}; +extern enum fsid_source fsid_source(struct svc_fh *fhp); + + +/* This might look a little large to "inline" but in all calls except + * one, 'vers' is constant so moste of the function disappears. + */ +static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino, + u32 fsid, unsigned char *uuid) +{ + u32 *up; + switch(vers) { + case FSID_DEV: + fsidv[0] = htonl((MAJOR(dev)<<16) | + MINOR(dev)); + fsidv[1] = ino_t_to_u32(ino); + break; + case FSID_NUM: + fsidv[0] = fsid; + break; + case FSID_MAJOR_MINOR: + fsidv[0] = htonl(MAJOR(dev)); + fsidv[1] = htonl(MINOR(dev)); + fsidv[2] = ino_t_to_u32(ino); + break; + + case FSID_ENCODE_DEV: + fsidv[0] = new_encode_dev(dev); + fsidv[1] = ino_t_to_u32(ino); + break; + + case FSID_UUID4_INUM: + /* 4 byte fsid and inode number */ + up = (u32*)uuid; + fsidv[0] = ino_t_to_u32(ino); + fsidv[1] = up[0] ^ up[1] ^ up[2] ^ up[3]; + break; + + case FSID_UUID8: + /* 8 byte fsid */ + up = (u32*)uuid; + fsidv[0] = up[0] ^ up[2]; + fsidv[1] = up[1] ^ up[3]; + break; + + case FSID_UUID16: + /* 16 byte fsid - NFSv3+ only */ + memcpy(fsidv, uuid, 16); + break; + + case FSID_UUID16_INUM: + /* 8 byte inode and 16 byte fsid */ + *(u64*)fsidv = (u64)ino; + memcpy(fsidv+2, uuid, 16); + break; + default: BUG(); + } +} + +static inline int key_len(int type) +{ + switch(type) { + case FSID_DEV: return 8; + case FSID_NUM: return 4; + case FSID_MAJOR_MINOR: return 12; + case FSID_ENCODE_DEV: return 8; + case FSID_UUID4_INUM: return 8; + case FSID_UUID8: return 8; + case FSID_UUID16: return 16; + case FSID_UUID16_INUM: return 24; + default: return 0; + } +} + +/* + * Shorthand for dprintk()'s + */ +extern char * SVCFH_fmt(struct svc_fh *fhp); + +/* + * Function prototypes + */ +__be32 fh_verify(struct svc_rqst *, struct svc_fh *, int, int); +__be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *); +__be32 fh_update(struct svc_fh *); +void fh_put(struct svc_fh *); + +static __inline__ struct svc_fh * +fh_copy(struct svc_fh *dst, struct svc_fh *src) +{ + WARN_ON(src->fh_dentry || src->fh_locked); + + *dst = *src; + return dst; +} + +static inline void +fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src) +{ + dst->fh_size = src->fh_size; + memcpy(&dst->fh_base, &src->fh_base, src->fh_size); +} + +static __inline__ struct svc_fh * +fh_init(struct svc_fh *fhp, int maxsize) +{ + memset(fhp, 0, sizeof(*fhp)); + fhp->fh_maxsize = maxsize; + return fhp; +} + +#ifdef CONFIG_NFSD_V3 +/* + * Fill in the pre_op attr for the wcc data + */ +static inline void +fill_pre_wcc(struct svc_fh *fhp) +{ + struct inode *inode; + + inode = fhp->fh_dentry->d_inode; + if (!fhp->fh_pre_saved) { + fhp->fh_pre_mtime = inode->i_mtime; + fhp->fh_pre_ctime = inode->i_ctime; + fhp->fh_pre_size = inode->i_size; + fhp->fh_pre_change = inode->i_version; + fhp->fh_pre_saved = 1; + } +} + +extern void fill_post_wcc(struct svc_fh *); +#else +#define fill_pre_wcc(ignored) +#define fill_post_wcc(notused) +#endif /* CONFIG_NFSD_V3 */ + + +/* + * Lock a file handle/inode + * NOTE: both fh_lock and fh_unlock are done "by hand" in + * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once + * so, any changes here should be reflected there. + */ + +static inline void +fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) +{ + struct dentry *dentry = fhp->fh_dentry; + struct inode *inode; + + BUG_ON(!dentry); + + if (fhp->fh_locked) { + printk(KERN_WARNING "fh_lock: %s/%s already locked!\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + return; + } + + inode = dentry->d_inode; + mutex_lock_nested(&inode->i_mutex, subclass); + fill_pre_wcc(fhp); + fhp->fh_locked = 1; +} + +static inline void +fh_lock(struct svc_fh *fhp) +{ + fh_lock_nested(fhp, I_MUTEX_NORMAL); +} + +/* + * Unlock a file handle/inode + */ +static inline void +fh_unlock(struct svc_fh *fhp) +{ + BUG_ON(!fhp->fh_dentry); + + if (fhp->fh_locked) { + fill_post_wcc(fhp); + mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); + fhp->fh_locked = 0; + } +} + +#endif /* _LINUX_NFSD_FH_INT_H */ diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 0eb9c82..a047ad6 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -1,29 +1,14 @@ /* - * nfsproc2.c Process version 2 NFS requests. - * linux/fs/nfsd/nfs2proc.c - * * Process version 2 NFS requests. * * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/linkage.h> -#include <linux/time.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/stat.h> -#include <linux/fcntl.h> -#include <linux/net.h> -#include <linux/in.h> #include <linux/namei.h> -#include <linux/unistd.h> -#include <linux/slab.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/cache.h> -#include <linux/nfsd/xdr.h> +#include "cache.h" +#include "xdr.h" +#include "vfs.h" typedef struct svc_rqst svc_rqst; typedef struct svc_buf svc_buf; @@ -758,6 +743,7 @@ nfserrno (int errno) { nfserr_io, -ETXTBSY }, { nfserr_notsupp, -EOPNOTSUPP }, { nfserr_toosmall, -ETOOSMALL }, + { nfserr_serverfault, -ESERVERFAULT }, }; int i; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 67ea83e..171699e 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -1,6 +1,4 @@ /* - * linux/fs/nfsd/nfssvc.c - * * Central processing for nfsd. * * Authors: Olaf Kirch (okir@monad.swb.de) @@ -8,33 +6,19 @@ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/module.h> #include <linux/sched.h> -#include <linux/time.h> -#include <linux/errno.h> -#include <linux/nfs.h> -#include <linux/in.h> -#include <linux/uio.h> -#include <linux/unistd.h> -#include <linux/slab.h> -#include <linux/smp.h> #include <linux/freezer.h> #include <linux/fs_struct.h> -#include <linux/kthread.h> #include <linux/swap.h> -#include <linux/sunrpc/types.h> #include <linux/sunrpc/stats.h> -#include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> -#include <linux/sunrpc/cache.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/stats.h> -#include <linux/nfsd/cache.h> -#include <linux/nfsd/syscall.h> #include <linux/lockd/bind.h> #include <linux/nfsacl.h> #include <linux/seq_file.h> +#include "nfsd.h" +#include "cache.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_SVC diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index afd08e2..4ce005d 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -1,20 +1,10 @@ /* - * linux/fs/nfsd/nfsxdr.c - * * XDR support for nfsd * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/types.h> -#include <linux/time.h> -#include <linux/nfs.h> -#include <linux/vfs.h> -#include <linux/sunrpc/xdr.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#include <linux/nfsd/xdr.h> -#include <linux/mm.h> +#include "xdr.h" #include "auth.h" #define NFSDDBG_FACILITY NFSDDBG_XDR diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h new file mode 100644 index 0000000..fefeae2 --- /dev/null +++ b/fs/nfsd/state.h @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith <kmsmith@umich.edu> + * Andy Adamson <andros@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _NFSD4_STATE_H +#define _NFSD4_STATE_H + +#include <linux/nfsd/nfsfh.h> +#include "nfsfh.h" + +typedef struct { + u32 cl_boot; + u32 cl_id; +} clientid_t; + +typedef struct { + u32 so_boot; + u32 so_stateownerid; + u32 so_fileid; +} stateid_opaque_t; + +typedef struct { + u32 si_generation; + stateid_opaque_t si_opaque; +} stateid_t; +#define si_boot si_opaque.so_boot +#define si_stateownerid si_opaque.so_stateownerid +#define si_fileid si_opaque.so_fileid + +#define STATEID_FMT "(%08x/%08x/%08x/%08x)" +#define STATEID_VAL(s) \ + (s)->si_boot, \ + (s)->si_stateownerid, \ + (s)->si_fileid, \ + (s)->si_generation + +struct nfsd4_cb_sequence { + /* args/res */ + u32 cbs_minorversion; + struct nfs4_client *cbs_clp; +}; + +struct nfs4_delegation { + struct list_head dl_perfile; + struct list_head dl_perclnt; + struct list_head dl_recall_lru; /* delegation recalled */ + atomic_t dl_count; /* ref count */ + struct nfs4_client *dl_client; + struct nfs4_file *dl_file; + struct file_lock *dl_flock; + struct file *dl_vfs_file; + u32 dl_type; + time_t dl_time; +/* For recall: */ + u32 dl_ident; + stateid_t dl_stateid; + struct knfsd_fh dl_fh; + int dl_retries; +}; + +/* client delegation callback info */ +struct nfs4_cb_conn { + /* SETCLIENTID info */ + struct sockaddr_storage cb_addr; + size_t cb_addrlen; + u32 cb_prog; + u32 cb_minorversion; + u32 cb_ident; /* minorversion 0 only */ + /* RPC client info */ + atomic_t cb_set; /* successful CB_NULL call */ + struct rpc_clnt * cb_client; +}; + +/* Maximum number of slots per session. 160 is useful for long haul TCP */ +#define NFSD_MAX_SLOTS_PER_SESSION 160 +/* Maximum number of operations per session compound */ +#define NFSD_MAX_OPS_PER_COMPOUND 16 +/* Maximum session per slot cache size */ +#define NFSD_SLOT_CACHE_SIZE 1024 +/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ +#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32 +#define NFSD_MAX_MEM_PER_SESSION \ + (NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE) + +struct nfsd4_slot { + bool sl_inuse; + bool sl_cachethis; + u16 sl_opcnt; + u32 sl_seqid; + __be32 sl_status; + u32 sl_datalen; + char sl_data[]; +}; + +struct nfsd4_channel_attrs { + u32 headerpadsz; + u32 maxreq_sz; + u32 maxresp_sz; + u32 maxresp_cached; + u32 maxops; + u32 maxreqs; + u32 nr_rdma_attrs; + u32 rdma_attrs; +}; + +struct nfsd4_create_session { + clientid_t clientid; + struct nfs4_sessionid sessionid; + u32 seqid; + u32 flags; + struct nfsd4_channel_attrs fore_channel; + struct nfsd4_channel_attrs back_channel; + u32 callback_prog; + u32 uid; + u32 gid; +}; + +/* The single slot clientid cache structure */ +struct nfsd4_clid_slot { + u32 sl_seqid; + __be32 sl_status; + struct nfsd4_create_session sl_cr_ses; +}; + +struct nfsd4_session { + struct kref se_ref; + struct list_head se_hash; /* hash by sessionid */ + struct list_head se_perclnt; + u32 se_flags; + struct nfs4_client *se_client; /* for expire_client */ + struct nfs4_sessionid se_sessionid; + struct nfsd4_channel_attrs se_fchannel; + struct nfsd4_channel_attrs se_bchannel; + struct nfsd4_slot *se_slots[]; /* forward channel slots */ +}; + +static inline void +nfsd4_put_session(struct nfsd4_session *ses) +{ + extern void free_session(struct kref *kref); + kref_put(&ses->se_ref, free_session); +} + +static inline void +nfsd4_get_session(struct nfsd4_session *ses) +{ + kref_get(&ses->se_ref); +} + +/* formatted contents of nfs4_sessionid */ +struct nfsd4_sessionid { + clientid_t clientid; + u32 sequence; + u32 reserved; +}; + +#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */ + +/* + * struct nfs4_client - one per client. Clientids live here. + * o Each nfs4_client is hashed by clientid. + * + * o Each nfs4_clients is also hashed by name + * (the opaque quantity initially sent by the client to identify itself). + * + * o cl_perclient list is used to ensure no dangling stateowner references + * when we expire the nfs4_client + */ +struct nfs4_client { + struct list_head cl_idhash; /* hash by cl_clientid.id */ + struct list_head cl_strhash; /* hash by cl_name */ + struct list_head cl_openowners; + struct list_head cl_delegations; + struct list_head cl_lru; /* tail queue */ + struct xdr_netobj cl_name; /* id generated by client */ + char cl_recdir[HEXDIR_LEN]; /* recovery dir */ + nfs4_verifier cl_verifier; /* generated by client */ + time_t cl_time; /* time of last lease renewal */ + struct sockaddr_storage cl_addr; /* client ipaddress */ + u32 cl_flavor; /* setclientid pseudoflavor */ + char *cl_principal; /* setclientid principal name */ + struct svc_cred cl_cred; /* setclientid principal */ + clientid_t cl_clientid; /* generated by server */ + nfs4_verifier cl_confirm; /* generated by server */ + struct nfs4_cb_conn cl_cb_conn; /* callback info */ + atomic_t cl_count; /* ref count */ + u32 cl_firststate; /* recovery dir creation */ + + /* for nfs41 */ + struct list_head cl_sessions; + struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ + u32 cl_exchange_flags; + struct nfs4_sessionid cl_sessionid; + + /* for nfs41 callbacks */ + /* We currently support a single back channel with a single slot */ + unsigned long cl_cb_slot_busy; + u32 cl_cb_seq_nr; + struct svc_xprt *cl_cb_xprt; /* 4.1 callback transport */ + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ +}; + +/* struct nfs4_client_reset + * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl + * upon lease reset, or from upcall to state_daemon (to read in state + * from non-volitile storage) upon reboot. + */ +struct nfs4_client_reclaim { + struct list_head cr_strhash; /* hash by cr_name */ + char cr_recdir[HEXDIR_LEN]; /* recover dir */ +}; + +static inline void +update_stateid(stateid_t *stateid) +{ + stateid->si_generation++; +} + +/* A reasonable value for REPLAY_ISIZE was estimated as follows: + * The OPEN response, typically the largest, requires + * 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) + + * 4(deleg. type) + 8(deleg. stateid) + 4(deleg. recall flag) + + * 20(deleg. space limit) + ~32(deleg. ace) = 112 bytes + */ + +#define NFSD4_REPLAY_ISIZE 112 + +/* + * Replay buffer, where the result of the last seqid-mutating operation + * is cached. + */ +struct nfs4_replay { + __be32 rp_status; + unsigned int rp_buflen; + char *rp_buf; + unsigned intrp_allocated; + struct knfsd_fh rp_openfh; + char rp_ibuf[NFSD4_REPLAY_ISIZE]; +}; + +/* +* nfs4_stateowner can either be an open_owner, or a lock_owner +* +* so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[] +* for lock_owner +* so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[] +* for lock_owner +* so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client +* struct is reaped. +* so_perfilestate: heads the list of nfs4_stateid (either open or lock) +* and is used to ensure no dangling nfs4_stateid references when we +* release a stateowner. +* so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when +* close is called to reap associated byte-range locks +* so_close_lru: (open) stateowner is placed on this list instead of being +* reaped (when so_perfilestate is empty) to hold the last close replay. +* reaped by laundramat thread after lease period. +*/ +struct nfs4_stateowner { + struct kref so_ref; + struct list_head so_idhash; /* hash by so_id */ + struct list_head so_strhash; /* hash by op_name */ + struct list_head so_perclient; + struct list_head so_stateids; + struct list_head so_perstateid; /* for lockowners only */ + struct list_head so_close_lru; /* tail queue */ + time_t so_time; /* time of placement on so_close_lru */ + int so_is_open_owner; /* 1=openowner,0=lockowner */ + u32 so_id; + struct nfs4_client * so_client; + /* after increment in ENCODE_SEQID_OP_TAIL, represents the next + * sequence id expected from the client: */ + u32 so_seqid; + struct xdr_netobj so_owner; /* open owner name */ + int so_confirmed; /* successful OPEN_CONFIRM? */ + struct nfs4_replay so_replay; +}; + +/* +* nfs4_file: a file opened by some number of (open) nfs4_stateowners. +* o fi_perfile list is used to search for conflicting +* share_acces, share_deny on the file. +*/ +struct nfs4_file { + atomic_t fi_ref; + struct list_head fi_hash; /* hash by "struct inode *" */ + struct list_head fi_stateids; + struct list_head fi_delegations; + struct inode *fi_inode; + u32 fi_id; /* used with stateowner->so_id + * for stateid_hashtbl hash */ + bool fi_had_conflict; +}; + +/* +* nfs4_stateid can either be an open stateid or (eventually) a lock stateid +* +* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file +* +* st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry +* st_perfile: file_hashtbl[] entry. +* st_perfile_state: nfs4_stateowner->so_perfilestate +* st_perlockowner: (open stateid) list of lock nfs4_stateowners +* st_access_bmap: used only for open stateid +* st_deny_bmap: used only for open stateid +* st_openstp: open stateid lock stateid was derived from +* +* XXX: open stateids and lock stateids have diverged sufficiently that +* we should consider defining separate structs for the two cases. +*/ + +struct nfs4_stateid { + struct list_head st_hash; + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_lockowners; + struct nfs4_stateowner * st_stateowner; + struct nfs4_file * st_file; + stateid_t st_stateid; + struct file * st_vfs_file; + unsigned long st_access_bmap; + unsigned long st_deny_bmap; + struct nfs4_stateid * st_openstp; +}; + +/* flags for preprocess_seqid_op() */ +#define HAS_SESSION 0x00000001 +#define CONFIRM 0x00000002 +#define OPEN_STATE 0x00000004 +#define LOCK_STATE 0x00000008 +#define RD_STATE 0x00000010 +#define WR_STATE 0x00000020 +#define CLOSE_STATE 0x00000040 + +#define seqid_mutating_err(err) \ + (((err) != nfserr_stale_clientid) && \ + ((err) != nfserr_bad_seqid) && \ + ((err) != nfserr_stale_stateid) && \ + ((err) != nfserr_bad_stateid)) + +struct nfsd4_compound_state; + +extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, + stateid_t *stateid, int flags, struct file **filp); +extern void nfs4_lock_state(void); +extern void nfs4_unlock_state(void); +extern int nfs4_in_grace(void); +extern __be32 nfs4_check_open_reclaim(clientid_t *clid); +extern void put_nfs4_client(struct nfs4_client *clp); +extern void nfs4_free_stateowner(struct kref *kref); +extern int set_callback_cred(void); +extern void nfsd4_probe_callback(struct nfs4_client *clp); +extern void nfsd4_cb_recall(struct nfs4_delegation *dp); +extern void nfs4_put_delegation(struct nfs4_delegation *dp); +extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); +extern void nfsd4_init_recdir(char *recdir_name); +extern int nfsd4_recdir_load(void); +extern void nfsd4_shutdown_recdir(void); +extern int nfs4_client_to_reclaim(const char *name); +extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); +extern void nfsd4_recdir_purge_old(void); +extern int nfsd4_create_clid_dir(struct nfs4_client *clp); +extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); + +static inline void +nfs4_put_stateowner(struct nfs4_stateowner *so) +{ + kref_put(&so->so_ref, nfs4_free_stateowner); +} + +static inline void +nfs4_get_stateowner(struct nfs4_stateowner *so) +{ + kref_get(&so->so_ref); +} + +#endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 71944cd..5232d3e 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -1,6 +1,4 @@ /* - * linux/fs/nfsd/stats.c - * * procfs-based user access to knfsd statistics * * /proc/net/rpc/nfsd @@ -23,18 +21,13 @@ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ -#include <linux/kernel.h> -#include <linux/time.h> -#include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <linux/stat.h> #include <linux/module.h> - -#include <linux/sunrpc/svc.h> #include <linux/sunrpc/stats.h> -#include <linux/nfsd/nfsd.h> #include <linux/nfsd/stats.h> +#include "nfsd.h" + struct nfsd_stats nfsdstats; struct svc_stat nfsd_svcstats = { .program = &nfsd_program, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a293f02..8715d19 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1,7 +1,5 @@ #define MSNFS /* HACK HACK */ /* - * linux/fs/nfsd/vfs.c - * * File operations used by nfsd. Some of these have been ripped from * other parts of the kernel because they weren't exported, others * are partial duplicates with added or changed functionality. @@ -16,48 +14,31 @@ * Zerocpy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp> */ -#include <linux/string.h> -#include <linux/time.h> -#include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> -#include <linux/mount.h> -#include <linux/major.h> #include <linux/splice.h> -#include <linux/proc_fs.h> -#include <linux/stat.h> #include <linux/fcntl.h> -#include <linux/net.h> -#include <linux/unistd.h> -#include <linux/slab.h> -#include <linux/pagemap.h> -#include <linux/in.h> -#include <linux/module.h> #include <linux/namei.h> -#include <linux/vfs.h> #include <linux/delay.h> -#include <linux/sunrpc/svc.h> -#include <linux/nfsd/nfsd.h> -#ifdef CONFIG_NFSD_V3 -#include <linux/nfs3.h> -#include <linux/nfsd/xdr3.h> -#endif /* CONFIG_NFSD_V3 */ -#include <linux/nfsd/nfsfh.h> #include <linux/quotaops.h> #include <linux/fsnotify.h> -#include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> +#include <linux/jhash.h> +#include <linux/ima.h> +#include <asm/uaccess.h> + +#ifdef CONFIG_NFSD_V3 +#include "xdr3.h" +#endif /* CONFIG_NFSD_V3 */ + #ifdef CONFIG_NFSD_V4 -#include <linux/nfs4.h> #include <linux/nfs4_acl.h> #include <linux/nfsd_idmap.h> -#include <linux/security.h> #endif /* CONFIG_NFSD_V4 */ -#include <linux/jhash.h> -#include <linux/ima.h> -#include <asm/uaccess.h> +#include "nfsd.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_FILEOP @@ -89,12 +70,6 @@ struct raparm_hbucket { #define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; -static inline int -nfsd_v4client(struct svc_rqst *rq) -{ - return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4; -} - /* * Called from nfsd_lookup and encode_dirent. Check if we have crossed * a mount point. @@ -116,8 +91,16 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, exp2 = rqst_exp_get_by_name(rqstp, &path); if (IS_ERR(exp2)) { - if (PTR_ERR(exp2) != -ENOENT) - err = PTR_ERR(exp2); + err = PTR_ERR(exp2); + /* + * We normally allow NFS clients to continue + * "underneath" a mountpoint that is not exported. + * The exception is V4ROOT, where no traversal is ever + * allowed without an explicit export of the new + * directory. + */ + if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT)) + err = 0; path_put(&path); goto out; } @@ -141,6 +124,53 @@ out: return err; } +static void follow_to_parent(struct path *path) +{ + struct dentry *dp; + + while (path->dentry == path->mnt->mnt_root && follow_up(path)) + ; + dp = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = dp; +} + +static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp) +{ + struct svc_export *exp2; + struct path path = {.mnt = mntget((*exp)->ex_path.mnt), + .dentry = dget(dparent)}; + + follow_to_parent(&path); + + exp2 = rqst_exp_parent(rqstp, &path); + if (PTR_ERR(exp2) == -ENOENT) { + *dentryp = dget(dparent); + } else if (IS_ERR(exp2)) { + path_put(&path); + return PTR_ERR(exp2); + } else { + *dentryp = dget(path.dentry); + exp_put(*exp); + *exp = exp2; + } + path_put(&path); + return 0; +} + +/* + * For nfsd purposes, we treat V4ROOT exports as though there was an + * export at *every* directory. + */ +int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) +{ + if (d_mountpoint(dentry)) + return 1; + if (!(exp->ex_flags & NFSEXP_V4ROOT)) + return 0; + return dentry->d_inode != NULL; +} + __be32 nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, unsigned int len, @@ -169,35 +199,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = dget(dparent); else if (dparent != exp->ex_path.dentry) dentry = dget_parent(dparent); - else if (!EX_NOHIDE(exp)) + else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp)) dentry = dget(dparent); /* .. == . just like at / */ else { /* checking mountpoint crossing is very different when stepping up */ - struct svc_export *exp2 = NULL; - struct dentry *dp; - struct path path = {.mnt = mntget(exp->ex_path.mnt), - .dentry = dget(dparent)}; - - while (path.dentry == path.mnt->mnt_root && - follow_up(&path)) - ; - dp = dget_parent(path.dentry); - dput(path.dentry); - path.dentry = dp; - - exp2 = rqst_exp_parent(rqstp, &path); - if (PTR_ERR(exp2) == -ENOENT) { - dentry = dget(dparent); - } else if (IS_ERR(exp2)) { - host_err = PTR_ERR(exp2); - path_put(&path); + host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry); + if (host_err) goto out_nfserr; - } else { - dentry = dget(path.dentry); - exp_put(exp); - exp = exp2; - } - path_put(&path); } } else { fh_lock(fhp); @@ -208,7 +216,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, /* * check if we have crossed a mount point ... */ - if (d_mountpoint(dentry)) { + if (nfsd_mountpoint(dentry, exp)) { if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { dput(dentry); goto out_nfserr; @@ -745,7 +753,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (IS_ERR(*filp)) host_err = PTR_ERR(*filp); else - ima_counts_get(*filp); + host_err = ima_file_check(*filp, access); out_nfserr: err = nfserrno(host_err); out: @@ -774,12 +782,9 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp, int (*fsync) (struct file *, struct dentry *, int); int err; - err = filemap_fdatawrite(inode->i_mapping); + err = filemap_write_and_wait(inode->i_mapping); if (err == 0 && fop && (fsync = fop->fsync)) err = fsync(filp, dp, 0); - if (err == 0) - err = filemap_fdatawait(inode->i_mapping); - return err; } @@ -2124,8 +2129,6 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, */ path.mnt = exp->ex_path.mnt; path.dentry = dentry; - err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC), - IMA_COUNT_LEAVE); nfsd_out: return err? nfserrno(err) : 0; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h new file mode 100644 index 0000000..4b1de0a --- /dev/null +++ b/fs/nfsd/vfs.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> + */ + +#ifndef LINUX_NFSD_VFS_H +#define LINUX_NFSD_VFS_H + +#include "nfsfh.h" + +/* + * Flags for nfsd_permission + */ +#define NFSD_MAY_NOP 0 +#define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ +#define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ +#define NFSD_MAY_READ 4 /* == MAY_READ */ +#define NFSD_MAY_SATTR 8 +#define NFSD_MAY_TRUNC 16 +#define NFSD_MAY_LOCK 32 +#define NFSD_MAY_OWNER_OVERRIDE 64 +#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ +#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 + +#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) +#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) + +/* + * Callback function for readdir + */ +typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); + +/* nfsd/vfs.c */ +int fh_lock_parent(struct svc_fh *, struct dentry *); +int nfsd_racache_init(int); +void nfsd_racache_shutdown(void); +int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, + struct svc_export **expp); +__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, + const char *, unsigned int, struct svc_fh *); +__be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, + const char *, unsigned int, + struct svc_export **, struct dentry **); +__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, + struct iattr *, int, time_t); +int nfsd_mountpoint(struct dentry *, struct svc_export *); +#ifdef CONFIG_NFSD_V4 +__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *, + struct nfs4_acl *); +int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **); +#endif /* CONFIG_NFSD_V4 */ +__be32 nfsd_create(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + int type, dev_t rdev, struct svc_fh *res); +#ifdef CONFIG_NFSD_V3 +__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); +__be32 nfsd_create_v3(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + struct svc_fh *res, int createmode, + u32 *verifier, int *truncp, int *created); +__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, + loff_t, unsigned long); +#endif /* CONFIG_NFSD_V3 */ +__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int, + int, struct file **); +void nfsd_close(struct file *); +__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *, + loff_t, struct kvec *, int, unsigned long *); +__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, + loff_t, struct kvec *,int, unsigned long *, int *); +__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, + char *, int *); +__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, + char *name, int len, char *path, int plen, + struct svc_fh *res, struct iattr *); +__be32 nfsd_link(struct svc_rqst *, struct svc_fh *, + char *, int, struct svc_fh *); +__be32 nfsd_rename(struct svc_rqst *, + struct svc_fh *, char *, int, + struct svc_fh *, char *, int); +__be32 nfsd_remove(struct svc_rqst *, + struct svc_fh *, char *, int); +__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, + char *name, int len); +int nfsd_truncate(struct svc_rqst *, struct svc_fh *, + unsigned long size); +__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, + loff_t *, struct readdir_cd *, filldir_t); +__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, + struct kstatfs *, int access); + +int nfsd_notify_change(struct inode *, struct iattr *); +__be32 nfsd_permission(struct svc_rqst *, struct svc_export *, + struct dentry *, int); +int nfsd_sync_dir(struct dentry *dp); + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); +int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); +#endif + +#endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h new file mode 100644 index 0000000..53b1863 --- /dev/null +++ b/fs/nfsd/xdr.h @@ -0,0 +1,173 @@ +/* XDR types for nfsd. This is mainly a typing exercise. */ + +#ifndef LINUX_NFSD_H +#define LINUX_NFSD_H + +#include <linux/vfs.h> +#include "nfsd.h" +#include "nfsfh.h" + +struct nfsd_fhandle { + struct svc_fh fh; +}; + +struct nfsd_sattrargs { + struct svc_fh fh; + struct iattr attrs; +}; + +struct nfsd_diropargs { + struct svc_fh fh; + char * name; + unsigned int len; +}; + +struct nfsd_readargs { + struct svc_fh fh; + __u32 offset; + __u32 count; + int vlen; +}; + +struct nfsd_writeargs { + svc_fh fh; + __u32 offset; + int len; + int vlen; +}; + +struct nfsd_createargs { + struct svc_fh fh; + char * name; + unsigned int len; + struct iattr attrs; +}; + +struct nfsd_renameargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd_readlinkargs { + struct svc_fh fh; + char * buffer; +}; + +struct nfsd_linkargs { + struct svc_fh ffh; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd_symlinkargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + char * tname; + unsigned int tlen; + struct iattr attrs; +}; + +struct nfsd_readdirargs { + struct svc_fh fh; + __u32 cookie; + __u32 count; + __be32 * buffer; +}; + +struct nfsd_attrstat { + struct svc_fh fh; + struct kstat stat; +}; + +struct nfsd_diropres { + struct svc_fh fh; + struct kstat stat; +}; + +struct nfsd_readlinkres { + int len; +}; + +struct nfsd_readres { + struct svc_fh fh; + unsigned long count; + struct kstat stat; +}; + +struct nfsd_readdirres { + int count; + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; +}; + +struct nfsd_statfsres { + struct kstatfs stats; +}; + +/* + * Storage requirements for XDR arguments and results. + */ +union nfsd_xdrstore { + struct nfsd_sattrargs sattr; + struct nfsd_diropargs dirop; + struct nfsd_readargs read; + struct nfsd_writeargs write; + struct nfsd_createargs create; + struct nfsd_renameargs rename; + struct nfsd_linkargs link; + struct nfsd_symlinkargs symlink; + struct nfsd_readdirargs readdir; +}; + +#define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore) + + +int nfssvc_decode_void(struct svc_rqst *, __be32 *, void *); +int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); +int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *, + struct nfsd_sattrargs *); +int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *, + struct nfsd_diropargs *); +int nfssvc_decode_readargs(struct svc_rqst *, __be32 *, + struct nfsd_readargs *); +int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *, + struct nfsd_writeargs *); +int nfssvc_decode_createargs(struct svc_rqst *, __be32 *, + struct nfsd_createargs *); +int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *, + struct nfsd_renameargs *); +int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *, + struct nfsd_readlinkargs *); +int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *, + struct nfsd_linkargs *); +int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *, + struct nfsd_symlinkargs *); +int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *, + struct nfsd_readdirargs *); +int nfssvc_encode_void(struct svc_rqst *, __be32 *, void *); +int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *, struct nfsd_attrstat *); +int nfssvc_encode_diropres(struct svc_rqst *, __be32 *, struct nfsd_diropres *); +int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *, struct nfsd_readlinkres *); +int nfssvc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd_readres *); +int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *, struct nfsd_statfsres *); +int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *, struct nfsd_readdirres *); + +int nfssvc_encode_entry(void *, const char *name, + int namlen, loff_t offset, u64 ino, unsigned int); + +int nfssvc_release_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); + +/* Helper functions for NFSv2 ACL code */ +__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp); +__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp); + +#endif /* LINUX_NFSD_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h new file mode 100644 index 0000000..7df980e --- /dev/null +++ b/fs/nfsd/xdr3.h @@ -0,0 +1,344 @@ +/* + * XDR types for NFSv3 in nfsd. + * + * Copyright (C) 1996-1998, Olaf Kirch <okir@monad.swb.de> + */ + +#ifndef _LINUX_NFSD_XDR3_H +#define _LINUX_NFSD_XDR3_H + +#include "xdr.h" + +struct nfsd3_sattrargs { + struct svc_fh fh; + struct iattr attrs; + int check_guard; + time_t guardtime; +}; + +struct nfsd3_diropargs { + struct svc_fh fh; + char * name; + unsigned int len; +}; + +struct nfsd3_accessargs { + struct svc_fh fh; + unsigned int access; +}; + +struct nfsd3_readargs { + struct svc_fh fh; + __u64 offset; + __u32 count; + int vlen; +}; + +struct nfsd3_writeargs { + svc_fh fh; + __u64 offset; + __u32 count; + int stable; + __u32 len; + int vlen; +}; + +struct nfsd3_createargs { + struct svc_fh fh; + char * name; + unsigned int len; + int createmode; + struct iattr attrs; + __be32 * verf; +}; + +struct nfsd3_mknodargs { + struct svc_fh fh; + char * name; + unsigned int len; + __u32 ftype; + __u32 major, minor; + struct iattr attrs; +}; + +struct nfsd3_renameargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd3_readlinkargs { + struct svc_fh fh; + char * buffer; +}; + +struct nfsd3_linkargs { + struct svc_fh ffh; + struct svc_fh tfh; + char * tname; + unsigned int tlen; +}; + +struct nfsd3_symlinkargs { + struct svc_fh ffh; + char * fname; + unsigned int flen; + char * tname; + unsigned int tlen; + struct iattr attrs; +}; + +struct nfsd3_readdirargs { + struct svc_fh fh; + __u64 cookie; + __u32 dircount; + __u32 count; + __be32 * verf; + __be32 * buffer; +}; + +struct nfsd3_commitargs { + struct svc_fh fh; + __u64 offset; + __u32 count; +}; + +struct nfsd3_getaclargs { + struct svc_fh fh; + int mask; +}; + +struct posix_acl; +struct nfsd3_setaclargs { + struct svc_fh fh; + int mask; + struct posix_acl *acl_access; + struct posix_acl *acl_default; +}; + +struct nfsd3_attrstat { + __be32 status; + struct svc_fh fh; + struct kstat stat; +}; + +/* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */ +struct nfsd3_diropres { + __be32 status; + struct svc_fh dirfh; + struct svc_fh fh; +}; + +struct nfsd3_accessres { + __be32 status; + struct svc_fh fh; + __u32 access; +}; + +struct nfsd3_readlinkres { + __be32 status; + struct svc_fh fh; + __u32 len; +}; + +struct nfsd3_readres { + __be32 status; + struct svc_fh fh; + unsigned long count; + int eof; +}; + +struct nfsd3_writeres { + __be32 status; + struct svc_fh fh; + unsigned long count; + int committed; +}; + +struct nfsd3_renameres { + __be32 status; + struct svc_fh ffh; + struct svc_fh tfh; +}; + +struct nfsd3_linkres { + __be32 status; + struct svc_fh tfh; + struct svc_fh fh; +}; + +struct nfsd3_readdirres { + __be32 status; + struct svc_fh fh; + int count; + __be32 verf[2]; + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; + __be32 * offset1; + struct svc_rqst * rqstp; + +}; + +struct nfsd3_fsstatres { + __be32 status; + struct kstatfs stats; + __u32 invarsec; +}; + +struct nfsd3_fsinfores { + __be32 status; + __u32 f_rtmax; + __u32 f_rtpref; + __u32 f_rtmult; + __u32 f_wtmax; + __u32 f_wtpref; + __u32 f_wtmult; + __u32 f_dtpref; + __u64 f_maxfilesize; + __u32 f_properties; +}; + +struct nfsd3_pathconfres { + __be32 status; + __u32 p_link_max; + __u32 p_name_max; + __u32 p_no_trunc; + __u32 p_chown_restricted; + __u32 p_case_insensitive; + __u32 p_case_preserving; +}; + +struct nfsd3_commitres { + __be32 status; + struct svc_fh fh; +}; + +struct nfsd3_getaclres { + __be32 status; + struct svc_fh fh; + int mask; + struct posix_acl *acl_access; + struct posix_acl *acl_default; +}; + +/* dummy type for release */ +struct nfsd3_fhandle_pair { + __u32 dummy; + struct svc_fh fh1; + struct svc_fh fh2; +}; + +/* + * Storage requirements for XDR arguments and results. + */ +union nfsd3_xdrstore { + struct nfsd3_sattrargs sattrargs; + struct nfsd3_diropargs diropargs; + struct nfsd3_readargs readargs; + struct nfsd3_writeargs writeargs; + struct nfsd3_createargs createargs; + struct nfsd3_renameargs renameargs; + struct nfsd3_linkargs linkargs; + struct nfsd3_symlinkargs symlinkargs; + struct nfsd3_readdirargs readdirargs; + struct nfsd3_diropres diropres; + struct nfsd3_accessres accessres; + struct nfsd3_readlinkres readlinkres; + struct nfsd3_readres readres; + struct nfsd3_writeres writeres; + struct nfsd3_renameres renameres; + struct nfsd3_linkres linkres; + struct nfsd3_readdirres readdirres; + struct nfsd3_fsstatres fsstatres; + struct nfsd3_fsinfores fsinfores; + struct nfsd3_pathconfres pathconfres; + struct nfsd3_commitres commitres; + struct nfsd3_getaclres getaclres; +}; + +#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore) + +int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *); +int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *, + struct nfsd3_sattrargs *); +int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *, + struct nfsd3_diropargs *); +int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *, + struct nfsd3_accessargs *); +int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *, + struct nfsd3_readargs *); +int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *, + struct nfsd3_writeargs *); +int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *, + struct nfsd3_createargs *); +int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *, + struct nfsd3_createargs *); +int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *, + struct nfsd3_mknodargs *); +int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *, + struct nfsd3_renameargs *); +int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *, + struct nfsd3_readlinkargs *); +int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *, + struct nfsd3_linkargs *); +int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *, + struct nfsd3_symlinkargs *); +int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *, + struct nfsd3_readdirargs *); +int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *, + struct nfsd3_readdirargs *); +int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *, + struct nfsd3_commitargs *); +int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *, void *); +int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *, + struct nfsd3_diropres *); +int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *, + struct nfsd3_accessres *); +int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *, + struct nfsd3_readlinkres *); +int nfs3svc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd3_readres *); +int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *, struct nfsd3_writeres *); +int nfs3svc_encode_createres(struct svc_rqst *, __be32 *, + struct nfsd3_diropres *); +int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *, + struct nfsd3_renameres *); +int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *, + struct nfsd3_linkres *); +int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *, + struct nfsd3_readdirres *); +int nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *, + struct nfsd3_fsstatres *); +int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *, + struct nfsd3_fsinfores *); +int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *, + struct nfsd3_pathconfres *); +int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *, + struct nfsd3_commitres *); + +int nfs3svc_release_fhandle(struct svc_rqst *, __be32 *, + struct nfsd3_attrstat *); +int nfs3svc_release_fhandle2(struct svc_rqst *, __be32 *, + struct nfsd3_fhandle_pair *); +int nfs3svc_encode_entry(void *, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int); +int nfs3svc_encode_entry_plus(void *, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int); +/* Helper functions for NFSv3 ACL code */ +__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, + struct svc_fh *fhp); +__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp); + + +#endif /* _LINUX_NFSD_XDR3_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h new file mode 100644 index 0000000..efa3377 --- /dev/null +++ b/fs/nfsd/xdr4.h @@ -0,0 +1,562 @@ +/* + * Server-side types for NFSv4. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Kendrick Smith <kmsmith@umich.edu> + * Andy Adamson <andros@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _LINUX_NFSD_XDR4_H +#define _LINUX_NFSD_XDR4_H + +#include "state.h" +#include "nfsd.h" + +#define NFSD4_MAX_TAGLEN 128 +#define XDR_LEN(n) (((n) + 3) & ~3) + +struct nfsd4_compound_state { + struct svc_fh current_fh; + struct svc_fh save_fh; + struct nfs4_stateowner *replay_owner; + /* For sessions DRC */ + struct nfsd4_session *session; + struct nfsd4_slot *slot; + __be32 *datap; + size_t iovlen; + u32 minorversion; + u32 status; +}; + +static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs) +{ + return cs->slot != NULL; +} + +struct nfsd4_change_info { + u32 atomic; + bool change_supported; + u32 before_ctime_sec; + u32 before_ctime_nsec; + u64 before_change; + u32 after_ctime_sec; + u32 after_ctime_nsec; + u64 after_change; +}; + +struct nfsd4_access { + u32 ac_req_access; /* request */ + u32 ac_supported; /* response */ + u32 ac_resp_access; /* response */ +}; + +struct nfsd4_close { + u32 cl_seqid; /* request */ + stateid_t cl_stateid; /* request+response */ + struct nfs4_stateowner * cl_stateowner; /* response */ +}; + +struct nfsd4_commit { + u64 co_offset; /* request */ + u32 co_count; /* request */ + nfs4_verifier co_verf; /* response */ +}; + +struct nfsd4_create { + u32 cr_namelen; /* request */ + char * cr_name; /* request */ + u32 cr_type; /* request */ + union { /* request */ + struct { + u32 namelen; + char *name; + } link; /* NF4LNK */ + struct { + u32 specdata1; + u32 specdata2; + } dev; /* NF4BLK, NF4CHR */ + } u; + u32 cr_bmval[3]; /* request */ + struct iattr cr_iattr; /* request */ + struct nfsd4_change_info cr_cinfo; /* response */ + struct nfs4_acl *cr_acl; +}; +#define cr_linklen u.link.namelen +#define cr_linkname u.link.name +#define cr_specdata1 u.dev.specdata1 +#define cr_specdata2 u.dev.specdata2 + +struct nfsd4_delegreturn { + stateid_t dr_stateid; +}; + +struct nfsd4_getattr { + u32 ga_bmval[3]; /* request */ + struct svc_fh *ga_fhp; /* response */ +}; + +struct nfsd4_link { + u32 li_namelen; /* request */ + char * li_name; /* request */ + struct nfsd4_change_info li_cinfo; /* response */ +}; + +struct nfsd4_lock_denied { + clientid_t ld_clientid; + struct nfs4_stateowner *ld_sop; + u64 ld_start; + u64 ld_length; + u32 ld_type; +}; + +struct nfsd4_lock { + /* request */ + u32 lk_type; + u32 lk_reclaim; /* boolean */ + u64 lk_offset; + u64 lk_length; + u32 lk_is_new; + union { + struct { + u32 open_seqid; + stateid_t open_stateid; + u32 lock_seqid; + clientid_t clientid; + struct xdr_netobj owner; + } new; + struct { + stateid_t lock_stateid; + u32 lock_seqid; + } old; + } v; + + /* response */ + union { + struct { + stateid_t stateid; + } ok; + struct nfsd4_lock_denied denied; + } u; + /* The lk_replay_owner is the open owner in the open_to_lock_owner + * case and the lock owner otherwise: */ + struct nfs4_stateowner *lk_replay_owner; +}; +#define lk_new_open_seqid v.new.open_seqid +#define lk_new_open_stateid v.new.open_stateid +#define lk_new_lock_seqid v.new.lock_seqid +#define lk_new_clientid v.new.clientid +#define lk_new_owner v.new.owner +#define lk_old_lock_stateid v.old.lock_stateid +#define lk_old_lock_seqid v.old.lock_seqid + +#define lk_rflags u.ok.rflags +#define lk_resp_stateid u.ok.stateid +#define lk_denied u.denied + + +struct nfsd4_lockt { + u32 lt_type; + clientid_t lt_clientid; + struct xdr_netobj lt_owner; + u64 lt_offset; + u64 lt_length; + struct nfs4_stateowner * lt_stateowner; + struct nfsd4_lock_denied lt_denied; +}; + + +struct nfsd4_locku { + u32 lu_type; + u32 lu_seqid; + stateid_t lu_stateid; + u64 lu_offset; + u64 lu_length; + struct nfs4_stateowner *lu_stateowner; +}; + + +struct nfsd4_lookup { + u32 lo_len; /* request */ + char * lo_name; /* request */ +}; + +struct nfsd4_putfh { + u32 pf_fhlen; /* request */ + char *pf_fhval; /* request */ +}; + +struct nfsd4_open { + u32 op_claim_type; /* request */ + struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */ + u32 op_delegate_type; /* request - CLAIM_PREV only */ + stateid_t op_delegate_stateid; /* request - response */ + u32 op_create; /* request */ + u32 op_createmode; /* request */ + u32 op_bmval[3]; /* request */ + struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ + nfs4_verifier verf; /* EXCLUSIVE4 */ + clientid_t op_clientid; /* request */ + struct xdr_netobj op_owner; /* request */ + u32 op_seqid; /* request */ + u32 op_share_access; /* request */ + u32 op_share_deny; /* request */ + stateid_t op_stateid; /* response */ + u32 op_recall; /* recall */ + struct nfsd4_change_info op_cinfo; /* response */ + u32 op_rflags; /* response */ + int op_truncate; /* used during processing */ + struct nfs4_stateowner *op_stateowner; /* used during processing */ + struct nfs4_acl *op_acl; +}; +#define op_iattr iattr +#define op_verf verf + +struct nfsd4_open_confirm { + stateid_t oc_req_stateid /* request */; + u32 oc_seqid /* request */; + stateid_t oc_resp_stateid /* response */; + struct nfs4_stateowner * oc_stateowner; /* response */ +}; + +struct nfsd4_open_downgrade { + stateid_t od_stateid; + u32 od_seqid; + u32 od_share_access; + u32 od_share_deny; + struct nfs4_stateowner *od_stateowner; +}; + + +struct nfsd4_read { + stateid_t rd_stateid; /* request */ + u64 rd_offset; /* request */ + u32 rd_length; /* request */ + int rd_vlen; + struct file *rd_filp; + + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh * rd_fhp; /* response */ +}; + +struct nfsd4_readdir { + u64 rd_cookie; /* request */ + nfs4_verifier rd_verf; /* request */ + u32 rd_dircount; /* request */ + u32 rd_maxcount; /* request */ + u32 rd_bmval[3]; /* request */ + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh * rd_fhp; /* response */ + + struct readdir_cd common; + __be32 * buffer; + int buflen; + __be32 * offset; +}; + +struct nfsd4_release_lockowner { + clientid_t rl_clientid; + struct xdr_netobj rl_owner; +}; +struct nfsd4_readlink { + struct svc_rqst *rl_rqstp; /* request */ + struct svc_fh * rl_fhp; /* request */ +}; + +struct nfsd4_remove { + u32 rm_namelen; /* request */ + char * rm_name; /* request */ + struct nfsd4_change_info rm_cinfo; /* response */ +}; + +struct nfsd4_rename { + u32 rn_snamelen; /* request */ + char * rn_sname; /* request */ + u32 rn_tnamelen; /* request */ + char * rn_tname; /* request */ + struct nfsd4_change_info rn_sinfo; /* response */ + struct nfsd4_change_info rn_tinfo; /* response */ +}; + +struct nfsd4_secinfo { + u32 si_namelen; /* request */ + char *si_name; /* request */ + struct svc_export *si_exp; /* response */ +}; + +struct nfsd4_setattr { + stateid_t sa_stateid; /* request */ + u32 sa_bmval[3]; /* request */ + struct iattr sa_iattr; /* request */ + struct nfs4_acl *sa_acl; +}; + +struct nfsd4_setclientid { + nfs4_verifier se_verf; /* request */ + u32 se_namelen; /* request */ + char * se_name; /* request */ + u32 se_callback_prog; /* request */ + u32 se_callback_netid_len; /* request */ + char * se_callback_netid_val; /* request */ + u32 se_callback_addr_len; /* request */ + char * se_callback_addr_val; /* request */ + u32 se_callback_ident; /* request */ + clientid_t se_clientid; /* response */ + nfs4_verifier se_confirm; /* response */ +}; + +struct nfsd4_setclientid_confirm { + clientid_t sc_clientid; + nfs4_verifier sc_confirm; +}; + +/* also used for NVERIFY */ +struct nfsd4_verify { + u32 ve_bmval[3]; /* request */ + u32 ve_attrlen; /* request */ + char * ve_attrval; /* request */ +}; + +struct nfsd4_write { + stateid_t wr_stateid; /* request */ + u64 wr_offset; /* request */ + u32 wr_stable_how; /* request */ + u32 wr_buflen; /* request */ + int wr_vlen; + + u32 wr_bytes_written; /* response */ + u32 wr_how_written; /* response */ + nfs4_verifier wr_verifier; /* response */ +}; + +struct nfsd4_exchange_id { + nfs4_verifier verifier; + struct xdr_netobj clname; + u32 flags; + clientid_t clientid; + u32 seqid; + int spa_how; +}; + +struct nfsd4_sequence { + struct nfs4_sessionid sessionid; /* request/response */ + u32 seqid; /* request/response */ + u32 slotid; /* request/response */ + u32 maxslots; /* request/response */ + u32 cachethis; /* request */ +#if 0 + u32 target_maxslots; /* response */ + u32 status_flags; /* response */ +#endif /* not yet */ +}; + +struct nfsd4_destroy_session { + struct nfs4_sessionid sessionid; +}; + +struct nfsd4_op { + int opnum; + __be32 status; + union { + struct nfsd4_access access; + struct nfsd4_close close; + struct nfsd4_commit commit; + struct nfsd4_create create; + struct nfsd4_delegreturn delegreturn; + struct nfsd4_getattr getattr; + struct svc_fh * getfh; + struct nfsd4_link link; + struct nfsd4_lock lock; + struct nfsd4_lockt lockt; + struct nfsd4_locku locku; + struct nfsd4_lookup lookup; + struct nfsd4_verify nverify; + struct nfsd4_open open; + struct nfsd4_open_confirm open_confirm; + struct nfsd4_open_downgrade open_downgrade; + struct nfsd4_putfh putfh; + struct nfsd4_read read; + struct nfsd4_readdir readdir; + struct nfsd4_readlink readlink; + struct nfsd4_remove remove; + struct nfsd4_rename rename; + clientid_t renew; + struct nfsd4_secinfo secinfo; + struct nfsd4_setattr setattr; + struct nfsd4_setclientid setclientid; + struct nfsd4_setclientid_confirm setclientid_confirm; + struct nfsd4_verify verify; + struct nfsd4_write write; + struct nfsd4_release_lockowner release_lockowner; + + /* NFSv4.1 */ + struct nfsd4_exchange_id exchange_id; + struct nfsd4_create_session create_session; + struct nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; + } u; + struct nfs4_replay * replay; +}; + +struct nfsd4_compoundargs { + /* scratch variables for XDR decode */ + __be32 * p; + __be32 * end; + struct page ** pagelist; + int pagelen; + __be32 tmp[8]; + __be32 * tmpp; + struct tmpbuf { + struct tmpbuf *next; + void (*release)(const void *); + void *buf; + } *to_free; + + struct svc_rqst *rqstp; + + u32 taglen; + char * tag; + u32 minorversion; + u32 opcnt; + struct nfsd4_op *ops; + struct nfsd4_op iops[8]; +}; + +struct nfsd4_compoundres { + /* scratch variables for XDR encode */ + __be32 * p; + __be32 * end; + struct xdr_buf * xbuf; + struct svc_rqst * rqstp; + + u32 taglen; + char * tag; + u32 opcnt; + __be32 * tagp; /* tag, opcount encode location */ + struct nfsd4_compound_state cstate; +}; + +static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) +{ + struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; + return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE; +} + +static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) +{ + return !resp->cstate.slot->sl_cachethis || nfsd4_is_solo_sequence(resp); +} + +#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) + +static inline void +set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) +{ + BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved); + cinfo->atomic = 1; + cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); + if (cinfo->change_supported) { + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + } else { + cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; + cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; + } +} + +int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); +int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, + struct nfsd4_compoundargs *); +int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, + struct nfsd4_compoundres *); +void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); +void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); +__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, __be32 *buffer, int *countp, + u32 *bmval, struct svc_rqst *, int ignore_crossmnt); +extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_setclientid *setclid); +extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_setclientid_confirm *setclientid_confirm); +extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); +extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq); +extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, +struct nfsd4_exchange_id *); + extern __be32 nfsd4_create_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_create_session *); +extern __be32 nfsd4_sequence(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_sequence *); +extern __be32 nfsd4_destroy_session(struct svc_rqst *, + struct nfsd4_compound_state *, + struct nfsd4_destroy_session *); +extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, + struct nfsd4_open *open); +extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, + struct svc_fh *current_fh, struct nfsd4_open *open); +extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); +extern __be32 nfsd4_close(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_close *close); +extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_open_downgrade *od); +extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *, + struct nfsd4_lock *lock); +extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_lockt *lockt); +extern __be32 nfsd4_locku(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_locku *locku); +extern __be32 +nfsd4_release_lockowner(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, + struct nfsd4_release_lockowner *rlockowner); +extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *); +extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, struct nfsd4_delegreturn *dr); +extern __be32 nfsd4_renew(struct svc_rqst *rqstp, + struct nfsd4_compound_state *, clientid_t *clid); +#endif + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 1225af7..251da07 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -2,7 +2,6 @@ config NILFS2_FS tristate "NILFS2 file system support (EXPERIMENTAL)" depends on EXPERIMENTAL select CRC32 - select FS_JOURNAL_INFO help NILFS2 is a log-structured file system (LFS) supporting continuous snapshotting. In addition to versioning capability of the entire diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index f4a14ea..effdbdb 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -417,8 +417,8 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT - bmap->b_inode->i_blkbits); - for (pbh = page_buffers(bh->b_page); pbh != bh; - pbh = pbh->b_this_page, key++); + for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page) + key++; return key; } diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index d5ad54e..1873781 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -328,19 +328,24 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, tnicps += nicps; nilfs_mdt_mark_buffer_dirty(cp_bh); nilfs_mdt_mark_dirty(cpfile); - if (!nilfs_cpfile_is_in_first(cpfile, cno) && - (count = nilfs_cpfile_block_sub_valid_checkpoints( - cpfile, cp_bh, kaddr, nicps)) == 0) { - /* make hole */ - kunmap_atomic(kaddr, KM_USER0); - brelse(cp_bh); - ret = nilfs_cpfile_delete_checkpoint_block( - cpfile, cno); - if (ret == 0) - continue; - printk(KERN_ERR "%s: cannot delete block\n", - __func__); - break; + if (!nilfs_cpfile_is_in_first(cpfile, cno)) { + count = + nilfs_cpfile_block_sub_valid_checkpoints( + cpfile, cp_bh, kaddr, nicps); + if (count == 0) { + /* make hole */ + kunmap_atomic(kaddr, KM_USER0); + brelse(cp_bh); + ret = + nilfs_cpfile_delete_checkpoint_block( + cpfile, cno); + if (ret == 0) + continue; + printk(KERN_ERR + "%s: cannot delete block\n", + __func__); + break; + } } } diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index d369ac7..236753d 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -51,11 +51,11 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *bmap, struct nilfs_direct *direct; __u64 ptr; - direct = (struct nilfs_direct *)bmap; - if ((key > NILFS_DIRECT_KEY_MAX) || - (level != 1) || /* XXX: use macro for level 1 */ - ((ptr = nilfs_direct_get_ptr(direct, key)) == - NILFS_BMAP_INVALID_PTR)) + direct = (struct nilfs_direct *)bmap; /* XXX: use macro for level 1 */ + if (key > NILFS_DIRECT_KEY_MAX || level != 1) + return -ENOENT; + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr == NILFS_BMAP_INVALID_PTR) return -ENOENT; if (ptrp != NULL) @@ -73,9 +73,10 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap, sector_t blocknr; int ret, cnt; - if (key > NILFS_DIRECT_KEY_MAX || - (ptr = nilfs_direct_get_ptr(direct, key)) == - NILFS_BMAP_INVALID_PTR) + if (key > NILFS_DIRECT_KEY_MAX) + return -ENOENT; + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr == NILFS_BMAP_INVALID_PTR) return -ENOENT; if (NILFS_BMAP_USE_VBN(bmap)) { diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index f6af760..d6b2b83 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -480,7 +480,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { struct nilfs_argv argv[5]; - const static size_t argsz[5] = { + static const size_t argsz[5] = { sizeof(struct nilfs_vdesc), sizeof(struct nilfs_period), sizeof(__u64), diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 17584c5..105b508 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2829,7 +2829,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) || sci->sc_seq_request != sci->sc_seq_done); spin_unlock(&sci->sc_state_lock); - if (flag || nilfs_segctor_confirm(sci)) + if (flag || !nilfs_segctor_confirm(sci)) nilfs_segctor_write_out(sci); WARN_ON(!list_empty(&sci->sc_copied_buffers)); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 5403b3e..8173fae 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1118,8 +1118,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, /* Abandoning the newly allocated superblock */ mutex_unlock(&nilfs->ns_mount_mutex); put_nilfs(nilfs); - up_write(&s->s_umount); - deactivate_super(s); + deactivate_locked_super(s); /* * deactivate_super() invokes close_bdev_exclusive(). * We must finish all post-cleaning before this call; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index c9ee67b..1afb0a1 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -121,7 +121,7 @@ static int idr_callback(int id, void *p, void *data) if (warned) return 0; - warned = false; + warned = true; entry = p; ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 5ef5f36..a94e8bd 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -552,7 +552,7 @@ retry: spin_lock(&group->inotify_data.idr_lock); ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, - group->inotify_data.last_wd, + group->inotify_data.last_wd+1, &tmp_ientry->wd); spin_unlock(&group->inotify_data.idr_lock); if (ret) { @@ -632,7 +632,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.last_wd = 1; + group->inotify_data.last_wd = 0; group->inotify_data.user = user; group->inotify_data.fa = NULL; @@ -646,6 +646,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) struct fsnotify_group *group; struct user_struct *user; struct file *filp; + struct path path; int fd, ret; /* Check the IN_* constants for consistency. */ @@ -659,12 +660,6 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) if (fd < 0) return fd; - filp = get_empty_filp(); - if (!filp) { - ret = -ENFILE; - goto out_put_fd; - } - user = get_current_user(); if (unlikely(atomic_read(&user->inotify_devs) >= inotify_max_user_instances)) { @@ -679,24 +674,28 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) goto out_free_uid; } - filp->f_op = &inotify_fops; - filp->f_path.mnt = mntget(inotify_mnt); - filp->f_path.dentry = dget(inotify_mnt->mnt_root); - filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; - filp->f_mode = FMODE_READ; + atomic_inc(&user->inotify_devs); + + path.mnt = inotify_mnt; + path.dentry = inotify_mnt->mnt_root; + path_get(&path); + filp = alloc_file(&path, FMODE_READ, &inotify_fops); + if (!filp) + goto Enfile; + filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); filp->private_data = group; - atomic_inc(&user->inotify_devs); - fd_install(fd, filp); return fd; +Enfile: + ret = -ENFILE; + path_put(&path); + atomic_dec(&user->inotify_devs); out_free_uid: free_uid(user); - put_filp(filp); -out_put_fd: put_unused_fd(fd); return ret; } diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 9938034..dc2505a 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -530,7 +530,7 @@ err_corrupt_attr: * the ntfs inode. * * Q: What locks are held when the function is called? - * A: i_state has I_LOCK set, hence the inode is locked, also + * A: i_state has I_NEW set, hence the inode is locked, also * i_count is set to 1, so it is not going to go away * i_flags is set to 0 and we have no business touching it. Only an ioctl() * is allowed to write to them. We should of course be honouring them but @@ -1207,7 +1207,7 @@ err_out: * necessary fields in @vi as well as initializing the ntfs inode. * * Q: What locks are held when the function is called? - * A: i_state has I_LOCK set, hence the inode is locked, also + * A: i_state has I_NEW set, hence the inode is locked, also * i_count is set to 1, so it is not going to go away * * Return 0 on success and -errno on error. In the error case, the inode will @@ -1474,7 +1474,7 @@ err_out: * normal directory inodes. * * Q: What locks are held when the function is called? - * A: i_state has I_LOCK set, hence the inode is locked, also + * A: i_state has I_NEW set, hence the inode is locked, also * i_count is set to 1, so it is not going to go away * * Return 0 on success and -errno on error. In the error case, the inode will diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 701b7a3..0d84066 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -6,6 +6,7 @@ config OCFS2_FS select CRC32 select QUOTA select QUOTA_TREE + select FS_POSIX_ACL help OCFS2 is a general purpose extent based shared disk cluster file system with many similarities to ext3. It supports 64 bit inode @@ -74,12 +75,3 @@ config OCFS2_DEBUG_FS This option will enable expensive consistency checks. Enable this option for debugging only as it is likely to decrease performance of the filesystem. - -config OCFS2_FS_POSIX_ACL - bool "OCFS2 POSIX Access Control Lists" - depends on OCFS2_FS - select FS_POSIX_ACL - default n - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 31f25ce..600d2d2 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -39,11 +39,8 @@ ocfs2-objs := \ ver.o \ quota_local.o \ quota_global.o \ - xattr.o - -ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y) -ocfs2-objs += acl.o -endif + xattr.o \ + acl.o ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index fbeaec7..0501974 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -98,15 +98,11 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, int type, struct buffer_head *di_bh) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int name_index; char *value = NULL; struct posix_acl *acl; int retval; - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return NULL; - switch (type) { case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -331,13 +327,14 @@ cleanup: return ret; } -static size_t ocfs2_xattr_list_acl_access(struct inode *inode, +static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, const char *name, - size_t name_len) + size_t name_len, + int type) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) @@ -348,13 +345,14 @@ static size_t ocfs2_xattr_list_acl_access(struct inode *inode, return size; } -static size_t ocfs2_xattr_list_acl_default(struct inode *inode, +static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, const char *name, - size_t name_len) + size_t name_len, + int type) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) @@ -365,19 +363,19 @@ static size_t ocfs2_xattr_list_acl_default(struct inode *inode, return size; } -static int ocfs2_xattr_get_acl(struct inode *inode, - int type, - void *buffer, - size_t size) +static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); struct posix_acl *acl; int ret; + if (strcmp(name, "") != 0) + return -EINVAL; if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return -EOPNOTSUPP; - acl = ocfs2_get_acl(inode, type); + acl = ocfs2_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -388,35 +386,16 @@ static int ocfs2_xattr_get_acl(struct inode *inode, return ret; } -static int ocfs2_xattr_get_acl_access(struct inode *inode, - const char *name, - void *buffer, - size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int ocfs2_xattr_get_acl_default(struct inode *inode, - const char *name, - void *buffer, - size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int ocfs2_xattr_set_acl(struct inode *inode, - int type, - const void *value, - size_t size) +static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct posix_acl *acl; int ret = 0; + if (strcmp(name, "") != 0) + return -EINVAL; if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return -EOPNOTSUPP; @@ -442,38 +421,18 @@ cleanup: return ret; } -static int ocfs2_xattr_set_acl_access(struct inode *inode, - const char *name, - const void *value, - size_t size, - int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int ocfs2_xattr_set_acl_default(struct inode *inode, - const char *name, - const void *value, - size_t size, - int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - struct xattr_handler ocfs2_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, .list = ocfs2_xattr_list_acl_access, - .get = ocfs2_xattr_get_acl_access, - .set = ocfs2_xattr_set_acl_access, + .get = ocfs2_xattr_get_acl, + .set = ocfs2_xattr_set_acl, }; struct xattr_handler ocfs2_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, .list = ocfs2_xattr_list_acl_default, - .get = ocfs2_xattr_get_acl_default, - .set = ocfs2_xattr_set_acl_default, + .get = ocfs2_xattr_get_acl, + .set = ocfs2_xattr_set_acl, }; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 8f6389e..5c5d31f 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -26,8 +26,6 @@ struct ocfs2_acl_entry { __le32 e_id; }; -#ifdef CONFIG_OCFS2_FS_POSIX_ACL - extern int ocfs2_check_acl(struct inode *, int); extern int ocfs2_acl_chmod(struct inode *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, @@ -35,24 +33,4 @@ extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct ocfs2_alloc_context *, struct ocfs2_alloc_context *); -#else /* CONFIG_OCFS2_FS_POSIX_ACL*/ - -#define ocfs2_check_acl NULL -static inline int ocfs2_acl_chmod(struct inode *inode) -{ - return 0; -} -static inline int ocfs2_init_acl(handle_t *handle, - struct inode *inode, - struct inode *dir, - struct buffer_head *di_bh, - struct buffer_head *dir_bh, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_alloc_context *data_ac) -{ - return 0; -} - -#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/ - #endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 7c7198a..d17bdc7 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1765,9 +1765,9 @@ set_and_inc: * * The array index of the subtree root is passed back. */ -static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, - struct ocfs2_path *left, - struct ocfs2_path *right) +int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, + struct ocfs2_path *left, + struct ocfs2_path *right) { int i = 0; @@ -2872,8 +2872,8 @@ out: * This looks similar, but is subtly different to * ocfs2_find_cpos_for_left_leaf(). */ -static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, - struct ocfs2_path *path, u32 *cpos) +int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) { int i, j, ret = 0; u64 blkno; @@ -7190,8 +7190,8 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, * wait on them - the truncate_inode_pages() call later will * do that for us. */ - ret = do_sync_mapping_range(inode->i_mapping, range_start, - range_end - 1, SYNC_FILE_RANGE_WRITE); + ret = filemap_fdatawrite_range(inode->i_mapping, range_start, + range_end - 1); if (ret) mlog_errno(ret); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 9c122d57..1db4359 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -317,4 +317,9 @@ int ocfs2_path_bh_journal_access(handle_t *handle, int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, handle_t *handle, struct ocfs2_path *path); +int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos); +int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, + struct ocfs2_path *left, + struct ocfs2_path *right); #endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3dae4a1..7e9df11 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -599,7 +599,7 @@ bail: return ret; } -/* +/* * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're * particularly interested in the aio/dio case. Like the core uses * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from @@ -670,7 +670,7 @@ static ssize_t ocfs2_direct_IO(int rw, ret = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, - nr_segs, + nr_segs, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io); diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index d43d34a..21c808f 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -368,7 +368,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, } ocfs2_metadata_cache_io_unlock(ci); - mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", + mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", (unsigned long long)block, nr, ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes", flags); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index c452d11..5c98900 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -78,7 +78,7 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; -/* Only sets a new threshold if there are no active regions. +/* Only sets a new threshold if there are no active regions. * * No locking or otherwise interesting code is required for reading * o2hb_dead_threshold as it can't change once regions are active and @@ -170,13 +170,14 @@ static void o2hb_write_timeout(struct work_struct *work) mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " "milliseconds\n", reg->hr_dev_name, - jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); + jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); o2quo_disk_timeout(); } static void o2hb_arm_write_timeout(struct o2hb_region *reg) { - mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); + mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", + O2HB_MAX_WRITE_TIMEOUT_MS); cancel_delayed_work(®->hr_write_timeout_work); reg->hr_last_timeout_start = jiffies; @@ -623,7 +624,7 @@ static int o2hb_check_slot(struct o2hb_region *reg, "seq %llu last %llu changed %u equal %u\n", slot->ds_node_num, (long long)slot->ds_last_generation, le32_to_cpu(hb_block->hb_cksum), - (unsigned long long)le64_to_cpu(hb_block->hb_seq), + (unsigned long long)le64_to_cpu(hb_block->hb_seq), (unsigned long long)slot->ds_last_time, slot->ds_changed_samples, slot->ds_equal_samples); @@ -874,7 +875,8 @@ static int o2hb_thread(void *data) do_gettimeofday(&after_hb); elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); - mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n", + mlog(ML_HEARTBEAT, + "start = %lu.%lu, end = %lu.%lu, msec = %u\n", before_hb.tv_sec, (unsigned long) before_hb.tv_usec, after_hb.tv_sec, (unsigned long) after_hb.tv_usec, elapsed_msec); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 7ee6188..c81142e 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -35,6 +35,10 @@ * cluster references throughout where nodes are looked up */ struct o2nm_cluster *o2nm_single_cluster = NULL; +char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { + "reset", /* O2NM_FENCE_RESET */ + "panic", /* O2NM_FENCE_PANIC */ +}; struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { @@ -579,6 +583,43 @@ static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write( return o2nm_cluster_attr_write(page, count, &cluster->cl_reconnect_delay_ms); } + +static ssize_t o2nm_cluster_attr_fence_method_read( + struct o2nm_cluster *cluster, char *page) +{ + ssize_t ret = 0; + + if (cluster) + ret = sprintf(page, "%s\n", + o2nm_fence_method_desc[cluster->cl_fence_method]); + return ret; +} + +static ssize_t o2nm_cluster_attr_fence_method_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + unsigned int i; + + if (page[count - 1] != '\n') + goto bail; + + for (i = 0; i < O2NM_FENCE_METHODS; ++i) { + if (count != strlen(o2nm_fence_method_desc[i]) + 1) + continue; + if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1)) + continue; + if (cluster->cl_fence_method != i) { + printk(KERN_INFO "ocfs2: Changing fence method to %s\n", + o2nm_fence_method_desc[i]); + cluster->cl_fence_method = i; + } + return count; + } + +bail: + return -EINVAL; +} + static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { .attr = { .ca_owner = THIS_MODULE, .ca_name = "idle_timeout_ms", @@ -603,10 +644,19 @@ static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = { .store = o2nm_cluster_attr_reconnect_delay_ms_write, }; +static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "fence_method", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_fence_method_read, + .store = o2nm_cluster_attr_fence_method_write, +}; + static struct configfs_attribute *o2nm_cluster_attrs[] = { &o2nm_cluster_attr_idle_timeout_ms.attr, &o2nm_cluster_attr_keepalive_delay_ms.attr, &o2nm_cluster_attr_reconnect_delay_ms.attr, + &o2nm_cluster_attr_fence_method.attr, NULL, }; static ssize_t o2nm_cluster_show(struct config_item *item, @@ -778,6 +828,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; + cluster->cl_fence_method = O2NM_FENCE_RESET; ret = &cluster->cl_group; o2nm_single_cluster = cluster; diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index c992ea0..09ea2d3 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -33,6 +33,12 @@ #include <linux/configfs.h> #include <linux/rbtree.h> +enum o2nm_fence_method { + O2NM_FENCE_RESET = 0, + O2NM_FENCE_PANIC, + O2NM_FENCE_METHODS, /* Number of fence methods */ +}; + struct o2nm_node { spinlock_t nd_lock; struct config_item nd_item; @@ -58,6 +64,7 @@ struct o2nm_cluster { unsigned int cl_idle_timeout_ms; unsigned int cl_keepalive_delay_ms; unsigned int cl_reconnect_delay_ms; + enum o2nm_fence_method cl_fence_method; /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */ unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index bbacf7d..6390240 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -74,8 +74,20 @@ static void o2quo_fence_self(void) * threads can still schedule, etc, etc */ o2hb_stop_all_regions(); - printk("ocfs2 is very sorry to be fencing this system by restarting\n"); - emergency_restart(); + switch (o2nm_single_cluster->cl_fence_method) { + case O2NM_FENCE_PANIC: + panic("*** ocfs2 is very sorry to be fencing this system by " + "panicing ***\n"); + break; + default: + WARN_ON(o2nm_single_cluster->cl_fence_method >= + O2NM_FENCE_METHODS); + case O2NM_FENCE_RESET: + printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this " + "system by restarting ***\n"); + emergency_restart(); + break; + }; } /* Indicate that a timeout occured on a hearbeat region write. The diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 334f231..d8d0c65 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -485,7 +485,7 @@ static void o2net_set_nn_state(struct o2net_node *nn, } if (was_valid && !valid) { - printk(KERN_INFO "o2net: no longer connected to " + printk(KERN_NOTICE "o2net: no longer connected to " SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); o2net_complete_nodes_nsw(nn); } @@ -493,7 +493,7 @@ static void o2net_set_nn_state(struct o2net_node *nn, if (!was_valid && valid) { o2quo_conn_up(o2net_num_from_nn(nn)); cancel_delayed_work(&nn->nn_connect_expired); - printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", + printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", o2nm_this_node() > sc->sc_node->nd_num ? "connected to" : "accepted connection from", SC_NODEF_ARGS(sc)); @@ -930,7 +930,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc, cond_resched(); continue; } - mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT + mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); o2net_ensure_shutdown(nn, sc, 0); break; @@ -1476,14 +1476,14 @@ static void o2net_idle_timer(unsigned long data) do_gettimeofday(&now); - printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " + printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), o2net_idle_timeout() / 1000, o2net_idle_timeout() % 1000); mlog(ML_NOTICE, "here are some times that might help debug the " "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", - sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, + sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, now.tv_sec, (long) now.tv_usec, sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, sc->sc_tv_advance_start.tv_sec, diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 8d58cfe..96fa7eb 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -32,10 +32,10 @@ * on their number */ #define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS) -/* +/* * This version number represents quite a lot, unfortunately. It not * only represents the raw network message protocol on the wire but also - * locking semantics of the file system using the protocol. It should + * locking semantics of the file system using the protocol. It should * be somewhere else, I'm sure, but right now it isn't. * * With version 11, we separate out the filesystem locking portion. The diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h index b5786a7..3cfa114 100644 --- a/fs/ocfs2/dlm/dlmapi.h +++ b/fs/ocfs2/dlm/dlmapi.h @@ -95,7 +95,7 @@ const char *dlm_errname(enum dlm_status err); mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \ } while (0) -#define DLM_LKSB_UNUSED1 0x01 +#define DLM_LKSB_UNUSED1 0x01 #define DLM_LKSB_PUT_LVB 0x02 #define DLM_LKSB_GET_LVB 0x04 #define DLM_LKSB_UNUSED2 0x08 diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 01cf8cc..dccc439 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -123,7 +123,7 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) dlm_lock_put(lock); /* free up the reserved bast that we are cancelling. * guaranteed that this will not be the last reserved - * ast because *both* an ast and a bast were reserved + * ast because *both* an ast and a bast were reserved * to get to this point. the res->spinlock will not be * taken here */ dlm_lockres_release_ast(dlm, res); diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index ca96bce..f283bce 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -396,7 +396,7 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, /* instead of logging the same network error over * and over, sleep here and wait for the heartbeat * to notice the node is dead. times out after 5s. */ - dlm_wait_for_node_death(dlm, res->owner, + dlm_wait_for_node_death(dlm, res->owner, DLM_NODE_DEATH_WAIT_MAX); ret = DLM_RECOVERING; mlog(0, "node %u died so returning DLM_RECOVERING " diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 42b0bad..0cd24cf 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -102,7 +102,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) assert_spin_locked(&res->spinlock); stringify_lockname(res->lockname.name, res->lockname.len, - buf, sizeof(buf) - 1); + buf, sizeof(buf)); printk("lockres: %s, owner=%u, state=%u\n", buf, res->owner, res->state); printk(" last used: %lu, refcnt: %u, on purge list: %s\n", diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 0334000..988c905 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -816,7 +816,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, } /* Once the dlm ctxt is marked as leaving then we don't want - * to be put in someone's domain map. + * to be put in someone's domain map. * Also, explicitly disallow joining at certain troublesome * times (ie. during recovery). */ if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 437698e..73333777 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -269,7 +269,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, } dlm_revert_pending_lock(res, lock); dlm_lock_put(lock); - } else if (dlm_is_recovery_lock(res->lockname.name, + } else if (dlm_is_recovery_lock(res->lockname.name, res->lockname.len)) { /* special case for the $RECOVERY lock. * there will never be an AST delivered to put diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 03ccf9a..a659606 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -366,7 +366,7 @@ void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) struct dlm_master_list_entry *mle; assert_spin_locked(&dlm->spinlock); - + list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) { if (node_up) dlm_mle_node_up(dlm, mle, NULL, idx); @@ -833,7 +833,7 @@ lookup: __dlm_insert_mle(dlm, mle); /* still holding the dlm spinlock, check the recovery map - * to see if there are any nodes that still need to be + * to see if there are any nodes that still need to be * considered. these will not appear in the mle nodemap * but they might own this lockres. wait on them. */ bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); @@ -883,7 +883,7 @@ redo_request: msleep(500); } continue; - } + } dlm_kick_recovery_thread(dlm); msleep(1000); @@ -939,8 +939,8 @@ wait: res->lockname.name, blocked); if (++tries > 20) { mlog(ML_ERROR, "%s:%.*s: spinning on " - "dlm_wait_for_lock_mastery, blocked=%d\n", - dlm->name, res->lockname.len, + "dlm_wait_for_lock_mastery, blocked=%d\n", + dlm->name, res->lockname.len, res->lockname.name, blocked); dlm_print_one_lock_resource(res); dlm_print_one_mle(mle); @@ -1029,7 +1029,7 @@ recheck: ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); b = (mle->type == DLM_MLE_BLOCK); if ((*blocked && !b) || (!*blocked && b)) { - mlog(0, "%s:%.*s: status change: old=%d new=%d\n", + mlog(0, "%s:%.*s: status change: old=%d new=%d\n", dlm->name, res->lockname.len, res->lockname.name, *blocked, b); *blocked = b; @@ -1602,7 +1602,7 @@ send_response: } mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", dlm->node_num, res->lockname.len, res->lockname.name); - ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, + ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, DLM_ASSERT_MASTER_MLE_CLEANUP); if (ret < 0) { mlog(ML_ERROR, "failed to dispatch assert master work\n"); @@ -1701,7 +1701,7 @@ again: if (r & DLM_ASSERT_RESPONSE_REASSERT) { mlog(0, "%.*s: node %u create mles on other " - "nodes and requests a re-assert\n", + "nodes and requests a re-assert\n", namelen, lockname, to); reassert = 1; } @@ -1812,7 +1812,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); goto done; - } + } } } spin_unlock(&dlm->master_lock); @@ -1883,7 +1883,7 @@ ok: int extra_ref = 0; int nn = -1; int rr, err = 0; - + spin_lock(&mle->spinlock); if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) extra_ref = 1; @@ -1891,7 +1891,7 @@ ok: /* MASTER mle: if any bits set in the response map * then the calling node needs to re-assert to clear * up nodes that this node contacted */ - while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, + while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, nn+1)) < O2NM_MAX_NODES) { if (nn != dlm->node_num && nn != assert->node_idx) master_request = 1; @@ -2002,7 +2002,7 @@ kill: __dlm_print_one_lock_resource(res); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); - *ret_data = (void *)res; + *ret_data = (void *)res; dlm_put(dlm); return -EINVAL; } @@ -2040,10 +2040,10 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, item->u.am.request_from = request_from; item->u.am.flags = flags; - if (ignore_higher) - mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, + if (ignore_higher) + mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, res->lockname.name); - + spin_lock(&dlm->work_lock); list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); @@ -2133,7 +2133,7 @@ put: * think that $RECOVERY is currently mastered by a dead node. If so, * we wait a short time to allow that node to get notified by its own * heartbeat stack, then check again. All $RECOVERY lock resources - * mastered by dead nodes are purged when the hearbeat callback is + * mastered by dead nodes are purged when the hearbeat callback is * fired, so we can know for sure that it is safe to continue once * the node returns a live node or no node. */ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, @@ -2174,7 +2174,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, ret = -EAGAIN; } spin_unlock(&dlm->spinlock); - mlog(0, "%s: reco lock master is %u\n", dlm->name, + mlog(0, "%s: reco lock master is %u\n", dlm->name, master); break; } @@ -2602,7 +2602,7 @@ fail: mlog(0, "%s:%.*s: timed out during migration\n", dlm->name, res->lockname.len, res->lockname.name); - /* avoid hang during shutdown when migrating lockres + /* avoid hang during shutdown when migrating lockres * to a node which also goes down */ if (dlm_is_node_dead(dlm, target)) { mlog(0, "%s:%.*s: expected migration " @@ -2738,7 +2738,7 @@ static int dlm_migration_can_proceed(struct dlm_ctxt *dlm, can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); spin_unlock(&res->spinlock); - /* target has died, so make the caller break out of the + /* target has died, so make the caller break out of the * wait_event, but caller must recheck the domain_map */ spin_lock(&dlm->spinlock); if (!test_bit(mig_target, dlm->domain_map)) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index d9fa3d2..344bcf9 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1050,7 +1050,7 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, if (lock->ml.node == dead_node) { mlog(0, "AHA! there was " "a $RECOVERY lock for dead " - "node %u (%s)!\n", + "node %u (%s)!\n", dead_node, dlm->name); list_del_init(&lock->list); dlm_lock_put(lock); @@ -1164,6 +1164,39 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, mres->master = master; } +static void dlm_prepare_lvb_for_migration(struct dlm_lock *lock, + struct dlm_migratable_lockres *mres, + int queue) +{ + if (!lock->lksb) + return; + + /* Ignore lvb in all locks in the blocked list */ + if (queue == DLM_BLOCKED_LIST) + return; + + /* Only consider lvbs in locks with granted EX or PR lock levels */ + if (lock->ml.type != LKM_EXMODE && lock->ml.type != LKM_PRMODE) + return; + + if (dlm_lvb_is_empty(mres->lvb)) { + memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN); + return; + } + + /* Ensure the lvb copied for migration matches in other valid locks */ + if (!memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN)) + return; + + mlog(ML_ERROR, "Mismatched lvb in lock cookie=%u:%llu, name=%.*s, " + "node=%u\n", + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->lockres->lockname.len, lock->lockres->lockname.name, + lock->ml.node); + dlm_print_one_lock_resource(lock->lockres); + BUG(); +} /* returns 1 if this lock fills the network structure, * 0 otherwise */ @@ -1181,20 +1214,7 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock, ml->list = queue; if (lock->lksb) { ml->flags = lock->lksb->flags; - /* send our current lvb */ - if (ml->type == LKM_EXMODE || - ml->type == LKM_PRMODE) { - /* if it is already set, this had better be a PR - * and it has to match */ - if (!dlm_lvb_is_empty(mres->lvb) && - (ml->type == LKM_EXMODE || - memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { - mlog(ML_ERROR, "mismatched lvbs!\n"); - dlm_print_one_lock_resource(lock->lockres); - BUG(); - } - memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN); - } + dlm_prepare_lvb_for_migration(lock, mres, queue); } ml->node = lock->ml.node; mres->num_locks++; @@ -1730,6 +1750,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_lock *lock = NULL; u8 from = O2NM_MAX_NODES; unsigned int added = 0; + __be64 c; mlog(0, "running %d locks for this lockres\n", mres->num_locks); for (i=0; i<mres->num_locks; i++) { @@ -1777,19 +1798,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* lock is always created locally first, and * destroyed locally last. it must be on the list */ if (!lock) { - __be64 c = ml->cookie; - mlog(ML_ERROR, "could not find local lock " - "with cookie %u:%llu!\n", + c = ml->cookie; + mlog(ML_ERROR, "Could not find local lock " + "with cookie %u:%llu, node %u, " + "list %u, flags 0x%x, type %d, " + "conv %d, highest blocked %d\n", dlm_get_lock_cookie_node(be64_to_cpu(c)), - dlm_get_lock_cookie_seq(be64_to_cpu(c))); + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + ml->node, ml->list, ml->flags, ml->type, + ml->convert_type, ml->highest_blocked); + __dlm_print_one_lock_resource(res); + BUG(); + } + + if (lock->ml.node != ml->node) { + c = lock->ml.cookie; + mlog(ML_ERROR, "Mismatched node# in lock " + "cookie %u:%llu, name %.*s, node %u\n", + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + res->lockname.len, res->lockname.name, + lock->ml.node); + c = ml->cookie; + mlog(ML_ERROR, "Migrate lock cookie %u:%llu, " + "node %u, list %u, flags 0x%x, type %d, " + "conv %d, highest blocked %d\n", + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + ml->node, ml->list, ml->flags, ml->type, + ml->convert_type, ml->highest_blocked); __dlm_print_one_lock_resource(res); BUG(); } - BUG_ON(lock->ml.node != ml->node); if (tmpq != queue) { - mlog(0, "lock was on %u instead of %u for %.*s\n", - j, ml->list, res->lockname.len, res->lockname.name); + c = ml->cookie; + mlog(0, "Lock cookie %u:%llu was on list %u " + "instead of list %u for %.*s\n", + dlm_get_lock_cookie_node(be64_to_cpu(c)), + dlm_get_lock_cookie_seq(be64_to_cpu(c)), + j, ml->list, res->lockname.len, + res->lockname.name); + __dlm_print_one_lock_resource(res); spin_unlock(&res->spinlock); continue; } @@ -1839,7 +1889,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, * the lvb. */ memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); } else { - /* otherwise, the node is sending its + /* otherwise, the node is sending its * most recent valid lvb info */ BUG_ON(ml->type != LKM_EXMODE && ml->type != LKM_PRMODE); @@ -1886,7 +1936,7 @@ skip_lvb: spin_lock(&res->spinlock); list_for_each_entry(lock, queue, list) { if (lock->ml.cookie == ml->cookie) { - __be64 c = lock->ml.cookie; + c = lock->ml.cookie; mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " "exists on this lockres!\n", dlm->name, res->lockname.len, res->lockname.name, @@ -2114,7 +2164,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, assert_spin_locked(&res->spinlock); if (res->owner == dlm->node_num) - /* if this node owned the lockres, and if the dead node + /* if this node owned the lockres, and if the dead node * had an EX when he died, blank out the lvb */ search_node = dead_node; else { @@ -2152,7 +2202,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, /* this node is the lockres master: * 1) remove any stale locks for the dead node - * 2) if the dead node had an EX when he died, blank out the lvb + * 2) if the dead node had an EX when he died, blank out the lvb */ assert_spin_locked(&dlm->spinlock); assert_spin_locked(&res->spinlock); @@ -2193,7 +2243,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, mlog(0, "%s:%.*s: freed %u locks for dead node %u, " "dropping ref from lockres\n", dlm->name, res->lockname.len, res->lockname.name, freed, dead_node); - BUG_ON(!test_bit(dead_node, res->refmap)); + if(!test_bit(dead_node, res->refmap)) { + mlog(ML_ERROR, "%s:%.*s: freed %u locks for dead node %u, " + "but ref was not set\n", dlm->name, + res->lockname.len, res->lockname.name, freed, dead_node); + __dlm_print_one_lock_resource(res); + } dlm_lockres_clear_refmap_bit(dead_node, res); } else if (test_bit(dead_node, res->refmap)) { mlog(0, "%s:%.*s: dead node %u had a ref, but had " @@ -2260,7 +2315,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) } spin_unlock(&res->spinlock); continue; - } + } spin_lock(&res->spinlock); /* zero the lvb if necessary */ dlm_revalidate_lvb(dlm, res, dead_node); @@ -2411,7 +2466,7 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st) * this function on each node racing to become the recovery * master will not stop attempting this until either: * a) this node gets the EX (and becomes the recovery master), - * or b) dlm->reco.new_master gets set to some nodenum + * or b) dlm->reco.new_master gets set to some nodenum * != O2NM_INVALID_NODE_NUM (another node will do the reco). * so each time a recovery master is needed, the entire cluster * will sync at this point. if the new master dies, that will @@ -2424,7 +2479,7 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); -again: +again: memset(&lksb, 0, sizeof(lksb)); ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, @@ -2437,8 +2492,8 @@ again: if (ret == DLM_NORMAL) { mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", dlm->name, dlm->node_num); - - /* got the EX lock. check to see if another node + + /* got the EX lock. check to see if another node * just became the reco master */ if (dlm_reco_master_ready(dlm)) { mlog(0, "%s: got reco EX lock, but %u will " @@ -2451,12 +2506,12 @@ again: /* see if recovery was already finished elsewhere */ spin_lock(&dlm->spinlock); if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { - status = -EINVAL; + status = -EINVAL; mlog(0, "%s: got reco EX lock, but " "node got recovered already\n", dlm->name); if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { mlog(ML_ERROR, "%s: new master is %u " - "but no dead node!\n", + "but no dead node!\n", dlm->name, dlm->reco.new_master); BUG(); } @@ -2468,7 +2523,7 @@ again: * set the master and send the messages to begin recovery */ if (!status) { mlog(0, "%s: dead=%u, this=%u, sending " - "begin_reco now\n", dlm->name, + "begin_reco now\n", dlm->name, dlm->reco.dead_node, dlm->node_num); status = dlm_send_begin_reco_message(dlm, dlm->reco.dead_node); @@ -2501,7 +2556,7 @@ again: mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", dlm->name, dlm->node_num); /* another node is master. wait on - * reco.new_master != O2NM_INVALID_NODE_NUM + * reco.new_master != O2NM_INVALID_NODE_NUM * for at most one second */ wait_event_timeout(dlm->dlm_reco_thread_wq, dlm_reco_master_ready(dlm), @@ -2589,9 +2644,23 @@ retry: "begin reco msg (%d)\n", dlm->name, nodenum, ret); ret = 0; } + + /* + * Prior to commit aad1b15310b9bcd59fa81ab8f2b1513b59553ea8, + * dlm_begin_reco_handler() returned EAGAIN and not -EAGAIN. + * We are handling both for compatibility reasons. + */ + if (ret == -EAGAIN || ret == EAGAIN) { + mlog(0, "%s: trying to start recovery of node " + "%u, but node %u is waiting for last recovery " + "to complete, backoff for a bit\n", dlm->name, + dead_node, nodenum); + msleep(100); + goto retry; + } if (ret < 0) { struct dlm_lock_resource *res; - /* this is now a serious problem, possibly ENOMEM + /* this is now a serious problem, possibly ENOMEM * in the network stack. must retry */ mlog_errno(ret); mlog(ML_ERROR, "begin reco of dlm %s to node %u " @@ -2604,18 +2673,10 @@ retry: } else { mlog(ML_ERROR, "recovery lock not found\n"); } - /* sleep for a bit in hopes that we can avoid + /* sleep for a bit in hopes that we can avoid * another ENOMEM */ msleep(100); goto retry; - } else if (ret == EAGAIN) { - mlog(0, "%s: trying to start recovery of node " - "%u, but node %u is waiting for last recovery " - "to complete, backoff for a bit\n", dlm->name, - dead_node, nodenum); - /* TODO Look into replacing msleep with cond_resched() */ - msleep(100); - goto retry; } } @@ -2639,7 +2700,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, dlm->name, br->node_idx, br->dead_node, dlm->reco.dead_node, dlm->reco.new_master); spin_unlock(&dlm->spinlock); - return EAGAIN; + return -EAGAIN; } spin_unlock(&dlm->spinlock); @@ -2664,7 +2725,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, } if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { mlog(ML_NOTICE, "%s: dead_node previously set to %u, " - "node %u changing it to %u\n", dlm->name, + "node %u changing it to %u\n", dlm->name, dlm->reco.dead_node, br->node_idx, br->dead_node); } dlm_set_reco_master(dlm, br->node_idx); @@ -2730,8 +2791,8 @@ stage2: if (ret < 0) { mlog_errno(ret); if (dlm_is_host_down(ret)) { - /* this has no effect on this recovery - * session, so set the status to zero to + /* this has no effect on this recovery + * session, so set the status to zero to * finish out the last recovery */ mlog(ML_ERROR, "node %u went down after this " "node finished recovery.\n", nodenum); @@ -2768,7 +2829,7 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, mlog(0, "%s: node %u finalizing recovery stage%d of " "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); - + spin_lock(&dlm->spinlock); if (dlm->reco.new_master != fr->node_idx) { diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 00f53b2..49e29ec 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -190,8 +190,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, actions &= ~(DLM_UNLOCK_REMOVE_LOCK| DLM_UNLOCK_REGRANT_LOCK| DLM_UNLOCK_CLEAR_CONVERT_TYPE); - } else if (status == DLM_RECOVERING || - status == DLM_MIGRATING || + } else if (status == DLM_RECOVERING || + status == DLM_MIGRATING || status == DLM_FORWARD) { /* must clear the actions because this unlock * is about to be retried. cannot free or do @@ -661,14 +661,14 @@ retry: if (call_ast) { mlog(0, "calling unlockast(%p, %d)\n", data, status); if (is_master) { - /* it is possible that there is one last bast + /* it is possible that there is one last bast * pending. make sure it is flushed, then * call the unlockast. * not an issue if this is a mastered remotely, * since this lock has been removed from the * lockres queues and cannot be found. */ dlm_kick_thread(dlm, NULL); - wait_event(dlm->ast_wq, + wait_event(dlm->ast_wq, dlm_lock_basts_flushed(dlm, lock)); } (*unlockast)(data, status); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index c5e4a49..e044019 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -875,6 +875,14 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); lockres->l_level = lockres->l_requested; + + /* + * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing + * the OCFS2_LOCK_BUSY flag to prevent the dc thread from + * downconverting the lock before the upconvert has fully completed. + */ + lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); mlog_exit_void(); @@ -907,8 +915,6 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, assert_spin_locked(&lockres->l_lock); - lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); - if (level > lockres->l_blocking) { /* only schedule a downconvert if we haven't already scheduled * one that goes low enough to satisfy the level we're @@ -921,6 +927,9 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, lockres->l_blocking = level; } + if (needs_downconvert) + lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); + mlog_exit(needs_downconvert); return needs_downconvert; } @@ -1133,6 +1142,7 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, mlog_entry_void(); spin_lock_irqsave(&lockres->l_lock, flags); lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); if (convert) lockres->l_action = OCFS2_AST_INVALID; else @@ -1323,13 +1333,13 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, again: wait = 0; + spin_lock_irqsave(&lockres->l_lock, flags); + if (catch_signals && signal_pending(current)) { ret = -ERESTARTSYS; - goto out; + goto unlock; } - spin_lock_irqsave(&lockres->l_lock, flags); - mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, "Cluster lock called on freeing lockres %s! flags " "0x%lx\n", lockres->l_name, lockres->l_flags); @@ -1346,6 +1356,25 @@ again: goto unlock; } + if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) { + /* + * We've upconverted. If the lock now has a level we can + * work with, we take it. If, however, the lock is not at the + * required level, we go thru the full cycle. One way this could + * happen is if a process requesting an upconvert to PR is + * closely followed by another requesting upconvert to an EX. + * If the process requesting EX lands here, we want it to + * continue attempting to upconvert and let the process + * requesting PR take the lock. + * If multiple processes request upconvert to PR, the first one + * here will take the lock. The others will have to go thru the + * OCFS2_LOCK_BLOCKED check to ensure that there is no pending + * downconvert request. + */ + if (level <= lockres->l_level) + goto update_holders; + } + if (lockres->l_flags & OCFS2_LOCK_BLOCKED && !ocfs2_may_continue_on_blocked_lock(lockres, level)) { /* is the lock is currently blocked on behalf of @@ -1416,11 +1445,14 @@ again: goto again; } +update_holders: /* Ok, if we get here then we're good to go. */ ocfs2_inc_holders(lockres, level); ret = 0; unlock: + lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + spin_unlock_irqrestore(&lockres->l_lock, flags); out: /* @@ -3155,7 +3187,7 @@ out: /* Mark the lockres as being dropped. It will no longer be * queued if blocking, but we still may have to wait on it * being dequeued from the downconvert thread before we can consider - * it safe to drop. + * it safe to drop. * * You can *not* attempt to call cluster_lock on this lockres anymore. */ void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) @@ -3352,6 +3384,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb, unsigned long flags; int blocking; int new_level; + int level; int ret = 0; int set_lvb = 0; unsigned int gen; @@ -3360,9 +3393,17 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb, spin_lock_irqsave(&lockres->l_lock, flags); - BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); - recheck: + /* + * Is it still blocking? If not, we have no more work to do. + */ + if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) { + BUG_ON(lockres->l_blocking != DLM_LOCK_NL); + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = 0; + goto leave; + } + if (lockres->l_flags & OCFS2_LOCK_BUSY) { /* XXX * This is a *big* race. The OCFS2_LOCK_PENDING flag @@ -3401,6 +3442,31 @@ recheck: goto leave; } + /* + * This prevents livelocks. OCFS2_LOCK_UPCONVERT_FINISHING flag is + * set when the ast is received for an upconvert just before the + * OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast + * on the heels of the ast, we want to delay the downconvert just + * enough to allow the up requestor to do its task. Because this + * lock is in the blocked queue, the lock will be downconverted + * as soon as the requestor is done with the lock. + */ + if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) + goto leave_requeue; + + /* + * How can we block and yet be at NL? We were trying to upconvert + * from NL and got canceled. The code comes back here, and now + * we notice and clear BLOCKING. + */ + if (lockres->l_level == DLM_LOCK_NL) { + BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders); + lockres->l_blocking = DLM_LOCK_NL; + lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto leave; + } + /* if we're blocking an exclusive and we have *any* holders, * then requeue. */ if ((lockres->l_blocking == DLM_LOCK_EX) @@ -3438,6 +3504,7 @@ recheck: * may sleep, so we save off a copy of what we're blocking as * it may change while we're not holding the spin lock. */ blocking = lockres->l_blocking; + level = lockres->l_level; spin_unlock_irqrestore(&lockres->l_lock, flags); ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking); @@ -3446,7 +3513,7 @@ recheck: goto leave; spin_lock_irqsave(&lockres->l_lock, flags); - if (blocking != lockres->l_blocking) { + if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) { /* If this changed underneath us, then we can't drop * it just yet. */ goto recheck; diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 15713cb..19ad145 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -239,7 +239,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, mlog(0, "Encoding parent: blkno: %llu, generation: %u\n", (unsigned long long)blkno, generation); } - + *max_len = len; bail: diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 843db64..5328529 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -37,6 +37,7 @@ #include "extent_map.h" #include "inode.h" #include "super.h" +#include "symlink.h" #include "buffer_head_io.h" @@ -191,7 +192,7 @@ static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi, emi->ei_clusters += ins->ei_clusters; return 1; } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && - (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys && + (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos && ins->ei_flags == emi->ei_flags) { emi->ei_phys = ins->ei_phys; emi->ei_cpos = ins->ei_cpos; @@ -703,6 +704,12 @@ out: return ret; } +/* + * The ocfs2_fiemap_inline() may be a little bit misleading, since + * it not only handles the fiemap for inlined files, but also deals + * with the fast symlink, cause they have no difference for extent + * mapping per se. + */ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct fiemap_extent_info *fieinfo, u64 map_start) @@ -715,11 +722,18 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct ocfs2_inode_info *oi = OCFS2_I(inode); di = (struct ocfs2_dinode *)di_bh->b_data; - id_count = le16_to_cpu(di->id2.i_data.id_count); + if (ocfs2_inode_is_fast_symlink(inode)) + id_count = ocfs2_fast_symlink_chars(inode->i_sb); + else + id_count = le16_to_cpu(di->id2.i_data.id_count); if (map_start < id_count) { phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits; - phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); + if (ocfs2_inode_is_fast_symlink(inode)) + phys += offsetof(struct ocfs2_dinode, id2.i_symlink); + else + phys += offsetof(struct ocfs2_dinode, + id2.i_data.id_data); ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, flags); @@ -756,9 +770,10 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, down_read(&OCFS2_I(inode)->ip_alloc_sem); /* - * Handle inline-data separately. + * Handle inline-data and fast symlink separately. */ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || + ocfs2_inode_is_fast_symlink(inode)) { ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start); goto out_unlock; } @@ -786,6 +801,8 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, fe_flags = 0; if (rec.e_flags & OCFS2_EXT_UNWRITTEN) fe_flags |= FIEMAP_EXTENT_UNWRITTEN; + if (rec.e_flags & OCFS2_EXT_REFCOUNTED) + fe_flags |= FIEMAP_EXTENT_SHARED; if (is_last) fe_flags |= FIEMAP_EXTENT_LAST; len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 3d30a1c..558ce03 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -749,7 +749,7 @@ static int ocfs2_write_zero_page(struct inode *inode, int ret; offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ - /* ugh. in prepare/commit_write, if from==to==start of block, we + /* ugh. in prepare/commit_write, if from==to==start of block, we ** skip the prepare. make sure we never send an offset for the start ** of a block */ @@ -1772,13 +1772,14 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, loff_t *ppos, size_t count, int appending, - int *direct_io) + int *direct_io, + int *has_refcount) { int ret = 0, meta_level = 0; struct inode *inode = dentry->d_inode; loff_t saved_pos, end; - /* + /* * We start with a read level meta lock and only jump to an ex * if we need to make modifications here. */ @@ -1833,6 +1834,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, saved_pos, count, &meta_level); + if (has_refcount) + *has_refcount = 1; } if (ret < 0) { @@ -1856,6 +1859,10 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, break; } + if (has_refcount && *has_refcount == 1) { + *direct_io = 0; + break; + } /* * Allowing concurrent direct writes means * i_size changes wouldn't be synchronized, so @@ -1899,7 +1906,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, loff_t pos) { int ret, direct_io, appending, rw_level, have_alloc_sem = 0; - int can_do_direct; + int can_do_direct, has_refcount = 0; ssize_t written = 0; size_t ocount; /* original count */ size_t count; /* after file limit checks */ @@ -1942,7 +1949,7 @@ relock: can_do_direct = direct_io; ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, iocb->ki_left, appending, - &can_do_direct); + &can_do_direct, &has_refcount); if (ret < 0) { mlog_errno(ret); goto out; @@ -2006,14 +2013,16 @@ out_dio: /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); - if ((file->f_flags & O_DSYNC && !direct_io) || IS_SYNC(inode)) { + if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || + ((file->f_flags & O_DIRECT) && has_refcount)) { ret = filemap_fdatawrite_range(file->f_mapping, pos, pos + count - 1); if (ret < 0) written = ret; if (!ret && (old_size != i_size_read(inode) || - old_clusters != OCFS2_I(inode)->ip_clusters)) { + old_clusters != OCFS2_I(inode)->ip_clusters || + has_refcount)) { ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; @@ -2024,7 +2033,7 @@ out_dio: pos + count - 1); } - /* + /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io * function pointer which is called when o_direct io completes so that * it can unlock our rw lock. (it's the clustered equivalent of @@ -2062,7 +2071,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, int ret; ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, - sd->total_len, 0, NULL); + sd->total_len, 0, NULL, NULL); if (ret < 0) { mlog_errno(ret); return ret; @@ -2189,7 +2198,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, goto bail; } - /* + /* * buffered reads protect themselves in ->readpage(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. */ @@ -2211,10 +2220,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, * We're fine letting folks race truncates and extending * writes with read across the cluster, just like they can * locally. Hence no rw_lock during read. - * + * * Take and drop the meta data lock to update inode fields * like i_size. This allows the checks down below - * generic_file_aio_read() a chance of actually working. + * generic_file_aio_read() a chance of actually working. */ ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); if (ret < 0) { @@ -2239,7 +2248,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, bail: if (have_alloc_sem) up_read(&inode->i_alloc_sem); - if (rw_level != -1) + if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); mlog_exit(ret); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 0297fb8..88459bd 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -475,7 +475,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { status = ocfs2_try_open_lock(inode, 0); if (status) { - make_bad_inode(inode); + make_bad_inode(inode); return status; } } @@ -684,7 +684,7 @@ bail: return status; } -/* +/* * Serialize with orphan dir recovery. If the process doing * recovery on this orphan dir does an iget() with the dir * i_mutex held, we'll deadlock here. Instead we detect this diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 31fbb06..7d9d9c1 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/mount.h> +#include <linux/compat.h> #define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> @@ -181,6 +182,10 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) #ifdef CONFIG_COMPAT long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) { + bool preserve; + struct reflink_arguments args; + struct inode *inode = file->f_path.dentry->d_inode; + switch (cmd) { case OCFS2_IOC32_GETFLAGS: cmd = OCFS2_IOC_GETFLAGS; @@ -195,8 +200,15 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) case OCFS2_IOC_GROUP_EXTEND: case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: - case OCFS2_IOC_REFLINK: break; + case OCFS2_IOC_REFLINK: + if (copy_from_user(&args, (struct reflink_arguments *)arg, + sizeof(args))) + return -EFAULT; + preserve = (args.preserve != 0); + + return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), + compat_ptr(args.new_path), preserve); default: return -ENOIOCTLCMD; } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index bf34c49..9336c60 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2034,7 +2034,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, status = -ENOENT; mlog_errno(status); return status; - } + } mutex_lock(&orphan_dir_inode->i_mutex); status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f010b22..50fb26a 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2108,6 +2108,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, } did_quota_inode = 1; + inode->i_nlink = 0; /* do the real work now. */ status = ocfs2_mknod_locked(osb, dir, inode, 0, &new_di_bh, parent_di_bh, handle, @@ -2136,6 +2137,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, if (status < 0) mlog_errno(status); + insert_inode_hash(inode); leave: if (status < 0 && did_quota_inode) vfs_dq_free_inode(inode); @@ -2267,6 +2269,8 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, di = (struct ocfs2_dinode *)di_bh->b_data; le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); di->i_orphaned_slot = 0; + inode->i_nlink = 1; + ocfs2_set_links_count(di, inode->i_nlink); ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, @@ -2284,7 +2288,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto out_commit; } - insert_inode_hash(inode); dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); status = 0; @@ -2326,4 +2329,5 @@ const struct inode_operations ocfs2_dir_iops = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, }; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index d963d86..740f448 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -136,6 +136,10 @@ enum ocfs2_unlock_action { #define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a call to dlm_lock. Only exists with BUSY set. */ +#define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread + * from downconverting + * before the upconvert + * has completed */ struct ocfs2_lock_res_ops; @@ -245,9 +249,11 @@ enum ocfs2_mount_options OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ - OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */ - OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */ - OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ + OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */ + OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access + control lists */ + OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ + OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ }; #define OCFS2_OSB_SOFT_RO 0x0001 diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index e9431e4..7638a38 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -1202,7 +1202,7 @@ struct ocfs2_local_disk_dqinfo { /* Header of one chunk of a quota file */ struct ocfs2_local_disk_chunk { __le32 dqc_free; /* Number of free entries in the bitmap */ - u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding + __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding * chunk of quota file */ }; @@ -1417,9 +1417,16 @@ static inline int ocfs2_fast_symlink_chars(int blocksize) return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); } -static inline int ocfs2_max_inline_data(int blocksize) +static inline int ocfs2_max_inline_data_with_xattr(int blocksize, + struct ocfs2_dinode *di) { - return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data); + if (di && (di->i_dyn_features & OCFS2_INLINE_XATTR_FL)) + return blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data) - + di->i_xattr_inline_size; + else + return blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data); } static inline int ocfs2_extent_recs_per_inode(int blocksize) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 30967e3..8ae65c9 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -276,7 +276,7 @@ static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } -void ocfs2_kref_remove_refcount_tree(struct kref *kref) +static void ocfs2_kref_remove_refcount_tree(struct kref *kref) { struct ocfs2_refcount_tree *tree = container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); @@ -524,23 +524,6 @@ out: return ret; } -int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw, - struct ocfs2_refcount_tree **ret_tree, - struct buffer_head **ref_bh) -{ - int ret; - u64 ref_blkno; - - ret = ocfs2_get_refcount_block(inode, &ref_blkno); - if (ret) { - mlog_errno(ret); - return ret; - } - - return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, - rw, ret_tree, ref_bh); -} - void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, struct ocfs2_refcount_tree *tree, int rw) { @@ -969,6 +952,103 @@ out: } /* + * Find the end range for a leaf refcount block indicated by + * el->l_recs[index].e_blkno. + */ +static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct ocfs2_extent_block *eb, + struct ocfs2_extent_list *el, + int index, u32 *cpos_end) +{ + int ret, i, subtree_root; + u32 cpos; + u64 blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_path *left_path = NULL, *right_path = NULL; + struct ocfs2_extent_tree et; + struct ocfs2_extent_list *tmp_el; + + if (index < le16_to_cpu(el->l_next_free_rec) - 1) { + /* + * We have a extent rec after index, so just use the e_cpos + * of the next extent rec. + */ + *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos); + return 0; + } + + if (!eb || (eb && !eb->h_next_leaf_blk)) { + /* + * We are the last extent rec, so any high cpos should + * be stored in this leaf refcount block. + */ + *cpos_end = UINT_MAX; + return 0; + } + + /* + * If the extent block isn't the last one, we have to find + * the subtree root between this extent block and the next + * leaf extent block and get the corresponding e_cpos from + * the subroot. Otherwise we may corrupt the b-tree. + */ + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + + left_path = ocfs2_new_path_from_et(&et); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos); + ret = ocfs2_find_path(ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + right_path = ocfs2_new_path_from_path(left_path); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(ci, right_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_root = ocfs2_find_subtree_root(&et, left_path, + right_path); + + tmp_el = left_path->p_node[subtree_root].el; + blkno = left_path->p_node[subtree_root+1].bh->b_blocknr; + for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) { + if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) { + *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos); + break; + } + } + + BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec)); + +out: + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); + return ret; +} + +/* * Given a cpos and len, try to find the refcount record which contains cpos. * 1. If cpos can be found in one refcount record, return the record. * 2. If cpos can't be found, return a fake record which start from cpos @@ -983,10 +1063,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, struct buffer_head **ret_bh) { int ret = 0, i, found; - u32 low_cpos; + u32 low_cpos, uninitialized_var(cpos_end); struct ocfs2_extent_list *el; - struct ocfs2_extent_rec *tmp, *rec = NULL; - struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_extent_block *eb = NULL; struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct ocfs2_refcount_block *rb = @@ -1034,12 +1114,16 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, } } - /* adjust len when we have ocfs2_extent_rec after it. */ - if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) { - tmp = &el->l_recs[i+1]; + if (found) { + ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh, + eb, el, i, &cpos_end); + if (ret) { + mlog_errno(ret); + goto out; + } - if (le32_to_cpu(tmp->e_cpos) < cpos + len) - len = le32_to_cpu(tmp->e_cpos) - cpos; + if (cpos_end < low_cpos + len) + len = cpos_end - low_cpos; } ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), @@ -1418,7 +1502,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, /* change old and new rl_used accordingly. */ le16_add_cpu(&rl->rl_used, -num_moved); - new_rl->rl_used = cpu_to_le32(num_moved); + new_rl->rl_used = cpu_to_le16(num_moved); sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), sizeof(struct ocfs2_refcount_rec), @@ -1797,7 +1881,8 @@ static int ocfs2_split_refcount_rec(handle_t *handle, recs_need++; /* If the leaf block don't have enough record, expand it. */ - if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) { + if (le16_to_cpu(rf_list->rl_used) + recs_need > + le16_to_cpu(rf_list->rl_count)) { struct ocfs2_refcount_rec tmp_rec; u64 cpos = le64_to_cpu(orig_rec->r_cpos); len = le32_to_cpu(orig_rec->r_clusters); @@ -1859,7 +1944,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle, memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); le64_add_cpu(&tail_rec->r_cpos, le32_to_cpu(tail_rec->r_clusters) - len); - tail_rec->r_clusters = le32_to_cpu(len); + tail_rec->r_clusters = cpu_to_le32(len); } /* @@ -2860,7 +2945,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, while (offset < end) { page_index = offset >> PAGE_CACHE_SHIFT; - map_end = (page_index + 1) << PAGE_CACHE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; if (map_end > end) map_end = end; @@ -2872,8 +2957,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, page = grab_cache_page(mapping, page_index); - /* This page can't be dirtied before we CoW it out. */ - BUG_ON(PageDirty(page)); + /* + * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page + * can't be dirtied before we CoW it out. + */ + if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) + BUG_ON(PageDirty(page)); if (!PageUptodate(page)) { ret = block_read_full_page(page, ocfs2_get_block); @@ -3085,7 +3174,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb, while (offset < end) { page_index = offset >> PAGE_CACHE_SHIFT; - map_end = (page_index + 1) << PAGE_CACHE_SHIFT; + map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; if (map_end > end) map_end = end; @@ -3840,8 +3929,7 @@ static int ocfs2_add_refcounted_extent(struct inode *inode, } ret = ocfs2_insert_extent(handle, et, cpos, - cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, - p_cluster)), + ocfs2_clusters_to_blocks(inode->i_sb, p_cluster), num_clusters, ext_flags, meta_ac); if (ret) { mlog_errno(ret); @@ -4253,8 +4341,8 @@ static int ocfs2_user_path_parent(const char __user *path, * @new_dentry: target dentry * @preserve: if true, preserve all file attributes */ -int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry, bool preserve) +static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool preserve) { struct inode *inode = old_dentry->d_inode; int error; diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index e49c410..3038c92 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -277,7 +277,7 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) u32 dlm_key; struct dlm_ctxt *dlm; struct o2dlm_private *priv; - struct dlm_protocol_version dlm_version; + struct dlm_protocol_version fs_version; BUG_ON(conn == NULL); BUG_ON(o2cb_stack.sp_proto == NULL); @@ -304,18 +304,18 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) /* used by the dlm code to make message headers unique, each * node in this domain must agree on this. */ dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen); - dlm_version.pv_major = conn->cc_version.pv_major; - dlm_version.pv_minor = conn->cc_version.pv_minor; + fs_version.pv_major = conn->cc_version.pv_major; + fs_version.pv_minor = conn->cc_version.pv_minor; - dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version); + dlm = dlm_register_domain(conn->cc_name, dlm_key, &fs_version); if (IS_ERR(dlm)) { rc = PTR_ERR(dlm); mlog_errno(rc); goto out_free; } - conn->cc_version.pv_major = dlm_version.pv_major; - conn->cc_version.pv_minor = dlm_version.pv_minor; + conn->cc_version.pv_major = fs_version.pv_major; + conn->cc_version.pv_minor = fs_version.pv_minor; conn->cc_lockspace = dlm; dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index ff4c798..da78a2a 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -814,7 +814,7 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing, static int user_cluster_connect(struct ocfs2_cluster_connection *conn) { dlm_lockspace_t *fsdlm; - struct ocfs2_live_connection *control; + struct ocfs2_live_connection *uninitialized_var(control); int rc = 0; BUG_ON(conn == NULL); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 14f47d2..755cd49 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -100,6 +100,8 @@ struct mount_options static int ocfs2_parse_options(struct super_block *sb, char *options, struct mount_options *mopt, int is_remount); +static int ocfs2_check_set_options(struct super_block *sb, + struct mount_options *options); static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt); static void ocfs2_put_super(struct super_block *sb); static int ocfs2_mount_volume(struct super_block *sb); @@ -600,7 +602,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) lock_kernel(); - if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { + if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || + !ocfs2_check_set_options(sb, &parsed_options)) { ret = -EINVAL; goto out; } @@ -691,8 +694,6 @@ unlock_osb: if (!ret) { /* Only save off the new mount options in case of a successful * remount. */ - if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) - parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; @@ -701,6 +702,10 @@ unlock_osb: if (!ocfs2_is_hard_readonly(osb)) ocfs2_set_journal_params(osb); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? + MS_POSIXACL : 0); } out: unlock_kernel(); @@ -1011,31 +1016,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); bh = NULL; - if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) - parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; - + if (!ocfs2_check_set_options(sb, &parsed_options)) { + status = -EINVAL; + goto read_super_error; + } osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; osb->osb_commit_interval = parsed_options.commit_interval; osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); osb->local_alloc_bits = osb->local_alloc_default_bits; - if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA && - !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { - status = -EINVAL; - mlog(ML_ERROR, "User quotas were requested, but this " - "filesystem does not have the feature enabled.\n"); - goto read_super_error; - } - if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA && - !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { - status = -EINVAL; - mlog(ML_ERROR, "Group quotas were requested, but this " - "filesystem does not have the feature enabled.\n"); - goto read_super_error; - } status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -1072,7 +1062,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) "file system, but write access is " "unavailable.\n"); else - mlog_errno(status); + mlog_errno(status); goto read_super_error; } @@ -1245,6 +1235,40 @@ static struct file_system_type ocfs2_fs_type = { .next = NULL }; +static int ocfs2_check_set_options(struct super_block *sb, + struct mount_options *options) +{ + if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + mlog(ML_ERROR, "User quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + return 0; + } + if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + mlog(ML_ERROR, "Group quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + return 0; + } + if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL && + !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) { + mlog(ML_ERROR, "ACL support requested but extended attributes " + "feature is not enabled\n"); + return 0; + } + /* No ACL setting specified? Use XATTR feature... */ + if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL | + OCFS2_MOUNT_NO_POSIX_ACL))) { + if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) + options->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + else + options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; + } + return 1; +} + static int ocfs2_parse_options(struct super_block *sb, char *options, struct mount_options *mopt, @@ -1392,40 +1416,19 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt |= OCFS2_MOUNT_INODE64; break; case Opt_usrquota: - /* We check only on remount, otherwise features - * aren't yet initialized. */ - if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { - mlog(ML_ERROR, "User quota requested but " - "filesystem feature is not set\n"); - status = 0; - goto bail; - } mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; break; case Opt_grpquota: - if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { - mlog(ML_ERROR, "Group quota requested but " - "filesystem feature is not set\n"); - status = 0; - goto bail; - } mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; break; -#ifdef CONFIG_OCFS2_FS_POSIX_ACL case Opt_acl: mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; break; case Opt_noacl: + mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; break; -#else - case Opt_acl: - case Opt_noacl: - printk(KERN_INFO "ocfs2 (no)acl options not supported\n"); - break; -#endif default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1502,12 +1505,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (opts & OCFS2_MOUNT_INODE64) seq_printf(s, ",inode64"); -#ifdef CONFIG_OCFS2_FS_POSIX_ACL if (opts & OCFS2_MOUNT_POSIX_ACL) seq_printf(s, ",acl"); else seq_printf(s, ",noacl"); -#endif return 0; } diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index e342103..32499d21 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -137,20 +137,20 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry, } memcpy(link, target, len); - nd_set_link(nd, link); bail: + nd_set_link(nd, status ? ERR_PTR(status) : link); brelse(bh); mlog_exit(status); - return status ? ERR_PTR(status) : link; + return NULL; } static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) { - char *link = cookie; - - kfree(link); + char *link = nd_get_link(nd); + if (!IS_ERR(link)) + kfree(link); } const struct inode_operations ocfs2_symlink_inode_operations = { @@ -163,6 +163,7 @@ const struct inode_operations ocfs2_symlink_inode_operations = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, }; const struct inode_operations ocfs2_fast_symlink_inode_operations = { .readlink = ocfs2_readlink, @@ -174,4 +175,5 @@ const struct inode_operations ocfs2_fast_symlink_inode_operations = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, }; diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index c613693..a0a120e 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -267,8 +267,8 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci, } /* Warning: even if it returns true, this does *not* guarantee that - * the block is stored in our inode metadata cache. - * + * the block is stored in our inode metadata cache. + * * This can be called under lock_buffer() */ int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci, diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index fe34190..8fc6fb0 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -98,10 +98,8 @@ static struct ocfs2_xattr_def_value_root def_xv = { struct xattr_handler *ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, -#ifdef CONFIG_OCFS2_FS_POSIX_ACL &ocfs2_xattr_acl_access_handler, &ocfs2_xattr_acl_default_handler, -#endif &ocfs2_xattr_trusted_handler, &ocfs2_xattr_security_handler, NULL @@ -109,12 +107,10 @@ struct xattr_handler *ocfs2_xattr_handlers[] = { static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, -#ifdef CONFIG_OCFS2_FS_POSIX_ACL [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ocfs2_xattr_acl_access_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ocfs2_xattr_acl_default_handler, -#endif [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, }; @@ -205,8 +201,6 @@ static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, int offset, struct ocfs2_xattr_value_root **xv, struct buffer_head **bh); -static int ocfs2_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags); static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) { @@ -6066,7 +6060,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb, * to the extent block, so just calculate a maximum record num. */ if (!xv->xr_list.l_tree_depth) - *num_recs += xv->xr_list.l_next_free_rec; + *num_recs += le16_to_cpu(xv->xr_list.l_next_free_rec); else *num_recs += ocfs2_clusters_for_bytes(sb, XATTR_SIZE_MAX); @@ -6978,9 +6972,9 @@ int ocfs2_init_security_and_acl(struct inode *dir, ret = ocfs2_init_security_get(inode, dir, &si); if (!ret) { - ret = ocfs2_xattr_security_set(inode, si.name, - si.value, si.value_len, - XATTR_CREATE); + ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, + si.name, si.value, si.value_len, + XATTR_CREATE); if (ret) { mlog_errno(ret); goto leave; @@ -7008,9 +7002,9 @@ leave: /* * 'security' attributes support */ -static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, +static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, const char *name, - size_t name_len) + size_t name_len, int type) { const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -7023,23 +7017,23 @@ static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, return total_len; } -static int ocfs2_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name, - buffer, size); + return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + name, buffer, size); } -static int ocfs2_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value, - size, flags); + return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + name, value, size, flags); } int ocfs2_init_security_get(struct inode *inode, @@ -7076,9 +7070,9 @@ struct xattr_handler ocfs2_xattr_security_handler = { /* * 'trusted' attributes support */ -static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, +static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, const char *name, - size_t name_len) + size_t name_len, int type) { const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; @@ -7091,23 +7085,23 @@ static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, return total_len; } -static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name, - void *buffer, size_t size) +static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name, - buffer, size); + return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + name, buffer, size); } -static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, - size, flags); + return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + name, value, size, flags); } struct xattr_handler ocfs2_xattr_trusted_handler = { @@ -7120,13 +7114,13 @@ struct xattr_handler ocfs2_xattr_trusted_handler = { /* * 'user' attributes support */ -static size_t ocfs2_xattr_user_list(struct inode *inode, char *list, +static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, const char *name, - size_t name_len) + size_t name_len, int type) { const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return 0; @@ -7139,31 +7133,31 @@ static size_t ocfs2_xattr_user_list(struct inode *inode, char *list, return total_len; } -static int ocfs2_xattr_user_get(struct inode *inode, const char *name, - void *buffer, size_t size) +static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); if (strcmp(name, "") == 0) return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name, + return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name, buffer, size); } -static int ocfs2_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); if (strcmp(name, "") == 0) return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, - size, flags); + return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER, + name, value, size, flags); } struct xattr_handler ocfs2_xattr_user_handler = { diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 08e3638..abd72a4 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -40,10 +40,8 @@ struct ocfs2_security_xattr_info { extern struct xattr_handler ocfs2_xattr_user_handler; extern struct xattr_handler ocfs2_xattr_trusted_handler; extern struct xattr_handler ocfs2_xattr_security_handler; -#ifdef CONFIG_OCFS2_FS_POSIX_ACL extern struct xattr_handler ocfs2_xattr_acl_access_handler; extern struct xattr_handler ocfs2_xattr_acl_default_handler; -#endif extern struct xattr_handler *ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); @@ -30,6 +30,9 @@ #include <linux/audit.h> #include <linux/falloc.h> #include <linux/fs_struct.h> +#include <linux/ima.h> + +#include "internal.h" int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) { @@ -818,15 +821,14 @@ static inline int __get_file_write_access(struct inode *inode, } static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, - int flags, struct file *f, + struct file *f, int (*open)(struct inode *, struct file *), const struct cred *cred) { struct inode *inode; int error; - f->f_flags = flags; - f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK | + f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { @@ -855,6 +857,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, if (error) goto cleanup_all; } + ima_counts_get(f); f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); @@ -926,7 +929,6 @@ struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry if (IS_ERR(dentry)) goto out_err; nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt), - nd->intent.open.flags - 1, nd->intent.open.file, open, cred); out: @@ -945,7 +947,7 @@ EXPORT_SYMBOL_GPL(lookup_instantiate_filp); * * Note that this function destroys the original nameidata */ -struct file *nameidata_to_filp(struct nameidata *nd, int flags) +struct file *nameidata_to_filp(struct nameidata *nd) { const struct cred *cred = current_cred(); struct file *filp; @@ -954,7 +956,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags) filp = nd->intent.open.file; /* Has the filesystem initialised the file for us? */ if (filp->f_path.dentry == NULL) - filp = __dentry_open(nd->path.dentry, nd->path.mnt, flags, filp, + filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, NULL, cred); else path_put(&nd->path); @@ -993,7 +995,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, return ERR_PTR(error); } - return __dentry_open(dentry, mnt, flags, f, NULL, cred); + f->f_flags = flags; + return __dentry_open(dentry, mnt, f, NULL, cred); } EXPORT_SYMBOL(dentry_open); @@ -906,17 +906,6 @@ void free_pipe_info(struct inode *inode) } static struct vfsmount *pipe_mnt __read_mostly; -static int pipefs_delete_dentry(struct dentry *dentry) -{ - /* - * At creation time, we pretended this dentry was hashed - * (by clearing DCACHE_UNHASHED bit in d_flags) - * At delete time, we restore the truth : not hashed. - * (so that dput() can proceed correctly) - */ - dentry->d_flags |= DCACHE_UNHASHED; - return 0; -} /* * pipefs_dname() is called from d_path(). @@ -928,7 +917,6 @@ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) } static const struct dentry_operations pipefs_dentry_operations = { - .d_delete = pipefs_delete_dentry, .d_dname = pipefs_dname, }; @@ -974,7 +962,7 @@ struct file *create_write_pipe(int flags) int err; struct inode *inode; struct file *f; - struct dentry *dentry; + struct path path; struct qstr name = { .name = "" }; err = -ENFILE; @@ -983,21 +971,16 @@ struct file *create_write_pipe(int flags) goto err; err = -ENOMEM; - dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); - if (!dentry) + path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); + if (!path.dentry) goto err_inode; + path.mnt = mntget(pipe_mnt); - dentry->d_op = &pipefs_dentry_operations; - /* - * We dont want to publish this dentry into global dentry hash table. - * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED - * This permits a working /proc/$pid/fd/XXX on pipes - */ - dentry->d_flags &= ~DCACHE_UNHASHED; - d_instantiate(dentry, inode); + path.dentry->d_op = &pipefs_dentry_operations; + d_instantiate(path.dentry, inode); err = -ENFILE; - f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipefifo_fops); + f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); if (!f) goto err_dentry; f->f_mapping = inode->i_mapping; @@ -1009,7 +992,7 @@ struct file *create_write_pipe(int flags) err_dentry: free_pipe_info(inode); - dput(dentry); + path_put(&path); return ERR_PTR(err); err_inode: @@ -1028,20 +1011,14 @@ void free_write_pipe(struct file *f) struct file *create_read_pipe(struct file *wrf, int flags) { - struct file *f = get_empty_filp(); + /* Grab pipe from the writer */ + struct file *f = alloc_file(&wrf->f_path, FMODE_READ, + &read_pipefifo_fops); if (!f) return ERR_PTR(-ENFILE); - /* Grab pipe from the writer */ - f->f_path = wrf->f_path; path_get(&wrf->f_path); - f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping; - - f->f_pos = 0; f->f_flags = O_RDONLY | (flags & O_NONBLOCK); - f->f_op = &read_pipefifo_fops; - f->f_mode = FMODE_READ; - f->f_version = 0; return f; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 4badde1..13b5d07 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -134,13 +134,16 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) * simple bit tests. */ static const char *task_state_array[] = { - "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "T (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)" /* 32 */ + "R (running)", /* 0 */ + "S (sleeping)", /* 1 */ + "D (disk sleep)", /* 2 */ + "T (stopped)", /* 4 */ + "t (tracing stop)", /* 8 */ + "Z (zombie)", /* 16 */ + "X (dead)", /* 32 */ + "x (dead)", /* 64 */ + "K (wakekill)", /* 128 */ + "W (waking)", /* 256 */ }; static inline const char *get_task_state(struct task_struct *tsk) @@ -148,6 +151,8 @@ static inline const char *get_task_state(struct task_struct *tsk) unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; const char **p = &task_state_array[0]; + BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); + while (state) { p++; state >>= 1; @@ -322,94 +327,6 @@ static inline void task_context_switch_counts(struct seq_file *m, p->nivcsw); } -#ifdef CONFIG_MMU - -struct stack_stats { - struct vm_area_struct *vma; - unsigned long startpage; - unsigned long usage; -}; - -static int stack_usage_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - struct stack_stats *ss = walk->private; - struct vm_area_struct *vma = ss->vma; - pte_t *pte, ptent; - spinlock_t *ptl; - int ret = 0; - - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; - -#ifdef CONFIG_STACK_GROWSUP - if (pte_present(ptent) || is_swap_pte(ptent)) - ss->usage = addr - ss->startpage + PAGE_SIZE; -#else - if (pte_present(ptent) || is_swap_pte(ptent)) { - ss->usage = ss->startpage - addr + PAGE_SIZE; - pte++; - ret = 1; - break; - } -#endif - } - pte_unmap_unlock(pte - 1, ptl); - cond_resched(); - return ret; -} - -static inline unsigned long get_stack_usage_in_bytes(struct vm_area_struct *vma, - struct task_struct *task) -{ - struct stack_stats ss; - struct mm_walk stack_walk = { - .pmd_entry = stack_usage_pte_range, - .mm = vma->vm_mm, - .private = &ss, - }; - - if (!vma->vm_mm || is_vm_hugetlb_page(vma)) - return 0; - - ss.vma = vma; - ss.startpage = task->stack_start & PAGE_MASK; - ss.usage = 0; - -#ifdef CONFIG_STACK_GROWSUP - walk_page_range(KSTK_ESP(task) & PAGE_MASK, vma->vm_end, - &stack_walk); -#else - walk_page_range(vma->vm_start, (KSTK_ESP(task) & PAGE_MASK) + PAGE_SIZE, - &stack_walk); -#endif - return ss.usage; -} - -static inline void task_show_stack_usage(struct seq_file *m, - struct task_struct *task) -{ - struct vm_area_struct *vma; - struct mm_struct *mm = get_task_mm(task); - - if (mm) { - down_read(&mm->mmap_sem); - vma = find_vma(mm, task->stack_start); - if (vma) - seq_printf(m, "Stack usage:\t%lu kB\n", - get_stack_usage_in_bytes(vma, task) >> 10); - - up_read(&mm->mmap_sem); - mmput(mm); - } -} -#else -static void task_show_stack_usage(struct seq_file *m, struct task_struct *task) -{ -} -#endif /* CONFIG_MMU */ - static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Cpus_allowed:\t"); @@ -440,7 +357,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_show_regs(m, task); #endif task_context_switch_counts(m, task); - task_show_stack_usage(m, task); return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 18d5cc6..58324c2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1419,7 +1419,6 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); - nd->last_type = LAST_BIND; out: return ERR_PTR(error); } @@ -2370,16 +2369,30 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) { struct pid_namespace *ns = dentry->d_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); - char tmp[PROC_NUMBUF]; - if (!tgid) - return ERR_PTR(-ENOENT); - sprintf(tmp, "%d", task_tgid_nr_ns(current, ns)); - return ERR_PTR(vfs_follow_link(nd,tmp)); + char *name = ERR_PTR(-ENOENT); + if (tgid) { + name = __getname(); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d", tgid); + } + nd_set_link(nd, name); + return NULL; +} + +static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + __putname(s); } static const struct inode_operations proc_self_inode_operations = { .readlink = proc_self_readlink, .follow_link = proc_self_follow_link, + .put_link = proc_self_put_link, }; /* diff --git a/fs/proc/page.c b/fs/proc/page.c index 5033ce0..180cf5a 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -8,6 +8,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/hugetlb.h> +#include <linux/kernel-page-flags.h> #include <asm/uaccess.h> #include "internal.h" @@ -71,52 +72,12 @@ static const struct file_operations proc_kpagecount_operations = { * physical page flags. */ -/* These macros are used to decouple internal flags from exported ones */ - -#define KPF_LOCKED 0 -#define KPF_ERROR 1 -#define KPF_REFERENCED 2 -#define KPF_UPTODATE 3 -#define KPF_DIRTY 4 -#define KPF_LRU 5 -#define KPF_ACTIVE 6 -#define KPF_SLAB 7 -#define KPF_WRITEBACK 8 -#define KPF_RECLAIM 9 -#define KPF_BUDDY 10 - -/* 11-20: new additions in 2.6.31 */ -#define KPF_MMAP 11 -#define KPF_ANON 12 -#define KPF_SWAPCACHE 13 -#define KPF_SWAPBACKED 14 -#define KPF_COMPOUND_HEAD 15 -#define KPF_COMPOUND_TAIL 16 -#define KPF_HUGE 17 -#define KPF_UNEVICTABLE 18 -#define KPF_HWPOISON 19 -#define KPF_NOPAGE 20 - -#define KPF_KSM 21 - -/* kernel hacking assistances - * WARNING: subject to change, never rely on them! - */ -#define KPF_RESERVED 32 -#define KPF_MLOCKED 33 -#define KPF_MAPPEDTODISK 34 -#define KPF_PRIVATE 35 -#define KPF_PRIVATE_2 36 -#define KPF_OWNER_PRIVATE 37 -#define KPF_ARCH 38 -#define KPF_UNCACHED 39 - static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) { return ((kflags >> kbit) & 1) << ubit; } -static u64 get_uflags(struct page *page) +u64 stable_page_flags(struct page *page) { u64 k; u64 u; @@ -219,7 +180,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, else ppage = NULL; - if (put_user(get_uflags(ppage), out)) { + if (put_user(stable_page_flags(ppage), out)) { ret = -EFAULT; break; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 47c03f4..f277c4a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -361,12 +361,11 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte_present(ptent)) continue; - mss->resident += PAGE_SIZE; - page = vm_normal_page(vma, addr, ptent); if (!page) continue; + mss->resident += PAGE_SIZE; /* Accumulate the size in pages that have been accessed. */ if (pte_young(ptent) || PageReferenced(page)) mss->referenced += PAGE_SIZE; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index cd6bb9a..3fc62b0 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -323,6 +323,30 @@ int dquot_mark_dquot_dirty(struct dquot *dquot) } EXPORT_SYMBOL(dquot_mark_dquot_dirty); +/* Dirtify all the dquots - this can block when journalling */ +static inline int mark_all_dquot_dirty(struct dquot * const *dquot) +{ + int ret, err, cnt; + + ret = err = 0; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (dquot[cnt]) + /* Even in case of error we have to continue */ + ret = mark_dquot_dirty(dquot[cnt]); + if (!err) + err = ret; + } + return err; +} + +static inline void dqput_all(struct dquot **dquot) +{ + unsigned int cnt; + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + dqput(dquot[cnt]); +} + /* This function needs dq_list_lock */ static inline int clear_dquot_dirty(struct dquot *dquot) { @@ -1268,8 +1292,7 @@ int dquot_initialize(struct inode *inode, int type) out_err: up_write(&sb_dqopt(sb)->dqptr_sem); /* Drop unused references */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - dqput(got[cnt]); + dqput_all(got); return ret; } EXPORT_SYMBOL(dquot_initialize); @@ -1288,9 +1311,7 @@ int dquot_drop(struct inode *inode) inode->i_dquot[cnt] = NULL; } up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - dqput(put[cnt]); + dqput_all(put); return 0; } EXPORT_SYMBOL(dquot_drop); @@ -1319,6 +1340,70 @@ void vfs_dq_drop(struct inode *inode) EXPORT_SYMBOL(vfs_dq_drop); /* + * inode_reserved_space is managed internally by quota, and protected by + * i_lock similar to i_blocks+i_bytes. + */ +static qsize_t *inode_reserved_space(struct inode * inode) +{ + /* Filesystem must explicitly define it's own method in order to use + * quota reservation interface */ + BUG_ON(!inode->i_sb->dq_op->get_reserved_space); + return inode->i_sb->dq_op->get_reserved_space(inode); +} + +static void inode_add_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + spin_unlock(&inode->i_lock); +} + + +static void inode_claim_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); +} + +static void inode_sub_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + spin_unlock(&inode->i_lock); +} + +static qsize_t inode_get_rsv_space(struct inode *inode) +{ + qsize_t ret; + + if (!inode->i_sb->dq_op->get_reserved_space) + return 0; + spin_lock(&inode->i_lock); + ret = *inode_reserved_space(inode); + spin_unlock(&inode->i_lock); + return ret; +} + +static void inode_incr_space(struct inode *inode, qsize_t number, + int reserve) +{ + if (reserve) + inode_add_rsv_space(inode, number); + else + inode_add_bytes(inode, number); +} + +static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) +{ + if (reserve) + inode_sub_rsv_space(inode, number); + else + inode_sub_bytes(inode, number); +} + +/* * Following four functions update i_blocks+i_bytes fields and * quota information (together with appropriate checks) * NOTE: We absolutely rely on the fact that caller dirties @@ -1336,6 +1421,21 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int cnt, ret = QUOTA_OK; char warntype[MAXQUOTAS]; + /* + * First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex + */ + if (IS_NOQUOTA(inode)) { + inode_incr_space(inode, number, reserve); + goto out; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + if (IS_NOQUOTA(inode)) { + inode_incr_space(inode, number, reserve); + goto out_unlock; + } + for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; @@ -1346,7 +1446,8 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) == NO_QUOTA) { ret = NO_QUOTA; - goto out_unlock; + spin_unlock(&dq_data_lock); + goto out_flush_warn; } } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1357,64 +1458,29 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, else dquot_incr_space(inode->i_dquot[cnt], number); } - if (!reserve) - inode_add_bytes(inode, number); -out_unlock: + inode_incr_space(inode, number, reserve); spin_unlock(&dq_data_lock); + + if (reserve) + goto out_flush_warn; + mark_all_dquot_dirty(inode->i_dquot); +out_flush_warn: flush_warnings(inode->i_dquot, warntype); +out_unlock: + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +out: return ret; } int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) { - int cnt, ret = QUOTA_OK; - - /* - * First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex - */ - if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); - goto out; - } - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); - goto out_unlock; - } - - ret = __dquot_alloc_space(inode, number, warn, 0); - if (ret == NO_QUOTA) - goto out_unlock; - - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return ret; + return __dquot_alloc_space(inode, number, warn, 0); } EXPORT_SYMBOL(dquot_alloc_space); int dquot_reserve_space(struct inode *inode, qsize_t number, int warn) { - int ret = QUOTA_OK; - - if (IS_NOQUOTA(inode)) - goto out; - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) - goto out_unlock; - - ret = __dquot_alloc_space(inode, number, warn, 1); -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return ret; + return __dquot_alloc_space(inode, number, warn, 1); } EXPORT_SYMBOL(dquot_reserve_space); @@ -1455,10 +1521,7 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number) warn_put_all: spin_unlock(&dq_data_lock); if (ret == QUOTA_OK) - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return ret; @@ -1471,14 +1534,14 @@ int dquot_claim_space(struct inode *inode, qsize_t number) int ret = QUOTA_OK; if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); goto out; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); if (IS_NOQUOTA(inode)) { up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); goto out; } @@ -1490,12 +1553,9 @@ int dquot_claim_space(struct inode *inode, qsize_t number) number); } /* Update inode bytes */ - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); spin_unlock(&dq_data_lock); - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + mark_all_dquot_dirty(inode->i_dquot); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); out: return ret; @@ -1503,38 +1563,9 @@ out: EXPORT_SYMBOL(dquot_claim_space); /* - * Release reserved quota space - */ -void dquot_release_reserved_space(struct inode *inode, qsize_t number) -{ - int cnt; - - if (IS_NOQUOTA(inode)) - goto out; - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) - goto out_unlock; - - spin_lock(&dq_data_lock); - /* Release reserved dquots */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt]) - dquot_free_reserved_space(inode->i_dquot[cnt], number); - } - spin_unlock(&dq_data_lock); - -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return; -} -EXPORT_SYMBOL(dquot_release_reserved_space); - -/* * This operation can block, but only after everything is updated */ -int dquot_free_space(struct inode *inode, qsize_t number) +int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) { unsigned int cnt; char warntype[MAXQUOTAS]; @@ -1543,7 +1574,7 @@ int dquot_free_space(struct inode *inode, qsize_t number) * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) { out_sub: - inode_sub_bytes(inode, number); + inode_decr_space(inode, number, reserve); return QUOTA_OK; } @@ -1558,21 +1589,40 @@ out_sub: if (!inode->i_dquot[cnt]) continue; warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); - dquot_decr_space(inode->i_dquot[cnt], number); + if (reserve) + dquot_free_reserved_space(inode->i_dquot[cnt], number); + else + dquot_decr_space(inode->i_dquot[cnt], number); } - inode_sub_bytes(inode, number); + inode_decr_space(inode, number, reserve); spin_unlock(&dq_data_lock); - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + + if (reserve) + goto out_unlock; + mark_all_dquot_dirty(inode->i_dquot); +out_unlock: flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; } + +int dquot_free_space(struct inode *inode, qsize_t number) +{ + return __dquot_free_space(inode, number, 0); +} EXPORT_SYMBOL(dquot_free_space); /* + * Release reserved quota space + */ +void dquot_release_reserved_space(struct inode *inode, qsize_t number) +{ + __dquot_free_space(inode, number, 1); + +} +EXPORT_SYMBOL(dquot_release_reserved_space); + +/* * This operation can block, but only after everything is updated */ int dquot_free_inode(const struct inode *inode, qsize_t number) @@ -1599,10 +1649,7 @@ int dquot_free_inode(const struct inode *inode, qsize_t number) dquot_decr_inodes(inode->i_dquot[cnt], number); } spin_unlock(&dq_data_lock); - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; @@ -1610,19 +1657,6 @@ int dquot_free_inode(const struct inode *inode, qsize_t number) EXPORT_SYMBOL(dquot_free_inode); /* - * call back function, get reserved quota space from underlying fs - */ -qsize_t dquot_get_reserved_space(struct inode *inode) -{ - qsize_t reserved_space = 0; - - if (sb_any_quota_active(inode->i_sb) && - inode->i_sb->dq_op->get_reserved_space) - reserved_space = inode->i_sb->dq_op->get_reserved_space(inode); - return reserved_space; -} - -/* * Transfer the number of inode and blocks from one diskquota to an other. * * This operation can block, but only after everything is updated @@ -1665,7 +1699,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) } spin_lock(&dq_data_lock); cur_space = inode_get_bytes(inode); - rsv_space = dquot_get_reserved_space(inode); + rsv_space = inode_get_rsv_space(inode); space = cur_space + rsv_space; /* Build the transfer_from list and check the limits */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1709,25 +1743,18 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) spin_unlock(&dq_data_lock); up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (transfer_from[cnt]) - mark_dquot_dirty(transfer_from[cnt]); - if (transfer_to[cnt]) { - mark_dquot_dirty(transfer_to[cnt]); - /* The reference we got is transferred to the inode */ - transfer_to[cnt] = NULL; - } - } + mark_all_dquot_dirty(transfer_from); + mark_all_dquot_dirty(transfer_to); + /* The reference we got is transferred to the inode */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + transfer_to[cnt] = NULL; warn_put_all: flush_warnings(transfer_to, warntype_to); flush_warnings(transfer_from, warntype_from_inodes); flush_warnings(transfer_from, warntype_from_space); put_all: - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - dqput(transfer_from[cnt]); - dqput(transfer_to[cnt]); - } + dqput_all(transfer_from); + dqput_all(transfer_to); return ret; over_quota: spin_unlock(&dq_data_lock); diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 3dfc23e..e3da02f 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -97,8 +97,11 @@ static int v2_read_file_info(struct super_block *sb, int type) unsigned int version; if (!v2_read_header(sb, type, &dqhead)) - return 0; + return -1; version = le32_to_cpu(dqhead.dqh_version); + if ((info->dqi_fmt_id == QFMT_VFS_V0 && version != 0) || + (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1)) + return -1; size = sb->s_op->quota_read(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); @@ -120,8 +123,8 @@ static int v2_read_file_info(struct super_block *sb, int type) info->dqi_maxilimit = 0xffffffff; } else { /* used space is stored as unsigned 64-bit value */ - info->dqi_maxblimit = 0xffffffffffffffff; /* 2^64-1 */ - info->dqi_maxilimit = 0xffffffffffffffff; + info->dqi_maxblimit = 0xffffffffffffffffULL; /* 2^64-1 */ + info->dqi_maxilimit = 0xffffffffffffffffULL; } info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 32fae40..1739a4a 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -60,7 +60,7 @@ const struct inode_operations ramfs_file_inode_operations = { */ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) { - unsigned long npages, xpages, loop, limit; + unsigned long npages, xpages, loop; struct page *pages; unsigned order; void *data; @@ -123,30 +123,6 @@ add_error: /*****************************************************************************/ /* - * check that file shrinkage doesn't leave any VMAs dangling in midair - */ -static int ramfs_nommu_check_mappings(struct inode *inode, - size_t newsize, size_t size) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - - /* search for VMAs that fall within the dead zone */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - newsize >> PAGE_SHIFT, - (size + PAGE_SIZE - 1) >> PAGE_SHIFT - ) { - /* found one - only interested if it's shared out of the page - * cache */ - if (vma->vm_flags & VM_SHARED) - return -ETXTBSY; /* not quite true, but near enough */ - } - - return 0; -} - -/*****************************************************************************/ -/* * */ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) @@ -164,7 +140,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) /* check that a decrease in size doesn't cut off any shared mappings */ if (newsize < size) { - ret = ramfs_nommu_check_mappings(inode, newsize, size); + ret = nommu_shrink_inode_mappings(inode, size, newsize); if (ret < 0) return ret; } diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index ac7cd75..513f431 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -1,7 +1,6 @@ config REISERFS_FS tristate "Reiserfs support" select CRC32 - select FS_JOURNAL_INFO help Stores not just filenames but the files themselves in a balanced tree. Uses journalling. diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 6854957..65c8727 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -1277,7 +1277,10 @@ int reiserfs_init_bitmap_cache(struct super_block *sb) struct reiserfs_bitmap_info *bitmap; unsigned int bmap_nr = reiserfs_bmap_count(sb); + /* Avoid lock recursion in fault case */ + reiserfs_write_unlock(sb); bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); + reiserfs_write_lock(sb); if (bitmap == NULL) return -ENOMEM; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 3a28e77..2df0f5c 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -31,11 +31,12 @@ void reiserfs_delete_inode(struct inode *inode) JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); struct reiserfs_transaction_handle th; + int depth; int err; truncate_inode_pages(&inode->i_data, 0); - reiserfs_write_lock(inode->i_sb); + depth = reiserfs_write_lock_once(inode->i_sb); /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ @@ -74,7 +75,7 @@ void reiserfs_delete_inode(struct inode *inode) out: clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ inode->i_blocks = 0; - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, depth); } static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, @@ -1496,9 +1497,11 @@ struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key) args.objectid = key->on_disk_key.k_objectid; args.dirid = key->on_disk_key.k_dir_id; + reiserfs_write_unlock(s); inode = iget5_locked(s, key->on_disk_key.k_objectid, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); + reiserfs_write_lock(s); if (!inode) return ERR_PTR(-ENOMEM); @@ -2538,6 +2541,12 @@ static int reiserfs_writepage(struct page *page, struct writeback_control *wbc) return reiserfs_write_full_page(page, wbc); } +static void reiserfs_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + reiserfs_truncate_file(inode, 0); +} + static int reiserfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -2604,6 +2613,8 @@ static int reiserfs_write_begin(struct file *file, if (ret) { unlock_page(page); page_cache_release(page); + /* Truncate allocated blocks */ + reiserfs_truncate_failed_write(inode); } return ret; } @@ -2701,9 +2712,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, ** transaction tracking stuff when the size changes. So, we have ** to do the i_size updates here. */ - pos += copied; - - if (pos > inode->i_size) { + if (pos + copied > inode->i_size) { struct reiserfs_transaction_handle myth; lock_depth = reiserfs_write_lock_once(inode->i_sb); locked = true; @@ -2721,7 +2730,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, goto journal_error; reiserfs_update_inode_transaction(inode); - inode->i_size = pos; + inode->i_size = pos + copied; /* * this will just nest into our transaction. It's important * to use mark_inode_dirty so the inode gets pushed around on the @@ -2751,6 +2760,10 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, reiserfs_write_unlock_once(inode->i_sb, lock_depth); unlock_page(page); page_cache_release(page); + + if (pos + len > inode->i_size) + reiserfs_truncate_failed_write(inode); + return ret == 0 ? copied : ret; journal_error: @@ -3051,13 +3064,14 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; - int error; unsigned int ia_valid; + int depth; + int error; /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - reiserfs_write_lock(inode->i_sb); + depth = reiserfs_write_lock_once(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { /* version 2 items will be caught by the s_maxbytes check ** done for us in vmtruncate @@ -3138,8 +3152,17 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) journal_end(&th, inode->i_sb, jbegin_count); } } - if (!error) + if (!error) { + /* + * Relax the lock here, as it might truncate the + * inode pages and wait for inode pages locks. + * To release such page lock, the owner needs the + * reiserfs lock + */ + reiserfs_write_unlock_once(inode->i_sb, depth); error = inode_setattr(inode, attr); + depth = reiserfs_write_lock_once(inode->i_sb); + } } if (!error && reiserfs_posixacl(inode->i_sb)) { @@ -3148,7 +3171,8 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) } out: - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, depth); + return error; } diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index ace7745..f53505d 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -104,9 +104,10 @@ setflags_out: err = put_user(inode->i_generation, (int __user *)arg); break; case REISERFS_IOC_SETVERSION: - if (!is_owner_or_cap(inode)) + if (!is_owner_or_cap(inode)) { err = -EPERM; break; + } err = mnt_want_write(filp->f_path.mnt); if (err) break; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 2f8a7e7..ba98546 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2009,10 +2009,11 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, destroy_workqueue(commit_wq); commit_wq = NULL; } - reiserfs_write_lock(sb); free_journal_ram(sb); + reiserfs_write_lock(sb); + return 0; } @@ -2758,11 +2759,18 @@ int journal_init(struct super_block *sb, const char *j_dev_name, struct reiserfs_journal *journal; struct reiserfs_journal_list *jl; char b[BDEVNAME_SIZE]; + int ret; + /* + * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS + * dependency inversion warnings. + */ + reiserfs_write_unlock(sb); journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal)); if (!journal) { reiserfs_warning(sb, "journal-1256", "unable to get memory for journal structure"); + reiserfs_write_lock(sb); return 1; } memset(journal, 0, sizeof(struct reiserfs_journal)); @@ -2771,10 +2779,12 @@ int journal_init(struct super_block *sb, const char *j_dev_name, INIT_LIST_HEAD(&journal->j_working_list); INIT_LIST_HEAD(&journal->j_journal_list); journal->j_persistent_trans = 0; - if (reiserfs_allocate_list_bitmaps(sb, - journal->j_list_bitmap, - reiserfs_bmap_count(sb))) + ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, + reiserfs_bmap_count(sb)); + reiserfs_write_lock(sb); + if (ret) goto free_and_return; + allocate_bitmap_nodes(sb); /* reserved for journal area support */ @@ -2903,7 +2913,9 @@ int journal_init(struct super_block *sb, const char *j_dev_name, journal->j_mount_id = 10; journal->j_state = 0; atomic_set(&(journal->j_jlock), 0); + reiserfs_write_unlock(sb); journal->j_cnode_free_list = allocate_cnodes(num_cnodes); + reiserfs_write_lock(sb); journal->j_cnode_free_orig = journal->j_cnode_free_list; journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; journal->j_cnode_used = 0; diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c index ee2cfc0..b87aa2c 100644 --- a/fs/reiserfs/lock.c +++ b/fs/reiserfs/lock.c @@ -86,3 +86,12 @@ void reiserfs_check_lock_depth(struct super_block *sb, char *caller) reiserfs_panic(sb, "%s called without kernel lock held %d", caller); } + +#ifdef CONFIG_REISERFS_CHECK +void reiserfs_lock_check_recursive(struct super_block *sb) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); + + WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n"); +} +#endif diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index e296ff7..9d4dcf0 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -921,6 +921,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) struct reiserfs_transaction_handle th; int jbegin_count; unsigned long savelink; + int depth; inode = dentry->d_inode; @@ -932,7 +933,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - reiserfs_write_lock(dir->i_sb); + depth = reiserfs_write_lock_once(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) goto out_unlink; @@ -993,7 +994,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) retval = journal_end(&th, dir->i_sb, jbegin_count); reiserfs_check_path(&path); - reiserfs_write_unlock(dir->i_sb); + reiserfs_write_unlock_once(dir->i_sb, depth); return retval; end_unlink: @@ -1003,7 +1004,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) if (err) retval = err; out_unlink: - reiserfs_write_unlock(dir->i_sb); + reiserfs_write_unlock_once(dir->i_sb, depth); return retval; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 58aa8e7..81f09fa 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -48,6 +48,7 @@ #include <net/checksum.h> #include <linux/stat.h> #include <linux/quotaops.h> +#include <linux/security.h> #define PRIVROOT_NAME ".reiserfs_priv" #define XAROOT_NAME "xattrs" @@ -82,7 +83,8 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) BUG_ON(!mutex_is_locked(&dir->i_mutex)); vfs_dq_init(dir); - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, + I_MUTEX_CHILD, dir->i_sb); error = dir->i_op->unlink(dir, dentry); mutex_unlock(&dentry->d_inode->i_mutex); @@ -97,7 +99,8 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) BUG_ON(!mutex_is_locked(&dir->i_mutex)); vfs_dq_init(dir); - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, + I_MUTEX_CHILD, dir->i_sb); dentry_unhash(dentry); error = dir->i_op->rmdir(dir, dentry); if (!error) @@ -234,16 +237,22 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) return 0; + reiserfs_write_unlock(inode->i_sb); dir = open_xa_dir(inode, XATTR_REPLACE); if (IS_ERR(dir)) { err = PTR_ERR(dir); + reiserfs_write_lock(inode->i_sb); goto out; } else if (!dir->d_inode) { err = 0; + reiserfs_write_lock(inode->i_sb); goto out_dir; } mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); + + reiserfs_write_lock(inode->i_sb); + buf.xadir = dir; err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos); while ((err == 0 || err == -ENOSPC) && buf.count) { @@ -282,8 +291,9 @@ static int reiserfs_for_each_xattr(struct inode *inode, err = journal_begin(&th, inode->i_sb, blocks); if (!err) { int jerror; - mutex_lock_nested(&dir->d_parent->d_inode->i_mutex, - I_MUTEX_XATTR); + reiserfs_mutex_lock_nested_safe( + &dir->d_parent->d_inode->i_mutex, + I_MUTEX_XATTR, inode->i_sb); err = action(dir, data); jerror = journal_end(&th, inode->i_sb, blocks); mutex_unlock(&dir->d_parent->d_inode->i_mutex); @@ -442,7 +452,9 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) } if (dentry->d_inode) { + reiserfs_write_lock(inode->i_sb); err = xattr_unlink(xadir->d_inode, dentry); + reiserfs_write_unlock(inode->i_sb); update_ctime(inode); } @@ -476,15 +488,24 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, if (get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; - if (!buffer) - return lookup_and_delete_xattr(inode, name); + reiserfs_write_unlock(inode->i_sb); + + if (!buffer) { + err = lookup_and_delete_xattr(inode, name); + reiserfs_write_lock(inode->i_sb); + return err; + } dentry = xattr_lookup(inode, name, flags); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + reiserfs_write_lock(inode->i_sb); return PTR_ERR(dentry); + } down_write(&REISERFS_I(inode)->i_xattr_sem); + reiserfs_write_lock(inode->i_sb); + xahash = xattr_hash(buffer, buffer_size); while (buffer_pos < buffer_size || buffer_pos == 0) { size_t chunk; @@ -539,8 +560,12 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, .ia_size = buffer_size, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; + + reiserfs_write_unlock(inode->i_sb); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); down_write(&dentry->d_inode->i_alloc_sem); + reiserfs_write_lock(inode->i_sb); + err = reiserfs_setattr(dentry, &newattrs); up_write(&dentry->d_inode->i_alloc_sem); mutex_unlock(&dentry->d_inode->i_mutex); @@ -726,15 +751,14 @@ ssize_t reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, size_t size) { - struct inode *inode = dentry->d_inode; struct xattr_handler *handler; - handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name); + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - return handler->get(inode, name, buffer, size); + return handler->get(dentry, name, buffer, size, handler->flags); } /* @@ -746,15 +770,14 @@ int reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct inode *inode = dentry->d_inode; struct xattr_handler *handler; - handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name); + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - return handler->set(inode, name, value, size, flags); + return handler->set(dentry, name, value, size, flags, handler->flags); } /* @@ -764,21 +787,20 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, */ int reiserfs_removexattr(struct dentry *dentry, const char *name) { - struct inode *inode = dentry->d_inode; struct xattr_handler *handler; - handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name); + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - return handler->set(inode, name, NULL, 0, XATTR_REPLACE); + return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags); } struct listxattr_buf { size_t size; size_t pos; char *buf; - struct inode *inode; + struct dentry *dentry; }; static int listxattr_filler(void *buf, const char *name, int namelen, @@ -789,17 +811,19 @@ static int listxattr_filler(void *buf, const char *name, int namelen, if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) { struct xattr_handler *handler; - handler = find_xattr_handler_prefix(b->inode->i_sb->s_xattr, + handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, name); if (!handler) /* Unsupported xattr name */ return 0; if (b->buf) { - size = handler->list(b->inode, b->buf + b->pos, - b->size, name, namelen); + size = handler->list(b->dentry, b->buf + b->pos, + b->size, name, namelen, + handler->flags); if (size > b->size) return -ERANGE; } else { - size = handler->list(b->inode, NULL, 0, name, namelen); + size = handler->list(b->dentry, NULL, 0, name, + namelen, handler->flags); } b->pos += size; @@ -820,7 +844,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) int err = 0; loff_t pos = 0; struct listxattr_buf buf = { - .inode = dentry->d_inode, + .dentry = dentry, .buf = buffer, .size = buffer ? size : 0, }; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 35d6e67..dd20a78 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -15,8 +15,10 @@ static int reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct posix_acl *acl); static int -xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) +posix_acl_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; struct posix_acl *acl; int error, error2; struct reiserfs_transaction_handle th; @@ -60,15 +62,16 @@ xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) } static int -xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +posix_acl_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) { struct posix_acl *acl; int error; - if (!reiserfs_posixacl(inode->i_sb)) + if (!reiserfs_posixacl(dentry->d_sb)) return -EOPNOTSUPP; - acl = reiserfs_get_acl(inode, type); + acl = reiserfs_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -452,7 +455,9 @@ int reiserfs_acl_chmod(struct inode *inode) return 0; } + reiserfs_write_unlock(inode->i_sb); acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); + reiserfs_write_lock(inode->i_sb); if (!acl) return 0; if (IS_ERR(acl)) @@ -482,30 +487,12 @@ int reiserfs_acl_chmod(struct inode *inode) return error; } -static int -posix_acl_access_get(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1) - return -EINVAL; - return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int -posix_acl_access_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1) - return -EINVAL; - return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static size_t posix_acl_access_list(struct inode *inode, char *list, +static size_t posix_acl_access_list(struct dentry *dentry, char *list, size_t list_size, const char *name, - size_t name_len) + size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!reiserfs_posixacl(inode->i_sb)) + if (!reiserfs_posixacl(dentry->d_sb)) return 0; if (list && size <= list_size) memcpy(list, POSIX_ACL_XATTR_ACCESS, size); @@ -514,35 +501,18 @@ static size_t posix_acl_access_list(struct inode *inode, char *list, struct xattr_handler reiserfs_posix_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, - .get = posix_acl_access_get, - .set = posix_acl_access_set, + .flags = ACL_TYPE_ACCESS, + .get = posix_acl_get, + .set = posix_acl_set, .list = posix_acl_access_list, }; -static int -posix_acl_default_get(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1) - return -EINVAL; - return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int -posix_acl_default_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1) - return -EINVAL; - return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - -static size_t posix_acl_default_list(struct inode *inode, char *list, +static size_t posix_acl_default_list(struct dentry *dentry, char *list, size_t list_size, const char *name, - size_t name_len) + size_t name_len, int type) { const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!reiserfs_posixacl(inode->i_sb)) + if (!reiserfs_posixacl(dentry->d_sb)) return 0; if (list && size <= list_size) memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); @@ -551,7 +521,8 @@ static size_t posix_acl_default_list(struct inode *inode, char *list, struct xattr_handler reiserfs_posix_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, - .get = posix_acl_default_get, - .set = posix_acl_default_set, + .flags = ACL_TYPE_DEFAULT, + .get = posix_acl_get, + .set = posix_acl_set, .list = posix_acl_default_list, }; diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index a92c879..d8b5bfc 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -8,36 +8,37 @@ #include <asm/uaccess.h> static int -security_get(struct inode *inode, const char *name, void *buffer, size_t size) +security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) { if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; - if (IS_PRIVATE(inode)) + if (IS_PRIVATE(dentry->d_inode)) return -EPERM; - return reiserfs_xattr_get(inode, name, buffer, size); + return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); } static int -security_set(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +security_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) { if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; - if (IS_PRIVATE(inode)) + if (IS_PRIVATE(dentry->d_inode)) return -EPERM; - return reiserfs_xattr_set(inode, name, buffer, size, flags); + return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); } -static size_t security_list(struct inode *inode, char *list, size_t list_len, - const char *name, size_t namelen) +static size_t security_list(struct dentry *dentry, char *list, size_t list_len, + const char *name, size_t namelen, int handler_flags) { const size_t len = namelen + 1; - if (IS_PRIVATE(inode)) + if (IS_PRIVATE(dentry->d_inode)) return 0; if (list && len <= list_len) { diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index a865042..5b08aac 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -8,36 +8,37 @@ #include <asm/uaccess.h> static int -trusted_get(struct inode *inode, const char *name, void *buffer, size_t size) +trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) { if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) return -EPERM; - return reiserfs_xattr_get(inode, name, buffer, size); + return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); } static int -trusted_set(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +trusted_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) { if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) return -EPERM; - return reiserfs_xattr_set(inode, name, buffer, size, flags); + return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); } -static size_t trusted_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int handler_flags) { const size_t len = name_len + 1; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) return 0; if (list && len <= list_size) { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index e3238dc..75d59c4 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -7,34 +7,35 @@ #include <asm/uaccess.h> static int -user_get(struct inode *inode, const char *name, void *buffer, size_t size) +user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, + int handler_flags) { if (strlen(name) < sizeof(XATTR_USER_PREFIX)) return -EINVAL; - if (!reiserfs_xattrs_user(inode->i_sb)) + if (!reiserfs_xattrs_user(dentry->d_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_get(inode, name, buffer, size); + return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); } static int -user_set(struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +user_set(struct dentry *dentry, const char *name, const void *buffer, + size_t size, int flags, int handler_flags) { if (strlen(name) < sizeof(XATTR_USER_PREFIX)) return -EINVAL; - if (!reiserfs_xattrs_user(inode->i_sb)) + if (!reiserfs_xattrs_user(dentry->d_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_set(inode, name, buffer, size, flags); + return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); } -static size_t user_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +static size_t user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int handler_flags) { const size_t len = name_len + 1; - if (!reiserfs_xattrs_user(inode->i_sb)) + if (!reiserfs_xattrs_user(dentry->d_sb)) return 0; if (list && len <= list_size) { memcpy(list, name, name_len); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index c117fa8..42d2135 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -544,6 +544,7 @@ error: error_rsb_inval: ret = -EINVAL; error_rsb: + kfree(rsb); return ret; } diff --git a/fs/signalfd.c b/fs/signalfd.c index b07565c..1dabe4e 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -236,7 +236,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, * anon_inode_getfd() will install the fd. */ ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx, - flags & (O_CLOEXEC | O_NONBLOCK)); + O_RDWR | (flags & (O_CLOEXEC | O_NONBLOCK))); if (ufd < 0) kfree(ctx); } else { @@ -7,18 +7,63 @@ * This function cannot be inlined since i_size_{read,write} is rather * heavy-weight on 32-bit systems */ -void fsstack_copy_inode_size(struct inode *dst, const struct inode *src) +void fsstack_copy_inode_size(struct inode *dst, struct inode *src) { - i_size_write(dst, i_size_read((struct inode *)src)); - dst->i_blocks = src->i_blocks; + loff_t i_size; + blkcnt_t i_blocks; + + /* + * i_size_read() includes its own seqlocking and protection from + * preemption (see include/linux/fs.h): we need nothing extra for + * that here, and prefer to avoid nesting locks than attempt to keep + * i_size and i_blocks in sync together. + */ + i_size = i_size_read(src); + + /* + * But if CONFIG_LBDAF (on 32-bit), we ought to make an effort to + * keep the two halves of i_blocks in sync despite SMP or PREEMPT - + * though stat's generic_fillattr() doesn't bother, and we won't be + * applying quotas (where i_blocks does become important) at the + * upper level. + * + * We don't actually know what locking is used at the lower level; + * but if it's a filesystem that supports quotas, it will be using + * i_lock as in inode_add_bytes(). tmpfs uses other locking, and + * its 32-bit is (just) able to exceed 2TB i_size with the aid of + * holes; but its i_blocks cannot carry into the upper long without + * almost 2TB swap - let's ignore that case. + */ + if (sizeof(i_blocks) > sizeof(long)) + spin_lock(&src->i_lock); + i_blocks = src->i_blocks; + if (sizeof(i_blocks) > sizeof(long)) + spin_unlock(&src->i_lock); + + /* + * If CONFIG_SMP or CONFIG_PREEMPT on 32-bit, it's vital for + * fsstack_copy_inode_size() to hold some lock around + * i_size_write(), otherwise i_size_read() may spin forever (see + * include/linux/fs.h). We don't necessarily hold i_mutex when this + * is called, so take i_lock for that case. + * + * And if CONFIG_LBADF (on 32-bit), continue our effort to keep the + * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock + * for that case too, and do both at once by combining the tests. + * + * There is none of this locking overhead in the 64-bit case. + */ + if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long)) + spin_lock(&dst->i_lock); + i_size_write(dst, i_size); + dst->i_blocks = i_blocks; + if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long)) + spin_unlock(&dst->i_lock); } EXPORT_SYMBOL_GPL(fsstack_copy_inode_size); -/* copy all attributes; get_nlinks is optional way to override the i_nlink - * copying - */ -void fsstack_copy_attr_all(struct inode *dest, const struct inode *src, - int (*get_nlinks)(struct inode *)) +/* copy all attributes */ +void fsstack_copy_attr_all(struct inode *dest, const struct inode *src) { dest->i_mode = src->i_mode; dest->i_uid = src->i_uid; @@ -29,14 +74,6 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src, dest->i_ctime = src->i_ctime; dest->i_blkbits = src->i_blkbits; dest->i_flags = src->i_flags; - - /* - * Update the nlinks AFTER updating the above fields, because the - * get_links callback may depend on them. - */ - if (!get_nlinks) - dest->i_nlink = src->i_nlink; - else - dest->i_nlink = (*get_nlinks)(dest); + dest->i_nlink = src->i_nlink; } EXPORT_SYMBOL_GPL(fsstack_copy_attr_all); @@ -401,9 +401,9 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename, } #endif /* __ARCH_WANT_STAT64 */ -void inode_add_bytes(struct inode *inode, loff_t bytes) +/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ +void __inode_add_bytes(struct inode *inode, loff_t bytes) { - spin_lock(&inode->i_lock); inode->i_blocks += bytes >> 9; bytes &= 511; inode->i_bytes += bytes; @@ -411,6 +411,12 @@ void inode_add_bytes(struct inode *inode, loff_t bytes) inode->i_blocks++; inode->i_bytes -= 512; } +} + +void inode_add_bytes(struct inode *inode, loff_t bytes) +{ + spin_lock(&inode->i_lock); + __inode_add_bytes(inode, bytes); spin_unlock(&inode->i_lock); } @@ -901,8 +901,9 @@ int get_sb_single(struct file_system_type *fs_type, return error; } s->s_flags |= MS_ACTIVE; + } else { + do_remount_sb(s, flags, data, 0); } - do_remount_sb(s, flags, data, 0); simple_set_mnt(mnt, s); return 0; } @@ -355,6 +355,7 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, { int ret; struct file *file; + struct address_space *mapping; loff_t endbyte; /* inclusive */ int fput_needed; umode_t i_mode; @@ -405,7 +406,28 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, !S_ISLNK(i_mode)) goto out_put; - ret = do_sync_mapping_range(file->f_mapping, offset, endbyte, flags); + mapping = file->f_mapping; + if (!mapping) { + ret = -EINVAL; + goto out_put; + } + + ret = 0; + if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { + ret = filemap_fdatawait_range(mapping, offset, endbyte); + if (ret < 0) + goto out_put; + } + + if (flags & SYNC_FILE_RANGE_WRITE) { + ret = filemap_fdatawrite_range(mapping, offset, endbyte); + if (ret < 0) + goto out_put; + } + + if (flags & SYNC_FILE_RANGE_WAIT_AFTER) + ret = filemap_fdatawait_range(mapping, offset, endbyte); + out_put: fput_light(file, fput_needed); out: @@ -437,38 +459,3 @@ asmlinkage long SyS_sync_file_range2(long fd, long flags, } SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2); #endif - -/* - * `endbyte' is inclusive - */ -int do_sync_mapping_range(struct address_space *mapping, loff_t offset, - loff_t endbyte, unsigned int flags) -{ - int ret; - - if (!mapping) { - ret = -EINVAL; - goto out; - } - - ret = 0; - if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { - ret = filemap_fdatawait_range(mapping, offset, endbyte); - if (ret < 0) - goto out; - } - - if (flags & SYNC_FILE_RANGE_WRITE) { - ret = __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_ALL); - if (ret < 0) - goto out; - } - - if (flags & SYNC_FILE_RANGE_WAIT_AFTER) { - ret = filemap_fdatawait_range(mapping, offset, endbyte); - } -out: - return ret; -} -EXPORT_SYMBOL_GPL(do_sync_mapping_range); diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 60c702b..a0a500a 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -483,7 +483,8 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd) * @attr: attribute descriptor. */ -int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) +int sysfs_create_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) { BUG_ON(!kobj || !kobj->sd || !attr); @@ -497,7 +498,8 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) * @attr: attribute descriptor. */ -void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) +void sysfs_remove_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) { sysfs_hash_and_remove(kobj->sd, attr->attr.name); } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index f05f230..699f371 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -106,8 +106,10 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) return NULL; t = atomic_cmpxchg(&sd->s_active, v, v + 1); - if (likely(t == v)) + if (likely(t == v)) { + rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); return sd; + } if (t < 0) return NULL; @@ -130,6 +132,7 @@ static void sysfs_put_active(struct sysfs_dirent *sd) if (unlikely(!sd)) return; + rwsem_release(&sd->dep_map, 1, _RET_IP_); v = atomic_dec_return(&sd->s_active); if (likely(v != SD_DEACTIVATED_BIAS)) return; @@ -194,15 +197,21 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); sd->s_sibling = (void *)&wait; + rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); /* atomic_add_return() is a mb(), put_active() will always see * the updated sd->s_sibling. */ v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); - if (v != SD_DEACTIVATED_BIAS) + if (v != SD_DEACTIVATED_BIAS) { + lock_contended(&sd->dep_map, _RET_IP_); wait_for_completion(&wait); + } sd->s_sibling = NULL; + + lock_acquired(&sd->dep_map, _RET_IP_); + rwsem_release(&sd->dep_map, 1, _RET_IP_); } static int sysfs_alloc_ino(ino_t *pino) @@ -345,6 +354,7 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) atomic_set(&sd->s_count, 1); atomic_set(&sd->s_active, 0); + sysfs_dirent_init_lockdep(sd); sd->s_name = name; sd->s_mode = mode; diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 220b758..6a06a1d 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -81,24 +81,23 @@ int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr) if (!sd_attrs) return -ENOMEM; sd->s_iattr = sd_attrs; - } else { - /* attributes were changed at least once in past */ - iattrs = &sd_attrs->ia_iattr; - - if (ia_valid & ATTR_UID) - iattrs->ia_uid = iattr->ia_uid; - if (ia_valid & ATTR_GID) - iattrs->ia_gid = iattr->ia_gid; - if (ia_valid & ATTR_ATIME) - iattrs->ia_atime = iattr->ia_atime; - if (ia_valid & ATTR_MTIME) - iattrs->ia_mtime = iattr->ia_mtime; - if (ia_valid & ATTR_CTIME) - iattrs->ia_ctime = iattr->ia_ctime; - if (ia_valid & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - iattrs->ia_mode = sd->s_mode = mode; - } + } + /* attributes were changed at least once in past */ + iattrs = &sd_attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = iattr->ia_atime; + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = iattr->ia_mtime; + if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = iattr->ia_ctime; + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + iattrs->ia_mode = sd->s_mode = mode; } return 0; } diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index ca52e7b..cdd9377 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -8,6 +8,7 @@ * This file is released under the GPLv2. */ +#include <linux/lockdep.h> #include <linux/fs.h> struct sysfs_open_dirent; @@ -50,6 +51,9 @@ struct sysfs_inode_attrs { struct sysfs_dirent { atomic_t s_count; atomic_t s_active; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif struct sysfs_dirent *s_parent; struct sysfs_dirent *s_sibling; const char *s_name; @@ -84,6 +88,17 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd) return sd->s_flags & SYSFS_TYPE_MASK; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#define sysfs_dirent_init_lockdep(sd) \ +do { \ + static struct lock_class_key __key; \ + \ + lockdep_init_map(&sd->dep_map, "s_active", &__key, 0); \ +} while(0) +#else +#define sysfs_dirent_init_lockdep(sd) do {} while(0) +#endif + /* * Context structure to be used while adding/removing nodes. */ diff --git a/fs/timerfd.c b/fs/timerfd.c index b042bd7..1bfc95a 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -200,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, - flags & TFD_SHARED_FCNTL_FLAGS); + O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); if (ufd < 0) kfree(ctx); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 39849f8..16a6444 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -45,7 +45,7 @@ * * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the * read-ahead path does not lock it ("sys_read -> generic_file_aio_read -> - * ondemand_readahead -> readpage"). In case of readahead, @I_LOCK flag is not + * ondemand_readahead -> readpage"). In case of readahead, @I_SYNC flag is not * set as well. However, UBIFS disables readahead. */ diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 618c270..e5a3d8e 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -54,6 +54,7 @@ */ #include <linux/pagemap.h> +#include <linux/list_sort.h> #include "ubifs.h" /* @@ -108,101 +109,6 @@ static int switch_gc_head(struct ubifs_info *c) } /** - * list_sort - sort a list. - * @priv: private data, passed to @cmp - * @head: the list to sort - * @cmp: the elements comparison function - * - * This function has been implemented by Mark J Roberts <mjr@znex.org>. It - * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted - * in ascending order. - * - * The comparison function @cmp is supposed to return a negative value if @a is - * than @b, and a positive value if @a is greater than @b. If @a and @b are - * equivalent, then it does not matter what this function returns. - */ -static void list_sort(void *priv, struct list_head *head, - int (*cmp)(void *priv, struct list_head *a, - struct list_head *b)) -{ - struct list_head *p, *q, *e, *list, *tail, *oldhead; - int insize, nmerges, psize, qsize, i; - - if (list_empty(head)) - return; - - list = head->next; - list_del(head); - insize = 1; - for (;;) { - p = oldhead = list; - list = tail = NULL; - nmerges = 0; - - while (p) { - nmerges++; - q = p; - psize = 0; - for (i = 0; i < insize; i++) { - psize++; - q = q->next == oldhead ? NULL : q->next; - if (!q) - break; - } - - qsize = insize; - while (psize > 0 || (qsize > 0 && q)) { - if (!psize) { - e = q; - q = q->next; - qsize--; - if (q == oldhead) - q = NULL; - } else if (!qsize || !q) { - e = p; - p = p->next; - psize--; - if (p == oldhead) - p = NULL; - } else if (cmp(priv, p, q) <= 0) { - e = p; - p = p->next; - psize--; - if (p == oldhead) - p = NULL; - } else { - e = q; - q = q->next; - qsize--; - if (q == oldhead) - q = NULL; - } - if (tail) - tail->next = e; - else - list = e; - e->prev = tail; - tail = e; - } - p = q; - } - - tail->next = list; - list->prev = tail; - - if (nmerges <= 1) - break; - - insize *= 2; - } - - head->next = list; - head->prev = list->prev; - list->prev->next = head; - list->prev = head; -} - -/** * data_nodes_cmp - compare 2 data nodes. * @priv: UBIFS file-system description object * @a: first data node @@ -615,12 +615,11 @@ ssize_t generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) { struct xattr_handler *handler; - struct inode *inode = dentry->d_inode; - handler = xattr_resolve_name(inode->i_sb->s_xattr, &name); + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) return -EOPNOTSUPP; - return handler->get(inode, name, buffer, size); + return handler->get(dentry, name, buffer, size, handler->flags); } /* @@ -630,18 +629,20 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; - struct xattr_handler *handler, **handlers = inode->i_sb->s_xattr; + struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; unsigned int size = 0; if (!buffer) { - for_each_xattr_handler(handlers, handler) - size += handler->list(inode, NULL, 0, NULL, 0); + for_each_xattr_handler(handlers, handler) { + size += handler->list(dentry, NULL, 0, NULL, 0, + handler->flags); + } } else { char *buf = buffer; for_each_xattr_handler(handlers, handler) { - size = handler->list(inode, buf, buffer_size, NULL, 0); + size = handler->list(dentry, buf, buffer_size, + NULL, 0, handler->flags); if (size > buffer_size) return -ERANGE; buf += size; @@ -659,14 +660,13 @@ int generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct xattr_handler *handler; - struct inode *inode = dentry->d_inode; if (size == 0) value = ""; /* empty EA, do not remove */ - handler = xattr_resolve_name(inode->i_sb->s_xattr, &name); + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) return -EOPNOTSUPP; - return handler->set(inode, name, value, size, flags); + return handler->set(dentry, name, value, size, 0, handler->flags); } /* @@ -677,12 +677,12 @@ int generic_removexattr(struct dentry *dentry, const char *name) { struct xattr_handler *handler; - struct inode *inode = dentry->d_inode; - handler = xattr_resolve_name(inode->i_sb->s_xattr, &name); + handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) return -EOPNOTSUPP; - return handler->set(inode, name, NULL, 0, XATTR_REPLACE); + return handler->set(dentry, name, NULL, 0, + XATTR_REPLACE, handler->flags); } EXPORT_SYMBOL(generic_getxattr); diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index 69e598b..883ca5a 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -251,8 +251,9 @@ xfs_set_mode(struct inode *inode, mode_t mode) if (mode != inode->i_mode) { struct iattr iattr; - iattr.ia_valid = ATTR_MODE; + iattr.ia_valid = ATTR_MODE | ATTR_CTIME; iattr.ia_mode = mode; + iattr.ia_ctime = current_fs_time(inode->i_sb); error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL); } @@ -354,37 +355,14 @@ xfs_acl_chmod(struct inode *inode) return error; } -/* - * System xattr handlers. - * - * Currently Posix ACLs are the only system namespace extended attribute - * handlers supported by XFS, so we just implement the handlers here. - * If we ever support other system extended attributes this will need - * some refactoring. - */ - static int -xfs_decode_acl(const char *name) -{ - if (strcmp(name, "posix_acl_access") == 0) - return ACL_TYPE_ACCESS; - else if (strcmp(name, "posix_acl_default") == 0) - return ACL_TYPE_DEFAULT; - return -EINVAL; -} - -static int -xfs_xattr_system_get(struct inode *inode, const char *name, - void *value, size_t size) +xfs_xattr_acl_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) { struct posix_acl *acl; - int type, error; - - type = xfs_decode_acl(name); - if (type < 0) - return type; + int error; - acl = xfs_get_acl(inode, type); + acl = xfs_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -397,15 +375,13 @@ xfs_xattr_system_get(struct inode *inode, const char *name, } static int -xfs_xattr_system_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +xfs_xattr_acl_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { + struct inode *inode = dentry->d_inode; struct posix_acl *acl = NULL; - int error = 0, type; + int error = 0; - type = xfs_decode_acl(name); - if (type < 0) - return type; if (flags & XATTR_CREATE) return -EINVAL; if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) @@ -462,8 +438,16 @@ xfs_xattr_system_set(struct inode *inode, const char *name, return error; } -struct xattr_handler xfs_xattr_system_handler = { - .prefix = XATTR_SYSTEM_PREFIX, - .get = xfs_xattr_system_get, - .set = xfs_xattr_system_set, +struct xattr_handler xfs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .get = xfs_xattr_acl_get, + .set = xfs_xattr_acl_set, +}; + +struct xattr_handler xfs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .get = xfs_xattr_acl_get, + .set = xfs_xattr_acl_set, }; diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index b4c7d42..77b8be8 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -292,6 +292,7 @@ _xfs_buf_free_pages( { if (bp->b_pages != bp->b_page_array) { kmem_free(bp->b_pages); + bp->b_pages = NULL; } } @@ -323,9 +324,8 @@ xfs_buf_free( ASSERT(!PagePrivate(page)); page_cache_release(page); } - _xfs_buf_free_pages(bp); } - + _xfs_buf_free_pages(bp); xfs_buf_deallocate(bp); } @@ -1149,10 +1149,14 @@ _xfs_buf_ioapply( if (bp->b_flags & XBF_ORDERED) { ASSERT(!(bp->b_flags & XBF_READ)); rw = WRITE_BARRIER; - } else if (bp->b_flags & _XBF_RUN_QUEUES) { + } else if (bp->b_flags & XBF_LOG_BUFFER) { ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); bp->b_flags &= ~_XBF_RUN_QUEUES; rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC; + } else if (bp->b_flags & _XBF_RUN_QUEUES) { + ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); + bp->b_flags &= ~_XBF_RUN_QUEUES; + rw = (bp->b_flags & XBF_WRITE) ? WRITE_META : READ_META; } else { rw = (bp->b_flags & XBF_WRITE) ? WRITE : (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index a509f4a..a34c7b5 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -55,6 +55,7 @@ typedef enum { XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ XBF_ORDERED = (1 << 11), /* use ordered writes */ XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ + XBF_LOG_BUFFER = (1 << 13), /* this is a buffer used for the log */ /* flags used only as arguments to access routines */ XBF_LOCK = (1 << 14), /* lock requested */ diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 1d5b298..2259460 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -794,7 +794,7 @@ xfs_setup_inode( struct inode *inode = &ip->i_vnode; inode->i_ino = ip->i_ino; - inode->i_state = I_NEW|I_LOCK; + inode->i_state = I_NEW; inode_add_to_lists(ip->i_mount->m_super, inode); inode->i_mode = ip->i_d.di_mode; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 09783cc..77414db 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -954,16 +954,14 @@ xfs_fs_destroy_inode( ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); /* - * If we have nothing to flush with this inode then complete the - * teardown now, otherwise delay the flush operation. + * We always use background reclaim here because even if the + * inode is clean, it still may be under IO and hence we have + * to take the flush lock. The background reclaim path handles + * this more efficiently than we can here, so simply let background + * reclaim tear down all inodes. */ - if (!xfs_inode_clean(ip)) { - xfs_inode_set_reclaim_tag(ip); - return; - } - out_reclaim: - xfs_ireclaim(ip); + xfs_inode_set_reclaim_tag(ip); } /* diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 6fed97a..1f5e4bb 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -65,7 +65,6 @@ xfs_inode_ag_lookup( * as the tree is sparse and a gang lookup walks to find * the number of objects requested. */ - read_lock(&pag->pag_ici_lock); if (tag == XFS_ICI_NO_TAG) { nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)&ip, *first_index, 1); @@ -74,7 +73,7 @@ xfs_inode_ag_lookup( (void **)&ip, *first_index, 1, tag); } if (!nr_found) - goto unlock; + return NULL; /* * Update the index for the next lookup. Catch overflows @@ -84,13 +83,8 @@ xfs_inode_ag_lookup( */ *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - goto unlock; - + return NULL; return ip; - -unlock: - read_unlock(&pag->pag_ici_lock); - return NULL; } STATIC int @@ -100,7 +94,8 @@ xfs_inode_ag_walk( int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags, - int tag) + int tag, + int exclusive) { struct xfs_perag *pag = &mp->m_perag[ag]; uint32_t first_index; @@ -114,10 +109,20 @@ restart: int error = 0; xfs_inode_t *ip; + if (exclusive) + write_lock(&pag->pag_ici_lock); + else + read_lock(&pag->pag_ici_lock); ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); - if (!ip) + if (!ip) { + if (exclusive) + write_unlock(&pag->pag_ici_lock); + else + read_unlock(&pag->pag_ici_lock); break; + } + /* execute releases pag->pag_ici_lock */ error = execute(ip, pag, flags); if (error == EAGAIN) { skipped++; @@ -125,9 +130,8 @@ restart: } if (error) last_error = error; - /* - * bail out if the filesystem is corrupted. - */ + + /* bail out if the filesystem is corrupted. */ if (error == EFSCORRUPTED) break; @@ -148,7 +152,8 @@ xfs_inode_ag_iterator( int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags, - int tag) + int tag, + int exclusive) { int error = 0; int last_error = 0; @@ -157,7 +162,8 @@ xfs_inode_ag_iterator( for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { if (!mp->m_perag[ag].pag_ici_init) continue; - error = xfs_inode_ag_walk(mp, ag, execute, flags, tag); + error = xfs_inode_ag_walk(mp, ag, execute, flags, tag, + exclusive); if (error) { last_error = error; if (error == EFSCORRUPTED) @@ -174,30 +180,31 @@ xfs_sync_inode_valid( struct xfs_perag *pag) { struct inode *inode = VFS_I(ip); + int error = EFSCORRUPTED; /* nothing to sync during shutdown */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - read_unlock(&pag->pag_ici_lock); - return EFSCORRUPTED; - } + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + goto out_unlock; - /* - * If we can't get a reference on the inode, it must be in reclaim. - * Leave it for the reclaim code to flush. Also avoid inodes that - * haven't been fully initialised. - */ - if (!igrab(inode)) { - read_unlock(&pag->pag_ici_lock); - return ENOENT; - } - read_unlock(&pag->pag_ici_lock); + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + error = ENOENT; + if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock; - if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) { + /* If we can't grab the inode, it must on it's way to reclaim. */ + if (!igrab(inode)) + goto out_unlock; + + if (is_bad_inode(inode)) { IRELE(ip); - return ENOENT; + goto out_unlock; } - return 0; + /* inode is valid */ + error = 0; +out_unlock: + read_unlock(&pag->pag_ici_lock); + return error; } STATIC int @@ -282,7 +289,7 @@ xfs_sync_data( ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, - XFS_ICI_NO_TAG); + XFS_ICI_NO_TAG, 0); if (error) return XFS_ERROR(error); @@ -304,7 +311,7 @@ xfs_sync_attr( ASSERT((flags & ~SYNC_WAIT) == 0); return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, - XFS_ICI_NO_TAG); + XFS_ICI_NO_TAG, 0); } STATIC int @@ -664,60 +671,6 @@ xfs_syncd_stop( kthread_stop(mp->m_sync_task); } -STATIC int -xfs_reclaim_inode( - xfs_inode_t *ip, - int sync_mode) -{ - xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); - - /* The hash lock here protects a thread in xfs_iget_core from - * racing with us on linking the inode back with a vnode. - * Once we have the XFS_IRECLAIM flag set it will not touch - * us. - */ - write_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - if (__xfs_iflags_test(ip, XFS_IRECLAIM) || - !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { - spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); - return -EAGAIN; - } - __xfs_iflags_set(ip, XFS_IRECLAIM); - spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); - xfs_put_perag(ip->i_mount, pag); - - /* - * If the inode is still dirty, then flush it out. If the inode - * is not in the AIL, then it will be OK to flush it delwri as - * long as xfs_iflush() does not keep any references to the inode. - * We leave that decision up to xfs_iflush() since it has the - * knowledge of whether it's OK to simply do a delwri flush of - * the inode or whether we need to wait until the inode is - * pulled from the AIL. - * We get the flush lock regardless, though, just to make sure - * we don't free it while it is being flushed. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_iflock(ip); - - /* - * In the case of a forced shutdown we rely on xfs_iflush() to - * wait for the inode to be unpinned before returning an error. - */ - if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { - /* synchronize with xfs_iflush_done */ - xfs_iflock(ip); - xfs_ifunlock(ip); - } - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_ireclaim(ip); - return 0; -} - void __xfs_inode_set_reclaim_tag( struct xfs_perag *pag, @@ -760,19 +713,55 @@ __xfs_inode_clear_reclaim_tag( } STATIC int -xfs_reclaim_inode_now( +xfs_reclaim_inode( struct xfs_inode *ip, struct xfs_perag *pag, - int flags) + int sync_mode) { - /* ignore if already under reclaim */ - if (xfs_iflags_test(ip, XFS_IRECLAIM)) { - read_unlock(&pag->pag_ici_lock); + /* + * The radix tree lock here protects a thread in xfs_iget from racing + * with us starting reclaim on the inode. Once we have the + * XFS_IRECLAIM flag set it will not touch us. + */ + spin_lock(&ip->i_flags_lock); + ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* ignore as it is already under reclaim */ + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); return 0; } - read_unlock(&pag->pag_ici_lock); + __xfs_iflags_set(ip, XFS_IRECLAIM); + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); - return xfs_reclaim_inode(ip, flags); + /* + * If the inode is still dirty, then flush it out. If the inode + * is not in the AIL, then it will be OK to flush it delwri as + * long as xfs_iflush() does not keep any references to the inode. + * We leave that decision up to xfs_iflush() since it has the + * knowledge of whether it's OK to simply do a delwri flush of + * the inode or whether we need to wait until the inode is + * pulled from the AIL. + * We get the flush lock regardless, though, just to make sure + * we don't free it while it is being flushed. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_iflock(ip); + + /* + * In the case of a forced shutdown we rely on xfs_iflush() to + * wait for the inode to be unpinned before returning an error. + */ + if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { + /* synchronize with xfs_iflush_done */ + xfs_iflock(ip); + xfs_ifunlock(ip); + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_ireclaim(ip); + return 0; } int @@ -780,6 +769,6 @@ xfs_reclaim_inodes( xfs_mount_t *mp, int mode) { - return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode, - XFS_ICI_RECLAIM_TAG); + return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, + XFS_ICI_RECLAIM_TAG, 1); } diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index a500b4d..ea932b4 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags, int tag); + int flags, int tag, int write_lock); #endif diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index c40834b..c22a608 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -33,51 +33,55 @@ struct xfs_dquot; struct xlog_ticket; struct log; +DECLARE_EVENT_CLASS(xfs_attr_list_class, + TP_PROTO(struct xfs_attr_list_context *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u32, hashval) + __field(u32, blkno) + __field(u32, offset) + __field(void *, alist) + __field(int, bufsize) + __field(int, count) + __field(int, firstu) + __field(int, dupcnt) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; + __entry->ino = ctx->dp->i_ino; + __entry->hashval = ctx->cursor->hashval; + __entry->blkno = ctx->cursor->blkno; + __entry->offset = ctx->cursor->offset; + __entry->alist = ctx->alist; + __entry->bufsize = ctx->bufsize; + __entry->count = ctx->count; + __entry->firstu = ctx->firstu; + __entry->flags = ctx->flags; + ), + TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " + "alist 0x%p size %u count %u firstu %u flags %d %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->hashval, + __entry->blkno, + __entry->offset, + __entry->dupcnt, + __entry->alist, + __entry->bufsize, + __entry->count, + __entry->firstu, + __entry->flags, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) + ) +) + #define DEFINE_ATTR_LIST_EVENT(name) \ -TRACE_EVENT(name, \ +DEFINE_EVENT(xfs_attr_list_class, name, \ TP_PROTO(struct xfs_attr_list_context *ctx), \ - TP_ARGS(ctx), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(u32, hashval) \ - __field(u32, blkno) \ - __field(u32, offset) \ - __field(void *, alist) \ - __field(int, bufsize) \ - __field(int, count) \ - __field(int, firstu) \ - __field(int, dupcnt) \ - __field(int, flags) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; \ - __entry->ino = ctx->dp->i_ino; \ - __entry->hashval = ctx->cursor->hashval; \ - __entry->blkno = ctx->cursor->blkno; \ - __entry->offset = ctx->cursor->offset; \ - __entry->alist = ctx->alist; \ - __entry->bufsize = ctx->bufsize; \ - __entry->count = ctx->count; \ - __entry->firstu = ctx->firstu; \ - __entry->flags = ctx->flags; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " \ - "alist 0x%p size %u count %u firstu %u flags %d %s", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->hashval, \ - __entry->blkno, \ - __entry->offset, \ - __entry->dupcnt, \ - __entry->alist, \ - __entry->bufsize, \ - __entry->count, \ - __entry->firstu, \ - __entry->flags, \ - __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) \ - ) \ -) + TP_ARGS(ctx)) DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf); @@ -178,91 +182,99 @@ TRACE_EVENT(xfs_iext_insert, (char *)__entry->caller_ip) ); +DECLARE_EVENT_CLASS(xfs_bmap_class, + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, + unsigned long caller_ip), + TP_ARGS(ip, idx, state, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_extnum_t, idx) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(int, bmap_state) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ? + ip->i_afp : &ip->i_df; + struct xfs_bmbt_irec r; + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->idx = idx; + __entry->startoff = r.br_startoff; + __entry->startblock = r.br_startblock; + __entry->blockcount = r.br_blockcount; + __entry->state = r.br_state; + __entry->bmap_state = state; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + "offset %lld block %s count %lld flag %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), + (long)__entry->idx, + __entry->startoff, + xfs_fmtfsblock(__entry->startblock), + __entry->blockcount, + __entry->state, + (char *)__entry->caller_ip) +) + #define DEFINE_BMAP_EVENT(name) \ -TRACE_EVENT(name, \ +DEFINE_EVENT(xfs_bmap_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \ unsigned long caller_ip), \ - TP_ARGS(ip, idx, state, caller_ip), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(xfs_extnum_t, idx) \ - __field(xfs_fileoff_t, startoff) \ - __field(xfs_fsblock_t, startblock) \ - __field(xfs_filblks_t, blockcount) \ - __field(xfs_exntst_t, state) \ - __field(int, bmap_state) \ - __field(unsigned long, caller_ip) \ - ), \ - TP_fast_assign( \ - struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ? \ - ip->i_afp : &ip->i_df; \ - struct xfs_bmbt_irec r; \ - \ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - __entry->idx = idx; \ - __entry->startoff = r.br_startoff; \ - __entry->startblock = r.br_startblock; \ - __entry->blockcount = r.br_blockcount; \ - __entry->state = r.br_state; \ - __entry->bmap_state = state; \ - __entry->caller_ip = caller_ip; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " \ - "offset %lld block %s count %lld flag %d caller %pf", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), \ - (long)__entry->idx, \ - __entry->startoff, \ - xfs_fmtfsblock(__entry->startblock), \ - __entry->blockcount, \ - __entry->state, \ - (char *)__entry->caller_ip) \ -) - + TP_ARGS(ip, idx, state, caller_ip)) DEFINE_BMAP_EVENT(xfs_iext_remove); DEFINE_BMAP_EVENT(xfs_bmap_pre_update); DEFINE_BMAP_EVENT(xfs_bmap_post_update); DEFINE_BMAP_EVENT(xfs_extlist); -#define DEFINE_BUF_EVENT(tname) \ -TRACE_EVENT(tname, \ - TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \ - TP_ARGS(bp, caller_ip), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_daddr_t, bno) \ - __field(size_t, buffer_length) \ - __field(int, hold) \ - __field(int, pincount) \ - __field(unsigned, lockval) \ - __field(unsigned, flags) \ - __field(unsigned long, caller_ip) \ - ), \ - TP_fast_assign( \ - __entry->dev = bp->b_target->bt_dev; \ - __entry->bno = bp->b_bn; \ - __entry->buffer_length = bp->b_buffer_length; \ - __entry->hold = atomic_read(&bp->b_hold); \ - __entry->pincount = atomic_read(&bp->b_pin_count); \ - __entry->lockval = xfs_buf_lock_value(bp); \ - __entry->flags = bp->b_flags; \ - __entry->caller_ip = caller_ip; \ - ), \ - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " \ - "lock %d flags %s caller %pf", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - (unsigned long long)__entry->bno, \ - __entry->buffer_length, \ - __entry->hold, \ - __entry->pincount, \ - __entry->lockval, \ - __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), \ - (void *)__entry->caller_ip) \ +DECLARE_EVENT_CLASS(xfs_buf_class, + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), + TP_ARGS(bp, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = bp->b_buffer_length; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = xfs_buf_lock_value(bp); + __entry->flags = bp->b_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) ) + +#define DEFINE_BUF_EVENT(name) \ +DEFINE_EVENT(xfs_buf_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \ + TP_ARGS(bp, caller_ip)) DEFINE_BUF_EVENT(xfs_buf_init); DEFINE_BUF_EVENT(xfs_buf_free); DEFINE_BUF_EVENT(xfs_buf_hold); @@ -299,41 +311,45 @@ DEFINE_BUF_EVENT(xfs_reset_dqcounts); DEFINE_BUF_EVENT(xfs_inode_item_push); /* pass flags explicitly */ -#define DEFINE_BUF_FLAGS_EVENT(tname) \ -TRACE_EVENT(tname, \ - TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \ - TP_ARGS(bp, flags, caller_ip), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_daddr_t, bno) \ - __field(size_t, buffer_length) \ - __field(int, hold) \ - __field(int, pincount) \ - __field(unsigned, lockval) \ - __field(unsigned, flags) \ - __field(unsigned long, caller_ip) \ - ), \ - TP_fast_assign( \ - __entry->dev = bp->b_target->bt_dev; \ - __entry->bno = bp->b_bn; \ - __entry->buffer_length = bp->b_buffer_length; \ - __entry->flags = flags; \ - __entry->hold = atomic_read(&bp->b_hold); \ - __entry->pincount = atomic_read(&bp->b_pin_count); \ - __entry->lockval = xfs_buf_lock_value(bp); \ - __entry->caller_ip = caller_ip; \ - ), \ - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " \ - "lock %d flags %s caller %pf", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - (unsigned long long)__entry->bno, \ - __entry->buffer_length, \ - __entry->hold, \ - __entry->pincount, \ - __entry->lockval, \ - __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), \ - (void *)__entry->caller_ip) \ +DECLARE_EVENT_CLASS(xfs_buf_flags_class, + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), + TP_ARGS(bp, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = bp->b_buffer_length; + __entry->flags = flags; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = xfs_buf_lock_value(bp); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) ) + +#define DEFINE_BUF_FLAGS_EVENT(name) \ +DEFINE_EVENT(xfs_buf_flags_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \ + TP_ARGS(bp, flags, caller_ip)) DEFINE_BUF_FLAGS_EVENT(xfs_buf_find); DEFINE_BUF_FLAGS_EVENT(xfs_buf_get); DEFINE_BUF_FLAGS_EVENT(xfs_buf_read); @@ -376,55 +392,58 @@ TRACE_EVENT(xfs_buf_ioerror, (void *)__entry->caller_ip) ); -#define DEFINE_BUF_ITEM_EVENT(tname) \ -TRACE_EVENT(tname, \ - TP_PROTO(struct xfs_buf_log_item *bip), \ - TP_ARGS(bip), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_daddr_t, buf_bno) \ - __field(size_t, buf_len) \ - __field(int, buf_hold) \ - __field(int, buf_pincount) \ - __field(int, buf_lockval) \ - __field(unsigned, buf_flags) \ - __field(unsigned, bli_recur) \ - __field(int, bli_refcount) \ - __field(unsigned, bli_flags) \ - __field(void *, li_desc) \ - __field(unsigned, li_flags) \ - ), \ - TP_fast_assign( \ - __entry->dev = bip->bli_buf->b_target->bt_dev; \ - __entry->bli_flags = bip->bli_flags; \ - __entry->bli_recur = bip->bli_recur; \ - __entry->bli_refcount = atomic_read(&bip->bli_refcount); \ - __entry->buf_bno = bip->bli_buf->b_bn; \ - __entry->buf_len = bip->bli_buf->b_buffer_length; \ - __entry->buf_flags = bip->bli_buf->b_flags; \ - __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); \ - __entry->buf_pincount = \ - atomic_read(&bip->bli_buf->b_pin_count); \ - __entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf); \ - __entry->li_desc = bip->bli_item.li_desc; \ - __entry->li_flags = bip->bli_item.li_flags; \ - ), \ - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " \ - "lock %d flags %s recur %d refcount %d bliflags %s " \ - "lidesc 0x%p liflags %s", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - (unsigned long long)__entry->buf_bno, \ - __entry->buf_len, \ - __entry->buf_hold, \ - __entry->buf_pincount, \ - __entry->buf_lockval, \ - __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS), \ - __entry->bli_recur, \ - __entry->bli_refcount, \ - __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), \ - __entry->li_desc, \ - __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) \ +DECLARE_EVENT_CLASS(xfs_buf_item_class, + TP_PROTO(struct xfs_buf_log_item *bip), + TP_ARGS(bip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, buf_bno) + __field(size_t, buf_len) + __field(int, buf_hold) + __field(int, buf_pincount) + __field(int, buf_lockval) + __field(unsigned, buf_flags) + __field(unsigned, bli_recur) + __field(int, bli_refcount) + __field(unsigned, bli_flags) + __field(void *, li_desc) + __field(unsigned, li_flags) + ), + TP_fast_assign( + __entry->dev = bip->bli_buf->b_target->bt_dev; + __entry->bli_flags = bip->bli_flags; + __entry->bli_recur = bip->bli_recur; + __entry->bli_refcount = atomic_read(&bip->bli_refcount); + __entry->buf_bno = bip->bli_buf->b_bn; + __entry->buf_len = bip->bli_buf->b_buffer_length; + __entry->buf_flags = bip->bli_buf->b_flags; + __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); + __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); + __entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf); + __entry->li_desc = bip->bli_item.li_desc; + __entry->li_flags = bip->bli_item.li_flags; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s recur %d refcount %d bliflags %s " + "lidesc 0x%p liflags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->buf_bno, + __entry->buf_len, + __entry->buf_hold, + __entry->buf_pincount, + __entry->buf_lockval, + __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS), + __entry->bli_recur, + __entry->bli_refcount, + __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), + __entry->li_desc, + __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) ) + +#define DEFINE_BUF_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_buf_item_class, name, \ + TP_PROTO(struct xfs_buf_log_item *bip), \ + TP_ARGS(bip)) DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); @@ -450,78 +469,90 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); +DECLARE_EVENT_CLASS(xfs_lock_class, + TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, + unsigned long caller_ip), + TP_ARGS(ip, lock_flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, lock_flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lock_flags = lock_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), + (void *)__entry->caller_ip) +) + #define DEFINE_LOCK_EVENT(name) \ -TRACE_EVENT(name, \ +DEFINE_EVENT(xfs_lock_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \ unsigned long caller_ip), \ - TP_ARGS(ip, lock_flags, caller_ip), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(int, lock_flags) \ - __field(unsigned long, caller_ip) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - __entry->lock_flags = lock_flags; \ - __entry->caller_ip = caller_ip; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), \ - (void *)__entry->caller_ip) \ -) - + TP_ARGS(ip, lock_flags, caller_ip)) DEFINE_LOCK_EVENT(xfs_ilock); DEFINE_LOCK_EVENT(xfs_ilock_nowait); DEFINE_LOCK_EVENT(xfs_ilock_demote); DEFINE_LOCK_EVENT(xfs_iunlock); +DECLARE_EVENT_CLASS(xfs_iget_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + ), + TP_printk("dev %d:%d ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +) + #define DEFINE_IGET_EVENT(name) \ -TRACE_EVENT(name, \ +DEFINE_EVENT(xfs_iget_class, name, \ TP_PROTO(struct xfs_inode *ip), \ - TP_ARGS(ip), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino) \ -) + TP_ARGS(ip)) DEFINE_IGET_EVENT(xfs_iget_skip); DEFINE_IGET_EVENT(xfs_iget_reclaim); DEFINE_IGET_EVENT(xfs_iget_found); DEFINE_IGET_EVENT(xfs_iget_alloc); +DECLARE_EVENT_CLASS(xfs_inode_class, + TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), + TP_ARGS(ip, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, count) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->count, + (char *)__entry->caller_ip) +) + #define DEFINE_INODE_EVENT(name) \ -TRACE_EVENT(name, \ +DEFINE_EVENT(xfs_inode_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ - TP_ARGS(ip, caller_ip), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(int, count) \ - __field(unsigned long, caller_ip) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - __entry->count = atomic_read(&VFS_I(ip)->i_count); \ - __entry->caller_ip = caller_ip; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->count, \ - (char *)__entry->caller_ip) \ -) + TP_ARGS(ip, caller_ip)) DEFINE_INODE_EVENT(xfs_ihold); DEFINE_INODE_EVENT(xfs_irele); /* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ @@ -529,55 +560,59 @@ DEFINE_INODE_EVENT(xfs_inode); #define xfs_itrace_entry(ip) \ trace_xfs_inode(ip, _THIS_IP_) -#define DEFINE_DQUOT_EVENT(tname) \ -TRACE_EVENT(tname, \ - TP_PROTO(struct xfs_dquot *dqp), \ - TP_ARGS(dqp), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(__be32, id) \ - __field(unsigned, flags) \ - __field(unsigned, nrefs) \ - __field(unsigned long long, res_bcount) \ - __field(unsigned long long, bcount) \ - __field(unsigned long long, icount) \ - __field(unsigned long long, blk_hardlimit) \ - __field(unsigned long long, blk_softlimit) \ - __field(unsigned long long, ino_hardlimit) \ - __field(unsigned long long, ino_softlimit) \ - ), \ - TP_fast_assign( \ - __entry->dev = dqp->q_mount->m_super->s_dev; \ - __entry->id = dqp->q_core.d_id; \ - __entry->flags = dqp->dq_flags; \ - __entry->nrefs = dqp->q_nrefs; \ - __entry->res_bcount = dqp->q_res_bcount; \ - __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); \ - __entry->icount = be64_to_cpu(dqp->q_core.d_icount); \ - __entry->blk_hardlimit = \ - be64_to_cpu(dqp->q_core.d_blk_hardlimit); \ - __entry->blk_softlimit = \ - be64_to_cpu(dqp->q_core.d_blk_softlimit); \ - __entry->ino_hardlimit = \ - be64_to_cpu(dqp->q_core.d_ino_hardlimit); \ - __entry->ino_softlimit = \ - be64_to_cpu(dqp->q_core.d_ino_softlimit); \ +DECLARE_EVENT_CLASS(xfs_dquot_class, + TP_PROTO(struct xfs_dquot *dqp), + TP_ARGS(dqp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(__be32, id) + __field(unsigned, flags) + __field(unsigned, nrefs) + __field(unsigned long long, res_bcount) + __field(unsigned long long, bcount) + __field(unsigned long long, icount) + __field(unsigned long long, blk_hardlimit) + __field(unsigned long long, blk_softlimit) + __field(unsigned long long, ino_hardlimit) + __field(unsigned long long, ino_softlimit) ), \ - TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " \ - "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] " \ - "icnt 0x%llx [hard 0x%llx | soft 0x%llx]", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - be32_to_cpu(__entry->id), \ - __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), \ - __entry->nrefs, \ - __entry->res_bcount, \ - __entry->bcount, \ - __entry->blk_hardlimit, \ - __entry->blk_softlimit, \ - __entry->icount, \ - __entry->ino_hardlimit, \ - __entry->ino_softlimit) \ + TP_fast_assign( + __entry->dev = dqp->q_mount->m_super->s_dev; + __entry->id = dqp->q_core.d_id; + __entry->flags = dqp->dq_flags; + __entry->nrefs = dqp->q_nrefs; + __entry->res_bcount = dqp->q_res_bcount; + __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); + __entry->icount = be64_to_cpu(dqp->q_core.d_icount); + __entry->blk_hardlimit = + be64_to_cpu(dqp->q_core.d_blk_hardlimit); + __entry->blk_softlimit = + be64_to_cpu(dqp->q_core.d_blk_softlimit); + __entry->ino_hardlimit = + be64_to_cpu(dqp->q_core.d_ino_hardlimit); + __entry->ino_softlimit = + be64_to_cpu(dqp->q_core.d_ino_softlimit); + ), + TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " + "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] " + "icnt 0x%llx [hard 0x%llx | soft 0x%llx]", + MAJOR(__entry->dev), MINOR(__entry->dev), + be32_to_cpu(__entry->id), + __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), + __entry->nrefs, + __entry->res_bcount, + __entry->bcount, + __entry->blk_hardlimit, + __entry->blk_softlimit, + __entry->icount, + __entry->ino_hardlimit, + __entry->ino_softlimit) ) + +#define DEFINE_DQUOT_EVENT(name) \ +DEFINE_EVENT(xfs_dquot_class, name, \ + TP_PROTO(struct xfs_dquot *dqp), \ + TP_ARGS(dqp)) DEFINE_DQUOT_EVENT(xfs_dqadjust); DEFINE_DQUOT_EVENT(xfs_dqshake_dirty); DEFINE_DQUOT_EVENT(xfs_dqshake_unlink); @@ -610,72 +645,75 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_done); DEFINE_IGET_EVENT(xfs_dquot_dqalloc); DEFINE_IGET_EVENT(xfs_dquot_dqdetach); +DECLARE_EVENT_CLASS(xfs_loggrant_class, + TP_PROTO(struct log *log, struct xlog_ticket *tic), + TP_ARGS(log, tic), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned, trans_type) + __field(char, ocnt) + __field(char, cnt) + __field(int, curr_res) + __field(int, unit_res) + __field(unsigned int, flags) + __field(void *, reserve_headq) + __field(void *, write_headq) + __field(int, grant_reserve_cycle) + __field(int, grant_reserve_bytes) + __field(int, grant_write_cycle) + __field(int, grant_write_bytes) + __field(int, curr_cycle) + __field(int, curr_block) + __field(xfs_lsn_t, tail_lsn) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->trans_type = tic->t_trans_type; + __entry->ocnt = tic->t_ocnt; + __entry->cnt = tic->t_cnt; + __entry->curr_res = tic->t_curr_res; + __entry->unit_res = tic->t_unit_res; + __entry->flags = tic->t_flags; + __entry->reserve_headq = log->l_reserve_headq; + __entry->write_headq = log->l_write_headq; + __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; + __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; + __entry->grant_write_cycle = log->l_grant_write_cycle; + __entry->grant_write_bytes = log->l_grant_write_bytes; + __entry->curr_cycle = log->l_curr_cycle; + __entry->curr_block = log->l_curr_block; + __entry->tail_lsn = log->l_tail_lsn; + ), + TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " + "t_unit_res %u t_flags %s reserve_headq 0x%p " + "write_headq 0x%p grant_reserve_cycle %d " + "grant_reserve_bytes %d grant_write_cycle %d " + "grant_write_bytes %d curr_cycle %d curr_block %d " + "tail_cycle %d tail_block %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), + __entry->ocnt, + __entry->cnt, + __entry->curr_res, + __entry->unit_res, + __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), + __entry->reserve_headq, + __entry->write_headq, + __entry->grant_reserve_cycle, + __entry->grant_reserve_bytes, + __entry->grant_write_cycle, + __entry->grant_write_bytes, + __entry->curr_cycle, + __entry->curr_block, + CYCLE_LSN(__entry->tail_lsn), + BLOCK_LSN(__entry->tail_lsn) + ) +) -#define DEFINE_LOGGRANT_EVENT(tname) \ -TRACE_EVENT(tname, \ +#define DEFINE_LOGGRANT_EVENT(name) \ +DEFINE_EVENT(xfs_loggrant_class, name, \ TP_PROTO(struct log *log, struct xlog_ticket *tic), \ - TP_ARGS(log, tic), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(unsigned, trans_type) \ - __field(char, ocnt) \ - __field(char, cnt) \ - __field(int, curr_res) \ - __field(int, unit_res) \ - __field(unsigned int, flags) \ - __field(void *, reserve_headq) \ - __field(void *, write_headq) \ - __field(int, grant_reserve_cycle) \ - __field(int, grant_reserve_bytes) \ - __field(int, grant_write_cycle) \ - __field(int, grant_write_bytes) \ - __field(int, curr_cycle) \ - __field(int, curr_block) \ - __field(xfs_lsn_t, tail_lsn) \ - ), \ - TP_fast_assign( \ - __entry->dev = log->l_mp->m_super->s_dev; \ - __entry->trans_type = tic->t_trans_type; \ - __entry->ocnt = tic->t_ocnt; \ - __entry->cnt = tic->t_cnt; \ - __entry->curr_res = tic->t_curr_res; \ - __entry->unit_res = tic->t_unit_res; \ - __entry->flags = tic->t_flags; \ - __entry->reserve_headq = log->l_reserve_headq; \ - __entry->write_headq = log->l_write_headq; \ - __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; \ - __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; \ - __entry->grant_write_cycle = log->l_grant_write_cycle; \ - __entry->grant_write_bytes = log->l_grant_write_bytes; \ - __entry->curr_cycle = log->l_curr_cycle; \ - __entry->curr_block = log->l_curr_block; \ - __entry->tail_lsn = log->l_tail_lsn; \ - ), \ - TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " \ - "t_unit_res %u t_flags %s reserve_headq 0x%p " \ - "write_headq 0x%p grant_reserve_cycle %d " \ - "grant_reserve_bytes %d grant_write_cycle %d " \ - "grant_write_bytes %d curr_cycle %d curr_block %d " \ - "tail_cycle %d tail_block %d", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), \ - __entry->ocnt, \ - __entry->cnt, \ - __entry->curr_res, \ - __entry->unit_res, \ - __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), \ - __entry->reserve_headq, \ - __entry->write_headq, \ - __entry->grant_reserve_cycle, \ - __entry->grant_reserve_bytes, \ - __entry->grant_write_cycle, \ - __entry->grant_write_bytes, \ - __entry->curr_cycle, \ - __entry->curr_block, \ - CYCLE_LSN(__entry->tail_lsn), \ - BLOCK_LSN(__entry->tail_lsn) \ - ) \ -) + TP_ARGS(log, tic)) DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); DEFINE_LOGGRANT_EVENT(xfs_log_reserve); @@ -815,7 +853,7 @@ TRACE_EVENT(name, \ ), \ TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ "offset 0x%llx count %zd flags %s " \ - "startoff 0x%llx startblock 0x%llx blockcount 0x%llx", \ + "startoff 0x%llx startblock %s blockcount 0x%llx", \ MAJOR(__entry->dev), MINOR(__entry->dev), \ __entry->ino, \ __entry->size, \ @@ -824,7 +862,7 @@ TRACE_EVENT(name, \ __entry->count, \ __print_flags(__entry->flags, "|", BMAPI_FLAGS), \ __entry->startoff, \ - __entry->startblock, \ + xfs_fmtfsblock(__entry->startblock), \ __entry->blockcount) \ ) DEFINE_IOMAP_EVENT(xfs_iomap_enter); @@ -897,28 +935,32 @@ TRACE_EVENT(xfs_itruncate_start, __entry->toss_finish) ); +DECLARE_EVENT_CLASS(xfs_itrunc_class, + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), + TP_ARGS(ip, new_size), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = new_size; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size) +) + #define DEFINE_ITRUNC_EVENT(name) \ -TRACE_EVENT(name, \ +DEFINE_EVENT(xfs_itrunc_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ - TP_ARGS(ip, new_size), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(xfs_fsize_t, size) \ - __field(xfs_fsize_t, new_size) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - __entry->size = ip->i_d.di_size; \ - __entry->new_size = new_size; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->size, \ - __entry->new_size) \ -) + TP_ARGS(ip, new_size)) DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_start); DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_end); @@ -1037,28 +1079,28 @@ TRACE_EVENT(xfs_alloc_unbusy, TRACE_EVENT(xfs_alloc_busysearch, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, int found), - TP_ARGS(mp, agno, agbno, len, found), + xfs_extlen_t len, xfs_lsn_t lsn), + TP_ARGS(mp, agno, agbno, len, lsn), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(int, found) + __field(xfs_lsn_t, lsn) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->found = found; + __entry->lsn = lsn; ), - TP_printk("dev %d:%d agno %u agbno %u len %u %s", + TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->len, - __print_symbolic(__entry->found, XFS_BUSY_STATES)) + __entry->lsn) ); TRACE_EVENT(xfs_agf, @@ -1152,77 +1194,80 @@ TRACE_EVENT(xfs_free_extent, ); -#define DEFINE_ALLOC_EVENT(name) \ -TRACE_EVENT(name, \ - TP_PROTO(struct xfs_alloc_arg *args), \ - TP_ARGS(args), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_agnumber_t, agno) \ - __field(xfs_agblock_t, agbno) \ - __field(xfs_extlen_t, minlen) \ - __field(xfs_extlen_t, maxlen) \ - __field(xfs_extlen_t, mod) \ - __field(xfs_extlen_t, prod) \ - __field(xfs_extlen_t, minleft) \ - __field(xfs_extlen_t, total) \ - __field(xfs_extlen_t, alignment) \ - __field(xfs_extlen_t, minalignslop) \ - __field(xfs_extlen_t, len) \ - __field(short, type) \ - __field(short, otype) \ - __field(char, wasdel) \ - __field(char, wasfromfl) \ - __field(char, isfl) \ - __field(char, userdata) \ - __field(xfs_fsblock_t, firstblock) \ - ), \ - TP_fast_assign( \ - __entry->dev = args->mp->m_super->s_dev; \ - __entry->agno = args->agno; \ - __entry->agbno = args->agbno; \ - __entry->minlen = args->minlen; \ - __entry->maxlen = args->maxlen; \ - __entry->mod = args->mod; \ - __entry->prod = args->prod; \ - __entry->minleft = args->minleft; \ - __entry->total = args->total; \ - __entry->alignment = args->alignment; \ - __entry->minalignslop = args->minalignslop; \ - __entry->len = args->len; \ - __entry->type = args->type; \ - __entry->otype = args->otype; \ - __entry->wasdel = args->wasdel; \ - __entry->wasfromfl = args->wasfromfl; \ - __entry->isfl = args->isfl; \ - __entry->userdata = args->userdata; \ - __entry->firstblock = args->firstblock; \ - ), \ - TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " \ - "prod %u minleft %u total %u alignment %u minalignslop %u " \ - "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " \ - "userdata %d firstblock 0x%llx", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->agno, \ - __entry->agbno, \ - __entry->minlen, \ - __entry->maxlen, \ - __entry->mod, \ - __entry->prod, \ - __entry->minleft, \ - __entry->total, \ - __entry->alignment, \ - __entry->minalignslop, \ - __entry->len, \ - __print_symbolic(__entry->type, XFS_ALLOC_TYPES), \ - __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), \ - __entry->wasdel, \ - __entry->wasfromfl, \ - __entry->isfl, \ - __entry->userdata, \ - __entry->firstblock) \ +DECLARE_EVENT_CLASS(xfs_alloc_class, + TP_PROTO(struct xfs_alloc_arg *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, minlen) + __field(xfs_extlen_t, maxlen) + __field(xfs_extlen_t, mod) + __field(xfs_extlen_t, prod) + __field(xfs_extlen_t, minleft) + __field(xfs_extlen_t, total) + __field(xfs_extlen_t, alignment) + __field(xfs_extlen_t, minalignslop) + __field(xfs_extlen_t, len) + __field(short, type) + __field(short, otype) + __field(char, wasdel) + __field(char, wasfromfl) + __field(char, isfl) + __field(char, userdata) + __field(xfs_fsblock_t, firstblock) + ), + TP_fast_assign( + __entry->dev = args->mp->m_super->s_dev; + __entry->agno = args->agno; + __entry->agbno = args->agbno; + __entry->minlen = args->minlen; + __entry->maxlen = args->maxlen; + __entry->mod = args->mod; + __entry->prod = args->prod; + __entry->minleft = args->minleft; + __entry->total = args->total; + __entry->alignment = args->alignment; + __entry->minalignslop = args->minalignslop; + __entry->len = args->len; + __entry->type = args->type; + __entry->otype = args->otype; + __entry->wasdel = args->wasdel; + __entry->wasfromfl = args->wasfromfl; + __entry->isfl = args->isfl; + __entry->userdata = args->userdata; + __entry->firstblock = args->firstblock; + ), + TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " + "prod %u minleft %u total %u alignment %u minalignslop %u " + "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " + "userdata %d firstblock 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->minlen, + __entry->maxlen, + __entry->mod, + __entry->prod, + __entry->minleft, + __entry->total, + __entry->alignment, + __entry->minalignslop, + __entry->len, + __print_symbolic(__entry->type, XFS_ALLOC_TYPES), + __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), + __entry->wasdel, + __entry->wasfromfl, + __entry->isfl, + __entry->userdata, + __entry->firstblock) ) +#define DEFINE_ALLOC_EVENT(name) \ +DEFINE_EVENT(xfs_alloc_class, name, \ + TP_PROTO(struct xfs_alloc_arg *args), \ + TP_ARGS(args)) DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); @@ -1245,92 +1290,100 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); -#define DEFINE_DIR2_TRACE(tname) \ -TRACE_EVENT(tname, \ +DECLARE_EVENT_CLASS(xfs_dir2_class, + TP_PROTO(struct xfs_da_args *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __dynamic_array(char, name, args->namelen) + __field(int, namelen) + __field(xfs_dahash_t, hashval) + __field(xfs_ino_t, inumber) + __field(int, op_flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + if (args->namelen) + memcpy(__get_str(name), args->name, args->namelen); + __entry->namelen = args->namelen; + __entry->hashval = args->hashval; + __entry->inumber = args->inumber; + __entry->op_flags = args->op_flags; + ), + TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " + "inumber 0x%llx op_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __entry->namelen ? __get_str(name) : NULL, + __entry->namelen, + __entry->hashval, + __entry->inumber, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) +) + +#define DEFINE_DIR2_EVENT(name) \ +DEFINE_EVENT(xfs_dir2_class, name, \ TP_PROTO(struct xfs_da_args *args), \ - TP_ARGS(args), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __dynamic_array(char, name, args->namelen) \ - __field(int, namelen) \ - __field(xfs_dahash_t, hashval) \ - __field(xfs_ino_t, inumber) \ - __field(int, op_flags) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(args->dp)->i_sb->s_dev; \ - __entry->ino = args->dp->i_ino; \ - if (args->namelen) \ - memcpy(__get_str(name), args->name, args->namelen); \ - __entry->namelen = args->namelen; \ - __entry->hashval = args->hashval; \ - __entry->inumber = args->inumber; \ - __entry->op_flags = args->op_flags; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " \ - "inumber 0x%llx op_flags %s", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->namelen, \ - __entry->namelen ? __get_str(name) : NULL, \ - __entry->namelen, \ - __entry->hashval, \ - __entry->inumber, \ - __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) \ + TP_ARGS(args)) +DEFINE_DIR2_EVENT(xfs_dir2_sf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_sf_create); +DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_sf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_sf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8); +DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_block_addname); +DEFINE_DIR2_EVENT(xfs_dir2_block_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_block_replace); +DEFINE_DIR2_EVENT(xfs_dir2_block_removename); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node); +DEFINE_DIR2_EVENT(xfs_dir2_node_addname); +DEFINE_DIR2_EVENT(xfs_dir2_node_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_node_replace); +DEFINE_DIR2_EVENT(xfs_dir2_node_removename); +DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); + +DECLARE_EVENT_CLASS(xfs_dir2_space_class, + TP_PROTO(struct xfs_da_args *args, int idx), + TP_ARGS(args, idx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, op_flags) + __field(int, idx) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + __entry->op_flags = args->op_flags; + __entry->idx = idx; + ), + TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->idx) ) -DEFINE_DIR2_TRACE(xfs_dir2_sf_addname); -DEFINE_DIR2_TRACE(xfs_dir2_sf_create); -DEFINE_DIR2_TRACE(xfs_dir2_sf_lookup); -DEFINE_DIR2_TRACE(xfs_dir2_sf_replace); -DEFINE_DIR2_TRACE(xfs_dir2_sf_removename); -DEFINE_DIR2_TRACE(xfs_dir2_sf_toino4); -DEFINE_DIR2_TRACE(xfs_dir2_sf_toino8); -DEFINE_DIR2_TRACE(xfs_dir2_sf_to_block); -DEFINE_DIR2_TRACE(xfs_dir2_block_addname); -DEFINE_DIR2_TRACE(xfs_dir2_block_lookup); -DEFINE_DIR2_TRACE(xfs_dir2_block_replace); -DEFINE_DIR2_TRACE(xfs_dir2_block_removename); -DEFINE_DIR2_TRACE(xfs_dir2_block_to_sf); -DEFINE_DIR2_TRACE(xfs_dir2_block_to_leaf); -DEFINE_DIR2_TRACE(xfs_dir2_leaf_addname); -DEFINE_DIR2_TRACE(xfs_dir2_leaf_lookup); -DEFINE_DIR2_TRACE(xfs_dir2_leaf_replace); -DEFINE_DIR2_TRACE(xfs_dir2_leaf_removename); -DEFINE_DIR2_TRACE(xfs_dir2_leaf_to_block); -DEFINE_DIR2_TRACE(xfs_dir2_leaf_to_node); -DEFINE_DIR2_TRACE(xfs_dir2_node_addname); -DEFINE_DIR2_TRACE(xfs_dir2_node_lookup); -DEFINE_DIR2_TRACE(xfs_dir2_node_replace); -DEFINE_DIR2_TRACE(xfs_dir2_node_removename); -DEFINE_DIR2_TRACE(xfs_dir2_node_to_leaf); -#define DEFINE_DIR2_SPACE_TRACE(tname) \ -TRACE_EVENT(tname, \ +#define DEFINE_DIR2_SPACE_EVENT(name) \ +DEFINE_EVENT(xfs_dir2_space_class, name, \ TP_PROTO(struct xfs_da_args *args, int idx), \ - TP_ARGS(args, idx), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(int, op_flags) \ - __field(int, idx) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(args->dp)->i_sb->s_dev; \ - __entry->ino = args->dp->i_ino; \ - __entry->op_flags = args->op_flags; \ - __entry->idx = idx; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), \ - __entry->idx) \ -) -DEFINE_DIR2_SPACE_TRACE(xfs_dir2_leafn_add); -DEFINE_DIR2_SPACE_TRACE(xfs_dir2_leafn_remove); -DEFINE_DIR2_SPACE_TRACE(xfs_dir2_grow_inode); -DEFINE_DIR2_SPACE_TRACE(xfs_dir2_shrink_inode); + TP_ARGS(args, idx)) +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode); TRACE_EVENT(xfs_dir2_leafn_moveents, TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count), diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c index 497c7fb..0b18788 100644 --- a/fs/xfs/linux-2.6/xfs_xattr.c +++ b/fs/xfs/linux-2.6/xfs_xattr.c @@ -30,10 +30,10 @@ static int -__xfs_xattr_get(struct inode *inode, const char *name, +xfs_xattr_get(struct dentry *dentry, const char *name, void *value, size_t size, int xflags) { - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(dentry->d_inode); int error, asize = size; if (strcmp(name, "") == 0) @@ -52,10 +52,10 @@ __xfs_xattr_get(struct inode *inode, const char *name, } static int -__xfs_xattr_set(struct inode *inode, const char *name, const void *value, +xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int xflags) { - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(dentry->d_inode); if (strcmp(name, "") == 0) return -EINVAL; @@ -71,75 +71,34 @@ __xfs_xattr_set(struct inode *inode, const char *name, const void *value, return -xfs_attr_set(ip, name, (void *)value, size, xflags); } -static int -xfs_xattr_user_get(struct inode *inode, const char *name, - void *value, size_t size) -{ - return __xfs_xattr_get(inode, name, value, size, 0); -} - -static int -xfs_xattr_user_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return __xfs_xattr_set(inode, name, value, size, flags, 0); -} - static struct xattr_handler xfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, - .get = xfs_xattr_user_get, - .set = xfs_xattr_user_set, + .flags = 0, /* no flags implies user namespace */ + .get = xfs_xattr_get, + .set = xfs_xattr_set, }; - -static int -xfs_xattr_trusted_get(struct inode *inode, const char *name, - void *value, size_t size) -{ - return __xfs_xattr_get(inode, name, value, size, ATTR_ROOT); -} - -static int -xfs_xattr_trusted_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return __xfs_xattr_set(inode, name, value, size, flags, ATTR_ROOT); -} - static struct xattr_handler xfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .get = xfs_xattr_trusted_get, - .set = xfs_xattr_trusted_set, + .flags = ATTR_ROOT, + .get = xfs_xattr_get, + .set = xfs_xattr_set, }; - -static int -xfs_xattr_secure_get(struct inode *inode, const char *name, - void *value, size_t size) -{ - return __xfs_xattr_get(inode, name, value, size, ATTR_SECURE); -} - -static int -xfs_xattr_secure_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - return __xfs_xattr_set(inode, name, value, size, flags, ATTR_SECURE); -} - static struct xattr_handler xfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .get = xfs_xattr_secure_get, - .set = xfs_xattr_secure_set, + .flags = ATTR_SECURE, + .get = xfs_xattr_get, + .set = xfs_xattr_set, }; - struct xattr_handler *xfs_xattr_handlers[] = { &xfs_xattr_user_handler, &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, #ifdef CONFIG_XFS_POSIX_ACL - &xfs_xattr_system_handler, + &xfs_xattr_acl_access_handler, + &xfs_xattr_acl_default_handler, #endif NULL }; diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 71af76f..873e07e 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -891,7 +891,7 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0); } /*------------------------------------------------------------------------*/ diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 947b150..00fd357c 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -49,7 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode); extern int posix_acl_access_exists(struct inode *inode); extern int posix_acl_default_exists(struct inode *inode); -extern struct xattr_handler xfs_xattr_system_handler; +extern struct xattr_handler xfs_xattr_acl_access_handler; +extern struct xattr_handler xfs_xattr_acl_default_handler; #else # define xfs_check_acl NULL # define xfs_get_acl(inode, type) NULL diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index a1c65fc..275b1f4 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2563,43 +2563,41 @@ xfs_alloc_search_busy(xfs_trans_t *tp, xfs_mount_t *mp; xfs_perag_busy_t *bsy; xfs_agblock_t uend, bend; - xfs_lsn_t lsn; + xfs_lsn_t lsn = 0; int cnt; mp = tp->t_mountp; spin_lock(&mp->m_perag[agno].pagb_lock); - cnt = mp->m_perag[agno].pagb_count; uend = bno + len - 1; - /* search pagb_list for this slot, skipping open slots */ - for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) { + /* + * search pagb_list for this slot, skipping open slots. We have to + * search the entire array as there may be multiple overlaps and + * we have to get the most recent LSN for the log force to push out + * all the transactions that span the range. + */ + for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) { + bsy = &mp->m_perag[agno].pagb_list[cnt]; + if (!bsy->busy_tp) + continue; - /* - * (start1,length1) within (start2, length2) - */ - if (bsy->busy_tp != NULL) { - bend = bsy->busy_start + bsy->busy_length - 1; - if ((bno > bend) || (uend < bsy->busy_start)) { - cnt--; - } else { - break; - } - } - } + bend = bsy->busy_start + bsy->busy_length - 1; + if (bno > bend || uend < bsy->busy_start) + continue; - trace_xfs_alloc_busysearch(mp, agno, bno, len, !!cnt); + /* (start1,length1) within (start2, length2) */ + if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) + lsn = bsy->busy_tp->t_commit_lsn; + } + spin_unlock(&mp->m_perag[agno].pagb_lock); + trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn); /* * If a block was found, force the log through the LSN of the * transaction that freed the block */ - if (cnt) { - lsn = bsy->busy_tp->t_commit_lsn; - spin_unlock(&mp->m_perag[agno].pagb_lock); + if (lsn) xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); - } else { - spin_unlock(&mp->m_perag[agno].pagb_lock); - } } diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 5549d49..cf07ca7 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -46,20 +46,12 @@ typedef struct xfs_bmdr_block { #define BMBT_STARTBLOCK_BITLEN 52 #define BMBT_BLOCKCOUNT_BITLEN 21 - -#define BMBT_USE_64 1 - -typedef struct xfs_bmbt_rec_32 -{ - __uint32_t l0, l1, l2, l3; -} xfs_bmbt_rec_32_t; -typedef struct xfs_bmbt_rec_64 -{ +typedef struct xfs_bmbt_rec { __be64 l0, l1; -} xfs_bmbt_rec_64_t; +} xfs_bmbt_rec_t; typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ -typedef xfs_bmbt_rec_64_t xfs_bmbt_rec_t, xfs_bmdr_rec_t; +typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; typedef struct xfs_bmbt_rec_host { __uint64_t l0, l1; diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index d1483a4..84ca1cf 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -114,10 +114,82 @@ xfs_swapext( return error; } +/* + * We need to check that the format of the data fork in the temporary inode is + * valid for the target inode before doing the swap. This is not a problem with + * attr1 because of the fixed fork offset, but attr2 has a dynamically sized + * data fork depending on the space the attribute fork is taking so we can get + * invalid formats on the target inode. + * + * E.g. target has space for 7 extents in extent format, temp inode only has + * space for 6. If we defragment down to 7 extents, then the tmp format is a + * btree, but when swapped it needs to be in extent format. Hence we can't just + * blindly swap data forks on attr2 filesystems. + * + * Note that we check the swap in both directions so that we don't end up with + * a corrupt temporary inode, either. + * + * Note that fixing the way xfs_fsr sets up the attribute fork in the source + * inode will prevent this situation from occurring, so all we do here is + * reject and log the attempt. basically we are putting the responsibility on + * userspace to get this right. + */ +static int +xfs_swap_extents_check_format( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip) /* tmp inode */ +{ + + /* Should never get a local format */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || + tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) + return EINVAL; + + /* + * if the target inode has less extents that then temporary inode then + * why did userspace call us? + */ + if (ip->i_d.di_nextents < tip->i_d.di_nextents) + return EINVAL; + + /* + * if the target inode is in extent form and the temp inode is in btree + * form then we will end up with the target inode in the wrong format + * as we already know there are less extents in the temp inode. + */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + return EINVAL; + + /* Check temp in extent form to max in target */ + if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) + return EINVAL; + + /* Check target in extent form to max in temp */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) + return EINVAL; + + /* Check root block of temp in btree form to max in target */ + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && + XFS_IFORK_BOFF(ip) && + tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) + return EINVAL; + + /* Check root block of target in btree form to max in temp */ + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + XFS_IFORK_BOFF(tip) && + ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) + return EINVAL; + + return 0; +} + int xfs_swap_extents( - xfs_inode_t *ip, - xfs_inode_t *tip, + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip, /* tmp inode */ xfs_swapext_t *sxp) { xfs_mount_t *mp; @@ -161,13 +233,6 @@ xfs_swap_extents( goto out_unlock; } - /* Should never get a local format */ - if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || - tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { - error = XFS_ERROR(EINVAL); - goto out_unlock; - } - if (VN_CACHED(VFS_I(tip)) != 0) { error = xfs_flushinval_pages(tip, 0, -1, FI_REMAPF_LOCKED); @@ -189,13 +254,12 @@ xfs_swap_extents( goto out_unlock; } - /* - * If the target has extended attributes, the tmp file - * must also in order to ensure the correct data fork - * format. - */ - if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { - error = XFS_ERROR(EINVAL); + /* check inode formats now that data is flushed */ + error = xfs_swap_extents_check_format(ip, tip); + if (error) { + xfs_fs_cmn_err(CE_NOTE, mp, + "%s: inode 0x%llx format is incompatible for exchanging.", + __FILE__, ip->i_ino); goto out_unlock; } @@ -276,6 +340,16 @@ xfs_swap_extents( *tifp = *tempifp; /* struct copy */ /* + * Fix the in-memory data fork values that are dependent on the fork + * offset in the inode. We can't assume they remain the same as attr2 + * has dynamic fork offsets. + */ + ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) / + (uint)sizeof(xfs_bmbt_rec_t); + tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) / + (uint)sizeof(xfs_bmbt_rec_t); + + /* * Fix the on-disk inode values */ tmp = (__uint64_t)ip->i_d.di_nblocks; diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index f5c904a..155e798 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -73,7 +73,6 @@ xfs_inode_alloc( ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); @@ -91,7 +90,7 @@ xfs_inode_alloc( ip->i_new_size = 0; /* prevent anyone from using this yet */ - VFS_I(ip)->i_state = I_NEW|I_LOCK; + VFS_I(ip)->i_state = I_NEW; return ip; } @@ -217,7 +216,7 @@ xfs_iget_cache_hit( trace_xfs_iget_reclaim(ip); goto out_error; } - inode->i_state = I_LOCK|I_NEW; + inode->i_state = I_NEW; } else { /* If the VFS inode is being torn down, pause and try again. */ if (!igrab(inode)) { @@ -478,17 +477,21 @@ xfs_ireclaim( { struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); XFS_STATS_INC(xs_ig_reclaims); /* - * Remove the inode from the per-AG radix tree. It doesn't matter - * if it was never added to it because radix_tree_delete can deal - * with that case just fine. + * Remove the inode from the per-AG radix tree. + * + * Because radix_tree_delete won't complain even if the item was never + * added to the tree assert that it's been there before to catch + * problems with the inode life time early on. */ pag = xfs_get_perag(mp, ip->i_ino); write_lock(&pag->pag_ici_lock); - radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino)); + if (!radix_tree_delete(&pag->pag_ici_root, agino)) + ASSERT(0); write_unlock(&pag->pag_ici_lock); xfs_put_perag(mp, pag); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ce278b3..ef77fd8 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2841,8 +2841,8 @@ xfs_iflush( mp = ip->i_mount; /* - * If the inode isn't dirty, then just release the inode - * flush lock and do nothing. + * If the inode isn't dirty, then just release the inode flush lock and + * do nothing. */ if (xfs_inode_clean(ip)) { xfs_ifunlock(ip); @@ -2868,6 +2868,19 @@ xfs_iflush( xfs_iunpin_wait(ip); /* + * For stale inodes we cannot rely on the backing buffer remaining + * stale in cache for the remaining life of the stale inode and so + * xfs_itobp() below may give us a buffer that no longer contains + * inodes below. We have to check this after ensuring the inode is + * unpinned so that it is safe to reclaim the stale inode after the + * flush call. + */ + if (xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_ifunlock(ip); + return 0; + } + + /* * This may have been unpinned because the filesystem is shutting * down forcibly. If that's the case we must not write this inode * to disk, because the log record didn't make it to disk! diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 65bae4c..cc8df1a 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -127,7 +127,7 @@ static inline int xfs_ilog_fdata(int w) #ifdef __KERNEL__ struct xfs_buf; -struct xfs_bmbt_rec_64; +struct xfs_bmbt_rec; struct xfs_inode; struct xfs_mount; @@ -140,9 +140,9 @@ typedef struct xfs_inode_log_item { unsigned short ili_flags; /* misc flags */ unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ - struct xfs_bmbt_rec_64 *ili_extents_buf; /* array of logged + struct xfs_bmbt_rec *ili_extents_buf; /* array of logged data exts */ - struct xfs_bmbt_rec_64 *ili_aextents_buf; /* array of logged + struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged attr exts */ unsigned int ili_pushbuf_flag; /* one bit used in push_ail */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4cb1792..600b5b0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1441,6 +1441,7 @@ xlog_sync(xlog_t *log, XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); + bp->b_flags |= XBF_LOG_BUFFER; /* * Do an ordered write for the log block. * Its unnecessary to flush the first split block in the log wrap case. @@ -1478,6 +1479,7 @@ xlog_sync(xlog_t *log, XFS_BUF_ZEROFLAGS(bp); XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); + bp->b_flags |= XBF_LOG_BUFFER; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) XFS_BUF_ORDERED(bp); dptr = XFS_BUF_PTR(bp); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 9e15a11..6be05f7 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1517,6 +1517,8 @@ xfs_rtfree_range( */ error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, &postblock); + if (error) + return error; /* * If there are blocks not being freed at the front of the * old extent, add summary data for them to be allocated. diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 6558ffd..6f26875 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -70,7 +70,6 @@ xfs_setattr( uint commit_flags=0; uid_t uid=0, iuid=0; gid_t gid=0, igid=0; - int timeflags = 0; struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; int need_iolock = 1; @@ -135,16 +134,13 @@ xfs_setattr( if (flags & XFS_ATTR_NOLOCK) need_iolock = 0; if (!(mask & ATTR_SIZE)) { - if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) || - (mp->m_flags & XFS_MOUNT_WSYNC)) { - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - commit_flags = 0; - if ((code = xfs_trans_reserve(tp, 0, - XFS_ICHANGE_LOG_RES(mp), 0, - 0, 0))) { - lock_flags = 0; - goto error_return; - } + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + commit_flags = 0; + code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), + 0, 0, 0); + if (code) { + lock_flags = 0; + goto error_return; } } else { if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && @@ -295,15 +291,23 @@ xfs_setattr( * or we are explicitly asked to change it. This handles * the semantic difference between truncate() and ftruncate() * as implemented in the VFS. + * + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME + * is a special case where we need to update the times despite + * not having these flags set. For all other operations the + * VFS set these flags explicitly if it wants a timestamp + * update. */ - if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME)) - timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + if (iattr->ia_size != ip->i_size && + (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + iattr->ia_ctime = iattr->ia_mtime = + current_fs_time(inode->i_sb); + mask |= ATTR_CTIME | ATTR_MTIME; + } if (iattr->ia_size > ip->i_size) { ip->i_d.di_size = iattr->ia_size; ip->i_size = iattr->ia_size; - if (!(flags & XFS_ATTR_DMI)) - xfs_ichgtime(ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } else if (iattr->ia_size <= ip->i_size || (iattr->ia_size == 0 && ip->i_d.di_nextents)) { @@ -374,9 +378,6 @@ xfs_setattr( ip->i_d.di_gid = gid; inode->i_gid = gid; } - - xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); - timeflags |= XFS_ICHGTIME_CHG; } /* @@ -393,51 +394,37 @@ xfs_setattr( inode->i_mode &= S_IFMT; inode->i_mode |= mode & ~S_IFMT; - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - timeflags |= XFS_ICHGTIME_CHG; } /* * Change file access or modified times. */ - if (mask & (ATTR_ATIME|ATTR_MTIME)) { - if (mask & ATTR_ATIME) { - inode->i_atime = iattr->ia_atime; - ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; - ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; - ip->i_update_core = 1; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - timeflags &= ~XFS_ICHGTIME_MOD; - timeflags |= XFS_ICHGTIME_CHG; - } - if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET))) - xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); + if (mask & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; + ip->i_update_core = 1; } - - /* - * Change file inode change time only if ATTR_CTIME set - * AND we have been called by a DMI function. - */ - - if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) { + if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; ip->i_update_core = 1; - timeflags &= ~XFS_ICHGTIME_CHG; + } + if (mask & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + ip->i_update_core = 1; } /* - * Send out timestamp changes that need to be set to the - * current time. Not done when called by a DMI function. + * And finally, log the inode core if any attribute in it + * has been changed. */ - if (timeflags && !(flags & XFS_ATTR_DMI)) - xfs_ichgtime(ip, timeflags); + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE| + ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(xs_ig_attrchg); @@ -452,12 +439,10 @@ xfs_setattr( * mix so this probably isn't worth the trouble to optimize. */ code = 0; - if (tp) { - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(tp); + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); - code = xfs_trans_commit(tp, commit_flags); - } + code = xfs_trans_commit(tp, commit_flags); xfs_iunlock(ip, lock_flags); |