From b084f598df36b62dfae83c10ed17f0b66b50f442 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 31 May 2011 12:24:58 -0400 Subject: nfsd: fix dependency of nfsd on auth_rpcgss Commit b0b0c0a26e84 "nfsd: add proc file listing kernel's gss_krb5 enctypes" added an nunnecessary dependency of nfsd on the auth_rpcgss module. It's a little ad hoc, but since the only piece of information nfsd needs from rpcsec_gss_krb5 is a single static string, one solution is just to share it with an include file. Cc: stable@kernel.org Reported-by: Michael Guntsche Cc: Kevin Coffman Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1f5eae4..2b1449d 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "idmap.h" #include "nfsd.h" @@ -189,18 +190,10 @@ static struct file_operations export_features_operations = { .release = single_release, }; -#ifdef CONFIG_SUNRPC_GSS +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) static int supported_enctypes_show(struct seq_file *m, void *v) { - struct gss_api_mech *k5mech; - - k5mech = gss_mech_get_by_name("krb5"); - if (k5mech == NULL) - goto out; - if (k5mech->gm_upcall_enctypes != NULL) - seq_printf(m, k5mech->gm_upcall_enctypes); - gss_mech_put(k5mech); -out: + seq_printf(m, KRB5_SUPPORTED_ENCTYPES); return 0; } @@ -215,7 +208,7 @@ static struct file_operations supported_enctypes_ops = { .llseek = seq_lseek, .release = single_release, }; -#endif /* CONFIG_SUNRPC_GSS */ +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); @@ -1427,9 +1420,9 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, -#ifdef CONFIG_SUNRPC_GSS +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, -#endif /* CONFIG_SUNRPC_GSS */ +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, -- cgit v1.1 From be1f4084b4824301e640e81d63b6275cd99ee6a1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 6 Jun 2011 11:22:17 -0700 Subject: nfsd: v4 support requires CRYPTO nfsd V4 support uses crypto interfaces, so select CRYPTO to fix build errors in 2.6.39: ERROR: "crypto_destroy_tfm" [fs/nfsd/nfsd.ko] undefined! ERROR: "crypto_alloc_base" [fs/nfsd/nfsd.ko] undefined! Reported-by: Wakko Warner Signed-off-by: Randy Dunlap Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 18b3e89..fbb2a5e 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -82,6 +82,7 @@ config NFSD_V4 select NFSD_V3 select FS_POSIX_ACL select SUNRPC_GSS + select CRYPTO help This option enables support in your system's NFS server for version 4 of the NFS protocol (RFC 3530). -- cgit v1.1 From 7d751f6f8c679f51b73d01a1b5269347a929004c Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Fri, 3 Jun 2011 12:21:23 -0400 Subject: nfsd: link returns nfserr_delay when breaking lease fix for commit 4795bb37effb7b8fe77e2d2034545d062d3788a8, nfsd: break lease on unlink, link, and rename if the LINK operation breaks a delegation, it returns NFS4ERR_NOENT (which is not a valid error in rfc 5661) instead of NFS4ERR_DELAY. the return value of nfsd_break_lease() in nfsd_link() must be converted from host_err to err Signed-off-by: Casey Bodley Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d571827..f3fb61b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1660,8 +1660,10 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (!dold->d_inode) goto out_drop_write; host_err = nfsd_break_lease(dold->d_inode); - if (host_err) + if (host_err) { + err = nfserrno(host_err); goto out_drop_write; + } host_err = vfs_link(dold, dirp, dnew); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); -- cgit v1.1 From ff78fca2a03c08436535d3f7152a30752d8131d1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jun 2011 09:42:17 -0400 Subject: fix leak in proc_set_super() set_anon_super() can fail... Signed-off-by: Al Viro --- fs/proc/root.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/root.c b/fs/proc/root.c index a9000e9..d6c3b41 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -28,11 +28,12 @@ static int proc_test_super(struct super_block *sb, void *data) static int proc_set_super(struct super_block *sb, void *data) { - struct pid_namespace *ns; - - ns = (struct pid_namespace *)data; - sb->s_fs_info = get_pid_ns(ns); - return set_anon_super(sb, NULL); + int err = set_anon_super(sb, NULL); + if (!err) { + struct pid_namespace *ns = (struct pid_namespace *)data; + sb->s_fs_info = get_pid_ns(ns); + } + return err; } static struct dentry *proc_mount(struct file_system_type *fs_type, -- cgit v1.1 From b1c27ab3f93daede979f804afc38b189c2f17c60 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jun 2011 10:07:03 -0400 Subject: ubifs: split allocation of ubifs_info into a separate function preparation to ubifs sget() race fixes Signed-off-by: Al Viro --- fs/ubifs/super.c | 87 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b5aeb5a..ddc3b02 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1971,6 +1971,53 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) return ERR_PTR(-EINVAL); } +static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) +{ + struct ubifs_info *c; + + c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL); + if (c) { + spin_lock_init(&c->cnt_lock); + spin_lock_init(&c->cs_lock); + spin_lock_init(&c->buds_lock); + spin_lock_init(&c->space_lock); + spin_lock_init(&c->orphan_lock); + init_rwsem(&c->commit_sem); + mutex_init(&c->lp_mutex); + mutex_init(&c->tnc_mutex); + mutex_init(&c->log_mutex); + mutex_init(&c->mst_mutex); + mutex_init(&c->umount_mutex); + mutex_init(&c->bu_mutex); + mutex_init(&c->write_reserve_mutex); + init_waitqueue_head(&c->cmt_wq); + c->buds = RB_ROOT; + c->old_idx = RB_ROOT; + c->size_tree = RB_ROOT; + c->orph_tree = RB_ROOT; + INIT_LIST_HEAD(&c->infos_list); + INIT_LIST_HEAD(&c->idx_gc); + INIT_LIST_HEAD(&c->replay_list); + INIT_LIST_HEAD(&c->replay_buds); + INIT_LIST_HEAD(&c->uncat_list); + INIT_LIST_HEAD(&c->empty_list); + INIT_LIST_HEAD(&c->freeable_list); + INIT_LIST_HEAD(&c->frdi_idx_list); + INIT_LIST_HEAD(&c->unclean_leb_list); + INIT_LIST_HEAD(&c->old_buds); + INIT_LIST_HEAD(&c->orph_list); + INIT_LIST_HEAD(&c->orph_new); + c->no_chk_data_crc = 1; + + c->highest_inum = UBIFS_FIRST_INO; + c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; + + ubi_get_volume_info(ubi, &c->vi); + ubi_get_device_info(c->vi.ubi_num, &c->di); + } + return c; +} + static int ubifs_fill_super(struct super_block *sb, void *data, int silent) { struct ubi_volume_desc *ubi = sb->s_fs_info; @@ -1978,49 +2025,11 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) struct inode *root; int err; - c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL); + c = alloc_ubifs_info(ubi); if (!c) return -ENOMEM; - spin_lock_init(&c->cnt_lock); - spin_lock_init(&c->cs_lock); - spin_lock_init(&c->buds_lock); - spin_lock_init(&c->space_lock); - spin_lock_init(&c->orphan_lock); - init_rwsem(&c->commit_sem); - mutex_init(&c->lp_mutex); - mutex_init(&c->tnc_mutex); - mutex_init(&c->log_mutex); - mutex_init(&c->mst_mutex); - mutex_init(&c->umount_mutex); - mutex_init(&c->bu_mutex); - mutex_init(&c->write_reserve_mutex); - init_waitqueue_head(&c->cmt_wq); - c->buds = RB_ROOT; - c->old_idx = RB_ROOT; - c->size_tree = RB_ROOT; - c->orph_tree = RB_ROOT; - INIT_LIST_HEAD(&c->infos_list); - INIT_LIST_HEAD(&c->idx_gc); - INIT_LIST_HEAD(&c->replay_list); - INIT_LIST_HEAD(&c->replay_buds); - INIT_LIST_HEAD(&c->uncat_list); - INIT_LIST_HEAD(&c->empty_list); - INIT_LIST_HEAD(&c->freeable_list); - INIT_LIST_HEAD(&c->frdi_idx_list); - INIT_LIST_HEAD(&c->unclean_leb_list); - INIT_LIST_HEAD(&c->old_buds); - INIT_LIST_HEAD(&c->orph_list); - INIT_LIST_HEAD(&c->orph_new); - c->no_chk_data_crc = 1; - c->vfs_sb = sb; - c->highest_inum = UBIFS_FIRST_INO; - c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; - - ubi_get_volume_info(ubi, &c->vi); - ubi_get_device_info(c->vi.ubi_num, &c->di); - /* Re-open the UBI device in read-write mode */ c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); if (IS_ERR(c->ubi)) { -- cgit v1.1 From d251ed271d528afb407cc2ede30923e34cb209a5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jun 2011 10:24:33 -0400 Subject: ubifs: fix sget races * allocate ubifs_info in ->mount(), fill it enough for sb_test() and set ->s_fs_info to it in set() callback passed to sget(). * do *not* free it in ->put_super(); do that in ->kill_sb() after we'd done kill_anon_super(). * don't free it in ubifs_fill_super() either - deactivate_locked_super() done by caller when ubifs_fill_super() returns an error will take care of that sucker. * get rid of kludge with passing ubi to ubifs_fill_super() in ->s_fs_info; we only need it in alloc_ubifs_info(), so ubifs_fill_super() will need only ubifs_info. Which it will find in ->s_fs_info just fine, no need to reassign anything... As the result, sb_test() becomes safe to apply to all superblocks that can be found by sget() (and a kludge with temporary use of ->s_fs_info to store a pointer to very different structure goes away). Signed-off-by: Al Viro --- fs/ubifs/super.c | 54 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ddc3b02..8c892c2 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1848,7 +1848,6 @@ static void ubifs_put_super(struct super_block *sb) bdi_destroy(&c->bdi); ubi_close_volume(c->ubi); mutex_unlock(&c->umount_mutex); - kfree(c); } static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) @@ -2020,21 +2019,16 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) static int ubifs_fill_super(struct super_block *sb, void *data, int silent) { - struct ubi_volume_desc *ubi = sb->s_fs_info; - struct ubifs_info *c; + struct ubifs_info *c = sb->s_fs_info; struct inode *root; int err; - c = alloc_ubifs_info(ubi); - if (!c) - return -ENOMEM; - c->vfs_sb = sb; /* Re-open the UBI device in read-write mode */ c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); if (IS_ERR(c->ubi)) { err = PTR_ERR(c->ubi); - goto out_free; + goto out; } /* @@ -2100,24 +2094,29 @@ out_bdi: bdi_destroy(&c->bdi); out_close: ubi_close_volume(c->ubi); -out_free: - kfree(c); +out: return err; } static int sb_test(struct super_block *sb, void *data) { - dev_t *dev = data; + struct ubifs_info *c1 = data; struct ubifs_info *c = sb->s_fs_info; - return c->vi.cdev == *dev; + return c->vi.cdev == c1->vi.cdev; +} + +static int sb_set(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); } static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, const char *name, void *data) { struct ubi_volume_desc *ubi; - struct ubi_volume_info vi; + struct ubifs_info *c; struct super_block *sb; int err; @@ -2134,19 +2133,24 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, name, (int)PTR_ERR(ubi)); return ERR_CAST(ubi); } - ubi_get_volume_info(ubi, &vi); - dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id); + c = alloc_ubifs_info(ubi); + if (!c) { + err = -ENOMEM; + goto out_close; + } + + dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); - sb = sget(fs_type, &sb_test, &set_anon_super, &vi.cdev); + sb = sget(fs_type, sb_test, sb_set, c); if (IS_ERR(sb)) { err = PTR_ERR(sb); - goto out_close; + kfree(c); } if (sb->s_root) { struct ubifs_info *c1 = sb->s_fs_info; - + kfree(c); /* A new mount point for already mounted UBIFS */ dbg_gen("this ubi volume is already mounted"); if (!!(flags & MS_RDONLY) != c1->ro_mount) { @@ -2155,11 +2159,6 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, } } else { sb->s_flags = flags; - /* - * Pass 'ubi' to 'fill_super()' in sb->s_fs_info where it is - * replaced by 'c'. - */ - sb->s_fs_info = ubi; err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (err) goto out_deact; @@ -2179,11 +2178,18 @@ out_close: return ERR_PTR(err); } +static void kill_ubifs_super(struct super_block *s) +{ + struct ubifs_info *c = s->s_fs_info; + kill_anon_super(s); + kfree(c); +} + static struct file_system_type ubifs_fs_type = { .name = "ubifs", .owner = THIS_MODULE, .mount = ubifs_mount, - .kill_sb = kill_anon_super, + .kill_sb = kill_ubifs_super, }; /* -- cgit v1.1 From dde194a64bb5c3fd05d965775dc92e8a4920a53a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jun 2011 16:01:21 -0400 Subject: afs: fix sget() races, close leak on umount * set ->s_fs_info in set() callback passed to sget() * allocate the thing and set it up enough for afs_test_super() before making it visible * have it freed in ->kill_sb() (current tree simply leaks it) * have ->put_super() leave ->s_fs_info->volume alone; it's too early for dropping it; do that from ->kill_sb() after having called kill_anon_super(). Signed-off-by: Al Viro --- fs/afs/super.c | 73 +++++++++++++++++++++++++--------------------------------- 1 file changed, 32 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/afs/super.c b/fs/afs/super.c index fb240e8..b7d48d7 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -31,8 +31,8 @@ static void afs_i_init_once(void *foo); static struct dentry *afs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); +static void afs_kill_super(struct super_block *sb); static struct inode *afs_alloc_inode(struct super_block *sb); -static void afs_put_super(struct super_block *sb); static void afs_destroy_inode(struct inode *inode); static int afs_statfs(struct dentry *dentry, struct kstatfs *buf); @@ -40,7 +40,7 @@ struct file_system_type afs_fs_type = { .owner = THIS_MODULE, .name = "afs", .mount = afs_mount, - .kill_sb = kill_anon_super, + .kill_sb = afs_kill_super, .fs_flags = 0, }; @@ -50,7 +50,6 @@ static const struct super_operations afs_super_ops = { .drop_inode = afs_drop_inode, .destroy_inode = afs_destroy_inode, .evict_inode = afs_evict_inode, - .put_super = afs_put_super, .show_options = generic_show_options, }; @@ -282,19 +281,25 @@ static int afs_parse_device_name(struct afs_mount_params *params, */ static int afs_test_super(struct super_block *sb, void *data) { - struct afs_mount_params *params = data; + struct afs_super_info *as1 = data; struct afs_super_info *as = sb->s_fs_info; - return as->volume == params->volume; + return as->volume == as1->volume; +} + +static int afs_set_super(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); } /* * fill in the superblock */ -static int afs_fill_super(struct super_block *sb, void *data) +static int afs_fill_super(struct super_block *sb, + struct afs_mount_params *params) { - struct afs_mount_params *params = data; - struct afs_super_info *as = NULL; + struct afs_super_info *as = sb->s_fs_info; struct afs_fid fid; struct dentry *root = NULL; struct inode *inode = NULL; @@ -302,22 +307,11 @@ static int afs_fill_super(struct super_block *sb, void *data) _enter(""); - /* allocate a superblock info record */ - as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); - if (!as) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } - - afs_get_volume(params->volume); - as->volume = params->volume; - /* fill in the superblock */ sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; - sb->s_fs_info = as; sb->s_bdi = &as->volume->bdi; /* allocate the root inode and dentry */ @@ -326,7 +320,7 @@ static int afs_fill_super(struct super_block *sb, void *data) fid.unique = 1; inode = afs_iget(sb, params->key, &fid, NULL, NULL); if (IS_ERR(inode)) - goto error_inode; + return PTR_ERR(inode); if (params->autocell) set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); @@ -342,16 +336,8 @@ static int afs_fill_super(struct super_block *sb, void *data) _leave(" = 0"); return 0; -error_inode: - ret = PTR_ERR(inode); - inode = NULL; error: iput(inode); - afs_put_volume(as->volume); - kfree(as); - - sb->s_fs_info = NULL; - _leave(" = %d", ret); return ret; } @@ -367,6 +353,7 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, struct afs_volume *vol; struct key *key; char *new_opts = kstrdup(options, GFP_KERNEL); + struct afs_super_info *as; int ret; _enter(",,%s,%p", dev_name, options); @@ -399,12 +386,22 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, ret = PTR_ERR(vol); goto error; } - params.volume = vol; + + /* allocate a superblock info record */ + as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); + if (!as) { + ret = -ENOMEM; + afs_put_volume(vol); + goto error; + } + as->volume = vol; /* allocate a deviceless superblock */ - sb = sget(fs_type, afs_test_super, set_anon_super, ¶ms); + sb = sget(fs_type, afs_test_super, afs_set_super, as); if (IS_ERR(sb)) { ret = PTR_ERR(sb); + afs_put_volume(vol); + kfree(as); goto error; } @@ -422,16 +419,16 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, } else { _debug("reuse"); ASSERTCMP(sb->s_flags, &, MS_ACTIVE); + afs_put_volume(vol); + kfree(as); } - afs_put_volume(params.volume); afs_put_cell(params.cell); kfree(new_opts); _leave(" = 0 [%p]", sb); return dget(sb->s_root); error: - afs_put_volume(params.volume); afs_put_cell(params.cell); key_put(params.key); kfree(new_opts); @@ -439,18 +436,12 @@ error: return ERR_PTR(ret); } -/* - * finish the unmounting process on the superblock - */ -static void afs_put_super(struct super_block *sb) +static void afs_kill_super(struct super_block *sb) { struct afs_super_info *as = sb->s_fs_info; - - _enter(""); - + kill_anon_super(sb); afs_put_volume(as->volume); - - _leave(""); + kfree(as); } /* -- cgit v1.1 From a685e08987d1edf1995b76511d4c98ea0e905377 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 8 Jun 2011 21:13:01 -0400 Subject: Delay struct net freeing while there's a sysfs instance refering to it * new refcount in struct net, controlling actual freeing of the memory * new method in kobj_ns_type_operations (->drop_ns()) * ->current_ns() semantics change - it's supposed to be followed by corresponding ->drop_ns(). For struct net in case of CONFIG_NET_NS it bumps the new refcount; net_drop_ns() decrements it and calls net_free() if the last reference has been dropped. Method renamed to ->grab_current_ns(). * old net_free() callers call net_drop_ns() instead. * sysfs_exit_ns() is gone, along with a large part of callchain leading to it; now that the references stored in ->ns[...] stay valid we do not need to hunt them down and replace them with NULL. That fixes problems in sysfs_lookup() and sysfs_readdir(), along with getting rid of sb->s_instances abuse. Note that struct net *shutdown* logics has not changed - net_cleanup() is called exactly when it used to be called. The only thing postponed by having a sysfs instance refering to that struct net is actual freeing of memory occupied by struct net. Signed-off-by: Al Viro --- fs/sysfs/mount.c | 37 +++++++++++-------------------------- fs/sysfs/sysfs.h | 2 +- 2 files changed, 12 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 2668957..e34f0d9 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -95,6 +95,14 @@ static int sysfs_set_super(struct super_block *sb, void *data) return error; } +static void free_sysfs_super_info(struct sysfs_super_info *info) +{ + int type; + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) + kobj_ns_drop(type, info->ns[type]); + kfree(info); +} + static struct dentry *sysfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -108,11 +116,11 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, return ERR_PTR(-ENOMEM); for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - info->ns[type] = kobj_ns_current(type); + info->ns[type] = kobj_ns_grab_current(type); sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); if (IS_ERR(sb) || sb->s_fs_info != info) - kfree(info); + free_sysfs_super_info(info); if (IS_ERR(sb)) return ERR_CAST(sb); if (!sb->s_root) { @@ -131,12 +139,11 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, static void sysfs_kill_sb(struct super_block *sb) { struct sysfs_super_info *info = sysfs_info(sb); - /* Remove the superblock from fs_supers/s_instances * so we can't find it, before freeing sysfs_super_info. */ kill_anon_super(sb); - kfree(info); + free_sysfs_super_info(info); } static struct file_system_type sysfs_fs_type = { @@ -145,28 +152,6 @@ static struct file_system_type sysfs_fs_type = { .kill_sb = sysfs_kill_sb, }; -void sysfs_exit_ns(enum kobj_ns_type type, const void *ns) -{ - struct super_block *sb; - - mutex_lock(&sysfs_mutex); - spin_lock(&sb_lock); - list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) { - struct sysfs_super_info *info = sysfs_info(sb); - /* - * If we see a superblock on the fs_supers/s_instances - * list the unmount has not completed and sb->s_fs_info - * points to a valid struct sysfs_super_info. - */ - /* Ignore superblocks with the wrong ns */ - if (info->ns[type] != ns) - continue; - info->ns[type] = NULL; - } - spin_unlock(&sb_lock); - mutex_unlock(&sysfs_mutex); -} - int __init sysfs_init(void) { int err = -ENOMEM; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 3d28af31..2ed2404 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -136,7 +136,7 @@ struct sysfs_addrm_cxt { * instance). */ struct sysfs_super_info { - const void *ns[KOBJ_NS_TYPES]; + void *ns[KOBJ_NS_TYPES]; }; #define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) extern struct sysfs_dirent sysfs_root; -- cgit v1.1 From ac08aedfa5d3de0dcb3825b598d16c2e57991f54 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 13 Jun 2011 11:28:50 -0400 Subject: Btrfs: check the return value from set_anon_super Al Viro noticed we weren't checking for set_anon_super failures. This adds the required checks. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9f68c68..20c111b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1312,7 +1312,9 @@ again: spin_lock_init(&root->cache_lock); init_waitqueue_head(&root->cache_wait); - set_anon_super(&root->anon_super, NULL); + ret = set_anon_super(&root->anon_super, NULL); + if (ret) + goto fail; if (btrfs_root_refs(&root->root_item) == 0) { ret = -ENOENT; -- cgit v1.1 From f4c44016218a6fce357715b9bbabbbbe1f69853c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 13 Jun 2011 11:30:47 -0400 Subject: Btrfs: drop the delalloc_bytes check in shrink_delalloc Even when delalloc_bytes is zero, we may need to sleep while waiting for delalloc space. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b42efc2..1f61bf5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3314,10 +3314,6 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, if (reserved == 0) return 0; - /* nothing to shrink - nothing to reclaim */ - if (root->fs_info->delalloc_bytes == 0) - return 0; - max_reclaim = min(reserved, to_reclaim); while (loops < 1024) { -- cgit v1.1 From 1123d93963cbd2546449d4d9f0c568e323cb0ac6 Mon Sep 17 00:00:00 2001 From: Max Asbock Date: Mon, 13 Jun 2011 10:18:32 -0700 Subject: timerfd: Fix wakeup of processes when timer is cancelled on clock change Currently processes waiting with poll on cancelable timerfd timers are not woken up when the timers are canceled. When the system time is set the clock_was_set() function calls timerfd_clock_was_set() to cancel and wake up processes waiting on potential cancelable timerfd timers. However the wake up currently has no effect because in the case of timerfd_read it is dependent on ctx->ticks not being 0. timerfd_poll also requires ctx->ticks being non zero. As a consequence processes waiting on cancelable timers only get woken up when the timers expire. This patch fixes this by incrementing ctx->ticks before calling wake_up. Signed-off-by: Max Asbock Cc: kay.sievers@vrfy.org Cc: virtuoso@slind.org Cc: johnstul Link: http://lkml.kernel.org/r/1307985512.4710.41.camel@w-amax.beaverton.ibm.com Signed-off-by: Thomas Gleixner --- fs/timerfd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/timerfd.c b/fs/timerfd.c index f67acbd..dffeb37 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -61,7 +61,9 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) /* * Called when the clock was set to cancel the timers in the cancel - * list. + * list. This will wake up processes waiting on these timers. The + * wake-up requires ctx->ticks to be non zero, therefore we increment + * it before calling wake_up_locked(). */ void timerfd_clock_was_set(void) { @@ -76,6 +78,7 @@ void timerfd_clock_was_set(void) spin_lock_irqsave(&ctx->wqh.lock, flags); if (ctx->moffs.tv64 != moffs.tv64) { ctx->moffs.tv64 = KTIME_MAX; + ctx->ticks++; wake_up_locked(&ctx->wqh); } spin_unlock_irqrestore(&ctx->wqh.lock, flags); -- cgit v1.1 From c46a131c0c0f4c2457e6b1e430c578a5cb057334 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 5 Jun 2011 11:12:31 +0000 Subject: xfs: fix ->mknod() return value on xfs_get_acl() failure ->mknod() should return negative on errors and PTR_ERR() gives already negative value... Signed-off-by: Al Viro Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_iops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index dd21784..d44d92c 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -182,7 +182,7 @@ xfs_vn_mknod( if (IS_POSIXACL(dir)) { default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(default_acl)) - return -PTR_ERR(default_acl); + return PTR_ERR(default_acl); if (!default_acl) mode &= ~current_umask(); -- cgit v1.1 From 71d7aed014457147e8f71a843d5fbf03235e4a85 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 14 Jun 2011 14:24:32 -0400 Subject: Btrfs: fix path leakage on subvol deletion The delayed ref patch accidently removed the btrfs_free_path in btrfs_unlink_subvol, this puts it back and means we don't leak a path. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c15636b..5813dec 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3076,6 +3076,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, dir); BUG_ON(ret); + btrfs_free_path(path); return 0; } -- cgit v1.1 From 8351583e3f6e430ce8f71913909a96ad5cc6a2f6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 14 Jun 2011 15:16:14 -0400 Subject: Btrfs: protect the pending_snapshots list with trans_lock Currently there is nothing protecting the pending_snapshots list on the transaction. We only hold the directory mutex that we are snapshotting and a read lock on the subvol_sem, so we could race with somebody else creating a snapshot in a different directory and end up with list corruption. So protect this list with the trans_lock. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b793d11..a3c4751 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -482,8 +482,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); BUG_ON(ret); + spin_lock(&root->fs_info->trans_lock); list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); + spin_unlock(&root->fs_info->trans_lock); if (async_transid) { *async_transid = trans->transid; ret = btrfs_commit_transaction_async(trans, -- cgit v1.1 From ed0ca14021e5ae3147602128641aa7f742ab227c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 14 Jun 2011 16:22:15 -0400 Subject: Btrfs: set no_trans_join after trying to expand the transaction We can lockup if we try to allow new writers join the transaction and we have flushoncommit set or have a pending snapshot. This is because we set no_trans_join and then loop around and try to wait for ordered extents again. The problem is the ordered endio stuff needs to join the transaction, which it can't do because no_trans_join is set. So instead wait until after this loop to set no_trans_join and then make sure to wait for num_writers == 1 in case anybody got started in between us exiting the loop and setting no_trans_join. This could easily be reproduced by mounting -o flushoncommit and running xfstest 13. It cannot be reproduced with this patch. Thanks, Reported-by: Jim Schutt Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2b3590b..5669559 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1241,12 +1241,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, schedule_timeout(1); finish_wait(&cur_trans->writer_wait, &wait); - spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 1; - spin_unlock(&root->fs_info->trans_lock); } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); + /* + * Ok now we need to make sure to block out any other joins while we + * commit the transaction. We could have started a join before setting + * no_join so make sure to wait for num_writers to == 1 again. + */ + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + ret = create_pending_snapshots(trans, root->fs_info); BUG_ON(ret); -- cgit v1.1 From 793925334f32e9026c22baee5c3c340f47d4ef7e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 Jun 2011 12:47:04 -0700 Subject: proc: Fix Oops on stat of /proc//ns/net Don't call iput with the inode half setup to be a namespace filedescriptor. Instead rearrange the code so that we don't initialize ei->ns_ops until after I ns_ops->get succeeds, preventing us from invoking ns_ops->put when ns_ops->get failed. Reported-by: Ingo Saitz Signed-off-by: Eric W. Biederman --- fs/proc/namespaces.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 781dec5..be177f7 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -38,18 +38,21 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; struct dentry *error = ERR_PTR(-ENOENT); + void *ns; inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) goto out; + ns = ns_ops->get(task); + if (!ns) + goto out_iput; + ei = PROC_I(inode); inode->i_mode = S_IFREG|S_IRUSR; inode->i_fop = &ns_file_operations; ei->ns_ops = ns_ops; - ei->ns = ns_ops->get(task); - if (!ei->ns) - goto out_iput; + ei->ns = ns; dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); -- cgit v1.1 From 50338b889dc504c69e0cb316ac92d1b9e51f3c8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Thu, 16 Jun 2011 00:06:14 +0300 Subject: fix wrong iput on d_inode introduced by e6bc45d65d MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Git bisection shows that commit e6bc45d65df8599fdbae73be9cec4ceed274db53 causes BUG_ONs under high I/O load: kernel BUG at fs/inode.c:1368! [ 2862.501007] Call Trace: [ 2862.501007] [] d_kill+0xf8/0x140 [ 2862.501007] [] dput+0xc9/0x190 [ 2862.501007] [] fput+0x15f/0x210 [ 2862.501007] [] filp_close+0x61/0x90 [ 2862.501007] [] sys_close+0xb1/0x110 [ 2862.501007] [] system_call_fastpath+0x16/0x1b A reliable way to reproduce this bug is: Login to KDE, run 'rsnapshot sync', and apt-get install openjdk-6-jdk, and apt-get remove openjdk-6-jdk. The buggy part of the patch is this: struct inode *inode = NULL; ..... - if (nd.last.name[nd.last.len]) - goto slashes; inode = dentry->d_inode; - if (inode) - ihold(inode); + if (nd.last.name[nd.last.len] || !inode) + goto slashes; + ihold(inode) ... if (inode) iput(inode); /* truncate the inode here */ If nd.last.name[nd.last.len] is nonzero (and thus goto slashes branch is taken), and dentry->d_inode is non-NULL, then this code now does an additional iput on the inode, which is wrong. Fix this by only setting the inode variable if nd.last.name[nd.last.len] is 0. Reference: https://lkml.org/lkml/2011/6/15/50 Reported-by: Norbert Preining Reported-by: Török Edwin Cc: "Theodore Ts'o" Cc: Al Viro Signed-off-by: Török Edwin Signed-off-by: Al Viro --- fs/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 9802345..6301963 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2713,8 +2713,10 @@ static long do_unlinkat(int dfd, const char __user *pathname) error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ + if (nd.last.name[nd.last.len]) + goto slashes; inode = dentry->d_inode; - if (nd.last.name[nd.last.len] || !inode) + if (!inode) goto slashes; ihold(inode); error = mnt_want_write(nd.path.mnt); -- cgit v1.1 From 8aef18845266f5c05904c610088f2d1ed58f6be3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 16 Jun 2011 15:10:06 +0100 Subject: VFS: Fix vfsmount overput on simultaneous automount [Kudos to dhowells for tracking that crap down] If two processes attempt to cause automounting on the same mountpoint at the same time, the vfsmount holding the mountpoint will be left with one too few references on it, causing a BUG when the kernel tries to clean up. The problem is that lock_mount() drops the caller's reference to the mountpoint's vfsmount in the case where it finds something already mounted on the mountpoint as it transits to the mounted filesystem and replaces path->mnt with the new mountpoint vfsmount. During a pathwalk, however, we don't take a reference on the vfsmount if it is the same as the one in the nameidata struct, but do_add_mount() doesn't know this. The fix is to make sure we have a ref on the vfsmount of the mountpoint before calling do_add_mount(). However, if lock_mount() doesn't transit, we're then left with an extra ref on the mountpoint vfsmount which needs releasing. We can handle that in follow_managed() by not making assumptions about what we can and what we cannot get from lookup_mnt() as the current code does. The callers of follow_managed() expect that reference to path->mnt will be grabbed iff path->mnt has been changed. follow_managed() and follow_automount() keep track of whether such reference has been grabbed and assume that it'll happen in those and only those cases that'll have us return with changed path->mnt. That assumption is almost correct - it breaks in case of racing automounts and in even harder to hit race between following a mountpoint and a couple of mount --move. The thing is, we don't need to make that assumption at all - after the end of loop in follow_manage() we can check if path->mnt has ended up unchanged and do mntput() if needed. The BUG can be reproduced with the following test program: #include #include #include #include #include int main(int argc, char **argv) { int pid, ws; struct stat buf; pid = fork(); stat(argv[1], &buf); if (pid > 0) wait(&ws); return 0; } and the following procedure: (1) Mount an NFS volume that on the server has something else mounted on a subdirectory. For instance, I can mount / from my server: mount warthog:/ /mnt -t nfs4 -r On the server /data has another filesystem mounted on it, so NFS will see a change in FSID as it walks down the path, and will mark /mnt/data as being a mountpoint. This will cause the automount code to be triggered. !!! Do not look inside the mounted fs at this point !!! (2) Run the above program on a file within the submount to generate two simultaneous automount requests: /tmp/forkstat /mnt/data/testfile (3) Unmount the automounted submount: umount /mnt/data (4) Unmount the original mount: umount /mnt At this point the kernel should throw a BUG with something like the following: BUG: Dentry ffff880032e3c5c0{i=2,n=} still in use (1) [unmount of nfs4 0:12] Note that the bug appears on the root dentry of the original mount, not the mountpoint and not the submount because sys_umount() hasn't got to its final mntput_no_expire() yet, but this isn't so obvious from the call trace: [] shrink_dcache_for_umount+0x69/0x82 [] generic_shutdown_super+0x37/0x15b [] ? nfs_super_return_all_delegations+0x2e/0x1b1 [nfs] [] kill_anon_super+0x1d/0x7e [] nfs4_kill_super+0x60/0xb6 [nfs] [] deactivate_locked_super+0x34/0x83 [] deactivate_super+0x6f/0x7b [] mntput_no_expire+0x18d/0x199 [] mntput+0x3b/0x44 [] release_mounts+0xa2/0xbf [] sys_umount+0x47a/0x4ba [] ? trace_hardirqs_on_caller+0x1fd/0x22f [] system_call_fastpath+0x16/0x1b as do_umount() is inlined. However, you can see release_mounts() in there. Note also that it may be necessary to have multiple CPU cores to be able to trigger this bug. Tested-by: Jeff Layton Tested-by: Ian Kent Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/namei.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 6301963..9e425e7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -812,6 +812,11 @@ static int follow_automount(struct path *path, unsigned flags, if (!mnt) /* mount collision */ return 0; + if (!*need_mntput) { + /* lock_mount() may release path->mnt on error */ + mntget(path->mnt); + *need_mntput = true; + } err = finish_automount(mnt, path); switch (err) { @@ -819,12 +824,9 @@ static int follow_automount(struct path *path, unsigned flags, /* Someone else made a mount here whilst we were busy */ return 0; case 0: - dput(path->dentry); - if (*need_mntput) - mntput(path->mnt); + path_put(path); path->mnt = mnt; path->dentry = dget(mnt->mnt_root); - *need_mntput = true; return 0; default: return err; @@ -844,9 +846,10 @@ static int follow_automount(struct path *path, unsigned flags, */ static int follow_managed(struct path *path, unsigned flags) { + struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ unsigned managed; bool need_mntput = false; - int ret; + int ret = 0; /* Given that we're not holding a lock here, we retain the value in a * local variable for each dentry as we look at it so that we don't see @@ -861,7 +864,7 @@ static int follow_managed(struct path *path, unsigned flags) BUG_ON(!path->dentry->d_op->d_manage); ret = path->dentry->d_op->d_manage(path->dentry, false); if (ret < 0) - return ret == -EISDIR ? 0 : ret; + break; } /* Transit to a mounted filesystem. */ @@ -887,14 +890,19 @@ static int follow_managed(struct path *path, unsigned flags) if (managed & DCACHE_NEED_AUTOMOUNT) { ret = follow_automount(path, flags, &need_mntput); if (ret < 0) - return ret == -EISDIR ? 0 : ret; + break; continue; } /* We didn't change the current path point */ break; } - return 0; + + if (need_mntput && path->mnt == mnt) + mntput(path->mnt); + if (ret == -EISDIR) + ret = 0; + return ret; } int follow_down_one(struct path *path) -- cgit v1.1 From 5e7f23373bf9a853e9256e81e86724cdd0a33c29 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 13 Jun 2011 22:31:12 +0100 Subject: afs: afs_fill_page reads too much, or wrong data afs_fill_page should read the page that is about to be written but the current implementation has a number of issues. If we aren't extending the file we always read PAGE_CACHE_SIZE at offset 0. If we are extending the file we try to read the entire file. Change afs_fill_page to read PAGE_CACHE_SIZE at the right offset, clamped to i_size. While here, avoid calling afs_fill_page when we are doing a PAGE_CACHE_SIZE write. Signed-off-by: Anton Blanchard Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/afs/write.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/afs/write.c b/fs/afs/write.c index 789b3af..b806285 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -84,23 +84,21 @@ void afs_put_writeback(struct afs_writeback *wb) * partly or wholly fill a page that's under preparation for writing */ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, - loff_t pos, unsigned len, struct page *page) + loff_t pos, struct page *page) { loff_t i_size; - unsigned eof; int ret; + int len; - _enter(",,%llu,%u", (unsigned long long)pos, len); - - ASSERTCMP(len, <=, PAGE_CACHE_SIZE); + _enter(",,%llu", (unsigned long long)pos); i_size = i_size_read(&vnode->vfs_inode); - if (pos + len > i_size) - eof = i_size; + if (pos + PAGE_CACHE_SIZE > i_size) + len = i_size - pos; else - eof = PAGE_CACHE_SIZE; + len = PAGE_CACHE_SIZE; - ret = afs_vnode_fetch_data(vnode, key, 0, eof, page); + ret = afs_vnode_fetch_data(vnode, key, pos, len, page); if (ret < 0) { if (ret == -ENOENT) { _debug("got NOENT from server" @@ -153,9 +151,8 @@ int afs_write_begin(struct file *file, struct address_space *mapping, *pagep = page; /* page won't leak in error case: it eventually gets cleaned off LRU */ - if (!PageUptodate(page)) { - _debug("not up to date"); - ret = afs_fill_page(vnode, key, pos, len, page); + if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) { + ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page); if (ret < 0) { kfree(candidate); _leave(" = %d [prep]", ret); -- cgit v1.1 From f9f07b6c1372b1436aa6b45333445b443ffd8c95 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 14 Jun 2011 00:58:27 +0200 Subject: vfs: Fix data corruption after failed write in __block_write_begin() I've got a report of a file corruption from fsxlinux on ext3. The important operations to the page were: mapwrite to a hole partial write to the page read - found the page zeroed from the end of the normal write The culprit seems to be that if get_block() fails in __block_write_begin() (e.g. transient ENOSPC in ext3), the function does ClearPageUptodate(page). Thus when we retry the write, the logic in __block_write_begin() thinks zeroing of the page is needed and overwrites old data. In fact, I don't see why we should ever need to zero the uptodate bit here - either the page was uptodate when we entered __block_write_begin() and it should stay so when we leave it, or it was not uptodate and noone had right to set it uptodate during __block_write_begin() so it remains !uptodate when we leave as well. So just remove clearing of the bit. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/buffer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 49c9aad..1a80b04 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1902,10 +1902,8 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, if (!buffer_uptodate(*wait_bh)) err = -EIO; } - if (unlikely(err)) { + if (unlikely(err)) page_zero_new_buffers(page, from, to); - ClearPageUptodate(page); - } return err; } EXPORT_SYMBOL(__block_write_begin); -- cgit v1.1 From 2e41ae225f742ded5b7d9847cd8bd605f27daba8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jun 2011 00:38:44 +0100 Subject: AFS: Set s_id in the superblock to the volume name Set s_id in the superblock to the name of the AFS volume that this superblock corresponds to. Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/afs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/afs/super.c b/fs/afs/super.c index b7d48d7..356dcf0 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -313,6 +313,7 @@ static int afs_fill_super(struct super_block *sb, sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; sb->s_bdi = &as->volume->bdi; + strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id)); /* allocate the root inode and dentry */ fid.vid = as->volume->vid; -- cgit v1.1 From d6e43f751f252c68ca69fa6d18665d88d69ef8b7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jun 2011 00:45:44 +0100 Subject: AFS: Use i_generation not i_version for the vnode uniquifier Store the AFS vnode uniquifier in the i_generation field, not the i_version field of the inode struct. i_version can then be given the AFS data version number. Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/afs/dir.c | 8 ++++---- fs/afs/fsclient.c | 3 ++- fs/afs/inode.c | 10 +++++----- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 20c106f..1b0b195 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -584,11 +584,11 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, success: d_add(dentry, inode); - _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }", + _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }", fid.vnode, fid.unique, dentry->d_inode->i_ino, - (unsigned long long)dentry->d_inode->i_version); + dentry->d_inode->i_generation); return NULL; } @@ -671,10 +671,10 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) * been deleted and replaced, and the original vnode ID has * been reused */ if (fid.unique != vnode->fid.unique) { - _debug("%s: file deleted (uq %u -> %u I:%llu)", + _debug("%s: file deleted (uq %u -> %u I:%u)", dentry->d_name.name, fid.unique, vnode->fid.unique, - (unsigned long long)dentry->d_inode->i_version); + dentry->d_inode->i_generation); spin_lock(&vnode->lock); set_bit(AFS_VNODE_DELETED, &vnode->flags); spin_unlock(&vnode->lock); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 4bd0218..346e328 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -89,7 +89,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, i_size_write(&vnode->vfs_inode, size); vnode->vfs_inode.i_uid = status->owner; vnode->vfs_inode.i_gid = status->group; - vnode->vfs_inode.i_version = vnode->fid.unique; + vnode->vfs_inode.i_generation = vnode->fid.unique; vnode->vfs_inode.i_nlink = status->nlink; mode = vnode->vfs_inode.i_mode; @@ -102,6 +102,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server; vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime; vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime; + vnode->vfs_inode.i_version = data_version; } expected_version = status->data_version; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index db66c52..0fdab6e 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -75,7 +75,8 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_ctime.tv_nsec = 0; inode->i_atime = inode->i_mtime = inode->i_ctime; inode->i_blocks = 0; - inode->i_version = vnode->fid.unique; + inode->i_generation = vnode->fid.unique; + inode->i_version = vnode->status.data_version; inode->i_mapping->a_ops = &afs_fs_aops; /* check to see whether a symbolic link is really a mountpoint */ @@ -100,7 +101,7 @@ static int afs_iget5_test(struct inode *inode, void *opaque) struct afs_iget_data *data = opaque; return inode->i_ino == data->fid.vnode && - inode->i_version == data->fid.unique; + inode->i_generation == data->fid.unique; } /* @@ -122,7 +123,7 @@ static int afs_iget5_set(struct inode *inode, void *opaque) struct afs_vnode *vnode = AFS_FS_I(inode); inode->i_ino = data->fid.vnode; - inode->i_version = data->fid.unique; + inode->i_generation = data->fid.unique; vnode->fid = data->fid; vnode->volume = data->volume; @@ -380,8 +381,7 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry, inode = dentry->d_inode; - _enter("{ ino=%lu v=%llu }", inode->i_ino, - (unsigned long long)inode->i_version); + _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); generic_fillattr(inode, stat); return 0; -- cgit v1.1 From a27a263bae072a499acc77b632238a6dacccf888 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Jun 2011 12:02:23 +0000 Subject: xfs: make log devices with write back caches work There's no reason not to support cache flushing on external log devices. The only thing this really requires is flushing the data device first both in fsync and log commits. A side effect is that we also have to remove the barrier write test during mount, which has been superflous since the new FLUSH+FUA code anyway. Also use the chance to flush the RT subvolume write cache before the fsync commit, which is required for correct semantics. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_file.c | 50 ++++++++++++++++++----------- fs/xfs/linux-2.6/xfs_super.c | 75 -------------------------------------------- fs/xfs/xfs_log.c | 11 ++++++- 3 files changed, 41 insertions(+), 95 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index f4213ba..7f782af 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -131,19 +131,34 @@ xfs_file_fsync( { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error = 0; int log_flushed = 0; trace_xfs_file_fsync(ip); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); xfs_iflags_clear(ip, XFS_ITRUNCATED); xfs_ioend_wait(ip); + if (mp->m_flags & XFS_MOUNT_BARRIER) { + /* + * If we have an RT and/or log subvolume we need to make sure + * to flush the write cache the device used for file data + * first. This is to ensure newly written file data make + * it to disk before logging the new inode size in case of + * an extending write. + */ + if (XFS_IS_REALTIME_INODE(ip)) + xfs_blkdev_issue_flush(mp->m_rtdev_targp); + else if (mp->m_logdev_targp != mp->m_ddev_targp) + xfs_blkdev_issue_flush(mp->m_ddev_targp); + } + /* * We always need to make sure that the required inode state is safe on * disk. The inode might be clean but we still might need to force the @@ -175,9 +190,9 @@ xfs_file_fsync( * updates. The sync transaction will also force the log. */ xfs_iunlock(ip, XFS_ILOCK_SHARED); - tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); error = xfs_trans_reserve(tp, 0, - XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); + XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return -error; @@ -209,28 +224,25 @@ xfs_file_fsync( * force the log. */ if (xfs_ipincount(ip)) { - error = _xfs_log_force_lsn(ip->i_mount, + error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, XFS_LOG_SYNC, &log_flushed); } xfs_iunlock(ip, XFS_ILOCK_SHARED); } - if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { - /* - * If the log write didn't issue an ordered tag we need - * to flush the disk cache for the data device now. - */ - if (!log_flushed) - xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp); - - /* - * If this inode is on the RT dev we need to flush that - * cache as well. - */ - if (XFS_IS_REALTIME_INODE(ip)) - xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp); - } + /* + * If we only have a single device, and the log force about was + * a no-op we might have to flush the data device cache here. + * This can only happen for fdatasync/O_DSYNC if we were overwriting + * an already allocated file and thus do not have any metadata to + * commit. + */ + if ((mp->m_flags & XFS_MOUNT_BARRIER) && + mp->m_logdev_targp == mp->m_ddev_targp && + !XFS_IS_REALTIME_INODE(ip) && + !log_flushed) + xfs_blkdev_issue_flush(mp->m_ddev_targp); return -error; } diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 1e3a7ce..a1a881e 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -627,68 +627,6 @@ xfs_blkdev_put( blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } -/* - * Try to write out the superblock using barriers. - */ -STATIC int -xfs_barrier_test( - xfs_mount_t *mp) -{ - xfs_buf_t *sbp = xfs_getsb(mp, 0); - int error; - - XFS_BUF_UNDONE(sbp); - XFS_BUF_UNREAD(sbp); - XFS_BUF_UNDELAYWRITE(sbp); - XFS_BUF_WRITE(sbp); - XFS_BUF_UNASYNC(sbp); - XFS_BUF_ORDERED(sbp); - - xfsbdstrat(mp, sbp); - error = xfs_buf_iowait(sbp); - - /* - * Clear all the flags we set and possible error state in the - * buffer. We only did the write to try out whether barriers - * worked and shouldn't leave any traces in the superblock - * buffer. - */ - XFS_BUF_DONE(sbp); - XFS_BUF_ERROR(sbp, 0); - XFS_BUF_UNORDERED(sbp); - - xfs_buf_relse(sbp); - return error; -} - -STATIC void -xfs_mountfs_check_barriers(xfs_mount_t *mp) -{ - int error; - - if (mp->m_logdev_targp != mp->m_ddev_targp) { - xfs_notice(mp, - "Disabling barriers, not supported with external log device"); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - return; - } - - if (xfs_readonly_buftarg(mp->m_ddev_targp)) { - xfs_notice(mp, - "Disabling barriers, underlying device is readonly"); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - return; - } - - error = xfs_barrier_test(mp); - if (error) { - xfs_notice(mp, - "Disabling barriers, trial barrier write failed"); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - return; - } -} - void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) @@ -1240,14 +1178,6 @@ xfs_fs_remount( switch (token) { case Opt_barrier: mp->m_flags |= XFS_MOUNT_BARRIER; - - /* - * Test if barriers are actually working if we can, - * else delay this check until the filesystem is - * marked writeable. - */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) - xfs_mountfs_check_barriers(mp); break; case Opt_nobarrier: mp->m_flags &= ~XFS_MOUNT_BARRIER; @@ -1282,8 +1212,6 @@ xfs_fs_remount( /* ro -> rw */ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { mp->m_flags &= ~XFS_MOUNT_RDONLY; - if (mp->m_flags & XFS_MOUNT_BARRIER) - xfs_mountfs_check_barriers(mp); /* * If this is the first remount to writeable state we @@ -1465,9 +1393,6 @@ xfs_fs_fill_super( if (error) goto out_free_sb; - if (mp->m_flags & XFS_MOUNT_BARRIER) - xfs_mountfs_check_barriers(mp); - error = xfs_filestream_mount(mp); if (error) goto out_free_sb; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2119302..41d5b8f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1372,8 +1372,17 @@ xlog_sync(xlog_t *log, XFS_BUF_ASYNC(bp); bp->b_flags |= XBF_LOG_BUFFER; - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { + /* + * If we have an external log device, flush the data device + * before flushing the log to make sure all meta data + * written back from the AIL actually made it to disk + * before writing out the new log tail LSN in the log buffer. + */ + if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) + xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); XFS_BUF_ORDERED(bp); + } ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); -- cgit v1.1 From 879669961b11e7f40b518784863a259f735a72bf Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 17 Jun 2011 11:25:59 +0100 Subject: KEYS/DNS: Fix ____call_usermodehelper() to not lose the session keyring ____call_usermodehelper() now erases any credentials set by the subprocess_inf::init() function. The problem is that commit 17f60a7da150 ("capabilites: allow the application of capability limits to usermode helpers") creates and commits new credentials with prepare_kernel_cred() after the call to the init() function. This wipes all keyrings after umh_keys_init() is called. The best way to deal with this is to put the init() call just prior to the commit_creds() call, and pass the cred pointer to init(). That means that umh_keys_init() and suchlike can modify the credentials _before_ they are published and potentially in use by the rest of the system. This prevents request_key() from working as it is prevented from passing the session keyring it set up with the authorisation token to /sbin/request-key, and so the latter can't assume the authority to instantiate the key. This causes the in-kernel DNS resolver to fail with ENOKEY unconditionally. Signed-off-by: David Howells Acked-by: Eric Paris Tested-by: Jeff Layton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 97e0d52..6075a1e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1996,7 +1996,7 @@ static void wait_for_dump_helpers(struct file *file) * is a special value that we use to trap recursive * core dumps */ -static int umh_pipe_setup(struct subprocess_info *info) +static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { struct file *rp, *wp; struct fdtable *fdt; -- cgit v1.1 From 7585717f304f5ed005cc4ad933a69aab3efbd136 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 13 Jun 2011 20:00:16 -0400 Subject: Btrfs: fix relocation races The recent commit to get rid of our trans_mutex introduced some races with block group relocation. The problem is that relocation needs to do some record keeping about each root, and it was relying on the transaction mutex to coordinate things in subtle ways. This fix adds a mutex just for the relocation code and makes sure it doesn't have a big impact on normal operations. The race is really fixed in btrfs_record_root_in_trans, which is where we step back and wait for the relocation code to finish accounting setup. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 14 ++++++++++ fs/btrfs/disk-io.c | 1 + fs/btrfs/relocation.c | 30 ++++++++++++++------- fs/btrfs/transaction.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 105 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8490ee0..a2c91a1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -967,6 +967,12 @@ struct btrfs_fs_info { struct srcu_struct subvol_srcu; spinlock_t trans_lock; + /* + * the reloc mutex goes with the trans lock, it is taken + * during commit to protect us from the relocation code + */ + struct mutex reloc_mutex; + struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; @@ -1172,6 +1178,14 @@ struct btrfs_root { u32 type; u64 highest_objectid; + + /* btrfs_record_root_in_trans is a multi-step process, + * and it can race with the balancing code. But the + * race is very small, and only the first time the root + * is added to each transaction. So in_trans_setup + * is used to tell us when more checks are required + */ + unsigned long in_trans_setup; int ref_cows; int track_dirty; int in_radix; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 20c111b..0b2b4b7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1620,6 +1620,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); + mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); fs_info->tree_root = tree_root; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f25b10a..086b1e6 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1368,7 +1368,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, int ret; if (!root->reloc_root) - return 0; + goto out; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -1390,6 +1390,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, root->fs_info->tree_root, &reloc_root->root_key, root_item); BUG_ON(ret); + +out: return 0; } @@ -2142,10 +2144,11 @@ int prepare_to_merge(struct reloc_control *rc, int err) u64 num_bytes = 0; int ret; - spin_lock(&root->fs_info->trans_lock); + mutex_lock(&root->fs_info->reloc_mutex); rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; rc->merging_rsv_size += rc->nodes_relocated * 2; - spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); + again: if (!err) { num_bytes = rc->merging_rsv_size; @@ -2214,9 +2217,16 @@ int merge_reloc_roots(struct reloc_control *rc) int ret; again: root = rc->extent_root; - spin_lock(&root->fs_info->trans_lock); + + /* + * this serializes us with btrfs_record_root_in_transaction, + * we have to make sure nobody is in the middle of + * adding their roots to the list while we are + * doing this splice + */ + mutex_lock(&root->fs_info->reloc_mutex); list_splice_init(&rc->reloc_roots, &reloc_roots); - spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); while (!list_empty(&reloc_roots)) { found = 1; @@ -3590,17 +3600,19 @@ next: static void set_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - spin_lock(&fs_info->trans_lock); + + mutex_lock(&fs_info->reloc_mutex); fs_info->reloc_ctl = rc; - spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->reloc_mutex); } static void unset_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - spin_lock(&fs_info->trans_lock); + + mutex_lock(&fs_info->reloc_mutex); fs_info->reloc_ctl = NULL; - spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->reloc_mutex); } static int check_extent_flags(u64 flags) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2b3590b..833996a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -126,28 +126,85 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) * to make sure the old root from before we joined the transaction is deleted * when the transaction commits */ -int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, +static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (root->ref_cows && root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); + /* + * see below for in_trans_setup usage rules + * we have the reloc mutex held now, so there + * is only one writer in this function + */ + root->in_trans_setup = 1; + + /* make sure readers find in_trans_setup before + * they find our root->last_trans update + */ + smp_wmb(); + spin_lock(&root->fs_info->fs_roots_radix_lock); if (root->last_trans == trans->transid) { spin_unlock(&root->fs_info->fs_roots_radix_lock); return 0; } - root->last_trans = trans->transid; radix_tree_tag_set(&root->fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); spin_unlock(&root->fs_info->fs_roots_radix_lock); + root->last_trans = trans->transid; + + /* this is pretty tricky. We don't want to + * take the relocation lock in btrfs_record_root_in_trans + * unless we're really doing the first setup for this root in + * this transaction. + * + * Normally we'd use root->last_trans as a flag to decide + * if we want to take the expensive mutex. + * + * But, we have to set root->last_trans before we + * init the relocation root, otherwise, we trip over warnings + * in ctree.c. The solution used here is to flag ourselves + * with root->in_trans_setup. When this is 1, we're still + * fixing up the reloc trees and everyone must wait. + * + * When this is zero, they can trust root->last_trans and fly + * through btrfs_record_root_in_trans without having to take the + * lock. smp_wmb() makes sure that all the writes above are + * done before we pop in the zero below + */ btrfs_init_reloc_root(trans, root); + smp_wmb(); + root->in_trans_setup = 0; } return 0; } + +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!root->ref_cows) + return 0; + + /* + * see record_root_in_trans for comments about in_trans_setup usage + * and barriers + */ + smp_rmb(); + if (root->last_trans == trans->transid && + !root->in_trans_setup) + return 0; + + mutex_lock(&root->fs_info->reloc_mutex); + record_root_in_trans(trans, root); + mutex_unlock(&root->fs_info->reloc_mutex); + + return 0; +} + /* wait for commit against the current transaction to become unblocked * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. @@ -882,7 +939,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent = dget_parent(dentry); parent_inode = parent->d_inode; parent_root = BTRFS_I(parent_inode)->root; - btrfs_record_root_in_trans(trans, parent_root); + record_root_in_trans(trans, parent_root); /* * insert the directory item @@ -900,7 +957,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); - btrfs_record_root_in_trans(trans, root); + record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); @@ -1247,6 +1304,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); + /* + * the reloc mutex makes sure that we stop + * the balancing code from coming in and moving + * extents around in the middle of the commit + */ + mutex_lock(&root->fs_info->reloc_mutex); + ret = create_pending_snapshots(trans, root->fs_info); BUG_ON(ret); @@ -1312,6 +1376,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, root->fs_info->running_transaction = NULL; root->fs_info->trans_no_join = 0; spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); wake_up(&root->fs_info->transaction_wait); -- cgit v1.1 From 3ed4498caf381a73d6259d3ffacc914b17a507ec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 13 Jun 2011 17:54:22 +0000 Subject: btrfs: fix dereference of ERR_PTR value smatch reports: btrfs_recover_log_trees error: 'wc.replay_dest' dereferencing possible ERR_PTR() Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 592396c..4ce8a9f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3177,7 +3177,7 @@ again: tmp_key.offset = (u64)-1; wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); - BUG_ON(!wc.replay_dest); + BUG_ON(IS_ERR_OR_NULL(wc.replay_dest)); wc.replay_dest->log_root = log; btrfs_record_root_in_trans(trans, wc.replay_dest); -- cgit v1.1 From 9fe6a50fb764f508dd2de47a66e62e51388791fb Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Thu, 16 Jun 2011 09:04:57 +0000 Subject: btrfs: Remove unused sysfs code Removes code no longer used. The sysfs file itself is kept, because the btrfs developers expressed interest in putting new entries to sysfs. Signed-off-by: Maarten Lankhorst Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 - fs/btrfs/disk-io.c | 1 - fs/btrfs/sysfs.c | 146 ----------------------------------------------------- 3 files changed, 148 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a2c91a1..8e948ec 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1195,7 +1195,6 @@ struct btrfs_root { struct btrfs_key defrag_max; int defrag_running; char *name; - int in_sysfs; /* the dirty list is only used by non-reference counted roots */ struct list_head dirty_list; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0b2b4b7..c25ef5a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1044,7 +1044,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->last_trans = 0; root->highest_objectid = 0; root->name = NULL; - root->in_sysfs = 0; root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); root->block_rsv = NULL; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c3c223a..daac9ae 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -28,152 +28,6 @@ #include "disk-io.h" #include "transaction.h" -static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_root_used(&root->root_item)); -} - -static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_root_limit(&root->root_item)); -} - -static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf) -{ - - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_bytes_used(&fs->super_copy)); -} - -static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_total_bytes(&fs->super_copy)); -} - -static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_sectorsize(&fs->super_copy)); -} - -/* this is for root attrs (subvols/snapshots) */ -struct btrfs_root_attr { - struct attribute attr; - ssize_t (*show)(struct btrfs_root *, char *); - ssize_t (*store)(struct btrfs_root *, const char *, size_t); -}; - -#define ROOT_ATTR(name, mode, show, store) \ -static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \ - show, store) - -ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL); -ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL); - -static struct attribute *btrfs_root_attrs[] = { - &btrfs_root_attr_blocks_used.attr, - &btrfs_root_attr_block_limit.attr, - NULL, -}; - -/* this is for super attrs (actual full fs) */ -struct btrfs_super_attr { - struct attribute attr; - ssize_t (*show)(struct btrfs_fs_info *, char *); - ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t); -}; - -#define SUPER_ATTR(name, mode, show, store) \ -static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \ - show, store) - -SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL); -SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL); -SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL); - -static struct attribute *btrfs_super_attrs[] = { - &btrfs_super_attr_blocks_used.attr, - &btrfs_super_attr_total_blocks.attr, - &btrfs_super_attr_blocksize.attr, - NULL, -}; - -static ssize_t btrfs_super_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - struct btrfs_super_attr *a = container_of(attr, - struct btrfs_super_attr, - attr); - - return a->show ? a->show(fs, buf) : 0; -} - -static ssize_t btrfs_super_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - struct btrfs_super_attr *a = container_of(attr, - struct btrfs_super_attr, - attr); - - return a->store ? a->store(fs, buf, len) : 0; -} - -static ssize_t btrfs_root_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - struct btrfs_root_attr *a = container_of(attr, - struct btrfs_root_attr, - attr); - - return a->show ? a->show(root, buf) : 0; -} - -static ssize_t btrfs_root_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - struct btrfs_root_attr *a = container_of(attr, - struct btrfs_root_attr, - attr); - return a->store ? a->store(root, buf, len) : 0; -} - -static void btrfs_super_release(struct kobject *kobj) -{ - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - complete(&fs->kobj_unregister); -} - -static void btrfs_root_release(struct kobject *kobj) -{ - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - complete(&root->kobj_unregister); -} - -static const struct sysfs_ops btrfs_super_attr_ops = { - .show = btrfs_super_attr_show, - .store = btrfs_super_attr_store, -}; - -static const struct sysfs_ops btrfs_root_attr_ops = { - .show = btrfs_root_attr_show, - .store = btrfs_root_attr_store, -}; - /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; -- cgit v1.1 From 19fd294957e426bfdd8e19085096467ec18df5c4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 Jun 2011 10:47:30 +0000 Subject: btrfs: fix wrong reservation when doing delayed inode operations We have migrated the space for the delayed inode items from trans_block_rsv to global_block_rsv, but we forgot to set trans->block_rsv to global_block_rsv when we doing delayed inode operations, and the following Oops happened: [ 9792.654889] ------------[ cut here ]------------ [ 9792.654898] WARNING: at fs/btrfs/extent-tree.c:5681 btrfs_alloc_free_block+0xca/0x27c [btrfs]() [ 9792.654899] Hardware name: To Be Filled By O.E.M. [ 9792.654900] Modules linked in: btrfs zlib_deflate libcrc32c ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables arc4 rt61pci rt2x00pci rt2x00lib snd_hda_codec_hdmi mac80211 snd_hda_codec_realtek cfg80211 snd_hda_intel edac_core snd_seq rfkill pcspkr serio_raw snd_hda_codec eeprom_93cx6 edac_mce_amd sp5100_tco i2c_piix4 k10temp snd_hwdep snd_seq_device snd_pcm floppy r8169 xhci_hcd mii snd_timer snd soundcore snd_page_alloc ipv6 firewire_ohci pata_acpi ata_generic firewire_core pata_via crc_itu_t radeon ttm drm_kms_helper drm i2c_algo_bit i2c_core [last unloaded: scsi_wait_scan] [ 9792.654919] Pid: 2762, comm: rm Tainted: G W 2.6.39+ #1 [ 9792.654920] Call Trace: [ 9792.654922] [] warn_slowpath_common+0x83/0x9b [ 9792.654925] [] warn_slowpath_null+0x1a/0x1c [ 9792.654933] [] btrfs_alloc_free_block+0xca/0x27c [btrfs] [ 9792.654945] [] ? map_extent_buffer+0x6e/0xa8 [btrfs] [ 9792.654953] [] __btrfs_cow_block+0xfc/0x30c [btrfs] [ 9792.654963] [] ? btrfs_buffer_uptodate+0x47/0x58 [btrfs] [ 9792.654970] [] ? read_block_for_search+0x94/0x368 [btrfs] [ 9792.654978] [] btrfs_cow_block+0xfe/0x146 [btrfs] [ 9792.654986] [] btrfs_search_slot+0x14d/0x4b6 [btrfs] [ 9792.654997] [] ? map_extent_buffer+0x6e/0xa8 [btrfs] [ 9792.655022] [] btrfs_lookup_inode+0x2f/0x8f [btrfs] [ 9792.655025] [] ? _cond_resched+0xe/0x22 [ 9792.655027] [] ? mutex_lock+0x29/0x50 [ 9792.655039] [] btrfs_update_delayed_inode+0x72/0x137 [btrfs] [ 9792.655051] [] btrfs_run_delayed_items+0x90/0xdb [btrfs] [ 9792.655062] [] btrfs_commit_transaction+0x228/0x654 [btrfs] [ 9792.655064] [] ? remove_wait_queue+0x3a/0x3a [ 9792.655075] [] btrfs_evict_inode+0x14d/0x202 [btrfs] [ 9792.655077] [] evict+0x71/0x111 [ 9792.655079] [] iput+0x12a/0x132 [ 9792.655081] [] do_unlinkat+0x106/0x155 [ 9792.655083] [] ? path_put+0x1f/0x23 [ 9792.655085] [] ? audit_syscall_entry+0x145/0x171 [ 9792.655087] [] ? putname+0x34/0x36 [ 9792.655090] [] sys_unlinkat+0x29/0x2b [ 9792.655092] [] system_call_fastpath+0x16/0x1b [ 9792.655093] ---[ end trace 02b696eb02b3f768 ]--- This patch fix it by setting the reservation of the transaction handle to the correct one. Reported-by: Josef Bacik Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 25 ++++++++++++++++++++----- fs/btrfs/delayed-inode.h | 1 - 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6462c29..fc515b7 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -297,7 +297,6 @@ struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) item->data_len = data_len; item->ins_or_del = 0; item->bytes_reserved = 0; - item->block_rsv = NULL; item->delayed_node = NULL; atomic_set(&item->refs, 1); } @@ -593,10 +592,8 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) { + if (!ret) item->bytes_reserved = num_bytes; - item->block_rsv = dst_rsv; - } return ret; } @@ -604,10 +601,13 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, struct btrfs_delayed_item *item) { + struct btrfs_block_rsv *rsv; + if (!item->bytes_reserved) return; - btrfs_block_rsv_release(root, item->block_rsv, + rsv = &root->fs_info->global_block_rsv; + btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -1014,6 +1014,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; int ret = 0; path = btrfs_alloc_path(); @@ -1021,6 +1022,9 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + delayed_root = btrfs_get_delayed_root(root); curr_node = btrfs_first_delayed_node(delayed_root); @@ -1045,6 +1049,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, } btrfs_free_path(path); + trans->block_rsv = block_rsv; return ret; } @@ -1052,6 +1057,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_node *node) { struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; int ret; path = btrfs_alloc_path(); @@ -1059,6 +1065,9 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; + block_rsv = trans->block_rsv; + trans->block_rsv = &node->root->fs_info->global_block_rsv; + ret = btrfs_insert_delayed_items(trans, path, node->root, node); if (!ret) ret = btrfs_delete_delayed_items(trans, path, node->root, node); @@ -1066,6 +1075,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = btrfs_update_delayed_inode(trans, node->root, path, node); btrfs_free_path(path); + trans->block_rsv = block_rsv; return ret; } @@ -1116,6 +1126,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) struct btrfs_path *path; struct btrfs_delayed_node *delayed_node = NULL; struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; unsigned long nr = 0; int need_requeue = 0; int ret; @@ -1134,6 +1145,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) if (IS_ERR(trans)) goto free_path; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); if (!ret) ret = btrfs_delete_delayed_items(trans, path, root, @@ -1176,6 +1190,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) nr = trans->blocks_used; + trans->block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); __btrfs_btree_balance_dirty(root, nr); free_path: diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index eb7d240..cb79b67 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -75,7 +75,6 @@ struct btrfs_delayed_item { struct list_head tree_list; /* used for batch insert/delete items */ struct list_head readdir_list; /* used for readdir items */ u64 bytes_reserved; - struct btrfs_block_rsv *block_rsv; struct btrfs_delayed_node *delayed_node; atomic_t refs; int ins_or_del; -- cgit v1.1 From 35a30d7ce54e087d8025a725d4e5a2fdee723a9f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 13 Jun 2011 15:18:23 +0000 Subject: btrfs: fix uninitialized return value When allocation fails in btrfs_read_fs_root_no_name, ret is not set although it is returned, holding a garbage value. Signed-off-by: David Sterba Reviewed-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c25ef5a..1ac8db5d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1299,12 +1299,12 @@ again: return root; root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); - if (!root->free_ino_ctl) - goto fail; root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), GFP_NOFS); - if (!root->free_ino_pinned) + if (!root->free_ino_pinned || !root->free_ino_ctl) { + ret = -ENOMEM; goto fail; + } btrfs_init_free_ino_ctl(root); mutex_init(&root->fs_commit_mutex); -- cgit v1.1 From e999376f094162aa425ae749aa1df95ab928d010 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 17 Jun 2011 16:14:09 -0400 Subject: Btrfs: avoid delayed metadata items during commits Snapshot creation has two phases. One is the initial snapshot setup, and the second is done during commit, while nobody is allowed to modify the root we are snapshotting. The delayed metadata insertion code can break that rule, it does a delayed inode update on the inode of the parent of the snapshot, and delayed directory item insertion. This makes sure to run the pending delayed operations before we record the snapshot root, which avoids corruptions. Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 7 +++++++ fs/btrfs/delayed-inode.h | 4 ++++ fs/btrfs/transaction.c | 27 +++++++++++++++++---------- 3 files changed, 28 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index fc515b7..f1cbd02 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1237,6 +1237,13 @@ again: return 0; } +void btrfs_assert_delayed_root_empty(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + delayed_root = btrfs_get_delayed_root(root); + WARN_ON(btrfs_first_delayed_node(delayed_root)); +} + void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index cb79b67..d1a6a29 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -137,4 +137,8 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, /* for init */ int __init btrfs_delayed_inode_init(void); void btrfs_delayed_inode_exit(void); + +/* for debugging */ +void btrfs_assert_delayed_root_empty(struct btrfs_root *root); + #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c073d85..51dcec8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -957,6 +957,15 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); + /* + * pull in the delayed directory update + * and the delayed inode item + * otherwise we corrupt the FS during + * snapshot + */ + ret = btrfs_run_delayed_items(trans, root); + BUG_ON(ret); + record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); @@ -1018,14 +1027,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, int ret; list_for_each_entry(pending, head, list) { - /* - * We must deal with the delayed items before creating - * snapshots, or we will create a snapthot with inconsistent - * information. - */ - ret = btrfs_run_delayed_items(trans, fs_info->fs_root); - BUG_ON(ret); - ret = create_pending_snapshot(trans, fs_info, pending); BUG_ON(ret); } @@ -1319,15 +1320,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ mutex_lock(&root->fs_info->reloc_mutex); - ret = create_pending_snapshots(trans, root->fs_info); + ret = btrfs_run_delayed_items(trans, root); BUG_ON(ret); - ret = btrfs_run_delayed_items(trans, root); + ret = create_pending_snapshots(trans, root->fs_info); BUG_ON(ret); ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); BUG_ON(ret); + /* + * make sure none of the code above managed to slip in a + * delayed item + */ + btrfs_assert_delayed_root_empty(root); + WARN_ON(cur_trans != trans->transaction); btrfs_scrub_pause(root); -- cgit v1.1 From c11760c6d80ab6aa20e383cf378a7287305f591c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Jun 2011 10:30:03 -0700 Subject: isofs: fix bh leak in isofs_fill_super() error case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In isofs_fill_super(), when an iso_primary_descriptor is found, it is kept in pri_bh. The error cases don't properly release it. Fix it. Reported-and-tested-by: 김원석 Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/isofs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 3db5ba4..b3cc858 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -974,7 +974,7 @@ out_no_inode: out_no_read: printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n", __func__, s->s_id, iso_blknum, block); - goto out_freesbi; + goto out_freebh; out_bad_zone_size: printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n", sbi->s_log_zone_size); @@ -989,6 +989,7 @@ out_unknown_format: out_freebh: brelse(bh); + brelse(pri_bh); out_freesbi: kfree(opt.iocharset); kfree(sbi); -- cgit v1.1 From 105f4622104848ff1ee1f644d661bef9dec3eb27 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 7 Jun 2011 11:50:23 -0400 Subject: nfsd4: fix break_lease flags on nfsd open Thanks to Casey Bodley for pointing out that on a read open we pass 0, instead of O_RDONLY, to break_lease, with the result that a read open is treated like a write open for the purposes of lease breaking! Reported-by: Casey Bodley Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index f3fb61b..fd0acca 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -696,7 +696,15 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor } #endif /* CONFIG_NFSD_V3 */ +static int nfsd_open_break_lease(struct inode *inode, int access) +{ + unsigned int mode; + if (access & NFSD_MAY_NOT_BREAK_LEASE) + return 0; + mode = (access & NFSD_MAY_WRITE) ? O_WRONLY : O_RDONLY; + return break_lease(inode, mode | O_NONBLOCK); +} /* * Open an existing file or directory. @@ -744,12 +752,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!inode->i_fop) goto out; - /* - * Check to see if there are any leases on this file. - * This may block while leases are broken. - */ - if (!(access & NFSD_MAY_NOT_BREAK_LEASE)) - host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); + host_err = nfsd_open_break_lease(inode, access); if (host_err) /* NOMEM or WOULDBLOCK */ goto out_nfserr; -- cgit v1.1 From 185bf87393afe6b966881e36c459949d90930a7a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 20 Jun 2011 10:10:24 +0300 Subject: ubifs: dereferencing an ERR_PTR in ubifs_mount() d251ed271d5 "ubifs: fix sget races" left out the goto from this error path so the static checkers complain that we're dereferencing "sb" when it's an ERR_PTR. Signed-off-by: Dan Carpenter Signed-off-by: Al Viro --- fs/ubifs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 8c892c2..529be05 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2146,6 +2146,7 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, if (IS_ERR(sb)) { err = PTR_ERR(sb); kfree(c); + goto out_close; } if (sb->s_root) { -- cgit v1.1 From 1712c20dae7b770b62b2e3272100b3b40af0157c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 19:59:04 -0400 Subject: bad_inode_permission() is safe from RCU mode return -EIO; is *not* a blocking operation, thank you very much. Nick, what the hell have you been smoking? Signed-off-by: Al Viro --- fs/bad_inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 9ad2369..bfcb18f 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -231,9 +231,6 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - return -EIO; } -- cgit v1.1 From ec12781f192568f7ea860f440f890389ba393df7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:03:36 -0400 Subject: cifs_permission() doesn't need to bail out in RCU mode nothing potentially blocking except generic_permission(), which will DTRT Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e9def99..2f0c586 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -257,9 +257,6 @@ static int cifs_permission(struct inode *inode, int mask, unsigned int flags) { struct cifs_sb_info *cifs_sb; - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - cifs_sb = CIFS_SB(inode->i_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { -- cgit v1.1 From 6b419951f1e44c8a46fccfc6551eca9a9980acd6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:11:43 -0400 Subject: coda_ioctl_permission() is safe in RCU mode return (mask & MAY_EXEC) ? -EACCES : 0; is non-blocking... Signed-off-by: Al Viro --- fs/coda/pioctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 6cbb3af..cb140ef 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -43,8 +43,6 @@ const struct file_operations coda_ioctl_operations = { /* the coda pioctl inode ops */ static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; return (mask & MAY_EXEC) ? -EACCES : 0; } -- cgit v1.1 From a63ab94d67879bc0630ea9821c582ddf58ba5527 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:17:22 -0400 Subject: logfs doesn't need ->permission() at all ... and never did, what with its ->permission() being what we do by default when ->permission is NULL... Signed-off-by: Al Viro --- fs/logfs/dir.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 9ed89d1..1afae26 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -555,13 +555,6 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry, return __logfs_create(dir, dentry, inode, target, destlen); } -static int logfs_permission(struct inode *inode, int mask, unsigned int flags) -{ - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - return generic_permission(inode, mask, flags, NULL); -} - static int logfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { @@ -820,7 +813,6 @@ const struct inode_operations logfs_dir_iops = { .mknod = logfs_mknod, .rename = logfs_rename, .rmdir = logfs_rmdir, - .permission = logfs_permission, .symlink = logfs_symlink, .unlink = logfs_unlink, }; -- cgit v1.1 From 730e908f3539066d4aa01f4720ebfc750ce4d045 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:21:44 -0400 Subject: nilfs2_permission() doesn't need to bail out in RCU mode Nothing blocking except for generic_permission(). Which will DTRT. Signed-off-by: Al Viro --- fs/nilfs2/inode.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b954878..b9b45fc 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -801,12 +801,7 @@ out_err: int nilfs_permission(struct inode *inode, int mask, unsigned int flags) { - struct nilfs_root *root; - - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - - root = NILFS_I(inode)->i_root; + struct nilfs_root *root = NILFS_I(inode)->i_root; if ((mask & MAY_WRITE) && root && root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ -- cgit v1.1 From cf1279111686d9742cbc4145bc9d526c83f59fea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:35:23 -0400 Subject: proc_fd_permission() is doesn't need to bail out in RCU mode nothing blocking except generic_permission() Signed-off-by: Al Viro --- fs/proc/base.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 14def99..8a84210 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2169,11 +2169,7 @@ static const struct file_operations proc_fd_operations = { */ static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags) { - int rv; - - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - rv = generic_permission(inode, mask, flags, NULL); + int rv = generic_permission(inode, mask, flags, NULL); if (rv == 0) return 0; if (task_pid(current) == proc_pid(inode)) -- cgit v1.1 From 1d29b5a2ed7eb8862c9b66daf475f3e4c1a40299 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:37:33 -0400 Subject: reiserfs_permission() doesn't need to bail out in RCU mode nothing blocking other than generic_permission() (and check_acl callback does bail out in RCU mode). Signed-off-by: Al Viro --- fs/reiserfs/xattr.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e8a62f4..d780896 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -954,8 +954,6 @@ static int xattr_mount_check(struct super_block *s) int reiserfs_permission(struct inode *inode, int mask, unsigned int flags) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; /* * We don't do permission checks on the internal objects. * Permissions are determined by the "owning" object. -- cgit v1.1 From 1aec7036d0c2996c86ce483ca0a28f3b20807b43 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:42:00 -0400 Subject: proc_sys_permission() is OK in RCU mode nothing blocking there, since all instances of sysctl ->permissions() method are non-blocking - both of them, that is. Signed-off-by: Al Viro --- fs/proc/proc_sysctl.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index f50133c..d167de3 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -304,9 +304,6 @@ static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags) struct ctl_table *table; int error; - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - /* Executable files are not allowed under /proc/sys/ */ if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) return -EACCES; -- cgit v1.1 From 6291176bcd71a2766a19a10cbd9bab07d289e1d7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 19:20:48 -0400 Subject: kill obsolete comment for follow_down() Signed-off-by: Al Viro --- fs/namei.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 9e425e7..975c406 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1011,9 +1011,6 @@ failed: * Follow down to the covering mount currently visible to userspace. At each * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. - * - * Care must be taken as namespace_sem may be held (indicated by mounting_here - * being true). */ int follow_down(struct path *path) { -- cgit v1.1 From 8e833fd2e1f0107ee7a4b6bc4de3c9f0e9b0ed41 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 19 Jun 2011 01:56:53 -0400 Subject: fix comment in generic_permission() CAP_DAC_OVERRIDE is enough for MAY_EXEC on directory, even if no exec bits are set. Signed-off-by: Al Viro --- fs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 975c406..0223c41 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -238,7 +238,8 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags, /* * Read/write DACs are always overridable. - * Executable DACs are overridable if at least one exec bit is set. + * Executable DACs are overridable for all directories and + * for non-directories that have least one exec bit set. */ if (!(mask & MAY_EXEC) || execute_ok(inode)) if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) -- cgit v1.1