From 91435650c233b93e0da389db74f4b2c11c5ad2d4 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 16 Feb 2011 13:10:41 -0500 Subject: Btrfs: put ENOSPC debugging under a mount option ENOSPC in btrfs is getting to the point where the extra debugging isn't required. I've put it under mount -o enospc_debug just in case someone is having difficult problems. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 2 +- fs/btrfs/super.c | 7 ++++++- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7219537..6297701b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1254,6 +1254,7 @@ struct btrfs_root { #define BTRFS_MOUNT_SPACE_CACHE (1 << 12) #define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) +#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a7aaa10..d375fc0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5377,7 +5377,7 @@ again: num_bytes, data, 1); goto again; } - if (ret == -ENOSPC) { + if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, data); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0209b5f..db0a827 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -155,7 +155,8 @@ enum { Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, - Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err, + Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, + Opt_enospc_debug, Opt_err, }; static match_table_t tokens = { @@ -184,6 +185,7 @@ static match_table_t tokens = { {Opt_space_cache, "space_cache"}, {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, + {Opt_enospc_debug, "enospc_debug"}, {Opt_err, NULL}, }; @@ -358,6 +360,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_user_subvol_rm_allowed: btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); break; + case Opt_enospc_debug: + btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); -- cgit v1.1 From c87f08ca44e83b2c8d28f63f9c33f3a270a04bbe Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 16 Feb 2011 13:57:04 -0500 Subject: Btrfs: allow balance to explicitly allocate chunks as it relocates Btrfs device shrinking and balancing ends up reallocating all the blocks in order to allow COW to move them to new destinations. It is somewhat awkward in terms of ENOSPC because most of the enospc code is built around the idea that some operation on a reference counted tree triggers allocations in the non-reference counted trees. This commit changes the balancing code to deal with enospc by trying to allocate a new chunk. If that allocation succeeds, we go ahead and retry whatever failed due to enospc. 
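The retry amounts to a small control-flow pattern: if a pass fails with ENOSPC and forward progress has been made since the last allocation, force one chunk allocation and replay the failed step at most once. A minimal userspace sketch of that flow, with do_pass() and force_alloc() as hypothetical stand-ins for relocate_block_group()'s loop body and btrfs_force_chunk_alloc():

    #include <errno.h>
    #include <stdio.h>

    /* Hypothetical stand-ins: one relocation pass, and the chunk allocator. */
    static int do_pass(int have_space)
    {
        return have_space ? 0 : -ENOSPC;
    }

    static int force_alloc(int *have_space)
    {
        *have_space = 1;              /* pretend a new chunk was allocated */
        return 0;
    }

    int main(void)
    {
        int have_space = 0;
        int progress = 1;             /* work was done since the last allocation */
        int err = do_pass(have_space);

        if (err == -ENOSPC && progress) {
            if (force_alloc(&have_space) == 0) {
                progress = 0;                 /* retry at most once per allocation */
                err = do_pass(have_space);    /* mirrors the 'goto restart' */
            }
        }
        printf("final err = %d\n", err);      /* prints: final err = 0 */
        return err ? 1 : 0;
    }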
Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/extent-tree.c | 7 +++++++ fs/btrfs/relocation.c | 13 ++++++++++++- 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6297701b..28188a7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2219,6 +2219,8 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end); int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes); +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d375fc0..100e409 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8066,6 +8066,13 @@ out: return ret; } +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) +{ + u64 alloc_flags = get_alloc_profile(root, type); + return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); +} + /* * helper to account the unused space of all the readonly block group in the * list. takes mirrors into account. diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0825e4e..31ade58 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3654,6 +3654,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) u32 item_size; int ret; int err = 0; + int progress = 0; path = btrfs_alloc_path(); if (!path) @@ -3666,9 +3667,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } while (1) { + progress++; trans = btrfs_start_transaction(rc->extent_root, 0); BUG_ON(IS_ERR(trans)); - +restart: if (update_backref_cache(trans, &rc->backref_cache)) { btrfs_end_transaction(trans, rc->extent_root); continue; @@ -3781,6 +3783,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } } } + if (trans && progress && err == -ENOSPC) { + ret = btrfs_force_chunk_alloc(trans, rc->extent_root, + rc->block_group->flags); + if (ret == 0) { + err = 0; + progress = 0; + goto restart; + } + } btrfs_release_path(rc->extent_root, path); clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, -- cgit v1.1 From b4dc2b8c694ead005b828f5fb7fa1134db5b6275 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 16 Feb 2011 06:06:34 +0000 Subject: Btrfs: Fix BTRFS_IOC_SUBVOL_SETFLAGS ioctl - Check user-specified flags correctly - Check the inode ownership - Search the root item in the root tree, not the fs tree Reported-by: Dan Rosenberg Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index be2d4f6..5fdb2ab 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1071,12 +1071,15 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; - if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC) + if (flags & BTRFS_SUBVOL_CREATE_ASYNC) return -EINVAL; if (flags & ~BTRFS_SUBVOL_RDONLY) return -EOPNOTSUPP; + if (!is_owner_or_cap(inode)) + return -EACCES; + down_write(&root->fs_info->subvol_sem); /* nothing to do */ @@ -1097,7 +1100,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_reset; } - ret = btrfs_update_root(trans, root, + ret = btrfs_update_root(trans, root->fs_info->tree_root,
&root->root_key, &root->root_item); btrfs_commit_transaction(trans, root); -- cgit v1.1 From ca9b688c1c9a21635cfc8af8b68565b154185196 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 16 Feb 2011 06:06:41 +0000 Subject: Btrfs: Avoid accessing unmapped kernel address When decompressing a chunk of data, we'll copy the data out to a working buffer if the data is stored in more than one page; otherwise we'll use the mapped page directly to avoid a memory copy. In the latter case, there is a corner case where we end up accessing the kernel address after we've unmapped the page. Reported-by: Juan Francisco Cantero Hurtado Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/lzo.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index cc9b450..a178f5e 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -280,6 +280,7 @@ static int lzo_decompress_biovec(struct list_head *ws, unsigned long tot_out; unsigned long tot_len; char *buf; + bool may_late_unmap, need_unmap; data_in = kmap(pages_in[0]); tot_len = read_compress_length(data_in); @@ -300,11 +301,13 @@ static int lzo_decompress_biovec(struct list_head *ws, tot_in += in_len; working_bytes = in_len; + may_late_unmap = need_unmap = false; /* fast path: avoid using the working buffer */ if (in_page_bytes_left >= in_len) { buf = data_in + in_offset; bytes = in_len; + may_late_unmap = true; goto cont; } @@ -329,14 +332,17 @@ cont: if (working_bytes == 0 && tot_in >= tot_len) break; - kunmap(pages_in[page_in_index]); - page_in_index++; - if (page_in_index >= total_pages_in) { + if (page_in_index + 1 >= total_pages_in) { ret = -1; - data_in = NULL; goto done; } - data_in = kmap(pages_in[page_in_index]); + + if (may_late_unmap) + need_unmap = true; + else + kunmap(pages_in[page_in_index]); + + data_in = kmap(pages_in[++page_in_index]); in_page_bytes_left = PAGE_CACHE_SIZE; in_offset = 0; @@ -346,6 +352,8 @@ cont: out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, &out_len); + if (need_unmap) + kunmap(pages_in[page_in_index - 1]); if (ret != LZO_E_OK) { printk(KERN_WARNING "btrfs decompress failed\n"); ret = -1; @@ -363,8 +371,7 @@ cont: break; } done: - if (data_in) - kunmap(pages_in[page_in_index]); + kunmap(pages_in[page_in_index]); return ret; } -- cgit v1.1 From 9b3517e9136824346227b7b04f8f7ea1f3a726cc Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 15 Feb 2011 18:14:25 +0000 Subject: Btrfs: make btrfs_rm_device() fail gracefully If the shrinking done as part of online device removal fails, add the device back to the allocation list and increment the rw_devices counter. This fixes two bugs: 1) we could leave a perfectly good device off the alloc list for no good reason; 2) in a btrfs filesystem consisting of two devices, failure in btrfs_rm_device() could lead to a situation where it was impossible to remove any of the devices because of the "unable to remove the only writeable device" error.
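The fix itself is the usual kernel goto-unwind idiom: the device is pulled off the allocation list early, so every later failure has to branch to a label that re-adds it before the shared cleanup runs. A stripped-down, compilable sketch of that pattern follows; the names are hypothetical stand-ins, not the btrfs code:

    #include <errno.h>

    struct device_state {
        int writeable;
        int on_alloc_list;
    };

    /* Hypothetical failing step standing in for btrfs_shrink_device(). */
    static int shrink(struct device_state *dev)
    {
        (void)dev;                   /* a real implementation would shrink here */
        return -ENOSPC;
    }

    static int rm_device(struct device_state *dev)
    {
        int ret;

        dev->on_alloc_list = 0;      /* removed from the list up front */

        ret = shrink(dev);
        if (ret)
            goto error_undo;         /* previously went straight to cleanup */

        return 0;

    error_undo:
        /* mirror of the new label: put the device back before bailing out */
        if (dev->writeable)
            dev->on_alloc_list = 1;
        return ret;
    }

    int main(void)
    {
        struct device_state dev = { .writeable = 1, .on_alloc_list = 1 };

        rm_device(&dev);
        return dev.on_alloc_list ? 0 : 1;   /* exits 0: the undo ran */
    }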
Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dadaaa8..f31c331 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1337,11 +1337,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ret = btrfs_shrink_device(device, 0); if (ret) - goto error_brelse; + goto error_undo; ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); if (ret) - goto error_brelse; + goto error_undo; device->in_fs_metadata = 0; @@ -1415,6 +1415,13 @@ out: mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); return ret; +error_undo: + if (device->writeable) { + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + root->fs_info->fs_devices->rw_devices++; + } + goto error_brelse; } /* -- cgit v1.1 From fb01aa85b8b29c1a4e1f4a28ea54175de6bf7559 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 15 Feb 2011 18:12:57 +0000 Subject: Btrfs: set FMODE_EXCL in btrfs_device->mode This fixes a bug introduced in d4d77629, where a device added online (and therefore initialized via btrfs_init_new_device()) would be left with a positive bdev->bd_holders count after unmount. Since d4d77629 we no longer OR in FMODE_EXCL explicitly on blkdev_put(), so set it in btrfs_device->mode instead. Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f31c331..94334d9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1639,7 +1639,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; - device->mode = 0; + device->mode = FMODE_EXCL; set_blocksize(device->bdev, 4096); if (seeding_dev) { -- cgit v1.1 From 705773a6656bba66f2a80a44ddaacf9620df8a59 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Feb 2011 14:16:19 +0100 Subject: ocfs2: Fix estimate of necessary credits for mkdir In the rare case that the INLINE_DATA, INDEX_DIR, QUOTA, and XATTR features are disabled and both the allocation of the directory inode and the allocation of the first directory block need to relink the allocation group, there may not be enough credits reserved in the transaction. Fix the estimate. CC: Mark Fasheh Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/journal.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 43e56b9..6180da1 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -405,9 +405,9 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb) ocfs2_quota_trans_credits(sb); } -/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + - * bitmap block for the new bit) dx_root update for free list */ -#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1) +/* data block for new dir/symlink, allocation of directory block, dx_root + * update for free list */ +#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1) static inline int ocfs2_add_dir_index_credits(struct super_block *sb) { -- cgit v1.1 From acf3bb007e5636ef4c17505affb0974175108553 Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Fri, 21 Jan 2011 18:20:18 +0800 Subject: Ocfs2/refcounttree: Fix refcounttree to write back the right number of clusters.
The current refcounttree code didn't actually write the new pages back in write-back mode, due to a bug that always passed ZERO as the number of clusters to 'ocfs2_cow_sync_writeback'; this patch passes the proper count in. Signed-off-by: Tristan Ye Cc: stable@kernel.org Signed-off-by: Joel Becker --- fs/ocfs2/refcounttree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index b5f9160..19ebc5a 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3228,7 +3228,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, u32 num_clusters, unsigned int e_flags) { int ret, delete, index, credits = 0; - u32 new_bit, new_len; + u32 new_bit, new_len, orig_num_clusters; unsigned int set_len; struct ocfs2_super *osb = OCFS2_SB(sb); handle_t *handle; @@ -3261,6 +3261,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, goto out; } + orig_num_clusters = num_clusters; + while (num_clusters) { ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, p_cluster, num_clusters, @@ -3348,7 +3350,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, * in write-back mode. */ if (context->get_clusters == ocfs2_di_get_clusters) { - ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); + ret = ocfs2_cow_sync_writeback(sb, context, cpos, + orig_num_clusters); if (ret) mlog_errno(ret); } -- cgit v1.1 From 52c303c56c3638944b5f733e3961dc58eb8c7270 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 31 Jan 2011 11:31:04 -0800 Subject: ocfs2: Check heartbeat mode for kernel stacks only Commit 2c442719e90a44a6982c033d69df4aae4b167cfa added some checks for proper heartbeat mode when the o2cb stack is running. Unfortunately, it didn't take into account that a userspace stack could be running. Fix this by only doing the check if o2cb is in use. This patch allows userspace stacks to mount the fs again. Cc: stable@kernel.org Signed-off-by: Mark Fasheh Signed-off-by: Joel Becker --- fs/ocfs2/super.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 38f986d..36c423f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1316,7 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb, struct mount_options *mopt, int is_remount) { - int status, user_stack = 0; char *p; u32 tmp; @@ -1459,6 +1459,15 @@ static int ocfs2_parse_options(struct super_block *sb, memcpy(mopt->cluster_stack, args[0].from, OCFS2_STACK_LABEL_LEN); mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + /* + * Open code the memcmp here as we don't have + * an osb to pass to + * ocfs2_userspace_stack().
+ */ + if (memcmp(mopt->cluster_stack, + OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + user_stack = 1; break; case Opt_inode64: mopt->mount_opt |= OCFS2_MOUNT_INODE64; @@ -1514,13 +1523,16 @@ static int ocfs2_parse_options(struct super_block *sb, } } - /* Ensure only one heartbeat mode */ - tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | - OCFS2_MOUNT_HB_NONE); - if (hweight32(tmp) != 1) { - mlog(ML_ERROR, "Invalid heartbeat mount options\n"); - status = 0; - goto bail; + if (user_stack == 0) { + /* Ensure only one heartbeat mode */ + tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | + OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE); + if (hweight32(tmp) != 1) { + mlog(ML_ERROR, "Invalid heartbeat mount options\n"); + status = 0; + goto bail; + } } status = 1; -- cgit v1.1 From ec29ed5b407d618a8128f5942aade9e1758aa14b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 23 Feb 2011 16:23:20 -0500 Subject: Btrfs: fix fiemap bugs with delalloc The Btrfs fiemap code wasn't properly returning delalloc extents, so applications that trust fiemap to decide if there are holes in the file see holes instead of delalloc. This reworks the btrfs fiemap code, adding a get_extent helper that searches for delalloc ranges and also adding a helper for extent_fiemap that skips past holes in the file. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 138 ++++++++++++++++++++++++++++++++++++--------------- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 126 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 224 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e7aeba2..ff45b80 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1433,12 +1433,13 @@ int extent_clear_unlock_delalloc(struct inode *inode, */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - unsigned long bits) + unsigned long bits, int contig) { struct rb_node *node; struct extent_state *state; u64 cur_start = *start; u64 total_bytes = 0; + u64 last = 0; int found = 0; if (search_end <= cur_start) { @@ -1463,7 +1464,9 @@ u64 count_range_bits(struct extent_io_tree *tree, state = rb_entry(node, struct extent_state, rb_node); if (state->start > search_end) break; - if (state->end >= cur_start && (state->state & bits)) { + if (contig && found && state->start > last + 1) + break; + if (state->end >= cur_start && (state->state & bits) == bits) { total_bytes += min(search_end, state->end) + 1 - max(cur_start, state->start); if (total_bytes >= max_bytes) @@ -1472,6 +1475,9 @@ u64 count_range_bits(struct extent_io_tree *tree, *start = state->start; found = 1; } + last = state->end; + } else if (contig && found) { + break; } node = rb_next(node); if (!node) @@ -2912,6 +2918,46 @@ out: return sector; } +/* + * helper function for fiemap, which doesn't want to see any holes. 
+ * This maps until we find something past 'last' + */ +static struct extent_map *get_extent_skip_holes(struct inode *inode, + u64 offset, + u64 last, + get_extent_t *get_extent) +{ + u64 sectorsize = BTRFS_I(inode)->root->sectorsize; + struct extent_map *em; + u64 len; + + if (offset >= last) + return NULL; + + while(1) { + len = last - offset; + if (len == 0) + break; + len = (len + sectorsize - 1) & ~(sectorsize - 1); + em = get_extent(inode, NULL, 0, offset, len, 0); + if (!em || IS_ERR(em)) + return em; + + /* if this isn't a hole return it */ + if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && + em->block_start != EXTENT_MAP_HOLE) { + return em; + } + + /* this is a hole, advance to the next extent */ + offset = extent_map_end(em); + free_extent_map(em); + if (offset >= last) + break; + } + return NULL; +} + int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -2921,16 +2967,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u32 flags = 0; u32 found_type; u64 last; + u64 last_for_get_extent = 0; u64 disko = 0; + u64 isize = i_size_read(inode); struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; struct btrfs_file_extent_item *item; int end = 0; - u64 em_start = 0, em_len = 0; + u64 em_start = 0; + u64 em_len = 0; + u64 em_end = 0; unsigned long emflags; - int hole = 0; if (len == 0) return -EINVAL; @@ -2940,6 +2989,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + /* + * lookup the last file extent. We're not using i_size here + * because there might be preallocation past i_size + */ ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, path, inode->i_ino, -1, 0); if (ret < 0) { @@ -2953,18 +3006,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); - /* No extents, just return */ + /* No extents, but there might be delalloc bits */ if (found_key.objectid != inode->i_ino || found_type != BTRFS_EXTENT_DATA_KEY) { - btrfs_free_path(path); - return 0; + /* have to trust i_size as the end */ + last = (u64)-1; + last_for_get_extent = isize; + } else { + /* + * remember the start of the last extent. There are a + * bunch of different factors that go into the length of the + * extent, so its much less complex to remember where it started + */ + last = found_key.offset; + last_for_get_extent = last + 1; } - last = found_key.offset; btrfs_free_path(path); + /* + * we might have some extents allocated but more delalloc past those + * extents. 
so, we trust isize unless the start of the last extent is + * beyond isize + */ + if (last < isize) { + last = (u64)-1; + last_for_get_extent = isize; + } + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); - em = get_extent(inode, NULL, 0, off, max - off, 0); + + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); if (!em) goto out; if (IS_ERR(em)) { @@ -2973,19 +3046,14 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } while (!end) { - hole = 0; - off = em->start + em->len; + off = extent_map_end(em); if (off >= max) end = 1; - if (em->block_start == EXTENT_MAP_HOLE) { - hole = 1; - goto next; - } - em_start = em->start; em_len = em->len; - + em_end = extent_map_end(em); + emflags = em->flags; disko = 0; flags = 0; @@ -3004,37 +3072,29 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; -next: - emflags = em->flags; free_extent_map(em); em = NULL; - if (!end) { - em = get_extent(inode, NULL, 0, off, max - off, 0); - if (!em) - goto out; - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - emflags = em->flags; - } - - if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { + if ((em_start >= last) || em_len == (u64)-1 || + (last == (u64)-1 && isize <= em_end)) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - if (em_start == last) { + /* now scan forward to see if this is really the last extent. */ + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + if (!em) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - - if (!hole) { - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); - if (ret) - goto out_free; - } + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; } out_free: free_extent_map(em); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7083cfa..9318dfe 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -191,7 +191,7 @@ void extent_io_exit(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, unsigned long bits); + u64 max_bytes, unsigned long bits, int contig); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8d392ed..44b9266 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1913,7 +1913,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start) private = 0; if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY)) { + (u64)-1, 1, EXTENT_DIRTY, 0)) { ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, &private_failure); if (ret == 0) { @@ -5282,6 +5282,128 @@ out: return em; } +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + struct extent_map *em; + struct extent_map *hole_em = NULL; + u64 range_start = start; + u64 end; + u64 found; + u64 found_end; + int err = 0; + + em = btrfs_get_extent(inode, page, pg_offset, start, len, create); + if (IS_ERR(em)) + return em; + if (em) { + /* + * if our em maps to a hole, there might + * actually be delalloc bytes behind it + */ + if (em->block_start != EXTENT_MAP_HOLE) + return em; + else + hole_em = em; + } + + /* check to see if we've wrapped (len 
== -1 or similar) */ + end = start + len; + if (end < start) + end = (u64)-1; + else + end -= 1; + + em = NULL; + + /* ok, we didn't find anything, lets look for delalloc */ + found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, + end, len, EXTENT_DELALLOC, 1); + found_end = range_start + found; + if (found_end < range_start) + found_end = (u64)-1; + + /* + * we didn't find anything useful, return + * the original results from get_extent() + */ + if (range_start > end || found_end <= start) { + em = hole_em; + hole_em = NULL; + goto out; + } + + /* adjust the range_start to make sure it doesn't + * go backwards from the start they passed in + */ + range_start = max(start,range_start); + found = found_end - range_start; + + if (found > 0) { + u64 hole_start = start; + u64 hole_len = len; + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + err = -ENOMEM; + goto out; + } + /* + * when btrfs_get_extent can't find anything it + * returns one huge hole + * + * make sure what it found really fits our range, and + * adjust to make sure it is based on the start from + * the caller + */ + if (hole_em) { + u64 calc_end = extent_map_end(hole_em); + + if (calc_end <= start || (hole_em->start > end)) { + free_extent_map(hole_em); + hole_em = NULL; + } else { + hole_start = max(hole_em->start, start); + hole_len = calc_end - hole_start; + } + } + em->bdev = NULL; + if (hole_em && range_start > hole_start) { + /* our hole starts before our delalloc, so we + * have to return just the parts of the hole + * that go until the delalloc starts + */ + em->len = min(hole_len, + range_start - hole_start); + em->start = hole_start; + em->orig_start = hole_start; + /* + * don't adjust block start at all, + * it is fixed at EXTENT_MAP_HOLE + */ + em->block_start = hole_em->block_start; + em->block_len = hole_len; + } else { + em->start = range_start; + em->len = found; + em->orig_start = range_start; + em->block_start = EXTENT_MAP_DELALLOC; + em->block_len = found; + } + } else if (hole_em) { + return hole_em; + } +out: + + free_extent_map(hole_em); + if (err) { + free_extent_map(em); + return ERR_PTR(err); + } + return em; +} + static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 start, u64 len) { @@ -6104,7 +6226,7 @@ out: static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { - return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); } int btrfs_readpage(struct file *file, struct page *page) -- cgit v1.1 From 93b270f76e7ef3b81001576860c2701931cdc78b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 24 Feb 2011 17:25:47 +1100 Subject: Fix over-zealous flush_disk when changing device size. There are two cases when we call flush_disk. In one, the device has disappeared (check_disk_change) so any data we hold becomes irrelevant. In the other, the device has changed size (check_disk_size_change) so data we hold may be irrelevant. In both cases it makes sense to discard any 'clean' buffers, so they will be read back from the device if needed. In the former case it makes sense to discard 'dirty' buffers as there will never be anywhere safe to write the data. In the second case it *does*not* make sense to discard dirty buffers as that will lead to file system corruption when you simply enlarge the containing device. flush_disk calls __invalidate_device. __invalidate_device calls both invalidate_inodes and invalidate_bdev.
invalidate_inodes *does* discard I_DIRTY inodes and this does lead to fs corruption. invalidate_bdev *does*not* discard dirty pages, but I don't really care about that at present. So this patch adds a flag to __invalidate_device (calling it __invalidate_device2) to indicate whether dirty buffers should be killed, and this is passed to invalidate_inodes which can choose to skip dirty inodes. flush_disk then passes true from check_disk_change and false from check_disk_size_change. dm avoids tripping over this problem by calling i_size_write directly rather than using check_disk_size_change. md does use check_disk_size_change and so is affected. This regression was introduced by commit 608aeef17a which causes check_disk_size_change to call flush_disk, so it is suitable for any kernel since 2.6.27. Cc: stable@kernel.org Acked-by: Jeff Moyer Cc: Andrew Patterson Cc: Jens Axboe Signed-off-by: NeilBrown --- fs/block_dev.c | 12 ++++++------ fs/inode.c | 9 ++++++++- fs/internal.h | 2 +- 3 files changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 333a7bb..5e23152 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -927,9 +927,9 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); * when a disk has been changed -- either by a media change or online * resize. */ -static void flush_disk(struct block_device *bdev) +static void flush_disk(struct block_device *bdev, bool kill_dirty) { - if (__invalidate_device(bdev)) { + if (__invalidate_device(bdev, kill_dirty)) { char name[BDEVNAME_SIZE] = ""; if (bdev->bd_disk) @@ -966,7 +966,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) "%s: detected capacity change from %lld to %lld\n", name, bdev_size, disk_size); i_size_write(bdev->bd_inode, disk_size); - flush_disk(bdev); + flush_disk(bdev, false); } } EXPORT_SYMBOL(check_disk_size_change); @@ -1019,7 +1019,7 @@ int check_disk_change(struct block_device *bdev) if (!(events & DISK_EVENT_MEDIA_CHANGE)) return 0; - flush_disk(bdev); + flush_disk(bdev, true); if (bdops->revalidate_disk) bdops->revalidate_disk(bdev->bd_disk); return 1; @@ -1601,7 +1601,7 @@ fail: } EXPORT_SYMBOL(lookup_bdev); -int __invalidate_device(struct block_device *bdev) +int __invalidate_device(struct block_device *bdev, bool kill_dirty) { struct super_block *sb = get_super(bdev); int res = 0; @@ -1614,7 +1614,7 @@ int __invalidate_device(struct block_device *bdev) * hold). */ shrink_dcache_sb(sb); - res = invalidate_inodes(sb); + res = invalidate_inodes(sb, kill_dirty); drop_super(sb); } invalidate_bdev(bdev); diff --git a/fs/inode.c b/fs/inode.c index da85e56..c50d7fe 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -540,11 +540,14 @@ void evict_inodes(struct super_block *sb) /** * invalidate_inodes - attempt to free all inodes on a superblock * @sb: superblock to operate on + * @kill_dirty: flag to guide handling of dirty inodes * * Attempts to free all inodes for a given superblock. If there were any * busy inodes return a non-zero value, else zero. + * If @kill_dirty is set, discard dirty inodes too, otherwise treat + * them as busy.
*/ -int invalidate_inodes(struct super_block *sb) +int invalidate_inodes(struct super_block *sb, bool kill_dirty) { int busy = 0; struct inode *inode, *next; @@ -556,6 +559,10 @@ int invalidate_inodes(struct super_block *sb) list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) continue; + if (inode->i_state & I_DIRTY && !kill_dirty) { + busy = 1; + continue; + } if (atomic_read(&inode->i_count)) { busy = 1; continue; diff --git a/fs/internal.h b/fs/internal.h index 0663568..9b976b5 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -112,4 +112,4 @@ extern void release_open_intent(struct nameidata *); */ extern int get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); -extern int invalidate_inodes(struct super_block *); +extern int invalidate_inodes(struct super_block *, bool); -- cgit v1.1 From bf9faa2aa30e2ebf30287536712ed2717bb47002 Mon Sep 17 00:00:00 2001 From: "J. R. Okajima" Date: Wed, 23 Feb 2011 16:59:49 +0900 Subject: Unlock vfsmount_lock in do_umount Commit b3e19d9 ("fs: scale mntget/mntput", 2011-01-07) introduced vfsmount_lock around the testing of mnt_count. Fix the mis-typed 'unlock'. Signed-off-by: J. R. Okajima Acked-by: Al Viro Signed-off-by: Al Viro --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 7b0b953..d1edf26 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1244,7 +1244,7 @@ static int do_umount(struct vfsmount *mnt, int flags) */ br_write_lock(vfsmount_lock); if (mnt_get_count(mnt) != 2) { - br_write_lock(vfsmount_lock); + br_write_unlock(vfsmount_lock); return -EBUSY; } br_write_unlock(vfsmount_lock); -- cgit v1.1 From e7407d1619713f4b1fdff3a485e1bd8e77bd480d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Feb 2011 09:56:32 +0100 Subject: block: bd_link_disk_holder() should hold on to holder_dir The new implementation of bd_link_disk_holder() added by 49731baa41d (block: restore multiple bd_link_disk_holder() support) didn't get an extra reference for the holder_dir kobject of the slave bdev; however, bdev kills holder_dir on removal, not release, so if the slave bdev is removed while there are holder links, the holder_dir will be destroyed while there still are holder links, which leads to an oops later when bd_unlink_disk_holder() tries to remove those links. Make bd_link_disk_holder() grab an extra reference for the slave's holder_dir and put it in bd_unlink_disk_holder(). Signed-off-by: Tejun Heo Reported-by: "Hawrylewicz Czarnowski, Przemyslaw" Tested-by: "Hawrylewicz Czarnowski, Przemyslaw" Cc: Neil Brown Cc: Jens Axboe Signed-off-by: Linus Torvalds --- fs/block_dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 4fb8a34..94d41db 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -873,6 +873,11 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); if (ret) goto out_del; + /* + * bdev could be deleted beneath us which would implicitly destroy + * the holder directory. Hold on to it.
+ */ + kobject_get(bdev->bd_part->holder_dir); list_add(&holder->list, &bdev->bd_holder_disks); goto out_unlock; @@ -909,6 +914,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + kobject_put(bdev->bd_part->holder_dir); list_del_init(&holder->list); kfree(holder); } -- cgit v1.1 From 5a18ec176c934ca1bc9dc61580a5e0e90a9b5733 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 25 Feb 2011 14:44:58 +0100 Subject: fuse: fix hang of single threaded fuseblk filesystem Single threaded NTFS-3G could get stuck if a delayed RELEASE reply triggered a DESTROY request via path_put(). Fix this by a) making RELEASE requests synchronous, whenever possible, on fuseblk filesystems b) if not possible (triggered by an asynchronous read/write) then do the path_put() in a separate thread with schedule_work(). Reported-by: Oliver Neukum Cc: stable@kernel.org Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 52 +++++++++++++++++++++++++++++++++++++++++++++------- fs/fuse/fuse_i.h | 6 +++++- 2 files changed, 50 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 95da1bc..9e0832d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -86,18 +86,52 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } +static void fuse_release_async(struct work_struct *work) +{ + struct fuse_req *req; + struct fuse_conn *fc; + struct path path; + + req = container_of(work, struct fuse_req, misc.release.work); + path = req->misc.release.path; + fc = get_fuse_conn(path.dentry->d_inode); + + fuse_put_request(fc, req); + path_put(&path); +} + static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) { - path_put(&req->misc.release.path); + if (fc->destroy_req) { + /* + * If this is a fuseblk mount, then it's possible that + * releasing the path will result in releasing the + * super block and sending the DESTROY request. If + * the server is single threaded, this would hang. + * For this reason do the path_put() in a separate + * thread. + */ + atomic_inc(&req->count); + INIT_WORK(&req->misc.release.work, fuse_release_async); + schedule_work(&req->misc.release.work); + } else { + path_put(&req->misc.release.path); + } } -static void fuse_file_put(struct fuse_file *ff) +static void fuse_file_put(struct fuse_file *ff, bool sync) { if (atomic_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - req->end = fuse_release_end; - fuse_request_send_background(ff->fc, req); + if (sync) { + fuse_request_send(ff->fc, req); + path_put(&req->misc.release.path); + fuse_put_request(ff->fc, req); + } else { + req->end = fuse_release_end; + fuse_request_send_background(ff->fc, req); + } kfree(ff); } } @@ -219,8 +253,12 @@ void fuse_release_common(struct file *file, int opcode) * Normally this will send the RELEASE request, however if * some asynchronous READ or WRITE requests are outstanding, * the sending will be delayed. + * + * Make the release synchronous if this is a fuseblk mount, + * synchronous RELEASE is allowed (and desirable) in this case + * because the server can be trusted not to screw up. 
*/ - fuse_file_put(ff); + fuse_file_put(ff, ff->fc->destroy_req != NULL); } static int fuse_open(struct inode *inode, struct file *file) @@ -558,7 +596,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) page_cache_release(page); } if (req->ff) - fuse_file_put(req->ff); + fuse_file_put(req->ff, false); } static void fuse_send_readpages(struct fuse_req *req, struct file *file) @@ -1137,7 +1175,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) { __free_page(req->pages[0]); - fuse_file_put(req->ff); + fuse_file_put(req->ff, false); } static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ae5744a..d428694 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -21,6 +21,7 @@ #include #include #include +#include /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -262,7 +263,10 @@ struct fuse_req { /** Data for asynchronous requests */ union { struct { - struct fuse_release_in in; + union { + struct fuse_release_in in; + struct work_struct work; + }; struct path path; } release; struct fuse_init_in init_in; -- cgit v1.1 From 8d56addd70c7c0626502569e22cc8fce49ae39f5 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 25 Feb 2011 14:44:58 +0100 Subject: fuse: fix truncate after open Commit e1181ee6 "vfs: pass struct file to do_truncate on O_TRUNC opens" broke the behavior of open(O_TRUNC|O_RDONLY) in fuse. Fuse assumed that when called from open, a truncate() will be done, not an ftruncate(). Fix by restoring the old behavior, based on the ATTR_OPEN flag. Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bfed844..83543b5 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1283,8 +1283,11 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, if (err) return err; - if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc) - return 0; + if (attr->ia_valid & ATTR_OPEN) { + if (fc->atomic_o_trunc) + return 0; + file = NULL; + } if (attr->ia_valid & ATTR_SIZE) is_truncate = true; -- cgit v1.1 From f129ccc9231c95513a1227ca9da876beeb03e577 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 25 Feb 2011 15:33:02 +0000 Subject: afs: Fix oops in afs_unlink_writeback I'm seeing the following oops when testing afs: Unable to handle kernel paging request for data at address 0x00000008 ... NIP [c0000000003393b0] .afs_unlink_writeback+0x38/0xc0 LR [c00000000033987c] .afs_put_writeback+0x98/0xec Call Trace: [c00000000345f600] [c00000000033987c] .afs_put_writeback+0x98/0xec [c00000000345f690] [c00000000033ae80] .afs_write_begin+0x6a4/0x75c [c00000000345f790] [c00000000012b77c] .generic_file_buffered_write+0x148/0x320 [c00000000345f8d0] [c00000000012e1b8] .__generic_file_aio_write+0x37c/0x3e4 [c00000000345f9d0] [c00000000012e2a8] .generic_file_aio_write+0x88/0xfc [c00000000345fa90] [c0000000003390a8] .afs_file_write+0x10c/0x178 [c00000000345fb40] [c000000000188788] .do_sync_write+0xc4/0x128 [c00000000345fcc0] [c000000000189658] .vfs_write+0xe8/0x1d8 [c00000000345fd70] [c000000000189884] .SyS_write+0x68/0xb0 [c00000000345fe30] [c000000000008564] syscall_exit+0x0/0x40 afs_write_begin hits an error and calls afs_unlink_writeback. In there we do list_del_init on an uninitialised list. 
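The faulting address in the oops is itself the diagnosis: a zeroed but never-initialised list_head has NULL next/prev pointers, and list_del_init() writes through both. A small userspace sketch of the pointer arithmetic, using a minimal copy of list_head rather than the kernel header:

    #include <stddef.h>
    #include <stdio.h>

    /* Minimal copy of the kernel's list_head layout, for illustration only. */
    struct list_head {
        struct list_head *next, *prev;
    };

    int main(void)
    {
        /* A kzalloc'ed afs_writeback leaves ->link.next == ->link.prev == NULL.
         * list_del() then executes:
         *     entry->next->prev = entry->prev;
         *     entry->prev->next = entry->next;
         * With next == NULL, the first store lands at NULL plus the offset
         * of 'prev', i.e. 0x8 on a 64-bit kernel, matching the faulting
         * address 0x00000008 in the oops above. */
        printf("offsetof(struct list_head, prev) = %zu\n",
               offsetof(struct list_head, prev));
        return 0;
    }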
The patch below initialises ->link when creating the afs_writeback struct. Signed-off-by: Anton Blanchard Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/write.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/afs/write.c b/fs/afs/write.c index 15690bb..789b3af 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -140,6 +140,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, candidate->first = candidate->last = index; candidate->offset_first = from; candidate->to_last = to; + INIT_LIST_HEAD(&candidate->link); candidate->usage = 1; candidate->state = AFS_WBACK_PENDING; init_waitqueue_head(&candidate->waitq); -- cgit v1.1 From 22bacca48a1755f79b7e0f192ddb9fbb7fc6e64e Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Fri, 25 Feb 2011 14:44:12 -0800 Subject: epoll: prevent creating circular epoll structures In several places, an epoll fd can call another file's ->f_op->poll() method with ep->mtx held. This is in general unsafe, because that other file could itself be an epoll fd that contains the original epoll fd. The code defends against this possibility in its own ->poll() method using ep_call_nested, but there are several other unsafe calls to ->poll elsewhere that can be made to deadlock. For example, the following simple program causes the call in ep_insert to recursively call the original fd's ->poll, leading to deadlock:

 #include <unistd.h>
 #include <sys/epoll.h>

 int main(void) {
     int e1, e2, p[2];
     struct epoll_event evt = {
         .events = EPOLLIN
     };

     e1 = epoll_create(1);
     e2 = epoll_create(2);
     pipe(p);

     epoll_ctl(e2, EPOLL_CTL_ADD, e1, &evt);
     epoll_ctl(e1, EPOLL_CTL_ADD, p[0], &evt);
     write(p[1], p, sizeof p);
     epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt);

     return 0;
 }

On insertion, check whether the inserted file is itself a struct epoll, and if so, do a recursive walk to detect whether inserting this file would create a loop of epoll structures, which could lead to deadlock. [nelhage@ksplice.com: Use epmutex to serialize concurrent inserts] Signed-off-by: Davide Libenzi Signed-off-by: Nelson Elhage Reported-by: Nelson Elhage Tested-by: Nelson Elhage Cc: [2.6.34+, possibly earlier] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 267d0ad..4a09af9 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -63,6 +63,13 @@ * cleanup path and it is also acquired by eventpoll_release_file() * if a file has been pushed inside an epoll set and it is then * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL). + * It is also acquired when inserting an epoll fd onto another epoll + * fd. We do this so that we walk the epoll tree and ensure that this + * insertion does not create a cycle of epoll file descriptors, which + * could lead to deadlock. We need a global mutex to prevent two + * simultaneous inserts (A into B and B into A) from racing and + * constructing a cycle without either insert observing that it is + * going to. * It is possible to drop the "ep->mtx" and to use the global * mutex "epmutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable.
@@ -224,6 +231,9 @@ static long max_user_watches __read_mostly; */ static DEFINE_MUTEX(epmutex); +/* Used to check for epoll file descriptor inclusion loops */ +static struct nested_calls poll_loop_ncalls; + /* Used for safe wake up implementation */ static struct nested_calls poll_safewake_ncalls; @@ -1198,6 +1208,62 @@ retry: return res; } +/** + * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested() + * API, to verify that adding an epoll file inside another + * epoll structure, does not violate the constraints, in + * terms of closed loops, or too deep chains (which can + * result in excessive stack usage). + * + * @priv: Pointer to the epoll file to be currently checked. + * @cookie: Original cookie for this call. This is the top-of-the-chain epoll + * data structure pointer. + * @call_nests: Current dept of the @ep_call_nested() call stack. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct eventpoll *ep = file->private_data; + struct rb_node *rbp; + struct epitem *epi; + + mutex_lock(&ep->mtx); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + if (unlikely(is_file_epoll(epi->ffd.file))) { + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, epi->ffd.file, + epi->ffd.file->private_data, current); + if (error != 0) + break; + } + } + mutex_unlock(&ep->mtx); + + return error; +} + +/** + * ep_loop_check - Performs a check to verify that adding an epoll file (@file) + * another epoll file (represented by @ep) does not create + * closed loops or too deep chains. + * + * @ep: Pointer to the epoll private data structure. + * @file: Pointer to the epoll file to be checked. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check(struct eventpoll *ep, struct file *file) +{ + return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, file, ep, current); +} + /* * Open an eventpoll file descriptor. */ @@ -1246,6 +1312,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { int error; + int did_lock_epmutex = 0; struct file *file, *tfile; struct eventpoll *ep; struct epitem *epi; @@ -1287,6 +1354,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, */ ep = file->private_data; + /* + * When we insert an epoll file descriptor, inside another epoll file + * descriptor, there is the change of creating closed loops, which are + * better be handled here, than in more critical paths. + * + * We hold epmutex across the loop check and the insert in this case, in + * order to prevent two separate inserts from racing and each doing the + * insert "at the same time" such that ep_loop_check passes on both + * before either one does the insert, thereby creating a cycle. 
+ */ + if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { + mutex_lock(&epmutex); + did_lock_epmutex = 1; + error = -ELOOP; + if (ep_loop_check(ep, tfile) != 0) + goto error_tgt_fput; + } + + mutex_lock(&ep->mtx); /* @@ -1322,6 +1408,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, mutex_unlock(&ep->mtx); error_tgt_fput: + if (unlikely(did_lock_epmutex)) + mutex_unlock(&epmutex); + fput(tfile); error_fput: fput(file); @@ -1441,6 +1530,12 @@ static int __init eventpoll_init(void) EP_ITEM_COST; BUG_ON(max_user_watches < 0); + /* + * Initialize the structure used to perform epoll file descriptor + * inclusion loops checks. + */ + ep_nested_calls_init(&poll_loop_ncalls); + /* Initialize the structure used to perform safe poll wait head wake ups */ ep_nested_calls_init(&poll_safewake_ncalls); -- cgit v1.1 From 294f6cf48666825d23c9372ef37631232746e40d Mon Sep 17 00:00:00 2001 From: Timo Warns Date: Fri, 25 Feb 2011 14:44:21 -0800 Subject: ldm: corrupted partition table can cause kernel oops The kernel automatically evaluates partition tables of storage devices. The code for evaluating LDM partitions (in fs/partitions/ldm.c) contains a bug that causes a kernel oops on certain corrupted LDM partitions. A kernel subsystem seems to crash, because, after the oops, the kernel no longer recognizes newly connected storage devices. The patch changes ldm_parse_vmdb() to Validate the value of vblk_size. Signed-off-by: Timo Warns Cc: Eugene Teo Acked-by: Richard Russon Cc: Harvey Harrison Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/partitions/ldm.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 789c625..b10e354 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -251,6 +251,11 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) } vm->vblk_size = get_unaligned_be32(data + 0x08); + if (vm->vblk_size == 0) { + ldm_error ("Illegal VBLK size"); + return false; + } + vm->vblk_offset = get_unaligned_be32(data + 0x0C); vm->last_vblk_seq = get_unaligned_be32(data + 0x04); -- cgit v1.1 From 3bd9a5d734c7cc7533b27abf451416c7f50095a7 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Feb 2011 14:44:26 -0800 Subject: aio: fix rcu ioctx lookup aio-dio-invalidate-failure GPFs in aio_put_req from io_submit. lookup_ioctx doesn't implement the rcu lookup pattern properly. rcu_read_lock does not prevent refcount going to zero, so we might take a refcount on a zero count ioctx. Fix the bug by atomically testing for zero refcount before incrementing. 
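The fix follows the standard RCU lookup discipline: a reader may only take a reference through an atomic increment-unless-zero, because a count that has reached zero means the object is already being freed. A compilable userspace sketch of that primitive, using C11 atomics as a stand-in for the kernel's atomic_inc_not_zero():

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct kioctx {
        atomic_int users;
    };

    /* Userspace analogue of atomic_inc_not_zero(): take a reference only
     * while the count is still positive; never resurrect a dead object. */
    static bool try_get_ioctx(struct kioctx *ctx)
    {
        int old = atomic_load(&ctx->users);

        while (old != 0) {
            if (atomic_compare_exchange_weak(&ctx->users, &old, old + 1))
                return true;        /* reference taken */
        }
        return false;               /* already at zero: lookup must skip it */
    }

    int main(void)
    {
        struct kioctx live = { 1 }, dying = { 0 };

        printf("live:  %d\n", try_get_ioctx(&live));    /* prints 1 */
        printf("dying: %d\n", try_get_ioctx(&dying));   /* prints 0 */
        return 0;
    }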
[jack@suse.cz: added comment into the code] Reviewed-by: Jeff Moyer Signed-off-by: Nick Piggin Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index fc557a3..b4dd668 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -239,15 +239,23 @@ static void __put_ioctx(struct kioctx *ctx) call_rcu(&ctx->rcu_head, ctx_rcu_free); } -#define get_ioctx(kioctx) do { \ - BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ - atomic_inc(&(kioctx)->users); \ -} while (0) -#define put_ioctx(kioctx) do { \ - BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ - if (unlikely(atomic_dec_and_test(&(kioctx)->users))) \ - __put_ioctx(kioctx); \ -} while (0) +static inline void get_ioctx(struct kioctx *kioctx) +{ + BUG_ON(atomic_read(&kioctx->users) <= 0); + atomic_inc(&kioctx->users); +} + +static inline int try_get_ioctx(struct kioctx *kioctx) +{ + return atomic_inc_not_zero(&kioctx->users); +} + +static inline void put_ioctx(struct kioctx *kioctx) +{ + BUG_ON(atomic_read(&kioctx->users) <= 0); + if (unlikely(atomic_dec_and_test(&kioctx->users))) + __put_ioctx(kioctx); +} /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. @@ -601,8 +609,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) rcu_read_lock(); hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { - if (ctx->user_id == ctx_id && !ctx->dead) { - get_ioctx(ctx); + /* + * RCU protects us against accessing freed memory but + * we have to be careful not to get a reference when the + * reference count already dropped to 0 (ctx->dead test + * is unreliable because of races). + */ + if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){ ret = ctx; break; } -- cgit v1.1 From 7137c6bd455234bcb7560fd829e6ee49cae5fed6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 25 Feb 2011 14:44:27 -0800 Subject: aio: fix race between io_destroy() and io_submit() A race can occur when io_submit() races with io_destroy():

 CPU1					CPU2
 io_submit()
   do_io_submit()
     ...
     ctx = lookup_ioctx(ctx_id);
					io_destroy()
 Now do_io_submit() holds the last reference to ctx.
     ...
     queue new AIO
     put_ioctx(ctx) - frees ctx with active AIOs

We solve this issue by checking whether ctx is being destroyed in the AIO submission path after adding the new AIO to ctx. Then we are guaranteed that either io_destroy() waits for the new AIO or we see that ctx is being destroyed and bail out. Cc: Nick Piggin Reviewed-by: Jeff Moyer Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index b4dd668..26869cd 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1642,6 +1642,23 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, goto out_put_req; spin_lock_irq(&ctx->ctx_lock); + /* + * We could have raced with io_destroy() and are currently holding a + * reference to ctx which should be destroyed. We cannot submit IO + * since ctx gets freed as soon as io_submit() puts its reference. The + * check here is reliable: io_destroy() sets ctx->dead before waiting + * for outstanding IO and the barrier between these two is realized by + * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we + * increment ctx->reqs_active before checking for ctx->dead and the + * barrier is realized by unlock and lock of ctx->ctx_lock.
Thus if we + * don't see ctx->dead set here, io_destroy() waits for our IO to + * finish. + */ + if (ctx->dead) { + spin_unlock_irq(&ctx->ctx_lock); + ret = -EINVAL; + goto out_put_req; + } aio_run_iocb(req); if (!list_empty(&ctx->run_list)) { /* drain the run list */ -- cgit v1.1 From e6eb5ce1b202ac9cdcfda5be559c9b9d8ec7542c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 26 Feb 2011 10:54:00 -0800 Subject: fs/block_dev.c: fix new kernel-doc warning Fix new kernel-doc warning in fs/block_dev.c: Warning(fs/block_dev.c:937): No description found for parameter 'kill_dirty' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- fs/block_dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index f05bf16..8892870 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -928,6 +928,7 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); * flush_disk - invalidates all buffer-cache entries on a disk * * @bdev: struct block device to be flushed + * @kill_dirty: flag to guide handling of dirty inodes * * Invalidates all buffer-cache entries on a disk. It should be called * when a disk has been changed -- either by a media change or online -- cgit v1.1
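As background for that last fix: scripts/kernel-doc warns whenever a /** comment's @parameter lines fall out of sync with the function signature, which is exactly what adding kill_dirty without documenting it did. A minimal, hypothetical example of the shape it expects (frob_widget is invented for illustration):

    struct widget {
        int level;
    };

    /**
     * frob_widget - adjust a widget's frobnication level
     * @w: widget to adjust
     * @level: new level; omitting this line reproduces the warning
     *
     * Return: 0 on success.
     */
    static int frob_widget(struct widget *w, int level)
    {
        w->level = level;
        return 0;
    }

    int main(void)
    {
        struct widget w;

        return frob_widget(&w, 1);
    }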