From 13db62b7a1e8c64763a93c155091620f85ff8920 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 19:56:13 +0200 Subject: btrfs scrub: added unverified_errors In normal operation, scrub is reading data sequentially in large portions. In case of an i/o error, we try to find the corrupted area(s) by issuing page sized read requests. With this commit we increment the unverified_errors counter if all of the small size requests succeed. Userland patches carrying such conspicous events to the administrator should already be around. Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5..35099fa 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -201,18 +201,25 @@ nomem: * recheck_error gets called for every page in the bio, even though only * one may be bad */ -static void scrub_recheck_error(struct scrub_bio *sbio, int ix) +static int scrub_recheck_error(struct scrub_bio *sbio, int ix) { + struct scrub_dev *sdev = sbio->sdev; + u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + if (sbio->err) { - if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, - (sbio->physical + ix * PAGE_SIZE) >> 9, + if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, sbio->bio->bi_io_vec[ix].bv_page) == 0) { if (scrub_fixup_check(sbio, ix) == 0) - return; + return 0; } } + spin_lock(&sdev->stat_lock); + ++sdev->stat.read_errors; + spin_unlock(&sdev->stat_lock); + scrub_fixup(sbio, ix); + return 1; } static int scrub_fixup_check(struct scrub_bio *sbio, int ix) @@ -382,8 +389,14 @@ static void scrub_checksum(struct btrfs_work *work) int ret; if (sbio->err) { + ret = 0; for (i = 0; i < sbio->count; ++i) - scrub_recheck_error(sbio, i); + ret |= scrub_recheck_error(sbio, i); + if (!ret) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.unverified_errors; + spin_unlock(&sdev->stat_lock); + } sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); sbio->bio->bi_flags |= 1 << BIO_UPTODATE; @@ -396,10 +409,6 @@ static void scrub_checksum(struct btrfs_work *work) bi->bv_offset = 0; bi->bv_len = PAGE_SIZE; } - - spin_lock(&sdev->stat_lock); - ++sdev->stat.read_errors; - spin_unlock(&sdev->stat_lock); goto out; } for (i = 0; i < sbio->count; ++i) { @@ -420,8 +429,14 @@ static void scrub_checksum(struct btrfs_work *work) WARN_ON(1); } kunmap_atomic(buffer, KM_USER0); - if (ret) - scrub_recheck_error(sbio, i); + if (ret) { + ret = scrub_recheck_error(sbio, i); + if (!ret) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.unverified_errors; + spin_unlock(&sdev->stat_lock); + } + } } out: -- cgit v1.1 From 558540c17771eaf89b1a3be39aa2c8bc837da1a6 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 19:59:12 +0200 Subject: btrfs scrub: print paths of corrupted files While scrubbing, we may encounter various errors. Previously, a logical address was printed to the log only. Now, all paths belonging to that address are resolved and printed separately. That should work for hardlinks as well as reflinks. Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 163 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 35099fa..221fd5c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -17,10 +17,12 @@ */ #include +#include #include "ctree.h" #include "volumes.h" #include "disk-io.h" #include "ordered-data.h" +#include "backref.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -100,6 +102,19 @@ struct scrub_dev { spinlock_t stat_lock; }; +struct scrub_warning { + struct btrfs_path *path; + u64 extent_item_size; + char *scratch_buf; + char *msg_buf; + const char *errstr; + sector_t sector; + u64 logical; + struct btrfs_device *dev; + int msg_bufsize; + int scratch_bufsize; +}; + static void scrub_free_csums(struct scrub_dev *sdev) { while (!list_empty(&sdev->csum_list)) { @@ -195,6 +210,143 @@ nomem: return ERR_PTR(-ENOMEM); } +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) +{ + u64 isize; + u32 nlink; + int ret; + int i; + struct extent_buffer *eb; + struct btrfs_inode_item *inode_item; + struct scrub_warning *swarn = ctx; + struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; + struct inode_fs_paths *ipath = NULL; + struct btrfs_root *local_root; + struct btrfs_key root_key; + + root_key.objectid = root; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(local_root)) { + ret = PTR_ERR(local_root); + goto err; + } + + ret = inode_item_info(inum, 0, local_root, swarn->path); + if (ret) { + btrfs_release_path(swarn->path); + goto err; + } + + eb = swarn->path->nodes[0]; + inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], + struct btrfs_inode_item); + isize = btrfs_inode_size(eb, inode_item); + nlink = btrfs_inode_nlink(eb, inode_item); + btrfs_release_path(swarn->path); + + ipath = init_ipath(4096, local_root, swarn->path); + ret = paths_from_inode(inum, ipath); + + if (ret < 0) + goto err; + + /* + * we deliberately ignore the bit ipath might have been too small to + * hold all of the paths here + */ + for (i = 0; i < ipath->fspath->elem_cnt; ++i) + printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu, " + "length %llu, links %u (path: %s)\n", swarn->errstr, + swarn->logical, swarn->dev->name, + (unsigned long long)swarn->sector, root, inum, offset, + min(isize - offset, (u64)PAGE_SIZE), nlink, + ipath->fspath->str[i]); + + free_ipath(ipath); + return 0; + +err: + printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu: path " + "resolving failed with ret=%d\n", swarn->errstr, + swarn->logical, swarn->dev->name, + (unsigned long long)swarn->sector, root, inum, offset, ret); + + free_ipath(ipath); + return 0; +} + +static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, + int ix) +{ + struct btrfs_device *dev = sbio->sdev->dev; + struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + struct btrfs_path *path; + struct btrfs_key found_key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct scrub_warning swarn; + u32 item_size; + int ret; + u64 ref_root; + u8 ref_level; + unsigned long ptr = 0; + const int bufsize = 4096; + u64 extent_offset; + + path = btrfs_alloc_path(); + + swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); + swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); + swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + swarn.logical = sbio->logical + ix * PAGE_SIZE; + swarn.errstr = errstr; + swarn.dev = dev; + swarn.msg_bufsize = bufsize; + swarn.scratch_bufsize = bufsize; + + if (!path || !swarn.scratch_buf || !swarn.msg_buf) + goto out; + + ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); + if (ret < 0) + goto out; + + extent_offset = swarn.logical - found_key.objectid; + swarn.extent_item_size = found_key.offset; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + do { + ret = tree_backref_for_extent(&ptr, eb, ei, item_size, + &ref_root, &ref_level); + printk(KERN_WARNING "%s at logical %llu on dev %s, " + "sector %llu: metadata %s (level %d) in tree " + "%llu\n", errstr, swarn.logical, dev->name, + (unsigned long long)swarn.sector, + ref_level ? "node" : "leaf", + ret < 0 ? -1 : ref_level, + ret < 0 ? -1 : ref_root); + } while (ret != 1); + } else { + swarn.path = path; + iterate_extent_inodes(fs_info, path, found_key.objectid, + extent_offset, + scrub_print_warning_inode, &swarn); + } + +out: + btrfs_free_path(path); + kfree(swarn.scratch_buf); + kfree(swarn.msg_buf); +} + /* * scrub_recheck_error gets called when either verification of the page * failed or the bio failed to read, e.g. with EIO. In the latter case, @@ -205,6 +357,8 @@ static int scrub_recheck_error(struct scrub_bio *sbio, int ix) { struct scrub_dev *sdev = sbio->sdev; u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (sbio->err) { if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, @@ -212,6 +366,11 @@ static int scrub_recheck_error(struct scrub_bio *sbio, int ix) if (scrub_fixup_check(sbio, ix) == 0) return 0; } + if (__ratelimit(&_rs)) + scrub_print_warning("i/o error", sbio, ix); + } else { + if (__ratelimit(&_rs)) + scrub_print_warning("checksum error", sbio, ix); } spin_lock(&sdev->stat_lock); @@ -326,9 +485,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) ++sdev->stat.corrected_errors; spin_unlock(&sdev->stat_lock); - if (printk_ratelimit()) - printk(KERN_ERR "btrfs: fixed up at %llu\n", - (unsigned long long)logical); + printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", + (unsigned long long)logical); return; uncorrectable: @@ -337,9 +495,8 @@ uncorrectable: ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); - if (printk_ratelimit()) - printk(KERN_ERR "btrfs: unable to fixup at %llu\n", - (unsigned long long)logical); + printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " + "logical %llu\n", (unsigned long long)logical); } static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, -- cgit v1.1 From 193ea74b2729e6ddc08fb6bde6e15a3bd4d94071 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 19:56:54 +0200 Subject: btrfs scrub: bugfix: mirror_num off by one Fix the mirror_num determination in scrub_stripe. The rest of the scrub code did not use mirror_num for anything important and that error went unnoticed. The nodatasum fixup patch of this set depends on a correct mirror_num. Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 221fd5c..59caf8f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -452,7 +452,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) * first find a good copy */ for (i = 0; i < multi->num_stripes; ++i) { - if (i == sbio->spag[ix].mirror_num) + if (i + 1 == sbio->spag[ix].mirror_num) continue; if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, @@ -930,21 +930,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; - mirror_num = 0; + mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; offset = map->stripe_len * (num / map->sub_stripes); increment = map->stripe_len * factor; - mirror_num = num % map->sub_stripes; + mirror_num = num % map->sub_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { increment = map->stripe_len; - mirror_num = num % map->num_stripes; + mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { increment = map->stripe_len; - mirror_num = num % map->num_stripes; + mirror_num = num % map->num_stripes + 1; } else { increment = map->stripe_len; - mirror_num = 0; + mirror_num = 1; } path = btrfs_alloc_path(); -- cgit v1.1 From e12fa9cd390f8e93a9144bd99bd6f6ed316fbc1e Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 17 Jun 2011 15:55:21 +0200 Subject: btrfs scrub: use int for mirror_num, not u64 the rest of the code uses int mirror_num, and so should scrub Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 59caf8f..41a0114 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -65,7 +65,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix); struct scrub_page { u64 flags; /* extent flags */ u64 generation; - u64 mirror_num; + int mirror_num; int have_csum; u8 csum[BTRFS_CSUM_SIZE]; }; @@ -776,7 +776,7 @@ nomem: } static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, u64 mirror_num, + u64 physical, u64 flags, u64 gen, int mirror_num, u8 *csum, int force) { struct scrub_bio *sbio; @@ -873,7 +873,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, /* scrub extent tries to collect up to 64 kB for each bio */ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, u64 mirror_num) + u64 physical, u64 flags, u64 gen, int mirror_num) { int ret; u8 csum[BTRFS_CSUM_SIZE]; @@ -919,7 +919,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, u64 physical; u64 logical; u64 generation; - u64 mirror_num; + int mirror_num; u64 increment = map->stripe_len; u64 offset; -- cgit v1.1 From 0ef8e45158f97dde4801b535e25f70f7caf01a27 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 20:04:15 +0200 Subject: btrfs scrub: add fixup code for errors on nodatasum files This removes a FIXME comment and introduces the first part of nodatasum fixup: It gets the corresponding inode for a logical address and triggers a regular readpage for the corrupted sector. Once we have on-the-fly error correction our error will be automatically corrected. The correction code is expected to clear the newly introduced EXTENT_DAMAGED flag, making scrub report that error as "corrected" instead of "uncorrectable" eventually. Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 182 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 41a0114..db09f01 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -22,6 +22,7 @@ #include "volumes.h" #include "disk-io.h" #include "ordered-data.h" +#include "transaction.h" #include "backref.h" /* @@ -89,6 +90,7 @@ struct scrub_dev { int first_free; int curr; atomic_t in_flight; + atomic_t fixup_cnt; spinlock_t list_lock; wait_queue_head_t list_wait; u16 csum_size; @@ -102,6 +104,14 @@ struct scrub_dev { spinlock_t stat_lock; }; +struct scrub_fixup_nodatasum { + struct scrub_dev *sdev; + u64 logical; + struct btrfs_root *root; + struct btrfs_work work; + int mirror_num; +}; + struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; @@ -190,12 +200,13 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) if (i != SCRUB_BIOS_PER_DEV-1) sdev->bios[i]->next_free = i + 1; - else + else sdev->bios[i]->next_free = -1; } sdev->first_free = 0; sdev->curr = -1; atomic_set(&sdev->in_flight, 0); + atomic_set(&sdev->fixup_cnt, 0); atomic_set(&sdev->cancel_req, 0); sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); INIT_LIST_HEAD(&sdev->csum_list); @@ -347,6 +358,151 @@ out: kfree(swarn.msg_buf); } +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct page *page; + unsigned long index; + struct scrub_fixup_nodatasum *fixup = ctx; + int ret; + int corrected; + struct btrfs_key key; + struct inode *inode; + u64 end = offset + PAGE_SIZE - 1; + struct btrfs_root *local_root; + + key.objectid = root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); + if (IS_ERR(local_root)) + return PTR_ERR(local_root); + + key.type = BTRFS_INODE_ITEM_KEY; + key.objectid = inum; + key.offset = 0; + inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, 0, NULL, NULL, GFP_NOFS); + + /* set_extent_bit should either succeed or give proper error */ + WARN_ON(ret > 0); + if (ret) + return ret < 0 ? ret : -EFAULT; + + index = offset >> PAGE_CACHE_SHIFT; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) + return -ENOMEM; + + ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, + btrfs_get_extent, fixup->mirror_num); + wait_on_page_locked(page); + corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, 0, NULL); + + if (corrected) + WARN_ON(!PageUptodate(page)); + else + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, 0, 0, NULL, GFP_NOFS); + + put_page(page); + iput(inode); + + if (ret < 0) + return ret; + + if (ret == 0 && corrected) { + /* + * we only need to call readpage for one of the inodes belonging + * to this extent. so make iterate_extent_inodes stop + */ + return 1; + } + + return -EIO; +} + +static void scrub_fixup_nodatasum(struct btrfs_work *work) +{ + int ret; + struct scrub_fixup_nodatasum *fixup; + struct scrub_dev *sdev; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path; + int uncorrectable = 0; + + fixup = container_of(work, struct scrub_fixup_nodatasum, work); + sdev = fixup->sdev; + fs_info = fixup->root->fs_info; + + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.malloc_errors; + spin_unlock(&sdev->stat_lock); + uncorrectable = 1; + goto out; + } + + trans = btrfs_join_transaction(fixup->root); + if (IS_ERR(trans)) { + uncorrectable = 1; + goto out; + } + + /* + * the idea is to trigger a regular read through the standard path. we + * read a page from the (failed) logical address by specifying the + * corresponding copynum of the failed sector. thus, that readpage is + * expected to fail. + * that is the point where on-the-fly error correction will kick in + * (once it's finished) and rewrite the failed sector if a good copy + * can be found. + */ + ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info, + path, scrub_fixup_readpage, + fixup); + if (ret < 0) { + uncorrectable = 1; + goto out; + } + WARN_ON(ret != 1); + + spin_lock(&sdev->stat_lock); + ++sdev->stat.corrected_errors; + spin_unlock(&sdev->stat_lock); + +out: + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, fixup->root); + if (uncorrectable) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.uncorrectable_errors; + spin_unlock(&sdev->stat_lock); + printk_ratelimited(KERN_ERR "btrfs: unable to fixup " + "(nodatasum) error at logical %llu\n", + fixup->logical); + } + + btrfs_free_path(path); + kfree(fixup); + + /* see caller why we're pretending to be paused in the scrub counters */ + mutex_lock(&fs_info->scrub_lock); + atomic_dec(&fs_info->scrubs_running); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_dec(&sdev->fixup_cnt); + wake_up(&fs_info->scrub_pause_wait); + wake_up(&sdev->list_wait); +} + /* * scrub_recheck_error gets called when either verification of the page * failed or the bio failed to read, e.g. with EIO. In the latter case, @@ -417,6 +573,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct btrfs_multi_bio *multi = NULL; + struct scrub_fixup_nodatasum *fixup; u64 logical = sbio->logical + ix * PAGE_SIZE; u64 length; int i; @@ -425,12 +582,30 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && (sbio->spag[ix].have_csum == 0)) { + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + goto uncorrectable; + fixup->sdev = sdev; + fixup->logical = logical; + fixup->root = fs_info->extent_root; + fixup->mirror_num = sbio->spag[ix].mirror_num; /* - * nodatasum, don't try to fix anything - * FIXME: we can do better, open the inode and trigger a - * writeback + * increment scrubs_running to prevent cancel requests from + * completing as long as a fixup worker is running. we must also + * increment scrubs_paused to prevent deadlocking on pause + * requests used for transactions commits (as the worker uses a + * transaction context). it is safe to regard the fixup worker + * as paused for all matters practical. effectively, we only + * avoid cancellation requests from completing. */ - goto uncorrectable; + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrubs_running); + atomic_inc(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_inc(&sdev->fixup_cnt); + fixup->work.func = scrub_fixup_nodatasum; + btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); + return; } length = PAGE_SIZE; @@ -1425,10 +1600,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, ret = scrub_enumerate_chunks(sdev, start, end); wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); - atomic_dec(&fs_info->scrubs_running); wake_up(&fs_info->scrub_pause_wait); + wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); + if (progress) memcpy(progress, &sdev->stat, sizeof(*progress)); -- cgit v1.1 From a1d3c4786a4b9c71c0767aa656a759968f7554b6 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 4 Aug 2011 17:15:33 +0200 Subject: btrfs: btrfs_multi_bio replaced with btrfs_bio btrfs_bio is a bio abstraction able to split and not complete after the last bio has returned (like the old btrfs_multi_bio). Additionally, btrfs_bio tracks the mirror_num used to read data which can be used for error correction purposes. Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index db09f01..97142a2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -572,7 +572,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) struct scrub_dev *sdev = sbio->sdev; struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; struct scrub_fixup_nodatasum *fixup; u64 logical = sbio->logical + ix * PAGE_SIZE; u64 length; @@ -610,8 +610,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) length = PAGE_SIZE; ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, - &multi, 0); - if (ret || !multi || length < PAGE_SIZE) { + &bbio, 0); + if (ret || !bbio || length < PAGE_SIZE) { printk(KERN_ERR "scrub_fixup: btrfs_map_block failed us for %llu\n", (unsigned long long)logical); @@ -619,19 +619,19 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) return; } - if (multi->num_stripes == 1) + if (bbio->num_stripes == 1) /* there aren't any replicas */ goto uncorrectable; /* * first find a good copy */ - for (i = 0; i < multi->num_stripes; ++i) { + for (i = 0; i < bbio->num_stripes; ++i) { if (i + 1 == sbio->spag[ix].mirror_num) continue; - if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, - multi->stripes[i].physical >> 9, + if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, + bbio->stripes[i].physical >> 9, sbio->bio->bi_io_vec[ix].bv_page)) { /* I/O-error, this is not a good copy */ continue; @@ -640,7 +640,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) if (scrub_fixup_check(sbio, ix) == 0) break; } - if (i == multi->num_stripes) + if (i == bbio->num_stripes) goto uncorrectable; if (!sdev->readonly) { @@ -655,7 +655,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) } } - kfree(multi); + kfree(bbio); spin_lock(&sdev->stat_lock); ++sdev->stat.corrected_errors; spin_unlock(&sdev->stat_lock); @@ -665,7 +665,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) return; uncorrectable: - kfree(multi); + kfree(bbio); spin_lock(&sdev->stat_lock); ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); -- cgit v1.1 From 5da6fcbc4eb50c0f55d520750332f5a6ab13508c Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 4 Aug 2011 18:11:04 +0200 Subject: btrfs: integrating raid-repair and scrub-fixup-nodatasum This ties nodatasum fixup in scrub together with raid repair patches. While both series are working fine alone, scrub will report uncorrectable errors if they occur in a nodatasum extent *and* the page is in the page cache. Previously, we would have triggered readpage to find good data and do the repair. However, readpage wouldn't read anything in the case where the page is up to date in the cache. So, we simply take that good data we have and call repair_io_failure directly (unless the page in the cache is dirty). Signed-off-by: Jan Schmidt --- fs/btrfs/scrub.c | 92 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 67 insertions(+), 25 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 97142a2..eba42e5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -24,6 +24,7 @@ #include "ordered-data.h" #include "transaction.h" #include "backref.h" +#include "extent_io.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -360,13 +361,13 @@ out: static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) { - struct page *page; + struct page *page = NULL; unsigned long index; struct scrub_fixup_nodatasum *fixup = ctx; int ret; - int corrected; + int corrected = 0; struct btrfs_key key; - struct inode *inode; + struct inode *inode = NULL; u64 end = offset + PAGE_SIZE - 1; struct btrfs_root *local_root; @@ -384,34 +385,75 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) if (IS_ERR(inode)) return PTR_ERR(inode); - ret = set_extent_bit(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, 0, NULL, NULL, GFP_NOFS); - - /* set_extent_bit should either succeed or give proper error */ - WARN_ON(ret > 0); - if (ret) - return ret < 0 ? ret : -EFAULT; - index = offset >> PAGE_CACHE_SHIFT; page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); - if (!page) - return -ENOMEM; + if (!page) { + ret = -ENOMEM; + goto out; + } + + if (PageUptodate(page)) { + struct btrfs_mapping_tree *map_tree; + if (PageDirty(page)) { + /* + * we need to write the data to the defect sector. the + * data that was in that sector is not in memory, + * because the page was modified. we must not write the + * modified page to that sector. + * + * TODO: what could be done here: wait for the delalloc + * runner to write out that page (might involve + * COW) and see whether the sector is still + * referenced afterwards. + * + * For the meantime, we'll treat this error + * incorrectable, although there is a chance that a + * later scrub will find the bad sector again and that + * there's no dirty page in memory, then. + */ + ret = -EIO; + goto out; + } + map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; + ret = repair_io_failure(map_tree, offset, PAGE_SIZE, + fixup->logical, page, + fixup->mirror_num); + unlock_page(page); + corrected = !ret; + } else { + /* + * we need to get good data first. the general readpage path + * will call repair_io_failure for us, we just have to make + * sure we read the bad mirror. + */ + ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); + if (ret) { + /* set_extent_bits should give proper error */ + WARN_ON(ret > 0); + if (ret > 0) + ret = -EFAULT; + goto out; + } - ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, - btrfs_get_extent, fixup->mirror_num); - wait_on_page_locked(page); - corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, 0, NULL); + ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, + btrfs_get_extent, + fixup->mirror_num); + wait_on_page_locked(page); - if (corrected) - WARN_ON(!PageUptodate(page)); - else - clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, 0, 0, NULL, GFP_NOFS); + corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, + end, EXTENT_DAMAGED, 0, NULL); + if (!corrected) + clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); + } - put_page(page); - iput(inode); +out: + if (page) + put_page(page); + if (inode) + iput(inode); if (ret < 0) return ret; -- cgit v1.1 From 7a26285eea8eb92e0088db011571d887d4551b0f Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 10 Jun 2011 12:39:23 +0200 Subject: btrfs: use readahead API for scrub Scrub uses a simple tree-enumeration to bring the relevant portions of the extent- and csum-tree into the page cache before starting the scrub-I/O. This is now replaced by using the new readahead-API. During readahead the scrub is being accounted as paused, so it won't hold off transaction commits. This change raises the average disk bandwith utilisation on my test volume from 70% to 90%. On another volume, the time for a test run went down from 89s to 43s. Changes v5: - reada1/2 are now of type struct reada_control * Signed-off-by: Arne Jansen --- fs/btrfs/scrub.c | 112 +++++++++++++++++++++++++------------------------------ 1 file changed, 50 insertions(+), 62 deletions(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5..f930f27 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -29,15 +29,12 @@ * any can be found. * * Future enhancements: - * - To enhance the performance, better read-ahead strategies for the - * extent-tree can be employed. * - In case an unrepairable extent is encountered, track which files are * affected and report them * - In case of a read error on files with nodatasum, map the file and read * the extent to trigger a writeback of the good copy * - track and record media errors, throw out bad devices * - add a mode to also read unallocated space - * - make the prefetch cancellable */ struct scrub_bio; @@ -741,13 +738,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, int slot; int i; u64 nstripes; - int start_stripe; struct extent_buffer *l; struct btrfs_key key; u64 physical; u64 logical; u64 generation; u64 mirror_num; + struct reada_control *reada1; + struct reada_control *reada2; + struct btrfs_key key_start; + struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; @@ -779,81 +779,67 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (!path) return -ENOMEM; - path->reada = 2; path->search_commit_root = 1; path->skip_locking = 1; /* - * find all extents for each stripe and just read them to get - * them into the page cache - * FIXME: we can do better. build a more intelligent prefetching + * trigger the readahead for extent tree csum tree and wait for + * completion. During readahead, the scrub is officially paused + * to not hold off transaction commits */ logical = base + offset; - physical = map->stripes[num].physical; - ret = 0; - for (i = 0; i < nstripes; ++i) { - key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)0; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out_noplug; - - /* - * we might miss half an extent here, but that doesn't matter, - * as it's only the prefetch - */ - while (1) { - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out_noplug; - break; - } - btrfs_item_key_to_cpu(l, &key, slot); + wait_event(sdev->list_wait, + atomic_read(&sdev->in_flight) == 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); - if (key.objectid >= logical + map->stripe_len) - break; + /* FIXME it might be better to start readahead at commit root */ + key_start.objectid = logical; + key_start.type = BTRFS_EXTENT_ITEM_KEY; + key_start.offset = (u64)0; + key_end.objectid = base + offset + nstripes * increment; + key_end.type = BTRFS_EXTENT_ITEM_KEY; + key_end.offset = (u64)0; + reada1 = btrfs_reada_add(root, &key_start, &key_end); + + key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_start.type = BTRFS_EXTENT_CSUM_KEY; + key_start.offset = logical; + key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_end.type = BTRFS_EXTENT_CSUM_KEY; + key_end.offset = base + offset + nstripes * increment; + reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); + + if (!IS_ERR(reada1)) + btrfs_reada_wait(reada1); + if (!IS_ERR(reada2)) + btrfs_reada_wait(reada2); - path->slots[0]++; - } - btrfs_release_path(path); - logical += increment; - physical += map->stripe_len; - cond_resched(); + mutex_lock(&fs_info->scrub_lock); + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); } + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); /* * collect all data csums for the stripe to avoid seeking during * the scrub. This might currently (crc32) end up to be about 1MB */ - start_stripe = 0; blk_start_plug(&plug); -again: - logical = base + offset + start_stripe * increment; - for (i = start_stripe; i < nstripes; ++i) { - ret = btrfs_lookup_csums_range(csum_root, logical, - logical + map->stripe_len - 1, - &sdev->csum_list, 1); - if (ret) - goto out; - logical += increment; - cond_resched(); - } /* * now find all extents for each stripe and scrub them */ - logical = base + offset + start_stripe * increment; - physical = map->stripes[num].physical + start_stripe * map->stripe_len; + logical = base + offset; + physical = map->stripes[num].physical; ret = 0; - for (i = start_stripe; i < nstripes; ++i) { + for (i = 0; i < nstripes; ++i) { /* * canceled? */ @@ -882,11 +868,14 @@ again: atomic_dec(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); wake_up(&fs_info->scrub_pause_wait); - scrub_free_csums(sdev); - start_stripe = i; - goto again; } + ret = btrfs_lookup_csums_range(csum_root, logical, + logical + map->stripe_len - 1, + &sdev->csum_list, 1); + if (ret) + goto out; + key.objectid = logical; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = (u64)0; @@ -982,7 +971,6 @@ next: out: blk_finish_plug(&plug); -out_noplug: btrfs_free_path(path); return ret < 0 ? ret : 0; } -- cgit v1.1 From 6c41761fc6efe1503103a1afe03a6635c0b5d4ec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 13 Apr 2011 15:41:04 +0200 Subject: btrfs: separate superblock items out of fs_info fs_info has now ~9kb, more than fits into one page. This will cause mount failure when memory is too fragmented. Top space consumers are super block structures super_copy and super_for_commit, ~2.8kb each. Allocate them dynamically. fs_info will be ~3.5kb. (measured on x86_64) Add a wrapper for freeing fs_info and all of it's dynamically allocated members. Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5..69a600f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -182,7 +182,7 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) sdev->curr = -1; atomic_set(&sdev->in_flight, 0); atomic_set(&sdev->cancel_req, 0); - sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); + sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); INIT_LIST_HEAD(&sdev->csum_list); spin_lock_init(&sdev->list_lock); -- cgit v1.1 From 740c3d226cbba6cd6a32adfb66809c94938f3e57 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Nov 2011 15:48:34 -0400 Subject: Btrfs: fix the new inspection ioctls for 32 bit compat The new ioctls to follow backrefs are not clean for 32/64 bit compat. This reworks them for u64s everywhere. They are brand new, so there are no problems with changing the interface now. Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 94cd3a1..562dad1 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -272,7 +272,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) swarn->logical, swarn->dev->name, (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, - ipath->fspath->str[i]); + (char *)ipath->fspath->val[i]); free_ipath(ipath); return 0; -- cgit v1.1 From 56d2a48f81a1bde827c625b90221fade72fe9c61 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 4 Nov 2011 09:41:02 -0400 Subject: Btrfs: fix a potential btrfs_bio leak on scrub fixups In case we were able to map less than we wanted (length < PAGE_SIZE clause is true) btrfs_bio is still allocated and we have to free it. Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/scrub.c') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 562dad1..ed11d38 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -655,6 +655,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) "scrub_fixup: btrfs_map_block failed us for %llu\n", (unsigned long long)logical); WARN_ON(1); + kfree(bbio); return; } -- cgit v1.1