Btrfs, scrub: repair the common data on RAID5/6 if it is corrupted

This patch implement the RAID5/6 common data repair function, the implementation is similar to the scrub on the other RAID such as RAID1, the differentia is that we don't read the data from the mirror, we use the data repair function of RAID5/6. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
author: Miao Xie <miaox@cn.fujitsu.com> 2014-10-23 14:42:50 +0800
committer: Miao Xie <miaox@cn.fujitsu.com> 2014-12-03 10:18:45 +0800
commit: af8e2d1df9848b39dd86b1e696bf8781d2020a88 (patch)
tree: 0abf72105056f1c5fe7038c37f4c5d63ea29c875
parent: b89e1b012c7f81123344058d5f245b844464d30c (diff)
download: op-kernel-dev-af8e2d1df9848b39dd86b1e696bf8781d2020a88.zip
op-kernel-dev-af8e2d1df9848b39dd86b1e696bf8781d2020a88.tar.gz
5 files changed, 235 insertions, 33 deletions
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index c54b0e6..95053a9 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,6 +58,15 @@
  */
 #define RBIO_CACHE_READY_BIT	3
 
+/*
+ * bbio and raid_map is managed by the caller, so we shouldn't free
+ * them here. And besides that, all rbios with this flag should not
+ * be cached, because we need raid_map to check the rbios' stripe
+ * is the same or not, but it is very likely that the caller has
+ * free raid_map, so don't cache those rbios.
+ */
+#define RBIO_HOLD_BBIO_MAP_BIT	4
+
 #define RBIO_CACHE_SIZE 1024
 
 struct btrfs_raid_bio {
@@ -799,6 +808,21 @@ done_nolock:
 		remove_rbio_from_cache(rbio);
 }
 
+static inline void
+__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
+{
+	if (need) {
+		kfree(raid_map);
+		kfree(bbio);
+	}
+}
+
+static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
+{
+	__free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
+			!test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
+}
+
 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 {
 	int i;
@@ -817,8 +841,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 			rbio->stripe_pages[i] = NULL;
 		}
 	}
-	kfree(rbio->raid_map);
-	kfree(rbio->bbio);
+
+	free_bbio_and_raid_map(rbio);
+
 	kfree(rbio);
 }
 
@@ -933,11 +958,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
 
 	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
 			GFP_NOFS);
-	if (!rbio) {
-		kfree(raid_map);
-		kfree(bbio);
+	if (!rbio)
 		return ERR_PTR(-ENOMEM);
-	}
 
 	bio_list_init(&rbio->bio_list);
 	INIT_LIST_HEAD(&rbio->plug_list);
@@ -1692,8 +1714,10 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
 	struct blk_plug_cb *cb;
 
 	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
-	if (IS_ERR(rbio))
+	if (IS_ERR(rbio)) {
+		__free_bbio_and_raid_map(bbio, raid_map, 1);
 		return PTR_ERR(rbio);
+	}
 	bio_list_add(&rbio->bio_list, bio);
 	rbio->bio_list_bytes = bio->bi_iter.bi_size;
 
@@ -1888,7 +1912,8 @@ cleanup:
 cleanup_io:
 
 	if (rbio->read_rebuild) {
-		if (err == 0)
+		if (err == 0 &&
+		    !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
 			cache_rbio_pages(rbio);
 		else
 			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -2038,15 +2063,19 @@ cleanup:
  */
 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
 			  struct btrfs_bio *bbio, u64 *raid_map,
-			  u64 stripe_len, int mirror_num)
+			  u64 stripe_len, int mirror_num, int hold_bbio)
 {
 	struct btrfs_raid_bio *rbio;
 	int ret;
 
 	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
-	if (IS_ERR(rbio))
+	if (IS_ERR(rbio)) {
+		__free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
 		return PTR_ERR(rbio);
+	}
 
+	if (hold_bbio)
+		set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
 	rbio->read_rebuild = 1;
 	bio_list_add(&rbio->bio_list, bio);
 	rbio->bio_list_bytes = bio->bi_iter.bi_size;
@@ -2054,8 +2083,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
 	rbio->faila = find_logical_bio_stripe(rbio, bio);
 	if (rbio->faila == -1) {
 		BUG();
-		kfree(raid_map);
-		kfree(bbio);
+		__free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
 		kfree(rbio);
 		return -EIO;
 	}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index ea5d73b..b310e8c 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -41,7 +41,7 @@ static inline int nr_data_stripes(struct map_lookup *map)
 
 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
 				 struct btrfs_bio *bbio, u64 *raid_map,
-				 u64 stripe_len, int mirror_num);
+				 u64 stripe_len, int mirror_num, int hold_bbio);
 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
 			       struct btrfs_bio *bbio, u64 *raid_map,
 			       u64 stripe_len);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efa0831..ca4b9eb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -63,6 +63,13 @@ struct scrub_ctx;
  */
 #define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
 
+struct scrub_recover {
+	atomic_t		refs;
+	struct btrfs_bio	*bbio;
+	u64			*raid_map;
+	u64			map_length;
+};
+
 struct scrub_page {
 	struct scrub_block	*sblock;
 	struct page		*page;
@@ -79,6 +86,8 @@ struct scrub_page {
 		unsigned int	io_error:1;
 	};
 	u8			csum[BTRFS_CSUM_SIZE];
+
+	struct scrub_recover	*recover;
 };
 
 struct scrub_bio {
@@ -196,7 +205,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 				struct scrub_block *sblock, int is_metadata,
 				int have_csum, u8 *csum, u64 generation,
-				u16 csum_size);
+				u16 csum_size, int retry_failed_mirror);
 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 					 struct scrub_block *sblock,
 					 int is_metadata, int have_csum,
@@ -790,6 +799,20 @@ out:
 	scrub_pending_trans_workers_dec(sctx);
 }
 
+static inline void scrub_get_recover(struct scrub_recover *recover)
+{
+	atomic_inc(&recover->refs);
+}
+
+static inline void scrub_put_recover(struct scrub_recover *recover)
+{
+	if (atomic_dec_and_test(&recover->refs)) {
+		kfree(recover->bbio);
+		kfree(recover->raid_map);
+		kfree(recover);
+	}
+}
+
 /*
  * scrub_handle_errored_block gets called when either verification of the
  * pages failed or the bio failed to read, e.g. with EIO. In the latter
@@ -906,7 +929,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
 	/* build and submit the bios for the failed mirror, check checksums */
 	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
-			    csum, generation, sctx->csum_size);
+			    csum, generation, sctx->csum_size, 1);
 
 	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
 	    sblock_bad->no_io_error_seen) {
@@ -1019,7 +1042,7 @@ nodatasum_case:
 		/* build and submit the bios, check checksums */
 		scrub_recheck_block(fs_info, sblock_other, is_metadata,
 				    have_csum, csum, generation,
-				    sctx->csum_size);
+				    sctx->csum_size, 0);
 
 		if (!sblock_other->header_error &&
 		    !sblock_other->checksum_error &&
@@ -1169,7 +1192,7 @@ nodatasum_case:
 			 */
 			scrub_recheck_block(fs_info, sblock_bad,
 					    is_metadata, have_csum, csum,
-					    generation, sctx->csum_size);
+					    generation, sctx->csum_size, 1);
 			if (!sblock_bad->header_error &&
 			    !sblock_bad->checksum_error &&
 			    sblock_bad->no_io_error_seen)
@@ -1201,11 +1224,18 @@ out:
 		     mirror_index++) {
 			struct scrub_block *sblock = sblocks_for_recheck +
 						     mirror_index;
+			struct scrub_recover *recover;
 			int page_index;
 
 			for (page_index = 0; page_index < sblock->page_count;
 			     page_index++) {
 				sblock->pagev[page_index]->sblock = NULL;
+				recover = sblock->pagev[page_index]->recover;
+				if (recover) {
+					scrub_put_recover(recover);
+					sblock->pagev[page_index]->recover =
+									NULL;
+				}
 				scrub_page_put(sblock->pagev[page_index]);
 			}
 		}
@@ -1215,14 +1245,63 @@ out:
 	return 0;
 }
 
+static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
+{
+	if (raid_map) {
+		if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
+			return 3;
+		else
+			return 2;
+	} else {
+		return (int)bbio->num_stripes;
+	}
+}
+
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
+						 u64 mapped_length,
+						 int nstripes, int mirror,
+						 int *stripe_index,
+						 u64 *stripe_offset)
+{
+	int i;
+
+	if (raid_map) {
+		/* RAID5/6 */
+		for (i = 0; i < nstripes; i++) {
+			if (raid_map[i] == RAID6_Q_STRIPE ||
+			    raid_map[i] == RAID5_P_STRIPE)
+				continue;
+
+			if (logical >= raid_map[i] &&
+			    logical < raid_map[i] + mapped_length)
+				break;
+		}
+
+		*stripe_index = i;
+		*stripe_offset = logical - raid_map[i];
+	} else {
+		/* The other RAID type */
+		*stripe_index = mirror;
+		*stripe_offset = 0;
+	}
+}
+
 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 				     struct btrfs_fs_info *fs_info,
 				     struct scrub_block *original_sblock,
 				     u64 length, u64 logical,
 				     struct scrub_block *sblocks_for_recheck)
 {
+	struct scrub_recover *recover;
+	struct btrfs_bio *bbio;
+	u64 *raid_map;
+	u64 sublen;
+	u64 mapped_length;
+	u64 stripe_offset;
+	int stripe_index;
 	int page_index;
 	int mirror_index;
+	int nmirrors;
 	int ret;
 
 	/*
@@ -1233,23 +1312,39 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 
 	page_index = 0;
 	while (length > 0) {
-		u64 sublen = min_t(u64, length, PAGE_SIZE);
-		u64 mapped_length = sublen;
-		struct btrfs_bio *bbio = NULL;
+		sublen = min_t(u64, length, PAGE_SIZE);
+		mapped_length = sublen;
+		bbio = NULL;
+		raid_map = NULL;
 
 		/*
 		 * with a length of PAGE_SIZE, each returned stripe
 		 * represents one mirror
 		 */
-		ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
-				      &mapped_length, &bbio, 0);
+		ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
+				       &mapped_length, &bbio, 0, &raid_map);
 		if (ret || !bbio || mapped_length < sublen) {
 			kfree(bbio);
+			kfree(raid_map);
 			return -EIO;
 		}
 
+		recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
+		if (!recover) {
+			kfree(bbio);
+			kfree(raid_map);
+			return -ENOMEM;
+		}
+
+		atomic_set(&recover->refs, 1);
+		recover->bbio = bbio;
+		recover->raid_map = raid_map;
+		recover->map_length = mapped_length;
+
 		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
-		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
+
+		nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
+		for (mirror_index = 0; mirror_index < nmirrors;
 		     mirror_index++) {
 			struct scrub_block *sblock;
 			struct scrub_page *page;
@@ -1265,26 +1360,38 @@ leave_nomem:
 				spin_lock(&sctx->stat_lock);
 				sctx->stat.malloc_errors++;
 				spin_unlock(&sctx->stat_lock);
-				kfree(bbio);
+				scrub_put_recover(recover);
 				return -ENOMEM;
 			}
 			scrub_page_get(page);
 			sblock->pagev[page_index] = page;
 			page->logical = logical;
-			page->physical = bbio->stripes[mirror_index].physical;
+
+			scrub_stripe_index_and_offset(logical, raid_map,
+						      mapped_length,
+						      bbio->num_stripes,
+						      mirror_index,
+						      &stripe_index,
+						      &stripe_offset);
+			page->physical = bbio->stripes[stripe_index].physical +
+					 stripe_offset;
+			page->dev = bbio->stripes[stripe_index].dev;
+
 			BUG_ON(page_index >= original_sblock->page_count);
 			page->physical_for_dev_replace =
 				original_sblock->pagev[page_index]->
 				physical_for_dev_replace;
 			/* for missing devices, dev->bdev is NULL */
-			page->dev = bbio->stripes[mirror_index].dev;
 			page->mirror_num = mirror_index + 1;
 			sblock->page_count++;
 			page->page = alloc_page(GFP_NOFS);
 			if (!page->page)
 				goto leave_nomem;
+
+			scrub_get_recover(recover);
+			page->recover = recover;
 		}
-		kfree(bbio);
+		scrub_put_recover(recover);
 		length -= sublen;
 		logical += sublen;
 		page_index++;
@@ -1293,6 +1400,51 @@ leave_nomem:
 	return 0;
 }
 
+struct scrub_bio_ret {
+	struct completion event;
+	int error;
+};
+
+static void scrub_bio_wait_endio(struct bio *bio, int error)
+{
+	struct scrub_bio_ret *ret = bio->bi_private;
+
+	ret->error = error;
+	complete(&ret->event);
+}
+
+static inline int scrub_is_page_on_raid56(struct scrub_page *page)
+{
+	return page->recover && page->recover->raid_map;
+}
+
+static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
+					struct bio *bio,
+					struct scrub_page *page)
+{
+	struct scrub_bio_ret done;
+	int ret;
+
+	init_completion(&done.event);
+	done.error = 0;
+	bio->bi_iter.bi_sector = page->logical >> 9;
+	bio->bi_private = &done;
+	bio->bi_end_io = scrub_bio_wait_endio;
+
+	ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
+				    page->recover->raid_map,
+				    page->recover->map_length,
+				    page->mirror_num, 1);
+	if (ret)
+		return ret;
+
+	wait_for_completion(&done.event);
+	if (done.error)
+		return -EIO;
+
+	return 0;
+}
+
 /*
  * this function will check the on disk data for checksum errors, header
  * errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1303,7 +1455,7 @@ leave_nomem:
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 				struct scrub_block *sblock, int is_metadata,
 				int have_csum, u8 *csum, u64 generation,
-				u16 csum_size)
+				u16 csum_size, int retry_failed_mirror)
 {
 	int page_num;
 
@@ -1329,11 +1481,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 			continue;
 		}
 		bio->bi_bdev = page->dev->bdev;
-		bio->bi_iter.bi_sector = page->physical >> 9;
 
 		bio_add_page(bio, page->page, PAGE_SIZE, 0);
-		if (btrfsic_submit_bio_wait(READ, bio))
-			sblock->no_io_error_seen = 0;
+		if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
+			if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
+				sblock->no_io_error_seen = 0;
+		} else {
+			bio->bi_iter.bi_sector = page->physical >> 9;
+
+			if (btrfsic_submit_bio_wait(READ, bio))
+				sblock->no_io_error_seen = 0;
+		}
 
 		bio_put(bio);
 	}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6f5b302..217c42e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5161,7 +5161,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 				BTRFS_BLOCK_GROUP_RAID6)) {
 		u64 tmp;
 
-		if (raid_map_ret && ((rw & REQ_WRITE) || mirror_num > 1)) {
+		if (raid_map_ret &&
+		    ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+		     mirror_num > 1)) {
 			int i, rot;
 
 			/* push stripe_nr back to the start of the full stripe */
@@ -5440,6 +5442,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 				 mirror_num, NULL);
 }
 
+/* For Scrub/replace */
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+		     u64 logical, u64 *length,
+		     struct btrfs_bio **bbio_ret, int mirror_num,
+		     u64 **raid_map_ret)
+{
+	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
+				 mirror_num, raid_map_ret);
+}
+
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 		     u64 chunk_start, u64 physical, u64 devid,
 		     u64 **logical, int *naddrs, int *stripe_len)
@@ -5809,7 +5821,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		} else {
 			ret = raid56_parity_recover(root, bio, bbio,
 						    raid_map, map_length,
-						    mirror_num);
+						    mirror_num, 0);
 		}
 		/*
 		 * FIXME, replace dosen't support raid56 yet, please fix
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 08980fa..01094bb 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -393,6 +393,10 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		    u64 logical, u64 *length,
 		    struct btrfs_bio **bbio_ret, int mirror_num);
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+		     u64 logical, u64 *length,
+		     struct btrfs_bio **bbio_ret, int mirror_num,
+		     u64 **raid_map_ret);
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 		     u64 chunk_start, u64 physical, u64 devid,
 		     u64 **logical, int *naddrs, int *stripe_len);
author	Miao Xie <miaox@cn.fujitsu.com>	2014-10-23 14:42:50 +0800
committer	Miao Xie <miaox@cn.fujitsu.com>	2014-12-03 10:18:45 +0800
commit	af8e2d1df9848b39dd86b1e696bf8781d2020a88 (patch)
tree	0abf72105056f1c5fe7038c37f4c5d63ea29c875
parent	b89e1b012c7f81123344058d5f245b844464d30c (diff)
download	op-kernel-dev-af8e2d1df9848b39dd86b1e696bf8781d2020a88.zip op-kernel-dev-af8e2d1df9848b39dd86b1e696bf8781d2020a88.tar.gz