From 4ca40c2ce099e4f1ce35445994f49836662596c8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 23 Dec 2011 10:17:55 +1100
Subject: md/raid10: Allow replacement device to replace old drive.

When recovery finishes and spare_active is called, check for a
replacement that might have just become fully synced and mark it
as such, marking the original as failed.

Then when the original is removed, move the replacement into
its position.

This means that 'replacement' can spontaneously become NULL in some
situations.  Make sure we check for those.
It also means that 'rdev' and 'replacement' could appear to be
identical - check for that too.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 72 +++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 61 insertions(+), 11 deletions(-)

(limited to 'drivers/md/raid10.c')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 403f05a..90e95173 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -397,14 +397,17 @@ static void raid10_end_write_request(struct bio *bio, int error)
 	int dec_rdev = 1;
 	struct r10conf *conf = r10_bio->mddev->private;
 	int slot, repl;
-	struct md_rdev *rdev;
+	struct md_rdev *rdev = NULL;
 
 	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 
 	if (repl)
 		rdev = conf->mirrors[dev].replacement;
-	else
+	if (!rdev) {
+		smp_rmb();
+		repl = 0;
 		rdev = conf->mirrors[dev].rdev;
+	}
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
@@ -1089,6 +1092,8 @@ retry_write:
 		struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
 		struct md_rdev *rrdev = rcu_dereference(
 			conf->mirrors[d].replacement);
+		if (rdev == rrdev)
+			rrdev = NULL;
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 			atomic_inc(&rdev->nr_pending);
 			blocked_rdev = rdev;
@@ -1170,9 +1175,15 @@ retry_write:
 				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
 			}
 			if (r10_bio->devs[j].repl_bio) {
+				struct md_rdev *rdev;
 				d = r10_bio->devs[j].devnum;
-				rdev_dec_pending(
-					conf->mirrors[d].replacement, mddev);
+				rdev = conf->mirrors[d].replacement;
+				if (!rdev) {
+					/* Race with remove_disk */
+					smp_mb();
+					rdev = conf->mirrors[d].rdev;
+				}
+				rdev_dec_pending(rdev, mddev);
 			}
 		}
 		allow_barrier(conf);
@@ -1230,6 +1241,10 @@ retry_write:
 			    max_sectors);
 		r10_bio->devs[i].repl_bio = mbio;
 
+		/* We are actively writing to the original device
+		 * so it cannot disappear, so the replacement cannot
+		 * become NULL here
+		 */
 		mbio->bi_sector	= (r10_bio->devs[i].addr+
 				   conf->mirrors[d].replacement->data_offset);
 		mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
@@ -1404,9 +1419,27 @@ static int raid10_spare_active(struct mddev *mddev)
 	 */
 	for (i = 0; i < conf->raid_disks; i++) {
 		tmp = conf->mirrors + i;
-		if (tmp->rdev
-		    && !test_bit(Faulty, &tmp->rdev->flags)
-		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+		if (tmp->replacement
+		    && tmp->replacement->recovery_offset == MaxSector
+		    && !test_bit(Faulty, &tmp->replacement->flags)
+		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
+			/* Replacement has just become active */
+			if (!tmp->rdev
+			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
+				count++;
+			if (tmp->rdev) {
+				/* Replaced device not technically faulty,
+				 * but we need to be sure it gets removed
+				 * and never re-added.
+				 */
+				set_bit(Faulty, &tmp->rdev->flags);
+				sysfs_notify_dirent_safe(
+					tmp->rdev->sysfs_state);
+			}
+			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
+		} else if (tmp->rdev
+			   && !test_bit(Faulty, &tmp->rdev->flags)
+			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
 			count++;
 			sysfs_notify_dirent(tmp->rdev->sysfs_state);
 		}
@@ -1506,6 +1539,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	 */
 	if (!test_bit(Faulty, &rdev->flags) &&
 	    mddev->recovery_disabled != p->recovery_disabled &&
+	    (!p->replacement || p->replacement == rdev) &&
 	    enough(conf, -1)) {
 		err = -EBUSY;
 		goto abort;
@@ -1517,7 +1551,21 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 		err = -EBUSY;
 		*rdevp = rdev;
 		goto abort;
-	}
+	} else if (p->replacement) {
+		/* We must have just cleared 'rdev' */
+		p->rdev = p->replacement;
+		clear_bit(Replacement, &p->replacement->flags);
+		smp_mb(); /* Make sure other CPUs may see both as identical
+			   * but will never see neither -- if they are careful.
+			   */
+		p->replacement = NULL;
+		clear_bit(WantReplacement, &rdev->flags);
+	} else
+		/* We might have just removed the Replacement as faulty
+		 * Clear the flag just in case
+		 */
+		clear_bit(WantReplacement, &rdev->flags);
+
 	err = md_integrity_register(mddev);
 
 abort:
@@ -1595,13 +1643,15 @@ static void end_sync_write(struct bio *bio, int error)
 	int bad_sectors;
 	int slot;
 	int repl;
-	struct md_rdev *rdev;
+	struct md_rdev *rdev = NULL;
 
 	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 	if (repl)
 		rdev = conf->mirrors[d].replacement;
-	else
+	if (!rdev) {
+		smp_mb();
 		rdev = conf->mirrors[d].rdev;
+	}
 
 	if (!uptodate) {
 		if (repl)
@@ -2368,7 +2418,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 			}
 			bio = r10_bio->devs[m].repl_bio;
 			rdev = conf->mirrors[dev].replacement;
-			if (bio == IO_MADE_GOOD) {
+			if (rdev && bio == IO_MADE_GOOD) {
 				rdev_clear_badblocks(
 					rdev,
 					r10_bio->devs[m].addr,
-- 
cgit v1.1