drbd: do not reset rs_pending_cnt too early

Fix asserts like block drbd0: in got_BlockAck:4634: rs_pending_cnt = -35 < 0 ! We reset the resync lru cache and related information (rs_pending_cnt), once we successfully finished a resync or online verify, or if the replication connection is lost. We also need to reset it if a resync or online verify is aborted because a lower level disk failed. In that case the replication link is still established, and we may still have packets queued in the network buffers which want to touch rs_pending_cnt. We do not have any synchronization mechanism to know for sure when all such pending resync related packets have been drained. To avoid this counter to go negative (and violate the ASSERT that it will always be >= 0), just do not reset it when we lose a disk. It is good enough to make sure it is re-initialized before the next resync can start: reset it when we re-attach a disk. Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
author: Lars Ellenberg <lars.ellenberg@linbit.com> 2012-06-14 18:02:52 +0200
committer: Philipp Reisner <philipp.reisner@linbit.com> 2012-07-24 14:09:53 +0200
commit: 0029d62434d9045bc3e8b2eb48ae696e30336e92 (patch)
tree: d1e2ce7a3130e5b489320b12b3568c8a58828697 /drivers/block
parent: 88437879fbd0ac5cde3f683f8eee455bbf83aaac (diff)
download: op-kernel-dev-0029d62434d9045bc3e8b2eb48ae696e30336e92.zip
op-kernel-dev-0029d62434d9045bc3e8b2eb48ae696e30336e92.tar.gz
3 files changed, 12 insertions, 12 deletions
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 41ccb58..91a4853 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1514,6 +1514,13 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 
 	/* Do not change the order of the if above and the two below... */
 	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
+		/* we probably will start a resync soon.
+		 * make sure those things are properly reset. */
+		mdev->rs_total = 0;
+		mdev->rs_failed = 0;
+		atomic_set(&mdev->rs_pending_cnt, 0);
+		drbd_rs_cancel_all(mdev);
+
 		drbd_send_uuids(mdev);
 		drbd_send_state(mdev, ns);
 	}
@@ -1681,10 +1688,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
                                 "ASSERT FAILED: disk is %s while going diskless\n",
                                 drbd_disk_str(mdev->state.disk));
 
-                mdev->rs_total = 0;
-                mdev->rs_failed = 0;
-                atomic_set(&mdev->rs_pending_cnt, 0);
-
 		if (ns.conn >= C_CONNECTED)
 			drbd_send_state(mdev, ns);
 
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 03fc853..a68d9bf 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -959,6 +959,11 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	/* make sure there is no leftover from previous force-detach attempts */
 	clear_bit(FORCE_DETACH, &mdev->flags);
 
+	/* and no leftover from previously aborted resync or verify, either */
+	mdev->rs_total = 0;
+	mdev->rs_failed = 0;
+	atomic_set(&mdev->rs_pending_cnt, 0);
+
 	/* allocation not in the IO path, cqueue thread context */
 	nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
 	if (!nbc) {
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index a35393f..6bce2cc 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1501,14 +1501,6 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
 		return;
 	}
 
-	if (mdev->state.conn < C_AHEAD) {
-		/* In case a previous resync run was aborted by an IO error/detach on the peer. */
-		drbd_rs_cancel_all(mdev);
-		/* This should be done when we abort the resync. We definitely do not
-		   want to have this for connections going back and forth between
-		   Ahead/Behind and SyncSource/SyncTarget */
-	}
-
 	if (side == C_SYNC_TARGET) {
 		/* Since application IO was locked out during C_WF_BITMAP_T and
 		   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
author	Lars Ellenberg <lars.ellenberg@linbit.com>	2012-06-14 18:02:52 +0200
committer	Philipp Reisner <philipp.reisner@linbit.com>	2012-07-24 14:09:53 +0200
commit	0029d62434d9045bc3e8b2eb48ae696e30336e92 (patch)
tree	d1e2ce7a3130e5b489320b12b3568c8a58828697 /drivers/block
parent	88437879fbd0ac5cde3f683f8eee455bbf83aaac (diff)
download	op-kernel-dev-0029d62434d9045bc3e8b2eb48ae696e30336e92.zip op-kernel-dev-0029d62434d9045bc3e8b2eb48ae696e30336e92.tar.gz