From 5ea0541321e9ba54c56acd578c1c2fd41c7d0b1d Mon Sep 17 00:00:00 2001
From: trociny <trociny@FreeBSD.org>
Date: Thu, 19 Sep 2013 20:19:08 +0000
Subject: When updating the map of dirty extents, most recently used extents
 are kept dirty to reduce the number of on-disk metadata updates. The sequence
 of operations is:

1) acquire the activemap lock;
2) update in-memory map;
3) if the list of keepdirty extents is changed, update on-disk metadata;
4) release the lock.

On-disk updates are not frequent in comparison with in-memory updates,
while require much more time. So situations are possible when one
thread is updating on-disk metadata and another one is waiting for the
activemap lock just to update the in-memory map.

Improve this by introducing additional, on-disk map lock: when
in-memory map is updated and it is detected that the on-disk map needs
update too, the on-disk map lock is acquired and the on-memory lock is
released before flushing the map.

Reported by:	Yamagi Burmeister yamagi.org
Tested by:	Yamagi Burmeister yamagi.org
Reviewed by:	pjd
Approved by:	re (marius)
MFC after:	2 weeks
---
 sbin/hastd/hast.h    |  4 +++-
 sbin/hastd/primary.c | 30 ++++++++++++++++++++++--------
 2 files changed, 25 insertions(+), 9 deletions(-)

(limited to 'sbin/hastd')

diff --git a/sbin/hastd/hast.h b/sbin/hastd/hast.h
index 381e195..65c24f8 100644
--- a/sbin/hastd/hast.h
+++ b/sbin/hastd/hast.h
@@ -226,8 +226,10 @@ struct hast_resource {
 
 	/* Activemap structure. */
 	struct activemap *hr_amp;
-	/* Locked used to synchronize access to hr_amp. */
+	/* Lock used to synchronize access to hr_amp. */
 	pthread_mutex_t hr_amp_lock;
+	/* Lock used to synchronize access to hr_amp diskmap. */
+	pthread_mutex_t hr_amp_diskmap_lock;
 
 	/* Number of BIO_READ requests. */
 	uint64_t	hr_stat_read;
diff --git a/sbin/hastd/primary.c b/sbin/hastd/primary.c
index 5758f41..09ae17b 100644
--- a/sbin/hastd/primary.c
+++ b/sbin/hastd/primary.c
@@ -291,22 +291,28 @@ primary_exitx(int exitcode, const char *fmt, ...)
 	exit(exitcode);
 }
 
+/* Expects res->hr_amp locked, returns unlocked. */
 static int
 hast_activemap_flush(struct hast_resource *res)
 {
 	const unsigned char *buf;
 	size_t size;
+	int ret;
 
+	mtx_lock(&res->hr_amp_diskmap_lock);
 	buf = activemap_bitmap(res->hr_amp, &size);
+	mtx_unlock(&res->hr_amp_lock);
 	PJDLOG_ASSERT(buf != NULL);
 	PJDLOG_ASSERT((size % res->hr_local_sectorsize) == 0);
+	ret = 0;
 	if (pwrite(res->hr_localfd, buf, size, METADATA_SIZE) !=
 	    (ssize_t)size) {
 		pjdlog_errno(LOG_ERR, "Unable to flush activemap to disk");
 		res->hr_stat_activemap_write_error++;
-		return (-1);
+		ret = -1;
 	}
-	if (res->hr_metaflush == 1 && g_flush(res->hr_localfd) == -1) {
+	if (ret == 0 && res->hr_metaflush == 1 &&
+	    g_flush(res->hr_localfd) == -1) {
 		if (errno == EOPNOTSUPP) {
 			pjdlog_warning("The %s provider doesn't support flushing write cache. Disabling it.",
 			    res->hr_localpath);
@@ -315,10 +321,11 @@ hast_activemap_flush(struct hast_resource *res)
 			pjdlog_errno(LOG_ERR,
 			    "Unable to flush disk cache on activemap update");
 			res->hr_stat_activemap_flush_error++;
-			return (-1);
+			ret = -1;
 		}
 	}
-	return (0);
+	mtx_unlock(&res->hr_amp_diskmap_lock);
+	return (ret);
 }
 
 static bool
@@ -783,6 +790,7 @@ init_remote(struct hast_resource *res, struct proto_conn **inp,
 		 * Now that we merged bitmaps from both nodes, flush it to the
 		 * disk before we start to synchronize.
 		 */
+		mtx_lock(&res->hr_amp_lock);
 		(void)hast_activemap_flush(res);
 	}
 	nv_free(nvin);
@@ -1288,8 +1296,9 @@ ggate_recv_thread(void *arg)
 			    ggio->gctl_offset, ggio->gctl_length)) {
 				res->hr_stat_activemap_update++;
 				(void)hast_activemap_flush(res);
+			} else {
+				mtx_unlock(&res->hr_amp_lock);
 			}
-			mtx_unlock(&res->hr_amp_lock);
 			break;
 		case BIO_DELETE:
 			res->hr_stat_delete++;
@@ -1650,8 +1659,9 @@ done_queue:
 			if (activemap_need_sync(res->hr_amp, ggio->gctl_offset,
 			    ggio->gctl_length)) {
 				(void)hast_activemap_flush(res);
+			} else {
+				mtx_unlock(&res->hr_amp_lock);
 			}
-			mtx_unlock(&res->hr_amp_lock);
 			if (hio->hio_replication == HAST_REPLICATION_MEMSYNC)
 				(void)refcnt_release(&hio->hio_countdown);
 		}
@@ -1918,8 +1928,9 @@ ggate_send_thread(void *arg)
 			    ggio->gctl_offset, ggio->gctl_length)) {
 				res->hr_stat_activemap_update++;
 				(void)hast_activemap_flush(res);
+			} else {
+				mtx_unlock(&res->hr_amp_lock);
 			}
-			mtx_unlock(&res->hr_amp_lock);
 		}
 		if (ggio->gctl_cmd == BIO_WRITE) {
 			/*
@@ -2015,8 +2026,11 @@ sync_thread(void *arg __unused)
 			 */
 			if (activemap_extent_complete(res->hr_amp, syncext))
 				(void)hast_activemap_flush(res);
+			else
+				mtx_unlock(&res->hr_amp_lock);
+		} else {
+			mtx_unlock(&res->hr_amp_lock);
 		}
-		mtx_unlock(&res->hr_amp_lock);
 		if (dorewind) {
 			dorewind = false;
 			if (offset == -1)
-- 
cgit v1.1