author     mav <mav@FreeBSD.org>  2015-10-03 11:35:18 +0000
committer  mav <mav@FreeBSD.org>  2015-10-03 11:35:18 +0000
commit     3f77fc2885eaae036939a434b28be8c3f8b5ee87
tree       f4daa96d6a365dec1c22cfb553fe3711918fc51c
parent     2ceef49597891c2bd9ea4a774bafbf2091be51a6
MFC r287702: 5987 zfs prefetch code needs work
Rewrite the ZFS prefetch code to detect only forward, sequential streams.

The following kstats have been added:

kstat.zfs.misc.arcstats.sync_wait_for_async
    How many sync reads have waited for an async read to complete.
    (less is better)

kstat.zfs.misc.arcstats.demand_hit_predictive_prefetch
    How many demand reads didn't have to wait for I/O because of
    predictive prefetch. (more is better)

zfetch kstats have been simplified to hits, misses, and max_streams, with
max_streams representing the times when we were not able to create a new
stream because we already have the maximum number of sequences for a file.

The vfs.zfs.zfetch.block_cap sysctl variable/loader tunable has been
replaced by vfs.zfs.zfetch.max_distance, which controls the maximum number
of bytes to prefetch per stream.

illumos/illumos-gate@cf6106c8a0d6598b045811f9650d66e07eb332af

Illumos ZFS issue:
    5987 zfs prefetch code needs work
    https://www.illumos.org/issues/5987
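Since the new counters surface through FreeBSD's kstat sysctl tree, they can be read from userland with sysctlbyname(3). A minimal sketch, assuming the kstat names quoted in the commit message above exist on the running kernel as uint64 counters:

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Print one of the new uint64 ARC kstats, e.g.
	 * kstat.zfs.misc.arcstats.sync_wait_for_async.
	 */
	static int
	print_arcstat(const char *name)
	{
		uint64_t val;
		size_t len = sizeof(val);

		if (sysctlbyname(name, &val, &len, NULL, 0) != 0) {
			perror(name);
			return (-1);
		}
		printf("%s = %ju\n", name, (uintmax_t)val);
		return (0);
	}

	int
	main(void)
	{
		/* Names taken from the commit message above. */
		print_arcstat("kstat.zfs.misc.arcstats.sync_wait_for_async");
		print_arcstat(
		    "kstat.zfs.misc.arcstats.demand_hit_predictive_prefetch");
		return (0);
	}

The same values are also visible interactively via the sysctl(8) utility, so the C program is only needed when sampling the counters programmatically.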
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c')
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c | 78
1 file changed, 69 insertions(+), 9 deletions(-)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index e3afc56..1b537f1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -213,7 +213,7 @@ static int arc_min_prefetch_lifespan;
int arc_lotsfree_percent = 10;
static int arc_dead;
-extern int zfs_prefetch_disable;
+extern boolean_t zfs_prefetch_disable;
/*
* The arc has filled available memory and has now warmed up.
@@ -585,6 +585,8 @@ typedef struct arc_stats {
kstat_named_t arcstat_meta_limit;
kstat_named_t arcstat_meta_max;
kstat_named_t arcstat_meta_min;
+ kstat_named_t arcstat_sync_wait_for_async;
+ kstat_named_t arcstat_demand_hit_predictive_prefetch;
} arc_stats_t;
static arc_stats_t arc_stats = {
@@ -683,7 +685,9 @@ static arc_stats_t arc_stats = {
{ "arc_meta_used", KSTAT_DATA_UINT64 },
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 },
- { "arc_meta_min", KSTAT_DATA_UINT64 }
+ { "arc_meta_min", KSTAT_DATA_UINT64 },
+ { "sync_wait_for_async", KSTAT_DATA_UINT64 },
+ { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
@@ -4253,6 +4257,36 @@ top:
if (HDR_IO_IN_PROGRESS(hdr)) {
+ if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
+ priority == ZIO_PRIORITY_SYNC_READ) {
+ /*
+ * This sync read must wait for an
+ * in-progress async read (e.g. a predictive
+ * prefetch). Async reads are queued
+ * separately at the vdev_queue layer, so
+ * this is a form of priority inversion.
+ * Ideally, we would "inherit" the demand
+ * i/o's priority by moving the i/o from
+ * the async queue to the synchronous queue,
+ * but there is currently no mechanism to do
+ * so. Track this so that we can evaluate
+ * the magnitude of this potential performance
+ * problem.
+ *
+ * Note that if the prefetch i/o is already
+ * active (has been issued to the device),
+ * the prefetch improved performance, because
+ * we issued it sooner than we would have
+ * without the prefetch.
+ */
+ DTRACE_PROBE1(arc__sync__wait__for__async,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_sync_wait_for_async);
+ }
+ if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+ hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
+ }
+
if (*arc_flags & ARC_FLAG_WAIT) {
cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
mutex_exit(hash_lock);
@@ -4261,7 +4295,7 @@ top:
ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
if (done) {
- arc_callback_t *acb = NULL;
+ arc_callback_t *acb = NULL;
acb = kmem_zalloc(sizeof (arc_callback_t),
KM_SLEEP);
@@ -4286,6 +4320,19 @@ top:
hdr->b_l1hdr.b_state == arc_mfu);
if (done) {
+ if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+ /*
+ * This is a demand read which does not have to
+ * wait for i/o because we did a predictive
+ * prefetch i/o for it, which has completed.
+ */
+ DTRACE_PROBE1(
+ arc__demand__hit__predictive__prefetch,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(
+ arcstat_demand_hit_predictive_prefetch);
+ hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
+ }
add_reference(hdr, hash_lock, private);
/*
* If this block is already in use, create a new
@@ -4348,12 +4395,16 @@ top:
goto top; /* restart the IO request */
}
- /* if this is a prefetch, we don't have a reference */
- if (*arc_flags & ARC_FLAG_PREFETCH) {
+ /*
+ * If there is a callback, we pass our reference to
+ * it; otherwise we remove our reference.
+ */
+ if (done == NULL) {
(void) remove_reference(hdr, hash_lock,
private);
- hdr->b_flags |= ARC_FLAG_PREFETCH;
}
+ if (*arc_flags & ARC_FLAG_PREFETCH)
+ hdr->b_flags |= ARC_FLAG_PREFETCH;
if (*arc_flags & ARC_FLAG_L2CACHE)
hdr->b_flags |= ARC_FLAG_L2CACHE;
if (*arc_flags & ARC_FLAG_L2COMPRESS)
@@ -4376,11 +4427,13 @@ top:
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- /* if this is a prefetch, we don't have a reference */
+ /*
+ * If there is a callback, we pass a reference to it.
+ */
+ if (done != NULL)
+ add_reference(hdr, hash_lock, private);
if (*arc_flags & ARC_FLAG_PREFETCH)
hdr->b_flags |= ARC_FLAG_PREFETCH;
- else
- add_reference(hdr, hash_lock, private);
if (*arc_flags & ARC_FLAG_L2CACHE)
hdr->b_flags |= ARC_FLAG_L2CACHE;
if (*arc_flags & ARC_FLAG_L2COMPRESS)
@@ -4398,6 +4451,8 @@ top:
arc_access(hdr, hash_lock);
}
+ if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
+ hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH;
ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -4440,6 +4495,11 @@ top:
curthread->td_ru.ru_inblock++;
#endif
+ if (priority == ZIO_PRIORITY_ASYNC_READ)
+ hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ;
+ else
+ hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ;
+
if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
/*
* Read from the L2ARC if the following are true:
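For readers following the flag logic above: the last hunk records the priority a read was issued at, and the HDR_IO_IN_PROGRESS hunk lets a later sync waiter detect the inversion and bump a counter. That pattern can be sketched in isolation. This is an illustrative userland analogue under assumed names, not the kernel code; the flag, struct, and function names below are hypothetical stand-ins for the ARC flags and stats in the patch:

	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical stand-in for ARC_FLAG_PRIO_ASYNC_READ. */
	#define	FLAG_PRIO_ASYNC_READ	(1u << 0)

	enum prio { PRIO_SYNC_READ, PRIO_ASYNC_READ };

	struct hdr {
		uint32_t flags;		/* like arc_buf_hdr_t b_flags */
	};

	/* Like arcstat_sync_wait_for_async in the patch. */
	static uint64_t sync_wait_for_async;

	/*
	 * At issue time, remember which vdev queue the read went to
	 * (mirrors the flag set/clear in the patch's final hunk).
	 */
	static void
	issue_read(struct hdr *h, enum prio p)
	{
		if (p == PRIO_ASYNC_READ)
			h->flags |= FLAG_PRIO_ASYNC_READ;
		else
			h->flags &= ~FLAG_PRIO_ASYNC_READ;
	}

	/*
	 * A sync reader finding the I/O already in progress: if the
	 * in-flight read was queued async, that is the priority
	 * inversion the patch counts.
	 */
	static void
	wait_for_in_progress(struct hdr *h, enum prio p)
	{
		if ((h->flags & FLAG_PRIO_ASYNC_READ) &&
		    p == PRIO_SYNC_READ)
			sync_wait_for_async++;
	}

	int
	main(void)
	{
		struct hdr h = { 0 };

		issue_read(&h, PRIO_ASYNC_READ);	  /* prefetch */
		wait_for_in_progress(&h, PRIO_SYNC_READ); /* demand read */
		printf("sync_wait_for_async = %ju\n",
		    (uintmax_t)sync_wait_for_async);	  /* prints 1 */
		return (0);
	}

The patch uses a flag plus a counter rather than re-queueing the I/O because, as its added comment notes, there is currently no mechanism to move an issued I/O from the async vdev queue to the synchronous one.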