author     mav <mav@FreeBSD.org>  2015-10-03 11:35:18 +0000
committer  mav <mav@FreeBSD.org>  2015-10-03 11:35:18 +0000
commit     3f77fc2885eaae036939a434b28be8c3f8b5ee87
tree       f4daa96d6a365dec1c22cfb553fe3711918fc51c
parent     2ceef49597891c2bd9ea4a774bafbf2091be51a6
MFC r287702: 5987 zfs prefetch code needs work
Rewrite the ZFS prefetch code to detect only forward, sequential streams.

The following kstats have been added:

kstat.zfs.misc.arcstats.sync_wait_for_async
    How many sync reads have waited for an async read to complete.
    (less is better)

kstat.zfs.misc.arcstats.demand_hit_predictive_prefetch
    How many demand reads didn't have to wait for I/O because of
    predictive prefetch. (more is better)

zfetch kstats have been simplified to hits, misses, and max_streams, with
max_streams representing the times when we were not able to create a new
stream because we already have the maximum number of sequences for a file.

The vfs.zfs.zfetch.block_cap sysctl variable/loader tunable has been
replaced by vfs.zfs.zfetch.max_distance, which controls the maximum number
of bytes to prefetch per stream.

illumos/illumos-gate@cf6106c8a0d6598b045811f9650d66e07eb332af

Illumos ZFS issue:
    5987 zfs prefetch code needs work
    https://www.illumos.org/issues/5987
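Since the new counters surface through FreeBSD's kstat sysctl tree, they can be read from userland with sysctlbyname(3). A minimal sketch, assuming the kstat names quoted in the commit message above exist on the running kernel as uint64 counters:

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Print one of the new uint64 ARC kstats, e.g.
	 * kstat.zfs.misc.arcstats.sync_wait_for_async.
	 */
	static int
	print_arcstat(const char *name)
	{
		uint64_t val;
		size_t len = sizeof(val);

		if (sysctlbyname(name, &val, &len, NULL, 0) != 0) {
			perror(name);
			return (-1);
		}
		printf("%s = %ju\n", name, (uintmax_t)val);
		return (0);
	}

	int
	main(void)
	{
		/* Names taken from the commit message above. */
		print_arcstat("kstat.zfs.misc.arcstats.sync_wait_for_async");
		print_arcstat(
		    "kstat.zfs.misc.arcstats.demand_hit_predictive_prefetch");
		return (0);
	}

The same values are also visible interactively via the sysctl(8) utility, so the C program is only needed when sampling the counters programmatically.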
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c')
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c | 78
1 file changed, 69 insertions(+), 9 deletions(-)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index e3afc56..1b537f1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -213,7 +213,7 @@ static int arc_min_prefetch_lifespan;
int arc_lotsfree_percent = 10;
static int arc_dead;
-extern int zfs_prefetch_disable;
+extern boolean_t zfs_prefetch_disable;
/*
* The arc has filled available memory and has now warmed up.
@@ -585,6 +585,8 @@ typedef struct arc_stats {
kstat_named_t arcstat_meta_limit;
kstat_named_t arcstat_meta_max;
kstat_named_t arcstat_meta_min;
+ kstat_named_t arcstat_sync_wait_for_async;
+ kstat_named_t arcstat_demand_hit_predictive_prefetch;
} arc_stats_t;
static arc_stats_t arc_stats = {
@@ -683,7 +685,9 @@ static arc_stats_t arc_stats = {
{ "arc_meta_used", KSTAT_DATA_UINT64 },
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 },
- { "arc_meta_min", KSTAT_DATA_UINT64 }
+ { "arc_meta_min", KSTAT_DATA_UINT64 },
+ { "sync_wait_for_async", KSTAT_DATA_UINT64 },
+ { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
@@ -4253,6 +4257,36 @@ top:
if (HDR_IO_IN_PROGRESS(hdr)) {
+ if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
+ priority == ZIO_PRIORITY_SYNC_READ) {
+ /*
+ * This sync read must wait for an
+ * in-progress async read (e.g. a predictive
+ * prefetch). Async reads are queued
+ * separately at the vdev_queue layer, so
+ * this is a form of priority inversion.
+ * Ideally, we would "inherit" the demand
+ * i/o's priority by moving the i/o from
+ * the async queue to the synchronous queue,
+ * but there is currently no mechanism to do
+ * so. Track this so that we can evaluate
+ * the magnitude of this potential performance
+ * problem.
+ *
+ * Note that if the prefetch i/o is already
+ * active (has been issued to the device),
+ * the prefetch improved performance, because
+ * we issued it sooner than we would have
+ * without the prefetch.
+ */
+ DTRACE_PROBE1(arc__sync__wait__for__async,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_sync_wait_for_async);
+ }
+ if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+ hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
+ }
+
if (*arc_flags & ARC_FLAG_WAIT) {
cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
mutex_exit(hash_lock);
@@ -4261,7 +4295,7 @@ top:
ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
if (done) {
- arc_callback_t *acb = NULL;
+ arc_callback_t *acb = NULL;
acb = kmem_zalloc(sizeof (arc_callback_t),
KM_SLEEP);
@@ -4286,6 +4320,19 @@ top:
hdr->b_l1hdr.b_state == arc_mfu);
if (done) {
+ if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+ /*
+ * This is a demand read which does not have to
+ * wait for i/o because we did a predictive
+ * prefetch i/o for it, which has completed.
+ */
+ DTRACE_PROBE1(
+ arc__demand__hit__predictive__prefetch,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(
+ arcstat_demand_hit_predictive_prefetch);
+ hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
+ }
add_reference(hdr, hash_lock, private);
/*
* If this block is already in use, create a new
@@ -4348,12 +4395,16 @@ top:
goto top; /* restart the IO request */
}
- /* if this is a prefetch, we don't have a reference */
- if (*arc_flags & ARC_FLAG_PREFETCH) {
+ /*
+ * If there is a callback, we pass our reference to
+ * it; otherwise we remove our reference.
+ */
+ if (done == NULL) {
(void) remove_reference(hdr, hash_lock,
private);
- hdr->b_flags |= ARC_FLAG_PREFETCH;
}
+ if (*arc_flags & ARC_FLAG_PREFETCH)
+ hdr->b_flags |= ARC_FLAG_PREFETCH;
if (*arc_flags & ARC_FLAG_L2CACHE)
hdr->b_flags |= ARC_FLAG_L2CACHE;
if (*arc_flags & ARC_FLAG_L2COMPRESS)
@@ -4376,11 +4427,13 @@ top:
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- /* if this is a prefetch, we don't have a reference */
+ /*
+ * If there is a callback, we pass a reference to it.
+ */
+ if (done != NULL)
+ add_reference(hdr, hash_lock, private);
if (*arc_flags & ARC_FLAG_PREFETCH)
hdr->b_flags |= ARC_FLAG_PREFETCH;
- else
- add_reference(hdr, hash_lock, private);
if (*arc_flags & ARC_FLAG_L2CACHE)
hdr->b_flags |= ARC_FLAG_L2CACHE;
if (*arc_flags & ARC_FLAG_L2COMPRESS)
@@ -4398,6 +4451,8 @@ top:
arc_access(hdr, hash_lock);
}
+ if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
+ hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH;
ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -4440,6 +4495,11 @@ top:
curthread->td_ru.ru_inblock++;
#endif
+ if (priority == ZIO_PRIORITY_ASYNC_READ)
+ hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ;
+ else
+ hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ;
+
if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
/*
* Read from the L2ARC if the following are true:
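For readers following the flag logic above: the last hunk records the priority a read was issued at, and the HDR_IO_IN_PROGRESS hunk lets a later sync waiter detect the inversion and bump a counter. That pattern can be sketched in isolation. This is an illustrative userland analogue under assumed names, not the kernel code; the flag, struct, and function names below are hypothetical stand-ins for the ARC flags and stats in the patch:

	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical stand-in for ARC_FLAG_PRIO_ASYNC_READ. */
	#define	FLAG_PRIO_ASYNC_READ	(1u << 0)

	enum prio { PRIO_SYNC_READ, PRIO_ASYNC_READ };

	struct hdr {
		uint32_t flags;		/* like arc_buf_hdr_t b_flags */
	};

	/* Like arcstat_sync_wait_for_async in the patch. */
	static uint64_t sync_wait_for_async;

	/*
	 * At issue time, remember which vdev queue the read went to
	 * (mirrors the flag set/clear in the patch's final hunk).
	 */
	static void
	issue_read(struct hdr *h, enum prio p)
	{
		if (p == PRIO_ASYNC_READ)
			h->flags |= FLAG_PRIO_ASYNC_READ;
		else
			h->flags &= ~FLAG_PRIO_ASYNC_READ;
	}

	/*
	 * A sync reader finding the I/O already in progress: if the
	 * in-flight read was queued async, that is the priority
	 * inversion the patch counts.
	 */
	static void
	wait_for_in_progress(struct hdr *h, enum prio p)
	{
		if ((h->flags & FLAG_PRIO_ASYNC_READ) &&
		    p == PRIO_SYNC_READ)
			sync_wait_for_async++;
	}

	int
	main(void)
	{
		struct hdr h = { 0 };

		issue_read(&h, PRIO_ASYNC_READ);	  /* prefetch */
		wait_for_in_progress(&h, PRIO_SYNC_READ); /* demand read */
		printf("sync_wait_for_async = %ju\n",
		    (uintmax_t)sync_wait_for_async);	  /* prints 1 */
		return (0);
	}

The patch uses a flag plus a counter rather than re-queueing the I/O because, as its added comment notes, there is currently no mechanism to move an issued I/O from the async vdev queue to the synchronous one.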