From 6eeac1fd52e44e90b32d7ba537b3d3def3e2e558 Mon Sep 17 00:00:00 2001 From: mav Date: Sat, 3 Oct 2015 07:32:34 +0000 Subject: MFC r286579: 5313 Allow I/Os to be aggregated across ZIO priority classes Reviewed by: Andriy Gapon Reviewed by: Will Andrews Reviewed by: Matt Ahrens Reviewed by: George Wilson Approved by: Robert Mustacchi Author: Justin T. Gibbs illumos/illumos-gate@fe319232d24f4ae183730a5a24a09423d8ab4429 --- .../opensolaris/uts/common/fs/zfs/sys/vdev_impl.h | 2 + .../opensolaris/uts/common/fs/zfs/sys/zio.h | 1 + .../opensolaris/uts/common/fs/zfs/vdev_queue.c | 68 +++++++++++++++------- 3 files changed, 51 insertions(+), 20 deletions(-) (limited to 'sys/cddl/contrib') diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h index 6934245..a31d04c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -113,6 +113,8 @@ struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; avl_tree_t vq_active_tree; + avl_tree_t vq_read_offset_tree; + avl_tree_t vq_write_offset_tree; uint64_t vq_last_offset; hrtime_t vq_io_complete_ts; /* time last i/o completed */ kmutex_t vq_lock; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index ca80dd1..fea7db5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -461,6 +461,7 @@ struct zio { uint64_t io_offset; hrtime_t io_timestamp; avl_node_t io_queue_node; + avl_node_t io_offset_node; /* Internal pipeline state */ enum zio_flag io_flags; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c index 1dba319..245f360 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -302,6 +302,22 @@ vdev_queue_offset_compare(const void *x1, const void *x2) return (0); } +static inline avl_tree_t * +vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) +{ + return (&vq->vq_class[p].vqc_queued_tree); +} + +static inline avl_tree_t * +vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) +{ + ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE); + if (t == ZIO_TYPE_READ) + return (&vq->vq_read_offset_tree); + else + return (&vq->vq_write_offset_tree); +} + int vdev_queue_timestamp_compare(const void *x1, const void *x2) { @@ -336,19 +352,27 @@ vdev_queue_init(vdev_t *vd) avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_queue_node)); + avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + int (*compfn) (const void *, const void *); + /* - * The synchronous i/o queues are FIFO rather than LBA ordered. - * This provides more consistent latency for these i/os, and - * they tend to not be tightly clustered anyway so there is - * little to no throughput loss. + * The synchronous i/o queues are dispatched in FIFO rather + * than LBA order. This provides more consistent latency for + * these i/os. */ - boolean_t fifo = (p == ZIO_PRIORITY_SYNC_READ || - p == ZIO_PRIORITY_SYNC_WRITE); - avl_create(&vq->vq_class[p].vqc_queued_tree, - fifo ? vdev_queue_timestamp_compare : - vdev_queue_offset_compare, + if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE) + compfn = vdev_queue_timestamp_compare; + else + compfn = vdev_queue_offset_compare; + + avl_create(vdev_queue_class_tree(vq, p), compfn, sizeof (zio_t), offsetof(struct zio, io_queue_node)); } @@ -361,8 +385,10 @@ vdev_queue_fini(vdev_t *vd) vdev_queue_t *vq = &vd->vdev_queue; for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) - avl_destroy(&vq->vq_class[p].vqc_queued_tree); + avl_destroy(vdev_queue_class_tree(vq, p)); avl_destroy(&vq->vq_active_tree); + avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); + avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); mutex_destroy(&vq->vq_lock); } @@ -373,7 +399,8 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) spa_t *spa = zio->io_spa; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio); + avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); + avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); #ifdef illumos mutex_enter(&spa->spa_iokstat_lock); @@ -390,7 +417,8 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) spa_t *spa = zio->io_spa; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio); + avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); #ifdef illumos mutex_enter(&spa->spa_iokstat_lock); @@ -563,7 +591,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) /* find a queue that has not reached its minimum # outstanding i/os */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 && + if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < vdev_queue_class_min_active(p)) return (p); @@ -574,7 +602,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * maximum # outstanding i/os. */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 && + if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < vdev_queue_class_max_active(spa, p)) return (p); @@ -642,7 +670,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) * recording the last non-option I/O. */ flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; - t = &vq->vq_class[zio->io_priority].vqc_queued_tree; + t = vdev_queue_type_tree(vq, zio->io_type); while ((dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit && @@ -751,7 +779,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq) zio_t *zio, *aio; zio_priority_t p; avl_index_t idx; - vdev_queue_class_t *vqc; + avl_tree_t *tree; zio_t search; again: @@ -770,13 +798,13 @@ again: * * For FIFO queues (sync), issue the i/o with the lowest timestamp. */ - vqc = &vq->vq_class[p]; + tree = vdev_queue_class_tree(vq, p); search.io_timestamp = 0; search.io_offset = vq->vq_last_offset + 1; - VERIFY3P(avl_find(&vqc->vqc_queued_tree, &search, &idx), ==, NULL); - zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER); + VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); + zio = avl_nearest(tree, idx, AVL_AFTER); if (zio == NULL) - zio = avl_first(&vqc->vqc_queued_tree); + zio = avl_first(tree); ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); -- cgit v1.1