Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c')
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c | 61 |
1 files changed, 48 insertions, 13 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
index d7d0dee..60fd4c3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
@@ -40,9 +40,9 @@
  *
  * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The
  * I/O scheduler determines when and in what order those operations are
- * issued. The I/O scheduler divides operations into five I/O classes
+ * issued. The I/O scheduler divides operations into six I/O classes
  * prioritized in the following order: sync read, sync write, async read,
- * async write, and scrub/resilver. Each queue defines the minimum and
+ * async write, scrub/resilver and trim. Each queue defines the minimum and
  * maximum number of concurrent operations that may be issued to the device.
  * In addition, the device has an aggregate maximum. Note that the sum of the
  * per-queue minimums must not exceed the aggregate maximum, and if the
@@ -61,7 +61,7 @@
  * done in the order specified above. No further operations are issued if the
  * aggregate maximum number of concurrent operations has been hit or if there
  * are no operations queued for an I/O class that has not hit its maximum.
- * Every time an i/o is queued or an operation completes, the I/O scheduler
+ * Every time an I/O is queued or an operation completes, the I/O scheduler
  * looks for new operations to issue.
  *
  * All I/O classes have a fixed maximum number of outstanding operations
@@ -70,7 +70,7 @@
  * transaction groups (see txg.c). Transaction groups enter the syncing state
  * periodically so the number of queued async writes will quickly burst up and
  * then bleed down to zero. Rather than servicing them as quickly as possible,
- * the I/O scheduler changes the maximum number of active async write i/os
+ * the I/O scheduler changes the maximum number of active async write I/Os
  * according to the amount of dirty data in the pool (see dsl_pool.c). Since
  * both throughput and latency typically increase with the number of
  * concurrent operations issued to physical devices, reducing the burstiness
@@ -113,14 +113,14 @@
  */
 
 /*
- * The maximum number of i/os active to each device. Ideally, this will be >=
+ * The maximum number of I/Os active to each device. Ideally, this will be >=
  * the sum of each queue's max_active. It must be at least the sum of each
  * queue's min_active.
  */
 uint32_t zfs_vdev_max_active = 1000;
 
 /*
- * Per-queue limits on the number of i/os active to each device. If the
+ * Per-queue limits on the number of I/Os active to each device. If the
  * sum of the queue's max_active is < zfs_vdev_max_active, then the
  * min_active comes into play. We will send min_active from each queue,
  * and then select from queues in the order defined by zio_priority_t.
@@ -145,6 +145,14 @@
 uint32_t zfs_vdev_async_write_min_active = 1;
 uint32_t zfs_vdev_async_write_max_active = 10;
 uint32_t zfs_vdev_scrub_min_active = 1;
 uint32_t zfs_vdev_scrub_max_active = 2;
+uint32_t zfs_vdev_trim_min_active = 1;
+/*
+ * TRIM max active is large in comparison to the other values due to the fact
+ * that TRIM IOs are coalesced at the device layer. This value is set such
+ * that a typical SSD can process the queued IOs in a single request.
+ */
+uint32_t zfs_vdev_trim_max_active = 64;
+
 /*
  * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -171,7 +179,7 @@ SYSCTL_DECL(_vfs_zfs_vdev);
 TUNABLE_INT("vfs.zfs.vdev.max_active", &zfs_vdev_max_active);
 SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RW,
     &zfs_vdev_max_active, 0,
-    "The maximum number of i/os of all types active for each device.");
+    "The maximum number of I/Os of all types active for each device.");
 
 #define ZFS_VDEV_QUEUE_KNOB_MIN(name)				\
 TUNABLE_INT("vfs.zfs.vdev." #name "_min_active",		\
@@ -199,6 +207,8 @@
 ZFS_VDEV_QUEUE_KNOB_MIN(async_write);
 ZFS_VDEV_QUEUE_KNOB_MAX(async_write);
 ZFS_VDEV_QUEUE_KNOB_MIN(scrub);
 ZFS_VDEV_QUEUE_KNOB_MAX(scrub);
+ZFS_VDEV_QUEUE_KNOB_MIN(trim);
+ZFS_VDEV_QUEUE_KNOB_MAX(trim);
 
 #undef ZFS_VDEV_QUEUE_KNOB
@@ -299,6 +309,7 @@
 static void
 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
@@ -315,6 +326,7 @@
 static void
 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
@@ -403,6 +415,8 @@ vdev_queue_class_min_active(zio_priority_t p)
 		return (zfs_vdev_async_write_min_active);
 	case ZIO_PRIORITY_SCRUB:
 		return (zfs_vdev_scrub_min_active);
+	case ZIO_PRIORITY_TRIM:
+		return (zfs_vdev_trim_min_active);
 	default:
 		panic("invalid priority %u", p);
 		return (0);
@@ -454,6 +468,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
 		    spa->spa_dsl_pool->dp_dirty_total));
 	case ZIO_PRIORITY_SCRUB:
 		return (zfs_vdev_scrub_max_active);
+	case ZIO_PRIORITY_TRIM:
+		return (zfs_vdev_trim_max_active);
 	default:
 		panic("invalid priority %u", p);
 		return (0);
@@ -470,6 +486,8 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
 	spa_t *spa = vq->vq_vdev->vdev_spa;
 	zio_priority_t p;
 
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+
 	if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
 		return (ZIO_PRIORITY_NUM_QUEUEABLE);
 
@@ -511,10 +529,11 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 	zio_t *first, *last, *aio, *dio, *mandatory, *nio;
 	uint64_t maxgap = 0;
 	uint64_t size;
-	boolean_t stretch = B_FALSE;
-	vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority];
-	avl_tree_t *t = &vqc->vqc_queued_tree;
-	enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
+	boolean_t stretch;
+	avl_tree_t *t;
+	enum zio_flag flags;
+
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
 
 	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
 		return (NULL);
@@ -552,6 +571,8 @@
 	 * Walk backwards through sufficiently contiguous I/Os
 	 * recording the last non-option I/O.
 	 */
+	flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
+	t = &vq->vq_class[zio->io_priority].vqc_queued_tree;
 	while ((dio = AVL_PREV(t, first)) != NULL &&
 	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 	    IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
@@ -591,6 +612,7 @@
 	 * non-optional I/O is close enough to make aggregation
 	 * worthwhile.
 	 */
+	stretch = B_FALSE;
 	if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
 		zio_t *nio = last;
 		while ((dio = AVL_NEXT(t, nio)) != NULL &&
@@ -731,11 +753,13 @@
 		    zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
 		    zio->io_priority != ZIO_PRIORITY_SCRUB)
 			zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
-	} else {
-		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+	} else if (zio->io_type == ZIO_TYPE_WRITE) {
 		if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
 		    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
 			zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
+	} else {
+		ASSERT(zio->io_type == ZIO_TYPE_FREE);
+		zio->io_priority = ZIO_PRIORITY_TRIM;
 	}
 
 	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
@@ -772,14 +796,25 @@ vdev_queue_io_done(zio_t *zio)
 	vq->vq_io_complete_ts = gethrtime();
 
+	if (zio->io_flags & ZIO_FLAG_QUEUE_IO_DONE) {
+		/*
+		 * Executing from a previous vdev_queue_io_done so
+		 * to avoid recursion we just unlock and return.
+		 */
+		mutex_exit(&vq->vq_lock);
+		return;
+	}
+
 	while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
 		mutex_exit(&vq->vq_lock);
+		nio->io_flags |= ZIO_FLAG_QUEUE_IO_DONE;
 		if (nio->io_done == vdev_queue_agg_io_done) {
 			zio_nowait(nio);
 		} else {
 			zio_vdev_io_reissue(nio);
			zio_execute(nio);
 		}
+		nio->io_flags &= ~ZIO_FLAG_QUEUE_IO_DONE;
 		mutex_enter(&vq->vq_lock);
 	}
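
As context for the six-class scheduler described in the header comment above, the stand-alone C sketch below illustrates the selection order that comment documents: every class is first served up to its min_active in priority order, then remaining slots are filled in priority order up to each class's max_active and the aggregate zfs_vdev_max_active. This is only an illustration, not the kernel's vdev_queue_class_to_issue() code: the limit values other than the trim, scrub and async write numbers visible in the patch are placeholders, and the real scheduler scales the async write maximum with the amount of dirty data rather than using a constant.

#include <stdio.h>

/* I/O classes in priority order, mirroring the six classes listed above. */
enum io_class {
	CLASS_SYNC_READ,
	CLASS_SYNC_WRITE,
	CLASS_ASYNC_READ,
	CLASS_ASYNC_WRITE,
	CLASS_SCRUB,
	CLASS_TRIM,
	CLASS_COUNT
};

/*
 * Per-class limits: only the async write, scrub and trim values are taken
 * from the patch; the rest are illustrative placeholders.
 */
static const unsigned min_active[CLASS_COUNT] = { 10, 10, 1, 1, 1, 1 };
static const unsigned max_active[CLASS_COUNT] = { 10, 10, 3, 10, 2, 64 };
static const unsigned aggregate_max = 1000;	/* cf. zfs_vdev_max_active */

/*
 * Pick the next class to issue from: first honor each class's min_active in
 * priority order, then fill remaining slots in priority order up to each
 * class's max_active, never exceeding the aggregate maximum.  Returns
 * CLASS_COUNT when nothing may be issued.
 */
static enum io_class
class_to_issue(const unsigned queued[], const unsigned active[],
    unsigned total_active)
{
	enum io_class c;

	if (total_active >= aggregate_max)
		return (CLASS_COUNT);

	/* Pass 1: classes with queued work still below their minimum. */
	for (c = 0; c < CLASS_COUNT; c++)
		if (queued[c] > 0 && active[c] < min_active[c])
			return (c);

	/* Pass 2: highest-priority class with queued work and headroom. */
	for (c = 0; c < CLASS_COUNT; c++)
		if (queued[c] > 0 && active[c] < max_active[c])
			return (c);

	return (CLASS_COUNT);
}

int
main(void)
{
	/* TRIMs are queued; async writes and scrub are already at their max. */
	unsigned queued[CLASS_COUNT] = { 0, 0, 0, 5, 3, 40 };
	unsigned active[CLASS_COUNT] = { 0, 0, 0, 10, 2, 0 };

	printf("next class: %d\n", class_to_issue(queued, active, 12));
	return (0);
}

With those numbers the call returns CLASS_TRIM (5): the async write and scrub queues are at their limits, while trim is still below its minimum of one.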
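
The last hunk introduces ZIO_FLAG_QUEUE_IO_DONE so that when vdev_queue_io_done() issues a queued I/O that completes synchronously, the nested completion call returns immediately instead of walking the queue again. The toy program below sketches only that marker-flag pattern under invented names (toy_io, issue(), io_done()); it is not the zio pipeline, just a minimal model of the re-entrancy guard.

#include <stdio.h>

/* Toy stand-in for the ZIO_FLAG_QUEUE_IO_DONE marker added by the patch. */
#define	FLAG_QUEUE_IO_DONE	0x1

struct toy_io {
	int		id;
	unsigned	flags;
};

static struct toy_io pending[3] = { { 1, 0 }, { 2, 0 }, { 3, 0 } };
static int next_pending = 0;

static void io_done(struct toy_io *io);

/* Issuing an I/O completes it immediately in this toy model. */
static void
issue(struct toy_io *io)
{
	printf("issue io %d\n", io->id);
	io_done(io);
}

/*
 * Completion handler: normally it drains the pending queue, issuing more
 * work.  Because issuing completes synchronously here, that would recurse
 * once per pending I/O; the marker flag makes nested calls return early,
 * so only the outermost invocation walks the queue.
 */
static void
io_done(struct toy_io *io)
{
	printf("done io %d\n", io->id);

	if (io->flags & FLAG_QUEUE_IO_DONE)
		return;	/* invoked from a parent io_done(); avoid recursion */

	while (next_pending < 3) {
		struct toy_io *nio = &pending[next_pending++];

		nio->flags |= FLAG_QUEUE_IO_DONE;
		issue(nio);
		nio->flags &= ~FLAG_QUEUE_IO_DONE;
	}
}

int
main(void)
{
	struct toy_io first = { 0, 0 };

	issue(&first);	/* drains io 1..3 without nested queue walks */
	return (0);
}

Running it, only the outermost io_done() drains the pending list; the three nested completions hit the flag check and return, which is exactly the recursion the patch is avoiding.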