Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c')
 sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c | 61 ++++++++++++-----
 1 file changed, 48 insertions(+), 13 deletions(-)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
index d7d0dee..60fd4c3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
@@ -40,9 +40,9 @@
*
* ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The
* I/O scheduler determines when and in what order those operations are
- * issued. The I/O scheduler divides operations into five I/O classes
+ * issued. The I/O scheduler divides operations into six I/O classes
* prioritized in the following order: sync read, sync write, async read,
- * async write, and scrub/resilver. Each queue defines the minimum and
+ * async write, scrub/resilver, and trim. Each queue defines the minimum and
* maximum number of concurrent operations that may be issued to the device.
* In addition, the device has an aggregate maximum. Note that the sum of the
* per-queue minimums must not exceed the aggregate maximum, and if the
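The paragraph above states an invariant: the sum of the per-queue minimums must not exceed the aggregate maximum. As a minimal sketch only, assuming the per-queue *_min_active tunables declared later in this file (the driver does not ship such a check), the constraint could be expressed as:

/*
 * Sketch only: the per-queue minimums must together fit inside the
 * aggregate maximum, otherwise the minimum guarantees cannot all be
 * honored at once.
 */
static void
vdev_queue_check_limits_sketch(void)
{
	uint32_t min_sum =
	    zfs_vdev_sync_read_min_active +
	    zfs_vdev_sync_write_min_active +
	    zfs_vdev_async_read_min_active +
	    zfs_vdev_async_write_min_active +
	    zfs_vdev_scrub_min_active +
	    zfs_vdev_trim_min_active;

	/* Sum of per-queue minimums must not exceed the aggregate maximum. */
	ASSERT3U(min_sum, <=, zfs_vdev_max_active);
}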
@@ -61,7 +61,7 @@
* done in the order specified above. No further operations are issued if the
* aggregate maximum number of concurrent operations has been hit or if there
* are no operations queued for an I/O class that has not hit its maximum.
- * Every time an i/o is queued or an operation completes, the I/O scheduler
+ * Every time an I/O is queued or an operation completes, the I/O scheduler
* looks for new operations to issue.
*
* All I/O classes have a fixed maximum number of outstanding operations
@@ -70,7 +70,7 @@
* transaction groups (see txg.c). Transaction groups enter the syncing state
* periodically so the number of queued async writes will quickly burst up and
* then bleed down to zero. Rather than servicing them as quickly as possible,
- * the I/O scheduler changes the maximum number of active async write i/os
+ * the I/O scheduler changes the maximum number of active async write I/Os
* according to the amount of dirty data in the pool (see dsl_pool.c). Since
* both throughput and latency typically increase with the number of
* concurrent operations issued to physical devices, reducing the burstiness
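The comment above refers to scaling the async write queue's max_active with the amount of dirty data instead of using a fixed value. A minimal sketch of that kind of linear interpolation, assuming the async write min/max tunables and the dirty-percent thresholds declared elsewhere in this file (the real computation lives in vdev_queue_class_max_active() together with dsl_pool.c; the helper name and the dirty_max parameter here are hypothetical):

/* Sketch only: linearly scale async write max_active with dirty data. */
static uint32_t
async_write_max_active_sketch(uint64_t dirty, uint64_t dirty_max)
{
	uint64_t lo = dirty_max *
	    zfs_vdev_async_write_active_min_dirty_percent / 100;
	uint64_t hi = dirty_max *
	    zfs_vdev_async_write_active_max_dirty_percent / 100;

	if (dirty <= lo)
		return (zfs_vdev_async_write_min_active);
	if (dirty >= hi)
		return (zfs_vdev_async_write_max_active);

	/* Interpolate between the two operating points. */
	return (zfs_vdev_async_write_min_active +
	    (dirty - lo) *
	    (zfs_vdev_async_write_max_active -
	    zfs_vdev_async_write_min_active) / (hi - lo));
}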
@@ -113,14 +113,14 @@
*/
/*
- * The maximum number of i/os active to each device. Ideally, this will be >=
+ * The maximum number of I/Os active to each device. Ideally, this will be >=
* the sum of each queue's max_active. It must be at least the sum of each
* queue's min_active.
*/
uint32_t zfs_vdev_max_active = 1000;
/*
- * Per-queue limits on the number of i/os active to each device. If the
+ * Per-queue limits on the number of I/Os active to each device. If the
* sum of the queue's max_active is < zfs_vdev_max_active, then the
* min_active comes into play. We will send min_active from each queue,
* and then select from queues in the order defined by zio_priority_t.
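A sketch of the selection order described in that comment, assuming hypothetical queued[]/active[]/max_active[] counters per class (the in-tree version is vdev_queue_class_to_issue(), which works against the vdev_queue_t's AVL trees instead):

/* Sketch only: two-pass class selection as described above. */
static zio_priority_t
vdev_queue_pick_class_sketch(const uint32_t *queued, const uint32_t *active,
    const uint32_t *max_active, uint32_t total_active)
{
	zio_priority_t p;

	if (total_active >= zfs_vdev_max_active)
		return (ZIO_PRIORITY_NUM_QUEUEABLE);	/* device saturated */

	/* Pass 1: every class with queued work is owed its minimum. */
	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
		if (queued[p] > 0 &&
		    active[p] < vdev_queue_class_min_active(p))
			return (p);
	}

	/* Pass 2: spend the remaining budget in strict priority order. */
	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
		if (queued[p] > 0 && active[p] < max_active[p])
			return (p);
	}

	return (ZIO_PRIORITY_NUM_QUEUEABLE);	/* nothing eligible to issue */
}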
@@ -145,6 +145,14 @@ uint32_t zfs_vdev_async_write_min_active = 1;
uint32_t zfs_vdev_async_write_max_active = 10;
uint32_t zfs_vdev_scrub_min_active = 1;
uint32_t zfs_vdev_scrub_max_active = 2;
+uint32_t zfs_vdev_trim_min_active = 1;
+/*
+ * TRIM max active is large in comparison to the other values because TRIM
+ * I/Os are coalesced at the device layer. This value is set such that a
+ * typical SSD can process the queued I/Os in a single request.
+ */
+uint32_t zfs_vdev_trim_max_active = 64;
+
/*
* When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -171,7 +179,7 @@ SYSCTL_DECL(_vfs_zfs_vdev);
TUNABLE_INT("vfs.zfs.vdev.max_active", &zfs_vdev_max_active);
SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RW,
&zfs_vdev_max_active, 0,
- "The maximum number of i/os of all types active for each device.");
+ "The maximum number of I/Os of all types active for each device.");
#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \
TUNABLE_INT("vfs.zfs.vdev." #name "_min_active", \
@@ -199,6 +207,8 @@ ZFS_VDEV_QUEUE_KNOB_MIN(async_write);
ZFS_VDEV_QUEUE_KNOB_MAX(async_write);
ZFS_VDEV_QUEUE_KNOB_MIN(scrub);
ZFS_VDEV_QUEUE_KNOB_MAX(scrub);
+ZFS_VDEV_QUEUE_KNOB_MIN(trim);
+ZFS_VDEV_QUEUE_KNOB_MAX(trim);
#undef ZFS_VDEV_QUEUE_KNOB
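For reference, each of the two new instantiations above expands, following the ZFS_VDEV_QUEUE_KNOB_MIN/_MAX pattern begun a few lines earlier, into a loader tunable plus a read/write sysctl, roughly as follows (the description string is paraphrased, not the macro's exact text):

/* Approximate expansion of ZFS_VDEV_QUEUE_KNOB_MIN(trim); illustrative only. */
TUNABLE_INT("vfs.zfs.vdev.trim_min_active", &zfs_vdev_trim_min_active);
SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_min_active, CTLFLAG_RW,
    &zfs_vdev_trim_min_active, 0,
    "Minimum number of trim I/Os active to each device");

This lets the new TRIM limits be adjusted at runtime through vfs.zfs.vdev.trim_min_active and vfs.zfs.vdev.trim_max_active.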
@@ -299,6 +309,7 @@ static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
@@ -315,6 +326,7 @@ static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
@@ -403,6 +415,8 @@ vdev_queue_class_min_active(zio_priority_t p)
return (zfs_vdev_async_write_min_active);
case ZIO_PRIORITY_SCRUB:
return (zfs_vdev_scrub_min_active);
+ case ZIO_PRIORITY_TRIM:
+ return (zfs_vdev_trim_min_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -454,6 +468,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
spa->spa_dsl_pool->dp_dirty_total));
case ZIO_PRIORITY_SCRUB:
return (zfs_vdev_scrub_max_active);
+ case ZIO_PRIORITY_TRIM:
+ return (zfs_vdev_trim_max_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -470,6 +486,8 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
spa_t *spa = vq->vq_vdev->vdev_spa;
zio_priority_t p;
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
return (ZIO_PRIORITY_NUM_QUEUEABLE);
@@ -511,10 +529,11 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
zio_t *first, *last, *aio, *dio, *mandatory, *nio;
uint64_t maxgap = 0;
uint64_t size;
- boolean_t stretch = B_FALSE;
- vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority];
- avl_tree_t *t = &vqc->vqc_queued_tree;
- enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
+ boolean_t stretch;
+ avl_tree_t *t;
+ enum zio_flag flags;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
return (NULL);
@@ -552,6 +571,8 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
* Walk backwards through sufficiently contiguous I/Os
* recording the last non-option I/O.
*/
+ flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
+ t = &vq->vq_class[zio->io_priority].vqc_queued_tree;
while ((dio = AVL_PREV(t, first)) != NULL &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
@@ -591,6 +612,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
* non-optional I/O is close enough to make aggregation
* worthwhile.
*/
+ stretch = B_FALSE;
if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
zio_t *nio = last;
while ((dio = AVL_NEXT(t, nio)) != NULL &&
@@ -731,11 +753,13 @@ vdev_queue_io(zio_t *zio)
zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
zio->io_priority != ZIO_PRIORITY_SCRUB)
zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
- } else {
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_FREE);
+ zio->io_priority = ZIO_PRIORITY_TRIM;
}
zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
@@ -772,14 +796,25 @@ vdev_queue_io_done(zio_t *zio)
vq->vq_io_complete_ts = gethrtime();
+ if (zio->io_flags & ZIO_FLAG_QUEUE_IO_DONE) {
+ /*
+ * We are executing from a previous vdev_queue_io_done(), so to
+ * avoid recursion we just unlock and return.
+ */
+ mutex_exit(&vq->vq_lock);
+ return;
+ }
+
while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
mutex_exit(&vq->vq_lock);
+ nio->io_flags |= ZIO_FLAG_QUEUE_IO_DONE;
if (nio->io_done == vdev_queue_agg_io_done) {
zio_nowait(nio);
} else {
zio_vdev_io_reissue(nio);
zio_execute(nio);
}
+ nio->io_flags &= ~ZIO_FLAG_QUEUE_IO_DONE;
mutex_enter(&vq->vq_lock);
}
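The new ZIO_FLAG_QUEUE_IO_DONE handling above is a recursion guard: issuing the next queued I/O from inside this completion loop can complete it synchronously and re-enter vdev_queue_io_done() on the same stack, so I/Os issued from the loop are tagged and the nested call simply drops the lock and returns. A minimal sketch of the same pattern, with hypothetical names:

/*
 * Generic form of the recursion guard used above (all names hypothetical):
 * items issued from inside the completion loop are tagged, and a nested
 * completion callback for a tagged item returns at once instead of
 * starting a second loop on the same stack.
 */
#define	ITEM_FROM_COMPLETION_LOOP	0x1

struct item {
	int	flags;
	/* ... payload ... */
};

static void issue(struct item *it);		/* may complete synchronously */
static struct item *next_ready(void);		/* NULL when nothing is queued */

static void
completion_handler(struct item *done)
{
	struct item *it;

	if (done->flags & ITEM_FROM_COMPLETION_LOOP)
		return;		/* nested call: the outer loop is still draining */

	while ((it = next_ready()) != NULL) {
		it->flags |= ITEM_FROM_COMPLETION_LOOP;
		issue(it);	/* may call completion_handler() recursively */
		it->flags &= ~ITEM_FROM_COMPLETION_LOOP;
	}
}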