author    mav <mav@FreeBSD.org>  2016-10-14 07:27:40 +0000
committer mav <mav@FreeBSD.org>  2016-10-14 07:27:40 +0000
commit    abee620b90ee78d42fe90dfe3b0fd4c17bc19450 (patch)
tree      8b5ca4a582f31a1393d5e009fb3ddcbf90fed2e3 /sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
parent    31639742e4524e2d518502cb2dde45f4b16a015b (diff)
MFC r305331: MFV r304155:
7090 zfs should improve allocation order and throttle allocations
illumos/illumos-gate@0f7643c7376dd69a08acbfc9d1d7d548b10c846a
https://github.com/illumos/illumos-gate/commit/0f7643c7376dd69a08acbfc9d1d7d548b10c846a
https://www.illumos.org/issues/7090

When write I/Os are issued, they are issued in block order but the ZIO pipeline
will drive them asynchronously through the allocation stage which can result in
blocks being allocated out-of-order. It would be nice to preserve as much of
the logical order as possible.

In addition, the allocations are equally scattered across all top-level VDEVs
but not all top-level VDEVs are created equally. The pipeline should be able to
detect devices that are more capable of handling allocations and should
allocate more blocks to those devices. This allows for dynamic allocation
distribution when devices are imbalanced as fuller devices will tend to be
slower than empty devices.

The change includes a new pool-wide allocation queue which would throttle and
order allocations in the ZIO pipeline. The queue would be ordered by issued
time and offset and would provide an initial amount of allocation of work to
each top-level vdev. The allocation logic utilizes a reservation system to
reserve allocations that will be performed by the allocator. Once an allocation
is successfully completed it's scheduled on a given top-level vdev. Each
top-level vdev maintains a maximum number of allocations that it can handle
(mg_alloc_queue_depth). The pool-wide reserved allocations
(top-levels * mg_alloc_queue_depth) are distributed across the top-level vdevs
metaslab groups and round robin across all eligible metaslab groups to
distribute the work. As top-levels complete their work, they receive additional
work from the pool-wide allocation queue until the allocation queue is emptied.

Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Alex Reece <alex@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <paul.dagnelie@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Sebastien Roy <sebastien.roy@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: George Wilson <george.wilson@delphix.com>
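To make the reservation scheme in the message above concrete, here is a minimal C sketch of a per-top-level allocation throttle. It is illustrative only, not the code from this commit: the type toplevel_vdev_t and the functions try_reserve_allocation() and release_allocation() are hypothetical stand-ins for the real metaslab-group accounting (mg_alloc_queue_depth) that the full commit adds outside this file.

	/*
	 * Hypothetical sketch of a per-top-level allocation reservation;
	 * names are illustrative, not the commit's own.
	 */
	#include <stdatomic.h>
	#include <stdbool.h>

	typedef struct toplevel_vdev {
		atomic_int	tv_alloc_reserved;	/* allocations in flight */
		int		tv_alloc_queue_depth;	/* max this device may handle */
	} toplevel_vdev_t;

	/* Reserve one allocation slot; fail if this top-level is saturated. */
	static bool
	try_reserve_allocation(toplevel_vdev_t *tv)
	{
		if (atomic_fetch_add(&tv->tv_alloc_reserved, 1) >=
		    tv->tv_alloc_queue_depth) {
			atomic_fetch_sub(&tv->tv_alloc_reserved, 1);
			return (false);	/* saturated: caller picks another top-level */
		}
		return (true);
	}

	/* Release the slot once the write to this top-level completes. */
	static void
	release_allocation(toplevel_vdev_t *tv)
	{
		atomic_fetch_sub(&tv->tv_alloc_reserved, 1);
	}

Pool-wide, the reserved allocations are then bounded by the number of top-level vdevs times the per-device limit, and the commit hands that budget out round-robin across eligible metaslab groups, feeding each top-level more work as it drains its queue.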
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c')
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c  26
1 file changed, 25 insertions(+), 1 deletion(-)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
index 55405b7..bec3f84 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
@@ -34,6 +34,7 @@
#include <sys/zio.h>
#include <sys/avl.h>
#include <sys/dsl_pool.h>
+#include <sys/metaslab_impl.h>
/*
* ZFS I/O Scheduler
@@ -175,6 +176,23 @@ int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;
+/*
+ * Define the queue depth percentage for each top-level. This percentage is
+ * used in conjunction with zfs_vdev_async_max_active to determine how many
+ * allocations a specific top-level vdev should handle. Once the queue depth
+ * reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100
+ * then allocator will stop allocating blocks on that top-level device.
+ * The default kernel setting is 1000% which will yield 100 allocations per
+ * device. For userland testing, the default setting is 300% which equates
+ * to 30 allocations per device.
+ */
+#ifdef _KERNEL
+int zfs_vdev_queue_depth_pct = 1000;
+#else
+int zfs_vdev_queue_depth_pct = 300;
+#endif
+
+
#ifdef __FreeBSD__
#ifdef _KERNEL
SYSCTL_DECL(_vfs_zfs_vdev);
@@ -245,6 +263,9 @@ TUNABLE_INT("vfs.zfs.vdev.write_gap_limit", &zfs_vdev_write_gap_limit);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN,
&zfs_vdev_write_gap_limit, 0,
"Acceptable gap between two writes being aggregated");
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN,
+ &zfs_vdev_queue_depth_pct, 0,
+ "Queue depth percentage for each top-level");
static int
sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS)
@@ -402,6 +423,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
avl_tree_t *qtt;
+
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
@@ -423,6 +445,7 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
avl_tree_t *qtt;
+
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
@@ -492,7 +515,8 @@ vdev_queue_agg_io_done(zio_t *aio)
{
if (aio->io_type == ZIO_TYPE_READ) {
zio_t *pio;
- while ((pio = zio_walk_parents(aio)) != NULL) {
+ zio_link_t *zl = NULL;
+ while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
bcopy((char *)aio->io_data + (pio->io_offset -
aio->io_offset), pio->io_data, pio->io_size);
}
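A note on the new tunable above: with the kernel defaults in this file (zfs_vdev_queue_depth_pct = 1000 and, elsewhere in vdev_queue.c, zfs_vdev_async_write_max_active = 10), the throttle point works out to 1000 * 10 / 100 = 100 allocations per top-level vdev, which is the "100 allocations per device" figure quoted in the comment; the 300% userland default likewise yields 30.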
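The final hunk tracks the upstream change to zio_walk_parents(), which now takes a caller-supplied zio_link_t * cursor (passed by address) instead of keeping iteration state inside the zio. A minimal usage sketch of the new pattern, assuming a hypothetical process_parent() callback:

	zio_link_t *zl = NULL;	/* cursor owned by the caller */
	zio_t *pio;

	while ((pio = zio_walk_parents(zio, &zl)) != NULL)
		process_parent(pio);	/* hypothetical per-parent work */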