author    pjd <pjd@FreeBSD.org>	2007-04-06 01:09:06 +0000
committer pjd <pjd@FreeBSD.org>	2007-04-06 01:09:06 +0000
commit    3b005d330261f33318ca1ee3fef1940237fd788b (patch)
tree      3061c8734d9ce560165e672836837a0f411a83c9 /sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
parent    3be454b8211f48e634e6587f53807d3b5013e973 (diff)
Please welcome ZFS - The last word in file systems.
ZFS file system was ported from the OpenSolaris operating system. The code is under the CDDL license. I'd like to thank all Sun developers that created this great piece of software.

Supported by:	Wheel LTD (http://www.wheel.pl/)
Supported by:	The FreeBSD Foundation (http://www.freebsdfoundation.org/)
Supported by:	Sentex (http://www.sentex.net/)
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c')
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	323
1 file changed, 323 insertions, 0 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
new file mode 100644
index 0000000..7e99c1f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
@@ -0,0 +1,323 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+
+/*
+ * These tunables are for performance analysis.
+ */
+/*
+ * zfs_vdev_max_pending is the maximum number of i/os concurrently
+ * pending to each device. zfs_vdev_min_pending is the initial number
+ * of i/os pending to each device (before it starts ramping up to
+ * max_pending).
+ */
+int zfs_vdev_max_pending = 35;
+int zfs_vdev_min_pending = 4;
+
+/* deadline = pri + (lbolt >> time_shift) */
+int zfs_vdev_time_shift = 6;
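+/*
+ * Worked example (numbers assumed for illustration, not from the original
+ * source): with zfs_vdev_time_shift = 6, an I/O stamped at lbolt = 640
+ * with io_priority = 4 gets io_deadline = 4 + (640 >> 6) = 14.  Smaller
+ * deadlines sort first in vq_deadline_tree, so lower priority values and
+ * older timestamps are serviced earlier.
+ */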
+
+/* exponential I/O issue ramp-up rate */
+int zfs_vdev_ramp_rate = 2;
+
+/*
+ * i/os will be aggregated into a single large i/o up to
+ * zfs_vdev_aggregation_limit bytes long.
+ */
+int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+
+/*
+ * Virtual device vector for disk I/O scheduling.
+ */
+int
+vdev_queue_deadline_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = x1;
+ const zio_t *z2 = x2;
+
+ if (z1->io_deadline < z2->io_deadline)
+ return (-1);
+ if (z1->io_deadline > z2->io_deadline)
+ return (1);
+
+ if (z1->io_offset < z2->io_offset)
+ return (-1);
+ if (z1->io_offset > z2->io_offset)
+ return (1);
+
+ if (z1 < z2)
+ return (-1);
+ if (z1 > z2)
+ return (1);
+
+ return (0);
+}
+
+int
+vdev_queue_offset_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = x1;
+ const zio_t *z2 = x2;
+
+ if (z1->io_offset < z2->io_offset)
+ return (-1);
+ if (z1->io_offset > z2->io_offset)
+ return (1);
+
+ if (z1 < z2)
+ return (-1);
+ if (z1 > z2)
+ return (1);
+
+ return (0);
+}
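+
+/*
+ * Note: both comparators fall back to comparing the zio pointers
+ * themselves.  This acts as a tie-breaker so that two zios with an equal
+ * deadline and offset still have a total order, which the AVL trees
+ * require since duplicate entries are not allowed.
+ */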
+
+void
+vdev_queue_init(vdev_t *vd)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+
+ mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
+ sizeof (zio_t), offsetof(struct zio, io_deadline_node));
+
+ avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
+ sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+ avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
+ sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+ avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
+ sizeof (zio_t), offsetof(struct zio, io_offset_node));
+}
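+
+/*
+ * The queue keeps four trees: vq_deadline_tree orders all queued zios by
+ * (deadline, offset), the read and write trees order them by offset so
+ * adjacent I/Os can be found for aggregation, and vq_pending_tree holds
+ * the zios that have already been issued to the device.
+ */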
+
+void
+vdev_queue_fini(vdev_t *vd)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+
+ avl_destroy(&vq->vq_deadline_tree);
+ avl_destroy(&vq->vq_read_tree);
+ avl_destroy(&vq->vq_write_tree);
+ avl_destroy(&vq->vq_pending_tree);
+
+ mutex_destroy(&vq->vq_lock);
+}
+
+static void
+vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
+{
+ avl_add(&vq->vq_deadline_tree, zio);
+ avl_add(zio->io_vdev_tree, zio);
+}
+
+static void
+vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
+{
+ avl_remove(&vq->vq_deadline_tree, zio);
+ avl_remove(zio->io_vdev_tree, zio);
+}
+
+static void
+vdev_queue_agg_io_done(zio_t *aio)
+{
+ zio_t *dio;
+ uint64_t offset = 0;
+
+ while ((dio = aio->io_delegate_list) != NULL) {
+ if (aio->io_type == ZIO_TYPE_READ)
+ bcopy((char *)aio->io_data + offset, dio->io_data,
+ dio->io_size);
+ offset += dio->io_size;
+ aio->io_delegate_list = dio->io_delegate_next;
+ dio->io_delegate_next = NULL;
+ dio->io_error = aio->io_error;
+ zio_next_stage(dio);
+ }
+ ASSERT3U(offset, ==, aio->io_size);
+
+ zio_buf_free(aio->io_data, aio->io_size);
+}
+
+#define IS_ADJACENT(io, nio) \
+ ((io)->io_offset + (io)->io_size == (nio)->io_offset)
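+
+/*
+ * Example (sizes and offsets assumed for illustration): a 0x2000-byte I/O
+ * at offset 0x10000 and a second 0x2000-byte I/O at offset 0x12000
+ * satisfy IS_ADJACENT(), so vdev_queue_io_to_issue() can link them
+ * through io_delegate_next and issue a single 0x4000-byte child I/O,
+ * as long as the combined size stays within zfs_vdev_aggregation_limit.
+ */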
+
+typedef void zio_issue_func_t(zio_t *);
+
+static zio_t *
+vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
+ zio_issue_func_t **funcp)
+{
+ zio_t *fio, *lio, *aio, *dio;
+ avl_tree_t *tree;
+ uint64_t size;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ *funcp = NULL;
+
+ if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
+ avl_numnodes(&vq->vq_deadline_tree) == 0)
+ return (NULL);
+
+ fio = lio = avl_first(&vq->vq_deadline_tree);
+
+ tree = fio->io_vdev_tree;
+ size = fio->io_size;
+
+ while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
+ size + dio->io_size <= zfs_vdev_aggregation_limit) {
+ dio->io_delegate_next = fio;
+ fio = dio;
+ size += dio->io_size;
+ }
+
+ while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
+ size + dio->io_size <= zfs_vdev_aggregation_limit) {
+ lio->io_delegate_next = dio;
+ lio = dio;
+ size += dio->io_size;
+ }
+
+ if (fio != lio) {
+ char *buf = zio_buf_alloc(size);
+ uint64_t offset = 0;
+ int nagg = 0;
+
+ ASSERT(size <= zfs_vdev_aggregation_limit);
+
+ aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
+ fio->io_offset, buf, size, fio->io_type,
+ ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_NOBOOKMARK,
+ vdev_queue_agg_io_done, NULL);
+
+ aio->io_delegate_list = fio;
+
+ for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
+ ASSERT(dio->io_type == aio->io_type);
+ ASSERT(dio->io_vdev_tree == tree);
+ if (dio->io_type == ZIO_TYPE_WRITE)
+ bcopy(dio->io_data, buf + offset, dio->io_size);
+ offset += dio->io_size;
+ vdev_queue_io_remove(vq, dio);
+ zio_vdev_io_bypass(dio);
+ nagg++;
+ }
+
+ ASSERT(offset == size);
+
+ dprintf("%5s T=%llu off=%8llx agg=%3d "
+ "old=%5llx new=%5llx\n",
+ zio_type_name[fio->io_type],
+ fio->io_deadline, fio->io_offset, nagg, fio->io_size, size);
+
+ avl_add(&vq->vq_pending_tree, aio);
+
+ *funcp = zio_nowait;
+ return (aio);
+ }
+
+ ASSERT(fio->io_vdev_tree == tree);
+ vdev_queue_io_remove(vq, fio);
+
+ avl_add(&vq->vq_pending_tree, fio);
+
+ *funcp = zio_next_stage;
+
+ return (fio);
+}
+
+zio_t *
+vdev_queue_io(zio_t *zio)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ zio_t *nio;
+ zio_issue_func_t *func;
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
+ return (zio);
+
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ zio->io_vdev_tree = &vq->vq_read_tree;
+ else
+ zio->io_vdev_tree = &vq->vq_write_tree;
+
+ mutex_enter(&vq->vq_lock);
+
+ zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
+ zio->io_priority;
+
+ vdev_queue_io_add(vq, zio);
+
+ nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func);
+
+ mutex_exit(&vq->vq_lock);
+
+ if (nio == NULL || func != zio_nowait)
+ return (nio);
+
+ func(nio);
+ return (NULL);
+}
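+
+/*
+ * Sketch of the ramp-up behaviour implied by the tunables above:
+ * vdev_queue_io() issues with a pending limit of zfs_vdev_min_pending (4),
+ * while each completion handled by vdev_queue_io_done() may issue up to
+ * zfs_vdev_ramp_rate (2) more I/Os against zfs_vdev_max_pending (35).
+ * A completion retires one pending I/O and can start two, so the number
+ * in flight can grow with each completion until it reaches the maximum.
+ */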
+
+void
+vdev_queue_io_done(zio_t *zio)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ zio_t *nio;
+ zio_issue_func_t *func;
+ int i;
+
+ mutex_enter(&vq->vq_lock);
+
+ avl_remove(&vq->vq_pending_tree, zio);
+
+ for (i = 0; i < zfs_vdev_ramp_rate; i++) {
+ nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func);
+ if (nio == NULL)
+ break;
+ mutex_exit(&vq->vq_lock);
+ if (func == zio_next_stage)
+ zio_vdev_io_reissue(nio);
+ func(nio);
+ mutex_enter(&vq->vq_lock);
+ }
+
+ mutex_exit(&vq->vq_lock);
+}