diff options
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c')
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c | 639 |
1 files changed, 639 insertions, 0 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c new file mode 100644 index 0000000..5c52042 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c @@ -0,0 +1,639 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>. + * All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/trim_map.h> +#include <sys/time.h> + +/* + * Calculate the zio end, upgrading based on ashift which would be + * done by zio_vdev_io_start. + * + * This makes free range consolidation much more effective + * than it would otherwise be as well as ensuring that entire + * blocks are invalidated by writes. + */ +#define TRIM_ZIO_END(vd, offset, size) (offset + \ + P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift)) + +#define TRIM_MAP_SINC(tm, size) \ + atomic_add_64(&(tm)->tm_bytes, (size)) + +#define TRIM_MAP_SDEC(tm, size) \ + atomic_add_64(&(tm)->tm_bytes, -(size)) + +#define TRIM_MAP_QINC(tm) \ + atomic_inc_64(&(tm)->tm_pending); \ + +#define TRIM_MAP_QDEC(tm) \ + atomic_dec_64(&(tm)->tm_pending); + +typedef struct trim_map { + list_t tm_head; /* List of segments sorted by txg. */ + avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */ + avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */ + avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */ + list_t tm_pending_writes; /* Writes blocked on in-flight frees. */ + kmutex_t tm_lock; + uint64_t tm_pending; /* Count of pending TRIMs. */ + uint64_t tm_bytes; /* Total size in bytes of queued TRIMs. */ +} trim_map_t; + +typedef struct trim_seg { + avl_node_t ts_node; /* AVL node. */ + list_node_t ts_next; /* List element. */ + uint64_t ts_start; /* Starting offset of this segment. */ + uint64_t ts_end; /* Ending offset (non-inclusive). */ + uint64_t ts_txg; /* Segment creation txg. */ + hrtime_t ts_time; /* Segment creation time. */ +} trim_seg_t; + +extern boolean_t zfs_trim_enabled; + +static u_int trim_txg_delay = 32; +static u_int trim_timeout = 30; +static u_int trim_max_interval = 1; +/* Limit outstanding TRIMs to 2G (max size for a single TRIM request) */ +static uint64_t trim_vdev_max_bytes = 2147483648; +/* Limit outstanding TRIMs to 64 (max ranges for a single TRIM request) */ +static u_int trim_vdev_max_pending = 64; + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM"); + +SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay, + 0, "Delay TRIMs by up to this many TXGs"); +SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0, + "Delay TRIMs by up to this many seconds"); +SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN, + &trim_max_interval, 0, + "Maximum interval between TRIM queue processing (seconds)"); + +SYSCTL_DECL(_vfs_zfs_vdev); +SYSCTL_QUAD(_vfs_zfs_vdev, OID_AUTO, trim_max_bytes, CTLFLAG_RWTUN, + &trim_vdev_max_bytes, 0, + "Maximum pending TRIM bytes for a vdev"); +SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN, + &trim_vdev_max_pending, 0, + "Maximum pending TRIM segments for a vdev"); + +static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd); + +static int +trim_map_seg_compare(const void *x1, const void *x2) +{ + const trim_seg_t *s1 = x1; + const trim_seg_t *s2 = x2; + + if (s1->ts_start < s2->ts_start) { + if (s1->ts_end > s2->ts_start) + return (0); + return (-1); + } + if (s1->ts_start > s2->ts_start) { + if (s1->ts_start < s2->ts_end) + return (0); + return (1); + } + return (0); +} + +static int +trim_map_zio_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_offset < z2->io_offset) { + if (z1->io_offset + z1->io_size > z2->io_offset) + return (0); + return (-1); + } + if (z1->io_offset > z2->io_offset) { + if (z1->io_offset < z2->io_offset + z2->io_size) + return (0); + return (1); + } + return (0); +} + +void +trim_map_create(vdev_t *vd) +{ + trim_map_t *tm; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (!zfs_trim_enabled) + return; + + tm = kmem_zalloc(sizeof (*tm), KM_SLEEP); + mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&tm->tm_head, sizeof (trim_seg_t), + offsetof(trim_seg_t, ts_next)); + list_create(&tm->tm_pending_writes, sizeof (zio_t), + offsetof(zio_t, io_trim_link)); + avl_create(&tm->tm_queued_frees, trim_map_seg_compare, + sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); + avl_create(&tm->tm_inflight_frees, trim_map_seg_compare, + sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); + avl_create(&tm->tm_inflight_writes, trim_map_zio_compare, + sizeof (zio_t), offsetof(zio_t, io_trim_node)); + vd->vdev_trimmap = tm; +} + +void +trim_map_destroy(vdev_t *vd) +{ + trim_map_t *tm; + trim_seg_t *ts; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (!zfs_trim_enabled) + return; + + tm = vd->vdev_trimmap; + if (tm == NULL) + return; + + /* + * We may have been called before trim_map_vdev_commit_done() + * had a chance to run, so do it now to prune the remaining + * inflight frees. + */ + trim_map_vdev_commit_done(vd->vdev_spa, vd); + + mutex_enter(&tm->tm_lock); + while ((ts = list_head(&tm->tm_head)) != NULL) { + avl_remove(&tm->tm_queued_frees, ts); + list_remove(&tm->tm_head, ts); + kmem_free(ts, sizeof (*ts)); + TRIM_MAP_SDEC(tm, ts->ts_end - ts->ts_start); + TRIM_MAP_QDEC(tm); + } + mutex_exit(&tm->tm_lock); + + avl_destroy(&tm->tm_queued_frees); + avl_destroy(&tm->tm_inflight_frees); + avl_destroy(&tm->tm_inflight_writes); + list_destroy(&tm->tm_pending_writes); + list_destroy(&tm->tm_head); + mutex_destroy(&tm->tm_lock); + kmem_free(tm, sizeof (*tm)); + vd->vdev_trimmap = NULL; +} + +static void +trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) +{ + avl_index_t where; + trim_seg_t tsearch, *ts_before, *ts_after, *ts; + boolean_t merge_before, merge_after; + hrtime_t time; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + VERIFY(start < end); + + time = gethrtime(); + tsearch.ts_start = start; + tsearch.ts_end = end; + + ts = avl_find(&tm->tm_queued_frees, &tsearch, &where); + if (ts != NULL) { + if (start < ts->ts_start) + trim_map_segment_add(tm, start, ts->ts_start, txg); + if (end > ts->ts_end) + trim_map_segment_add(tm, ts->ts_end, end, txg); + return; + } + + ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE); + ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER); + + merge_before = (ts_before != NULL && ts_before->ts_end == start); + merge_after = (ts_after != NULL && ts_after->ts_start == end); + + if (merge_before && merge_after) { + TRIM_MAP_SINC(tm, ts_after->ts_start - ts_before->ts_end); + TRIM_MAP_QDEC(tm); + avl_remove(&tm->tm_queued_frees, ts_before); + list_remove(&tm->tm_head, ts_before); + ts_after->ts_start = ts_before->ts_start; + ts_after->ts_txg = txg; + ts_after->ts_time = time; + kmem_free(ts_before, sizeof (*ts_before)); + } else if (merge_before) { + TRIM_MAP_SINC(tm, end - ts_before->ts_end); + ts_before->ts_end = end; + ts_before->ts_txg = txg; + ts_before->ts_time = time; + } else if (merge_after) { + TRIM_MAP_SINC(tm, ts_after->ts_start - start); + ts_after->ts_start = start; + ts_after->ts_txg = txg; + ts_after->ts_time = time; + } else { + TRIM_MAP_SINC(tm, end - start); + TRIM_MAP_QINC(tm); + ts = kmem_alloc(sizeof (*ts), KM_SLEEP); + ts->ts_start = start; + ts->ts_end = end; + ts->ts_txg = txg; + ts->ts_time = time; + avl_insert(&tm->tm_queued_frees, ts, where); + list_insert_tail(&tm->tm_head, ts); + } +} + +static void +trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start, + uint64_t end) +{ + trim_seg_t *nts; + boolean_t left_over, right_over; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + + left_over = (ts->ts_start < start); + right_over = (ts->ts_end > end); + + TRIM_MAP_SDEC(tm, end - start); + if (left_over && right_over) { + nts = kmem_alloc(sizeof (*nts), KM_SLEEP); + nts->ts_start = end; + nts->ts_end = ts->ts_end; + nts->ts_txg = ts->ts_txg; + nts->ts_time = ts->ts_time; + ts->ts_end = start; + avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER); + list_insert_after(&tm->tm_head, ts, nts); + TRIM_MAP_QINC(tm); + } else if (left_over) { + ts->ts_end = start; + } else if (right_over) { + ts->ts_start = end; + } else { + avl_remove(&tm->tm_queued_frees, ts); + list_remove(&tm->tm_head, ts); + TRIM_MAP_QDEC(tm); + kmem_free(ts, sizeof (*ts)); + } +} + +static void +trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) +{ + zio_t zsearch, *zs; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + + zsearch.io_offset = start; + zsearch.io_size = end - start; + + zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL); + if (zs == NULL) { + trim_map_segment_add(tm, start, end, txg); + return; + } + if (start < zs->io_offset) + trim_map_free_locked(tm, start, zs->io_offset, txg); + if (zs->io_offset + zs->io_size < end) + trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg); +} + +void +trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) +{ + trim_map_t *tm = vd->vdev_trimmap; + + if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg); + mutex_exit(&tm->tm_lock); +} + +boolean_t +trim_map_write_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t tsearch, *ts; + boolean_t left_over, right_over; + uint64_t start, end; + + if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) + return (B_TRUE); + + start = zio->io_offset; + end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size); + tsearch.ts_start = start; + tsearch.ts_end = end; + + mutex_enter(&tm->tm_lock); + + /* + * Checking for colliding in-flight frees. + */ + ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL); + if (ts != NULL) { + list_insert_tail(&tm->tm_pending_writes, zio); + mutex_exit(&tm->tm_lock); + return (B_FALSE); + } + + ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); + if (ts != NULL) { + /* + * Loop until all overlapping segments are removed. + */ + do { + trim_map_segment_remove(tm, ts, start, end); + ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); + } while (ts != NULL); + } + avl_add(&tm->tm_inflight_writes, zio); + + mutex_exit(&tm->tm_lock); + + return (B_TRUE); +} + +void +trim_map_write_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + trim_map_t *tm = vd->vdev_trimmap; + + /* + * Don't check for vdev_notrim, since the write could have + * started before vdev_notrim was set. + */ + if (!zfs_trim_enabled || tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + /* + * Don't fail if the write isn't in the tree, since the write + * could have started after vdev_notrim was set. + */ + if (zio->io_trim_node.avl_child[0] || + zio->io_trim_node.avl_child[1] || + AVL_XPARENT(&zio->io_trim_node) || + tm->tm_inflight_writes.avl_root == &zio->io_trim_node) + avl_remove(&tm->tm_inflight_writes, zio); + mutex_exit(&tm->tm_lock); +} + +/* + * Return the oldest segment (the one with the lowest txg / time) or NULL if: + * 1. The list is empty + * 2. The first element's txg is greater than txgsafe + * 3. The first element's txg is not greater than the txg argument and the + * the first element's time is not greater than time argument + */ +static trim_seg_t * +trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time) +{ + trim_seg_t *ts; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + VERIFY(txgsafe >= txg); + + ts = list_head(&tm->tm_head); + if (ts != NULL && ts->ts_txg <= txgsafe && + (ts->ts_txg <= txg || ts->ts_time <= time || + tm->tm_bytes > trim_vdev_max_bytes || + tm->tm_pending > trim_vdev_max_pending)) + return (ts); + return (NULL); +} + +static void +trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd) +{ + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t *ts; + uint64_t size, offset, txgtarget, txgsafe; + hrtime_t timelimit; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (tm == NULL) + return; + + timelimit = gethrtime() - trim_timeout * NANOSEC; + if (vd->vdev_isl2cache) { + txgsafe = UINT64_MAX; + txgtarget = UINT64_MAX; + } else { + txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa)); + if (txgsafe > trim_txg_delay) + txgtarget = txgsafe - trim_txg_delay; + else + txgtarget = 0; + } + + mutex_enter(&tm->tm_lock); + /* Loop until we have sent all outstanding free's */ + while ((ts = trim_map_first(tm, txgtarget, txgsafe, timelimit)) + != NULL) { + list_remove(&tm->tm_head, ts); + avl_remove(&tm->tm_queued_frees, ts); + avl_add(&tm->tm_inflight_frees, ts); + size = ts->ts_end - ts->ts_start; + offset = ts->ts_start; + TRIM_MAP_SDEC(tm, size); + TRIM_MAP_QDEC(tm); + /* + * We drop the lock while we call zio_nowait as the IO + * scheduler can result in a different IO being run e.g. + * a write which would result in a recursive lock. + */ + mutex_exit(&tm->tm_lock); + + zio_nowait(zio_trim(zio, spa, vd, offset, size)); + + mutex_enter(&tm->tm_lock); + } + mutex_exit(&tm->tm_lock); +} + +static void +trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd) +{ + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t *ts; + list_t pending_writes; + zio_t *zio; + uint64_t start, size; + void *cookie; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + if (!avl_is_empty(&tm->tm_inflight_frees)) { + cookie = NULL; + while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees, + &cookie)) != NULL) { + kmem_free(ts, sizeof (*ts)); + } + } + list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t, + io_trim_link)); + list_move_tail(&pending_writes, &tm->tm_pending_writes); + mutex_exit(&tm->tm_lock); + + while ((zio = list_remove_head(&pending_writes)) != NULL) { + zio_vdev_io_reissue(zio); + zio_execute(zio); + } + list_destroy(&pending_writes); +} + +static void +trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd) +{ + int c; + + if (vd == NULL) + return; + + if (vd->vdev_ops->vdev_op_leaf) { + trim_map_vdev_commit(spa, zio, vd); + } else { + for (c = 0; c < vd->vdev_children; c++) + trim_map_commit(spa, zio, vd->vdev_child[c]); + } +} + +static void +trim_map_commit_done(spa_t *spa, vdev_t *vd) +{ + int c; + + if (vd == NULL) + return; + + if (vd->vdev_ops->vdev_op_leaf) { + trim_map_vdev_commit_done(spa, vd); + } else { + for (c = 0; c < vd->vdev_children; c++) + trim_map_commit_done(spa, vd->vdev_child[c]); + } +} + +static void +trim_thread(void *arg) +{ + spa_t *spa = arg; + zio_t *zio; + +#ifdef _KERNEL + (void) snprintf(curthread->td_name, sizeof(curthread->td_name), + "trim %s", spa_name(spa)); +#endif + + for (;;) { + mutex_enter(&spa->spa_trim_lock); + if (spa->spa_trim_thread == NULL) { + spa->spa_trim_thread = curthread; + cv_signal(&spa->spa_trim_cv); + mutex_exit(&spa->spa_trim_lock); + thread_exit(); + } + + (void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock, + hz * trim_max_interval); + mutex_exit(&spa->spa_trim_lock); + + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + trim_map_commit(spa, zio, spa->spa_root_vdev); + (void) zio_wait(zio); + trim_map_commit_done(spa, spa->spa_root_vdev); + spa_config_exit(spa, SCL_STATE, FTAG); + } +} + +void +trim_thread_create(spa_t *spa) +{ + + if (!zfs_trim_enabled) + return; + + mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL); + mutex_enter(&spa->spa_trim_lock); + spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0, + TS_RUN, minclsyspri); + mutex_exit(&spa->spa_trim_lock); +} + +void +trim_thread_destroy(spa_t *spa) +{ + + if (!zfs_trim_enabled) + return; + if (spa->spa_trim_thread == NULL) + return; + + mutex_enter(&spa->spa_trim_lock); + /* Setting spa_trim_thread to NULL tells the thread to stop. */ + spa->spa_trim_thread = NULL; + cv_signal(&spa->spa_trim_cv); + /* The thread will set it back to != NULL on exit. */ + while (spa->spa_trim_thread == NULL) + cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); + spa->spa_trim_thread = NULL; + mutex_exit(&spa->spa_trim_lock); + + cv_destroy(&spa->spa_trim_cv); + mutex_destroy(&spa->spa_trim_lock); +} + +void +trim_thread_wakeup(spa_t *spa) +{ + + if (!zfs_trim_enabled) + return; + if (spa->spa_trim_thread == NULL) + return; + + mutex_enter(&spa->spa_trim_lock); + cv_signal(&spa->spa_trim_cv); + mutex_exit(&spa->spa_trim_lock); +} |