summaryrefslogtreecommitdiffstats
path: root/sys/cddl
diff options
context:
space:
mode:
Diffstat (limited to 'sys/cddl')
-rw-r--r--sys/cddl/compat/opensolaris/sys/dkio.h2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c5
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c19
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h3
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h51
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h1
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h47
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h10
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c541
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c9
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c56
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c18
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c7
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c36
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c4
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c107
17 files changed, 867 insertions, 51 deletions
diff --git a/sys/cddl/compat/opensolaris/sys/dkio.h b/sys/cddl/compat/opensolaris/sys/dkio.h
index a365c33..50cafab 100644
--- a/sys/cddl/compat/opensolaris/sys/dkio.h
+++ b/sys/cddl/compat/opensolaris/sys/dkio.h
@@ -75,6 +75,8 @@ extern "C" {
*/
#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */
+#define DKIOCTRIM (DKIOC|35) /* TRIM a block */
+
struct dk_callback {
void (*dkc_callback)(void *dkc_cookie, int error);
void *dkc_cookie;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
index 5fc37e2..659d18e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
@@ -393,7 +393,8 @@ void
dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
{
ASSERT(dsl_pool_sync_context(dp));
- zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
+ zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp),
+ pio->io_flags));
}
int
@@ -1387,7 +1388,7 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
}
zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
- dmu_tx_get_txg(tx), bp, 0));
+ dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0));
dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
-bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
-BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index 7ee082a..e08b312 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -62,6 +62,7 @@
#include <sys/dsl_scan.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
+#include <sys/trim_map.h>
#ifdef _KERNEL
#include <sys/callb.h>
@@ -1001,6 +1002,11 @@ spa_activate(spa_t *spa, int mode)
spa_create_zio_taskqs(spa);
}
+ /*
+ * Start TRIM thread.
+ */
+ trim_thread_create(spa);
+
list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_config_dirty_node));
list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
@@ -1029,6 +1035,12 @@ spa_deactivate(spa_t *spa)
ASSERT(spa->spa_async_zio_root == NULL);
ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
+ /*
+ * Stop TRIM thread in case spa_unload() wasn't called directly
+ * before spa_deactivate().
+ */
+ trim_thread_destroy(spa);
+
txg_list_destroy(&spa->spa_vdev_txg_list);
list_destroy(&spa->spa_config_dirty_list);
@@ -1145,6 +1157,11 @@ spa_unload(spa_t *spa)
ASSERT(MUTEX_HELD(&spa_namespace_lock));
/*
+ * Stop TRIM thread.
+ */
+ trim_thread_destroy(spa);
+
+ /*
* Stop async tasks.
*/
spa_async_suspend(spa);
@@ -5677,7 +5694,7 @@ spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
zio_t *zio = arg;
zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
- zio->io_flags));
+ BP_GET_PSIZE(bp), zio->io_flags));
return (0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
index da31a22..ea9ae2e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
@@ -220,6 +220,9 @@ struct spa {
spa_proc_state_t spa_proc_state; /* see definition */
struct proc *spa_proc; /* "zpool-poolname" process */
uint64_t spa_did; /* if procp != p0, did of t1 */
+ kthread_t *spa_trim_thread; /* thread sending TRIM I/Os */
+ kmutex_t spa_trim_lock; /* protects spa_trim_cv */
+ kcondvar_t spa_trim_cv; /* used to notify TRIM thread */
boolean_t spa_autoreplace; /* autoreplace set in open */
int spa_vdev_locks; /* locks grabbed */
uint64_t spa_creation_version; /* version at pool creation */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h
new file mode 100644
index 0000000..ba82a05
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h
@@ -0,0 +1,51 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ */
+
+#ifndef _SYS_TRIM_MAP_H
+#define _SYS_TRIM_MAP_H
+
+#include <sys/avl.h>
+#include <sys/list.h>
+#include <sys/spa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void trim_map_create(vdev_t *vd);
+extern void trim_map_destroy(vdev_t *vd);
+extern void trim_map_free(zio_t *zio);
+extern boolean_t trim_map_write_start(zio_t *zio);
+extern void trim_map_write_done(zio_t *zio);
+
+extern void trim_thread_create(spa_t *spa);
+extern void trim_thread_destroy(spa_t *spa);
+extern void trim_thread_wakeup(spa_t *spa);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TRIM_MAP_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
index 7e34889..a8788a8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
@@ -46,6 +46,7 @@ typedef enum vdev_dtl_type {
} vdev_dtl_type_t;
extern boolean_t zfs_nocacheflush;
+extern boolean_t zfs_notrim;
extern int vdev_open(vdev_t *);
extern void vdev_open_children(vdev_t *);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
index 7eed4a1..411ec91 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
@@ -182,6 +182,7 @@ struct vdev {
uint64_t vdev_unspare; /* unspare when resilvering done */
hrtime_t vdev_last_try; /* last reopen time */
boolean_t vdev_nowritecache; /* true if flushwritecache failed */
+ boolean_t vdev_notrim; /* true if trim failed */
boolean_t vdev_checkremove; /* temporary online test */
boolean_t vdev_forcefault; /* force online fault */
boolean_t vdev_splitting; /* split or repair in progress */
@@ -197,6 +198,7 @@ struct vdev {
spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */
zio_t *vdev_probe_zio; /* root of current probe */
vdev_aux_t vdev_label_aux; /* on-disk aux state */
+ struct trim_map *vdev_trimmap;
/*
* For DTrace to work in userland (libzpool) context, these fields must
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
index 80d9336..068e68b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -31,6 +31,7 @@
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/avl.h>
+#include <sys/kstat.h>
#include <sys/fs/zfs.h>
#include <sys/zio_impl.h>
@@ -133,7 +134,8 @@ enum zio_compress {
#define ZIO_PRIORITY_RESILVER (zio_priority_table[9])
#define ZIO_PRIORITY_SCRUB (zio_priority_table[10])
#define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11])
-#define ZIO_PRIORITY_TABLE_SIZE 12
+#define ZIO_PRIORITY_TRIM (zio_priority_table[12])
+#define ZIO_PRIORITY_TABLE_SIZE 13
#define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101
@@ -360,6 +362,39 @@ typedef struct zio_link {
list_node_t zl_child_node;
} zio_link_t;
+/*
+ * Used for TRIM kstat.
+ */
+typedef struct zio_trim_stats {
+ /*
+ * Number of bytes successfully TRIMmed.
+ */
+ kstat_named_t zio_trim_bytes;
+
+ /*
+ * Number of successful TRIM requests.
+ */
+ kstat_named_t zio_trim_success;
+
+ /*
+ * Number of TRIM requests that failed because TRIM is not
+ * supported.
+ */
+ kstat_named_t zio_trim_unsupported;
+
+ /*
+ * Number of TRIM requests that failed for other reasons.
+ */
+ kstat_named_t zio_trim_failed;
+} zio_trim_stats_t;
+
+extern zio_trim_stats_t zio_trim_stats;
+
+#define ZIO_TRIM_STAT_INCR(stat, val) \
+ atomic_add_64(&zio_trim_stats.stat.value.ui64, (val));
+#define ZIO_TRIM_STAT_BUMP(stat) \
+ ZIO_TRIM_STAT_INCR(stat, 1);
+
struct zio {
/* Core information about this I/O */
zbookmark_t io_bookmark;
@@ -433,6 +468,8 @@ struct zio {
/* FreeBSD only. */
struct ostask io_task;
#endif
+ avl_node_t io_trim_node;
+ list_node_t io_trim_link;
};
extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
@@ -463,8 +500,8 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
zio_done_func_t *done, void *priv, enum zio_flag flags);
extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *priv, int priority,
- enum zio_flag flags);
+ uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv,
+ int priority, enum zio_flag flags);
extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
@@ -477,12 +514,14 @@ extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
boolean_t labels);
extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
- const blkptr_t *bp, enum zio_flag flags);
+ const blkptr_t *bp, uint64_t size, enum zio_flag flags);
extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
+extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset,
+ uint64_t size);
extern void zio_shrink(zio_t *zio, uint64_t size);
extern int zio_wait(zio_t *zio);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
index 9a58ac0..86ea979 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
@@ -60,9 +60,9 @@ enum zio_stage {
ZIO_STAGE_READY = 1 << 15, /* RWFCI */
- ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */
- ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--- */
- ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */
+ ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RWF-I */
+ ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RWF-- */
+ ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RWF-I */
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */
@@ -143,7 +143,9 @@ enum zio_stage {
#define ZIO_FREE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_STAGE_FREE_BP_INIT | \
- ZIO_STAGE_DVA_FREE)
+ ZIO_STAGE_DVA_FREE | \
+ ZIO_STAGE_VDEV_IO_START | \
+ ZIO_STAGE_VDEV_IO_ASSESS)
#define ZIO_DDT_FREE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c
new file mode 100644
index 0000000..5de5f2a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c
@@ -0,0 +1,541 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/trim_map.h>
+
+typedef struct trim_map {
+ list_t tm_head; /* List of segments sorted by txg. */
+ avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */
+ avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */
+ avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */
+ list_t tm_pending_writes; /* Writes blocked on in-flight frees. */
+ kmutex_t tm_lock;
+} trim_map_t;
+
+typedef struct trim_seg {
+ avl_node_t ts_node; /* AVL node. */
+ list_node_t ts_next; /* List element. */
+ uint64_t ts_start; /* Starting offset of this segment. */
+ uint64_t ts_end; /* Ending offset (non-inclusive). */
+ uint64_t ts_txg; /* Segment creation txg. */
+} trim_seg_t;
+
+extern boolean_t zfs_notrim;
+
+SYSCTL_DECL(_vfs_zfs);
+/* Delay TRIMs by that many TXGs. */
+static int trim_txg_limit = 64;
+TUNABLE_INT("vfs.zfs.trim_txg_limit", &trim_txg_limit);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_txg_limit, CTLFLAG_RW, &trim_txg_limit, 0,
+ "Delay TRIMs by that many TXGs.");
+
+static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);
+
+static int
+trim_map_seg_compare(const void *x1, const void *x2)
+{
+ const trim_seg_t *s1 = x1;
+ const trim_seg_t *s2 = x2;
+
+ if (s1->ts_start < s2->ts_start) {
+ if (s1->ts_end > s2->ts_start)
+ return (0);
+ return (-1);
+ }
+ if (s1->ts_start > s2->ts_start) {
+ if (s1->ts_start < s2->ts_end)
+ return (0);
+ return (1);
+ }
+ return (0);
+}
+
+static int
+trim_map_zio_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = x1;
+ const zio_t *z2 = x2;
+
+ if (z1->io_offset < z2->io_offset) {
+ if (z1->io_offset + z1->io_size > z2->io_offset)
+ return (0);
+ return (-1);
+ }
+ if (z1->io_offset > z2->io_offset) {
+ if (z1->io_offset < z2->io_offset + z2->io_size)
+ return (0);
+ return (1);
+ }
+ return (0);
+}
+
+void
+trim_map_create(vdev_t *vd)
+{
+ trim_map_t *tm;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ if (zfs_notrim)
+ return;
+
+ tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
+ mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&tm->tm_head, sizeof (trim_seg_t),
+ offsetof(trim_seg_t, ts_next));
+ list_create(&tm->tm_pending_writes, sizeof (zio_t),
+ offsetof(zio_t, io_trim_link));
+ avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
+ sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
+ avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
+ sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
+ avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
+ sizeof (zio_t), offsetof(zio_t, io_trim_node));
+ vd->vdev_trimmap = tm;
+}
+
+void
+trim_map_destroy(vdev_t *vd)
+{
+ trim_map_t *tm;
+ trim_seg_t *ts;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ if (zfs_notrim)
+ return;
+
+ tm = vd->vdev_trimmap;
+ if (tm == NULL)
+ return;
+
+ /*
+ * We may have been called before trim_map_vdev_commit_done()
+ * had a chance to run, so do it now to prune the remaining
+ * inflight frees.
+ */
+ trim_map_vdev_commit_done(vd->vdev_spa, vd);
+
+ mutex_enter(&tm->tm_lock);
+ while ((ts = list_head(&tm->tm_head)) != NULL) {
+ avl_remove(&tm->tm_queued_frees, ts);
+ list_remove(&tm->tm_head, ts);
+ kmem_free(ts, sizeof (*ts));
+ }
+ mutex_exit(&tm->tm_lock);
+
+ avl_destroy(&tm->tm_queued_frees);
+ avl_destroy(&tm->tm_inflight_frees);
+ avl_destroy(&tm->tm_inflight_writes);
+ list_destroy(&tm->tm_pending_writes);
+ list_destroy(&tm->tm_head);
+ mutex_destroy(&tm->tm_lock);
+ kmem_free(tm, sizeof (*tm));
+ vd->vdev_trimmap = NULL;
+}
+
+static void
+trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
+{
+ avl_index_t where;
+ trim_seg_t tsearch, *ts_before, *ts_after, *ts;
+ boolean_t merge_before, merge_after;
+
+ ASSERT(MUTEX_HELD(&tm->tm_lock));
+ VERIFY(start < end);
+
+ tsearch.ts_start = start;
+ tsearch.ts_end = end;
+
+ ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
+ if (ts != NULL) {
+ if (start < ts->ts_start)
+ trim_map_segment_add(tm, start, ts->ts_start, txg);
+ if (end > ts->ts_end)
+ trim_map_segment_add(tm, ts->ts_end, end, txg);
+ return;
+ }
+
+ ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
+ ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);
+
+ merge_before = (ts_before != NULL && ts_before->ts_end == start &&
+ ts_before->ts_txg == txg);
+ merge_after = (ts_after != NULL && ts_after->ts_start == end &&
+ ts_after->ts_txg == txg);
+
+ if (merge_before && merge_after) {
+ avl_remove(&tm->tm_queued_frees, ts_before);
+ list_remove(&tm->tm_head, ts_before);
+ ts_after->ts_start = ts_before->ts_start;
+ kmem_free(ts_before, sizeof (*ts_before));
+ } else if (merge_before) {
+ ts_before->ts_end = end;
+ } else if (merge_after) {
+ ts_after->ts_start = start;
+ } else {
+ ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
+ ts->ts_start = start;
+ ts->ts_end = end;
+ ts->ts_txg = txg;
+ avl_insert(&tm->tm_queued_frees, ts, where);
+ list_insert_tail(&tm->tm_head, ts);
+ }
+}
+
+static void
+trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
+ uint64_t end)
+{
+ trim_seg_t *nts;
+ boolean_t left_over, right_over;
+
+ ASSERT(MUTEX_HELD(&tm->tm_lock));
+
+ left_over = (ts->ts_start < start);
+ right_over = (ts->ts_end > end);
+
+ if (left_over && right_over) {
+ nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
+ nts->ts_start = end;
+ nts->ts_end = ts->ts_end;
+ nts->ts_txg = ts->ts_txg;
+ ts->ts_end = start;
+ avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
+ list_insert_after(&tm->tm_head, ts, nts);
+ } else if (left_over) {
+ ts->ts_end = start;
+ } else if (right_over) {
+ ts->ts_start = end;
+ } else {
+ avl_remove(&tm->tm_queued_frees, ts);
+ list_remove(&tm->tm_head, ts);
+ kmem_free(ts, sizeof (*ts));
+ }
+}
+
+static void
+trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
+{
+ zio_t zsearch, *zs;
+
+ ASSERT(MUTEX_HELD(&tm->tm_lock));
+
+ zsearch.io_offset = start;
+ zsearch.io_size = end - start;
+
+ zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL);
+ if (zs == NULL) {
+ trim_map_segment_add(tm, start, end, txg);
+ return;
+ }
+ if (start < zs->io_offset)
+ trim_map_free_locked(tm, start, zs->io_offset, txg);
+ if (zs->io_offset + zs->io_size < end)
+ trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
+}
+
+void
+trim_map_free(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ trim_map_t *tm = vd->vdev_trimmap;
+
+ if (zfs_notrim || vd->vdev_notrim || tm == NULL)
+ return;
+
+ mutex_enter(&tm->tm_lock);
+ trim_map_free_locked(tm, zio->io_offset, zio->io_offset + zio->io_size,
+ vd->vdev_spa->spa_syncing_txg);
+ mutex_exit(&tm->tm_lock);
+}
+
+boolean_t
+trim_map_write_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ trim_map_t *tm = vd->vdev_trimmap;
+ trim_seg_t tsearch, *ts;
+ boolean_t left_over, right_over;
+ uint64_t start, end;
+
+ if (zfs_notrim || vd->vdev_notrim || tm == NULL)
+ return (B_TRUE);
+
+ start = zio->io_offset;
+ end = start + zio->io_size;
+ tsearch.ts_start = start;
+ tsearch.ts_end = end;
+
+ mutex_enter(&tm->tm_lock);
+
+ /*
+ * Checking for colliding in-flight frees.
+ */
+ ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
+ if (ts != NULL) {
+ list_insert_tail(&tm->tm_pending_writes, zio);
+ mutex_exit(&tm->tm_lock);
+ return (B_FALSE);
+ }
+
+ ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
+ if (ts != NULL) {
+ /*
+ * Loop until all overlapping segments are removed.
+ */
+ do {
+ trim_map_segment_remove(tm, ts, start, end);
+ ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
+ } while (ts != NULL);
+ }
+ avl_add(&tm->tm_inflight_writes, zio);
+
+ mutex_exit(&tm->tm_lock);
+
+ return (B_TRUE);
+}
+
+void
+trim_map_write_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ trim_map_t *tm = vd->vdev_trimmap;
+
+ /*
+ * Don't check for vdev_notrim, since the write could have
+ * started before vdev_notrim was set.
+ */
+ if (zfs_notrim || tm == NULL)
+ return;
+
+ mutex_enter(&tm->tm_lock);
+ /*
+ * Don't fail if the write isn't in the tree, since the write
+ * could have started after vdev_notrim was set.
+ */
+ if (zio->io_trim_node.avl_child[0] ||
+ zio->io_trim_node.avl_child[1] ||
+ AVL_XPARENT(&zio->io_trim_node) ||
+ tm->tm_inflight_writes.avl_root == &zio->io_trim_node)
+ avl_remove(&tm->tm_inflight_writes, zio);
+ mutex_exit(&tm->tm_lock);
+}
+
+/*
+ * Return the oldest segment (the one with the lowest txg) or false if
+ * the list is empty or the first element's txg is greater than txg given
+ * as function argument.
+ */
+static trim_seg_t *
+trim_map_first(trim_map_t *tm, uint64_t txg)
+{
+ trim_seg_t *ts;
+
+ ASSERT(MUTEX_HELD(&tm->tm_lock));
+
+ ts = list_head(&tm->tm_head);
+ if (ts != NULL && ts->ts_txg <= txg)
+ return (ts);
+ return (NULL);
+}
+
+static void
+trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
+{
+ trim_map_t *tm = vd->vdev_trimmap;
+ trim_seg_t *ts;
+ uint64_t start, size, txglimit;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ if (tm == NULL)
+ return;
+
+ txglimit = MIN(spa->spa_syncing_txg, spa_freeze_txg(spa)) -
+ trim_txg_limit;
+
+ mutex_enter(&tm->tm_lock);
+ /*
+ * Loop until we send all frees up to the txglimit.
+ */
+ while ((ts = trim_map_first(tm, txglimit)) != NULL) {
+ list_remove(&tm->tm_head, ts);
+ avl_remove(&tm->tm_queued_frees, ts);
+ avl_add(&tm->tm_inflight_frees, ts);
+ zio_nowait(zio_trim(zio, spa, vd, ts->ts_start,
+ ts->ts_end - ts->ts_start));
+ }
+ mutex_exit(&tm->tm_lock);
+}
+
+static void
+trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
+{
+ trim_map_t *tm = vd->vdev_trimmap;
+ trim_seg_t *ts;
+ list_t pending_writes;
+ zio_t *zio;
+ uint64_t start, size;
+ void *cookie;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ if (tm == NULL)
+ return;
+
+ mutex_enter(&tm->tm_lock);
+ if (!avl_is_empty(&tm->tm_inflight_frees)) {
+ cookie = NULL;
+ while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
+ &cookie)) != NULL) {
+ kmem_free(ts, sizeof (*ts));
+ }
+ }
+ list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
+ io_trim_link));
+ list_move_tail(&pending_writes, &tm->tm_pending_writes);
+ mutex_exit(&tm->tm_lock);
+
+ while ((zio = list_remove_head(&pending_writes)) != NULL) {
+ zio_vdev_io_reissue(zio);
+ zio_execute(zio);
+ }
+ list_destroy(&pending_writes);
+}
+
+static void
+trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
+{
+ int c;
+
+ if (vd == NULL || spa->spa_syncing_txg <= trim_txg_limit)
+ return;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ trim_map_vdev_commit(spa, zio, vd);
+ } else {
+ for (c = 0; c < vd->vdev_children; c++)
+ trim_map_commit(spa, zio, vd->vdev_child[c]);
+ }
+}
+
+static void
+trim_map_commit_done(spa_t *spa, vdev_t *vd)
+{
+ int c;
+
+ if (vd == NULL)
+ return;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ trim_map_vdev_commit_done(spa, vd);
+ } else {
+ for (c = 0; c < vd->vdev_children; c++)
+ trim_map_commit_done(spa, vd->vdev_child[c]);
+ }
+}
+
+static void
+trim_thread(void *arg)
+{
+ spa_t *spa = arg;
+ zio_t *zio;
+
+ for (;;) {
+ mutex_enter(&spa->spa_trim_lock);
+ if (spa->spa_trim_thread == NULL) {
+ spa->spa_trim_thread = curthread;
+ cv_signal(&spa->spa_trim_cv);
+ mutex_exit(&spa->spa_trim_lock);
+ thread_exit();
+ }
+ cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
+ mutex_exit(&spa->spa_trim_lock);
+
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ trim_map_commit(spa, zio, spa->spa_root_vdev);
+ (void) zio_wait(zio);
+ trim_map_commit_done(spa, spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+ }
+}
+
+void
+trim_thread_create(spa_t *spa)
+{
+
+ if (zfs_notrim)
+ return;
+
+ mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
+ mutex_enter(&spa->spa_trim_lock);
+ spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
+ TS_RUN, minclsyspri);
+ mutex_exit(&spa->spa_trim_lock);
+}
+
+void
+trim_thread_destroy(spa_t *spa)
+{
+
+ if (zfs_notrim)
+ return;
+ if (spa->spa_trim_thread == NULL)
+ return;
+
+ mutex_enter(&spa->spa_trim_lock);
+ /* Setting spa_trim_thread to NULL tells the thread to stop. */
+ spa->spa_trim_thread = NULL;
+ cv_signal(&spa->spa_trim_cv);
+ /* The thread will set it back to != NULL on exit. */
+ while (spa->spa_trim_thread == NULL)
+ cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
+ spa->spa_trim_thread = NULL;
+ mutex_exit(&spa->spa_trim_lock);
+
+ cv_destroy(&spa->spa_trim_cv);
+ mutex_destroy(&spa->spa_trim_lock);
+}
+
+void
+trim_thread_wakeup(spa_t *spa)
+{
+
+ if (zfs_notrim)
+ return;
+ if (spa->spa_trim_thread == NULL)
+ return;
+
+ mutex_enter(&spa->spa_trim_lock);
+ cv_signal(&spa->spa_trim_cv);
+ mutex_exit(&spa->spa_trim_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
index 9eaf94c..f2456a2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -42,6 +42,7 @@
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
+#include <sys/trim_map.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
@@ -1195,6 +1196,11 @@ vdev_open(vdev_t *vd)
if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
return (0);
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vd->vdev_notrim = B_FALSE;
+ trim_map_create(vd);
+ }
+
for (int c = 0; c < vd->vdev_children; c++) {
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
@@ -1439,6 +1445,9 @@ vdev_close(vdev_t *vd)
vdev_cache_purge(vd);
+ if (vd->vdev_ops->vdev_op_leaf)
+ trim_map_destroy(vd);
+
/*
* We record the previous state before we close it, so that if we are
* doing a reopen(), we don't generate FMA ereports if we notice that
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
index b2aada7..d69d9cc 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
@@ -49,14 +49,17 @@ struct g_class zfs_vdev_class = {
DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
-/*
- * Don't send BIO_FLUSH.
- */
+SYSCTL_DECL(_vfs_zfs_vdev);
+/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
-SYSCTL_DECL(_vfs_zfs_vdev);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
&vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
+/* Don't send BIO_DELETE. */
+static int vdev_geom_bio_delete_disable = 0;
+TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable);
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
+ &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
static void
vdev_geom_orphan(struct g_consumer *cp)
@@ -499,8 +502,8 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
*ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
/*
- * Clear the nowritecache bit, so that on a vdev_reopen() we will
- * try again.
+ * Clear the nowritecache settings, so that on a vdev_reopen()
+ * we will try again.
*/
vd->vdev_nowritecache = B_FALSE;
@@ -546,6 +549,15 @@ vdev_geom_io_intr(struct bio *bp)
*/
vd->vdev_nowritecache = B_TRUE;
}
+ if (bp->bio_cmd == BIO_DELETE && bp->bio_error == ENOTSUP) {
+ /*
+ * If we get ENOTSUP, we know that no future
+ * attempts will ever succeed. In this case we
+ * set a persistent bit so that we don't bother
+ * with the ioctl in the future.
+ */
+ vd->vdev_notrim = B_TRUE;
+ }
if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
/*
* If provider's error is set we assume it is being
@@ -588,17 +600,21 @@ vdev_geom_io_start(zio_t *zio)
}
switch (zio->io_cmd) {
-
case DKIOCFLUSHWRITECACHE:
-
if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
break;
-
if (vd->vdev_nowritecache) {
zio->io_error = ENOTSUP;
break;
}
-
+ goto sendreq;
+ case DKIOCTRIM:
+ if (vdev_geom_bio_delete_disable)
+ break;
+ if (vd->vdev_notrim) {
+ zio->io_error = ENOTSUP;
+ break;
+ }
goto sendreq;
default:
zio->io_error = ENOTSUP;
@@ -623,11 +639,21 @@ sendreq:
bp->bio_length = zio->io_size;
break;
case ZIO_TYPE_IOCTL:
- bp->bio_cmd = BIO_FLUSH;
- bp->bio_flags |= BIO_ORDERED;
- bp->bio_data = NULL;
- bp->bio_offset = cp->provider->mediasize;
- bp->bio_length = 0;
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+ bp->bio_cmd = BIO_FLUSH;
+ bp->bio_flags |= BIO_ORDERED;
+ bp->bio_data = NULL;
+ bp->bio_offset = cp->provider->mediasize;
+ bp->bio_length = 0;
+ break;
+ case DKIOCTRIM:
+ bp->bio_cmd = BIO_DELETE;
+ bp->bio_data = NULL;
+ bp->bio_offset = zio->io_offset;
+ bp->bio_length = zio->io_size;
+ break;
+ }
break;
}
bp->bio_done = vdev_geom_io_intr;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
index e573feb..6cf9106 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
@@ -145,6 +145,7 @@
#include <sys/metaslab.h>
#include <sys/zio.h>
#include <sys/dsl_scan.h>
+#include <sys/trim_map.h>
#include <sys/fs/zfs.h>
/*
@@ -718,6 +719,16 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
}
/*
+ * TRIM the whole thing so that we start with a clean slate.
+ * It's just an optimization, so we don't care if it fails.
+ * Don't TRIM if removing so that we don't interfere with zpool
+ * disaster recovery.
+ */
+ if (!zfs_notrim && (reason == VDEV_LABEL_CREATE ||
+ reason == VDEV_LABEL_SPARE || reason == VDEV_LABEL_L2CACHE))
+ zio_wait(zio_trim(NULL, spa, vd, 0, vd->vdev_psize));
+
+ /*
* Initialize its label.
*/
vp = zio_buf_alloc(sizeof (vdev_phys_t));
@@ -1282,5 +1293,10 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard)
* to disk to ensure that all odd-label updates are committed to
* stable storage before the next transaction group begins.
*/
- return (vdev_label_sync_list(spa, 1, txg, flags));
+ if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0)
+ return (error);
+
+ trim_thread_wakeup(spa);
+
+ return (0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
index a28ca3e..33a2269 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
@@ -293,10 +293,11 @@ vdev_mirror_io_start(zio_t *zio)
c = vdev_mirror_child_select(zio);
children = (c >= 0);
} else {
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE ||
+ zio->io_type == ZIO_TYPE_FREE);
/*
- * Writes go to all children.
+ * Writes and frees go to all children.
*/
c = 0;
children = mm->mm_children;
@@ -377,6 +378,8 @@ vdev_mirror_io_done(zio_t *zio)
zio->io_error = vdev_mirror_worst_error(mm);
}
return;
+ } else if (zio->io_type == ZIO_TYPE_FREE) {
+ return;
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
index efae534..0e9fd6b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -259,7 +259,9 @@ vdev_raidz_map_free(raidz_map_t *rm)
size_t size;
for (c = 0; c < rm->rm_firstdatacol; c++) {
- zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
+ if (rm->rm_col[c].rc_data != NULL)
+ zio_buf_free(rm->rm_col[c].rc_data,
+ rm->rm_col[c].rc_size);
if (rm->rm_col[c].rc_gdata != NULL)
zio_buf_free(rm->rm_col[c].rc_gdata,
@@ -504,14 +506,20 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
ASSERT3U(rm->rm_nskip, <=, nparity);
- for (c = 0; c < rm->rm_firstdatacol; c++)
- rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
+ if (zio->io_type != ZIO_TYPE_FREE) {
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
+ rm->rm_col[c].rc_data =
+ zio_buf_alloc(rm->rm_col[c].rc_size);
+ }
- rm->rm_col[c].rc_data = zio->io_data;
+ rm->rm_col[c].rc_data = zio->io_data;
- for (c = c + 1; c < acols; c++)
- rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
- rm->rm_col[c - 1].rc_size;
+ for (c = c + 1; c < acols; c++) {
+ rm->rm_col[c].rc_data =
+ (char *)rm->rm_col[c - 1].rc_data +
+ rm->rm_col[c - 1].rc_size;
+ }
+ }
/*
* If all data stored spans all columns, there's a danger that parity
@@ -1535,6 +1543,18 @@ vdev_raidz_io_start(zio_t *zio)
ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
+ if (zio->io_type == ZIO_TYPE_FREE) {
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_data, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ }
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
if (zio->io_type == ZIO_TYPE_WRITE) {
vdev_raidz_generate_parity(rm);
@@ -1917,6 +1937,8 @@ vdev_raidz_io_done(zio_t *zio)
zio->io_error = vdev_raidz_worst_error(rm);
return;
+ } else if (zio->io_type == ZIO_TYPE_FREE) {
+ return;
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
index 64230ff..0306899 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
@@ -83,6 +83,10 @@ boolean_t zfs_nocacheflush = B_FALSE;
TUNABLE_INT("vfs.zfs.cache_flush_disable", &zfs_nocacheflush);
SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN,
&zfs_nocacheflush, 0, "Disable cache flush");
+boolean_t zfs_notrim = B_TRUE;
+TUNABLE_INT("vfs.zfs.trim_disable", &zfs_notrim);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_disable, CTLFLAG_RDTUN, &zfs_notrim, 0,
+ "Disable trim");
static kmem_cache_t *zil_lwb_cache;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index f8d3c34..be7d274 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -35,6 +35,7 @@
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
+#include <sys/trim_map.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
@@ -48,6 +49,18 @@ SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude
"Exclude metadata buffers from dumps as well");
/*
+ * See zio.h for more information about these fields.
+ */
+zio_trim_stats_t zio_trim_stats = {
+ { "zio_trim_bytes", KSTAT_DATA_UINT64 },
+ { "zio_trim_success", KSTAT_DATA_UINT64 },
+ { "zio_trim_unsupported", KSTAT_DATA_UINT64 },
+ { "zio_trim_failed", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *zio_trim_ksp;
+
+/*
* ==========================================================================
* I/O priority table
* ==========================================================================
@@ -65,6 +78,7 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
10, /* ZIO_PRIORITY_RESILVER */
20, /* ZIO_PRIORITY_SCRUB */
2, /* ZIO_PRIORITY_DDT_PREFETCH */
+ 30, /* ZIO_PRIORITY_TRIM */
};
/*
@@ -188,6 +202,16 @@ zio_init(void)
zfs_mg_alloc_failures = 8;
zio_inject_init();
+
+ zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
+ KSTAT_TYPE_NAMED,
+ sizeof(zio_trim_stats) / sizeof(kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (zio_trim_ksp != NULL) {
+ zio_trim_ksp->ks_data = &zio_trim_stats;
+ kstat_install(zio_trim_ksp);
+ }
}
void
@@ -215,6 +239,11 @@ zio_fini(void)
kmem_cache_destroy(zio_cache);
zio_inject_fini();
+
+ if (zio_trim_ksp != NULL) {
+ kstat_delete(zio_trim_ksp);
+ zio_trim_ksp = NULL;
+ }
}
/*
@@ -523,7 +552,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
{
zio_t *zio;
- ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
@@ -704,7 +733,7 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
- enum zio_flag flags)
+ uint64_t size, enum zio_flag flags)
{
zio_t *zio;
@@ -715,7 +744,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
ASSERT(spa_syncing_txg(spa) == txg);
ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);
- zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
+ zio = zio_create(pio, spa, txg, bp, NULL, size,
NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
@@ -752,15 +781,16 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
}
zio_t *
-zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
+ uint64_t size, zio_done_func_t *done, void *private, int priority,
+ enum zio_flag flags)
{
zio_t *zio;
int c;
if (vd->vdev_children == 0) {
- zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
- ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
+ zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
+ ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
zio->io_cmd = cmd;
@@ -769,7 +799,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
for (c = 0; c < vd->vdev_children; c++)
zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
- done, private, priority, flags));
+ offset, size, done, private, priority, flags));
}
return (zio);
@@ -894,11 +924,22 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
void
zio_flush(zio_t *zio, vdev_t *vd)
{
- zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
+ zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
NULL, NULL, ZIO_PRIORITY_NOW,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}
+zio_t *
+zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
+{
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size,
+ NULL, NULL, ZIO_PRIORITY_TRIM,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
+}
+
void
zio_shrink(zio_t *zio, uint64_t size)
{
@@ -1502,6 +1543,7 @@ zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
+ BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
ZIO_GANG_CHILD_FLAGS(pio)));
}
@@ -1634,7 +1676,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
}
}
- if (gn == gio->io_gang_tree)
+ if (gn == gio->io_gang_tree && gio->io_data != NULL)
ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
if (zio != pio)
@@ -2322,6 +2364,11 @@ zio_vdev_io_start(zio_t *zio)
return (vdev_mirror_ops.vdev_op_io_start(zio));
}
+ if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) {
+ trim_map_free(zio);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
/*
* We keep track of time-sensitive I/Os so that the scan thread
* can quickly react to certain workloads. In particular, we care
@@ -2346,18 +2393,22 @@ zio_vdev_io_start(zio_t *zio)
if (P2PHASE(zio->io_size, align) != 0) {
uint64_t asize = P2ROUNDUP(zio->io_size, align);
- char *abuf = zio_buf_alloc(asize);
+ char *abuf = NULL;
+ if (zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE)
+ abuf = zio_buf_alloc(asize);
ASSERT(vd == vd->vdev_top);
if (zio->io_type == ZIO_TYPE_WRITE) {
bcopy(zio->io_data, abuf, zio->io_size);
bzero(abuf + zio->io_size, asize - zio->io_size);
}
- zio_push_transform(zio, abuf, asize, asize, zio_subblock);
+ zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
+ zio_subblock);
}
ASSERT(P2PHASE(zio->io_offset, align) == 0);
ASSERT(P2PHASE(zio->io_size, align) == 0);
- VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
+ VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
/*
* If this is a repair I/O, and there's no self-healing involved --
@@ -2397,6 +2448,11 @@ zio_vdev_io_start(zio_t *zio)
}
}
+ if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE) {
+ if (!trim_map_write_start(zio))
+ return (ZIO_PIPELINE_STOP);
+ }
+
return (vd->vdev_ops->vdev_op_io_start(zio));
}
@@ -2410,9 +2466,16 @@ zio_vdev_io_done(zio_t *zio)
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
return (ZIO_PIPELINE_STOP);
- ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
- if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ zio->io_type == ZIO_TYPE_WRITE) {
+ trim_map_write_done(zio);
+ }
+
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
vdev_queue_io_done(zio);
@@ -2488,6 +2551,20 @@ zio_vdev_io_assess(zio_t *zio)
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_fault_injection(zio, EIO);
+ if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM)
+ switch (zio->io_error) {
+ case 0:
+ ZIO_TRIM_STAT_INCR(zio_trim_bytes, zio->io_size);
+ ZIO_TRIM_STAT_BUMP(zio_trim_success);
+ break;
+ case EOPNOTSUPP:
+ ZIO_TRIM_STAT_BUMP(zio_trim_unsupported);
+ break;
+ default:
+ ZIO_TRIM_STAT_BUMP(zio_trim_failed);
+ break;
+ }
+
/*
* If the I/O failed, determine whether we should attempt to retry it.
*
OpenPOWER on IntegriCloud