summaryrefslogtreecommitdiffstats
path: root/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
diff options
context:
space:
mode:
authorpjd <pjd@FreeBSD.org>2012-09-23 19:40:58 +0000
committerpjd <pjd@FreeBSD.org>2012-09-23 19:40:58 +0000
commit618888b019e4005dec19c1688531e25d6ff63c76 (patch)
tree85c57e3b0f3bd89091304524a42b74d8be588cd6 /sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
parent3d431ee1496a9253b224fdc00c445d0ab59e1eb3 (diff)
downloadFreeBSD-src-618888b019e4005dec19c1688531e25d6ff63c76.zip
FreeBSD-src-618888b019e4005dec19c1688531e25d6ff63c76.tar.gz
Add TRIM support.
The code builds a map of regions that were freed. On every write the code consults the map and removes any ranges that were freed before but are now being overwritten. Freed blocks are not TRIMmed immediately. There is a tunable that defines how many txgs we should wait before TRIMming freed blocks (64 by default). There is a low-priority thread that TRIMs ranges when the time comes. During TRIM we keep in-flight ranges on a list to detect colliding writes - we have to delay writes that collide with in-flight TRIMs in case something gets reordered and the write reaches the disk before the TRIM. We don't have to do the same for in-flight writes, as colliding writes just remove ranges to TRIM. Sponsored by: multiplay.co.uk This work includes some important fixes and some improvements obtained from the zfsonlinux project, including TRIMming entire vdevs on pool create/add/attach and on pool import for spare and cache vdevs. Obtained from: zfsonlinux Submitted by: Etienne Dechamps <etienne.dechamps@ovh.net>
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c')
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c107
1 files changed, 92 insertions, 15 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index f8d3c34..be7d274 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -35,6 +35,7 @@
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
+#include <sys/trim_map.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
@@ -48,6 +49,18 @@ SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude
"Exclude metadata buffers from dumps as well");
/*
+ * See zio.h for more information about these fields.
+ */
+zio_trim_stats_t zio_trim_stats = {
+ { "zio_trim_bytes", KSTAT_DATA_UINT64 },
+ { "zio_trim_success", KSTAT_DATA_UINT64 },
+ { "zio_trim_unsupported", KSTAT_DATA_UINT64 },
+ { "zio_trim_failed", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *zio_trim_ksp;
+
+/*
* ==========================================================================
* I/O priority table
* ==========================================================================
@@ -65,6 +78,7 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
10, /* ZIO_PRIORITY_RESILVER */
20, /* ZIO_PRIORITY_SCRUB */
2, /* ZIO_PRIORITY_DDT_PREFETCH */
+ 30, /* ZIO_PRIORITY_TRIM */
};
/*
@@ -188,6 +202,16 @@ zio_init(void)
zfs_mg_alloc_failures = 8;
zio_inject_init();
+
+ zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
+ KSTAT_TYPE_NAMED,
+ sizeof(zio_trim_stats) / sizeof(kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (zio_trim_ksp != NULL) {
+ zio_trim_ksp->ks_data = &zio_trim_stats;
+ kstat_install(zio_trim_ksp);
+ }
}
void
@@ -215,6 +239,11 @@ zio_fini(void)
kmem_cache_destroy(zio_cache);
zio_inject_fini();
+
+ if (zio_trim_ksp != NULL) {
+ kstat_delete(zio_trim_ksp);
+ zio_trim_ksp = NULL;
+ }
}
/*
@@ -523,7 +552,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
{
zio_t *zio;
- ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
@@ -704,7 +733,7 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
- enum zio_flag flags)
+ uint64_t size, enum zio_flag flags)
{
zio_t *zio;
@@ -715,7 +744,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
ASSERT(spa_syncing_txg(spa) == txg);
ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);
- zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
+ zio = zio_create(pio, spa, txg, bp, NULL, size,
NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
@@ -752,15 +781,16 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
}
zio_t *
-zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
+ uint64_t size, zio_done_func_t *done, void *private, int priority,
+ enum zio_flag flags)
{
zio_t *zio;
int c;
if (vd->vdev_children == 0) {
- zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
- ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
+ zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
+ ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
zio->io_cmd = cmd;
@@ -769,7 +799,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
for (c = 0; c < vd->vdev_children; c++)
zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
- done, private, priority, flags));
+ offset, size, done, private, priority, flags));
}
return (zio);
@@ -894,11 +924,22 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
void
zio_flush(zio_t *zio, vdev_t *vd)
{
- zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
+ zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
NULL, NULL, ZIO_PRIORITY_NOW,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}
+zio_t *
+zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
+{
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size,
+ NULL, NULL, ZIO_PRIORITY_TRIM,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
+}
+
void
zio_shrink(zio_t *zio, uint64_t size)
{
@@ -1502,6 +1543,7 @@ zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
+ BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
ZIO_GANG_CHILD_FLAGS(pio)));
}
@@ -1634,7 +1676,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
}
}
- if (gn == gio->io_gang_tree)
+ if (gn == gio->io_gang_tree && gio->io_data != NULL)
ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
if (zio != pio)
@@ -2322,6 +2364,11 @@ zio_vdev_io_start(zio_t *zio)
return (vdev_mirror_ops.vdev_op_io_start(zio));
}
+ if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) {
+ trim_map_free(zio);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
/*
* We keep track of time-sensitive I/Os so that the scan thread
* can quickly react to certain workloads. In particular, we care
@@ -2346,18 +2393,22 @@ zio_vdev_io_start(zio_t *zio)
if (P2PHASE(zio->io_size, align) != 0) {
uint64_t asize = P2ROUNDUP(zio->io_size, align);
- char *abuf = zio_buf_alloc(asize);
+ char *abuf = NULL;
+ if (zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE)
+ abuf = zio_buf_alloc(asize);
ASSERT(vd == vd->vdev_top);
if (zio->io_type == ZIO_TYPE_WRITE) {
bcopy(zio->io_data, abuf, zio->io_size);
bzero(abuf + zio->io_size, asize - zio->io_size);
}
- zio_push_transform(zio, abuf, asize, asize, zio_subblock);
+ zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
+ zio_subblock);
}
ASSERT(P2PHASE(zio->io_offset, align) == 0);
ASSERT(P2PHASE(zio->io_size, align) == 0);
- VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
+ VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
/*
* If this is a repair I/O, and there's no self-healing involved --
@@ -2397,6 +2448,11 @@ zio_vdev_io_start(zio_t *zio)
}
}
+ if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE) {
+ if (!trim_map_write_start(zio))
+ return (ZIO_PIPELINE_STOP);
+ }
+
return (vd->vdev_ops->vdev_op_io_start(zio));
}
@@ -2410,9 +2466,16 @@ zio_vdev_io_done(zio_t *zio)
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
return (ZIO_PIPELINE_STOP);
- ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
- if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ zio->io_type == ZIO_TYPE_WRITE) {
+ trim_map_write_done(zio);
+ }
+
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
vdev_queue_io_done(zio);
@@ -2488,6 +2551,20 @@ zio_vdev_io_assess(zio_t *zio)
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_fault_injection(zio, EIO);
+ if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM)
+ switch (zio->io_error) {
+ case 0:
+ ZIO_TRIM_STAT_INCR(zio_trim_bytes, zio->io_size);
+ ZIO_TRIM_STAT_BUMP(zio_trim_success);
+ break;
+ case EOPNOTSUPP:
+ ZIO_TRIM_STAT_BUMP(zio_trim_unsupported);
+ break;
+ default:
+ ZIO_TRIM_STAT_BUMP(zio_trim_failed);
+ break;
+ }
+
/*
* If the I/O failed, determine whether we should attempt to retry it.
*
OpenPOWER on IntegriCloud