diff options
author | pjd <pjd@FreeBSD.org> | 2012-09-23 19:40:58 +0000 |
---|---|---|
committer | pjd <pjd@FreeBSD.org> | 2012-09-23 19:40:58 +0000 |
commit | 618888b019e4005dec19c1688531e25d6ff63c76 (patch) | |
tree | 85c57e3b0f3bd89091304524a42b74d8be588cd6 /sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c | |
parent | 3d431ee1496a9253b224fdc00c445d0ab59e1eb3 (diff) | |
download | FreeBSD-src-618888b019e4005dec19c1688531e25d6ff63c76.zip FreeBSD-src-618888b019e4005dec19c1688531e25d6ff63c76.tar.gz |
Add TRIM support.
The code builds a map of regions that were freed. On every write the
code consults the map and eventually removes ranges that were freed
before, but are now overwritten.
Freed blocks are not TRIMed immediately. There is a tunable that defines
how many txg we should wait with TRIMming freed blocks (64 by default).
There is a low priority thread that TRIMs ranges when the time comes.
During TRIM we keep in-flight ranges on a list to detect colliding
writes - we have to delay writes that collide with in-flight TRIMs in
case something will be reordered and write will reached the disk before
the TRIM. We don't have to do the same for in-flight writes, as
colliding writes just remove ranges to TRIM.
Sponsored by: multiplay.co.uk
This work includes some important fixes and some improvements obtained
from the zfsonlinux project, including TRIMming entire vdevs on pool
create/add/attach and on pool import for spare and cache vdevs.
Obtained from: zfsonlinux
Submitted by: Etienne Dechamps <etienne.dechamps@ovh.net>
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c')
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c | 107 |
1 files changed, 92 insertions, 15 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index f8d3c34..be7d274 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -35,6 +35,7 @@ #include <sys/dmu_objset.h> #include <sys/arc.h> #include <sys/ddt.h> +#include <sys/trim_map.h> SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); @@ -48,6 +49,18 @@ SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude "Exclude metadata buffers from dumps as well"); /* + * See zio.h for more information about these fields. + */ +zio_trim_stats_t zio_trim_stats = { + { "zio_trim_bytes", KSTAT_DATA_UINT64 }, + { "zio_trim_success", KSTAT_DATA_UINT64 }, + { "zio_trim_unsupported", KSTAT_DATA_UINT64 }, + { "zio_trim_failed", KSTAT_DATA_UINT64 }, +}; + +static kstat_t *zio_trim_ksp; + +/* * ========================================================================== * I/O priority table * ========================================================================== @@ -65,6 +78,7 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 10, /* ZIO_PRIORITY_RESILVER */ 20, /* ZIO_PRIORITY_SCRUB */ 2, /* ZIO_PRIORITY_DDT_PREFETCH */ + 30, /* ZIO_PRIORITY_TRIM */ }; /* @@ -188,6 +202,16 @@ zio_init(void) zfs_mg_alloc_failures = 8; zio_inject_init(); + + zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", + KSTAT_TYPE_NAMED, + sizeof(zio_trim_stats) / sizeof(kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (zio_trim_ksp != NULL) { + zio_trim_ksp->ks_data = &zio_trim_stats; + kstat_install(zio_trim_ksp); + } } void @@ -215,6 +239,11 @@ zio_fini(void) kmem_cache_destroy(zio_cache); zio_inject_fini(); + + if (zio_trim_ksp != NULL) { + kstat_delete(zio_trim_ksp); + zio_trim_ksp = NULL; + } } /* @@ -523,7 +552,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, { zio_t *zio; - ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); @@ -704,7 +733,7 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - enum zio_flag flags) + uint64_t size, enum zio_flag flags) { zio_t *zio; @@ -715,7 +744,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); - zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), + zio = zio_create(pio, spa, txg, bp, NULL, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); @@ -752,15 +781,16 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, } zio_t * -zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, int priority, enum zio_flag flags) +zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, + uint64_t size, zio_done_func_t *done, void *private, int priority, + enum zio_flag flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { - zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, - ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, + zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private, + ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; @@ -769,7 +799,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, - done, private, priority, flags)); + offset, size, done, private, priority, flags)); } return (zio); @@ -894,11 +924,22 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, void zio_flush(zio_t *zio, vdev_t *vd) { - zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, + zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, NULL, NULL, ZIO_PRIORITY_NOW, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } +zio_t * +zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) +{ + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size, + NULL, NULL, ZIO_PRIORITY_TRIM, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); +} + void zio_shrink(zio_t *zio, uint64_t size) { @@ -1502,6 +1543,7 @@ zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, + BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), ZIO_GANG_CHILD_FLAGS(pio))); } @@ -1634,7 +1676,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) } } - if (gn == gio->io_gang_tree) + if (gn == gio->io_gang_tree && gio->io_data != NULL) ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); if (zio != pio) @@ -2322,6 +2364,11 @@ zio_vdev_io_start(zio_t *zio) return (vdev_mirror_ops.vdev_op_io_start(zio)); } + if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) { + trim_map_free(zio); + return (ZIO_PIPELINE_CONTINUE); + } + /* * We keep track of time-sensitive I/Os so that the scan thread * can quickly react to certain workloads. In particular, we care @@ -2346,18 +2393,22 @@ zio_vdev_io_start(zio_t *zio) if (P2PHASE(zio->io_size, align) != 0) { uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio_buf_alloc(asize); + char *abuf = NULL; + if (zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE) + abuf = zio_buf_alloc(asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { bcopy(zio->io_data, abuf, zio->io_size); bzero(abuf + zio->io_size, asize - zio->io_size); } - zio_push_transform(zio, abuf, asize, asize, zio_subblock); + zio_push_transform(zio, abuf, asize, abuf ? asize : 0, + zio_subblock); } ASSERT(P2PHASE(zio->io_offset, align) == 0); ASSERT(P2PHASE(zio->io_size, align) == 0); - VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); + VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- @@ -2397,6 +2448,11 @@ zio_vdev_io_start(zio_t *zio) } } + if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE) { + if (!trim_map_write_start(zio)) + return (ZIO_PIPELINE_STOP); + } + return (vd->vdev_ops->vdev_op_io_start(zio)); } @@ -2410,9 +2466,16 @@ zio_vdev_io_done(zio_t *zio) if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); - if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + zio->io_type == ZIO_TYPE_WRITE) { + trim_map_write_done(zio); + } + + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { vdev_queue_io_done(zio); @@ -2488,6 +2551,20 @@ zio_vdev_io_assess(zio_t *zio) if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); + if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM) + switch (zio->io_error) { + case 0: + ZIO_TRIM_STAT_INCR(zio_trim_bytes, zio->io_size); + ZIO_TRIM_STAT_BUMP(zio_trim_success); + break; + case EOPNOTSUPP: + ZIO_TRIM_STAT_BUMP(zio_trim_unsupported); + break; + default: + ZIO_TRIM_STAT_BUMP(zio_trim_failed); + break; + } + /* * If the I/O failed, determine whether we should attempt to retry it. * |