summaryrefslogtreecommitdiffstats
path: root/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c')
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c134
1 files changed, 122 insertions, 12 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index be7d274..c2720dc 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -107,6 +107,31 @@ extern vmem_t *zio_alloc_arena;
extern int zfs_mg_alloc_failures;
/*
+ * The following actions directly effect the spa's sync-to-convergence logic.
+ * The values below define the sync pass when we start performing the action.
+ * Care should be taken when changing these values as they directly impact
+ * spa_sync() performance. Tuning these values may introduce subtle performance
+ * pathologies and should only be done in the context of performance analysis.
+ * These tunables will eventually be removed and replaced with #defines once
+ * enough analysis has been done to determine optimal values.
+ *
+ * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
+ * regular blocks are not deferred.
+ */
+int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
+TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
+ &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
+int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
+TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
+ &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
+int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
+TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
+ &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");
+
+/*
* An allocating zio is one that either currently has the DVA allocate
* stage set or will have it later in its lifetime.
*/
@@ -684,9 +709,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
DMU_OT_IS_VALID(zp->zp_type) &&
zp->zp_level < 32 &&
zp->zp_copies > 0 &&
- zp->zp_copies <= spa_max_replication(spa) &&
- zp->zp_dedup <= 1 &&
- zp->zp_dedup_verify <= 1);
+ zp->zp_copies <= spa_max_replication(spa));
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
@@ -714,13 +737,20 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
}
void
-zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
+ /*
+ * We must reset the io_prop to match the values that existed
+ * when the bp was first written by dmu_sync() keeping in mind
+ * that nopwrite and dedup are mutually exclusive.
+ */
+ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
+ zio->io_prop.zp_nopwrite = nopwrite;
zio->io_prop.zp_copies = copies;
zio->io_bp_override = bp;
}
@@ -742,7 +772,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
ASSERT(!BP_IS_HOLE(bp));
ASSERT(spa_syncing_txg(spa) == txg);
- ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);
+ ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
zio = zio_create(pio, spa, txg, bp, NULL, size,
NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
@@ -1020,6 +1050,19 @@ zio_write_bp_init(zio_t *zio)
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ /*
+ * If we've been overridden and nopwrite is set then
+ * set the flag accordingly to indicate that a nopwrite
+ * has already occurred.
+ */
+ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
+ ASSERT(!zp->zp_dedup);
+ zio->io_flags |= ZIO_FLAG_NOPWRITE;
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ ASSERT(!zp->zp_nopwrite);
+
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
return (ZIO_PIPELINE_CONTINUE);
@@ -1051,7 +1094,7 @@ zio_write_bp_init(zio_t *zio)
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(!BP_GET_DEDUP(bp));
- if (pass > SYNC_PASS_DONT_COMPRESS)
+ if (pass >= zfs_sync_pass_dont_compress)
compress = ZIO_COMPRESS_OFF;
/* Make sure someone doesn't change their mind on overwrites */
@@ -1080,7 +1123,7 @@ zio_write_bp_init(zio_t *zio)
* There should only be a handful of blocks after pass 1 in any case.
*/
if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
- pass > SYNC_PASS_REWRITE) {
+ pass >= zfs_sync_pass_rewrite) {
ASSERT(psize != 0);
enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
@@ -1107,6 +1150,11 @@ zio_write_bp_init(zio_t *zio)
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
}
+ if (zp->zp_nopwrite) {
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
+ }
}
return (ZIO_PIPELINE_CONTINUE);
@@ -1328,6 +1376,7 @@ zio_reexecute(zio_t *pio)
pio->io_stage = pio->io_orig_stage;
pio->io_pipeline = pio->io_orig_pipeline;
pio->io_reexecute = 0;
+ pio->io_flags |= ZIO_FLAG_REEXECUTED;
pio->io_error = 0;
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_state[w] = 0;
@@ -1804,8 +1853,9 @@ zio_write_gang_block(zio_t *pio)
zp.zp_type = DMU_OT_NONE;
zp.zp_level = 0;
zp.zp_copies = gio->io_prop.zp_copies;
- zp.zp_dedup = 0;
- zp.zp_dedup_verify = 0;
+ zp.zp_dedup = B_FALSE;
+ zp.zp_dedup_verify = B_FALSE;
+ zp.zp_nopwrite = B_FALSE;
zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
(char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
@@ -1825,6 +1875,62 @@ zio_write_gang_block(zio_t *pio)
}
/*
+ * The zio_nop_write stage in the pipeline determines if allocating
+ * a new bp is necessary. By leveraging a cryptographically secure checksum,
+ * such as SHA256, we can compare the checksums of the new data and the old
+ * to determine if allocating a new block is required. The nopwrite
+ * feature can handle writes in either syncing or open context (i.e. zil
+ * writes) and as a result is mutually exclusive with dedup.
+ */
+static int
+zio_nop_write(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ zio_prop_t *zp = &zio->io_prop;
+
+ ASSERT(BP_GET_LEVEL(bp) == 0);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ ASSERT(zp->zp_nopwrite);
+ ASSERT(!zp->zp_dedup);
+ ASSERT(zio->io_bp_override == NULL);
+ ASSERT(IO_IS_ALLOCATING(zio));
+
+ /*
+ * Check to see if the original bp and the new bp have matching
+ * characteristics (i.e. same checksum, compression algorithms, etc).
+ * If they don't then just continue with the pipeline which will
+ * allocate a new bp.
+ */
+ if (BP_IS_HOLE(bp_orig) ||
+ !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
+ BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
+ BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
+ BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
+ zp->zp_copies != BP_GET_NDVAS(bp_orig))
+ return (ZIO_PIPELINE_CONTINUE);
+
+ /*
+ * If the checksums match then reset the pipeline so that we
+ * avoid allocating a new bp and issuing any I/O.
+ */
+ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
+ ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
+ ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
+ ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
+ ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
+ ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
+ sizeof (uint64_t)) == 0);
+
+ *bp = *bp_orig;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ zio->io_flags |= ZIO_FLAG_NOPWRITE;
+ }
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
* ==========================================================================
* Dedup
* ==========================================================================
@@ -2096,7 +2202,7 @@ zio_ddt_write(zio_t *zio)
zio->io_stage = ZIO_STAGE_OPEN;
BP_ZERO(bp);
} else {
- zp->zp_dedup = 0;
+ zp->zp_dedup = B_FALSE;
}
zio->io_pipeline = ZIO_WRITE_PIPELINE;
ddt_exit(ddt);
@@ -2753,7 +2859,8 @@ zio_ready(zio_t *zio)
if (zio->io_ready) {
ASSERT(IO_IS_ALLOCATING(zio));
- ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
+ ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
+ (zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
zio->io_ready(zio);
@@ -2835,6 +2942,8 @@ zio_done(zio_t *zio)
ASSERT(BP_COUNT_GANG(bp) == 0 ||
(BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
}
+ if (zio->io_flags & ZIO_FLAG_NOPWRITE)
+ VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
}
/*
@@ -2944,7 +3053,7 @@ zio_done(zio_t *zio)
if ((zio->io_error || zio->io_reexecute) &&
IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
- !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
+ !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
zio_dva_unallocate(zio, zio->io_gang_tree, bp);
zio_gang_tree_free(&zio->io_gang_tree);
@@ -3088,6 +3197,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_issue_async,
zio_write_bp_init,
zio_checksum_generate,
+ zio_nop_write,
zio_ddt_read_start,
zio_ddt_read_done,
zio_ddt_write,
OpenPOWER on IntegriCloud