summaryrefslogtreecommitdiffstats
path: root/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c')
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c139
1 files changed, 100 insertions, 39 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index a4c63f3..3167d42 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -40,11 +40,21 @@
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
#include <sys/sa.h>
#ifdef _KERNEL
#include <sys/zfs_znode.h>
#endif
+/*
+ * Enable/disable nopwrite feature.
+ */
+int zfs_nopwrite_enabled = 1;
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.nopwrite_enabled", &zfs_nopwrite_enabled);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN,
+ &zfs_nopwrite_enabled, 0, "Enable nopwrite feature");
+
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ DMU_BSWAP_UINT8, TRUE, "unallocated" },
{ DMU_BSWAP_ZAP, TRUE, "object directory" },
@@ -1287,6 +1297,16 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
mutex_enter(&db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
if (zio->io_error == 0) {
+ dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
+ if (dr->dt.dl.dr_nopwrite) {
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
+
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
+ ASSERT(zio_checksum_table[chksum].ci_dedup);
+ }
dr->dt.dl.dr_overridden_by = *zio->io_bp;
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
@@ -1308,11 +1328,22 @@ dmu_sync_late_arrival_done(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
dmu_sync_arg_t *dsa = zio->io_private;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
- ASSERT(zio->io_bp->blk_birth == zio->io_txg);
- ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
- zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+ /*
+ * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
+ * then there is nothing to do here. Otherwise, free the
+ * newly allocated block in this txg.
+ */
+ if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ } else {
+ ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
+ ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+ ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+ zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+ }
}
dmu_tx_commit(dsa->dsa_tx);
@@ -1357,7 +1388,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
*
* Return values:
*
- * EEXIST: this txg has already been synced, so there's nothing to to.
+ * EEXIST: this txg has already been synced, so there's nothing to do.
* The caller should not log the write.
*
* ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
@@ -1389,7 +1420,6 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
dnode_t *dn;
ASSERT(pio != NULL);
- ASSERT(BP_IS_HOLE(bp));
ASSERT(txg != 0);
SET_BOOKMARK(&zb, ds->ds_object,
@@ -1444,6 +1474,23 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
return (ENOENT);
}
+ ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
+
+ /*
+ * Assume the on-disk data is X, the current syncing data is Y,
+ * and the current in-memory data is Z (currently in dmu_sync).
+ * X and Z are identical but Y has been modified. Normally,
+ * when X and Z are the same we will perform a nopwrite but if Y
+ * is different we must disable nopwrite since the resulting write
+ * of Y to disk can free the block containing X. If we allowed a
+ * nopwrite to occur the block pointing to Z would reference a freed
+ * block. Since this is a rare case we simplify this by disabling
+ * nopwrite if the current dmu_sync-ing dbuf has been modified in
+ * a previous transaction.
+ */
+ if (dr->dr_next)
+ zp.zp_nopwrite = B_FALSE;
+
ASSERT(dr->dr_txg == txg);
if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
@@ -1519,7 +1566,6 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
int zfs_mdcomp_disable = 0;
TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
-SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RW,
&zfs_mdcomp_disable, 0, "Disable metadata compression");
@@ -1532,15 +1578,27 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
enum zio_checksum checksum = os->os_checksum;
enum zio_compress compress = os->os_compress;
enum zio_checksum dedup_checksum = os->os_dedup_checksum;
- boolean_t dedup;
+ boolean_t dedup = B_FALSE;
+ boolean_t nopwrite = B_FALSE;
boolean_t dedup_verify = os->os_dedup_verify;
int copies = os->os_copies;
/*
- * Determine checksum setting.
+ * We maintain different write policies for each of the following
+ * types of data:
+ * 1. metadata
+ * 2. preallocated blocks (i.e. level-0 blocks of a dump device)
+ * 3. all other level 0 blocks
*/
if (ismd) {
/*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
+ ZIO_COMPRESS_LZJB;
+
+ /*
* Metadata always gets checksummed. If the data
* checksum is multi-bit correctable, and it's not a
* ZBT-style checksum, then it's suitable for metadata
@@ -1550,45 +1608,47 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
if (zio_checksum_table[checksum].ci_correctable < 1 ||
zio_checksum_table[checksum].ci_eck)
checksum = ZIO_CHECKSUM_FLETCHER_4;
- } else {
- checksum = zio_checksum_select(dn->dn_checksum, checksum);
- }
+ } else if (wp & WP_NOFILL) {
+ ASSERT(level == 0);
- /*
- * Determine compression setting.
- */
- if (ismd) {
/*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
+ * If we're writing preallocated blocks, we aren't actually
+ * writing them so don't set any policy properties. These
+ * blocks are currently only used by an external subsystem
+ * outside of zfs (i.e. dump) and not written by the zio
+ * pipeline.
*/
- compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
- ZIO_COMPRESS_LZJB;
+ compress = ZIO_COMPRESS_OFF;
+ checksum = ZIO_CHECKSUM_OFF;
} else {
compress = zio_compress_select(dn->dn_compress, compress);
- }
- /*
- * Determine dedup setting. If we are in dmu_sync(), we won't
- * actually dedup now because that's all done in syncing context;
- * but we do want to use the dedup checkum. If the checksum is not
- * strong enough to ensure unique signatures, force dedup_verify.
- */
- dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
- if (dedup) {
- checksum = dedup_checksum;
- if (!zio_checksum_table[checksum].ci_dedup)
- dedup_verify = 1;
- }
+ checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
+ zio_checksum_select(dn->dn_checksum, checksum) :
+ dedup_checksum;
- if (wp & WP_DMU_SYNC)
- dedup = 0;
+ /*
+ * Determine dedup setting. If we are in dmu_sync(),
+ * we won't actually dedup now because that's all
+ * done in syncing context; but we do want to use the
+ * dedup checksum. If the checksum is not strong
+ * enough to ensure unique signatures, force
+ * dedup_verify.
+ */
+ if (dedup_checksum != ZIO_CHECKSUM_OFF) {
+ dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
+ if (!zio_checksum_table[checksum].ci_dedup)
+ dedup_verify = B_TRUE;
+ }
- if (wp & WP_NOFILL) {
- ASSERT(!ismd && level == 0);
- checksum = ZIO_CHECKSUM_OFF;
- compress = ZIO_COMPRESS_OFF;
- dedup = B_FALSE;
+ /*
+ * Enable nopwrite if we have a cryptographically secure
+ * checksum that has no known collisions (i.e. SHA-256)
+ * and compression is enabled. We don't enable nopwrite if
+ * dedup is enabled as the two features are mutually exclusive.
+ */
+ nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup &&
+ compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
}
zp->zp_checksum = checksum;
@@ -1598,6 +1658,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
zp->zp_dedup = dedup;
zp->zp_dedup_verify = dedup && dedup_verify;
+ zp->zp_nopwrite = nopwrite;
}
int
OpenPOWER on IntegriCloud