summary refs log tree commit diff stats
path: root/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
diff options
context:
space:
mode:
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs')
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c 51
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c 108
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c 5
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c 33
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c 21
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c 541
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c 82
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c 4
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c 14
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c 149
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c 12
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c 13
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c 102
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c 11
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c 35
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c 91
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c 52
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c 17
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c 4
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h 18
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h 2
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h 4
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h 1
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h 18
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h 2
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h 23
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h 9
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h 5
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h 22
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h 5
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h 53
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h 9
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c 11
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c 6
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c 4
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c 6
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c 4
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c 79
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c 13
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c 4
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c 6
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c 43
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c 169
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c 8
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c 107
45 files changed, 1609 insertions, 367 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index b053993..29ef565 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -1134,6 +1134,24 @@ static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
+static void
+l2arc_trim(const arc_buf_hdr_t *hdr)
+{
+ l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+
+ ASSERT(HDR_HAS_L2HDR(hdr));
+ ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
+
+ if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET)
+ return;
+ if (hdr->b_l2hdr.b_asize != 0) {
+ trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
+ hdr->b_l2hdr.b_asize, 0);
+ } else {
+ ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_EMPTY);
+ }
+}
+
static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
@@ -1555,7 +1573,7 @@ arc_cksum_verify(arc_buf_t *buf)
mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
return;
}
- fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
+ fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
panic("buffer modified while frozen!");
mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
@@ -1568,7 +1586,7 @@ arc_cksum_equal(arc_buf_t *buf)
int equal;
mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
- fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
+ fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
@@ -1588,7 +1606,7 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force)
}
buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
- buf->b_hdr->b_freeze_cksum);
+ NULL, buf->b_hdr->b_freeze_cksum);
mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
#ifdef illumos
arc_buf_watch(buf);
@@ -2406,10 +2424,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
* want to re-destroy the header's L2 portion.
*/
if (HDR_HAS_L2HDR(hdr)) {
- if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET)
- trim_map_free(dev->l2ad_vdev,
- hdr->b_l2hdr.b_daddr,
- hdr->b_l2hdr.b_asize, 0);
+ l2arc_trim(hdr);
arc_hdr_l2hdr_destroy(hdr);
}
@@ -4779,10 +4794,7 @@ arc_release(arc_buf_t *buf, void *tag)
* to acquire the l2ad_mtx.
*/
if (HDR_HAS_L2HDR(hdr)) {
- if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET)
- trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
- hdr->b_l2hdr.b_daddr,
- hdr->b_l2hdr.b_asize, 0);
+ l2arc_trim(hdr);
arc_hdr_l2hdr_destroy(hdr);
}
@@ -5298,6 +5310,16 @@ arc_init(void)
arc_c_max = arc_c_min;
arc_c_max = MAX(arc_c * 5, arc_c_max);
+ /*
+ * In userland, there's only the memory pressure that we artificially
+ * create (see arc_available_memory()). Don't let arc_c get too
+ * small, because it can cause transactions to be larger than
+ * arc_c, causing arc_tempreserve_space() to fail.
+ */
+#ifndef _KERNEL
+ arc_c_min = arc_c_max / 2;
+#endif
+
#ifdef _KERNEL
/*
* Allow the tunables to override our calculations if they are
@@ -5959,8 +5981,7 @@ top:
* Error - drop L2ARC entry.
*/
list_remove(buflist, hdr);
- trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
- hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0);
+ l2arc_trim(hdr);
hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
@@ -6246,7 +6267,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
boolean_t *headroom_boost)
{
arc_buf_hdr_t *hdr, *hdr_prev, *head;
- uint64_t write_asize, write_sz, headroom, buf_compress_minsz;
+ uint64_t write_asize, write_sz, headroom,
+ buf_compress_minsz;
void *buf_data;
boolean_t full;
l2arc_write_callback_t *cb;
@@ -6408,6 +6430,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
* using it to denote the header's state change.
*/
hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
+
hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
mutex_enter(&dev->l2ad_mtx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index a28d866..f39a353 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -267,7 +267,7 @@ dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
*/
ASSERT3U(holds, >=, db->db_dirtycnt);
} else {
- if (db->db_immediate_evict == TRUE)
+ if (db->db_user_immediate_evict == TRUE)
ASSERT3U(holds, >=, db->db_dirtycnt);
else
ASSERT3U(holds, >, 0);
@@ -1110,6 +1110,32 @@ dbuf_release_bp(dmu_buf_impl_t *db)
(void) arc_release(db->db_buf, db);
}
+/*
+ * We already have a dirty record for this TXG, and we are being
+ * dirtied again.
+ */
+static void
+dbuf_redirty(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
+ /*
+ * If this buffer has already been written out,
+ * we now need to reset its state.
+ */
+ dbuf_unoverride(dr);
+ if (db->db.db_object != DMU_META_DNODE_OBJECT &&
+ db->db_state != DB_NOFILL) {
+ /* Already released on initial dirty, so just thaw. */
+ ASSERT(arc_released(db->db_buf));
+ arc_buf_thaw(db->db_buf);
+ }
+ }
+}
+
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
@@ -1182,16 +1208,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (dr && dr->dr_txg == tx->tx_txg) {
DB_DNODE_EXIT(db);
- if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
- /*
- * If this buffer has already been written out,
- * we now need to reset its state.
- */
- dbuf_unoverride(dr);
- if (db->db.db_object != DMU_META_DNODE_OBJECT &&
- db->db_state != DB_NOFILL)
- arc_buf_thaw(db->db_buf);
- }
+ dbuf_redirty(dr);
mutex_exit(&db->db_mtx);
return (dr);
}
@@ -1495,6 +1512,30 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
ASSERT(tx->tx_txg != 0);
ASSERT(!refcount_is_zero(&db->db_holds));
+ /*
+ * Quick check for dirtiness. For already dirty blocks, this
+ * reduces runtime of this function by >90%, and overall performance
+ * by 50% for some workloads (e.g. file deletion with indirect blocks
+ * cached).
+ */
+ mutex_enter(&db->db_mtx);
+ dbuf_dirty_record_t *dr;
+ for (dr = db->db_last_dirty;
+ dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
+ /*
+ * It's possible that it is already dirty but not cached,
+ * because there are some calls to dbuf_dirty() that don't
+ * go through dmu_buf_will_dirty().
+ */
+ if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
+ /* This dbuf is already dirty and cached. */
+ dbuf_redirty(dr);
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+ }
+ mutex_exit(&db->db_mtx);
+
DB_DNODE_ENTER(db);
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT;
@@ -1829,8 +1870,9 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_blkptr = blkptr;
db->db_user = NULL;
- db->db_immediate_evict = 0;
- db->db_freed_in_flight = 0;
+ db->db_user_immediate_evict = FALSE;
+ db->db_freed_in_flight = FALSE;
+ db->db_pending_evict = FALSE;
if (blkid == DMU_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
@@ -2386,12 +2428,13 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
arc_buf_freeze(db->db_buf);
if (holds == db->db_dirtycnt &&
- db->db_level == 0 && db->db_immediate_evict)
+ db->db_level == 0 && db->db_user_immediate_evict)
dbuf_evict_user(db);
if (holds == 0) {
if (db->db_blkid == DMU_BONUS_BLKID) {
dnode_t *dn;
+ boolean_t evict_dbuf = db->db_pending_evict;
/*
* If the dnode moves here, we cannot cross this
@@ -2406,7 +2449,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
* Decrementing the dbuf count means that the bonus
* buffer's dnode hold is no longer discounted in
* dnode_move(). The dnode cannot move until after
- * the dnode_rele_and_unlock() below.
+ * the dnode_rele() below.
*/
DB_DNODE_EXIT(db);
@@ -2416,35 +2459,10 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
*/
mutex_exit(&db->db_mtx);
- /*
- * If the dnode has been freed, evict the bonus
- * buffer immediately. The data in the bonus
- * buffer is no longer relevant and this prevents
- * a stale bonus buffer from being associated
- * with this dnode_t should the dnode_t be reused
- * prior to being destroyed.
- */
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_type == DMU_OT_NONE ||
- dn->dn_free_txg != 0) {
- /*
- * Drop dn_mtx. It is a leaf lock and
- * cannot be held when dnode_evict_bonus()
- * acquires other locks in order to
- * perform the eviction.
- *
- * Freed dnodes cannot be reused until the
- * last hold is released. Since this bonus
- * buffer has a hold, the dnode will remain
- * in the free state, even without dn_mtx
- * held, until the dnode_rele_and_unlock()
- * below.
- */
- mutex_exit(&dn->dn_mtx);
+ if (evict_dbuf)
dnode_evict_bonus(dn);
- mutex_enter(&dn->dn_mtx);
- }
- dnode_rele_and_unlock(dn, db);
+
+ dnode_rele(dn, db);
} else if (db->db_buf == NULL) {
/*
* This is a special case: we never associated this
@@ -2491,7 +2509,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
} else {
dbuf_clear(db);
}
- } else if (db->db_objset->os_evicting ||
+ } else if (db->db_pending_evict ||
arc_buf_eviction_needed(db->db_buf)) {
dbuf_clear(db);
} else {
@@ -2539,7 +2557,7 @@ dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- db->db_immediate_evict = TRUE;
+ db->db_user_immediate_evict = TRUE;
return (dmu_buf_set_user(db_fake, user));
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
index 7863e6a..d4151bb 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -65,7 +65,8 @@ ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
spa_t *spa = ddt->ddt_spa;
objset_t *os = ddt->ddt_os;
uint64_t *objectp = &ddt->ddt_object[type][class];
- boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
+ boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP;
char name[DDT_NAMELEN];
ddt_object_name(ddt, type, class, name);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index fe6f60d..93a0426 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -1494,7 +1494,8 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
ASSERT(BP_EQUAL(bp, bp_orig));
ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
- ASSERT(zio_checksum_table[chksum].ci_dedup);
+ ASSERT(zio_checksum_table[chksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE);
}
dr->dt.dl.dr_overridden_by = *zio->io_bp;
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
@@ -1739,7 +1740,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
- dmu_tx_t *tx)
+ dmu_tx_t *tx)
{
dnode_t *dn;
int err;
@@ -1754,7 +1755,7 @@ dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
- dmu_tx_t *tx)
+ dmu_tx_t *tx)
{
dnode_t *dn;
@@ -1774,7 +1775,7 @@ dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
void
dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
- dmu_tx_t *tx)
+ dmu_tx_t *tx)
{
dnode_t *dn;
@@ -1842,8 +1843,10 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
* as well. Otherwise, the metadata checksum defaults
* to fletcher4.
*/
- if (zio_checksum_table[checksum].ci_correctable < 1 ||
- zio_checksum_table[checksum].ci_eck)
+ if (!(zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_METADATA) ||
+ (zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_EMBEDDED))
checksum = ZIO_CHECKSUM_FLETCHER_4;
if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
@@ -1882,17 +1885,20 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
*/
if (dedup_checksum != ZIO_CHECKSUM_OFF) {
dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
- if (!zio_checksum_table[checksum].ci_dedup)
+ if (!(zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP))
dedup_verify = B_TRUE;
}
/*
- * Enable nopwrite if we have a cryptographically secure
- * checksum that has no known collisions (i.e. SHA-256)
- * and compression is enabled. We don't enable nopwrite if
- * dedup is enabled as the two features are mutually exclusive.
+ * Enable nopwrite if we have a secure enough checksum
+ * algorithm (see comment in zio_nop_write) and
+ * compression is enabled. We don't enable nopwrite if
+ * dedup is enabled as the two features are mutually
+ * exclusive.
*/
- nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup &&
+ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE) &&
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
}
@@ -1940,7 +1946,8 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
* ID and wait for that to be synced.
*/
int
-dmu_object_wait_synced(objset_t *os, uint64_t object) {
+dmu_object_wait_synced(objset_t *os, uint64_t object)
+{
dnode_t *dn;
int error, i;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
index f84ff37..79de1d1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
@@ -362,6 +362,17 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
* checksum/compression/copies.
*/
if (ds != NULL) {
+ boolean_t needlock = B_FALSE;
+
+ /*
+ * Note: it's valid to open the objset if the dataset is
+ * long-held, in which case the pool_config lock will not
+ * be held.
+ */
+ if (!dsl_pool_config_held(dmu_objset_pool(os))) {
+ needlock = B_TRUE;
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+ }
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
primary_cache_changed_cb, os);
@@ -413,6 +424,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
recordsize_changed_cb, os);
}
}
+ if (needlock)
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
if (err != 0) {
VERIFY(arc_buf_remove_ref(os->os_phys_buf,
&os->os_phys_buf));
@@ -469,6 +482,13 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
{
int err = 0;
+ /*
+ * We shouldn't be doing anything with dsl_dataset_t's unless the
+ * pool_config lock is held, or the dataset is long-held.
+ */
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
+ dsl_dataset_long_held(ds));
+
mutex_enter(&ds->ds_opening_lock);
if (ds->ds_objset == NULL) {
objset_t *os;
@@ -686,7 +706,6 @@ dmu_objset_evict(objset_t *os)
if (os->os_sa)
sa_tear_down(os);
- os->os_evicting = B_TRUE;
dmu_objset_evict_dbufs(os);
mutex_enter(&os->os_lock);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
index ef13961..ede1555 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -66,12 +66,14 @@ int zfs_send_queue_length = 16 * 1024 * 1024;
int zfs_recv_queue_length = 16 * 1024 * 1024;
static char *dmu_recv_tag = "dmu_recv_tag";
-static const char *recv_clone_name = "%recv";
+const char *recv_clone_name = "%recv";
#define BP_SPAN(datablkszsec, indblkshift, level) \
(((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
(level) * (indblkshift - SPA_BLKPTRSHIFT)))
+static void byteswap_record(dmu_replay_record_t *drr);
+
struct send_thread_arg {
bqueue_t q;
dsl_dataset_t *ds; /* Dataset to traverse */
@@ -79,6 +81,7 @@ struct send_thread_arg {
int flags; /* flags to pass to traverse_dataset */
int error_code;
boolean_t cancel;
+ zbookmark_phys_t resume;
};
struct send_block_record {
@@ -93,7 +96,7 @@ struct send_block_record {
static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
- dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
+ dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os);
struct uio auio;
struct iovec aiov;
ASSERT0(len % 8);
@@ -166,7 +169,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
* that the receiving system doesn't have any dbufs in the range
* being freed. This is always true because there is a one-record
* constraint: we only send one WRITE record for any given
- * object+offset. We know that the one-record constraint is
+ * object,offset. We know that the one-record constraint is
* true because we always send data in increasing order by
* object,offset.
*
@@ -289,7 +292,8 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
} else {
drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
- if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
+ if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP)
drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
@@ -414,6 +418,19 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
{
struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
+ if (object < dsp->dsa_resume_object) {
+ /*
+ * Note: when resuming, we will visit all the dnodes in
+ * the block of dnodes that we are resuming from. In
+ * this case it's unnecessary to send the dnodes prior to
+ * the one we are resuming from. We should be at most one
+ * block's worth of dnodes behind the resume point.
+ */
+ ASSERT3U(dsp->dsa_resume_object - object, <,
+ 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
+ return (0);
+ }
+
if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
return (dump_freeobjects(dsp, object, 1));
@@ -494,6 +511,9 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
uint64_t record_size;
int err = 0;
+ ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+ zb->zb_object >= sta->resume.zb_object);
+
if (sta->cancel)
return (SET_ERROR(EINTR));
@@ -530,8 +550,10 @@ send_traverse_thread(void *arg)
struct send_block_record *data;
if (st_arg->ds != NULL) {
- err = traverse_dataset(st_arg->ds, st_arg->fromtxg,
- st_arg->flags, send_cb, arg);
+ err = traverse_dataset_resume(st_arg->ds,
+ st_arg->fromtxg, &st_arg->resume,
+ st_arg->flags, send_cb, st_arg);
+
if (err != EINTR)
st_arg->error_code = err;
}
@@ -560,6 +582,9 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
ASSERT3U(zb->zb_level, >=, 0);
+ ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+ zb->zb_object >= dsa->dsa_resume_object);
+
if (zb->zb_object != DMU_META_DNODE_OBJECT &&
DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
return (0);
@@ -620,6 +645,10 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
uint64_t offset;
ASSERT0(zb->zb_level);
+ ASSERT(zb->zb_object > dsa->dsa_resume_object ||
+ (zb->zb_object == dsa->dsa_resume_object &&
+ zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
+
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
&aflags, zb) != 0) {
@@ -680,11 +709,13 @@ get_next_record(bqueue_t *bq, struct send_block_record *data)
*/
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
- zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok,
+ zfs_bookmark_phys_t *ancestor_zb,
+ boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd,
+ uint64_t resumeobj, uint64_t resumeoff,
#ifdef illumos
- boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
+ vnode_t *vp, offset_t *off)
#else
- boolean_t large_block_ok, int outfd, struct file *fp, offset_t *off)
+ struct file *fp, offset_t *off)
#endif
{
objset_t *os;
@@ -693,7 +724,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
int err;
uint64_t fromtxg = 0;
uint64_t featureflags = 0;
- struct send_thread_arg to_arg;
+ struct send_thread_arg to_arg = { 0 };
err = dmu_objset_from_ds(to_ds, &os);
if (err != 0) {
@@ -730,6 +761,10 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
}
+ if (resumeobj != 0 || resumeoff != 0) {
+ featureflags |= DMU_BACKUP_FEATURE_RESUMING;
+ }
+
DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
featureflags);
@@ -766,6 +801,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
dsp->dsa_pending_op = PENDING_NONE;
dsp->dsa_incremental = (ancestor_zb != NULL);
dsp->dsa_featureflags = featureflags;
+ dsp->dsa_resume_object = resumeobj;
+ dsp->dsa_resume_offset = resumeoff;
mutex_enter(&to_ds->ds_sendstream_lock);
list_insert_head(&to_ds->ds_sendstreams, dsp);
@@ -774,7 +811,27 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
dsl_dataset_long_hold(to_ds, FTAG);
dsl_pool_rele(dp, tag);
- if (dump_record(dsp, NULL, 0) != 0) {
+ void *payload = NULL;
+ size_t payload_len = 0;
+ if (resumeobj != 0 || resumeoff != 0) {
+ dmu_object_info_t to_doi;
+ err = dmu_object_info(os, resumeobj, &to_doi);
+ if (err != 0)
+ goto out;
+ SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0,
+ resumeoff / to_doi.doi_data_block_size);
+
+ nvlist_t *nvl = fnvlist_alloc();
+ fnvlist_add_uint64(nvl, "resume_object", resumeobj);
+ fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
+ payload = fnvlist_pack(nvl, &payload_len);
+ drr->drr_payloadlen = payload_len;
+ fnvlist_free(nvl);
+ }
+
+ err = dump_record(dsp, payload, payload_len);
+ fnvlist_pack_free(payload, payload_len);
+ if (err != 0) {
err = dsp->dsa_err;
goto out;
}
@@ -889,22 +946,22 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
is_clone = (fromds->ds_dir != ds->ds_dir);
dsl_dataset_rele(fromds, FTAG);
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
- embedok, large_block_ok, outfd, fp, off);
+ embedok, large_block_ok, outfd, 0, 0, fp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
- embedok, large_block_ok, outfd, fp, off);
+ embedok, large_block_ok, outfd, 0, 0, fp, off);
}
dsl_dataset_rele(ds, FTAG);
return (err);
}
int
-dmu_send(const char *tosnap, const char *fromsnap,
- boolean_t embedok, boolean_t large_block_ok,
+dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+ boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
#ifdef illumos
- int outfd, vnode_t *vp, offset_t *off)
+ vnode_t *vp, offset_t *off)
#else
- int outfd, struct file *fp, offset_t *off)
+ struct file *fp, offset_t *off)
#endif
{
dsl_pool_t *dp;
@@ -972,10 +1029,12 @@ dmu_send(const char *tosnap, const char *fromsnap,
return (err);
}
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
- embedok, large_block_ok, outfd, fp, off);
+ embedok, large_block_ok,
+ outfd, resumeobj, resumeoff, fp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
- embedok, large_block_ok, outfd, fp, off);
+ embedok, large_block_ok,
+ outfd, resumeobj, resumeoff, fp, off);
}
if (owned)
dsl_dataset_disown(ds, FTAG);
@@ -1218,6 +1277,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
/* already checked */
ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+ ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));
if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
DMU_COMPOUNDSTREAM ||
@@ -1230,6 +1290,10 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
spa_version(dp->dp_spa) < SPA_VERSION_SA)
return (SET_ERROR(ENOTSUP));
+ if (drba->drba_cookie->drc_resumable &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
+ return (SET_ERROR(ENOTSUP));
+
/*
* The receiving code doesn't know how to translate a WRITE_EMBEDDED
* record to a plan WRITE record, so the pool must have the
@@ -1333,15 +1397,16 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
{
dmu_recv_begin_arg_t *drba = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
const char *tofs = drba->drba_cookie->drc_tofs;
dsl_dataset_t *ds, *newds;
uint64_t dsobj;
int error;
- uint64_t crflags;
+ uint64_t crflags = 0;
- crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
- DS_FLAG_CI_DATASET : 0;
+ if (drrb->drr_flags & DRR_FLAG_CI_DATA)
+ crflags |= DS_FLAG_CI_DATASET;
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
@@ -1379,6 +1444,31 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
}
VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
+ if (drba->drba_cookie->drc_resumable) {
+ dsl_dataset_zapify(newds, tx);
+ if (drrb->drr_fromguid != 0) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
+ 8, 1, &drrb->drr_fromguid, tx));
+ }
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
+ 8, 1, &drrb->drr_toguid, tx));
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
+ 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
+ uint64_t one = 1;
+ uint64_t zero = 0;
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
+ 8, 1, &one, tx));
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
+ 8, 1, &zero, tx));
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
+ 8, 1, &zero, tx));
+ if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_EMBED_DATA) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
+ 8, 1, &one, tx));
+ }
+ }
+
dmu_buf_will_dirty(newds->ds_dbuf, tx);
dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
@@ -1396,56 +1486,192 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
spa_history_log_internal_ds(newds, "receive", tx, "");
}
+static int
+dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+ int error;
+ uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+ dsl_dataset_t *ds;
+ const char *tofs = drba->drba_cookie->drc_tofs;
+
+ /* already checked */
+ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+ ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING);
+
+ if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+ DMU_COMPOUNDSTREAM ||
+ drrb->drr_type >= DMU_OST_NUMTYPES)
+ return (SET_ERROR(EINVAL));
+
+ /* Verify pool version supports SA if SA_SPILL feature set */
+ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+ spa_version(dp->dp_spa) < SPA_VERSION_SA)
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * The receiving code doesn't know how to translate a WRITE_EMBEDDED
+ * record to a plain WRITE record, so the pool must have the
+ * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
+ * records. Same with WRITE_EMBEDDED records that use LZ4 compression.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+ return (SET_ERROR(ENOTSUP));
+
+ char recvname[ZFS_MAXNAMELEN];
+
+ (void) snprintf(recvname, sizeof (recvname), "%s/%s",
+ tofs, recv_clone_name);
+
+ if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
+ /* %recv does not exist; continue in tofs */
+ error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+ if (error != 0)
+ return (error);
+ }
+
+ /* check that ds is marked inconsistent */
+ if (!DS_IS_INCONSISTENT(ds)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /* check that there is resuming data, and that the toguid matches */
+ if (!dsl_dataset_is_zapified(ds)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ uint64_t val;
+ error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
+ if (error != 0 || drrb->drr_toguid != val) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Check if the receive is still running. If so, it will be owned.
+ * Note that nothing else can own the dataset (e.g. after the receive
+ * fails) because it will be marked inconsistent.
+ */
+ if (dsl_dataset_has_owner(ds)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EBUSY));
+ }
+
+ /* There should not be any snapshots of this fs yet. */
+ if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Note: resume point will be checked when we process the first WRITE
+ * record.
+ */
+
+ /* check that the origin matches */
+ val = 0;
+ (void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
+ if (drrb->drr_fromguid != val) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ const char *tofs = drba->drba_cookie->drc_tofs;
+ dsl_dataset_t *ds;
+ uint64_t dsobj;
+ char recvname[ZFS_MAXNAMELEN];
+
+ (void) snprintf(recvname, sizeof (recvname), "%s/%s",
+ tofs, recv_clone_name);
+
+ if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
+ /* %recv does not exist; continue in tofs */
+ VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds));
+ drba->drba_cookie->drc_newfs = B_TRUE;
+ }
+
+ /* clear the inconsistent flag so that we can own it */
+ ASSERT(DS_IS_INCONSISTENT(ds));
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ dsobj = ds->ds_object;
+ dsl_dataset_rele(ds, FTAG);
+
+ VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds));
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+ ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)));
+
+ drba->drba_cookie->drc_ds = ds;
+
+ spa_history_log_internal_ds(ds, "resume receive", tx, "");
+}
+
/*
* NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
* succeeds; otherwise we will leak the holds on the datasets.
*/
int
-dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
- boolean_t force, char *origin, dmu_recv_cookie_t *drc)
+dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
+ boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc)
{
dmu_recv_begin_arg_t drba = { 0 };
- dmu_replay_record_t *drr;
bzero(drc, sizeof (dmu_recv_cookie_t));
- drc->drc_drrb = drrb;
+ drc->drc_drr_begin = drr_begin;
+ drc->drc_drrb = &drr_begin->drr_u.drr_begin;
drc->drc_tosnap = tosnap;
drc->drc_tofs = tofs;
drc->drc_force = force;
+ drc->drc_resumable = resumable;
drc->drc_cred = CRED();
- if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
+ if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
drc->drc_byteswap = B_TRUE;
- else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
- return (SET_ERROR(EINVAL));
-
- drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
- drr->drr_type = DRR_BEGIN;
- drr->drr_u.drr_begin = *drc->drc_drrb;
- if (drc->drc_byteswap) {
- fletcher_4_incremental_byteswap(drr,
+ fletcher_4_incremental_byteswap(drr_begin,
sizeof (dmu_replay_record_t), &drc->drc_cksum);
- } else {
- fletcher_4_incremental_native(drr,
+ byteswap_record(drr_begin);
+ } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
+ fletcher_4_incremental_native(drr_begin,
sizeof (dmu_replay_record_t), &drc->drc_cksum);
- }
- kmem_free(drr, sizeof (dmu_replay_record_t));
-
- if (drc->drc_byteswap) {
- drrb->drr_magic = BSWAP_64(drrb->drr_magic);
- drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
- drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
- drrb->drr_type = BSWAP_32(drrb->drr_type);
- drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
- drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
+ } else {
+ return (SET_ERROR(EINVAL));
}
drba.drba_origin = origin;
drba.drba_cookie = drc;
drba.drba_cred = CRED();
- return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
- &drba, 5, ZFS_SPACE_CHECK_NORMAL));
+ if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_RESUMING) {
+ return (dsl_sync_task(tofs,
+ dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
+ &drba, 5, ZFS_SPACE_CHECK_NORMAL));
+ } else {
+ return (dsl_sync_task(tofs,
+ dmu_recv_begin_check, dmu_recv_begin_sync,
+ &drba, 5, ZFS_SPACE_CHECK_NORMAL));
+ }
}
struct receive_record_arg {
@@ -1457,6 +1683,7 @@ struct receive_record_arg {
*/
arc_buf_t *write_buf;
int payload_size;
+ uint64_t bytes_read; /* bytes read from stream when record created */
boolean_t eos_marker; /* Marks the end of the stream */
bqueue_node_t node;
};
@@ -1465,6 +1692,7 @@ struct receive_writer_arg {
objset_t *os;
boolean_t byteswap;
bqueue_t q;
+
/*
* These three args are used to signal to the main thread that we're
* done.
@@ -1472,9 +1700,13 @@ struct receive_writer_arg {
kmutex_t mutex;
kcondvar_t cv;
boolean_t done;
+
int err;
/* A map from guid to dataset to help handle dedup'd streams. */
avl_tree_t *guid_to_ds_map;
+ boolean_t resumable;
+ uint64_t last_object, last_offset;
+ uint64_t bytes_read; /* bytes read when current record created */
};
struct receive_arg {
@@ -1482,6 +1714,7 @@ struct receive_arg {
kthread_t *td;
struct file *fp;
uint64_t voff; /* The current offset in the stream */
+ uint64_t bytes_read;
/*
* A record that has had its payload read in, but hasn't yet been handed
* off to the worker thread.
@@ -1577,14 +1810,21 @@ receive_read(struct receive_arg *ra, int len, void *buf)
ra->err = restore_bytes(ra, buf + done,
len - done, ra->voff, &resid);
- if (resid == len - done)
- ra->err = SET_ERROR(EINVAL);
+ if (resid == len - done) {
+ /*
+ * Note: ECKSUM indicates that the receive
+ * was interrupted and can potentially be resumed.
+ */
+ ra->err = SET_ERROR(ECKSUM);
+ }
ra->voff += len - done - resid;
done = len - resid;
if (ra->err != 0)
return (ra->err);
}
+ ra->bytes_read += len;
+
ASSERT3U(done, ==, len);
return (0);
}
@@ -1685,6 +1925,43 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
}
}
+static void
+save_resume_state(struct receive_writer_arg *rwa,
+ uint64_t object, uint64_t offset, dmu_tx_t *tx)
+{
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+ if (!rwa->resumable)
+ return;
+
+ /*
+ * We use ds_resume_bytes[] != 0 to indicate that we need to
+ * update this on disk, so it must not be 0.
+ */
+ ASSERT(rwa->bytes_read != 0);
+
+ /*
+ * We only resume from write records, which have a valid
+ * (non-meta-dnode) object number.
+ */
+ ASSERT(object != 0);
+
+ /*
+ * For resuming to work correctly, we must receive records in order,
+ * sorted by object,offset. This is checked by the callers, but
+ * assert it here for good measure.
+ */
+ ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]);
+ ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] ||
+ offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]);
+ ASSERT3U(rwa->bytes_read, >=,
+ rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]);
+
+ rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object;
+ rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset;
+ rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read;
+}
+
static int
receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
void *data)
@@ -1781,6 +2058,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
dmu_buf_rele(db, FTAG);
}
dmu_tx_commit(tx);
+
return (0);
}
@@ -1806,6 +2084,7 @@ receive_freeobjects(struct receive_writer_arg *rwa,
if (err != 0)
return (err);
}
+
return (0);
}
@@ -1820,6 +2099,18 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
!DMU_OT_IS_VALID(drrw->drr_type))
return (SET_ERROR(EINVAL));
+ /*
+ * For resuming to work, records must be in increasing order
+ * by (object, offset).
+ */
+ if (drrw->drr_object < rwa->last_object ||
+ (drrw->drr_object == rwa->last_object &&
+ drrw->drr_offset < rwa->last_offset)) {
+ return (SET_ERROR(EINVAL));
+ }
+ rwa->last_object = drrw->drr_object;
+ rwa->last_offset = drrw->drr_offset;
+
if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
@@ -1843,8 +2134,17 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0)
return (SET_ERROR(EINVAL));
dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
+
+ /*
+ * Note: If the receive fails, we want the resume stream to start
+ * with the same record that we last successfully received (as opposed
+ * to the next record), so that we can verify that we are
+ * resuming from the correct location.
+ */
+ save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
dmu_tx_commit(tx);
dmu_buf_rele(bonus, FTAG);
+
return (0);
}
@@ -1903,43 +2203,48 @@ receive_write_byref(struct receive_writer_arg *rwa,
dmu_write(rwa->os, drrwbr->drr_object,
drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
dmu_buf_rele(dbp, FTAG);
+
+ /* See comment in receive_write. */
+ save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx);
dmu_tx_commit(tx);
return (0);
}
static int
receive_write_embedded(struct receive_writer_arg *rwa,
- struct drr_write_embedded *drrwnp, void *data)
+ struct drr_write_embedded *drrwe, void *data)
{
dmu_tx_t *tx;
int err;
- if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset)
+ if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset)
return (EINVAL);
- if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE)
+ if (drrwe->drr_psize > BPE_PAYLOAD_SIZE)
return (EINVAL);
- if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES)
+ if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES)
return (EINVAL);
- if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
+ if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
return (EINVAL);
tx = dmu_tx_create(rwa->os);
- dmu_tx_hold_write(tx, drrwnp->drr_object,
- drrwnp->drr_offset, drrwnp->drr_length);
+ dmu_tx_hold_write(tx, drrwe->drr_object,
+ drrwe->drr_offset, drrwe->drr_length);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err != 0) {
dmu_tx_abort(tx);
return (err);
}
- dmu_write_embedded(rwa->os, drrwnp->drr_object,
- drrwnp->drr_offset, data, drrwnp->drr_etype,
- drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
+ dmu_write_embedded(rwa->os, drrwe->drr_object,
+ drrwe->drr_offset, data, drrwe->drr_etype,
+ drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+ /* See comment in receive_write. */
+ save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx);
dmu_tx_commit(tx);
return (0);
}
@@ -2013,10 +2318,16 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
static void
dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
{
- char name[MAXNAMELEN];
- dsl_dataset_name(drc->drc_ds, name);
- dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
- (void) dsl_destroy_head(name);
+ if (drc->drc_resumable) {
+ /* wait for our resume state to be written to disk */
+ txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0);
+ dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+ } else {
+ char name[MAXNAMELEN];
+ dsl_dataset_name(drc->drc_ds, name);
+ dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+ (void) dsl_destroy_head(name);
+ }
}
static void
@@ -2043,12 +2354,17 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
if (len != 0) {
ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
- ra->rrd->payload = buf;
- ra->rrd->payload_size = len;
- err = receive_read(ra, len, ra->rrd->payload);
+ err = receive_read(ra, len, buf);
if (err != 0)
return (err);
- receive_cksum(ra, len, ra->rrd->payload);
+ receive_cksum(ra, len, buf);
+
+ /* note: rrd is NULL when reading the begin record's payload */
+ if (ra->rrd != NULL) {
+ ra->rrd->payload = buf;
+ ra->rrd->payload_size = len;
+ ra->rrd->bytes_read = ra->bytes_read;
+ }
}
ra->prev_cksum = ra->cksum;
@@ -2056,6 +2372,7 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
err = receive_read(ra, sizeof (ra->next_rrd->header),
&ra->next_rrd->header);
+ ra->next_rrd->bytes_read = ra->bytes_read;
if (err != 0) {
kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
ra->next_rrd = NULL;
@@ -2235,7 +2552,7 @@ receive_read_record(struct receive_arg *ra)
{
struct drr_end *drre = &ra->rrd->header.drr_u.drr_end;
if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
- return (SET_ERROR(EINVAL));
+ return (SET_ERROR(ECKSUM));
return (0);
}
case DRR_SPILL:
@@ -2262,6 +2579,10 @@ receive_process_record(struct receive_writer_arg *rwa,
{
int err;
+ /* Records are processed in order, so bytes_read must never decrease. */
+ ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
+ rwa->bytes_read = rrd->bytes_read;
+
switch (rrd->header.drr_type) {
case DRR_OBJECT:
{
@@ -2357,6 +2678,33 @@ receive_writer_thread(void *arg)
thread_exit();
}
+static int
+resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
+{
+ uint64_t val;
+ objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset;
+ uint64_t dsobj = dmu_objset_id(ra->os);
+ uint64_t resume_obj, resume_off;
+
+ if (nvlist_lookup_uint64(begin_nvl,
+ "resume_object", &resume_obj) != 0 ||
+ nvlist_lookup_uint64(begin_nvl,
+ "resume_offset", &resume_off) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+ VERIFY0(zap_lookup(mos, dsobj,
+ DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
+ if (resume_obj != val)
+ return (SET_ERROR(EINVAL));
+ VERIFY0(zap_lookup(mos, dsobj,
+ DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
+ if (resume_off != val)
+ return (SET_ERROR(EINVAL));
+
+ return (0);
+}
+
+
/*
* Read in the stream's records, one by one, and apply them to the pool. There
* are two threads involved; the thread that calls this function will spin up a
@@ -2377,12 +2725,20 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
struct receive_arg ra = { 0 };
struct receive_writer_arg rwa = { 0 };
int featureflags;
+ nvlist_t *begin_nvl = NULL;
ra.byteswap = drc->drc_byteswap;
ra.cksum = drc->drc_cksum;
ra.td = curthread;
ra.fp = fp;
ra.voff = *voffp;
+
+ if (dsl_dataset_is_zapified(drc->drc_ds)) {
+ (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
+ drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
+ sizeof (ra.bytes_read), 1, &ra.bytes_read);
+ }
+
list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node),
offsetof(struct receive_ign_obj_node, node));
@@ -2435,9 +2791,29 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
drc->drc_guid_to_ds_map = rwa.guid_to_ds_map;
}
- err = receive_read_payload_and_next_header(&ra, 0, NULL);
- if (err)
+ uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
+ void *payload = NULL;
+ if (payloadlen != 0)
+ payload = kmem_alloc(payloadlen, KM_SLEEP);
+
+ err = receive_read_payload_and_next_header(&ra, payloadlen, payload);
+ if (err != 0) {
+ if (payloadlen != 0)
+ kmem_free(payload, payloadlen);
goto out;
+ }
+ if (payloadlen != 0) {
+ err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP);
+ kmem_free(payload, payloadlen);
+ if (err != 0)
+ goto out;
+ }
+
+ if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
+ err = resume_check(&ra, begin_nvl);
+ if (err != 0)
+ goto out;
+ }
(void) bqueue_init(&rwa.q, zfs_recv_queue_length,
offsetof(struct receive_record_arg, node));
@@ -2445,6 +2821,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL);
rwa.os = ra.os;
rwa.byteswap = drc->drc_byteswap;
+ rwa.resumable = drc->drc_resumable;
(void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, &p0,
TS_RUN, minclsyspri);
@@ -2503,13 +2880,15 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
err = rwa.err;
out:
+ nvlist_free(begin_nvl);
if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
zfs_onexit_fd_rele(cleanup_fd);
if (err != 0) {
/*
- * destroy what we created, so we don't leave it in the
- * inconsistent restoring state.
+ * Clean up references. If receive is not resumable,
+ * destroy what we created, so we don't leave it in
+ * the inconsistent state.
*/
dmu_recv_cleanup_ds(drc);
}
@@ -2669,6 +3048,20 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ if (dsl_dataset_has_resume_receive_state(ds)) {
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_FROMGUID, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OBJECT, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OFFSET, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_BYTES, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TONAME, tx);
+ }
}
drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
index 151d04c..2c718df 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
@@ -48,6 +48,7 @@ typedef struct prefetch_data {
int pd_flags;
boolean_t pd_cancel;
boolean_t pd_exited;
+ zbookmark_phys_t pd_resume;
} prefetch_data_t;
typedef struct traverse_data {
@@ -307,59 +308,52 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
arc_flags_t flags = ARC_FLAG_WAIT;
int i;
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
- dnode_phys_t *cdnp;
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
goto post;
- cdnp = buf->b_data;
+ dnode_phys_t *child_dnp = buf->b_data;
for (i = 0; i < epb; i++) {
- prefetch_dnode_metadata(td, &cdnp[i], zb->zb_objset,
- zb->zb_blkid * epb + i);
+ prefetch_dnode_metadata(td, &child_dnp[i],
+ zb->zb_objset, zb->zb_blkid * epb + i);
}
/* recursively visitbp() blocks below this */
for (i = 0; i < epb; i++) {
- err = traverse_dnode(td, &cdnp[i], zb->zb_objset,
- zb->zb_blkid * epb + i);
+ err = traverse_dnode(td, &child_dnp[i],
+ zb->zb_objset, zb->zb_blkid * epb + i);
if (err != 0)
break;
}
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
arc_flags_t flags = ARC_FLAG_WAIT;
- objset_phys_t *osp;
- dnode_phys_t *mdnp, *gdnp, *udnp;
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
goto post;
- osp = buf->b_data;
- mdnp = &osp->os_meta_dnode;
- gdnp = &osp->os_groupused_dnode;
- udnp = &osp->os_userused_dnode;
-
- prefetch_dnode_metadata(td, mdnp, zb->zb_objset,
+ objset_phys_t *osp = buf->b_data;
+ prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
DMU_META_DNODE_OBJECT);
if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
- prefetch_dnode_metadata(td, gdnp, zb->zb_objset,
- DMU_GROUPUSED_OBJECT);
- prefetch_dnode_metadata(td, udnp, zb->zb_objset,
- DMU_USERUSED_OBJECT);
+ prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
+ zb->zb_objset, DMU_GROUPUSED_OBJECT);
+ prefetch_dnode_metadata(td, &osp->os_userused_dnode,
+ zb->zb_objset, DMU_USERUSED_OBJECT);
}
- err = traverse_dnode(td, mdnp, zb->zb_objset,
+ err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset,
DMU_META_DNODE_OBJECT);
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
- err = traverse_dnode(td, gdnp, zb->zb_objset,
- DMU_GROUPUSED_OBJECT);
+ err = traverse_dnode(td, &osp->os_groupused_dnode,
+ zb->zb_objset, DMU_GROUPUSED_OBJECT);
}
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
- err = traverse_dnode(td, udnp, zb->zb_objset,
- DMU_USERUSED_OBJECT);
+ err = traverse_dnode(td, &osp->os_userused_dnode,
+ zb->zb_objset, DMU_USERUSED_OBJECT);
}
}
@@ -391,9 +385,15 @@ post:
* Set the bookmark to the first level-0 block that we need
* to visit. This way, the resuming code does not need to
* deal with resuming from indirect blocks.
+ *
+ * Note, if zb_level <= 0, dnp may be NULL, so we don't want
+ * to dereference it.
*/
- td->td_resume->zb_blkid = zb->zb_blkid <<
- (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+ td->td_resume->zb_blkid = zb->zb_blkid;
+ if (zb->zb_level > 0) {
+ td->td_resume->zb_blkid <<= zb->zb_level *
+ (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
+ }
td->td_paused = B_TRUE;
}
@@ -425,6 +425,10 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
int j, err = 0;
zbookmark_phys_t czb;
+ if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
+ object < td->td_resume->zb_object)
+ return (0);
+
if (td->td_flags & TRAVERSE_PRE) {
SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
ZB_DNODE_BLKID);
@@ -501,6 +505,7 @@ traverse_prefetch_thread(void *arg)
td.td_func = traverse_prefetcher;
td.td_arg = td_main->td_pfd;
td.td_pfd = NULL;
+ td.td_resume = &td_main->td_pfd->pd_resume;
SET_BOOKMARK(&czb, td.td_objset,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
@@ -529,12 +534,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
ASSERT(ds == NULL || objset == ds->ds_object);
ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
- /*
- * The data prefetching mechanism (the prefetch thread) is incompatible
- * with resuming from a bookmark.
- */
- ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA));
-
td.td_spa = spa;
td.td_objset = objset;
td.td_rootbp = rootbp;
@@ -554,6 +553,8 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
}
pd.pd_flags = flags;
+ if (resume != NULL)
+ pd.pd_resume = *resume;
mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
@@ -601,11 +602,19 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
* in syncing context).
*/
int
-traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
- blkptr_cb_t func, void *arg)
+traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
+ zbookmark_phys_t *resume,
+ int flags, blkptr_cb_t func, void *arg)
{
return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
- &dsl_dataset_phys(ds)->ds_bp, txg_start, NULL, flags, func, arg));
+ &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
+}
+
+int
+traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
+ int flags, blkptr_cb_t func, void *arg)
+{
+ return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
}
int
@@ -625,7 +634,6 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
blkptr_cb_t func, void *arg)
{
int err;
- uint64_t obj;
dsl_pool_t *dp = spa_get_dsl(spa);
objset_t *mos = dp->dp_meta_objset;
boolean_t hard = (flags & TRAVERSE_HARD);
@@ -637,8 +645,8 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
return (err);
/* visit each dataset */
- for (obj = 1; err == 0;
- err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
+ for (uint64_t obj = 1; err == 0;
+ err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
dmu_object_info_t doi;
err = dmu_object_info(mos, obj, &doi);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
index 0d65896..2242b4a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -49,7 +49,7 @@ uint32_t zfetch_max_streams = 8;
uint32_t zfetch_min_sec_reap = 2;
/* max bytes to prefetch per stream (default 8MB) */
uint32_t zfetch_max_distance = 8 * 1024 * 1024;
-/* number of bytes in a array_read at which we stop prefetching (1MB) */
+/* max number of bytes in an array_read in which we allow prefetching (1MB) */
uint64_t zfetch_array_rd_sz = 1024 * 1024;
SYSCTL_DECL(_vfs_zfs);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
index 0787885..9aee513 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
@@ -424,6 +424,7 @@ dnode_evict_dbufs(dnode_t *dn)
db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
avl_remove(&dn->dn_dbufs, &db_marker);
} else {
+ db->db_pending_evict = TRUE;
mutex_exit(&db->db_mtx);
db_next = AVL_NEXT(&dn->dn_dbufs, db);
}
@@ -437,10 +438,14 @@ void
dnode_evict_bonus(dnode_t *dn)
{
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
- mutex_enter(&dn->dn_bonus->db_mtx);
- dbuf_evict(dn->dn_bonus);
- dn->dn_bonus = NULL;
+ if (dn->dn_bonus != NULL) {
+ if (refcount_is_zero(&dn->dn_bonus->db_holds)) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_evict(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ } else {
+ dn->dn_bonus->db_pending_evict = TRUE;
+ }
}
rw_exit(&dn->dn_struct_rwlock);
}
@@ -492,7 +497,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
dnode_evict_dbufs(dn);
- ASSERT(avl_is_empty(&dn->dn_dbufs));
/*
* XXX - It would be nice to assert this, but we may still
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
index 4fbbe7c..95e5392 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
@@ -34,6 +34,7 @@
#include <sys/dsl_synctask.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_impl.h>
+#include <sys/dmu_send.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/zio.h>
@@ -51,6 +52,10 @@
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/dsl_bookmark.h>
+#include <sys/dmu_send.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <zfs_fletcher.h>
SYSCTL_DECL(_vfs_zfs);
@@ -130,10 +135,16 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
dsl_dataset_phys(ds)->ds_unique_bytes += used;
+
if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
B_TRUE;
}
+
+ spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
+ if (f != SPA_FEATURE_NONE)
+ ds->ds_feature_activation_needed[f] = B_TRUE;
+
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
compressed, uncompressed, tx);
@@ -701,6 +712,7 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
{
boolean_t gotit = FALSE;
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
mutex_enter(&ds->ds_lock);
if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
ds->ds_owner = tag;
@@ -711,6 +723,16 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
return (gotit);
}
+boolean_t
+dsl_dataset_has_owner(dsl_dataset_t *ds)
+{
+ boolean_t rv;
+ mutex_enter(&ds->ds_lock);
+ rv = (ds->ds_owner != NULL);
+ mutex_exit(&ds->ds_lock);
+ return (rv);
+}
+
static void
dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
{
@@ -1657,6 +1679,21 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
+ if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
+ &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
+ &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
+ &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
+ ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
+ ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
+ ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
+ }
+
dmu_objset_sync(ds->ds_objset, zio, tx);
for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
@@ -1712,6 +1749,76 @@ fail:
nvlist_free(propval);
}
+/*
+ * Add the receive_resume_token property to nv if the dataset holds
+ * resumable-receive state; otherwise do nothing.
+ *
+ * Whichever DS_FIELD_RESUME_* entries are present in the dataset's ZAP are
+ * collected into an nvlist, which is then packed, gzip-compressed and
+ * hex-encoded.  The property value has the form
+ *
+ *	<version>-<checksum>-<packed size>-<hex payload>
+ *
+ * where <version> is ZFS_SEND_RESUME_TOKEN_VERSION (decimal), <checksum> is
+ * the first word of the fletcher4 checksum of the compressed payload (hex)
+ * and <packed size> is the size of the packed, uncompressed nvlist (hex).
+ */
+static void
+get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+	/*
+	 * uint64 ZAP fields and the token keys they are exported under.
+	 * The order is the insertion order into the token nvlist.
+	 */
+	static const struct {
+		const char *zap_name;
+		const char *nv_name;
+	} u64_fields[] = {
+		{ DS_FIELD_RESUME_FROMGUID,	"fromguid" },
+		{ DS_FIELD_RESUME_OBJECT,	"object" },
+		{ DS_FIELD_RESUME_OFFSET,	"offset" },
+		{ DS_FIELD_RESUME_BYTES,	"bytes" },
+		{ DS_FIELD_RESUME_TOGUID,	"toguid" },
+	};
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+
+	if (!dsl_dataset_has_resume_receive_state(ds))
+		return;
+
+	nvlist_t *token_nv = fnvlist_alloc();
+
+	/* Each field is optional: it is exported only if the lookup works. */
+	for (size_t f = 0; f < sizeof (u64_fields) / sizeof (u64_fields[0]);
+	    f++) {
+		uint64_t val;
+
+		if (zap_lookup(mos, ds->ds_object, u64_fields[f].zap_name,
+		    sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, u64_fields[f].nv_name,
+			    val);
+		}
+	}
+
+	char buf[256];
+	if (zap_lookup(mos, ds->ds_object, DS_FIELD_RESUME_TONAME,
+	    1, sizeof (buf), buf) == 0) {
+		fnvlist_add_string(token_nv, "toname", buf);
+	}
+	if (zap_contains(mos, ds->ds_object, DS_FIELD_RESUME_EMBEDOK) == 0)
+		fnvlist_add_boolean(token_nv, "embedok");
+
+	size_t packed_size;
+	void *packed = fnvlist_pack(token_nv, &packed_size);
+	fnvlist_free(token_nv);
+
+	/*
+	 * The destination buffer is as large as the source; it is freed
+	 * below with packed_size, not with the (possibly smaller)
+	 * compressed_size.  6 is the gzip compression level.
+	 */
+	uint8_t *compressed = kmem_alloc(packed_size, KM_SLEEP);
+	size_t compressed_size = gzip_compress(packed, compressed,
+	    packed_size, packed_size, 6);
+
+	zio_cksum_t cksum;
+	fletcher_4_native(compressed, compressed_size, NULL, &cksum);
+
+	/* Two hex digits per payload byte plus the terminating NUL. */
+	size_t str_size = compressed_size * 2 + 1;
+	char *str = kmem_alloc(str_size, KM_SLEEP);
+	for (size_t i = 0; i < compressed_size; i++)
+		(void) sprintf(str + i * 2, "%02x", compressed[i]);
+	str[compressed_size * 2] = '\0';
+
+	char *propval = kmem_asprintf("%u-%llx-%llx-%s",
+	    ZFS_SEND_RESUME_TOKEN_VERSION,
+	    (u_longlong_t)cksum.zc_word[0],
+	    (u_longlong_t)packed_size, str);
+	dsl_prop_nvlist_add_string(nv, ZFS_PROP_RECEIVE_RESUME_TOKEN,
+	    propval);
+
+	kmem_free(packed, packed_size);
+	kmem_free(str, str_size);
+	kmem_free(compressed, packed_size);
+	strfree(propval);
+}
+
void
dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
@@ -1783,6 +1890,29 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
}
}
}
+
+ if (!dsl_dataset_is_snapshot(ds)) {
+ /*
+ * A failed "newfs" (e.g. full) resumable receive leaves
+ * the stats set on this dataset. Check here for the prop.
+ */
+ get_receive_resume_stats(ds, nv);
+
+ /*
+ * A failed incremental resumable receive leaves the
+ * stats set on our child named "%recv". Check the child
+ * for the prop.
+ */
+ char recvname[ZFS_MAXNAMELEN];
+ dsl_dataset_t *recv_ds;
+ dsl_dataset_name(ds, recvname);
+ (void) strcat(recvname, "/");
+ (void) strcat(recvname, recv_clone_name);
+ if (dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
+ get_receive_resume_stats(recv_ds, nv);
+ dsl_dataset_rele(recv_ds, FTAG);
+ }
+ }
}
void
@@ -3428,7 +3558,7 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
*/
boolean_t
dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
- uint64_t earlier_txg)
+ uint64_t earlier_txg)
{
dsl_pool_t *dp = later->ds_dir->dd_pool;
int error;
@@ -3467,3 +3597,20 @@ dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
}
+
+/*
+ * Return B_TRUE when the object info obtained from the dataset's dbuf
+ * reports the type DMU_OTN_ZAP_METADATA, and B_FALSE otherwise.
+ */
+boolean_t
+dsl_dataset_is_zapified(dsl_dataset_t *ds)
+{
+	dmu_object_info_t info;
+
+	dmu_object_info_from_db(ds->ds_dbuf, &info);
+	if (info.doi_type == DMU_OTN_ZAP_METADATA)
+		return (B_TRUE);
+	return (B_FALSE);
+}
+
+/*
+ * A dataset has resumable-receive state when it is zapified and its ZAP
+ * contains the DS_FIELD_RESUME_TOGUID entry.  The ZAP is only consulted
+ * once the dataset is known to be zapified.
+ */
+boolean_t
+dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
+{
+	if (!dsl_dataset_is_zapified(ds))
+		return (B_FALSE);
+
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+
+	return (zap_contains(mos, ds->ds_object,
+	    DS_FIELD_RESUME_TOGUID) == 0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
index c7a623c..7de9845 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
@@ -968,9 +968,17 @@ dsl_destroy_inconsistent(const char *dsname, void *arg)
objset_t *os;
if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
- boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+ boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+
+ /*
+ * If the dataset is inconsistent because a resumable receive
+ * has failed, then do not destroy it.
+ */
+ if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
+ need_destroy = B_FALSE;
+
dmu_objset_rele(os, FTAG);
- if (inconsistent)
+ if (need_destroy)
(void) dsl_destroy_head(dsname);
}
return (0);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
index 06cfced..3c6a29b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#include <sys/dsl_scan.h>
@@ -111,6 +111,14 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN,
extern int zfs_txg_timeout;
+/*
+ * Enable/disable the processing of the free_bpobj object.
+ */
+boolean_t zfs_free_bpobj_enabled = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN,
+ &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing");
+
/* the order has to match pool_scan_type */
static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
NULL,
@@ -1460,7 +1468,8 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
* have to worry about traversing it. It is also faster to free the
* blocks than to scrub them.
*/
- if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ if (zfs_free_bpobj_enabled &&
+ spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
scn->scn_is_bptree = B_FALSE;
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
NULL, ZIO_FLAG_MUSTSUCCEED);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c
new file mode 100644
index 0000000..93f1221
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Use is subject to license terms.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/edonr.h>
+
+#define EDONR_MODE 512
+#define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE
+
+/*
+ * Native zio_checksum interface for the Edon-R hash function.
+ *
+ * Hashing continues from a private copy of ctx_template, which must not be
+ * NULL.  The digest is EDONR_MODE / 8 bytes long; only its leading
+ * sizeof (zcp->zc_word) bytes are stored in the checksum.
+ */
+/*ARGSUSED*/
+void
+zio_checksum_edonr_native(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	EdonRState state;
+	uint8_t hash[EDONR_MODE / 8];
+
+	ASSERT(ctx_template != NULL);
+
+	/* Work on a copy so the shared template is left untouched. */
+	bcopy(ctx_template, &state, sizeof (state));
+
+	/* The length is passed as size * 8 (presumably a bit count). */
+	EdonRUpdate(&state, buf, size * 8);
+	EdonRFinal(&state, hash);
+
+	bcopy(hash, zcp->zc_word, sizeof (zcp->zc_word));
+}
+
+/*
+ * Byteswapped zio_checksum interface for the Edon-R hash function.
+ *
+ * Computes the native checksum into a temporary and stores each of its
+ * four 64-bit words, byteswapped, in *zcp.  buf, size and ctx_template are
+ * passed through unchanged to zio_checksum_edonr_native().
+ */
+void
+zio_checksum_edonr_byteswap(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	zio_cksum_t tmp;
+
+	zio_checksum_edonr_native(buf, size, ctx_template, &tmp);
+
+	/*
+	 * Swap from tmp, not from *zcp: *zcp is pure output and holds no
+	 * checksum until it is written here.
+	 */
+	zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+	zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+	zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+	zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
+
+/*
+ * Allocate and return an Edon-R context template keyed with the given salt.
+ * The template is allocated with kmem_zalloc(); the matching release routine
+ * is zio_checksum_edonr_tmpl_free().
+ */
+void *
+zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
+{
+	uint8_t salt_block[EDONR_BLOCK_SIZE];
+	uint8_t *first_half = salt_block;
+	uint8_t *second_half = salt_block + EDONR_MODE / 8;
+	EdonRState *tmpl;
+
+	/*
+	 * Edon-R needs all but the last hash invocation to be on full-size
+	 * blocks, but the salt is too small. Rather than simply padding it
+	 * with zeros, we expand the salt into a new salt block of proper
+	 * size by double-hashing it (the new salt block will be composed of
+	 * H(salt) || H(H(salt))).
+	 */
+	CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8));
+	EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8,
+	    first_half);
+	EdonRHash(EDONR_MODE, first_half, EDONR_MODE, second_half);
+
+	/*
+	 * Feed the new salt block into the hash function - this will serve
+	 * as our MAC key.
+	 */
+	tmpl = kmem_zalloc(sizeof (*tmpl), KM_SLEEP);
+	EdonRInit(tmpl, EDONR_MODE);
+	EdonRUpdate(tmpl, salt_block, sizeof (salt_block) * 8);
+
+	return (tmpl);
+}
+
+/*
+ * Release an Edon-R context template.  The template is zeroed before the
+ * memory is handed back, so the salt-derived state does not linger in it.
+ */
+void
+zio_checksum_edonr_tmpl_free(void *ctx_template)
+{
+	EdonRState *tmpl = ctx_template;
+
+	bzero(tmpl, sizeof (EdonRState));
+	kmem_free(tmpl, sizeof (EdonRState));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index 2f26218..be0a688 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/
@@ -1815,10 +1815,11 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
ASSERT(msp->ms_loaded);
- spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
- "smp size %llu, segments %lu, forcing condense=%s", txg,
- msp->ms_id, msp, space_map_length(msp->ms_sm),
- avl_numnodes(&msp->ms_tree->rt_root),
+ spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
+ "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
+ msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
+ msp->ms_group->mg_vd->vdev_spa->spa_name,
+ space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
msp->ms_condense_wanted ? "TRUE" : "FALSE");
msp->ms_condense_wanted = B_FALSE;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
index 816c09a..a64d6ef 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
@@ -22,6 +22,9 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
#include <sys/zfs_context.h>
#include <sys/zio.h>
#ifdef _KERNEL
@@ -30,8 +33,10 @@
#include <sha256.h>
#endif
+/*ARGSUSED*/
void
-zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
+zio_checksum_SHA256(const void *buf, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
{
SHA256_CTX ctx;
zio_cksum_t tmp;
@@ -52,3 +57,31 @@ zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
zcp->zc_word[2] = BE_64(tmp.zc_word[2]);
zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
}
+
+#ifdef illumos
+/*
+ * Native zio_checksum interface for SHA-2 in SHA512_256 mode.  The digest
+ * is written directly into *zcp; ctx_template is not used.
+ */
+/*ARGSUSED*/
+void
+zio_checksum_SHA512_native(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	SHA2_CTX sha;
+
+	SHA2Init(SHA512_256, &sha);
+	SHA2Update(&sha, buf, size);
+	SHA2Final(zcp, &sha);
+}
+
+/*
+ * Byteswapped counterpart of zio_checksum_SHA512_native(): the native
+ * checksum is computed into a temporary and each of its four 64-bit words
+ * is stored byteswapped in *zcp.
+ */
+/*ARGSUSED*/
+void
+zio_checksum_SHA512_byteswap(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	zio_cksum_t native;
+
+	zio_checksum_SHA512_native(buf, size, ctx_template, &native);
+	for (int w = 0; w < 4; w++)
+		zcp->zc_word[w] = BSWAP_64(native.zc_word[w]);
+}
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c
new file mode 100644
index 0000000..6592340
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/skein.h>
+
+/*
+ * Computes a native 256-bit skein MAC checksum. Please note that this
+ * function requires the presence of a ctx_template that should be allocated
+ * using zio_checksum_skein_tmpl_init.
+ *
+ * Hashing runs on a stack copy of the template; the copy is zeroed before
+ * returning.
+ */
+/*ARGSUSED*/
+void
+zio_checksum_skein_native(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	Skein_512_Ctxt_t work;
+
+	ASSERT(ctx_template != NULL);
+
+	/* Leave the shared template untouched; hash on a private copy. */
+	bcopy(ctx_template, &work, sizeof (work));
+	(void) Skein_512_Update(&work, buf, size);
+	(void) Skein_512_Final(&work, (uint8_t *)zcp);
+
+	/* Scrub the keyed state from the stack. */
+	bzero(&work, sizeof (work));
+}
+
+/*
+ * Byteswapped version of zio_checksum_skein_native. This just invokes
+ * the native checksum function and byteswaps the resulting checksum (since
+ * skein is internally endian-insensitive).
+ */
+void
+zio_checksum_skein_byteswap(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	zio_cksum_t native;
+
+	zio_checksum_skein_native(buf, size, ctx_template, &native);
+	for (int w = 0; w < 4; w++)
+		zcp->zc_word[w] = BSWAP_64(native.zc_word[w]);
+}
+
+/*
+ * Allocates a skein MAC template suitable for using in skein MAC checksum
+ * computations and returns a pointer to it.  The context is initialized
+ * for an output of sizeof (zio_cksum_t) * 8 bits with the salt bytes as
+ * the key.
+ */
+void *
+zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
+{
+	const size_t hash_bits = sizeof (zio_cksum_t) * 8;
+	const size_t key_len = sizeof (salt->zcs_bytes);
+	Skein_512_Ctxt_t *tmpl = kmem_zalloc(sizeof (*tmpl), KM_SLEEP);
+
+	(void) Skein_512_InitExt(tmpl, hash_bits, 0, salt->zcs_bytes,
+	    key_len);
+
+	return (tmpl);
+}
+
+/*
+ * Frees a skein context template previously allocated using
+ * zio_checksum_skein_tmpl_init.  The template is zeroed before its memory
+ * is released.
+ */
+void
+zio_checksum_skein_tmpl_free(void *ctx_template)
+{
+	Skein_512_Ctxt_t *tmpl = ctx_template;
+
+	bzero(tmpl, sizeof (Skein_512_Ctxt_t));
+	kmem_free(tmpl, sizeof (Skein_512_Ctxt_t));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index b57eb95..fd3e537 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -22,9 +22,10 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
*/
/*
@@ -2579,6 +2580,19 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
}
+ /* Grab the secret checksum salt from the MOS. */
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CHECKSUM_SALT, 1,
+ sizeof (spa->spa_cksum_salt.zcs_bytes),
+ spa->spa_cksum_salt.zcs_bytes);
+ if (error == ENOENT) {
+ /* Generate a new salt for subsequent use */
+ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+ sizeof (spa->spa_cksum_salt.zcs_bytes));
+ } else if (error != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
@@ -3747,6 +3761,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa_history_create_obj(spa, tx);
/*
+ * Generate some random noise for salted checksums to operate on.
+ */
+ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+ sizeof (spa->spa_cksum_salt.zcs_bytes));
+
+ /*
* Set pool properties.
*/
spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
@@ -3771,6 +3791,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
txg_wait_synced(spa->spa_dsl_pool, txg);
spa_config_sync(spa, B_FALSE, B_TRUE);
+ spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE);
spa_history_log_version(spa, "create");
@@ -4234,6 +4255,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
spa_configfile_set(spa, props, B_FALSE);
spa_config_sync(spa, B_FALSE, B_TRUE);
+ spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT);
mutex_exit(&spa_namespace_lock);
return (0);
@@ -4364,9 +4386,12 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
*/
spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
- mutex_exit(&spa_namespace_lock);
spa_history_log_version(spa, "import");
+ spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT);
+
+ mutex_exit(&spa_namespace_lock);
+
#ifdef __FreeBSD__
#ifdef _KERNEL
zvol_create_minors(pool);
@@ -4712,6 +4737,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
mutex_enter(&spa_namespace_lock);
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD);
mutex_exit(&spa_namespace_lock);
return (0);
@@ -4906,6 +4932,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
*/
dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+ if (spa->spa_bootfs)
+ spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
+
+ spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH);
+
/*
* Commit the config
*/
@@ -4920,9 +4951,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
spa_strfree(oldvdpath);
spa_strfree(newvdpath);
- if (spa->spa_bootfs)
- spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
-
return (0);
}
@@ -6543,6 +6571,20 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
if (lz4_en && !lz4_ac)
spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
}
+
+ /*
+ * If we haven't written the salt, do so now. Note that the
+ * feature may not be activated yet, but that's fine since
+ * the presence of this ZAP entry is backwards compatible.
+ */
+ if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CHECKSUM_SALT) == ENOENT) {
+ VERIFY0(zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
+ sizeof (spa->spa_cksum_salt.zcs_bytes),
+ spa->spa_cksum_salt.zcs_bytes, tx));
+ }
+
rrw_exit(&dp->dp_config_rwlock, FTAG);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
index baffee0..e18ffd2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
@@ -24,6 +24,7 @@
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -51,7 +52,7 @@
#include <sys/arc.h>
#include <sys/ddt.h>
#include "zfs_prop.h"
-#include "zfeature_common.h"
+#include <sys/zfeature.h>
/*
* SPA locking
@@ -259,7 +260,7 @@ SYSCTL_INT(_debug, OID_AUTO, zfs_flags, CTLFLAG_RWTUN, &zfs_flags, 0,
boolean_t zfs_recover = B_FALSE;
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.recover", &zfs_recover);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0,
+SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0,
"Try to recover from otherwise-fatal errors.");
/*
@@ -608,6 +609,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -770,6 +772,8 @@ spa_remove(spa_t *spa)
for (int t = 0; t < TXG_SIZE; t++)
bplist_destroy(&spa->spa_free_bplist[t]);
+ zio_checksum_templates_free(spa);
+
cv_destroy(&spa->spa_async_cv);
cv_destroy(&spa->spa_evicting_os_cv);
cv_destroy(&spa->spa_proc_cv);
@@ -783,6 +787,7 @@ spa_remove(spa_t *spa)
mutex_destroy(&spa->spa_history_lock);
mutex_destroy(&spa->spa_proc_lock);
mutex_destroy(&spa->spa_props_lock);
+ mutex_destroy(&spa->spa_cksum_tmpls_lock);
mutex_destroy(&spa->spa_scrub_lock);
mutex_destroy(&spa->spa_suspend_lock);
mutex_destroy(&spa->spa_vdev_top_lock);
@@ -1807,7 +1812,13 @@ dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
if (asize != 0 && spa->spa_deflate) {
- vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ vdev_t *vd = vdev_lookup_top(spa, vdev);
+ if (vd == NULL) {
+ panic(
+ "dva_get_dsize_sync(): bad DVA %llu:%llu",
+ (u_longlong_t)vdev, (u_longlong_t)asize);
+ }
dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c
index a508092..3d99059 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -103,7 +103,7 @@ space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
void
space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
- int64_t refcnt)
+ int64_t refcnt)
{
space_reftree_add_node(t, start, refcnt);
space_reftree_add_node(t, end, -refcnt);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
index 482ccb0..233d541 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
@@ -230,9 +230,25 @@ typedef struct dmu_buf_impl {
/* User callback information. */
dmu_buf_user_t *db_user;
- uint8_t db_immediate_evict;
+ /*
+ * Evict user data as soon as the dirty and reference
+ * counts are equal.
+ */
+ uint8_t db_user_immediate_evict;
+
+ /*
+ * This block was freed while a read or write was
+ * active.
+ */
uint8_t db_freed_in_flight;
+ /*
+ * dnode_evict_dbufs() or dnode_evict_bonus() tried to
+ * evict this dbuf, but couldn't due to outstanding
+ * references. Evict once the refcount drops to 0.
+ */
+ uint8_t db_pending_evict;
+
uint8_t db_dirtycnt;
} dmu_buf_impl_t;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
index 3b055cc..6ba19c9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -27,6 +27,7 @@
* Copyright 2013 DEY Storage Systems, Inc.
* Copyright 2014 HybridCluster. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -320,6 +321,7 @@ typedef struct dmu_buf {
#define DMU_POOL_FREE_BPOBJ "free_bpobj"
#define DMU_POOL_BPTREE_OBJ "bptree_obj"
#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"
+#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt"
/*
* Allocate an object from this objset. The range of object numbers
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
index 6f67b5a..e8d6294 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
@@ -25,7 +25,7 @@
/*
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
*/
#ifndef _SYS_DMU_IMPL_H
@@ -300,6 +300,8 @@ typedef struct dmu_sendarg {
uint64_t dsa_featureflags;
uint64_t dsa_last_data_object;
uint64_t dsa_last_data_offset;
+ uint64_t dsa_resume_object;
+ uint64_t dsa_resume_offset;
} dmu_sendarg_t;
void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
index 9e98350..8a263a3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
@@ -93,7 +93,6 @@ struct objset {
uint8_t os_copies;
enum zio_checksum os_dedup_checksum;
boolean_t os_dedup_verify;
- boolean_t os_evicting;
zfs_logbias_op_t os_logbias;
zfs_cache_type_t os_primary_cache;
zfs_cache_type_t os_secondary_cache;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
index 143d43f..2865e82 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
@@ -35,13 +35,16 @@ struct vnode;
struct dsl_dataset;
struct drr_begin;
struct avl_tree;
+struct dmu_replay_record;
-int dmu_send(const char *tosnap, const char *fromsnap,
- boolean_t embedok, boolean_t large_block_ok,
+extern const char *recv_clone_name;
+
+int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+ boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
#ifdef illumos
- int outfd, struct vnode *vp, offset_t *off);
+ struct vnode *vp, offset_t *off);
#else
- int outfd, struct file *fp, offset_t *off);
+ struct file *fp, offset_t *off);
#endif
int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
uint64_t *sizep);
@@ -57,12 +60,14 @@ int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
typedef struct dmu_recv_cookie {
struct dsl_dataset *drc_ds;
+ struct dmu_replay_record *drc_drr_begin;
struct drr_begin *drc_drrb;
const char *drc_tofs;
const char *drc_tosnap;
boolean_t drc_newfs;
boolean_t drc_byteswap;
boolean_t drc_force;
+ boolean_t drc_resumable;
struct avl_tree *drc_guid_to_ds_map;
zio_cksum_t drc_cksum;
uint64_t drc_newsnapobj;
@@ -70,8 +75,9 @@ typedef struct dmu_recv_cookie {
cred_t *drc_cred;
} dmu_recv_cookie_t;
-int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
- boolean_t force, char *origin, dmu_recv_cookie_t *drc);
+int dmu_recv_begin(char *tofs, char *tosnap,
+ struct dmu_replay_record *drr_begin,
+ boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc);
#ifdef illumos
int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
#else
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
index 544b721..c010edd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -54,6 +54,8 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
int traverse_dataset(struct dsl_dataset *ds,
uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
+int traverse_dataset_resume(struct dsl_dataset *ds, uint64_t txg_start,
+ zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg);
int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
uint64_t txg_start, zbookmark_phys_t *resume, int flags,
blkptr_cb_t func, void *arg);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
index c9cd589..d7df05b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -92,6 +92,18 @@ struct dsl_pool;
#define DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks"
/*
+ * These fields are set on datasets that are in the middle of a resumable
+ * receive, and allow the sender to resume the send if it is interrupted.
+ */
+#define DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid"
+#define DS_FIELD_RESUME_TONAME "com.delphix:resume_toname"
+#define DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid"
+#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object"
+#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset"
+#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes"
+#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok"
+
+/*
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
* name lookups should be performed case-insensitively.
*/
@@ -184,6 +196,14 @@ typedef struct dsl_dataset {
kmutex_t ds_sendstream_lock;
list_t ds_sendstreams;
+ /*
+ * When in the middle of a resumable receive, tracks how much
+ * progress we have made.
+ */
+ uint64_t ds_resume_object[TXG_SIZE];
+ uint64_t ds_resume_offset[TXG_SIZE];
+ uint64_t ds_resume_bytes[TXG_SIZE];
+
/* Protected by our dsl_dir's dd_lock */
list_t ds_prop_cbs;
@@ -235,6 +255,7 @@ int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
void dsl_dataset_name(dsl_dataset_t *ds, char *name);
boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag);
+boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds);
uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
@@ -315,6 +336,8 @@ int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
zprop_source_t source, uint64_t value, dmu_tx_t *tx);
void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx);
+boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds);
+boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds);
int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result);
void dsl_dataset_deactivate_feature(uint64_t dsobj,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
index 09f16bc..be00621 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
@@ -23,6 +23,7 @@
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
*/
#ifndef _SYS_SPA_H
@@ -163,6 +164,14 @@ typedef struct zio_cksum {
} zio_cksum_t;
/*
+ * Some checksums/hashes need a 256-bit initialization salt. This salt is kept
+ * secret and is suitable for use in MAC algorithms as the key.
+ */
+typedef struct zio_cksum_salt {
+ uint8_t zcs_bytes[32];
+} zio_cksum_salt_t;
+
+/*
* Each block is described by its DVAs, time of birth, checksum, etc.
* The word-by-word, bit-by-bit layout of the blkptr is as follows:
*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
index 626d9d5..0f9a18b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
@@ -24,6 +24,7 @@
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
*/
#ifndef _SYS_SPA_IMPL_H
@@ -166,6 +167,10 @@ struct spa {
uint64_t spa_syncing_txg; /* txg currently syncing */
bpobj_t spa_deferred_bpobj; /* deferred-free bplist */
bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
+ zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */
+ /* checksum context templates */
+ kmutex_t spa_cksum_tmpls_lock;
+ void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
uberblock_t spa_ubsync; /* last synced uberblock */
uberblock_t spa_uberblock; /* current uberblock */
boolean_t spa_extreme_rewind; /* rewind past deferred frees */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
index f9eca27..20bf545 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -79,14 +79,15 @@ typedef enum drr_headertype {
* Feature flags for zfs send streams (flags in drr_versioninfo)
*/
-#define DMU_BACKUP_FEATURE_DEDUP (1<<0)
-#define DMU_BACKUP_FEATURE_DEDUPPROPS (1<<1)
-#define DMU_BACKUP_FEATURE_SA_SPILL (1<<2)
+#define DMU_BACKUP_FEATURE_DEDUP (1 << 0)
+#define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1)
+#define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2)
/* flags #3 - #15 are reserved for incompatible closed-source implementations */
-#define DMU_BACKUP_FEATURE_EMBED_DATA (1<<16)
-#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1<<17)
+#define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16)
+#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1 << 17)
/* flag #18 is reserved for a Delphix feature */
-#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1<<19)
+#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19)
+#define DMU_BACKUP_FEATURE_RESUMING (1 << 20)
/*
* Mask of all supported backup features
@@ -94,11 +95,16 @@ typedef enum drr_headertype {
#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
+ DMU_BACKUP_FEATURE_RESUMING | \
DMU_BACKUP_FEATURE_LARGE_BLOCKS)
/* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
+typedef enum dmu_send_resume_token_version {
+ ZFS_SEND_RESUME_TOKEN_VERSION = 1
+} dmu_send_resume_token_version_t;
+
/*
* The drr_versioninfo field of the dmu_replay_record has the
* following layout:
@@ -358,14 +364,14 @@ typedef struct zfs_cmd {
zfs_share_t zc_share;
uint64_t zc_jailid;
dmu_objset_stats_t zc_objset_stats;
- struct drr_begin zc_begin_record;
+ dmu_replay_record_t zc_begin_record;
zinject_record_t zc_inject_record;
uint32_t zc_defer_destroy;
uint32_t zc_flags;
uint64_t zc_action_handle;
int zc_cleanup_fd;
uint8_t zc_simple;
- uint8_t zc_pad[3]; /* alignment */
+ boolean_t zc_resumable;
uint64_t zc_sendobj;
uint64_t zc_fromobj;
uint64_t zc_createtxg;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
index ad35273..16d9ad2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -82,6 +82,11 @@ enum zio_checksum {
ZIO_CHECKSUM_SHA256,
ZIO_CHECKSUM_ZILOG2,
ZIO_CHECKSUM_NOPARITY,
+#ifdef illumos
+ ZIO_CHECKSUM_SHA512,
+ ZIO_CHECKSUM_SKEIN,
+ ZIO_CHECKSUM_EDONR,
+#endif
ZIO_CHECKSUM_FUNCTIONS
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
index 0c293ab..0a9d772 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
@@ -20,13 +20,15 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright Saso Kiselkov 2013, All rights reserved.
*/
#ifndef _SYS_ZIO_CHECKSUM_H
#define _SYS_ZIO_CHECKSUM_H
#include <sys/zio.h>
+#include <zfeature_common.h>
#ifdef __cplusplus
extern "C" {
@@ -35,17 +37,34 @@ extern "C" {
/*
* Signature for checksum functions.
*/
-typedef void zio_checksum_func_t(const void *, uint64_t, zio_cksum_t *);
+typedef void zio_checksum_t(const void *data, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp);
+typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
+typedef void zio_checksum_tmpl_free_t(void *ctx_template);
+
+typedef enum zio_checksum_flags {
+ /* Strong enough for metadata? */
+ ZCHECKSUM_FLAG_METADATA = (1 << 1),
+ /* ZIO embedded checksum */
+ ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
+ /* Strong enough for dedup (without verification)? */
+ ZCHECKSUM_FLAG_DEDUP = (1 << 3),
+ /* Uses salt value */
+ ZCHECKSUM_FLAG_SALTED = (1 << 4),
+ /* Strong enough for nopwrite? */
+ ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
+} zio_checksum_flags_t;
/*
* Information about each checksum function.
*/
typedef struct zio_checksum_info {
- zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */
- int ci_correctable; /* number of correctable bits */
- int ci_eck; /* uses zio embedded checksum? */
- boolean_t ci_dedup; /* strong enough for dedup? */
- char *ci_name; /* descriptive name */
+ /* checksum function for each byteorder */
+ zio_checksum_t *ci_func[2];
+ zio_checksum_tmpl_init_t *ci_tmpl_init;
+ zio_checksum_tmpl_free_t *ci_tmpl_free;
+ zio_checksum_flags_t ci_flags;
+ char *ci_name; /* descriptive name */
} zio_checksum_info_t;
typedef struct zio_bad_cksum {
@@ -62,12 +81,30 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
/*
* Checksum routines.
*/
-extern zio_checksum_func_t zio_checksum_SHA256;
+extern zio_checksum_t zio_checksum_SHA256;
+#ifdef illumos
+extern zio_checksum_t zio_checksum_SHA512_native;
+extern zio_checksum_t zio_checksum_SHA512_byteswap;
+
+/* Skein */
+extern zio_checksum_t zio_checksum_skein_native;
+extern zio_checksum_t zio_checksum_skein_byteswap;
+extern zio_checksum_tmpl_init_t zio_checksum_skein_tmpl_init;
+extern zio_checksum_tmpl_free_t zio_checksum_skein_tmpl_free;
+
+/* Edon-R */
+extern zio_checksum_t zio_checksum_edonr_native;
+extern zio_checksum_t zio_checksum_edonr_byteswap;
+extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init;
+extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free;
+#endif
extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
void *data, uint64_t size);
extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
+extern void zio_checksum_templates_free(spa_t *spa);
+extern spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
index dcd63f7..b6eba1a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZRLOCK_H
@@ -44,12 +45,8 @@ typedef struct zrlock {
extern void zrl_init(zrlock_t *);
extern void zrl_destroy(zrlock_t *);
-#ifdef ZFS_DEBUG
-#define zrl_add(_z) zrl_add_debug((_z), __func__)
-extern void zrl_add_debug(zrlock_t *, const char *);
-#else
-extern void zrl_add(zrlock_t *);
-#endif
+#define zrl_add(_z) zrl_add_impl((_z), __func__)
+extern void zrl_add_impl(zrlock_t *, const char *);
extern void zrl_remove(zrlock_t *);
extern int zrl_tryenter(zrlock_t *);
extern void zrl_exit(zrlock_t *);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
index 06824f7..a846487 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -2503,6 +2503,7 @@ int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
+ boolean_t postevent = B_FALSE;
spa_vdev_state_enter(spa, SCL_NONE);
@@ -2512,6 +2513,10 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+ postevent =
+ (vd->vdev_offline == B_TRUE || vd->vdev_tmpoffline == B_TRUE) ?
+ B_TRUE : B_FALSE;
+
tvd = vd->vdev_top;
vd->vdev_offline = B_FALSE;
vd->vdev_tmpoffline = B_FALSE;
@@ -2547,6 +2552,10 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
return (spa_vdev_state_exit(spa, vd, ENOTSUP));
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
+
+ if (postevent)
+ spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE);
+
return (spa_vdev_state_exit(spa, vd, 0));
}
@@ -3403,8 +3412,6 @@ vdev_is_bootable(vdev_t *vd)
strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
return (B_FALSE);
}
- } else if (vd->vdev_wholedisk == 1) {
- return (B_FALSE);
}
for (int c = 0; c < vd->vdev_children; c++) {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
index 966f2fa..9befa75 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
/*
@@ -185,7 +185,7 @@ vdev_label_number(uint64_t psize, uint64_t offset)
static void
vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
- uint64_t size, zio_done_func_t *done, void *private, int flags)
+ uint64_t size, zio_done_func_t *done, void *private, int flags)
{
ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) ==
SCL_STATE_ALL);
@@ -199,7 +199,7 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
static void
vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
- uint64_t size, zio_done_func_t *done, void *private, int flags)
+ uint64_t size, zio_done_func_t *done, void *private, int flags)
{
ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL ||
(spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
index 96358f7..c8c3660 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
/*
@@ -535,7 +535,7 @@ zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
int
zap_entry_update(zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, const void *buf)
+ uint8_t integer_size, uint64_t num_integers, const void *buf)
{
int delta_chunks;
zap_leaf_t *l = zeh->zeh_leaf;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
index 80a3f0b..78b2912 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
@@ -269,7 +269,8 @@ feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
static int
-feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res) {
+feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
+{
uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj;
ASSERT(zfeature_depends_on(feature->fi_feature,
@@ -493,7 +494,8 @@ spa_feature_is_active(spa_t *spa, spa_feature_t fid)
* Returns B_FALSE otherwise (i.e. if the feature is not enabled).
*/
boolean_t
-spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg) {
+spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg)
+{
int err;
ASSERT(VALID_FEATURE_FID(fid));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
index f7c2b6a..e2a3bff 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <sys/types.h>
@@ -806,7 +806,7 @@ zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
*/
int
zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
- boolean_t *unlinkedp)
+ boolean_t *unlinkedp)
{
znode_t *dzp = dl->dl_dzp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
index d3a339e..6d9877b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
@@ -27,7 +27,7 @@
* Copyright 2014 Xin Li <delphij@FreeBSD.org>. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
*/
@@ -185,6 +185,7 @@
#include <sys/dsl_bookmark.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
@@ -3902,11 +3903,6 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
return (SET_ERROR(ENOTSUP));
break;
- case ZFS_PROP_DEDUP:
- if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
- return (SET_ERROR(ENOTSUP));
- break;
-
case ZFS_PROP_RECORDSIZE:
/* Record sizes above 128k need the feature to be enabled */
if (nvpair_value_uint64(pair, &intval) == 0 &&
@@ -3920,7 +3916,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
*/
if (zfs_is_bootfs(dsname) &&
intval > SPA_OLD_MAXBLOCKSIZE) {
- return (SET_ERROR(EDOM));
+ return (SET_ERROR(ERANGE));
}
/*
@@ -3929,7 +3925,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
*/
if (intval > zfs_max_recordsize ||
intval > SPA_MAXBLOCKSIZE)
- return (SET_ERROR(EDOM));
+ return (SET_ERROR(ERANGE));
if ((err = spa_open(dsname, &spa, FTAG)) != 0)
return (err);
@@ -3957,6 +3953,45 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
return (SET_ERROR(ENOTSUP));
}
break;
+
+ case ZFS_PROP_CHECKSUM:
+ case ZFS_PROP_DEDUP:
+ {
+ spa_feature_t feature;
+ spa_t *spa;
+
+ /* dedup feature version checks */
+ if (prop == ZFS_PROP_DEDUP &&
+ zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
+ return (SET_ERROR(ENOTSUP));
+
+ if (nvpair_value_uint64(pair, &intval) != 0)
+ return (SET_ERROR(EINVAL));
+
+ /* check prop value is enabled in features */
+ feature = zio_checksum_to_feature(intval);
+ if (feature == SPA_FEATURE_NONE)
+ break;
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+ /*
+ * Salted checksums are not supported on root pools.
+ */
+ if (spa_bootfs(spa) != 0 &&
+ intval < ZIO_CHECKSUM_FUNCTIONS &&
+ (zio_checksum_table[intval].ci_flags &
+ ZCHECKSUM_FLAG_SALTED)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ERANGE));
+ }
+ if (!spa_feature_is_enabled(spa, feature)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ break;
+ }
}
return (zfs_secpolicy_setprop(dsname, prop, pair, CRED()));
@@ -4155,6 +4190,7 @@ static boolean_t zfs_ioc_recv_inject_err;
* zc_guid force flag
* zc_cleanup_fd cleanup-on-exit file descriptor
* zc_action_handle handle for this guid/ds mapping (or zero on first call)
+ * zc_resumable if data is incomplete assume sender will resume
*
* outputs:
* zc_cookie number of bytes read
@@ -4206,13 +4242,13 @@ zfs_ioc_recv(zfs_cmd_t *zc)
return (SET_ERROR(EBADF));
}
- VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ errors = fnvlist_alloc();
if (zc->zc_string[0])
origin = zc->zc_string;
error = dmu_recv_begin(tofs, tosnap,
- &zc->zc_begin_record, force, origin, &drc);
+ &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc);
if (error != 0)
goto out;
@@ -5431,6 +5467,8 @@ zfs_ioc_unjail(zfs_cmd_t *zc)
* indicates that blocks > 128KB are permitted
* (optional) "embedok" -> (value ignored)
* presence indicates DRR_WRITE_EMBEDDED records are permitted
+ * (optional) "resume_object" and "resume_offset" -> (uint64)
+ * if present, resume send stream from specified object and offset.
* }
*
* outnvl is unused
@@ -5447,6 +5485,8 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
int fd;
boolean_t largeblockok;
boolean_t embedok;
+ uint64_t resumeobj = 0;
+ uint64_t resumeoff = 0;
error = nvlist_lookup_int32(innvl, "fd", &fd);
if (error != 0)
@@ -5457,6 +5497,9 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
largeblockok = nvlist_exists(innvl, "largeblockok");
embedok = nvlist_exists(innvl, "embedok");
+ (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
+ (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
+
#ifdef illumos
file_t *fp = getf(fd);
#else
@@ -5466,11 +5509,11 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
return (SET_ERROR(EBADF));
off = fp->f_offset;
- error = dmu_send(snapname, fromname, embedok, largeblockok,
+ error = dmu_send(snapname, fromname, embedok, largeblockok, fd,
#ifdef illumos
- fd, fp->f_vnode, &off);
+ resumeobj, resumeoff, fp->f_vnode, &off);
#else
- fd, fp, &off);
+ resumeobj, resumeoff, fp, &off);
#endif
#ifdef illumos
@@ -5664,7 +5707,7 @@ zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
static void
zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
- zfs_secpolicy_func_t *secpolicy)
+ zfs_secpolicy_func_t *secpolicy)
{
zfs_ioctl_register_legacy(ioc, func, secpolicy,
DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
@@ -6099,6 +6142,14 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag,
goto out;
}
break;
+ case ZFS_IOCVER_EDBP:
+ if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ compat = B_TRUE;
+ cflag = ZFS_CMD_COMPAT_EDBP;
+ break;
case ZFS_IOCVER_ZCMD:
if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) ||
zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
index 7432290..30b3b52 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
*/
#include <sys/types.h>
@@ -348,7 +349,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
*/
void
zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, char *name, uint64_t foid)
+ znode_t *dzp, char *name, uint64_t foid)
{
itx_t *itx;
lr_remove_t *lr;
@@ -372,7 +373,7 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
*/
void
zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, znode_t *zp, char *name)
+ znode_t *dzp, znode_t *zp, char *name)
{
itx_t *itx;
lr_link_t *lr;
@@ -427,7 +428,7 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
*/
void
zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
+ znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
{
itx_t *itx;
lr_rename_t *lr;
@@ -455,7 +456,7 @@ ssize_t zfs_immediate_write_sz = 32768;
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, offset_t off, ssize_t resid, int ioflag)
+ znode_t *zp, offset_t off, ssize_t resid, int ioflag)
{
itx_wr_state_t write_state;
boolean_t slogging;
@@ -532,7 +533,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
*/
void
zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, uint64_t off, uint64_t len)
+ znode_t *zp, uint64_t off, uint64_t len)
{
itx_t *itx;
lr_truncate_t *lr;
@@ -555,7 +556,7 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
*/
void
zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
+ znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
{
itx_t *itx;
lr_setattr_t *lr;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
index 6e0c243..ae24ef0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <sys/types.h>
@@ -54,7 +54,7 @@
static void
zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
- uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
+ uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
{
VATTR_NULL(vap);
vap->va_mask = (uint_t)mask;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index 3a4f348..5c26efe 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
*/
@@ -4505,7 +4505,7 @@ top:
/* ARGSUSED */
static int
zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
- size_t *lenp, int flags, cred_t *cr)
+ size_t *lenp, int flags, cred_t *cr)
{
pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
return (0);
@@ -4531,7 +4531,7 @@ zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
/* ARGSUSED */
static int
zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
- size_t *lenp, int flags, cred_t *cr)
+ size_t *lenp, int flags, cred_t *cr)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index 8548b2d..cce7fd1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -330,7 +330,7 @@ zio_data_buf_free(void *buf, size_t size)
*/
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
- zio_transform_func_t *transform)
+ zio_transform_func_t *transform)
{
zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
@@ -999,7 +999,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio->io_prop.zp_checksum = checksum;
- if (zio_checksum_table[checksum].ci_eck) {
+ if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
/*
* zec checksums are necessarily destructive -- they modify
* the end of the write buffer to hold the verifier/checksum.
@@ -1068,8 +1068,8 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
- int type, zio_priority_t priority, enum zio_flag flags,
- zio_done_func_t *done, void *private)
+ int type, zio_priority_t priority, enum zio_flag flags,
+ zio_done_func_t *done, void *private)
{
zio_t *zio;
@@ -1211,8 +1211,8 @@ zio_write_bp_init(zio_t *zio)
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
return (ZIO_PIPELINE_CONTINUE);
- ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
- zp->zp_dedup_verify);
+ ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
BP_SET_DEDUP(bp, 1);
@@ -2074,12 +2074,22 @@ zio_write_gang_block(zio_t *pio)
}
/*
- * The zio_nop_write stage in the pipeline determines if allocating
- * a new bp is necessary. By leveraging a cryptographically secure checksum,
- * such as SHA256, we can compare the checksums of the new data and the old
- * to determine if allocating a new block is required. The nopwrite
- * feature can handle writes in either syncing or open context (i.e. zil
- * writes) and as a result is mutually exclusive with dedup.
+ * The zio_nop_write stage in the pipeline determines if allocating a
+ * new bp is necessary. The nopwrite feature can handle writes in
+ * either syncing or open context (i.e. zil writes) and as a result is
+ * mutually exclusive with dedup.
+ *
+ * By leveraging a cryptographically secure checksum, such as SHA256, we
+ * can compare the checksums of the new data and the old to determine if
+ * allocating a new block is required. Note that our requirements for
+ * cryptographic strength are fairly weak: there can't be any accidental
+ * hash collisions, but we don't need to be secure against intentional
+ * (malicious) collisions. To trigger a nopwrite, you have to be able
+ * to write the file to begin with, and triggering an incorrect (hash
+ * collision) nopwrite is no worse than simply writing to the file.
+ * That said, there are no known attacks against the checksum algorithms
+ * used for nopwrite, assuming that the salt and the checksums
+ * themselves remain secret.
*/
static int
zio_nop_write(zio_t *zio)
@@ -2102,7 +2112,8 @@ zio_nop_write(zio_t *zio)
* allocate a new bp.
*/
if (BP_IS_HOLE(bp_orig) ||
- !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
+ !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE) ||
BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
@@ -2114,7 +2125,8 @@ zio_nop_write(zio_t *zio)
* avoid allocating a new bp and issuing any I/O.
*/
if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
- ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
+ ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE);
ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
@@ -2395,7 +2407,8 @@ zio_ddt_write(zio_t *zio)
* we can't resolve it, so just convert to an ordinary write.
* (And automatically e-mail a paper to Nature?)
*/
- if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
+ if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP)) {
zp->zp_checksum = spa_dedup_checksum(spa);
zio_pop_transforms(zio);
zio->io_stage = ZIO_STAGE_OPEN;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
index d1c60c3..6ba64e0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
@@ -20,12 +20,14 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
+#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zil.h>
@@ -59,29 +61,99 @@
* checksum function of the appropriate strength. When reading a block,
* we compare the expected checksum against the actual checksum, which we
* compute via the checksum function specified by BP_GET_CHECKSUM(bp).
+ *
+ * SALTED CHECKSUMS
+ *
+ * To enable the use of less secure hash algorithms with dedup, we
+ * introduce the notion of salted checksums (MACs, really). A salted
+ * checksum is fed both a random 256-bit value (the salt) and the data
+ * to be checksummed. This salt is kept secret (stored on the pool, but
+ * never shown to the user). Thus even if an attacker knew of collision
+ * weaknesses in the hash algorithm, they won't be able to mount a known
+ * plaintext attack on the DDT, since the actual hash value cannot be
+ * known ahead of time. How the salt is used is algorithm-specific
+ * (some might simply prefix it to the data block, others might need to
+ * utilize a full-blown HMAC). On disk the salt is stored in a ZAP
+ * object in the MOS (DMU_POOL_CHECKSUM_SALT).
+ *
+ * CONTEXT TEMPLATES
+ *
+ * Some hashing algorithms need to perform a substantial amount of
+ * initialization work (e.g. salted checksums above may need to pre-hash
+ * the salt) before being able to process data. Performing this
+ * redundant work for each block would be wasteful, so we instead allow
+ * a checksum algorithm to do the work once (the first time it's used)
+ * and then keep this pre-initialized context as a template inside the
+ * spa_t (spa_cksum_tmpls). If the zio_checksum_info_t contains
+ * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to
+ * construct and destruct the pre-initialized checksum context. The
+ * pre-initialized context is then reused during each checksum
+ * invocation and passed to the checksum function.
*/
/*ARGSUSED*/
static void
-zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
+zio_checksum_off(const void *buf, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
{
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
- {{NULL, NULL}, 0, 0, 0, "inherit"},
- {{NULL, NULL}, 0, 0, 0, "on"},
- {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"},
- {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"},
- {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"},
- {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"},
- {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"},
- {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "noparity"},
+ {{NULL, NULL}, NULL, NULL, 0, "inherit"},
+ {{NULL, NULL}, NULL, NULL, 0, "on"},
+ {{zio_checksum_off, zio_checksum_off},
+ NULL, NULL, 0, "off"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+ "label"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+ "gang_header"},
+ {{fletcher_2_native, fletcher_2_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
+ {{fletcher_2_native, fletcher_2_byteswap},
+ NULL, NULL, 0, "fletcher2"},
+ {{fletcher_4_native, fletcher_4_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
+ {{fletcher_4_native, fletcher_4_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
+ {{zio_checksum_off, zio_checksum_off},
+ NULL, NULL, 0, "noparity"},
+#ifdef illumos
+ {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
+ {{zio_checksum_skein_native, zio_checksum_skein_byteswap},
+ zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
+ ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
+ {{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
+ zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
+ ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
+ ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
+#endif
};
+/*
+ * Maps a checksum algorithm to a pool feature flag.
+ *
+ * When built with `illumos' defined, ZIO_CHECKSUM_SHA512,
+ * ZIO_CHECKSUM_SKEIN and ZIO_CHECKSUM_EDONR map to SPA_FEATURE_SHA512,
+ * SPA_FEATURE_SKEIN and SPA_FEATURE_EDONR respectively; every other
+ * value yields SPA_FEATURE_NONE. Without `illumos' the switch is
+ * compiled out and the function always returns SPA_FEATURE_NONE.
+ */
+spa_feature_t
+zio_checksum_to_feature(enum zio_checksum cksum)
+{
+#ifdef illumos
+ switch (cksum) {
+ case ZIO_CHECKSUM_SHA512:
+ return (SPA_FEATURE_SHA512);
+ case ZIO_CHECKSUM_SKEIN:
+ return (SPA_FEATURE_SKEIN);
+ case ZIO_CHECKSUM_EDONR:
+ return (SPA_FEATURE_EDONR);
+ }
+#endif
+ return (SPA_FEATURE_NONE);
+}
+
enum zio_checksum
zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
{
@@ -115,7 +187,8 @@ zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
- ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup ||
+ ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP) ||
(child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
return (child);
@@ -148,21 +221,48 @@ zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
}
/*
+ * Calls the template init function of a checksum which supports context
+ * templates and installs the template into the spa_t.
+ *
+ * Returns without doing anything when the algorithm's table entry has
+ * no ci_tmpl_init callback, or when spa_cksum_tmpls[checksum] is already
+ * non-NULL. That second test is made without holding any lock; the slot
+ * is tested again after taking spa_cksum_tmpls_lock, and ci_tmpl_init is
+ * only called if it is still NULL at that point (presumably so that
+ * racing callers do not both build a template).
+ *
+ * ci_tmpl_init is handed a pointer to spa->spa_cksum_salt. The function
+ * VERIFYs that a matching ci_tmpl_free callback exists and that
+ * ci_tmpl_init returned a non-NULL template.
+ *
+ * `checksum' is used directly as an index into zio_checksum_table and
+ * spa_cksum_tmpls; no range check is performed here.
+ *
+ * NOTE(review): the unlocked fast-path read of spa_cksum_tmpls[] has no
+ * visible memory barrier; confirm that is acceptable on all supported
+ * platforms.
+ */
+static void
+zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
+{
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+ if (ci->ci_tmpl_init == NULL)
+ return;
+ if (spa->spa_cksum_tmpls[checksum] != NULL)
+ return;
+
+ VERIFY(ci->ci_tmpl_free != NULL);
+ mutex_enter(&spa->spa_cksum_tmpls_lock);
+ if (spa->spa_cksum_tmpls[checksum] == NULL) {
+ spa->spa_cksum_tmpls[checksum] =
+ ci->ci_tmpl_init(&spa->spa_cksum_salt);
+ VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
+ }
+ mutex_exit(&spa->spa_cksum_tmpls_lock);
+}
+
+/*
* Generate the checksum.
*/
void
zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
- void *data, uint64_t size)
+ void *data, uint64_t size)
{
blkptr_t *bp = zio->io_bp;
uint64_t offset = zio->io_offset;
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
zio_cksum_t cksum;
+ spa_t *spa = zio->io_spa;
ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
ASSERT(ci->ci_func[0] != NULL);
- if (ci->ci_eck) {
+ zio_checksum_template_init(checksum, spa);
+
+ if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_eck_t *eck;
if (checksum == ZIO_CHECKSUM_ZILOG2) {
@@ -181,10 +281,12 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
else
bp->blk_cksum = eck->zec_cksum;
eck->zec_magic = ZEC_MAGIC;
- ci->ci_func[0](data, size, &cksum);
+ ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+ &cksum);
eck->zec_cksum = cksum;
} else {
- ci->ci_func[0](data, size, &bp->blk_cksum);
+ ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+ &bp->blk_cksum);
}
}
@@ -202,11 +304,14 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
void *data = zio->io_data;
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
zio_cksum_t actual_cksum, expected_cksum, verifier;
+ spa_t *spa = zio->io_spa;
if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
return (SET_ERROR(EINVAL));
- if (ci->ci_eck) {
+ zio_checksum_template_init(checksum, spa);
+
+ if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_eck_t *eck;
if (checksum == ZIO_CHECKSUM_ZILOG2) {
@@ -243,7 +348,8 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
expected_cksum = eck->zec_cksum;
eck->zec_cksum = verifier;
- ci->ci_func[byteswap](data, size, &actual_cksum);
+ ci->ci_func[byteswap](data, size,
+ spa->spa_cksum_tmpls[checksum], &actual_cksum);
eck->zec_cksum = expected_cksum;
if (byteswap)
@@ -253,7 +359,8 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
ASSERT(!BP_IS_GANG(bp));
byteswap = BP_SHOULD_BYTESWAP(bp);
expected_cksum = bp->blk_cksum;
- ci->ci_func[byteswap](data, size, &actual_cksum);
+ ci->ci_func[byteswap](data, size,
+ spa->spa_cksum_tmpls[checksum], &actual_cksum);
}
info->zbc_expected = expected_cksum;
@@ -275,3 +382,23 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
return (0);
}
+
+/*
+ * Called by a spa_t that's about to be deallocated. This steps through
+ * all of the checksum context templates and deallocates any that were
+ * initialized using the algorithm-specific template init function.
+ *
+ * Every index from 0 up to (but not including) ZIO_CHECKSUM_FUNCTIONS is
+ * visited. For each non-NULL entry of spa_cksum_tmpls[], the matching
+ * table entry's ci_tmpl_free callback is VERIFYed to exist, called on
+ * the template, and the slot is then reset to NULL.
+ *
+ * NOTE(review): spa_cksum_tmpls_lock is not taken here; this presumably
+ * relies on no checksum operations being in flight for this spa_t --
+ * confirm against the caller.
+ */
+void
+zio_checksum_templates_free(spa_t *spa)
+{
+ for (enum zio_checksum checksum = 0;
+ checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
+ if (spa->spa_cksum_tmpls[checksum] != NULL) {
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+ VERIFY(ci->ci_tmpl_free != NULL);
+ ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
+ spa->spa_cksum_tmpls[checksum] = NULL;
+ }
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
index 2215184..7f6beee 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
*/
/*
@@ -69,11 +69,7 @@ zrl_destroy(zrlock_t *zrl)
}
void
-#ifdef ZFS_DEBUG
-zrl_add_debug(zrlock_t *zrl, const char *zc)
-#else
-zrl_add(zrlock_t *zrl)
-#endif
+zrl_add_impl(zrlock_t *zrl, const char *zc)
{
uint32_t n = (uint32_t)zrl->zr_refcount;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
index 491c365..ce8eed3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -2336,13 +2336,15 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
vdev_t *vd = spa->spa_root_vdev;
nvlist_t *nv = NULL;
uint64_t version = spa_version(spa);
- enum zio_checksum checksum;
+ uint64_t checksum, compress, refresrv, vbs, dedup;
ASSERT(MUTEX_HELD(&zfsdev_state_lock));
ASSERT(vd->vdev_ops == &vdev_root_ops);
error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
DMU_OBJECT_END);
+ if (error != 0)
+ return (error);
/* wait for dmu_free_long_range to actually free the blocks */
txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
@@ -2366,24 +2368,42 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
2, ZFS_SPACE_CHECK_RESERVED);
}
+ if (!resize) {
+ error = dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
+ if (error == 0) {
+ error = dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum,
+ NULL);
+ }
+ if (error == 0) {
+ error = dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+ &refresrv, NULL);
+ }
+ if (error == 0) {
+ error = dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs,
+ NULL);
+ }
+ if (version >= SPA_VERSION_DEDUP && error == 0) {
+ error = dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
+ }
+ }
+ if (error != 0)
+ return (error);
+
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
dmu_tx_hold_bonus(tx, ZVOL_OBJ);
error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
+ if (error != 0) {
dmu_tx_abort(tx);
return (error);
}
/*
- * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
- * function. Otherwise, use the old default -- OFF.
- */
- checksum = spa_feature_is_active(spa,
- SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
- ZIO_CHECKSUM_OFF;
-
- /*
* If we are resizing the dump device then we only need to
* update the refreservation to match the newly updated
* zvolsize. Otherwise, we save off the original state of the
@@ -2394,37 +2414,30 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
&zv->zv_volsize, tx);
} else {
- uint64_t checksum, compress, refresrv, vbs, dedup;
-
- error = dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
- error = error ? error : dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
- error = error ? error : dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
- error = error ? error : dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
- if (version >= SPA_VERSION_DEDUP) {
- error = error ? error :
- dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
- }
-
- error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
+ error = zap_update(os, ZVOL_ZAP_OBJ,
zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
&compress, tx);
- error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
- error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
- &refresrv, tx);
- error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
- &vbs, tx);
- error = error ? error : dmu_object_set_blocksize(
- os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
- if (version >= SPA_VERSION_DEDUP) {
- error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
+ if (error == 0) {
+ error = zap_update(os, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1,
+ &checksum, tx);
+ }
+ if (error == 0) {
+ error = zap_update(os, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
+ &refresrv, tx);
+ }
+ if (error == 0) {
+ error = zap_update(os, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
+ &vbs, tx);
+ }
+ if (error == 0) {
+ error = dmu_object_set_blocksize(
+ os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
+ }
+ if (version >= SPA_VERSION_DEDUP && error == 0) {
+ error = zap_update(os, ZVOL_ZAP_OBJ,
zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
&dedup, tx);
}
@@ -2437,7 +2450,15 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
* We only need to update the zvol's property if we are initializing
* the dump area for the first time.
*/
- if (!resize) {
+ if (error == 0 && !resize) {
+ /*
+ * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
+ * function. Otherwise, use the old default -- OFF.
+ */
+ checksum = spa_feature_is_active(spa,
+ SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
+ ZIO_CHECKSUM_OFF;
+
VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
@@ -2456,13 +2477,11 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
nv, NULL);
nvlist_free(nv);
-
- if (error)
- return (error);
}
/* Allocate the space for the dump */
- error = zvol_prealloc(zv);
+ if (error == 0)
+ error = zvol_prealloc(zv);
return (error);
}
OpenPOWER on IntegriCloud