diff options
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs')
45 files changed, 1609 insertions, 367 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index b053993..29ef565 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -1134,6 +1134,24 @@ static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); static void l2arc_release_cdata_buf(arc_buf_hdr_t *); +static void +l2arc_trim(const arc_buf_hdr_t *hdr) +{ + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + + ASSERT(HDR_HAS_L2HDR(hdr)); + ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); + + if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET) + return; + if (hdr->b_l2hdr.b_asize != 0) { + trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, + hdr->b_l2hdr.b_asize, 0); + } else { + ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_EMPTY); + } +} + static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { @@ -1555,7 +1573,7 @@ arc_cksum_verify(arc_buf_t *buf) mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return; } - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); @@ -1568,7 +1586,7 @@ arc_cksum_equal(arc_buf_t *buf) int equal; mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); @@ -1588,7 +1606,7 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) } buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, - buf->b_hdr->b_freeze_cksum); + NULL, buf->b_hdr->b_freeze_cksum); mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); #ifdef illumos arc_buf_watch(buf); @@ -2406,10 +2424,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) * want to re-destroy the header's L2 portion. */ if (HDR_HAS_L2HDR(hdr)) { - if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET) - trim_map_free(dev->l2ad_vdev, - hdr->b_l2hdr.b_daddr, - hdr->b_l2hdr.b_asize, 0); + l2arc_trim(hdr); arc_hdr_l2hdr_destroy(hdr); } @@ -4779,10 +4794,7 @@ arc_release(arc_buf_t *buf, void *tag) * to acquire the l2ad_mtx. */ if (HDR_HAS_L2HDR(hdr)) { - if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET) - trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, - hdr->b_l2hdr.b_daddr, - hdr->b_l2hdr.b_asize, 0); + l2arc_trim(hdr); arc_hdr_l2hdr_destroy(hdr); } @@ -5298,6 +5310,16 @@ arc_init(void) arc_c_max = arc_c_min; arc_c_max = MAX(arc_c * 5, arc_c_max); + /* + * In userland, there's only the memory pressure that we artificially + * create (see arc_available_memory()). Don't let arc_c get too + * small, because it can cause transactions to be larger than + * arc_c, causing arc_tempreserve_space() to fail. + */ +#ifndef _KERNEL + arc_c_min = arc_c_max / 2; +#endif + #ifdef _KERNEL /* * Allow the tunables to override our calculations if they are @@ -5959,8 +5981,7 @@ top: * Error - drop L2ARC entry. */ list_remove(buflist, hdr); - trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, - hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); + l2arc_trim(hdr); hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); @@ -6246,7 +6267,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, boolean_t *headroom_boost) { arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_sz, headroom, buf_compress_minsz; + uint64_t write_asize, write_sz, headroom, + buf_compress_minsz; void *buf_data; boolean_t full; l2arc_write_callback_t *cb; @@ -6408,6 +6430,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * using it to denote the header's state change. */ hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; + hdr->b_flags |= ARC_FLAG_HAS_L2HDR; mutex_enter(&dev->l2ad_mtx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index a28d866..f39a353 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -267,7 +267,7 @@ dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) */ ASSERT3U(holds, >=, db->db_dirtycnt); } else { - if (db->db_immediate_evict == TRUE) + if (db->db_user_immediate_evict == TRUE) ASSERT3U(holds, >=, db->db_dirtycnt); else ASSERT3U(holds, >, 0); @@ -1110,6 +1110,32 @@ dbuf_release_bp(dmu_buf_impl_t *db) (void) arc_release(db->db_buf, db); } +/* + * We already have a dirty record for this TXG, and we are being + * dirtied again. + */ +static void +dbuf_redirty(dbuf_dirty_record_t *dr) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { + /* + * If this buffer has already been written out, + * we now need to reset its state. + */ + dbuf_unoverride(dr); + if (db->db.db_object != DMU_META_DNODE_OBJECT && + db->db_state != DB_NOFILL) { + /* Already released on initial dirty, so just thaw. */ + ASSERT(arc_released(db->db_buf)); + arc_buf_thaw(db->db_buf); + } + } +} + dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { @@ -1182,16 +1208,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dr && dr->dr_txg == tx->tx_txg) { DB_DNODE_EXIT(db); - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { - /* - * If this buffer has already been written out, - * we now need to reset its state. - */ - dbuf_unoverride(dr); - if (db->db.db_object != DMU_META_DNODE_OBJECT && - db->db_state != DB_NOFILL) - arc_buf_thaw(db->db_buf); - } + dbuf_redirty(dr); mutex_exit(&db->db_mtx); return (dr); } @@ -1495,6 +1512,30 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); + /* + * Quick check for dirtyness. For already dirty blocks, this + * reduces runtime of this function by >90%, and overall performance + * by 50% for some workloads (e.g. file deletion with indirect blocks + * cached). + */ + mutex_enter(&db->db_mtx); + dbuf_dirty_record_t *dr; + for (dr = db->db_last_dirty; + dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { + /* + * It's possible that it is already dirty but not cached, + * because there are some calls to dbuf_dirty() that don't + * go through dmu_buf_will_dirty(). + */ + if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { + /* This dbuf is already dirty and cached. */ + dbuf_redirty(dr); + mutex_exit(&db->db_mtx); + return; + } + } + mutex_exit(&db->db_mtx); + DB_DNODE_ENTER(db); if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; @@ -1829,8 +1870,9 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_blkptr = blkptr; db->db_user = NULL; - db->db_immediate_evict = 0; - db->db_freed_in_flight = 0; + db->db_user_immediate_evict = FALSE; + db->db_freed_in_flight = FALSE; + db->db_pending_evict = FALSE; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); @@ -2386,12 +2428,13 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) arc_buf_freeze(db->db_buf); if (holds == db->db_dirtycnt && - db->db_level == 0 && db->db_immediate_evict) + db->db_level == 0 && db->db_user_immediate_evict) dbuf_evict_user(db); if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { dnode_t *dn; + boolean_t evict_dbuf = db->db_pending_evict; /* * If the dnode moves here, we cannot cross this @@ -2406,7 +2449,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) * Decrementing the dbuf count means that the bonus * buffer's dnode hold is no longer discounted in * dnode_move(). The dnode cannot move until after - * the dnode_rele_and_unlock() below. + * the dnode_rele() below. */ DB_DNODE_EXIT(db); @@ -2416,35 +2459,10 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) */ mutex_exit(&db->db_mtx); - /* - * If the dnode has been freed, evict the bonus - * buffer immediately. The data in the bonus - * buffer is no longer relevant and this prevents - * a stale bonus buffer from being associated - * with this dnode_t should the dnode_t be reused - * prior to being destroyed. - */ - mutex_enter(&dn->dn_mtx); - if (dn->dn_type == DMU_OT_NONE || - dn->dn_free_txg != 0) { - /* - * Drop dn_mtx. It is a leaf lock and - * cannot be held when dnode_evict_bonus() - * acquires other locks in order to - * perform the eviction. - * - * Freed dnodes cannot be reused until the - * last hold is released. Since this bonus - * buffer has a hold, the dnode will remain - * in the free state, even without dn_mtx - * held, until the dnode_rele_and_unlock() - * below. - */ - mutex_exit(&dn->dn_mtx); + if (evict_dbuf) dnode_evict_bonus(dn); - mutex_enter(&dn->dn_mtx); - } - dnode_rele_and_unlock(dn, db); + + dnode_rele(dn, db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this @@ -2491,7 +2509,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) } else { dbuf_clear(db); } - } else if (db->db_objset->os_evicting || + } else if (db->db_pending_evict || arc_buf_eviction_needed(db->db_buf)) { dbuf_clear(db); } else { @@ -2539,7 +2557,7 @@ dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - db->db_immediate_evict = TRUE; + db->db_user_immediate_evict = TRUE; return (dmu_buf_set_user(db_fake, user)); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c index 7863e6a..d4151bb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -65,7 +65,8 @@ ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, spa_t *spa = ddt->ddt_spa; objset_t *os = ddt->ddt_os; uint64_t *objectp = &ddt->ddt_object[type][class]; - boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup; + boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags & + ZCHECKSUM_FLAG_DEDUP; char name[DDT_NAMELEN]; ddt_object_name(ddt, type, class, name); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index fe6f60d..93a0426 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -1494,7 +1494,8 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) ASSERT(BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); - ASSERT(zio_checksum_table[chksum].ci_dedup); + ASSERT(zio_checksum_table[chksum].ci_flags & + ZCHECKSUM_FLAG_NOPWRITE); } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; @@ -1739,7 +1740,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, - dmu_tx_t *tx) + dmu_tx_t *tx) { dnode_t *dn; int err; @@ -1754,7 +1755,7 @@ dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, - dmu_tx_t *tx) + dmu_tx_t *tx) { dnode_t *dn; @@ -1774,7 +1775,7 @@ dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, - dmu_tx_t *tx) + dmu_tx_t *tx) { dnode_t *dn; @@ -1842,8 +1843,10 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) * as well. Otherwise, the metadata checksum defaults * to fletcher4. */ - if (zio_checksum_table[checksum].ci_correctable < 1 || - zio_checksum_table[checksum].ci_eck) + if (!(zio_checksum_table[checksum].ci_flags & + ZCHECKSUM_FLAG_METADATA) || + (zio_checksum_table[checksum].ci_flags & + ZCHECKSUM_FLAG_EMBEDDED)) checksum = ZIO_CHECKSUM_FLETCHER_4; if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || @@ -1882,17 +1885,20 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) */ if (dedup_checksum != ZIO_CHECKSUM_OFF) { dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; - if (!zio_checksum_table[checksum].ci_dedup) + if (!(zio_checksum_table[checksum].ci_flags & + ZCHECKSUM_FLAG_DEDUP)) dedup_verify = B_TRUE; } /* - * Enable nopwrite if we have a cryptographically secure - * checksum that has no known collisions (i.e. SHA-256) - * and compression is enabled. We don't enable nopwrite if - * dedup is enabled as the two features are mutually exclusive. + * Enable nopwrite if we have secure enough checksum + * algorithm (see comment in zio_nop_write) and + * compression is enabled. We don't enable nopwrite if + * dedup is enabled as the two features are mutually + * exclusive. */ - nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup && + nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & + ZCHECKSUM_FLAG_NOPWRITE) && compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } @@ -1940,7 +1946,8 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) * ID and wait for that to be synced. */ int -dmu_object_wait_synced(objset_t *os, uint64_t object) { +dmu_object_wait_synced(objset_t *os, uint64_t object) +{ dnode_t *dn; int error, i; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index f84ff37..79de1d1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -362,6 +362,17 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, * checksum/compression/copies. */ if (ds != NULL) { + boolean_t needlock = B_FALSE; + + /* + * Note: it's valid to open the objset if the dataset is + * long-held, in which case the pool_config lock will not + * be held. + */ + if (!dsl_pool_config_held(dmu_objset_pool(os))) { + needlock = B_TRUE; + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + } err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); @@ -413,6 +424,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, recordsize_changed_cb, os); } } + if (needlock) + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (err != 0) { VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); @@ -469,6 +482,13 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) { int err = 0; + /* + * We shouldn't be doing anything with dsl_dataset_t's unless the + * pool_config lock is held, or the dataset is long-held. + */ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) || + dsl_dataset_long_held(ds)); + mutex_enter(&ds->ds_opening_lock); if (ds->ds_objset == NULL) { objset_t *os; @@ -686,7 +706,6 @@ dmu_objset_evict(objset_t *os) if (os->os_sa) sa_tear_down(os); - os->os_evicting = B_TRUE; dmu_objset_evict_dbufs(os); mutex_enter(&os->os_lock); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index ef13961..ede1555 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -66,12 +66,14 @@ int zfs_send_queue_length = 16 * 1024 * 1024; int zfs_recv_queue_length = 16 * 1024 * 1024; static char *dmu_recv_tag = "dmu_recv_tag"; -static const char *recv_clone_name = "%recv"; +const char *recv_clone_name = "%recv"; #define BP_SPAN(datablkszsec, indblkshift, level) \ (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (indblkshift - SPA_BLKPTRSHIFT))) +static void byteswap_record(dmu_replay_record_t *drr); + struct send_thread_arg { bqueue_t q; dsl_dataset_t *ds; /* Dataset to traverse */ @@ -79,6 +81,7 @@ struct send_thread_arg { int flags; /* flags to pass to traverse_dataset */ int error_code; boolean_t cancel; + zbookmark_phys_t resume; }; struct send_block_record { @@ -93,7 +96,7 @@ struct send_block_record { static int dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) { - dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; + dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os); struct uio auio; struct iovec aiov; ASSERT0(len % 8); @@ -166,7 +169,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, * that the receiving system doesn't have any dbufs in the range * being freed. This is always true because there is a one-record * constraint: we only send one WRITE record for any given - * object+offset. We know that the one-record constraint is + * object,offset. We know that the one-record constraint is * true because we always send data in increasing order by * object,offset. * @@ -289,7 +292,8 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; } else { drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); - if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) + if (zio_checksum_table[drrw->drr_checksumtype].ci_flags & + ZCHECKSUM_FLAG_DEDUP) drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); @@ -414,6 +418,19 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) { struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); + if (object < dsp->dsa_resume_object) { + /* + * Note: when resuming, we will visit all the dnodes in + * the block of dnodes that we are resuming from. In + * this case it's unnecessary to send the dnodes prior to + * the one we are resuming from. We should be at most one + * block's worth of dnodes behind the resume point. + */ + ASSERT3U(dsp->dsa_resume_object - object, <, + 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); + return (0); + } + if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) return (dump_freeobjects(dsp, object, 1)); @@ -494,6 +511,9 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, uint64_t record_size; int err = 0; + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= sta->resume.zb_object); + if (sta->cancel) return (SET_ERROR(EINTR)); @@ -530,8 +550,10 @@ send_traverse_thread(void *arg) struct send_block_record *data; if (st_arg->ds != NULL) { - err = traverse_dataset(st_arg->ds, st_arg->fromtxg, - st_arg->flags, send_cb, arg); + err = traverse_dataset_resume(st_arg->ds, + st_arg->fromtxg, &st_arg->resume, + st_arg->flags, send_cb, st_arg); + if (err != EINTR) st_arg->error_code = err; } @@ -560,6 +582,9 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) ASSERT3U(zb->zb_level, >=, 0); + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= dsa->dsa_resume_object); + if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); @@ -620,6 +645,10 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) uint64_t offset; ASSERT0(zb->zb_level); + ASSERT(zb->zb_object > dsa->dsa_resume_object || + (zb->zb_object == dsa->dsa_resume_object && + zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { @@ -680,11 +709,13 @@ get_next_record(bqueue_t *bq, struct send_block_record *data) */ static int dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, - zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok, + zfs_bookmark_phys_t *ancestor_zb, + boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd, + uint64_t resumeobj, uint64_t resumeoff, #ifdef illumos - boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) + vnode_t *vp, offset_t *off) #else - boolean_t large_block_ok, int outfd, struct file *fp, offset_t *off) + struct file *fp, offset_t *off) #endif { objset_t *os; @@ -693,7 +724,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, int err; uint64_t fromtxg = 0; uint64_t featureflags = 0; - struct send_thread_arg to_arg; + struct send_thread_arg to_arg = { 0 }; err = dmu_objset_from_ds(to_ds, &os); if (err != 0) { @@ -730,6 +761,10 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; } + if (resumeobj != 0 || resumeoff != 0) { + featureflags |= DMU_BACKUP_FEATURE_RESUMING; + } + DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, featureflags); @@ -766,6 +801,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, dsp->dsa_pending_op = PENDING_NONE; dsp->dsa_incremental = (ancestor_zb != NULL); dsp->dsa_featureflags = featureflags; + dsp->dsa_resume_object = resumeobj; + dsp->dsa_resume_offset = resumeoff; mutex_enter(&to_ds->ds_sendstream_lock); list_insert_head(&to_ds->ds_sendstreams, dsp); @@ -774,7 +811,27 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, dsl_dataset_long_hold(to_ds, FTAG); dsl_pool_rele(dp, tag); - if (dump_record(dsp, NULL, 0) != 0) { + void *payload = NULL; + size_t payload_len = 0; + if (resumeobj != 0 || resumeoff != 0) { + dmu_object_info_t to_doi; + err = dmu_object_info(os, resumeobj, &to_doi); + if (err != 0) + goto out; + SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0, + resumeoff / to_doi.doi_data_block_size); + + nvlist_t *nvl = fnvlist_alloc(); + fnvlist_add_uint64(nvl, "resume_object", resumeobj); + fnvlist_add_uint64(nvl, "resume_offset", resumeoff); + payload = fnvlist_pack(nvl, &payload_len); + drr->drr_payloadlen = payload_len; + fnvlist_free(nvl); + } + + err = dump_record(dsp, payload, payload_len); + fnvlist_pack_free(payload, payload_len); + if (err != 0) { err = dsp->dsa_err; goto out; } @@ -889,22 +946,22 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, outfd, fp, off); + embedok, large_block_ok, outfd, 0, 0, fp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, outfd, fp, off); + embedok, large_block_ok, outfd, 0, 0, fp, off); } dsl_dataset_rele(ds, FTAG); return (err); } int -dmu_send(const char *tosnap, const char *fromsnap, - boolean_t embedok, boolean_t large_block_ok, +dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, + boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff, #ifdef illumos - int outfd, vnode_t *vp, offset_t *off) + vnode_t *vp, offset_t *off) #else - int outfd, struct file *fp, offset_t *off) + struct file *fp, offset_t *off) #endif { dsl_pool_t *dp; @@ -972,10 +1029,12 @@ dmu_send(const char *tosnap, const char *fromsnap, return (err); } err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, outfd, fp, off); + embedok, large_block_ok, + outfd, resumeobj, resumeoff, fp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, outfd, fp, off); + embedok, large_block_ok, + outfd, resumeobj, resumeoff, fp, off); } if (owned) dsl_dataset_disown(ds, FTAG); @@ -1218,6 +1277,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || @@ -1230,6 +1290,10 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) spa_version(dp->dp_spa) < SPA_VERSION_SA) return (SET_ERROR(ENOTSUP)); + if (drba->drba_cookie->drc_resumable && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) + return (SET_ERROR(ENOTSUP)); + /* * The receiving code doesn't know how to translate a WRITE_EMBEDDED * record to a plan WRITE record, so the pool must have the @@ -1333,15 +1397,16 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; struct drr_begin *drrb = drba->drba_cookie->drc_drrb; const char *tofs = drba->drba_cookie->drc_tofs; dsl_dataset_t *ds, *newds; uint64_t dsobj; int error; - uint64_t crflags; + uint64_t crflags = 0; - crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? - DS_FLAG_CI_DATASET : 0; + if (drrb->drr_flags & DRR_FLAG_CI_DATA) + crflags |= DS_FLAG_CI_DATASET; error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { @@ -1379,6 +1444,31 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) } VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); + if (drba->drba_cookie->drc_resumable) { + dsl_dataset_zapify(newds, tx); + if (drrb->drr_fromguid != 0) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, + 8, 1, &drrb->drr_fromguid, tx)); + } + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, + 8, 1, &drrb->drr_toguid, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, + 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); + uint64_t one = 1; + uint64_t zero = 0; + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, + 8, 1, &one, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, + 8, 1, &zero, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, + 8, 1, &zero, tx)); + if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_EMBED_DATA) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, + 8, 1, &one, tx)); + } + } + dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; @@ -1396,56 +1486,192 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) spa_history_log_internal_ds(newds, "receive", tx, ""); } +static int +dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + int error; + uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + dsl_dataset_t *ds; + const char *tofs = drba->drba_cookie->drc_tofs; + + /* already checked */ + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING); + + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM || + drrb->drr_type >= DMU_OST_NUMTYPES) + return (SET_ERROR(EINVAL)); + + /* Verify pool version supports SA if SA_SPILL feature set */ + if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && + spa_version(dp->dp_spa) < SPA_VERSION_SA) + return (SET_ERROR(ENOTSUP)); + + /* + * The receiving code doesn't know how to translate a WRITE_EMBEDDED + * record to a plain WRITE record, so the pool must have the + * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED + * records. Same with WRITE_EMBEDDED records that use LZ4 compression. + */ + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) + return (SET_ERROR(ENOTSUP)); + + char recvname[ZFS_MAXNAMELEN]; + + (void) snprintf(recvname, sizeof (recvname), "%s/%s", + tofs, recv_clone_name); + + if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { + /* %recv does not exist; continue in tofs */ + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if (error != 0) + return (error); + } + + /* check that ds is marked inconsistent */ + if (!DS_IS_INCONSISTENT(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* check that there is resuming data, and that the toguid matches */ + if (!dsl_dataset_is_zapified(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + uint64_t val; + error = zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); + if (error != 0 || drrb->drr_toguid != val) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * Check if the receive is still running. If so, it will be owned. + * Note that nothing else can own the dataset (e.g. after the receive + * fails) because it will be marked inconsistent. + */ + if (dsl_dataset_has_owner(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EBUSY)); + } + + /* There should not be any snapshots of this fs yet. */ + if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * Note: resume point will be checked when we process the first WRITE + * record. + */ + + /* check that the origin matches */ + val = 0; + (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); + if (drrb->drr_fromguid != val) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + const char *tofs = drba->drba_cookie->drc_tofs; + dsl_dataset_t *ds; + uint64_t dsobj; + char recvname[ZFS_MAXNAMELEN]; + + (void) snprintf(recvname, sizeof (recvname), "%s/%s", + tofs, recv_clone_name); + + if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { + /* %recv does not exist; continue in tofs */ + VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds)); + drba->drba_cookie->drc_newfs = B_TRUE; + } + + /* clear the inconsistent flag so that we can own it */ + ASSERT(DS_IS_INCONSISTENT(ds)); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; + dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + + VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds)); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; + + ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds))); + + drba->drba_cookie->drc_ds = ds; + + spa_history_log_internal_ds(ds, "resume receive", tx, ""); +} + /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. */ int -dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, - boolean_t force, char *origin, dmu_recv_cookie_t *drc) +dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, + boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc) { dmu_recv_begin_arg_t drba = { 0 }; - dmu_replay_record_t *drr; bzero(drc, sizeof (dmu_recv_cookie_t)); - drc->drc_drrb = drrb; + drc->drc_drr_begin = drr_begin; + drc->drc_drrb = &drr_begin->drr_u.drr_begin; drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; + drc->drc_resumable = resumable; drc->drc_cred = CRED(); - if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) + if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; - else if (drrb->drr_magic != DMU_BACKUP_MAGIC) - return (SET_ERROR(EINVAL)); - - drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); - drr->drr_type = DRR_BEGIN; - drr->drr_u.drr_begin = *drc->drc_drrb; - if (drc->drc_byteswap) { - fletcher_4_incremental_byteswap(drr, + fletcher_4_incremental_byteswap(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); - } else { - fletcher_4_incremental_native(drr, + byteswap_record(drr_begin); + } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { + fletcher_4_incremental_native(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); - } - kmem_free(drr, sizeof (dmu_replay_record_t)); - - if (drc->drc_byteswap) { - drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); - drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); - drrb->drr_type = BSWAP_32(drrb->drr_type); - drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); - drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); + } else { + return (SET_ERROR(EINVAL)); } drba.drba_origin = origin; drba.drba_cookie = drc; drba.drba_cred = CRED(); - return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, - &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_RESUMING) { + return (dsl_sync_task(tofs, + dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, + &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + } else { + return (dsl_sync_task(tofs, + dmu_recv_begin_check, dmu_recv_begin_sync, + &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + } } struct receive_record_arg { @@ -1457,6 +1683,7 @@ struct receive_record_arg { */ arc_buf_t *write_buf; int payload_size; + uint64_t bytes_read; /* bytes read from stream when record created */ boolean_t eos_marker; /* Marks the end of the stream */ bqueue_node_t node; }; @@ -1465,6 +1692,7 @@ struct receive_writer_arg { objset_t *os; boolean_t byteswap; bqueue_t q; + /* * These three args are used to signal to the main thread that we're * done. @@ -1472,9 +1700,13 @@ struct receive_writer_arg { kmutex_t mutex; kcondvar_t cv; boolean_t done; + int err; /* A map from guid to dataset to help handle dedup'd streams. */ avl_tree_t *guid_to_ds_map; + boolean_t resumable; + uint64_t last_object, last_offset; + uint64_t bytes_read; /* bytes read when current record created */ }; struct receive_arg { @@ -1482,6 +1714,7 @@ struct receive_arg { kthread_t *td; struct file *fp; uint64_t voff; /* The current offset in the stream */ + uint64_t bytes_read; /* * A record that has had its payload read in, but hasn't yet been handed * off to the worker thread. @@ -1577,14 +1810,21 @@ receive_read(struct receive_arg *ra, int len, void *buf) ra->err = restore_bytes(ra, buf + done, len - done, ra->voff, &resid); - if (resid == len - done) - ra->err = SET_ERROR(EINVAL); + if (resid == len - done) { + /* + * Note: ECKSUM indicates that the receive + * was interrupted and can potentially be resumed. + */ + ra->err = SET_ERROR(ECKSUM); + } ra->voff += len - done - resid; done = len - resid; if (ra->err != 0) return (ra->err); } + ra->bytes_read += len; + ASSERT3U(done, ==, len); return (0); } @@ -1685,6 +1925,43 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) } } +static void +save_resume_state(struct receive_writer_arg *rwa, + uint64_t object, uint64_t offset, dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + if (!rwa->resumable) + return; + + /* + * We use ds_resume_bytes[] != 0 to indicate that we need to + * update this on disk, so it must not be 0. + */ + ASSERT(rwa->bytes_read != 0); + + /* + * We only resume from write records, which have a valid + * (non-meta-dnode) object number. + */ + ASSERT(object != 0); + + /* + * For resuming to work correctly, we must receive records in order, + * sorted by object,offset. This is checked by the callers, but + * assert it here for good measure. + */ + ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); + ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || + offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); + ASSERT3U(rwa->bytes_read, >=, + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); + + rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; + rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; +} + static int receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, void *data) @@ -1781,6 +2058,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, dmu_buf_rele(db, FTAG); } dmu_tx_commit(tx); + return (0); } @@ -1806,6 +2084,7 @@ receive_freeobjects(struct receive_writer_arg *rwa, if (err != 0) return (err); } + return (0); } @@ -1820,6 +2099,18 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); + /* + * For resuming to work, records must be in increasing order + * by (object, offset). + */ + if (drrw->drr_object < rwa->last_object || + (drrw->drr_object == rwa->last_object && + drrw->drr_offset < rwa->last_offset)) { + return (SET_ERROR(EINVAL)); + } + rwa->last_object = drrw->drr_object; + rwa->last_offset = drrw->drr_offset; + if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); @@ -1843,8 +2134,17 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) return (SET_ERROR(EINVAL)); dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); + + /* + * Note: If the receive fails, we want the resume stream to start + * with the same record that we last successfully received (as opposed + * to the next record), so that we can verify that we are + * resuming from the correct location. + */ + save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); dmu_tx_commit(tx); dmu_buf_rele(bonus, FTAG); + return (0); } @@ -1903,43 +2203,48 @@ receive_write_byref(struct receive_writer_arg *rwa, dmu_write(rwa->os, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); dmu_buf_rele(dbp, FTAG); + + /* See comment in restore_write. */ + save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx); dmu_tx_commit(tx); return (0); } static int receive_write_embedded(struct receive_writer_arg *rwa, - struct drr_write_embedded *drrwnp, void *data) + struct drr_write_embedded *drrwe, void *data) { dmu_tx_t *tx; int err; - if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) + if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) return (EINVAL); - if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE) + if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) return (EINVAL); - if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES) + if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) return (EINVAL); - if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) + if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); tx = dmu_tx_create(rwa->os); - dmu_tx_hold_write(tx, drrwnp->drr_object, - drrwnp->drr_offset, drrwnp->drr_length); + dmu_tx_hold_write(tx, drrwe->drr_object, + drrwe->drr_offset, drrwe->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } - dmu_write_embedded(rwa->os, drrwnp->drr_object, - drrwnp->drr_offset, data, drrwnp->drr_etype, - drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, + dmu_write_embedded(rwa->os, drrwe->drr_object, + drrwe->drr_offset, data, drrwe->drr_etype, + drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); + /* See comment in restore_write. */ + save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); dmu_tx_commit(tx); return (0); } @@ -2013,10 +2318,16 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { - char name[MAXNAMELEN]; - dsl_dataset_name(drc->drc_ds, name); - dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); - (void) dsl_destroy_head(name); + if (drc->drc_resumable) { + /* wait for our resume state to be written to disk */ + txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0); + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + } else { + char name[MAXNAMELEN]; + dsl_dataset_name(drc->drc_ds, name); + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + (void) dsl_destroy_head(name); + } } static void @@ -2043,12 +2354,17 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) if (len != 0) { ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); - ra->rrd->payload = buf; - ra->rrd->payload_size = len; - err = receive_read(ra, len, ra->rrd->payload); + err = receive_read(ra, len, buf); if (err != 0) return (err); - receive_cksum(ra, len, ra->rrd->payload); + receive_cksum(ra, len, buf); + + /* note: rrd is NULL when reading the begin record's payload */ + if (ra->rrd != NULL) { + ra->rrd->payload = buf; + ra->rrd->payload_size = len; + ra->rrd->bytes_read = ra->bytes_read; + } } ra->prev_cksum = ra->cksum; @@ -2056,6 +2372,7 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); err = receive_read(ra, sizeof (ra->next_rrd->header), &ra->next_rrd->header); + ra->next_rrd->bytes_read = ra->bytes_read; if (err != 0) { kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); ra->next_rrd = NULL; @@ -2235,7 +2552,7 @@ receive_read_record(struct receive_arg *ra) { struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) - return (SET_ERROR(EINVAL)); + return (SET_ERROR(ECKSUM)); return (0); } case DRR_SPILL: @@ -2262,6 +2579,10 @@ receive_process_record(struct receive_writer_arg *rwa, { int err; + /* Processing in order, therefore bytes_read should be increasing. */ + ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); + rwa->bytes_read = rrd->bytes_read; + switch (rrd->header.drr_type) { case DRR_OBJECT: { @@ -2357,6 +2678,33 @@ receive_writer_thread(void *arg) thread_exit(); } +static int +resume_check(struct receive_arg *ra, nvlist_t *begin_nvl) +{ + uint64_t val; + objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset; + uint64_t dsobj = dmu_objset_id(ra->os); + uint64_t resume_obj, resume_off; + + if (nvlist_lookup_uint64(begin_nvl, + "resume_object", &resume_obj) != 0 || + nvlist_lookup_uint64(begin_nvl, + "resume_offset", &resume_off) != 0) { + return (SET_ERROR(EINVAL)); + } + VERIFY0(zap_lookup(mos, dsobj, + DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val)); + if (resume_obj != val) + return (SET_ERROR(EINVAL)); + VERIFY0(zap_lookup(mos, dsobj, + DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val)); + if (resume_off != val) + return (SET_ERROR(EINVAL)); + + return (0); +} + + /* * Read in the stream's records, one by one, and apply them to the pool. There * are two threads involved; the thread that calls this function will spin up a @@ -2377,12 +2725,20 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, struct receive_arg ra = { 0 }; struct receive_writer_arg rwa = { 0 }; int featureflags; + nvlist_t *begin_nvl = NULL; ra.byteswap = drc->drc_byteswap; ra.cksum = drc->drc_cksum; ra.td = curthread; ra.fp = fp; ra.voff = *voffp; + + if (dsl_dataset_is_zapified(drc->drc_ds)) { + (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, + drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, + sizeof (ra.bytes_read), 1, &ra.bytes_read); + } + list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node), offsetof(struct receive_ign_obj_node, node)); @@ -2435,9 +2791,29 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, drc->drc_guid_to_ds_map = rwa.guid_to_ds_map; } - err = receive_read_payload_and_next_header(&ra, 0, NULL); - if (err) + uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen; + void *payload = NULL; + if (payloadlen != 0) + payload = kmem_alloc(payloadlen, KM_SLEEP); + + err = receive_read_payload_and_next_header(&ra, payloadlen, payload); + if (err != 0) { + if (payloadlen != 0) + kmem_free(payload, payloadlen); goto out; + } + if (payloadlen != 0) { + err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP); + kmem_free(payload, payloadlen); + if (err != 0) + goto out; + } + + if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { + err = resume_check(&ra, begin_nvl); + if (err != 0) + goto out; + } (void) bqueue_init(&rwa.q, zfs_recv_queue_length, offsetof(struct receive_record_arg, node)); @@ -2445,6 +2821,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL); rwa.os = ra.os; rwa.byteswap = drc->drc_byteswap; + rwa.resumable = drc->drc_resumable; (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, &p0, TS_RUN, minclsyspri); @@ -2503,13 +2880,15 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, err = rwa.err; out: + nvlist_free(begin_nvl); if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) zfs_onexit_fd_rele(cleanup_fd); if (err != 0) { /* - * destroy what we created, so we don't leave it in the - * inconsistent restoring state. + * Clean up references. If receive is not resumable, + * destroy what we created, so we don't leave it in + * the inconsistent state. */ dmu_recv_cleanup_ds(drc); } @@ -2669,6 +3048,20 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; + if (dsl_dataset_has_resume_receive_state(ds)) { + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OBJECT, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OFFSET, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_BYTES, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TONAME, tx); + } } drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c index 151d04c..2c718df 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -48,6 +48,7 @@ typedef struct prefetch_data { int pd_flags; boolean_t pd_cancel; boolean_t pd_exited; + zbookmark_phys_t pd_resume; } prefetch_data_t; typedef struct traverse_data { @@ -307,59 +308,52 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, arc_flags_t flags = ARC_FLAG_WAIT; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - dnode_phys_t *cdnp; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; - cdnp = buf->b_data; + dnode_phys_t *child_dnp = buf->b_data; for (i = 0; i < epb; i++) { - prefetch_dnode_metadata(td, &cdnp[i], zb->zb_objset, - zb->zb_blkid * epb + i); + prefetch_dnode_metadata(td, &child_dnp[i], + zb->zb_objset, zb->zb_blkid * epb + i); } /* recursively visitbp() blocks below this */ for (i = 0; i < epb; i++) { - err = traverse_dnode(td, &cdnp[i], zb->zb_objset, - zb->zb_blkid * epb + i); + err = traverse_dnode(td, &child_dnp[i], + zb->zb_objset, zb->zb_blkid * epb + i); if (err != 0) break; } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; - objset_phys_t *osp; - dnode_phys_t *mdnp, *gdnp, *udnp; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; - osp = buf->b_data; - mdnp = &osp->os_meta_dnode; - gdnp = &osp->os_groupused_dnode; - udnp = &osp->os_userused_dnode; - - prefetch_dnode_metadata(td, mdnp, zb->zb_objset, + objset_phys_t *osp = buf->b_data; + prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset, DMU_META_DNODE_OBJECT); if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { - prefetch_dnode_metadata(td, gdnp, zb->zb_objset, - DMU_GROUPUSED_OBJECT); - prefetch_dnode_metadata(td, udnp, zb->zb_objset, - DMU_USERUSED_OBJECT); + prefetch_dnode_metadata(td, &osp->os_groupused_dnode, + zb->zb_objset, DMU_GROUPUSED_OBJECT); + prefetch_dnode_metadata(td, &osp->os_userused_dnode, + zb->zb_objset, DMU_USERUSED_OBJECT); } - err = traverse_dnode(td, mdnp, zb->zb_objset, + err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset, DMU_META_DNODE_OBJECT); if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { - err = traverse_dnode(td, gdnp, zb->zb_objset, - DMU_GROUPUSED_OBJECT); + err = traverse_dnode(td, &osp->os_groupused_dnode, + zb->zb_objset, DMU_GROUPUSED_OBJECT); } if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { - err = traverse_dnode(td, udnp, zb->zb_objset, - DMU_USERUSED_OBJECT); + err = traverse_dnode(td, &osp->os_userused_dnode, + zb->zb_objset, DMU_USERUSED_OBJECT); } } @@ -391,9 +385,15 @@ post: * Set the bookmark to the first level-0 block that we need * to visit. This way, the resuming code does not need to * deal with resuming from indirect blocks. + * + * Note, if zb_level <= 0, dnp may be NULL, so we don't want + * to dereference it. */ - td->td_resume->zb_blkid = zb->zb_blkid << - (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); + td->td_resume->zb_blkid = zb->zb_blkid; + if (zb->zb_level > 0) { + td->td_resume->zb_blkid <<= zb->zb_level * + (dnp->dn_indblkshift - SPA_BLKPTRSHIFT); + } td->td_paused = B_TRUE; } @@ -425,6 +425,10 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, int j, err = 0; zbookmark_phys_t czb; + if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL && + object < td->td_resume->zb_object) + return (0); + if (td->td_flags & TRAVERSE_PRE) { SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, ZB_DNODE_BLKID); @@ -501,6 +505,7 @@ traverse_prefetch_thread(void *arg) td.td_func = traverse_prefetcher; td.td_arg = td_main->td_pfd; td.td_pfd = NULL; + td.td_resume = &td_main->td_pfd->pd_resume; SET_BOOKMARK(&czb, td.td_objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); @@ -529,12 +534,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, ASSERT(ds == NULL || objset == ds->ds_object); ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); - /* - * The data prefetching mechanism (the prefetch thread) is incompatible - * with resuming from a bookmark. - */ - ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA)); - td.td_spa = spa; td.td_objset = objset; td.td_rootbp = rootbp; @@ -554,6 +553,8 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, } pd.pd_flags = flags; + if (resume != NULL) + pd.pd_resume = *resume; mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); @@ -601,11 +602,19 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, * in syncing context). */ int -traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, - blkptr_cb_t func, void *arg) +traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start, + zbookmark_phys_t *resume, + int flags, blkptr_cb_t func, void *arg) { return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, - &dsl_dataset_phys(ds)->ds_bp, txg_start, NULL, flags, func, arg)); + &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg)); +} + +int +traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, + int flags, blkptr_cb_t func, void *arg) +{ + return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg)); } int @@ -625,7 +634,6 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { int err; - uint64_t obj; dsl_pool_t *dp = spa_get_dsl(spa); objset_t *mos = dp->dp_meta_objset; boolean_t hard = (flags & TRAVERSE_HARD); @@ -637,8 +645,8 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, return (err); /* visit each dataset */ - for (obj = 1; err == 0; - err = dmu_object_next(mos, &obj, FALSE, txg_start)) { + for (uint64_t obj = 1; err == 0; + err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) { dmu_object_info_t doi; err = dmu_object_info(mos, obj, &doi); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c index 0d65896..2242b4a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -49,7 +49,7 @@ uint32_t zfetch_max_streams = 8; uint32_t zfetch_min_sec_reap = 2; /* max bytes to prefetch per stream (default 8MB) */ uint32_t zfetch_max_distance = 8 * 1024 * 1024; -/* number of bytes in a array_read at which we stop prefetching (1MB) */ +/* max number of bytes in an array_read in which we allow prefetching (1MB) */ uint64_t zfetch_array_rd_sz = 1024 * 1024; SYSCTL_DECL(_vfs_zfs); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 0787885..9aee513 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -424,6 +424,7 @@ dnode_evict_dbufs(dnode_t *dn) db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker); avl_remove(&dn->dn_dbufs, &db_marker); } else { + db->db_pending_evict = TRUE; mutex_exit(&db->db_mtx); db_next = AVL_NEXT(&dn->dn_dbufs, db); } @@ -437,10 +438,14 @@ void dnode_evict_bonus(dnode_t *dn) { rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { - mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); - dn->dn_bonus = NULL; + if (dn->dn_bonus != NULL) { + if (refcount_is_zero(&dn->dn_bonus->db_holds)) { + mutex_enter(&dn->dn_bonus->db_mtx); + dbuf_evict(dn->dn_bonus); + dn->dn_bonus = NULL; + } else { + dn->dn_bonus->db_pending_evict = TRUE; + } } rw_exit(&dn->dn_struct_rwlock); } @@ -492,7 +497,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); dnode_evict_dbufs(dn); - ASSERT(avl_is_empty(&dn->dn_dbufs)); /* * XXX - It would be nice to assert this, but we may still diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index 4fbbe7c..95e5392 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -34,6 +34,7 @@ #include <sys/dsl_synctask.h> #include <sys/dmu_traverse.h> #include <sys/dmu_impl.h> +#include <sys/dmu_send.h> #include <sys/dmu_tx.h> #include <sys/arc.h> #include <sys/zio.h> @@ -51,6 +52,10 @@ #include <sys/dsl_destroy.h> #include <sys/dsl_userhold.h> #include <sys/dsl_bookmark.h> +#include <sys/dmu_send.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <zfs_fletcher.h> SYSCTL_DECL(_vfs_zfs); @@ -130,10 +135,16 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) dsl_dataset_phys(ds)->ds_compressed_bytes += compressed; dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed; dsl_dataset_phys(ds)->ds_unique_bytes += used; + if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) { ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] = B_TRUE; } + + spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); + if (f != SPA_FEATURE_NONE) + ds->ds_feature_activation_needed[f] = B_TRUE; + mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, compressed, uncompressed, tx); @@ -701,6 +712,7 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) { boolean_t gotit = FALSE; + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); mutex_enter(&ds->ds_lock); if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) { ds->ds_owner = tag; @@ -711,6 +723,16 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) return (gotit); } +boolean_t +dsl_dataset_has_owner(dsl_dataset_t *ds) +{ + boolean_t rv; + mutex_enter(&ds->ds_lock); + rv = (ds->ds_owner != NULL); + mutex_exit(&ds->ds_lock); + return (rv); +} + static void dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) { @@ -1657,6 +1679,21 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid; + if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) { + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1, + &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1, + &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1, + &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx)); + ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0; + ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0; + ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; + } + dmu_objset_sync(ds->ds_objset, zio, tx); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { @@ -1712,6 +1749,76 @@ fail: nvlist_free(propval); } +static void +get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (dsl_dataset_has_resume_receive_state(ds)) { + char *str; + void *packed; + uint8_t *compressed; + uint64_t val; + nvlist_t *token_nv = fnvlist_alloc(); + size_t packed_size, compressed_size; + + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "fromguid", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "object", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "offset", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "bytes", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "toguid", val); + } + char buf[256]; + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) { + fnvlist_add_string(token_nv, "toname", buf); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_EMBEDOK) == 0) { + fnvlist_add_boolean(token_nv, "embedok"); + } + packed = fnvlist_pack(token_nv, &packed_size); + fnvlist_free(token_nv); + compressed = kmem_alloc(packed_size, KM_SLEEP); + + compressed_size = gzip_compress(packed, compressed, + packed_size, packed_size, 6); + + zio_cksum_t cksum; + fletcher_4_native(compressed, compressed_size, NULL, &cksum); + + str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP); + for (int i = 0; i < compressed_size; i++) { + (void) sprintf(str + i * 2, "%02x", compressed[i]); + } + str[compressed_size * 2] = '\0'; + char *propval = kmem_asprintf("%u-%llx-%llx-%s", + ZFS_SEND_RESUME_TOKEN_VERSION, + (longlong_t)cksum.zc_word[0], + (longlong_t)packed_size, str); + dsl_prop_nvlist_add_string(nv, + ZFS_PROP_RECEIVE_RESUME_TOKEN, propval); + kmem_free(packed, packed_size); + kmem_free(str, compressed_size * 2 + 1); + kmem_free(compressed, packed_size); + strfree(propval); + } +} + void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { @@ -1783,6 +1890,29 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) } } } + + if (!dsl_dataset_is_snapshot(ds)) { + /* + * A failed "newfs" (e.g. full) resumable receive leaves + * the stats set on this dataset. Check here for the prop. + */ + get_receive_resume_stats(ds, nv); + + /* + * A failed incremental resumable receive leaves the + * stats set on our child named "%recv". Check the child + * for the prop. + */ + char recvname[ZFS_MAXNAMELEN]; + dsl_dataset_t *recv_ds; + dsl_dataset_name(ds, recvname); + (void) strcat(recvname, "/"); + (void) strcat(recvname, recv_clone_name); + if (dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) { + get_receive_resume_stats(recv_ds, nv); + dsl_dataset_rele(recv_ds, FTAG); + } + } } void @@ -3428,7 +3558,7 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, */ boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, - uint64_t earlier_txg) + uint64_t earlier_txg) { dsl_pool_t *dp = later->ds_dir->dd_pool; int error; @@ -3467,3 +3597,20 @@ dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx) objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx); } + +boolean_t +dsl_dataset_is_zapified(dsl_dataset_t *ds) +{ + dmu_object_info_t doi; + + dmu_object_info_from_db(ds->ds_dbuf, &doi); + return (doi.doi_type == DMU_OTN_ZAP_METADATA); +} + +boolean_t +dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds) +{ + return (dsl_dataset_is_zapified(ds) && + zap_contains(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c index c7a623c..7de9845 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c @@ -968,9 +968,17 @@ dsl_destroy_inconsistent(const char *dsname, void *arg) objset_t *os; if (dmu_objset_hold(dsname, FTAG, &os) == 0) { - boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os)); + boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os)); + + /* + * If the dataset is inconsistent because a resumable receive + * has failed, then do not destroy it. + */ + if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os))) + need_destroy = B_FALSE; + dmu_objset_rele(os, FTAG); - if (inconsistent) + if (need_destroy) (void) dsl_destroy_head(dsname); } return (0); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c index 06cfced..3c6a29b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ #include <sys/dsl_scan.h> @@ -111,6 +111,14 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN, extern int zfs_txg_timeout; +/* + * Enable/disable the processing of the free_bpobj object. + */ +boolean_t zfs_free_bpobj_enabled = B_TRUE; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN, + &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing"); + /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, @@ -1460,7 +1468,8 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) * have to worry about traversing it. It is also faster to free the * blocks than to scrub them. */ - if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + if (zfs_free_bpobj_enabled && + spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c new file mode 100644 index 0000000..93f1221 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/edonr_zfs.c @@ -0,0 +1,102 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Use is subject to license terms. + */ +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/edonr.h> + +#define EDONR_MODE 512 +#define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE + +/* + * Native zio_checksum interface for the Edon-R hash function. + */ +/*ARGSUSED*/ +void +zio_checksum_edonr_native(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + uint8_t digest[EDONR_MODE / 8]; + EdonRState ctx; + + ASSERT(ctx_template != NULL); + bcopy(ctx_template, &ctx, sizeof (ctx)); + EdonRUpdate(&ctx, buf, size * 8); + EdonRFinal(&ctx, digest); + bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word)); +} + +/* + * Byteswapped zio_checksum interface for the Edon-R hash function. + */ +void +zio_checksum_edonr_byteswap(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + zio_cksum_t tmp; + + zio_checksum_edonr_native(buf, size, ctx_template, &tmp); + zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]); + zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]); + zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]); + zcp->zc_word[3] = BSWAP_64(zcp->zc_word[3]); +} + +void * +zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) +{ + EdonRState *ctx; + uint8_t salt_block[EDONR_BLOCK_SIZE]; + + /* + * Edon-R needs all but the last hash invocation to be on full-size + * blocks, but the salt is too small. Rather than simply padding it + * with zeros, we expand the salt into a new salt block of proper + * size by double-hashing it (the new salt block will be composed of + * H(salt) || H(H(salt))). + */ + CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8)); + EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8, + salt_block); + EdonRHash(EDONR_MODE, salt_block, EDONR_MODE, salt_block + + EDONR_MODE / 8); + + /* + * Feed the new salt block into the hash function - this will serve + * as our MAC key. + */ + ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); + EdonRInit(ctx, EDONR_MODE); + EdonRUpdate(ctx, salt_block, sizeof (salt_block) * 8); + return (ctx); +} + +void +zio_checksum_edonr_tmpl_free(void *ctx_template) +{ + EdonRState *ctx = ctx_template; + + bzero(ctx, sizeof (*ctx)); + kmem_free(ctx, sizeof (*ctx)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c index 2f26218..be0a688 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ @@ -1815,10 +1815,11 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) ASSERT(msp->ms_loaded); - spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, " - "smp size %llu, segments %lu, forcing condense=%s", txg, - msp->ms_id, msp, space_map_length(msp->ms_sm), - avl_numnodes(&msp->ms_tree->rt_root), + spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, " + "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, + msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, + msp->ms_group->mg_vd->vdev_spa->spa_name, + space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c index 816c09a..a64d6ef 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c @@ -22,6 +22,9 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + */ #include <sys/zfs_context.h> #include <sys/zio.h> #ifdef _KERNEL @@ -30,8 +33,10 @@ #include <sha256.h> #endif +/*ARGSUSED*/ void -zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp) +zio_checksum_SHA256(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) { SHA256_CTX ctx; zio_cksum_t tmp; @@ -52,3 +57,31 @@ zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp) zcp->zc_word[2] = BE_64(tmp.zc_word[2]); zcp->zc_word[3] = BE_64(tmp.zc_word[3]); } + +#ifdef illumos +/*ARGSUSED*/ +void +zio_checksum_SHA512_native(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + SHA2_CTX ctx; + + SHA2Init(SHA512_256, &ctx); + SHA2Update(&ctx, buf, size); + SHA2Final(zcp, &ctx); +} + +/*ARGSUSED*/ +void +zio_checksum_SHA512_byteswap(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + zio_cksum_t tmp; + + zio_checksum_SHA512_native(buf, size, ctx_template, &tmp); + zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); + zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); + zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); + zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); +} +#endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c new file mode 100644 index 0000000..6592340 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/skein_zfs.c @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + */ +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/skein.h> + +/* + * Computes a native 256-bit skein MAC checksum. Please note that this + * function requires the presence of a ctx_template that should be allocated + * using zio_checksum_skein_tmpl_init. + */ +/*ARGSUSED*/ +void +zio_checksum_skein_native(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + Skein_512_Ctxt_t ctx; + + ASSERT(ctx_template != NULL); + bcopy(ctx_template, &ctx, sizeof (ctx)); + (void) Skein_512_Update(&ctx, buf, size); + (void) Skein_512_Final(&ctx, (uint8_t *)zcp); + bzero(&ctx, sizeof (ctx)); +} + +/* + * Byteswapped version of zio_checksum_skein_native. This just invokes + * the native checksum function and byteswaps the resulting checksum (since + * skein is internally endian-insensitive). + */ +void +zio_checksum_skein_byteswap(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + zio_cksum_t tmp; + + zio_checksum_skein_native(buf, size, ctx_template, &tmp); + zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); + zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); + zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); + zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); +} + +/* + * Allocates a skein MAC template suitable for using in skein MAC checksum + * computations and returns a pointer to it. + */ +void * +zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) +{ + Skein_512_Ctxt_t *ctx; + + ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); + (void) Skein_512_InitExt(ctx, sizeof (zio_cksum_t) * 8, 0, + salt->zcs_bytes, sizeof (salt->zcs_bytes)); + return (ctx); +} + +/* + * Frees a skein context template previously allocated using + * zio_checksum_skein_tmpl_init. + */ +void +zio_checksum_skein_tmpl_free(void *ctx_template) +{ + Skein_512_Ctxt_t *ctx = ctx_template; + + bzero(ctx, sizeof (*ctx)); + kmem_free(ctx, sizeof (*ctx)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index b57eb95..fd3e537 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -22,9 +22,10 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. */ /* @@ -2579,6 +2580,19 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); } + /* Grab the secret checksum salt from the MOS. */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CHECKSUM_SALT, 1, + sizeof (spa->spa_cksum_salt.zcs_bytes), + spa->spa_cksum_salt.zcs_bytes); + if (error == ENOENT) { + /* Generate a new salt for subsequent use */ + (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, + sizeof (spa->spa_cksum_salt.zcs_bytes)); + } else if (error != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); @@ -3747,6 +3761,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_history_create_obj(spa, tx); /* + * Generate some random noise for salted checksums to operate on. + */ + (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, + sizeof (spa->spa_cksum_salt.zcs_bytes)); + + /* * Set pool properties. */ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); @@ -3771,6 +3791,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, txg_wait_synced(spa->spa_dsl_pool, txg); spa_config_sync(spa, B_FALSE, B_TRUE); + spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE); spa_history_log_version(spa, "create"); @@ -4234,6 +4255,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) spa_configfile_set(spa, props, B_FALSE); spa_config_sync(spa, B_FALSE, B_TRUE); + spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); mutex_exit(&spa_namespace_lock); return (0); @@ -4364,9 +4386,12 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) */ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); - mutex_exit(&spa_namespace_lock); spa_history_log_version(spa, "import"); + spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); + + mutex_exit(&spa_namespace_lock); + #ifdef __FreeBSD__ #ifdef _KERNEL zvol_create_minors(pool); @@ -4712,6 +4737,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) mutex_enter(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD); mutex_exit(&spa_namespace_lock); return (0); @@ -4906,6 +4932,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + if (spa->spa_bootfs) + spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); + + spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH); + /* * Commit the config */ @@ -4920,9 +4951,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) spa_strfree(oldvdpath); spa_strfree(newvdpath); - if (spa->spa_bootfs) - spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); - return (0); } @@ -6543,6 +6571,20 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) if (lz4_en && !lz4_ac) spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); } + + /* + * If we haven't written the salt, do so now. Note that the + * feature may not be activated yet, but that's fine since + * the presence of this ZAP entry is backwards compatible. + */ + if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CHECKSUM_SALT) == ENOENT) { + VERIFY0(zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, + sizeof (spa->spa_cksum_salt.zcs_bytes), + spa->spa_cksum_salt.zcs_bytes, tx)); + } + rrw_exit(&dp->dp_config_rwlock, FTAG); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index baffee0..e18ffd2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -24,6 +24,7 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. */ #include <sys/zfs_context.h> @@ -51,7 +52,7 @@ #include <sys/arc.h> #include <sys/ddt.h> #include "zfs_prop.h" -#include "zfeature_common.h" +#include <sys/zfeature.h> /* * SPA locking @@ -259,7 +260,7 @@ SYSCTL_INT(_debug, OID_AUTO, zfs_flags, CTLFLAG_RWTUN, &zfs_flags, 0, boolean_t zfs_recover = B_FALSE; SYSCTL_DECL(_vfs_zfs); TUNABLE_INT("vfs.zfs.recover", &zfs_recover); -SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0, +SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0, "Try to recover from otherwise-fatal errors."); /* @@ -608,6 +609,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); @@ -770,6 +772,8 @@ spa_remove(spa_t *spa) for (int t = 0; t < TXG_SIZE; t++) bplist_destroy(&spa->spa_free_bplist[t]); + zio_checksum_templates_free(spa); + cv_destroy(&spa->spa_async_cv); cv_destroy(&spa->spa_evicting_os_cv); cv_destroy(&spa->spa_proc_cv); @@ -783,6 +787,7 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); + mutex_destroy(&spa->spa_cksum_tmpls_lock); mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); @@ -1807,7 +1812,13 @@ dva_get_dsize_sync(spa_t *spa, const dva_t *dva) ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (asize != 0 && spa->spa_deflate) { - vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + uint64_t vdev = DVA_GET_VDEV(dva); + vdev_t *vd = vdev_lookup_top(spa, vdev); + if (vd == NULL) { + panic( + "dva_get_dsize_sync(): bad DVA %llu:%llu", + (u_longlong_t)vdev, (u_longlong_t)asize); + } dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c index a508092..3d99059 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -103,7 +103,7 @@ space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt) void space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, - int64_t refcnt) + int64_t refcnt) { space_reftree_add_node(t, start, refcnt); space_reftree_add_node(t, end, -refcnt); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h index 482ccb0..233d541 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -230,9 +230,25 @@ typedef struct dmu_buf_impl { /* User callback information. */ dmu_buf_user_t *db_user; - uint8_t db_immediate_evict; + /* + * Evict user data as soon as the dirty and reference + * counts are equal. + */ + uint8_t db_user_immediate_evict; + + /* + * This block was freed while a read or write was + * active. + */ uint8_t db_freed_in_flight; + /* + * dnode_evict_dbufs() or dnode_evict_bonus() tried to + * evict this dbuf, but couldn't due to outstanding + * references. Evict once the refcount drops to 0. + */ + uint8_t db_pending_evict; + uint8_t db_dirtycnt; } dmu_buf_impl_t; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index 3b055cc..6ba19c9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -27,6 +27,7 @@ * Copyright 2013 DEY Storage Systems, Inc. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -320,6 +321,7 @@ typedef struct dmu_buf { #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" +#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt" /* * Allocate an object from this objset. The range of object numbers diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h index 6f67b5a..e8d6294 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h @@ -25,7 +25,7 @@ /* * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_IMPL_H @@ -300,6 +300,8 @@ typedef struct dmu_sendarg { uint64_t dsa_featureflags; uint64_t dsa_last_data_object; uint64_t dsa_last_data_offset; + uint64_t dsa_resume_object; + uint64_t dsa_resume_offset; } dmu_sendarg_t; void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h index 9e98350..8a263a3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h @@ -93,7 +93,6 @@ struct objset { uint8_t os_copies; enum zio_checksum os_dedup_checksum; boolean_t os_dedup_verify; - boolean_t os_evicting; zfs_logbias_op_t os_logbias; zfs_cache_type_t os_primary_cache; zfs_cache_type_t os_secondary_cache; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h index 143d43f..2865e82 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h @@ -35,13 +35,16 @@ struct vnode; struct dsl_dataset; struct drr_begin; struct avl_tree; +struct dmu_replay_record; -int dmu_send(const char *tosnap, const char *fromsnap, - boolean_t embedok, boolean_t large_block_ok, +extern const char *recv_clone_name; + +int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, + boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff, #ifdef illumos - int outfd, struct vnode *vp, offset_t *off); + struct vnode *vp, offset_t *off); #else - int outfd, struct file *fp, offset_t *off); + struct file *fp, offset_t *off); #endif int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, uint64_t *sizep); @@ -57,12 +60,14 @@ int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, typedef struct dmu_recv_cookie { struct dsl_dataset *drc_ds; + struct dmu_replay_record *drc_drr_begin; struct drr_begin *drc_drrb; const char *drc_tofs; const char *drc_tosnap; boolean_t drc_newfs; boolean_t drc_byteswap; boolean_t drc_force; + boolean_t drc_resumable; struct avl_tree *drc_guid_to_ds_map; zio_cksum_t drc_cksum; uint64_t drc_newsnapobj; @@ -70,8 +75,9 @@ typedef struct dmu_recv_cookie { cred_t *drc_cred; } dmu_recv_cookie_t; -int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, - boolean_t force, char *origin, dmu_recv_cookie_t *drc); +int dmu_recv_begin(char *tofs, char *tosnap, + struct dmu_replay_record *drr_begin, + boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc); #ifdef illumos int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, #else diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h index 544b721..c010edd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h @@ -54,6 +54,8 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); +int traverse_dataset_resume(struct dsl_dataset *ds, uint64_t txg_start, + zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg); int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, uint64_t txg_start, zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h index c9cd589..d7df05b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -92,6 +92,18 @@ struct dsl_pool; #define DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks" /* + * These fields are set on datasets that are in the middle of a resumable + * receive, and allow the sender to resume the send if it is interrupted. + */ +#define DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid" +#define DS_FIELD_RESUME_TONAME "com.delphix:resume_toname" +#define DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid" +#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object" +#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset" +#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes" +#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok" + +/* * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose * name lookups should be performed case-insensitively. */ @@ -184,6 +196,14 @@ typedef struct dsl_dataset { kmutex_t ds_sendstream_lock; list_t ds_sendstreams; + /* + * When in the middle of a resumable receive, tracks how much + * progress we have made. + */ + uint64_t ds_resume_object[TXG_SIZE]; + uint64_t ds_resume_offset[TXG_SIZE]; + uint64_t ds_resume_bytes[TXG_SIZE]; + /* Protected by our dsl_dir's dd_lock */ list_t ds_prop_cbs; @@ -235,6 +255,7 @@ int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); void dsl_dataset_name(dsl_dataset_t *ds, char *name); boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag); +boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds); uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, @@ -315,6 +336,8 @@ int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, zprop_source_t source, uint64_t value, dmu_tx_t *tx); void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx); +boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds); +boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds); int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result); void dsl_dataset_deactivate_feature(uint64_t dsobj, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index 09f16bc..be00621 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -23,6 +23,7 @@ * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. */ #ifndef _SYS_SPA_H @@ -163,6 +164,14 @@ typedef struct zio_cksum { } zio_cksum_t; /* + * Some checksums/hashes need a 256-bit initialization salt. This salt is kept + * secret and is suitable for use in MAC algorithms as the key. + */ +typedef struct zio_cksum_salt { + uint8_t zcs_bytes[32]; +} zio_cksum_salt_t; + +/* * Each block is described by its DVAs, time of birth, checksum, etc. * The word-by-word, bit-by-bit layout of the blkptr is as follows: * diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index 626d9d5..0f9a18b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -24,6 +24,7 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. */ #ifndef _SYS_SPA_IMPL_H @@ -166,6 +167,10 @@ struct spa { uint64_t spa_syncing_txg; /* txg currently syncing */ bpobj_t spa_deferred_bpobj; /* deferred-free bplist */ bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ + zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */ + /* checksum context templates */ + kmutex_t spa_cksum_tmpls_lock; + void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS]; uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ boolean_t spa_extreme_rewind; /* rewind past deferred frees */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h index f9eca27..20bf545 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -79,14 +79,15 @@ typedef enum drr_headertype { * Feature flags for zfs send streams (flags in drr_versioninfo) */ -#define DMU_BACKUP_FEATURE_DEDUP (1<<0) -#define DMU_BACKUP_FEATURE_DEDUPPROPS (1<<1) -#define DMU_BACKUP_FEATURE_SA_SPILL (1<<2) +#define DMU_BACKUP_FEATURE_DEDUP (1 << 0) +#define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1) +#define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2) /* flags #3 - #15 are reserved for incompatible closed-source implementations */ -#define DMU_BACKUP_FEATURE_EMBED_DATA (1<<16) -#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1<<17) +#define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16) +#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1 << 17) /* flag #18 is reserved for a Delphix feature */ -#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1<<19) +#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) +#define DMU_BACKUP_FEATURE_RESUMING (1 << 20) /* * Mask of all supported backup features @@ -94,11 +95,16 @@ typedef enum drr_headertype { #define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \ + DMU_BACKUP_FEATURE_RESUMING | \ DMU_BACKUP_FEATURE_LARGE_BLOCKS) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) +typedef enum dmu_send_resume_token_version { + ZFS_SEND_RESUME_TOKEN_VERSION = 1 +} dmu_send_resume_token_version_t; + /* * The drr_versioninfo field of the dmu_replay_record has the * following layout: @@ -358,14 +364,14 @@ typedef struct zfs_cmd { zfs_share_t zc_share; uint64_t zc_jailid; dmu_objset_stats_t zc_objset_stats; - struct drr_begin zc_begin_record; + dmu_replay_record_t zc_begin_record; zinject_record_t zc_inject_record; uint32_t zc_defer_destroy; uint32_t zc_flags; uint64_t zc_action_handle; int zc_cleanup_fd; uint8_t zc_simple; - uint8_t zc_pad[3]; /* alignment */ + boolean_t zc_resumable; uint64_t zc_sendobj; uint64_t zc_fromobj; uint64_t zc_createtxg; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index ad35273..16d9ad2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -82,6 +82,11 @@ enum zio_checksum { ZIO_CHECKSUM_SHA256, ZIO_CHECKSUM_ZILOG2, ZIO_CHECKSUM_NOPARITY, +#ifdef illumos + ZIO_CHECKSUM_SHA512, + ZIO_CHECKSUM_SKEIN, + ZIO_CHECKSUM_EDONR, +#endif ZIO_CHECKSUM_FUNCTIONS }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h index 0c293ab..0a9d772 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h @@ -20,13 +20,15 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2015 by Delphix. All rights reserved. + * Copyright Saso Kiselkov 2013, All rights reserved. */ #ifndef _SYS_ZIO_CHECKSUM_H #define _SYS_ZIO_CHECKSUM_H #include <sys/zio.h> +#include <zfeature_common.h> #ifdef __cplusplus extern "C" { @@ -35,17 +37,34 @@ extern "C" { /* * Signature for checksum functions. */ -typedef void zio_checksum_func_t(const void *, uint64_t, zio_cksum_t *); +typedef void zio_checksum_t(const void *data, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp); +typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); +typedef void zio_checksum_tmpl_free_t(void *ctx_template); + +typedef enum zio_checksum_flags { + /* Strong enough for metadata? */ + ZCHECKSUM_FLAG_METADATA = (1 << 1), + /* ZIO embedded checksum */ + ZCHECKSUM_FLAG_EMBEDDED = (1 << 2), + /* Strong enough for dedup (without verification)? */ + ZCHECKSUM_FLAG_DEDUP = (1 << 3), + /* Uses salt value */ + ZCHECKSUM_FLAG_SALTED = (1 << 4), + /* Strong enough for nopwrite? */ + ZCHECKSUM_FLAG_NOPWRITE = (1 << 5) +} zio_checksum_flags_t; /* * Information about each checksum function. */ typedef struct zio_checksum_info { - zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */ - int ci_correctable; /* number of correctable bits */ - int ci_eck; /* uses zio embedded checksum? */ - boolean_t ci_dedup; /* strong enough for dedup? */ - char *ci_name; /* descriptive name */ + /* checksum function for each byteorder */ + zio_checksum_t *ci_func[2]; + zio_checksum_tmpl_init_t *ci_tmpl_init; + zio_checksum_tmpl_free_t *ci_tmpl_free; + zio_checksum_flags_t ci_flags; + char *ci_name; /* descriptive name */ } zio_checksum_info_t; typedef struct zio_bad_cksum { @@ -62,12 +81,30 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; /* * Checksum routines. */ -extern zio_checksum_func_t zio_checksum_SHA256; +extern zio_checksum_t zio_checksum_SHA256; +#ifdef illumos +extern zio_checksum_t zio_checksum_SHA512_native; +extern zio_checksum_t zio_checksum_SHA512_byteswap; + +/* Skein */ +extern zio_checksum_t zio_checksum_skein_native; +extern zio_checksum_t zio_checksum_skein_byteswap; +extern zio_checksum_tmpl_init_t zio_checksum_skein_tmpl_init; +extern zio_checksum_tmpl_free_t zio_checksum_skein_tmpl_free; + +/* Edon-R */ +extern zio_checksum_t zio_checksum_edonr_native; +extern zio_checksum_t zio_checksum_edonr_byteswap; +extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init; +extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free; +#endif extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, void *data, uint64_t size); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); +extern void zio_checksum_templates_free(spa_t *spa); +extern spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h index dcd63f7..b6eba1a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. */ #ifndef _SYS_ZRLOCK_H @@ -44,12 +45,8 @@ typedef struct zrlock { extern void zrl_init(zrlock_t *); extern void zrl_destroy(zrlock_t *); -#ifdef ZFS_DEBUG -#define zrl_add(_z) zrl_add_debug((_z), __func__) -extern void zrl_add_debug(zrlock_t *, const char *); -#else -extern void zrl_add(zrlock_t *); -#endif +#define zrl_add(_z) zrl_add_impl((_z), __func__) +extern void zrl_add_impl(zrlock_t *, const char *); extern void zrl_remove(zrlock_t *); extern int zrl_tryenter(zrlock_t *); extern void zrl_exit(zrlock_t *); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c index 06824f7..a846487 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -2503,6 +2503,7 @@ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; + boolean_t postevent = B_FALSE; spa_vdev_state_enter(spa, SCL_NONE); @@ -2512,6 +2513,10 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + postevent = + (vd->vdev_offline == B_TRUE || vd->vdev_tmpoffline == B_TRUE) ? + B_TRUE : B_FALSE; + tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; @@ -2547,6 +2552,10 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } + + if (postevent) + spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE); + return (spa_vdev_state_exit(spa, vd, 0)); } @@ -3403,8 +3412,6 @@ vdev_is_bootable(vdev_t *vd) strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { return (B_FALSE); } - } else if (vd->vdev_wholedisk == 1) { - return (B_FALSE); } for (int c = 0; c < vd->vdev_children; c++) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c index 966f2fa..9befa75 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ /* @@ -185,7 +185,7 @@ vdev_label_number(uint64_t psize, uint64_t offset) static void vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private, int flags) + uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); @@ -199,7 +199,7 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, static void vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private, int flags) + uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) == diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c index 96358f7..c8c3660 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ /* @@ -535,7 +535,7 @@ zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen, int zap_entry_update(zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, const void *buf) + uint8_t integer_size, uint64_t num_integers, const void *buf) { int delta_chunks; zap_leaf_t *l = zeh->zeh_leaf; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c index 80a3f0b..78b2912 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c @@ -269,7 +269,8 @@ feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, static int -feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res) { +feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res) +{ uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj; ASSERT(zfeature_depends_on(feature->fi_feature, @@ -493,7 +494,8 @@ spa_feature_is_active(spa_t *spa, spa_feature_t fid) * Returns B_FALSE otherwise (i.e. if the feature is not enabled). */ boolean_t -spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg) { +spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg) +{ int err; ASSERT(VALID_FEATURE_FID(fid)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c index f7c2b6a..e2a3bff 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #include <sys/types.h> @@ -806,7 +806,7 @@ zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, */ int zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, - boolean_t *unlinkedp) + boolean_t *unlinkedp) { znode_t *dzp = dl->dl_dzp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index d3a339e..6d9877b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -27,7 +27,7 @@ * Copyright 2014 Xin Li <delphij@FreeBSD.org>. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. */ @@ -185,6 +185,7 @@ #include <sys/dsl_bookmark.h> #include <sys/dsl_userhold.h> #include <sys/zfeature.h> +#include <sys/zio_checksum.h> #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -3902,11 +3903,6 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) return (SET_ERROR(ENOTSUP)); break; - case ZFS_PROP_DEDUP: - if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) - return (SET_ERROR(ENOTSUP)); - break; - case ZFS_PROP_RECORDSIZE: /* Record sizes above 128k need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && @@ -3920,7 +3916,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) */ if (zfs_is_bootfs(dsname) && intval > SPA_OLD_MAXBLOCKSIZE) { - return (SET_ERROR(EDOM)); + return (SET_ERROR(ERANGE)); } /* @@ -3929,7 +3925,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) */ if (intval > zfs_max_recordsize || intval > SPA_MAXBLOCKSIZE) - return (SET_ERROR(EDOM)); + return (SET_ERROR(ERANGE)); if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); @@ -3957,6 +3953,45 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) return (SET_ERROR(ENOTSUP)); } break; + + case ZFS_PROP_CHECKSUM: + case ZFS_PROP_DEDUP: + { + spa_feature_t feature; + spa_t *spa; + + /* dedup feature version checks */ + if (prop == ZFS_PROP_DEDUP && + zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) + return (SET_ERROR(ENOTSUP)); + + if (nvpair_value_uint64(pair, &intval) != 0) + return (SET_ERROR(EINVAL)); + + /* check prop value is enabled in features */ + feature = zio_checksum_to_feature(intval); + if (feature == SPA_FEATURE_NONE) + break; + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + /* + * Salted checksums are not supported on root pools. + */ + if (spa_bootfs(spa) != 0 && + intval < ZIO_CHECKSUM_FUNCTIONS && + (zio_checksum_table[intval].ci_flags & + ZCHECKSUM_FLAG_SALTED)) { + spa_close(spa, FTAG); + return (SET_ERROR(ERANGE)); + } + if (!spa_feature_is_enabled(spa, feature)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + break; + } } return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); @@ -4155,6 +4190,7 @@ static boolean_t zfs_ioc_recv_inject_err; * zc_guid force flag * zc_cleanup_fd cleanup-on-exit file descriptor * zc_action_handle handle for this guid/ds mapping (or zero on first call) + * zc_resumable if data is incomplete assume sender will resume * * outputs: * zc_cookie number of bytes read @@ -4206,13 +4242,13 @@ zfs_ioc_recv(zfs_cmd_t *zc) return (SET_ERROR(EBADF)); } - VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); + errors = fnvlist_alloc(); if (zc->zc_string[0]) origin = zc->zc_string; error = dmu_recv_begin(tofs, tosnap, - &zc->zc_begin_record, force, origin, &drc); + &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc); if (error != 0) goto out; @@ -5431,6 +5467,8 @@ zfs_ioc_unjail(zfs_cmd_t *zc) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted + * (optional) "resume_object" and "resume_offset" -> (uint64) + * if present, resume send stream from specified object and offset. * } * * outnvl is unused @@ -5447,6 +5485,8 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) int fd; boolean_t largeblockok; boolean_t embedok; + uint64_t resumeobj = 0; + uint64_t resumeoff = 0; error = nvlist_lookup_int32(innvl, "fd", &fd); if (error != 0) @@ -5457,6 +5497,9 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); + (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); + (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); + #ifdef illumos file_t *fp = getf(fd); #else @@ -5466,11 +5509,11 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) return (SET_ERROR(EBADF)); off = fp->f_offset; - error = dmu_send(snapname, fromname, embedok, largeblockok, + error = dmu_send(snapname, fromname, embedok, largeblockok, fd, #ifdef illumos - fd, fp->f_vnode, &off); + resumeobj, resumeoff, fp->f_vnode, &off); #else - fd, fp, &off); + resumeobj, resumeoff, fp, &off); #endif #ifdef illumos @@ -5664,7 +5707,7 @@ zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) static void zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, - zfs_secpolicy_func_t *secpolicy) + zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); @@ -6099,6 +6142,14 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, goto out; } break; + case ZFS_IOCVER_EDBP: + if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) { + error = SET_ERROR(EFAULT); + goto out; + } + compat = B_TRUE; + cflag = ZFS_CMD_COMPAT_EDBP; + break; case ZFS_IOCVER_ZCMD: if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) || zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c index 7432290..30b3b52 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. */ #include <sys/types.h> @@ -348,7 +349,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, */ void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name, uint64_t foid) + znode_t *dzp, char *name, uint64_t foid) { itx_t *itx; lr_remove_t *lr; @@ -372,7 +373,7 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, */ void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name) + znode_t *dzp, znode_t *zp, char *name) { itx_t *itx; lr_link_t *lr; @@ -427,7 +428,7 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, */ void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) { itx_t *itx; lr_rename_t *lr; @@ -455,7 +456,7 @@ ssize_t zfs_immediate_write_sz = 32768; void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t resid, int ioflag) + znode_t *zp, offset_t off, ssize_t resid, int ioflag) { itx_wr_state_t write_state; boolean_t slogging; @@ -532,7 +533,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, */ void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, uint64_t off, uint64_t len) + znode_t *zp, uint64_t off, uint64_t len) { itx_t *itx; lr_truncate_t *lr; @@ -555,7 +556,7 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, */ void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) + znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) { itx_t *itx; lr_setattr_t *lr; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c index 6e0c243..ae24ef0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #include <sys/types.h> @@ -54,7 +54,7 @@ static void zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, - uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) + uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) { VATTR_NULL(vap); vap->va_mask = (uint_t)mask; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index 3a4f348..5c26efe 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. */ @@ -4505,7 +4505,7 @@ top: /* ARGSUSED */ static int zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, - size_t *lenp, int flags, cred_t *cr) + size_t *lenp, int flags, cred_t *cr) { pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); return (0); @@ -4531,7 +4531,7 @@ zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, /* ARGSUSED */ static int zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, - size_t *lenp, int flags, cred_t *cr) + size_t *lenp, int flags, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index 8548b2d..cce7fd1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -330,7 +330,7 @@ zio_data_buf_free(void *buf, size_t size) */ static void zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, - zio_transform_func_t *transform) + zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); @@ -999,7 +999,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio->io_prop.zp_checksum = checksum; - if (zio_checksum_table[checksum].ci_eck) { + if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. @@ -1068,8 +1068,8 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, - int type, zio_priority_t priority, enum zio_flag flags, - zio_done_func_t *done, void *private) + int type, zio_priority_t priority, enum zio_flag flags, + zio_done_func_t *done, void *private) { zio_t *zio; @@ -1211,8 +1211,8 @@ zio_write_bp_init(zio_t *zio) if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (ZIO_PIPELINE_CONTINUE); - ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || - zp->zp_dedup_verify); + ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & + ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { BP_SET_DEDUP(bp, 1); @@ -2074,12 +2074,22 @@ zio_write_gang_block(zio_t *pio) } /* - * The zio_nop_write stage in the pipeline determines if allocating - * a new bp is necessary. By leveraging a cryptographically secure checksum, - * such as SHA256, we can compare the checksums of the new data and the old - * to determine if allocating a new block is required. The nopwrite - * feature can handle writes in either syncing or open context (i.e. zil - * writes) and as a result is mutually exclusive with dedup. + * The zio_nop_write stage in the pipeline determines if allocating a + * new bp is necessary. The nopwrite feature can handle writes in + * either syncing or open context (i.e. zil writes) and as a result is + * mutually exclusive with dedup. + * + * By leveraging a cryptographically secure checksum, such as SHA256, we + * can compare the checksums of the new data and the old to determine if + * allocating a new block is required. Note that our requirements for + * cryptographic strength are fairly weak: there can't be any accidental + * hash collisions, but we don't need to be secure against intentional + * (malicious) collisions. To trigger a nopwrite, you have to be able + * to write the file to begin with, and triggering an incorrect (hash + * collision) nopwrite is no worse than simply writing to the file. + * That said, there are no known attacks against the checksum algorithms + * used for nopwrite, assuming that the salt and the checksums + * themselves remain secret. */ static int zio_nop_write(zio_t *zio) @@ -2102,7 +2112,8 @@ zio_nop_write(zio_t *zio) * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || - !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || + !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & + ZCHECKSUM_FLAG_NOPWRITE) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || @@ -2114,7 +2125,8 @@ zio_nop_write(zio_t *zio) * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { - ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); + ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & + ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); @@ -2395,7 +2407,8 @@ zio_ddt_write(zio_t *zio) * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?) */ - if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { + if (!(zio_checksum_table[zp->zp_checksum].ci_flags & + ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c index d1c60c3..6ba64e0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c @@ -20,12 +20,14 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. */ #include <sys/zfs_context.h> #include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/zil.h> @@ -59,29 +61,99 @@ * checksum function of the appropriate strength. When reading a block, * we compare the expected checksum against the actual checksum, which we * compute via the checksum function specified by BP_GET_CHECKSUM(bp). + * + * SALTED CHECKSUMS + * + * To enable the use of less secure hash algorithms with dedup, we + * introduce the notion of salted checksums (MACs, really). A salted + * checksum is fed both a random 256-bit value (the salt) and the data + * to be checksummed. This salt is kept secret (stored on the pool, but + * never shown to the user). Thus even if an attacker knew of collision + * weaknesses in the hash algorithm, they won't be able to mount a known + * plaintext attack on the DDT, since the actual hash value cannot be + * known ahead of time. How the salt is used is algorithm-specific + * (some might simply prefix it to the data block, others might need to + * utilize a full-blown HMAC). On disk the salt is stored in a ZAP + * object in the MOS (DMU_POOL_CHECKSUM_SALT). + * + * CONTEXT TEMPLATES + * + * Some hashing algorithms need to perform a substantial amount of + * initialization work (e.g. salted checksums above may need to pre-hash + * the salt) before being able to process data. Performing this + * redundant work for each block would be wasteful, so we instead allow + * a checksum algorithm to do the work once (the first time it's used) + * and then keep this pre-initialized context as a template inside the + * spa_t (spa_cksum_tmpls). If the zio_checksum_info_t contains + * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to + * construct and destruct the pre-initialized checksum context. The + * pre-initialized context is then reused during each checksum + * invocation and passed to the checksum function. */ /*ARGSUSED*/ static void -zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) +zio_checksum_off(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { - {{NULL, NULL}, 0, 0, 0, "inherit"}, - {{NULL, NULL}, 0, 0, 0, "on"}, - {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"}, - {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"}, - {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, - {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "noparity"}, + {{NULL, NULL}, NULL, NULL, 0, "inherit"}, + {{NULL, NULL}, NULL, NULL, 0, "on"}, + {{zio_checksum_off, zio_checksum_off}, + NULL, NULL, 0, "off"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, + "label"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, + "gang_header"}, + {{fletcher_2_native, fletcher_2_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, + {{fletcher_2_native, fletcher_2_byteswap}, + NULL, NULL, 0, "fletcher2"}, + {{fletcher_4_native, fletcher_4_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, + {{fletcher_4_native, fletcher_4_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"}, + {{zio_checksum_off, zio_checksum_off}, + NULL, NULL, 0, "noparity"}, +#ifdef illumos + {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, + {{zio_checksum_skein_native, zio_checksum_skein_byteswap}, + zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free, + ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, + {{zio_checksum_edonr_native, zio_checksum_edonr_byteswap}, + zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free, + ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | + ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, +#endif }; +spa_feature_t +zio_checksum_to_feature(enum zio_checksum cksum) +{ +#ifdef illumos + switch (cksum) { + case ZIO_CHECKSUM_SHA512: + return (SPA_FEATURE_SHA512); + case ZIO_CHECKSUM_SKEIN: + return (SPA_FEATURE_SKEIN); + case ZIO_CHECKSUM_EDONR: + return (SPA_FEATURE_EDONR); + } +#endif + return (SPA_FEATURE_NONE); +} + enum zio_checksum zio_checksum_select(enum zio_checksum child, enum zio_checksum parent) { @@ -115,7 +187,8 @@ zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY)) return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY); - ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup || + ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags & + ZCHECKSUM_FLAG_DEDUP) || (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF); return (child); @@ -148,21 +221,48 @@ zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) } /* + * Calls the template init function of a checksum which supports context + * templates and installs the template into the spa_t. + */ +static void +zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) +{ + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + + if (ci->ci_tmpl_init == NULL) + return; + if (spa->spa_cksum_tmpls[checksum] != NULL) + return; + + VERIFY(ci->ci_tmpl_free != NULL); + mutex_enter(&spa->spa_cksum_tmpls_lock); + if (spa->spa_cksum_tmpls[checksum] == NULL) { + spa->spa_cksum_tmpls[checksum] = + ci->ci_tmpl_init(&spa->spa_cksum_salt); + VERIFY(spa->spa_cksum_tmpls[checksum] != NULL); + } + mutex_exit(&spa->spa_cksum_tmpls_lock); +} + +/* * Generate the checksum. */ void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, - void *data, uint64_t size) + void *data, uint64_t size) { blkptr_t *bp = zio->io_bp; uint64_t offset = zio->io_offset; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t cksum; + spa_t *spa = zio->io_spa; ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS); ASSERT(ci->ci_func[0] != NULL); - if (ci->ci_eck) { + zio_checksum_template_init(checksum, spa); + + if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; if (checksum == ZIO_CHECKSUM_ZILOG2) { @@ -181,10 +281,12 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, else bp->blk_cksum = eck->zec_cksum; eck->zec_magic = ZEC_MAGIC; - ci->ci_func[0](data, size, &cksum); + ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum], + &cksum); eck->zec_cksum = cksum; } else { - ci->ci_func[0](data, size, &bp->blk_cksum); + ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum], + &bp->blk_cksum); } } @@ -202,11 +304,14 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) void *data = zio->io_data; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t actual_cksum, expected_cksum, verifier; + spa_t *spa = zio->io_spa; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) return (SET_ERROR(EINVAL)); - if (ci->ci_eck) { + zio_checksum_template_init(checksum, spa); + + if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; if (checksum == ZIO_CHECKSUM_ZILOG2) { @@ -243,7 +348,8 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) expected_cksum = eck->zec_cksum; eck->zec_cksum = verifier; - ci->ci_func[byteswap](data, size, &actual_cksum); + ci->ci_func[byteswap](data, size, + spa->spa_cksum_tmpls[checksum], &actual_cksum); eck->zec_cksum = expected_cksum; if (byteswap) @@ -253,7 +359,8 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) ASSERT(!BP_IS_GANG(bp)); byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; - ci->ci_func[byteswap](data, size, &actual_cksum); + ci->ci_func[byteswap](data, size, + spa->spa_cksum_tmpls[checksum], &actual_cksum); } info->zbc_expected = expected_cksum; @@ -275,3 +382,23 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) return (0); } + +/* + * Called by a spa_t that's about to be deallocated. This steps through + * all of the checksum context templates and deallocates any that were + * initialized using the algorithm-specific template init function. + */ +void +zio_checksum_templates_free(spa_t *spa) +{ + for (enum zio_checksum checksum = 0; + checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { + if (spa->spa_cksum_tmpls[checksum] != NULL) { + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + + VERIFY(ci->ci_tmpl_free != NULL); + ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); + spa->spa_cksum_tmpls[checksum] = NULL; + } + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c index 2215184..7f6beee 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2015 by Delphix. All rights reserved. */ /* @@ -69,11 +69,7 @@ zrl_destroy(zrlock_t *zrl) } void -#ifdef ZFS_DEBUG -zrl_add_debug(zrlock_t *zrl, const char *zc) -#else -zrl_add(zrlock_t *zrl) -#endif +zrl_add_impl(zrlock_t *zrl, const char *zc) { uint32_t n = (uint32_t)zrl->zr_refcount; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c index 491c365..ce8eed3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -2336,13 +2336,15 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) vdev_t *vd = spa->spa_root_vdev; nvlist_t *nv = NULL; uint64_t version = spa_version(spa); - enum zio_checksum checksum; + uint64_t checksum, compress, refresrv, vbs, dedup; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); ASSERT(vd->vdev_ops == &vdev_root_ops); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0, DMU_OBJECT_END); + if (error != 0) + return (error); /* wait for dmu_free_long_range to actually free the blocks */ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); @@ -2366,24 +2368,42 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) 2, ZFS_SPACE_CHECK_RESERVED); } + if (!resize) { + error = dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL); + if (error == 0) { + error = dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, + NULL); + } + if (error == 0) { + error = dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + &refresrv, NULL); + } + if (error == 0) { + error = dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, + NULL); + } + if (version >= SPA_VERSION_DEDUP && error == 0) { + error = dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL); + } + } + if (error != 0) + return (error); + tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); dmu_tx_hold_bonus(tx, ZVOL_OBJ); error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { + if (error != 0) { dmu_tx_abort(tx); return (error); } /* - * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum - * function. Otherwise, use the old default -- OFF. - */ - checksum = spa_feature_is_active(spa, - SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY : - ZIO_CHECKSUM_OFF; - - /* * If we are resizing the dump device then we only need to * update the refreservation to match the newly updated * zvolsize. Otherwise, we save off the original state of the @@ -2394,37 +2414,30 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &zv->zv_volsize, tx); } else { - uint64_t checksum, compress, refresrv, vbs, dedup; - - error = dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL); - error = error ? error : dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL); - error = error ? error : dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL); - error = error ? error : dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL); - if (version >= SPA_VERSION_DEDUP) { - error = error ? error : - dsl_prop_get_integer(zv->zv_name, - zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL); - } - - error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, + error = zap_update(os, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress, tx); - error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx); - error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, - &refresrv, tx); - error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, - &vbs, tx); - error = error ? error : dmu_object_set_blocksize( - os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx); - if (version >= SPA_VERSION_DEDUP) { - error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, + if (error == 0) { + error = zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, + &checksum, tx); + } + if (error == 0) { + error = zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, + &refresrv, tx); + } + if (error == 0) { + error = zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, + &vbs, tx); + } + if (error == 0) { + error = dmu_object_set_blocksize( + os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx); + } + if (version >= SPA_VERSION_DEDUP && error == 0) { + error = zap_update(os, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup, tx); } @@ -2437,7 +2450,15 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) * We only need update the zvol's property if we are initializing * the dump area for the first time. */ - if (!resize) { + if (error == 0 && !resize) { + /* + * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum + * function. Otherwise, use the old default -- OFF. + */ + checksum = spa_feature_is_active(spa, + SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY : + ZIO_CHECKSUM_OFF; + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0); @@ -2456,13 +2477,11 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL, nv, NULL); nvlist_free(nv); - - if (error) - return (error); } /* Allocate the space for the dump */ - error = zvol_prealloc(zv); + if (error == 0) + error = zvol_prealloc(zv); return (error); } |