diff options
author | neel <neel@FreeBSD.org> | 2012-12-04 04:37:42 +0000 |
---|---|---|
committer | neel <neel@FreeBSD.org> | 2012-12-04 04:37:42 +0000 |
commit | d8091074f2295a4be6817c9fe8802b9b0db9346d (patch) | |
tree | 6ef29583fd1f17922e7c5b7d49d2d96d8ff38ebb /sys/cddl/contrib | |
parent | d45d8a8d668a9ee12073939bf775c404be8cf175 (diff) | |
parent | 6b76c5a1b878e8001537fb323cb0c88fca9dc2df (diff) | |
download | FreeBSD-src-d8091074f2295a4be6817c9fe8802b9b0db9346d.zip FreeBSD-src-d8091074f2295a4be6817c9fe8802b9b0db9346d.tar.gz |
IFC @r243836
Diffstat (limited to 'sys/cddl/contrib')
25 files changed, 786 insertions, 317 deletions
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c index 398963a..5f5b8da 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c @@ -109,6 +109,7 @@ zfs_prop_init(void) { "discard", ZFS_ACL_DISCARD }, { "groupmask", ZFS_ACL_GROUPMASK }, { "passthrough", ZFS_ACL_PASSTHROUGH }, + { "restricted", ZFS_ACL_RESTRICTED }, { NULL } }; @@ -217,7 +218,8 @@ zfs_prop_init(void) "hidden | visible", "SNAPDIR", snapdir_table); zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "discard | groupmask | passthrough", "ACLMODE", acl_mode_table); + "discard | groupmask | passthrough | restricted", "ACLMODE", + acl_mode_table); zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "discard | noallow | restricted | passthrough | passthrough-x", diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c index 436918b3..59944a1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c @@ -665,8 +665,10 @@ gfs_file_inactive(vnode_t *vp) ge = NULL; found: +#ifdef TODO if (vp->v_flag & V_XATTRDIR) VI_LOCK(fp->gfs_parent); +#endif VI_LOCK(vp); /* * Really remove this vnode @@ -687,16 +689,17 @@ found: if (fp->gfs_parent) { if (dp) gfs_dir_unlock(dp); - VI_LOCK(fp->gfs_parent); - fp->gfs_parent->v_usecount--; - VI_UNLOCK(fp->gfs_parent); + VOP_UNLOCK(vp, 0); + VN_RELE(fp->gfs_parent); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } else { ASSERT(vp->v_vfsp != NULL); VFS_RELE(vp->v_vfsp); } +#ifdef TODO if (vp->v_flag & V_XATTRDIR) VI_UNLOCK(fp->gfs_parent); - +#endif return (data); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 607364f..cdd3fe7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -3688,6 +3688,12 @@ arc_write_done(zio_t *zio) arc_hdr_destroy(exists); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); + } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { + /* nopwrite */ + ASSERT(zio->io_prop.zp_nopwrite); + if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) + panic("bad nopwrite, hdr=%p exists=%p", + (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT(hdr->b_datacnt == 1); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 50f1bea..31d98cd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -768,13 +768,15 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) ASSERT(db->db_data_pending != dr); /* free this block */ - if (!BP_IS_HOLE(bp)) { + if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) { spa_t *spa; DB_GET_SPA(&spa, db); zio_free(spa, txg, bp); } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + dr->dt.dl.dr_nopwrite = B_FALSE; + /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are @@ -2189,6 +2191,13 @@ dmu_buf_freeable(dmu_buf_t *dbuf) return (res); } +blkptr_t * +dmu_buf_get_blkptr(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + return (dbi->db_blkptr); +} + static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { @@ -2531,7 +2540,11 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); - if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + /* + * For nopwrites and rewrites we ensure that the bp matches our + * original and bypass all the accounting. + */ + if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { objset_t *os; @@ -2722,7 +2735,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, - dr->dt.dl.dr_copies); + dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index a4c63f3..3167d42 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -40,11 +40,21 @@ #include <sys/zfs_ioctl.h> #include <sys/zap.h> #include <sys/zio_checksum.h> +#include <sys/zio_compress.h> #include <sys/sa.h> #ifdef _KERNEL #include <sys/zfs_znode.h> #endif +/* + * Enable/disable nopwrite feature. + */ +int zfs_nopwrite_enabled = 1; +SYSCTL_DECL(_vfs_zfs); +TUNABLE_INT("vfs.zfs.nopwrite_enabled", &zfs_nopwrite_enabled); +SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN, + &zfs_nopwrite_enabled, 0, "Enable nopwrite feature"); + const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { DMU_BSWAP_UINT8, TRUE, "unallocated" }, { DMU_BSWAP_ZAP, TRUE, "object directory" }, @@ -1287,6 +1297,16 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { + dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); + if (dr->dt.dl.dr_nopwrite) { + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + uint8_t chksum = BP_GET_CHECKSUM(bp_orig); + + ASSERT(BP_EQUAL(bp, bp_orig)); + ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); + ASSERT(zio_checksum_table[chksum].ci_dedup); + } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; @@ -1308,11 +1328,22 @@ dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; + blkptr_t *bp_orig = &zio->io_bp_orig; if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { - ASSERT(zio->io_bp->blk_birth == zio->io_txg); - ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); - zio_free(zio->io_spa, zio->io_txg, zio->io_bp); + /* + * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE) + * then there is nothing to do here. Otherwise, free the + * newly allocated block in this txg. + */ + if (zio->io_flags & ZIO_FLAG_NOPWRITE) { + ASSERT(BP_EQUAL(bp, bp_orig)); + } else { + ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); + ASSERT(zio->io_bp->blk_birth == zio->io_txg); + ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); + zio_free(zio->io_spa, zio->io_txg, zio->io_bp); + } } dmu_tx_commit(dsa->dsa_tx); @@ -1357,7 +1388,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, * * Return values: * - * EEXIST: this txg has already been synced, so there's nothing to to. + * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. @@ -1389,7 +1420,6 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dnode_t *dn; ASSERT(pio != NULL); - ASSERT(BP_IS_HOLE(bp)); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, @@ -1444,6 +1474,23 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) return (ENOENT); } + ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); + + /* + * Assume the on-disk data is X, the current syncing data is Y, + * and the current in-memory data is Z (currently in dmu_sync). + * X and Z are identical but Y is has been modified. Normally, + * when X and Z are the same we will perform a nopwrite but if Y + * is different we must disable nopwrite since the resulting write + * of Y to disk can free the block containing X. If we allowed a + * nopwrite to occur the block pointing to Z would reference a freed + * block. Since this is a rare case we simplify this by disabling + * nopwrite if the current dmu_sync-ing dbuf has been modified in + * a previous transaction. + */ + if (dr->dr_next) + zp.zp_nopwrite = B_FALSE; + ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { @@ -1519,7 +1566,6 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, int zfs_mdcomp_disable = 0; TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); -SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RW, &zfs_mdcomp_disable, 0, "Disable metadata compression"); @@ -1532,15 +1578,27 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; enum zio_checksum dedup_checksum = os->os_dedup_checksum; - boolean_t dedup; + boolean_t dedup = B_FALSE; + boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; int copies = os->os_copies; /* - * Determine checksum setting. + * We maintain different write policies for each of the following + * types of data: + * 1. metadata + * 2. preallocated blocks (i.e. level-0 blocks of a dump device) + * 3. all other level 0 blocks */ if (ismd) { /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : + ZIO_COMPRESS_LZJB; + + /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a * ZBT-style checksum, then it's suitable for metadata @@ -1550,45 +1608,47 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) if (zio_checksum_table[checksum].ci_correctable < 1 || zio_checksum_table[checksum].ci_eck) checksum = ZIO_CHECKSUM_FLETCHER_4; - } else { - checksum = zio_checksum_select(dn->dn_checksum, checksum); - } + } else if (wp & WP_NOFILL) { + ASSERT(level == 0); - /* - * Determine compression setting. - */ - if (ismd) { /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. + * If we're writing preallocated blocks, we aren't actually + * writing them so don't set any policy properties. These + * blocks are currently only used by an external subsystem + * outside of zfs (i.e. dump) and not written by the zio + * pipeline. */ - compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : - ZIO_COMPRESS_LZJB; + compress = ZIO_COMPRESS_OFF; + checksum = ZIO_CHECKSUM_OFF; } else { compress = zio_compress_select(dn->dn_compress, compress); - } - /* - * Determine dedup setting. If we are in dmu_sync(), we won't - * actually dedup now because that's all done in syncing context; - * but we do want to use the dedup checkum. If the checksum is not - * strong enough to ensure unique signatures, force dedup_verify. - */ - dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF); - if (dedup) { - checksum = dedup_checksum; - if (!zio_checksum_table[checksum].ci_dedup) - dedup_verify = 1; - } + checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? + zio_checksum_select(dn->dn_checksum, checksum) : + dedup_checksum; - if (wp & WP_DMU_SYNC) - dedup = 0; + /* + * Determine dedup setting. If we are in dmu_sync(), + * we won't actually dedup now because that's all + * done in syncing context; but we do want to use the + * dedup checkum. If the checksum is not strong + * enough to ensure unique signatures, force + * dedup_verify. + */ + if (dedup_checksum != ZIO_CHECKSUM_OFF) { + dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; + if (!zio_checksum_table[checksum].ci_dedup) + dedup_verify = B_TRUE; + } - if (wp & WP_NOFILL) { - ASSERT(!ismd && level == 0); - checksum = ZIO_CHECKSUM_OFF; - compress = ZIO_COMPRESS_OFF; - dedup = B_FALSE; + /* + * Enable nopwrite if we have a cryptographically secure + * checksum that has no known collisions (i.e. SHA-256) + * and compression is enabled. We don't enable nopwrite if + * dedup is enabled as the two features are mutually exclusive. + */ + nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup && + compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } zp->zp_checksum = checksum; @@ -1598,6 +1658,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; + zp->zp_nopwrite = nopwrite; } int diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c index 510039d..73c43da 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c @@ -440,7 +440,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * clean up our in-memory structures accumulated while syncing: * * - move dead blocks from the pending deadlist to the on-disk deadlist - * - clean up zil records * - release hold from dsl_dataset_dirty() */ while (ds = list_remove_head(&synced_datasets)) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c index a0723a3..5d83f69 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -881,8 +881,9 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight) if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { space_map_load_wait(sm); if (!sm->sm_loaded) { - int error = space_map_load(sm, sm_ops, SM_FREE, - &msp->ms_smo, + space_map_obj_t *smo = &msp->ms_smo; + + int error = space_map_load(sm, sm_ops, SM_FREE, smo, spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); if (error) { metaslab_group_sort(msp->ms_group, msp, 0); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index 7a2e657..c978f8a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -139,6 +139,7 @@ boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ uint_t zio_taskq_basedc = 80; /* base duty cycle */ boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ +extern int zfs_sync_pass_deferred_free; /* * This (illegal) pool name is used when temporarily importing a spa_t in order @@ -3755,43 +3756,120 @@ out: #else -extern int -vdev_geom_read_pool_label(const char *name, nvlist_t **config); +extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, + uint64_t *count); static nvlist_t * spa_generate_rootconf(const char *name) { + nvlist_t **configs, **tops; nvlist_t *config; - nvlist_t *nvtop, *nvroot; + nvlist_t *best_cfg, *nvtop, *nvroot; + uint64_t *holes; + uint64_t best_txg; + uint64_t nchildren; uint64_t pgid; + uint64_t count; + uint64_t i; + uint_t nholes; - if (vdev_geom_read_pool_label(name, &config) != 0) + if (vdev_geom_read_pool_label(name, &configs, &count) != 0) return (NULL); + ASSERT3U(count, !=, 0); + best_txg = 0; + for (i = 0; i < count; i++) { + uint64_t txg; + + VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, + &txg) == 0); + if (txg > best_txg) { + best_txg = txg; + best_cfg = configs[i]; + } + } + /* - * Add this top-level vdev to the child array. + * Multi-vdev root pool configuration discovery is not supported yet. */ - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvtop) == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &pgid) == 0); + nchildren = 0; + VERIFY(nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, + &nchildren) == 0); + holes = NULL; + nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, + &holes, &nholes); + + tops = kmem_alloc(nchildren * sizeof(void *), KM_SLEEP | KM_ZERO); + for (i = 0; i < nchildren; i++) { + if (i >= count) + break; + if (configs[i] == NULL) + continue; + VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + nvlist_dup(nvtop, &tops[i], KM_SLEEP); + } + for (i = 0; holes != NULL && i < nholes; i++) { + if (i >= nchildren) + continue; + if (tops[holes[i]] != NULL) + continue; + nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); + VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) == 0); + VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, + holes[i]) == 0); + VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, + 0) == 0); + } + for (i = 0; i < nchildren; i++) { + if (tops[i] != NULL) + continue; + nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); + VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, + VDEV_TYPE_MISSING) == 0); + VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, + i) == 0); + VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, + 0) == 0); + } + + /* + * Create pool config based on the best vdev config. + */ + nvlist_dup(best_cfg, &config, KM_SLEEP); /* * Put this pool's top-level vdevs into a root vdev. */ + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pgid) == 0); VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &nvtop, 1) == 0); + tops, nchildren) == 0); /* * Replace the existing vdev_tree with the new root vdev in * this pool's configuration (remove the old, add the new). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + + /* + * Drop vdev config elements that should not be present at pool level. + */ + nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); + nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); + + for (i = 0; i < count; i++) + nvlist_free(configs[i]); + kmem_free(configs, count * sizeof(void *)); + for (i = 0; i < nchildren; i++) + nvlist_free(tops[i]); + kmem_free(tops, nchildren * sizeof(void *)); nvlist_free(nvroot); return (config); } @@ -3810,25 +3888,38 @@ spa_import_rootpool(const char *name) * Read the label from the boot device and generate a configuration. */ config = spa_generate_rootconf(name); - if (config == NULL) { - cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", - name); - return (EIO); - } - - VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, - &pname) == 0 && strcmp(name, pname) == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pname)) != NULL) { + if (config != NULL) { + VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pname) == 0 && strcmp(name, pname) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) + == 0); + + if ((spa = spa_lookup(pname)) != NULL) { + /* + * Remove the existing root pool from the namespace so + * that we can replace it with the correct config + * we just read in. + */ + spa_remove(spa); + } + spa = spa_add(pname, config, NULL); + /* - * Remove the existing root pool from the namespace so that we - * can replace it with the correct config we just read in. + * Set spa_ubsync.ub_version as it can be used in vdev_alloc() + * via spa_version(). */ - spa_remove(spa); + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; + } else if ((spa = spa_lookup(name)) == NULL) { + cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", + name); + return (EIO); + } else { + VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); } - spa = spa_add(pname, config, NULL); spa->spa_is_root = B_TRUE; spa->spa_import_flags = ZFS_IMPORT_VERBATIM; @@ -3849,15 +3940,13 @@ spa_import_rootpool(const char *name) return (error); } - error = 0; - spa_history_log_version(spa, LOG_POOL_IMPORT); -out: spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_free(rvd); spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&spa_namespace_lock); - return (error); + nvlist_free(config); + return (0); } #endif /* sun */ @@ -5896,6 +5985,14 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); + /* + * If we're upgrading the spa version then make sure that + * the config object gets updated with the correct version. + */ + if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) + fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + spa->spa_uberblock.ub_version); + spa_config_exit(spa, SCL_STATE, FTAG); if (spa->spa_config_syncing) @@ -6214,7 +6311,7 @@ spa_sync(spa_t *spa, uint64_t txg) spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); - if (pass <= SYNC_PASS_DEFERRED_FREE) { + if (pass < zfs_sync_pass_deferred_free) { zio_t *zio = zio_root(spa, NULL, NULL, 0); bplist_iterate(free_bpl, spa_free_sync_cb, zio, tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index ae58ae9..606fd18 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -1619,6 +1619,7 @@ spa_init(int mode) #endif /* illumos */ refcount_sysinit(); unique_init(); + space_map_init(); zio_init(); dmu_init(); zil_init(); @@ -1641,6 +1642,7 @@ spa_fini(void) zil_fini(); dmu_fini(); zio_fini(); + space_map_fini(); unique_fini(); refcount_fini(); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c index c7be1b4..bebb0f3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c @@ -39,6 +39,23 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_last_hope, CTLFLAG_RDTUN, &space_map_last_hope, 0, "If kernel panic in space_map code on pool import, import the pool in readonly mode and backup all your data before trying this option."); +static kmem_cache_t *space_seg_cache; + +void +space_map_init(void) +{ + ASSERT(space_seg_cache == NULL); + space_seg_cache = kmem_cache_create("space_seg_cache", + sizeof (space_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +space_map_fini(void) +{ + kmem_cache_destroy(space_seg_cache); + space_seg_cache = NULL; +} + /* * Space map routines. * NOTE: caller is responsible for all locking. @@ -148,7 +165,7 @@ again: avl_remove(sm->sm_pp_root, ss_after); } ss_after->ss_start = ss_before->ss_start; - kmem_free(ss_before, sizeof (*ss_before)); + kmem_cache_free(space_seg_cache, ss_before); ss = ss_after; } else if (merge_before) { ss_before->ss_end = end; @@ -161,7 +178,7 @@ again: avl_remove(sm->sm_pp_root, ss_after); ss = ss_after; } else { - ss = kmem_alloc(sizeof (*ss), KM_SLEEP); + ss = kmem_cache_alloc(space_seg_cache, KM_SLEEP); ss->ss_start = start; ss->ss_end = end; avl_insert(&sm->sm_root, ss, where); @@ -207,7 +224,7 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) avl_remove(sm->sm_pp_root, ss); if (left_over && right_over) { - newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP); + newseg = kmem_cache_alloc(space_seg_cache, KM_SLEEP); newseg->ss_start = end; newseg->ss_end = ss->ss_end; ss->ss_end = start; @@ -220,7 +237,7 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) ss->ss_start = end; } else { avl_remove(&sm->sm_root, ss); - kmem_free(ss, sizeof (*ss)); + kmem_cache_free(space_seg_cache, ss); ss = NULL; } @@ -260,7 +277,7 @@ space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { if (func != NULL) func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); - kmem_free(ss, sizeof (*ss)); + kmem_cache_free(space_seg_cache, ss); } sm->sm_space = 0; } @@ -432,7 +449,7 @@ space_map_sync(space_map_t *sm, uint8_t maptype, spa_t *spa = dmu_objset_spa(os); void *cookie = NULL; space_seg_t *ss; - uint64_t bufsize, start, size, run_len; + uint64_t bufsize, start, size, run_len, delta, sm_space; uint64_t *entry, *entry_map, *entry_map_end; ASSERT(MUTEX_HELD(sm->sm_lock)); @@ -461,11 +478,13 @@ space_map_sync(space_map_t *sm, uint8_t maptype, SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) | SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); + delta = 0; + sm_space = sm->sm_space; while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { size = ss->ss_end - ss->ss_start; start = (ss->ss_start - sm->sm_start) >> sm->sm_shift; - sm->sm_space -= size; + delta += size; size >>= sm->sm_shift; while (size) { @@ -487,7 +506,7 @@ space_map_sync(space_map_t *sm, uint8_t maptype, start += run_len; size -= run_len; } - kmem_free(ss, sizeof (*ss)); + kmem_cache_free(space_seg_cache, ss); } if (entry != entry_map) { @@ -499,8 +518,15 @@ space_map_sync(space_map_t *sm, uint8_t maptype, smo->smo_objsize += size; } + /* + * Ensure that the space_map's accounting wasn't changed + * while we were in the middle of writing it out. + */ + VERIFY3U(sm->sm_space, ==, sm_space); + zio_buf_free(entry_map, bufsize); + sm->sm_space -= delta; VERIFY0(sm->sm_space); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h index cf1bbc0..8591f15 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_DBUF_H @@ -130,6 +131,7 @@ typedef struct dbuf_dirty_record { blkptr_t dr_overridden_by; override_states_t dr_override_state; uint8_t dr_copies; + boolean_t dr_nopwrite; } dl; } dt; } dbuf_dirty_record_t; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index ef9d0cc..4d36b465 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -505,6 +505,11 @@ void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); void *dmu_buf_get_user(dmu_buf_t *db); /* + * Returns the blkptr associated with this dbuf, or NULL if not set. + */ +struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); + +/* * Indicate that you are going to modify the buffer's data (db_data). * * The transaction (tx) must be assigned to a txg (ie. you've called diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h index 6c670a1..f1f1b38 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h @@ -21,7 +21,10 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2011 by Delphix. All rights reserved. + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index 0211ed5..1ee766a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -489,14 +489,6 @@ extern int spa_scan_stop(spa_t *spa); extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); -/* - * DEFERRED_FREE must be large enough that regular blocks are not - * deferred. XXX so can't we change it back to 1? - */ -#define SYNC_PASS_DEFERRED_FREE 2 /* defer frees after this pass */ -#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */ -#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */ - /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h index 6f935c9..463b6bb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #ifndef _SYS_SPACE_MAP_H #define _SYS_SPACE_MAP_H @@ -136,6 +140,8 @@ struct space_map_ops { typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_init(void); +extern void space_map_fini(void); extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp); extern void space_map_destroy(space_map_t *sm); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h index d3955d7..6281f8e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h @@ -207,8 +207,6 @@ typedef struct znode { list_node_t z_link_node; /* all znodes in fs link */ sa_handle_t *z_sa_hdl; /* handle to sa data */ boolean_t z_is_sa; /* are we native sa? */ - /* FreeBSD-specific field. */ - struct task z_task; } znode_t; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index 068e68b..47a0bba 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -188,7 +188,9 @@ enum zio_flag { ZIO_FLAG_RAW = 1 << 21, ZIO_FLAG_GANG_CHILD = 1 << 22, ZIO_FLAG_DDT_CHILD = 1 << 23, - ZIO_FLAG_GODFATHER = 1 << 24 + ZIO_FLAG_GODFATHER = 1 << 24, + ZIO_FLAG_NOPWRITE = 1 << 25, + ZIO_FLAG_REEXECUTED = 1 << 26, }; #define ZIO_FLAG_MUSTSUCCEED 0 @@ -287,8 +289,9 @@ typedef struct zio_prop { dmu_object_type_t zp_type; uint8_t zp_level; uint8_t zp_copies; - uint8_t zp_dedup; - uint8_t zp_dedup_verify; + boolean_t zp_dedup; + boolean_t zp_dedup_verify; + boolean_t zp_nopwrite; } zio_prop_t; typedef struct zio_cksum_report zio_cksum_report_t; @@ -491,7 +494,8 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *priv, int priority, enum zio_flag flags, zbookmark_t *zb); -extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies); +extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, + boolean_t nopwrite); extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h index 86ea979..aa44744 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #ifndef _ZIO_IMPL_H #define _ZIO_IMPL_H @@ -34,6 +38,70 @@ extern "C" { #endif /* + * XXX -- Describe ZFS I/O pipleine here. Fill in as needed. + * + * The ZFS I/O pipeline is comprised of various stages which are defined + * in the zio_stage enum below. The individual stages are used to construct + * these basic I/O operations: Read, Write, Free, Claim, and Ioctl. + * + * I/O operations: (XXX - provide detail for each of the operations) + * + * Read: + * Write: + * Free: + * Claim: + * Ioctl: + * + * Although the most common pipeline are used by the basic I/O operations + * above, there are some helper pipelines (one could consider them + * sub-pipelines) which are used internally by the ZIO module and are + * explained below: + * + * Interlock Pipeline: + * The interlock pipeline is the most basic pipeline and is used by all + * of the I/O operations. The interlock pipeline does not perform any I/O + * and is used to coordinate the dependencies between I/Os that are being + * issued (i.e. the parent/child relationship). + * + * Vdev child Pipeline: + * The vdev child pipeline is responsible for performing the physical I/O. + * It is in this pipeline where the I/O are queued and possibly cached. + * + * In addition to performing I/O, the pipeline is also responsible for + * data transformations. The transformations performed are based on the + * specific properties that user may have selected and modify the + * behavior of the pipeline. Examples of supported transformations are + * compression, dedup, and nop writes. Transformations will either modify + * the data or the pipeline. This list below further describes each of + * the supported transformations: + * + * Compression: + * ZFS supports three different flavors of compression -- gzip, lzjb, and + * zle. Compression occurs as part of the write pipeline and is performed + * in the ZIO_STAGE_WRITE_BP_INIT stage. + * + * Dedup: + * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and + * ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing + * read pipeline if the dedup bit is set on the block pointer. + * Writing a dedup block is performed by the ZIO_STAGE_DDT_WRITE stage + * and added to a write pipeline if a user has enabled dedup on that + * particular dataset. + * + * NOP Write: + * The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage + * and is added to an existing write pipeline if a crypographically + * secure checksum (i.e. SHA256) is enabled and compression is turned on. + * The NOP write stage will compare the checksums of the current data + * on-disk (level-0 blocks only) and the data that is currently being written. + * If the checksum values are identical then the pipeline is converted to + * an interlock pipeline skipping block allocation and bypassing the + * physical I/O. The nop write feature can handle writes in either + * syncing or open context (i.e. zil writes) and as a result is mutually + * exclusive with dedup. + */ + +/* * zio pipeline stage definitions */ enum zio_stage { @@ -46,27 +114,29 @@ enum zio_stage { ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */ - ZIO_STAGE_DDT_READ_START = 1 << 6, /* R---- */ - ZIO_STAGE_DDT_READ_DONE = 1 << 7, /* R---- */ - ZIO_STAGE_DDT_WRITE = 1 << 8, /* -W--- */ - ZIO_STAGE_DDT_FREE = 1 << 9, /* --F-- */ + ZIO_STAGE_NOP_WRITE = 1 << 6, /* -W--- */ - ZIO_STAGE_GANG_ASSEMBLE = 1 << 10, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE = 1 << 11, /* RWFC- */ + ZIO_STAGE_DDT_READ_START = 1 << 7, /* R---- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 8, /* R---- */ + ZIO_STAGE_DDT_WRITE = 1 << 9, /* -W--- */ + ZIO_STAGE_DDT_FREE = 1 << 10, /* --F-- */ - ZIO_STAGE_DVA_ALLOCATE = 1 << 12, /* -W--- */ - ZIO_STAGE_DVA_FREE = 1 << 13, /* --F-- */ - ZIO_STAGE_DVA_CLAIM = 1 << 14, /* ---C- */ + ZIO_STAGE_GANG_ASSEMBLE = 1 << 11, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE = 1 << 12, /* RWFC- */ - ZIO_STAGE_READY = 1 << 15, /* RWFCI */ + ZIO_STAGE_DVA_ALLOCATE = 1 << 13, /* -W--- */ + ZIO_STAGE_DVA_FREE = 1 << 14, /* --F-- */ + ZIO_STAGE_DVA_CLAIM = 1 << 15, /* ---C- */ - ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RWF-I */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RWF-- */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RWF-I */ + ZIO_STAGE_READY = 1 << 16, /* RWFCI */ - ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */ + ZIO_STAGE_VDEV_IO_START = 1 << 17, /* RWF-I */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 18, /* RWF-- */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 19, /* RWF-I */ - ZIO_STAGE_DONE = 1 << 20 /* RWFCI */ + ZIO_STAGE_CHECKSUM_VERIFY = 1 << 20, /* R---- */ + + ZIO_STAGE_DONE = 1 << 21 /* RWFCI */ }; #define ZIO_INTERLOCK_STAGES \ @@ -143,6 +213,7 @@ enum zio_stage { #define ZIO_FREE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_ISSUE_ASYNC | \ ZIO_STAGE_DVA_FREE | \ ZIO_STAGE_VDEV_IO_START | \ ZIO_STAGE_VDEV_IO_ASSESS) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index 78e47e5..5a51dee 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -292,27 +292,74 @@ vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config) return (*config == NULL ? ENOENT : 0); } -static int -vdev_geom_check_config(nvlist_t *config, const char *name, uint64_t *best_txg) +static void +resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id) { - uint64_t vdev_guid; - uint64_t txg; + nvlist_t **new_configs; + uint64_t i; + + if (id < *count) + return; + new_configs = kmem_alloc((id + 1) * sizeof(nvlist_t *), + KM_SLEEP | KM_ZERO); + for (i = 0; i < *count; i++) + new_configs[i] = (*configs)[i]; + if (*configs != NULL) + kmem_free(*configs, *count * sizeof(void *)); + *configs = new_configs; + *count = id + 1; +} + +static void +process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, + const char *name, uint64_t* known_pool_guid) +{ + nvlist_t *vdev_tree; + uint64_t pool_guid; + uint64_t vdev_guid, known_guid; + uint64_t id, txg, known_txg; char *pname; + int i; - if (nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || + if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || strcmp(pname, name) != 0) - return (ENOENT); + goto ignore; - ZFS_LOG(1, "found pool: %s", pname); + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) + goto ignore; - txg = 0; - nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg); - if (txg <= *best_txg) - return (ENOENT); - *best_txg = txg; - ZFS_LOG(1, "txg: %ju", (uintmax_t)*best_txg); + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) + goto ignore; - return (0); + if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) + goto ignore; + + if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) + goto ignore; + + VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + + if (*known_pool_guid != 0) { + if (pool_guid != *known_pool_guid) + goto ignore; + } else + *known_pool_guid = pool_guid; + + resize_configs(configs, count, id); + + if ((*configs)[id] != NULL) { + VERIFY(nvlist_lookup_uint64((*configs)[id], + ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0); + if (txg <= known_txg) + goto ignore; + nvlist_free((*configs)[id]); + } + + (*configs)[id] = cfg; + return; + +ignore: + nvlist_free(cfg); } static int @@ -339,14 +386,15 @@ vdev_geom_detach_taster(struct g_consumer *cp) } int -vdev_geom_read_pool_label(const char *name, nvlist_t **config) +vdev_geom_read_pool_label(const char *name, + nvlist_t ***configs, uint64_t *count) { struct g_class *mp; struct g_geom *gp, *zgp; struct g_provider *pp; struct g_consumer *zcp; nvlist_t *vdev_cfg; - uint64_t best_txg; + uint64_t pool_guid; int error; DROP_GIANT(); @@ -357,8 +405,9 @@ vdev_geom_read_pool_label(const char *name, nvlist_t **config) zgp->orphan = vdev_geom_taste_orphan; zcp = g_new_consumer(zgp); - best_txg = 0; - *config = NULL; + *configs = NULL; + *count = 0; + pool_guid = 0; LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; @@ -378,14 +427,8 @@ vdev_geom_read_pool_label(const char *name, nvlist_t **config) continue; ZFS_LOG(1, "successfully read vdev config"); - error = vdev_geom_check_config(vdev_cfg, name, - &best_txg); - if (error != 0) { - nvlist_free(vdev_cfg); - continue; - } - nvlist_free(*config); - *config = vdev_cfg; + process_vdev_config(configs, count, + vdev_cfg, name, &pool_guid); } } } @@ -394,7 +437,8 @@ vdev_geom_read_pool_label(const char *name, nvlist_t **config) g_destroy_geom(zgp); g_topology_unlock(); PICKUP_GIANT(); - return (*config == NULL ? ENOENT : 0); + + return (*count > 0 ? 0 : ENOENT); } static uint64_t diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index f110d8c..e4157df 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -3836,6 +3836,12 @@ zfs_ioc_recv(zfs_cmd_t *zc) error = 1; } #endif + +#ifdef __FreeBSD__ + if (error == 0) + zvol_create_minors(tofs); +#endif + /* * On error, restore the original props. */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 2c7f423..72c22dc 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -1847,18 +1847,6 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) zfsvfs->z_unmounted = B_TRUE; rrw_exit(&zfsvfs->z_teardown_lock, FTAG); rw_exit(&zfsvfs->z_teardown_inactive_lock); - -#ifdef __FreeBSD__ - /* - * Some znodes might not be fully reclaimed, wait for them. - */ - mutex_enter(&zfsvfs->z_znodes_lock); - while (list_head(&zfsvfs->z_all_znodes) != NULL) { - msleep(zfsvfs, &zfsvfs->z_znodes_lock, 0, - "zteardown", 0); - } - mutex_exit(&zfsvfs->z_znodes_lock); -#endif } /* @@ -2123,7 +2111,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) VN_HOLD(*vpp); } ZFS_EXIT(zfsvfs); - err = zfs_vnode_lock(*vpp, flags | LK_RETRY); + err = zfs_vnode_lock(*vpp, flags); if (err != 0) *vpp = NULL; return (err); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index 6b6f537..330c753 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -1203,6 +1203,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) DMU_READ_NO_PREFETCH); if (error == 0) { + blkptr_t *obp = dmu_buf_get_blkptr(db); + if (obp) { + ASSERT(BP_IS_HOLE(bp)); + *bp = *obp; + } + zgd->zgd_db = db; zgd->zgd_bp = bp; @@ -1908,6 +1914,9 @@ top: } if (delete_now) { +#ifdef __FreeBSD__ + panic("zfs_remove: delete_now branch taken"); +#endif if (xattr_obj_unlinked) { ASSERT3U(xzp->z_links, ==, 2); mutex_enter(&xzp->z_lock); @@ -1937,6 +1946,9 @@ top: } else if (unlinked) { mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); +#ifdef __FreeBSD__ + vp->v_vflag |= VV_NOSYNC; +#endif } txtype = TX_REMOVE; @@ -2667,7 +2679,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &crtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (vp->v_type == VBLK || vp->v_type == VCHR) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); @@ -3245,6 +3257,12 @@ top: uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && + !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { + err = EPERM; + goto out; + } + if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) goto out; @@ -4560,14 +4578,22 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ - VI_LOCK(vp); - ASSERT(vp->v_count <= 1); - vp->v_count = 0; - VI_UNLOCK(vp); + rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); + return; + } + + mutex_enter(&zp->z_lock); + if (zp->z_unlinked) { + /* + * Fast path to recycle a vnode of a removed file. + */ + mutex_exit(&zp->z_lock); rw_exit(&zfsvfs->z_teardown_inactive_lock); + vrecycle(vp); return; } + mutex_exit(&zp->z_lock); if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); @@ -4586,8 +4612,6 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) dmu_tx_commit(tx); } } - - zfs_zinactive(zp); rw_exit(&zfsvfs->z_teardown_inactive_lock); } @@ -5550,34 +5574,61 @@ zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage) znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zp->z_zfsvfs->z_os; - vm_page_t mreq; + vm_page_t mfirst, mlast, mreq; vm_object_t object; caddr_t va; struct sf_buf *sf; + off_t startoff, endoff; int i, error; - int pcount, size; + vm_pindex_t reqstart, reqend; + int pcount, lsize, reqsize, size; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - pcount = round_page(count) / PAGE_SIZE; + pcount = OFF_TO_IDX(round_page(count)); mreq = m[reqpage]; object = mreq->object; error = 0; KASSERT(vp->v_object == object, ("mismatching object")); + if (pcount > 1 && zp->z_blksz > PAGESIZE) { + startoff = rounddown(IDX_TO_OFF(mreq->pindex), zp->z_blksz); + reqstart = OFF_TO_IDX(round_page(startoff)); + if (reqstart < m[0]->pindex) + reqstart = 0; + else + reqstart = reqstart - m[0]->pindex; + endoff = roundup(IDX_TO_OFF(mreq->pindex) + PAGE_SIZE, + zp->z_blksz); + reqend = OFF_TO_IDX(trunc_page(endoff)) - 1; + if (reqend > m[pcount - 1]->pindex) + reqend = m[pcount - 1]->pindex; + reqsize = reqend - m[reqstart]->pindex + 1; + KASSERT(reqstart <= reqpage && reqpage < reqstart + reqsize, + ("reqpage beyond [reqstart, reqstart + reqsize[ bounds")); + } else { + reqstart = reqpage; + reqsize = 1; + } + mfirst = m[reqstart]; + mlast = m[reqstart + reqsize - 1]; + VM_OBJECT_LOCK(object); - for (i = 0; i < pcount; i++) { - if (i != reqpage) { - vm_page_lock(m[i]); - vm_page_free(m[i]); - vm_page_unlock(m[i]); - } + for (i = 0; i < reqstart; i++) { + vm_page_lock(m[i]); + vm_page_free(m[i]); + vm_page_unlock(m[i]); + } + for (i = reqstart + reqsize; i < pcount; i++) { + vm_page_lock(m[i]); + vm_page_free(m[i]); + vm_page_unlock(m[i]); } - if (mreq->valid) { + if (mreq->valid && reqsize == 1) { if (mreq->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(mreq, TRUE); VM_OBJECT_UNLOCK(object); @@ -5586,30 +5637,50 @@ zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage) } PCPU_INC(cnt.v_vnodein); - PCPU_INC(cnt.v_vnodepgsin); + PCPU_ADD(cnt.v_vnodepgsin, reqsize); if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) { + for (i = reqstart; i < reqstart + reqsize; i++) { + if (i != reqpage) { + vm_page_lock(m[i]); + vm_page_free(m[i]); + vm_page_unlock(m[i]); + } + } VM_OBJECT_UNLOCK(object); ZFS_EXIT(zfsvfs); return (VM_PAGER_BAD); } - size = PAGE_SIZE; - if (IDX_TO_OFF(mreq->pindex) + size > object->un_pager.vnp.vnp_size) - size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mreq->pindex); + lsize = PAGE_SIZE; + if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size) + lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex); VM_OBJECT_UNLOCK(object); - va = zfs_map_page(mreq, &sf); - error = dmu_read(os, zp->z_id, IDX_TO_OFF(mreq->pindex), - size, va, DMU_READ_PREFETCH); - if (size != PAGE_SIZE) - bzero(va + size, PAGE_SIZE - size); - zfs_unmap_page(sf); + + for (i = reqstart; i < reqstart + reqsize; i++) { + size = PAGE_SIZE; + if (i == (reqstart + reqsize - 1)) + size = lsize; + va = zfs_map_page(m[i], &sf); + error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex), + size, va, DMU_READ_PREFETCH); + if (size != PAGE_SIZE) + bzero(va + size, PAGE_SIZE - size); + zfs_unmap_page(sf); + if (error != 0) + break; + } + VM_OBJECT_LOCK(object); - if (!error) - mreq->valid = VM_PAGE_BITS_ALL; - KASSERT(mreq->dirty == 0, ("zfs_getpages: page %p is dirty", mreq)); + for (i = reqstart; i < reqstart + reqsize; i++) { + if (!error) + m[i]->valid = VM_PAGE_BITS_ALL; + KASSERT(m[i]->dirty == 0, ("zfs_getpages: page %p is dirty", m[i])); + if (i != reqpage) + vm_page_readahead_finish(m[i]); + } VM_OBJECT_UNLOCK(object); @@ -5633,6 +5704,30 @@ zfs_freebsd_getpages(ap) } static int +zfs_freebsd_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct bufobj **a_bop; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; + } */ *ap; +{ + + if (ap->a_bop != NULL) + *ap->a_bop = &ap->a_vp->v_bufobj; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + if (ap->a_runp != NULL) + *ap->a_runp = 0; + if (ap->a_runb != NULL) + *ap->a_runb = 0; + + return (0); +} + +static int zfs_freebsd_open(ap) struct vop_open_args /* { struct vnode *a_vp; @@ -6103,28 +6198,6 @@ zfs_freebsd_inactive(ap) return (0); } -static void -zfs_reclaim_complete(void *arg, int pending) -{ - znode_t *zp = arg; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); - if (zp->z_sa_hdl != NULL) { - ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id); - zfs_znode_dmu_fini(zp); - ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); - } - zfs_znode_free(zp); - rw_exit(&zfsvfs->z_teardown_inactive_lock); - /* - * If the file system is being unmounted, there is a process waiting - * for us, wake it up. - */ - if (zfsvfs->z_unmounted) - wakeup_one(zfsvfs); -} - static int zfs_freebsd_reclaim(ap) struct vop_reclaim_args /* { @@ -6135,53 +6208,25 @@ zfs_freebsd_reclaim(ap) vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - boolean_t rlocked; - - rlocked = rw_tryenter(&zfsvfs->z_teardown_inactive_lock, RW_READER); ASSERT(zp != NULL); - /* - * Destroy the vm object and flush associated pages. - */ + /* Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); - mutex_enter(&zp->z_lock); - zp->z_vnode = NULL; - mutex_exit(&zp->z_lock); - - if (zp->z_unlinked) { - ; /* Do nothing. */ - } else if (!rlocked) { - TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp); - taskqueue_enqueue(taskqueue_thread, &zp->z_task); - } else if (zp->z_sa_hdl == NULL) { + /* + * z_teardown_inactive_lock protects from a race with + * zfs_znode_dmu_fini in zfsvfs_teardown during + * force unmount. + */ + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_sa_hdl == NULL) zfs_znode_free(zp); - } else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ { - int locked; + else + zfs_zinactive(zp); + rw_exit(&zfsvfs->z_teardown_inactive_lock); - locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 : - ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id); - if (locked == 0) { - /* - * Lock can't be obtained due to deadlock possibility, - * so defer znode destruction. - */ - TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp); - taskqueue_enqueue(taskqueue_thread, &zp->z_task); - } else { - zfs_znode_dmu_fini(zp); - if (locked == 1) - ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); - zfs_znode_free(zp); - } - } - VI_LOCK(vp); vp->v_data = NULL; - ASSERT(vp->v_holdcnt >= 1); - VI_UNLOCK(vp); - if (rlocked) - rw_exit(&zfsvfs->z_teardown_inactive_lock); return (0); } @@ -6737,7 +6782,7 @@ struct vop_vector zfs_vnodeops = { .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_pathconf = zfs_freebsd_pathconf, - .vop_bmap = VOP_EOPNOTSUPP, + .vop_bmap = zfs_freebsd_bmap, .vop_fid = zfs_freebsd_fid, .vop_getextattr = zfs_getextattr, .vop_deleteextattr = zfs_deleteextattr, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c index c2ed964..18e54d2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -1147,14 +1147,16 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; - int err; + vnode_t *vp; sa_handle_t *hdl; - int first = 1; - - *zpp = NULL; + struct thread *td; + int locked; + int err; + td = curthread; getnewvnode_reserve(1); again: + *zpp = NULL; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); @@ -1193,48 +1195,38 @@ again: if (zp->z_unlinked) { err = ENOENT; } else { - vnode_t *vp; - int dying = 0; - vp = ZTOV(zp); - if (vp == NULL) - dying = 1; - else { - VN_HOLD(vp); - if ((vp->v_iflag & VI_DOOMED) != 0) { - dying = 1; - /* - * Don't VN_RELE() vnode here, because - * it can call vn_lock() which creates - * LOR between vnode lock and znode - * lock. We will VN_RELE() the vnode - * after droping znode lock. - */ - } - } - if (dying) { - if (first) { - ZFS_LOG(1, "dying znode detected (zp=%p)", zp); - first = 0; - } - /* - * znode is dying so we can't reuse it, we must - * wait until destruction is completed. - */ - sa_buf_rele(db, NULL); - mutex_exit(&zp->z_lock); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - if (vp != NULL) - VN_RELE(vp); - tsleep(zp, 0, "zcollide", 1); - goto again; - } *zpp = zp; err = 0; } sa_buf_rele(db, NULL); + + /* Don't let the vnode disappear after ZFS_OBJ_HOLD_EXIT. */ + if (err == 0) + VN_HOLD(vp); + mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + + if (err == 0) { + locked = VOP_ISLOCKED(vp); + VI_LOCK(vp); + if ((vp->v_iflag & VI_DOOMED) != 0 && + locked != LK_EXCLUSIVE) { + /* + * The vnode is doomed and this thread doesn't + * hold the exclusive lock on it, so the vnode + * must be being reclaimed by another thread. + * Otherwise the doomed vnode is being reclaimed + * by this thread and zfs_zget is called from + * ZIL internals. + */ + VI_UNLOCK(vp); + VN_RELE(vp); + goto again; + } + VI_UNLOCK(vp); + } getnewvnode_drop_reserve(); return (err); } @@ -1400,7 +1392,6 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) void zfs_zinactive(znode_t *zp) { - vnode_t *vp = ZTOV(zp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t z_id = zp->z_id; @@ -1412,19 +1403,6 @@ zfs_zinactive(znode_t *zp) ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); mutex_enter(&zp->z_lock); - VI_LOCK(vp); - if (vp->v_count > 0) { - /* - * If the hold count is greater than zero, somebody has - * obtained a new reference on this znode while we were - * processing it here, so we are done. - */ - VI_UNLOCK(vp); - mutex_exit(&zp->z_lock); - ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); - return; - } - VI_UNLOCK(vp); /* * If this was the last reference to a file with no links, @@ -1433,14 +1411,14 @@ zfs_zinactive(znode_t *zp) if (zp->z_unlinked) { mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); - ASSERT(vp->v_count == 0); - vrecycle(vp); zfs_rmnode(zp); return; } mutex_exit(&zp->z_lock); + zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + zfs_znode_free(zp); } void @@ -1448,8 +1426,8 @@ zfs_znode_free(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ASSERT(ZTOV(zp) == NULL); ASSERT(zp->z_sa_hdl == NULL); + zp->z_vnode = NULL; mutex_enter(&zfsvfs->z_znodes_lock); POINTER_INVALIDATE(&zp->z_zfsvfs); list_remove(&zfsvfs->z_all_znodes, zp); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index be7d274..c2720dc 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -107,6 +107,31 @@ extern vmem_t *zio_alloc_arena; extern int zfs_mg_alloc_failures; /* + * The following actions directly effect the spa's sync-to-convergence logic. + * The values below define the sync pass when we start performing the action. + * Care should be taken when changing these values as they directly impact + * spa_sync() performance. Tuning these values may introduce subtle performance + * pathologies and should only be done in the context of performance analysis. + * These tunables will eventually be removed and replaced with #defines once + * enough analysis has been done to determine optimal values. + * + * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that + * regular blocks are not deferred. + */ +int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ +TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free); +SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, + &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); +int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ +TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress); +SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, + &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); +int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ +TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite); +SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, + &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); + +/* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ @@ -684,9 +709,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && - zp->zp_copies <= spa_max_replication(spa) && - zp->zp_dedup <= 1 && - zp->zp_dedup_verify <= 1); + zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, size, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, @@ -714,13 +737,20 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, } void -zio_write_override(zio_t *zio, blkptr_t *bp, int copies) +zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); + /* + * We must reset the io_prop to match the values that existed + * when the bp was first written by dmu_sync() keeping in mind + * that nopwrite and dedup are mutually exclusive. + */ + zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; + zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } @@ -742,7 +772,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); - ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); + ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); zio = zio_create(pio, spa, txg, bp, NULL, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, @@ -1020,6 +1050,19 @@ zio_write_bp_init(zio_t *zio) *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + /* + * If we've been overridden and nopwrite is set then + * set the flag accordingly to indicate that a nopwrite + * has already occurred. + */ + if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { + ASSERT(!zp->zp_dedup); + zio->io_flags |= ZIO_FLAG_NOPWRITE; + return (ZIO_PIPELINE_CONTINUE); + } + + ASSERT(!zp->zp_nopwrite); + if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (ZIO_PIPELINE_CONTINUE); @@ -1051,7 +1094,7 @@ zio_write_bp_init(zio_t *zio) ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); - if (pass > SYNC_PASS_DONT_COMPRESS) + if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ @@ -1080,7 +1123,7 @@ zio_write_bp_init(zio_t *zio) * There should only be a handful of blocks after pass 1 in any case. */ if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && - pass > SYNC_PASS_REWRITE) { + pass >= zfs_sync_pass_rewrite) { ASSERT(psize != 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; @@ -1107,6 +1150,11 @@ zio_write_bp_init(zio_t *zio) ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } + if (zp->zp_nopwrite) { + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); + zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; + } } return (ZIO_PIPELINE_CONTINUE); @@ -1328,6 +1376,7 @@ zio_reexecute(zio_t *pio) pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; + pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; @@ -1804,8 +1853,9 @@ zio_write_gang_block(zio_t *pio) zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; - zp.zp_dedup = 0; - zp.zp_dedup_verify = 0; + zp.zp_dedup = B_FALSE; + zp.zp_dedup_verify = B_FALSE; + zp.zp_nopwrite = B_FALSE; zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, @@ -1825,6 +1875,62 @@ zio_write_gang_block(zio_t *pio) } /* + * The zio_nop_write stage in the pipeline determines if allocating + * a new bp is necessary. By leveraging a cryptographically secure checksum, + * such as SHA256, we can compare the checksums of the new data and the old + * to determine if allocating a new block is required. The nopwrite + * feature can handle writes in either syncing or open context (i.e. zil + * writes) and as a result is mutually exclusive with dedup. + */ +static int +zio_nop_write(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + zio_prop_t *zp = &zio->io_prop; + + ASSERT(BP_GET_LEVEL(bp) == 0); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); + ASSERT(zp->zp_nopwrite); + ASSERT(!zp->zp_dedup); + ASSERT(zio->io_bp_override == NULL); + ASSERT(IO_IS_ALLOCATING(zio)); + + /* + * Check to see if the original bp and the new bp have matching + * characteristics (i.e. same checksum, compression algorithms, etc). + * If they don't then just continue with the pipeline which will + * allocate a new bp. + */ + if (BP_IS_HOLE(bp_orig) || + !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || + BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || + BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || + BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || + zp->zp_copies != BP_GET_NDVAS(bp_orig)) + return (ZIO_PIPELINE_CONTINUE); + + /* + * If the checksums match then reset the pipeline so that we + * avoid allocating a new bp and issuing any I/O. + */ + if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { + ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); + ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); + ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); + ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); + ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, + sizeof (uint64_t)) == 0); + + *bp = *bp_orig; + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + zio->io_flags |= ZIO_FLAG_NOPWRITE; + } + + return (ZIO_PIPELINE_CONTINUE); +} + +/* * ========================================================================== * Dedup * ========================================================================== @@ -2096,7 +2202,7 @@ zio_ddt_write(zio_t *zio) zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { - zp->zp_dedup = 0; + zp->zp_dedup = B_FALSE; } zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); @@ -2753,7 +2859,8 @@ zio_ready(zio_t *zio) if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); + ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || + (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); @@ -2835,6 +2942,8 @@ zio_done(zio_t *zio) ASSERT(BP_COUNT_GANG(bp) == 0 || (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); } + if (zio->io_flags & ZIO_FLAG_NOPWRITE) + VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); } /* @@ -2944,7 +3053,7 @@ zio_done(zio_t *zio) if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && - !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) + !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, bp); zio_gang_tree_free(&zio->io_gang_tree); @@ -3088,6 +3197,7 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_issue_async, zio_write_bp_init, zio_checksum_generate, + zio_nop_write, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c index 07fd3c6..e5d159b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -78,6 +78,7 @@ #include <sys/vdev_impl.h> #include <sys/zvol.h> #include <sys/zil_impl.h> +#include <sys/dbuf.h> #include <geom/geom.h> #include "zfs_namecheck.h" @@ -1051,6 +1052,12 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { + blkptr_t *obp = dmu_buf_get_blkptr(db); + if (obp) { + ASSERT(BP_IS_HOLE(bp)); + *bp = *obp; + } + zgd->zgd_db = db; zgd->zgd_bp = bp; |