summaryrefslogtreecommitdiffstats
path: root/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
diff options
context:
space:
mode:
authordelphij <delphij@FreeBSD.org>2014-12-22 20:58:51 +0000
committerdelphij <delphij@FreeBSD.org>2014-12-22 20:58:51 +0000
commit1ad38ed4f01c38401f2c15151edbcfab81168db1 (patch)
treebf517de79b70790f455a71edd81bb6303f96f90e /sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
parentaf2ee162da52913fdefa6d0ffae3644e39a3b369 (diff)
downloadFreeBSD-src-1ad38ed4f01c38401f2c15151edbcfab81168db1.zip
FreeBSD-src-1ad38ed4f01c38401f2c15151edbcfab81168db1.tar.gz
MFC r274337,r274673,r274681,r275515:
ZFS large block support. The default recordsize remains at 128KB. A new tunable/sysctl variable, vfs.zfs.max_recordsize, is added to allow adjusting the permitted maximum record size (zfs_max_recordsize), with a default of 1MB. As a safety belt, ZFS will not allow setting recordsize greater than zfs_max_recordsize, because a larger recordsize means greater read and write latency and more memory usage. Please note that booting from datasets that have a recordsize greater than 128KB is not supported (but it is okay to enable the feature on the pool). A limited safety belt is provided for the mounted root filesystem, but use caution when using a larger value. Illumos issue: 5027 zfs large block support
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c')
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c110
1 files changed, 108 insertions, 2 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
index a3efe92..84d9f01 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
@@ -51,6 +51,22 @@
#include <sys/dsl_userhold.h>
#include <sys/dsl_bookmark.h>
+SYSCTL_DECL(_vfs_zfs); /* vfs.zfs node that max_recordsize below attaches to */
+
+/*
+ * The SPA supports block sizes up to 16MB. However, very large blocks
+ * can have an impact on i/o latency (e.g. tying up a spinning disk for
+ * ~300ms), and also potentially on the memory allocator. Therefore,
+ * we do not allow the recordsize to be set larger than zfs_max_recordsize
+ * (default 1MB). Larger blocks can be created by changing this tunable,
+ * and pools with larger blocks can always be imported and used, regardless
+ * of this setting.
+ *
+ * The limit is exported below as the vfs.zfs.max_recordsize sysctl/tunable.
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024; /* in bytes (1MB) */
+SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
+ &zfs_max_recordsize, 0,
+ "Maximum block size. Expect dragons when tuning this.");
+
#define SWITCH64(x, y) \
{ \
uint64_t __tmp = (x); \
@@ -60,8 +76,6 @@
#define DS_REF_MAX (1ULL << 62)
-#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
-
/*
* Figure out how much of this delta should be propogated to the dsl_dir
* layer. If there's a refreservation, that space has already been
@@ -111,6 +125,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
ds->ds_phys->ds_compressed_bytes += compressed;
ds->ds_phys->ds_uncompressed_bytes += uncompressed;
ds->ds_phys->ds_unique_bytes += used;
+ if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
+ ds->ds_need_large_blocks = B_TRUE;
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
compressed, uncompressed, tx);
@@ -392,6 +408,14 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
offsetof(dmu_sendarg_t, dsa_link));
+ if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+ int zaperr = zap_contains(mos, dsobj, DS_FIELD_LARGE_BLOCKS);
+ if (zaperr != ENOENT) {
+ VERIFY0(zaperr);
+ ds->ds_large_blocks = B_TRUE;
+ }
+ }
+
if (err == 0) {
err = dsl_dir_hold_obj(dp,
ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
@@ -707,6 +731,9 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
dsphys->ds_flags |= origin->ds_phys->ds_flags &
(DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
+ if (origin->ds_large_blocks)
+ dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+
dmu_buf_will_dirty(origin->ds_dbuf, tx);
origin->ds_phys->ds_num_children++;
@@ -1262,6 +1289,9 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
dsphys->ds_bp = ds->ds_phys->ds_bp;
dmu_buf_rele(dbuf, FTAG);
+ if (ds->ds_large_blocks)
+ dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+
ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
if (ds->ds_prev) {
uint64_t next_clones_obj =
@@ -1546,6 +1576,11 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
dmu_objset_sync(ds->ds_objset, zio, tx);
+
+ if (ds->ds_need_large_blocks && !ds->ds_large_blocks) {
+ dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
+ ds->ds_large_blocks = B_TRUE;
+ }
}
static void
@@ -3231,6 +3266,77 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
return (err);
}
+static int
+dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx)
+{
+ const char *dsname = arg;
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int error = 0;
+ /*
+  * Check callback handed to dsl_sync_task() by
+  * dsl_dataset_activate_large_blocks(); 'arg' is the dataset name.
+  * Returns ENOTSUP when SPA_FEATURE_LARGE_BLOCKS is not enabled on the
+  * pool, the dsl_dataset_hold() error when the dataset cannot be held,
+  * EALREADY when the dataset already has ds_large_blocks set, and 0
+  * otherwise.
+  */
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+ /*
+  * NOTE(review): presumably needed because activation zapifies the
+  * dataset object (see the _sync_impl helper) -- confirm.
+  */
+ ASSERT(spa_feature_is_enabled(dp->dp_spa,
+ SPA_FEATURE_EXTENSIBLE_DATASET));
+
+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+ /* Already activated: report EALREADY, which the caller maps to 0. */
+ if (ds->ds_large_blocks)
+ error = EALREADY;
+ dsl_dataset_rele(ds, FTAG);
+
+ return (error);
+}
+
+void
+dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+ uint64_t zero = 0;
+ /*
+  * Mark dataset object 'dsobj' as using large blocks within 'tx':
+  * call spa_feature_incr() for SPA_FEATURE_LARGE_BLOCKS, zapify the
+  * object in the MOS, and add a DS_FIELD_LARGE_BLOCKS entry to it.
+  * No in-core dsl_dataset_t is touched here; callers that hold one
+  * set ds_large_blocks themselves (see dsl_dataset_sync()).
+  */
+ spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+ /*
+  * The stored value (0) is a placeholder: dsl_dataset_hold_obj() only
+  * tests for the entry's presence with zap_contains().
+  */
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS,
+ sizeof (zero), 1, &zero, tx));
+}
+
+static void
+dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx)
+{
+ const char *dsname = arg;
+ dsl_dataset_t *ds;
+ /*
+  * Sync callback handed to dsl_sync_task() by
+  * dsl_dataset_activate_large_blocks(); 'arg' is the dataset name.
+  * Holds the dataset, records the activation for its object via
+  * dsl_dataset_activate_large_blocks_sync_impl(), sets ds_large_blocks
+  * on the held dataset, and releases it.
+  */
+ VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds));
+ /*
+  * ds_large_blocks is expected to be clear (asserted below): the check
+  * callback returns EALREADY when it is already set.
+  */
+ dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
+ ASSERT(!ds->ds_large_blocks);
+ ds->ds_large_blocks = B_TRUE;
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_activate_large_blocks(const char *dsname)
+{
+ int error;
+ /*
+  * Activate large blocks on the dataset named 'dsname' by running the
+  * _check/_sync callbacks through dsl_sync_task(). The name is passed
+  * as the callbacks' void *arg; both only read it as a const string.
+  * Returns 0 on success or when the dataset was already activated,
+  * otherwise the error from dsl_sync_task().
+  */
+ error = dsl_sync_task(dsname,
+ dsl_dataset_activate_large_blocks_check,
+ dsl_dataset_activate_large_blocks_sync, (void *)dsname,
+ 1, ZFS_SPACE_CHECK_RESERVED);
+
+ /*
+ * EALREADY indicates that this dataset already supports large blocks.
+ */
+ if (error == EALREADY)
+ error = 0;
+ return (error);
+}
+
/*
* Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
* For example, they could both be snapshots of the same filesystem, and
OpenPOWER on IntegriCloud