summaryrefslogtreecommitdiffstats
path: root/sys/cddl
diff options
context:
space:
mode:
authordelphij <delphij@FreeBSD.org>2014-12-22 20:58:51 +0000
committerdelphij <delphij@FreeBSD.org>2014-12-22 20:58:51 +0000
commit1ad38ed4f01c38401f2c15151edbcfab81168db1 (patch)
treebf517de79b70790f455a71edd81bb6303f96f90e /sys/cddl
parentaf2ee162da52913fdefa6d0ffae3644e39a3b369 (diff)
downloadFreeBSD-src-1ad38ed4f01c38401f2c15151edbcfab81168db1.zip
FreeBSD-src-1ad38ed4f01c38401f2c15151edbcfab81168db1.tar.gz
MFC r274337,r274673,274681,r275515:
ZFS large block support. The default recordsize remains at 128KB. A new tunable/sysctl variable, vfs.zfs.max_recordsize is added to allow adjusting the permitted maximum record size, or zfs_max_recordsize, with a default of 1MB. ZFS will not allow setting recordsize greater than zfs_max_recordsize as a safety belt, because larger recordsize means greater read and write latency and more memory usage. Please note that booting from datasets that have recordsize greater than 128KB is not supported (but it's Okay to enable the feature on the pool). Limited safety belt is provided for mounted root filesystem but use caution when using a larger value. Illumos issue: 5027 zfs large block support
Diffstat (limited to 'sys/cddl')
-rw-r--r--sys/cddl/boot/zfs/zfsimpl.h7
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c12
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h1
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c4
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c5
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c16
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c87
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c24
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c10
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c110
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c8
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c7
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c4
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c26
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c9
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h3
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h1
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h8
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h11
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h22
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h3
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h5
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h1
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c16
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c58
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c9
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c8
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c8
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c11
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c9
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c11
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h1
44 files changed, 425 insertions, 130 deletions
diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h
index 5f2e255..730cdf4 100644
--- a/sys/cddl/boot/zfs/zfsimpl.h
+++ b/sys/cddl/boot/zfs/zfsimpl.h
@@ -113,17 +113,14 @@
#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
/*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
+ * Note: the boot loader can't actually read blocks larger than 128KB,
+ * due to lack of memory. Therefore its SPA_MAXBLOCKSIZE is still 128KB.
*/
#define SPA_MINBLOCKSHIFT 9
#define SPA_MAXBLOCKSHIFT 17
#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
-#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
-
/*
* The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
* The ASIZE encoding should be at least 64 times larger (6 more bits)
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
index 65d2858..52a355d 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
@@ -56,7 +56,8 @@ valid_char(char c, boolean_t after_colon)
{
return ((c >= 'a' && c <= 'z') ||
(c >= '0' && c <= '9') ||
- c == (after_colon ? '_' : '.'));
+ (after_colon && c == '_') ||
+ (!after_colon && (c == '.' || c == '-')));
}
/*
@@ -220,4 +221,13 @@ zpool_feature_init(void)
"com.delphix:embedded_data", "embedded_data",
"Blocks which compress very well use even less space.",
B_FALSE, B_TRUE, B_TRUE, NULL);
+
+ static const spa_feature_t large_blocks_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
+ "org.open-zfs:large_blocks", "large_blocks",
+ "Support for blocks larger than 128KB.", B_FALSE, B_FALSE, B_FALSE,
+ large_blocks_deps);
}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
index 65016f1..4ffe435 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
@@ -50,6 +50,7 @@ typedef enum spa_feature {
SPA_FEATURE_EMBEDDED_DATA,
SPA_FEATURE_BOOKMARKS,
SPA_FEATURE_FS_SS_LIMIT,
+ SPA_FEATURE_LARGE_BLOCKS,
SPA_FEATURES
} spa_feature_t;
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
index bd023c7..dda72de 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
@@ -409,8 +409,8 @@ zfs_prop_init(void)
/* inherit number properties */
zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
- SPA_MAXBLOCKSIZE, PROP_INHERIT,
- ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE");
+ SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
/* hidden properties */
zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
index a400f82..4d906b0 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
@@ -127,6 +127,8 @@ zpool_prop_init(void)
/* hidden properties */
zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
PROP_READONLY, ZFS_TYPE_POOL, "NAME");
+ zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE");
}
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
index e75ae72..da4d38a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
@@ -43,7 +43,7 @@ bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
ASSERT0(dp->dp_empty_bpobj);
dp->dp_empty_bpobj =
- bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
+ bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY(zap_add(os,
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
@@ -396,7 +396,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
if (bpo->bpo_phys->bpo_subobjs == 0) {
bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
- DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+ DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OT_NONE, 0, tx);
}
dmu_object_info_t doi;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
index c724ed0..5f7d76f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
@@ -65,7 +65,7 @@ bptree_alloc(objset_t *os, dmu_tx_t *tx)
bptree_phys_t *bt;
obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
- SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
sizeof (bptree_phys_t), tx);
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index c9ea6c0..0b9a0b9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -2022,10 +2022,8 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
return (SET_ERROR(ENOTSUP));
if (blksz == 0)
blksz = SPA_MINBLOCKSIZE;
- if (blksz > SPA_MAXBLOCKSIZE)
- blksz = SPA_MAXBLOCKSIZE;
- else
- blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+ ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
+ blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
index 73b8e05..e7aeed1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
@@ -255,6 +255,14 @@ logbias_changed_cb(void *arg, uint64_t newval)
zil_set_logbias(os->os_zil, newval);
}
+static void
+recordsize_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ os->os_recordsize = newval;
+}
+
void
dmu_objset_byteswap(void *buf, size_t size)
{
@@ -384,6 +392,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
ZFS_PROP_REDUNDANT_METADATA),
redundant_metadata_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ recordsize_changed_cb, os);
+ }
}
if (err != 0) {
VERIFY(arc_buf_remove_ref(os->os_phys_buf,
@@ -642,6 +655,9 @@ dmu_objset_evict(objset_t *os)
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
redundant_metadata_changed_cb, os));
+ VERIFY0(dsl_prop_unregister(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ recordsize_changed_cb, os));
}
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
index 1a0cab5..00d4f3e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -227,11 +227,12 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
drrw->drr_offset = offset;
drrw->drr_length = blksz;
drrw->drr_toguid = dsp->dsa_toguid;
- if (BP_IS_EMBEDDED(bp)) {
+ if (bp == NULL || BP_IS_EMBEDDED(bp)) {
/*
- * There's no pre-computed checksum of embedded BP's, so
- * (like fletcher4-checkummed blocks) userland will have
- * to compute a dedup-capable checksum itself.
+ * There's no pre-computed checksum for partial-block
+ * writes or embedded BP's, so (like
+ * fletcher4-checkummed blocks) userland will have to
+ * compute a dedup-capable checksum itself.
*/
drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
} else {
@@ -393,6 +394,10 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
drro->drr_compress = dnp->dn_compress;
drro->drr_toguid = dsp->dsa_toguid;
+ if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+ drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
+
if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
return (SET_ERROR(EINTR));
@@ -512,6 +517,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
uint32_t aflags = ARC_WAIT;
arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);
+ uint64_t offset;
ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT0(zb->zb_level);
@@ -532,8 +538,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
}
}
- err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
- blksz, bp, abuf->b_data);
+ offset = zb->zb_blkid * blksz;
+
+ if (!(dsp->dsa_featureflags &
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ blksz > SPA_OLD_MAXBLOCKSIZE) {
+ char *buf = abuf->b_data;
+ while (blksz > 0 && err == 0) {
+ int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+ err = dump_write(dsp, type, zb->zb_object,
+ offset, n, NULL, buf);
+ offset += n;
+ buf += n;
+ blksz -= n;
+ }
+ } else {
+ err = dump_write(dsp, type, zb->zb_object,
+ offset, blksz, bp, abuf->b_data);
+ }
(void) arc_buf_remove_ref(abuf, &abuf);
}
@@ -548,9 +570,9 @@ static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
#ifdef illumos
- int outfd, vnode_t *vp, offset_t *off)
+ boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
#else
- int outfd, struct file *fp, offset_t *off)
+ boolean_t large_block_ok, int outfd, struct file *fp, offset_t *off)
#endif
{
objset_t *os;
@@ -586,6 +608,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
}
#endif
+ if (large_block_ok && ds->ds_large_blocks)
+ featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
if (embedok &&
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -682,10 +706,11 @@ out:
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+ boolean_t embedok, boolean_t large_block_ok,
#ifdef illumos
- boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
+ int outfd, vnode_t *vp, offset_t *off)
#else
- boolean_t embedok, int outfd, struct file *fp, offset_t *off)
+ int outfd, struct file *fp, offset_t *off)
#endif
{
dsl_pool_t *dp;
@@ -720,18 +745,19 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
zb.zbm_guid = fromds->ds_phys->ds_guid;
is_clone = (fromds->ds_dir != ds->ds_dir);
dsl_dataset_rele(fromds, FTAG);
- err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
- outfd, fp, off);
+ err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+ embedok, large_block_ok, outfd, fp, off);
} else {
- err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
- outfd, fp, off);
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ embedok, large_block_ok, outfd, fp, off);
}
dsl_dataset_rele(ds, FTAG);
return (err);
}
int
-dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+dmu_send(const char *tosnap, const char *fromsnap,
+ boolean_t embedok, boolean_t large_block_ok,
#ifdef illumos
int outfd, vnode_t *vp, offset_t *off)
#else
@@ -802,11 +828,11 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
dsl_pool_rele(dp, FTAG);
return (err);
}
- err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
- outfd, fp, off);
+ err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+ embedok, large_block_ok, outfd, fp, off);
} else {
- err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
- outfd, fp, off);
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ embedok, large_block_ok, outfd, fp, off);
}
if (owned)
dsl_dataset_disown(ds, FTAG);
@@ -1006,6 +1032,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
+ /*
+ * The receiving code doesn't know how to translate large blocks
+ * to smaller ones, so the pool must have the LARGE_BLOCKS
+ * feature enabled if the stream has LARGE_BLOCKS.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
/* target fs already exists; recv into temp clone */
@@ -1131,6 +1166,13 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
}
VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
+ if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ !newds->ds_large_blocks) {
+ dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+ newds->ds_large_blocks = B_TRUE;
+ }
+
dmu_buf_will_dirty(newds->ds_dbuf, tx);
newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
@@ -1283,6 +1325,7 @@ restore_read(struct restorearg *ra, int len, char *buf)
/* some things will require 8-byte alignment, so everything must */
ASSERT0(len % 8);
+ ASSERT3U(len, <=, ra->bufsize);
while (done < len) {
ssize_t resid;
@@ -1420,7 +1463,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
drro->drr_blksz < SPA_MINBLOCKSIZE ||
- drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+ drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) ||
drro->drr_bonuslen > DN_MAX_BONUSLEN) {
return (SET_ERROR(EINVAL));
}
@@ -1693,7 +1736,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
int err;
if (drrs->drr_length < SPA_MINBLOCKSIZE ||
- drrs->drr_length > SPA_MAXBLOCKSIZE)
+ drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os)))
return (SET_ERROR(EINVAL));
data = restore_read(ra, drrs->drr_length, NULL);
@@ -1781,7 +1824,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
ra.td = curthread;
ra.fp = fp;
ra.voff = *voffp;
- ra.bufsize = 1<<20;
+ ra.bufsize = SPA_MAXBLOCKSIZE;
ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
/* these were verified in dmu_recv_begin */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
index ed9757e..b97040a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -224,7 +224,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
return;
min_bs = SPA_MINBLOCKSHIFT;
- max_bs = SPA_MAXBLOCKSHIFT;
+ max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
min_ibs = DN_MIN_INDBLKSHIFT;
max_ibs = DN_MAX_INDBLKSHIFT;
@@ -293,6 +293,14 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
*/
ASSERT(dn->dn_datablkshift != 0);
min_bs = max_bs = dn->dn_datablkshift;
+ } else {
+ /*
+ * The blocksize can increase up to the recordsize,
+ * or if it is already more than the recordsize,
+ * up to the next power of 2.
+ */
+ min_bs = highbit64(dn->dn_datablksz - 1);
+ max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
}
/*
@@ -751,11 +759,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
bp = &dn->dn_phys->dn_blkptr[0];
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
bp, bp->blk_birth))
- txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ;
else
- txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_towrite += MZAP_MAX_BLKSZ;
if (!BP_IS_HOLE(bp))
- txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tounref += MZAP_MAX_BLKSZ;
return;
}
@@ -1549,18 +1557,18 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
/* If blkptr doesn't exist then add space to towrite */
if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
- txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
} else {
blkptr_t *bp;
bp = &dn->dn_phys->dn_spill;
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
bp, bp->blk_birth))
- txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
else
- txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
if (!BP_IS_HOLE(bp))
- txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+ txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE;
}
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
index 0b19e76..b39f6b1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
@@ -513,10 +513,10 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
{
int i;
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (blocksize == 0)
blocksize = 1 << zfs_default_bs;
- else if (blocksize > SPA_MAXBLOCKSIZE)
- blocksize = SPA_MAXBLOCKSIZE;
else
blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
@@ -597,7 +597,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
int nblkptr;
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
- ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
ASSERT0(blocksize % SPA_MINBLOCKSIZE);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
ASSERT(tx->tx_txg != 0);
@@ -1352,10 +1353,9 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
dmu_buf_impl_t *db;
int err;
+ ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (size == 0)
size = SPA_MINBLOCKSIZE;
- if (size > SPA_MAXBLOCKSIZE)
- size = SPA_MAXBLOCKSIZE;
else
size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
index a3efe92..84d9f01 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
@@ -51,6 +51,22 @@
#include <sys/dsl_userhold.h>
#include <sys/dsl_bookmark.h>
+SYSCTL_DECL(_vfs_zfs);
+
+/*
+ * The SPA supports block sizes up to 16MB. However, very large blocks
+ * can have an impact on i/o latency (e.g. tying up a spinning disk for
+ * ~300ms), and also potentially on the memory allocator. Therefore,
+ * we do not allow the recordsize to be set larger than zfs_max_recordsize
+ * (default 1MB). Larger blocks can be created by changing this tunable,
+ * and pools with larger blocks can always be imported and used, regardless
+ * of this setting.
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
+ &zfs_max_recordsize, 0,
+ "Maximum block size. Expect dragons when tuning this.");
+
#define SWITCH64(x, y) \
{ \
uint64_t __tmp = (x); \
@@ -60,8 +76,6 @@
#define DS_REF_MAX (1ULL << 62)
-#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
-
/*
* Figure out how much of this delta should be propogated to the dsl_dir
* layer. If there's a refreservation, that space has already been
@@ -111,6 +125,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
ds->ds_phys->ds_compressed_bytes += compressed;
ds->ds_phys->ds_uncompressed_bytes += uncompressed;
ds->ds_phys->ds_unique_bytes += used;
+ if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
+ ds->ds_need_large_blocks = B_TRUE;
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
compressed, uncompressed, tx);
@@ -392,6 +408,14 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
offsetof(dmu_sendarg_t, dsa_link));
+ if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+ int zaperr = zap_contains(mos, dsobj, DS_FIELD_LARGE_BLOCKS);
+ if (zaperr != ENOENT) {
+ VERIFY0(zaperr);
+ ds->ds_large_blocks = B_TRUE;
+ }
+ }
+
if (err == 0) {
err = dsl_dir_hold_obj(dp,
ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
@@ -707,6 +731,9 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
dsphys->ds_flags |= origin->ds_phys->ds_flags &
(DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
+ if (origin->ds_large_blocks)
+ dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+
dmu_buf_will_dirty(origin->ds_dbuf, tx);
origin->ds_phys->ds_num_children++;
@@ -1262,6 +1289,9 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
dsphys->ds_bp = ds->ds_phys->ds_bp;
dmu_buf_rele(dbuf, FTAG);
+ if (ds->ds_large_blocks)
+ dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+
ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
if (ds->ds_prev) {
uint64_t next_clones_obj =
@@ -1546,6 +1576,11 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
dmu_objset_sync(ds->ds_objset, zio, tx);
+
+ if (ds->ds_need_large_blocks && !ds->ds_large_blocks) {
+ dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
+ ds->ds_large_blocks = B_TRUE;
+ }
}
static void
@@ -3231,6 +3266,77 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
return (err);
}
+static int
+dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx)
+{
+ const char *dsname = arg;
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int error = 0;
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+
+ ASSERT(spa_feature_is_enabled(dp->dp_spa,
+ SPA_FEATURE_EXTENSIBLE_DATASET));
+
+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if (ds->ds_large_blocks)
+ error = EALREADY;
+ dsl_dataset_rele(ds, FTAG);
+
+ return (error);
+}
+
+void
+dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+ uint64_t zero = 0;
+
+ spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS,
+ sizeof (zero), 1, &zero, tx));
+}
+
+static void
+dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx)
+{
+ const char *dsname = arg;
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds));
+
+ dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
+ ASSERT(!ds->ds_large_blocks);
+ ds->ds_large_blocks = B_TRUE;
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_activate_large_blocks(const char *dsname)
+{
+ int error;
+
+ error = dsl_sync_task(dsname,
+ dsl_dataset_activate_large_blocks_check,
+ dsl_dataset_activate_large_blocks_sync, (void *)dsname,
+ 1, ZFS_SPACE_CHECK_RESERVED);
+
+ /*
+ * EALREADY indicates that this dataset already supports large blocks.
+ */
+ if (error == EALREADY)
+ error = 0;
+ return (error);
+}
+
/*
* Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
* For example, they could both be snapshots of the same filesystem, and
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
index 4f39c39..8c8e374 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
@@ -143,7 +143,7 @@ uint64_t
dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
{
if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
- return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));
+ return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
sizeof (dsl_deadlist_phys_t), tx));
}
@@ -180,7 +180,7 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
{
if (dle->dle_bpobj.bpo_object ==
dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
- uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
bpobj_close(&dle->dle_bpobj);
bpobj_decr_empty(dl->dl_os, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
@@ -254,7 +254,7 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
dle->dle_mintxg = mintxg;
- obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
avl_add(&dl->dl_tree, dle);
@@ -338,7 +338,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
if (dle->dle_mintxg >= maxtxg)
break;
- obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
dle->dle_mintxg, obj, tx));
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
index f8a4546..1237641 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
@@ -264,6 +264,10 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
obj = ds->ds_object;
+ if (ds->ds_large_blocks) {
+ ASSERT0(zap_contains(mos, obj, DS_FIELD_LARGE_BLOCKS));
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+ }
if (ds->ds_phys->ds_prev_snap_obj != 0) {
ASSERT3P(ds->ds_prev, ==, NULL);
VERIFY0(dsl_dataset_hold_obj(dp,
@@ -720,6 +724,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
ASSERT0(ds->ds_reserved);
}
+ if (ds->ds_large_blocks)
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+
dsl_scan_ds_destroyed(ds, tx);
obj = ds->ds_object;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
index 6c07fe3..1ffcffa 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
@@ -471,7 +471,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
FREE_DIR_NAME, &dp->dp_free_dir));
/* create and open the free_bplist */
- obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
+ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
VERIFY0(bpobj_open(&dp->dp_free_bpobj,
@@ -896,7 +896,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
* subobj support. So call dmu_object_alloc() directly.
*/
obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
- SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index dc15b3e..9568e6e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -161,7 +161,7 @@ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
* an allocation of this size then it switches to using more
* aggressive strategy (i.e search by size rather than offset).
*/
-uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
+uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold",
&metaslab_df_alloc_threshold);
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
index 5d8c731..84b39dd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
@@ -500,7 +500,7 @@ sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
if (size == 0) {
blocksize = SPA_MINBLOCKSIZE;
- } else if (size > SPA_MAXBLOCKSIZE) {
+ } else if (size > SPA_OLD_MAXBLOCKSIZE) {
ASSERT(0);
return (SET_ERROR(EFBIG));
} else {
@@ -675,7 +675,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
SA_BONUS, &i, &used, &spilling);
- if (used > SPA_MAXBLOCKSIZE)
+ if (used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
@@ -699,7 +699,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
attr_count - i, hdl->sa_spill, SA_SPILL, &i,
&spill_used, &dummy);
- if (spill_used > SPA_MAXBLOCKSIZE)
+ if (spill_used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
buf_space = hdl->sa_spill->db_size - spillhdrsize;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index 568d470..67f0a2d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -284,6 +284,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
0, ZPROP_SRC_LOCAL);
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+ MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+ SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
+ }
+
if ((dp = list_head(&spa->spa_config_list)) != NULL) {
if (dp->scd_path == NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
@@ -498,7 +506,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
if (!error) {
objset_t *os;
- uint64_t compress;
+ uint64_t propval;
if (strval == NULL || strval[0] == '\0') {
objnum = zpool_prop_default_numeric(
@@ -509,15 +517,25 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
if (error = dmu_objset_hold(strval, FTAG, &os))
break;
- /* Must be ZPL and not gzip compressed. */
+ /*
+ * Must be ZPL, and its property settings
+ * must be supported by GRUB (compression
+ * is not gzip, and large blocks are not used).
+ */
if (dmu_objset_type(os) != DMU_OST_ZFS) {
error = SET_ERROR(ENOTSUP);
} else if ((error =
dsl_prop_get_int_ds(dmu_objset_ds(os),
zfs_prop_to_name(ZFS_PROP_COMPRESSION),
- &compress)) == 0 &&
- !BOOTFS_COMPRESS_VALID(compress)) {
+ &propval)) == 0 &&
+ !BOOTFS_COMPRESS_VALID(propval)) {
+ error = SET_ERROR(ENOTSUP);
+ } else if ((error =
+ dsl_prop_get_int_ds(dmu_objset_ds(os),
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ &propval)) == 0 &&
+ propval > SPA_OLD_MAXBLOCKSIZE) {
error = SET_ERROR(ENOTSUP);
} else {
objnum = dmu_objset_id(os);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
index 83e5217..9fc7a46 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
@@ -90,7 +90,7 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
ASSERT(spa->spa_history == 0);
spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
- SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
sizeof (spa_history_phys_t), tx);
VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
index b7a9f07..3122d9a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
@@ -2028,3 +2028,12 @@ spa_debug_enabled(spa_t *spa)
{
return (spa->spa_debug);
}
+
+int
+spa_maxblocksize(spa_t *spa)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SPA_MAXBLOCKSIZE);
+ else
+ return (SPA_OLD_MAXBLOCKSIZE);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
index ad19266..d85d872 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -249,7 +249,7 @@ void zfs_znode_byteswap(void *buf, size_t size);
* The maximum number of bytes that can be accessed as part of one
* operation, including metadata.
*/
-#define DMU_MAX_ACCESS (10<<20) /* 10MB */
+#define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
#define DMU_USERUSED_OBJECT (-1ULL)
@@ -646,6 +646,7 @@ void xuio_stat_wbuf_copied();
void xuio_stat_wbuf_nocopy();
extern int zfs_prefetch_disable;
+extern int zfs_max_recordsize;
/*
* Asynchronously try to read in the data.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
index 23d88fd..804f0c1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
@@ -95,6 +95,7 @@ struct objset {
zfs_cache_type_t os_secondary_cache;
zfs_sync_type_t os_sync;
zfs_redundant_metadata_type_t os_redundant_metadata;
+ int os_recordsize;
/* no lock needed: */
struct dmu_tx *os_synctx; /* XXX sketchy */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
index b5d6170..b03cb09 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
@@ -36,7 +36,8 @@ struct dsl_dataset;
struct drr_begin;
struct avl_tree;
-int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+int dmu_send(const char *tosnap, const char *fromsnap,
+ boolean_t embedok, boolean_t large_block_ok,
#ifdef illumos
int outfd, struct vnode *vp, offset_t *off);
#else
@@ -45,10 +46,11 @@ int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
uint64_t *sizep);
int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+ boolean_t embedok, boolean_t large_block_ok,
#ifdef illumos
- boolean_t embedok, int outfd, vnode_t *vp, offset_t *off);
+ int outfd, struct vnode *vp, offset_t *off);
#else
- boolean_t embedok, int outfd, struct file *fp, offset_t *off);
+ int outfd, struct file *fp, offset_t *off);
#endif
typedef struct dmu_recv_cookie {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
index d9552b2..ff90f8b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -83,6 +83,13 @@ struct dsl_pool;
#define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
/*
+ * This field is present (with value=0) if this dataset may contain large
+ * blocks (>128KB). If it is present, then this dataset
+ * is counted in the refcount of the SPA_FEATURE_LARGE_BLOCKS feature.
+ */
+#define DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks"
+
+/*
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
* name lookups should be performed case-insensitively.
*/
@@ -135,6 +142,8 @@ typedef struct dsl_dataset {
/* only used in syncing context, only valid for non-snapshots: */
struct dsl_dataset *ds_prev;
uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */
+ boolean_t ds_large_blocks;
+ boolean_t ds_need_large_blocks;
/* has internal locking: */
dsl_deadlist_t ds_deadlist;
@@ -244,6 +253,8 @@ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
+int dsl_dataset_activate_large_blocks(const char *dsname);
+void dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx);
int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
index f62e315..a5f32e8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
@@ -94,17 +94,26 @@ _NOTE(CONSTCOND) } while (0)
_NOTE(CONSTCOND) } while (0)
/*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
+ * We currently support block sizes from 512 bytes to 16MB.
+ * The benefits of larger blocks, and thus larger IO, need to be weighed
+ * against the cost of COWing a giant block to modify one byte, and the
+ * large latency of reading or writing a large block.
+ *
+ * Note that although blocks up to 16MB are supported, the recordsize
+ * property can not be set larger than zfs_max_recordsize (default 1MB).
+ * See the comment near zfs_max_recordsize in dsl_dataset.c for details.
+ *
+ * Note that although the LSIZE field of the blkptr_t can store sizes up
+ * to 32MB, the dnode's dn_datablkszsec can only store sizes up to
+ * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
*/
#define SPA_MINBLOCKSHIFT 9
-#define SPA_MAXBLOCKSHIFT 17
+#define SPA_OLD_MAXBLOCKSHIFT 17
+#define SPA_MAXBLOCKSHIFT 24
#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
+#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT)
#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
-#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
-
/*
* Default maximum supported logical ashift.
*
@@ -801,6 +810,7 @@ extern boolean_t spa_has_slogs(spa_t *spa);
extern boolean_t spa_is_root(spa_t *spa);
extern boolean_t spa_writeable(spa_t *spa);
extern boolean_t spa_has_pending_synctask(spa_t *spa);
+extern int spa_maxblocksize(spa_t *spa);
extern int spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
index 1dc322e..29ebae5 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
@@ -41,8 +41,7 @@ extern int fzap_default_block_shift;
#define MZAP_ENT_LEN 64
#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
-#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
-#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
+#define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE
#define ZAP_NEED_CD (-1U)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
index 73fbf3c..71ca044 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -85,13 +85,16 @@ typedef enum drr_headertype {
/* flags #3 - #15 are reserved for incompatible closed-source implementations */
#define DMU_BACKUP_FEATURE_EMBED_DATA (1<<16)
#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1<<17)
+/* flag #18 is reserved for a Delphix feature */
+#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1<<19)
/*
* Mask of all supported backup features
*/
#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
- DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)
+ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS)
/* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
index dae099b..3fb66c8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
@@ -133,8 +133,6 @@ extern "C" {
#define ZFS_SHARES_DIR "SHARES"
#define ZFS_SA_ATTRS "SA_ATTRS"
-#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
-
/*
* Path component length
*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
index 15ef2aa..895d632 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
@@ -90,7 +90,6 @@ typedef struct zil_chain {
} zil_chain_t;
#define ZIL_MIN_BLKSZ 4096ULL
-#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE
/*
* The words of a log block checksum.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
index 5856620..b5c666c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
@@ -139,7 +139,7 @@ typedef struct zil_bp_node {
avl_node_t zn_node;
} zil_bp_node_t;
-#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
+#define ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
sizeof (lr_write_t))
#ifdef __cplusplus
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
index 10603dd..a9cd4f0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -926,9 +926,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
/*
* Compute the raidz-deflation ratio. Note, we hard-code
- * in 128k (1 << 17) because it is the current "typical" blocksize.
- * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
- * or we will inconsistently account for existing bp's.
+ * in 128k (1 << 17) because it is the "typical" blocksize.
+ * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
+ * otherwise it would inconsistently account for existing bp's.
*/
vd->vdev_deflate_ratio = (1 << 17) /
(vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
index 0c2cd77..7ce67aa 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
@@ -170,7 +170,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60;
* we include spans of optional I/Os to aid aggregation at the disk even when
* they aren't able to help us aggregate at this level.
*/
-int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
index 925aaea..33bedc1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -1618,7 +1618,7 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
/*
* Don't write past the end of the block
*/
- VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);
+ VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
start = offset;
end = start + size;
@@ -1633,8 +1633,8 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
* KB size.
*/
rm = vdev_raidz_map_alloc(data - (offset - origoffset),
- SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, vd->vdev_children,
- vd->vdev_nparity);
+ SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
+ vd->vdev_children, vd->vdev_nparity);
coloffset = origoffset;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
index 4e88a53..4ed8aac 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
@@ -33,6 +33,7 @@
#include <sys/zap_leaf.h>
#include <sys/avl.h>
#include <sys/arc.h>
+#include <sys/dmu_objset.h>
#ifdef _KERNEL
#include <sys/sunddi.h>
@@ -664,9 +665,9 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
- leaf_blockshift <= SPA_MAXBLOCKSHIFT &&
+ leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
indirect_blockshift >= SPA_MINBLOCKSHIFT &&
- indirect_blockshift <= SPA_MAXBLOCKSHIFT);
+ indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
VERIFY(dmu_object_set_blocksize(os, obj,
1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
@@ -1396,7 +1397,6 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
zap_t *zap;
int err = 0;
-
/*
* Since, we don't have a name, we cannot figure out which blocks will
* be affected in this operation. So, account for the worst case :
@@ -1409,7 +1409,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
* large microzap results in a promotion to fatzap.
*/
if (name == NULL) {
- *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+ *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
return (err);
}
@@ -1433,7 +1433,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
/*
* We treat this case as similar to (name == NULL)
*/
- *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+ *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
}
} else {
/*
@@ -1452,12 +1452,12 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
* ptrtbl blocks
*/
if (dmu_buf_freeable(zap->zap_dbuf))
- *tooverwrite += SPA_MAXBLOCKSIZE;
+ *tooverwrite += MZAP_MAX_BLKSZ;
else
- *towrite += SPA_MAXBLOCKSIZE;
+ *towrite += MZAP_MAX_BLKSZ;
if (add) {
- *towrite += 4 * SPA_MAXBLOCKSIZE;
+ *towrite += 4 * MZAP_MAX_BLKSZ;
}
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
index 8c2d11c..68a5025 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
@@ -2434,7 +2434,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
const char *propname = nvpair_name(pair);
zfs_prop_t prop = zfs_name_to_prop(propname);
uint64_t intval;
- int err;
+ int err = -1;
if (prop == ZPROP_INVAL) {
if (zfs_prop_userquota(propname))
@@ -3865,8 +3865,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
* the SPA supports it. We ignore any errors here since
* we'll catch them later.
*/
- if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
- nvpair_value_uint64(pair, &intval) == 0) {
+ if (nvpair_value_uint64(pair, &intval) == 0) {
if (intval >= ZIO_COMPRESS_GZIP_1 &&
intval <= ZIO_COMPRESS_GZIP_9 &&
zfs_earlier_version(dsname,
@@ -3917,6 +3916,42 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
return (SET_ERROR(ENOTSUP));
break;
+ case ZFS_PROP_RECORDSIZE:
+ /* Record sizes above 128k need the feature to be enabled */
+ if (nvpair_value_uint64(pair, &intval) == 0 &&
+ intval > SPA_OLD_MAXBLOCKSIZE) {
+ spa_t *spa;
+
+ /*
+ * If this is a bootable dataset then
+ * the we don't allow large (>128K) blocks,
+ * because GRUB doesn't support them.
+ */
+ if (zfs_is_bootfs(dsname) &&
+ intval > SPA_OLD_MAXBLOCKSIZE) {
+ return (SET_ERROR(EDOM));
+ }
+
+ /*
+ * We don't allow setting the property above 1MB,
+ * unless the tunable has been changed.
+ */
+ if (intval > zfs_max_recordsize ||
+ intval > SPA_MAXBLOCKSIZE)
+ return (SET_ERROR(EDOM));
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ break;
+
case ZFS_PROP_SHARESMB:
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
return (SET_ERROR(ENOTSUP));
@@ -4345,7 +4380,7 @@ out:
* zc_fromobj objsetid of incremental fromsnap (may be zero)
* zc_guid if set, estimate size of stream only. zc_cookie is ignored.
* output size in zc_objset_type.
- * zc_flags if =1, WRITE_EMBEDDED records are permitted
+ * zc_flags lzc_send_flags
*
* outputs:
* zc_objset_type estimated size, if zc_guid is set
@@ -4357,6 +4392,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
offset_t off;
boolean_t estimate = (zc->zc_guid != 0);
boolean_t embedok = (zc->zc_flags & 0x1);
+ boolean_t large_block_ok = (zc->zc_flags & 0x2);
if (zc->zc_obj != 0) {
dsl_pool_t *dp;
@@ -4421,10 +4457,11 @@ zfs_ioc_send(zfs_cmd_t *zc)
off = fp->f_offset;
error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
+ zc->zc_fromobj, embedok, large_block_ok,
#ifdef illumos
- zc->zc_fromobj, embedok, zc->zc_cookie, fp->f_vnode, &off);
+ zc->zc_cookie, fp->f_vnode, &off);
#else
- zc->zc_fromobj, embedok, zc->zc_cookie, fp, &off);
+ zc->zc_cookie, fp, &off);
#endif
if (off >= 0 && off <= MAXOFFSET_T)
@@ -5362,6 +5399,8 @@ zfs_ioc_unjail(zfs_cmd_t *zc)
* innvl: {
* "fd" -> file descriptor to write stream to (int32)
* (optional) "fromsnap" -> full snap name to send an incremental from
+ * (optional) "largeblockok" -> (value ignored)
+ * indicates that blocks > 128KB are permitted
* (optional) "embedok" -> (value ignored)
* presence indicates DRR_WRITE_EMBEDDED records are permitted
* }
@@ -5377,6 +5416,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
offset_t off;
char *fromname = NULL;
int fd;
+ boolean_t largeblockok;
boolean_t embedok;
error = nvlist_lookup_int32(innvl, "fd", &fd);
@@ -5385,6 +5425,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
+ largeblockok = nvlist_exists(innvl, "largeblockok");
embedok = nvlist_exists(innvl, "embedok");
file_t *fp = getf(fd, cap_rights_init(&rights, CAP_READ));
@@ -5392,10 +5433,11 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
return (SET_ERROR(EBADF));
off = fp->f_offset;
+ error = dmu_send(snapname, fromname, embedok, largeblockok,
#ifdef illumos
- error = dmu_send(snapname, fromname, embedok, fd, fp->f_vnode, &off);
+ fd, fp->f_vnode, &off);
#else
- error = dmu_send(snapname, fromname, embedok, fd, fp, &off);
+ fd, fp, &off);
#endif
#ifdef illumos
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
index 3029f7d..7432290 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
@@ -490,7 +490,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
* If the write would overflow the largest block then split it.
*/
if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
- len = SPA_MAXBLOCKSIZE >> 1;
+ len = SPA_OLD_MAXBLOCKSIZE >> 1;
else
len = resid;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
index 56ca65c..cdeb454 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -271,10 +271,9 @@ static void
blksz_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
-
- if (newval < SPA_MINBLOCKSIZE ||
- newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
- newval = SPA_MAXBLOCKSIZE;
+ ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+ ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+ ASSERT(ISP2(newval));
zfsvfs->z_max_blksz = newval;
zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
@@ -890,7 +889,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
*/
zfsvfs->z_vfs = NULL;
zfsvfs->z_parent = zfsvfs;
- zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
+ zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
zfsvfs->z_os = os;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index 081ac61..504c130 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -1013,8 +1013,14 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
uint64_t new_blksz;
if (zp->z_blksz > max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+ new_blksz = MIN(end_size,
+ 1 << highbit64(zp->z_blksz));
} else {
new_blksz = MIN(end_size, max_blksz);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
index d92597e..f92ddd44 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
@@ -54,6 +54,7 @@
#endif /* _KERNEL */
#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
#include <sys/refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
@@ -1543,8 +1544,13 @@ zfs_extend(znode_t *zp, uint64_t end)
* We are growing the file past the current block size.
*/
if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
ASSERT(!ISP2(zp->z_blksz));
- newblksz = MIN(end, SPA_MAXBLOCKSIZE);
+ newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
} else {
newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
index 150dfb2..df268ae 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
@@ -232,6 +232,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
error = SET_ERROR(ECKSUM);
} else {
+ ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
bcopy(lr, dst, len);
*end = (char *)dst + len;
*nbp = zilc->zc_next_blk;
@@ -246,6 +247,8 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
(zilc->zc_nused > (size - sizeof (*zilc)))) {
error = SET_ERROR(ECKSUM);
} else {
+ ASSERT3U(zilc->zc_nused, <=,
+ SPA_OLD_MAXBLOCKSIZE);
bcopy(lr, dst, zilc->zc_nused);
*end = (char *)dst + zilc->zc_nused;
*nbp = zilc->zc_next_blk;
@@ -329,7 +332,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
*/
- lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
+ lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
zil_bp_tree_init(zilog);
for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
@@ -376,7 +379,7 @@ done:
(max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
zil_bp_tree_fini(zilog);
- zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
+ zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
return (error);
}
@@ -908,7 +911,7 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
*
* These must be a multiple of 4KB. Note only the amount used (again
* aligned to 4KB) actually gets written. However, we can't always just
- * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
+ * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
*/
uint64_t zil_block_buckets[] = {
4096, /* non TX_WRITE */
@@ -990,7 +993,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
continue;
zil_blksz = zil_block_buckets[i];
if (zil_blksz == UINT64_MAX)
- zil_blksz = SPA_MAXBLOCKSIZE;
+ zil_blksz = SPA_OLD_MAXBLOCKSIZE;
zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
for (i = 0; i < ZIL_PREV_BLKS; i++)
zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index f9f2398..838b3cd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -144,9 +144,8 @@ zio_init(void)
/*
* For small buffers, we want a cache for each multiple of
- * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
- * for each quarter-power of 2. For large buffers, we want
- * a cache for each multiple of PAGESIZE.
+ * SPA_MINBLOCKSIZE. For larger buffers, we want a cache
+ * for each quarter-power of 2.
*/
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
@@ -171,10 +170,8 @@ zio_init(void)
#endif /* illumos */
if (size <= 4 * SPA_MINBLOCKSIZE) {
align = SPA_MINBLOCKSIZE;
- } else if (IS_P2ALIGNED(size, PAGESIZE)) {
- align = PAGESIZE;
} else if (IS_P2ALIGNED(size, p2 >> 2)) {
- align = p2 >> 2;
+ align = MIN(p2 >> 2, PAGESIZE);
}
if (align != 0) {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
index e483d33..156a627 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -260,7 +260,7 @@ int
zvol_check_volblocksize(uint64_t volblocksize)
{
if (volblocksize < SPA_MINBLOCKSIZE ||
- volblocksize > SPA_MAXBLOCKSIZE ||
+ volblocksize > SPA_OLD_MAXBLOCKSIZE ||
!ISP2(volblocksize))
return (SET_ERROR(EDOM));
@@ -830,7 +830,7 @@ zvol_prealloc(zvol_state_t *zv)
while (resid != 0) {
int error;
- uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
+ uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);
tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
@@ -1868,7 +1868,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
(void) strcpy(dki.dki_dname, "zvol");
dki.dki_ctype = DKC_UNKNOWN;
dki.dki_unit = getminor(dev);
- dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
+ dki.dki_maxtransfer =
+ 1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
mutex_exit(&spa_namespace_lock);
if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
error = SET_ERROR(EFAULT);
@@ -2187,14 +2188,14 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
&vbs, tx);
error = error ? error : dmu_object_set_blocksize(
- os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
+ os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
if (version >= SPA_VERSION_DEDUP) {
error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
&dedup, tx);
}
if (error == 0)
- zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
+ zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
}
dmu_tx_commit(tx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
index 87d0650..16d528e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
@@ -196,6 +196,7 @@ typedef enum {
ZPOOL_PROP_FREEING,
ZPOOL_PROP_FRAGMENTATION,
ZPOOL_PROP_LEAKED,
+ ZPOOL_PROP_MAXBLOCKSIZE,
ZPOOL_NUM_PROPS
} zpool_prop_t;
OpenPOWER on IntegriCloud