diff options
author | delphij <delphij@FreeBSD.org> | 2014-12-22 20:58:51 +0000 |
---|---|---|
committer | delphij <delphij@FreeBSD.org> | 2014-12-22 20:58:51 +0000 |
commit | 1ad38ed4f01c38401f2c15151edbcfab81168db1 (patch) | |
tree | bf517de79b70790f455a71edd81bb6303f96f90e /cddl/contrib/opensolaris/cmd | |
parent | af2ee162da52913fdefa6d0ffae3644e39a3b369 (diff) | |
download | FreeBSD-src-1ad38ed4f01c38401f2c15151edbcfab81168db1.zip FreeBSD-src-1ad38ed4f01c38401f2c15151edbcfab81168db1.tar.gz |
MFC r274337,r274673,274681,r275515:
ZFS large block support. The default recordsize remains at 128KB.
A new tunable/sysctl variable, vfs.zfs.max_recordsize is added to
allow adjusting the permitted maximum record size, or
zfs_max_recordsize, with a default of 1MB. ZFS will not allow
setting recordsize greater than zfs_max_recordsize as a safety
belt, because larger recordsize means greater read and write
latency and more memory usage.
Please note that booting from datasets that have recordsize greater
than 128KB is not supported (but it's Okay to enable the feature on
the pool).
Limited safety belt is provided for mounted root filesystem but use
caution when using a larger value.
Illumos issue:
5027 zfs large block support
Diffstat (limited to 'cddl/contrib/opensolaris/cmd')
-rw-r--r-- | cddl/contrib/opensolaris/cmd/zdb/zdb.c | 32 | ||||
-rw-r--r-- | cddl/contrib/opensolaris/cmd/zfs/zfs.8 | 48 | ||||
-rw-r--r-- | cddl/contrib/opensolaris/cmd/zfs/zfs_main.c | 11 | ||||
-rw-r--r-- | cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 | 29 | ||||
-rw-r--r-- | cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c | 19 | ||||
-rw-r--r-- | cddl/contrib/opensolaris/cmd/ztest/ztest.c | 14 |
6 files changed, 133 insertions, 20 deletions
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c index c2c47d5..47ab7bf 100644 --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c @@ -2147,6 +2147,8 @@ dump_label(const char *dev) (void) close(fd); } +static uint64_t num_large_blocks; + /*ARGSUSED*/ static int dump_one_dir(const char *dsname, void *arg) @@ -2159,6 +2161,8 @@ dump_one_dir(const char *dsname, void *arg) (void) printf("Could not open %s, error %d\n", dsname, error); return (0); } + if (dmu_objset_ds(os)->ds_large_blocks) + num_large_blocks++; dump_dir(os); dmu_objset_disown(os, FTAG); fuid_table_destroy(); @@ -2169,7 +2173,7 @@ dump_one_dir(const char *dsname, void *arg) /* * Block statistics. */ -#define PSIZE_HISTO_SIZE (SPA_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1) +#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) typedef struct zdb_blkstats { uint64_t zb_asize; uint64_t zb_lsize; @@ -2234,7 +2238,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_count++; - zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++; + + /* + * The histogram is only big enough to record blocks up to + * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, + * "other", bucket. + */ + int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; + idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); + zb->zb_psize_histogram[idx]++; zb->zb_gangs += BP_COUNT_GANG(bp); @@ -2946,6 +2958,7 @@ dump_zpool(spa_t *spa) dump_metaslab_groups(spa); if (dump_opt['d'] || dump_opt['i']) { + uint64_t refcount; dump_dir(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { dump_bpobj(&spa->spa_deferred_bpobj, @@ -2965,8 +2978,21 @@ dump_zpool(spa_t *spa) } (void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); + + (void) feature_get_refcount(spa, + &spa_feature_table[SPA_FEATURE_LARGE_BLOCKS], &refcount); + if (num_large_blocks != refcount) { + (void) printf("large_blocks feature refcount mismatch: " + "expected %lld != actual %lld\n", + (longlong_t)num_large_blocks, + (longlong_t)refcount); + rc = 2; + } else { + (void) printf("Verified large_blocks feature refcount " + "is correct (%llu)\n", (longlong_t)refcount); + } } - if (dump_opt['b'] || dump_opt['c']) + if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) rc = dump_block_stats(spa); if (rc == 0) diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 b/cddl/contrib/opensolaris/cmd/zfs/zfs.8 index 2315e05..065497f 100644 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 +++ b/cddl/contrib/opensolaris/cmd/zfs/zfs.8 @@ -30,7 +30,7 @@ .\" .\" $FreeBSD$ .\" -.Dd June 30, 2014 +.Dd November 10, 2014 .Dt ZFS 8 .Os .Sh NAME @@ -179,12 +179,12 @@ .Ar bookmark .Nm .Cm send -.Op Fl DnPpRve +.Op Fl DnPpRveL .Op Fl i Ar snapshot | Fl I Ar snapshot .Ar snapshot .Nm .Cm send -.Op Fl e +.Op Fl eL .Op Fl i Ar snapshot Ns | Ns bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm @@ -1187,6 +1187,12 @@ systems is strongly discouraged, and may adversely affect performance. .Pp The size specified must be a power of two greater than or equal to 512 and less than or equal to 128 Kbytes. +If the +.Sy large_blocks +feature is enabled on the pool, the size may be up to 1 Mbyte. +See +.Xr zpool-features 7 +for details on ZFS feature flags. .Pp Changing the file system's .Sy recordsize @@ -2477,7 +2483,7 @@ feature. .It Xo .Nm .Cm send -.Op Fl DnPpRve +.Op Fl DnPpRveL .Op Fl i Ar snapshot | Fl I Ar snapshot .Ar snapshot .Xc @@ -2549,6 +2555,22 @@ be used regardless of the dataset's property, but performance will be much better if the filesystem uses a dedup-capable checksum (eg. .Sy sha256 ) . +.It Fl L +Generate a stream which may contain blocks larger than 128KB. +This flag +has no effect if the +.Sy large_blocks +pool feature is disabled, or if the +.Sy recordsize +property of this filesystem has never been set above 128KB. +The receiving system must have the +.Sy large_blocks +pool feature enabled as well. +See +.Xr zpool-features 7 +for details on ZFS feature flags and the +.Sy large_blocks +feature. .It Fl e Generate a more compact stream by using WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the @@ -2596,7 +2618,7 @@ on future versions of .It Xo .Nm .Cm send -.Op Fl e +.Op Fl eL .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc @@ -2622,6 +2644,22 @@ specified as the last component of the name If the incremental target is a clone, the incremental source can be the origin snapshot, or an earlier snapshot in the origin's filesystem, or the origin's origin, etc. +.It Fl L +Generate a stream which may contain blocks larger than 128KB. +This flag +has no effect if the +.Sy large_blocks +pool feature is disabled, or if the +.Sy recordsize +property of this filesystem has never been set above 128KB. +The receiving system must have the +.Sy large_blocks +pool feature enabled as well. +See +.Xr zpool-features 7 +for details on ZFS feature flags and the +.Sy large_blocks +feature. .It Fl e Generate a more compact stream by using WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c index a3b461e..baac993 100644 --- a/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c +++ b/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c @@ -274,9 +274,9 @@ get_usage(zfs_help_t idx) case HELP_ROLLBACK: return (gettext("\trollback [-rRf] <snapshot>\n")); case HELP_SEND: - return (gettext("\tsend [-DnPpRve] [-[iI] snapshot] " + return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] " "<snapshot>\n" - "\tsend [-e] [-i snapshot|bookmark] " + "\tsend [-Le] [-i snapshot|bookmark] " "<filesystem|volume|snapshot>\n")); case HELP_SET: return (gettext("\tset <property=value> " @@ -3709,7 +3709,7 @@ zfs_do_send(int argc, char **argv) boolean_t extraverbose = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, ":i:I:RDpvnPe")) != -1) { + while ((c = getopt(argc, argv, ":i:I:RDpvnPLe")) != -1) { switch (c) { case 'i': if (fromname) @@ -3744,6 +3744,9 @@ zfs_do_send(int argc, char **argv) case 'n': flags.dryrun = B_TRUE; break; + case 'L': + flags.largeblock = B_TRUE; + break; case 'e': flags.embed_data = B_TRUE; break; @@ -3800,6 +3803,8 @@ zfs_do_send(int argc, char **argv) if (zhp == NULL) return (1); + if (flags.largeblock) + lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags.embed_data) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 b/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 index 15ee2f8..9e51ded 100644 --- a/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 +++ b/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 @@ -23,7 +23,7 @@ .\" .\" $FreeBSD$ .\" -.Dd July 1, 2014 +.Dd November 10, 2014 .Dt ZPOOL-FEATURES 7 .Os .Sh NAME @@ -427,6 +427,33 @@ This feature becomes as soon as it is enabled and will never return to being .Sy enabled . +.It Sy large_blocks +.Bl -column "READ\-ONLY COMPATIBLE" "org.open-zfs:large_block" +.It GUID Ta org.open-zfs:large_block +.It READ\-ONLY COMPATIBLE Ta no +.It DEPENDENCIES Ta extensible_dataset +.El +.Pp +The +.Sy large_block +feature allows the record size on a dataset to be +set larger than 128KB. +.Pp +This feature becomes +.Sy active +once a +.Sy recordsize +property has been set larger than 128KB, and will return to being +.Sy enabled +once all filesystems that have ever had their recordsize larger than 128KB +are destroyed. +.Pp +Please note that booting from datasets that have recordsize greater than +128KB is +.Em NOT +supported by the +.Fx +boot loader. .El .Sh SEE ALSO .Xr zpool 8 diff --git a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c index dce1cb3..d99d801 100644 --- a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c +++ b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c @@ -54,7 +54,6 @@ uint64_t total_stream_len = 0; FILE *send_stream = 0; boolean_t do_byteswap = B_FALSE; boolean_t do_cksum = B_TRUE; -#define INITIAL_BUFLEN (1<<20) static void usage(void) @@ -67,6 +66,18 @@ usage(void) exit(1); } +static void * +safe_malloc(size_t size) +{ + void *rv = malloc(size); + if (rv == NULL) { + (void) fprintf(stderr, "ERROR; failed to allocate %zu bytes\n", + size); + abort(); + } + return (rv); +} + /* * ssread - send stream read. * @@ -158,7 +169,7 @@ print_block(char *buf, int length) int main(int argc, char *argv[]) { - char *buf = malloc(INITIAL_BUFLEN); + char *buf = safe_malloc(SPA_MAXBLOCKSIZE); uint64_t drr_record_count[DRR_NUMTYPES] = { 0 }; uint64_t total_records = 0; dmu_replay_record_t thedrr; @@ -307,9 +318,9 @@ main(int argc, char *argv[]) nvlist_t *nv; int sz = drr->drr_payloadlen; - if (sz > INITIAL_BUFLEN) { + if (sz > SPA_MAXBLOCKSIZE) { free(buf); - buf = malloc(sz); + buf = safe_malloc(sz); } (void) ssread(buf, sz, &zc); if (ferror(send_stream)) diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c index 5ed87ce..ab69154 100644 --- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c +++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c @@ -987,9 +987,15 @@ ztest_spa_get_ashift() { static int ztest_random_blocksize(void) { - // Choose a block size >= the ashift. - uint64_t block_shift = - ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1); + uint64_t block_shift; + /* + * Choose a block size >= the ashift. + * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. + */ + int maxbs = SPA_OLD_MAXBLOCKSHIFT; + if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) + maxbs = 20; + block_shift = ztest_random(maxbs - ztest_spa_get_ashift() + 1); return (1 << (SPA_MINBLOCKSHIFT + block_shift)); } @@ -4789,7 +4795,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) char path0[MAXPATHLEN]; char pathrand[MAXPATHLEN]; size_t fsize; - int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */ + int bshift = SPA_OLD_MAXBLOCKSHIFT + 2; /* don't scrog all labels */ int iters = 1000; int maxfaults; int mirror_save; |