summaryrefslogtreecommitdiffstats
path: root/contrib/opensolaris/cmd
diff options
context:
space:
mode:
authorpjd <pjd@FreeBSD.org>2007-04-06 01:09:06 +0000
committerpjd <pjd@FreeBSD.org>2007-04-06 01:09:06 +0000
commit3b005d330261f33318ca1ee3fef1940237fd788b (patch)
tree3061c8734d9ce560165e672836837a0f411a83c9 /contrib/opensolaris/cmd
parent3be454b8211f48e634e6587f53807d3b5013e973 (diff)
downloadFreeBSD-src-3b005d330261f33318ca1ee3fef1940237fd788b.zip
FreeBSD-src-3b005d330261f33318ca1ee3fef1940237fd788b.tar.gz
Please welcome ZFS - The last word in file systems.
ZFS file system was ported from OpenSolaris operating system. The code is under the CDDL license. I'd like to thank all SUN developers that created this great piece of software. Supported by: Wheel LTD (http://www.wheel.pl/) Supported by: The FreeBSD Foundation (http://www.freebsdfoundation.org/) Supported by: Sentex (http://www.sentex.net/)
Diffstat (limited to 'contrib/opensolaris/cmd')
-rw-r--r--contrib/opensolaris/cmd/zdb/zdb.897
-rw-r--r--contrib/opensolaris/cmd/zdb/zdb.c2185
-rw-r--r--contrib/opensolaris/cmd/zdb/zdb_il.c354
-rw-r--r--contrib/opensolaris/cmd/zfs/zfs.81815
-rw-r--r--contrib/opensolaris/cmd/zfs/zfs_iter.c405
-rw-r--r--contrib/opensolaris/cmd/zfs/zfs_iter.h52
-rw-r--r--contrib/opensolaris/cmd/zfs/zfs_main.c3233
-rw-r--r--contrib/opensolaris/cmd/zfs/zfs_util.h44
-rw-r--r--contrib/opensolaris/cmd/zpool/zpool.81113
-rw-r--r--contrib/opensolaris/cmd/zpool/zpool_iter.c245
-rw-r--r--contrib/opensolaris/cmd/zpool/zpool_main.c3567
-rw-r--r--contrib/opensolaris/cmd/zpool/zpool_util.c79
-rw-r--r--contrib/opensolaris/cmd/zpool/zpool_util.h72
-rw-r--r--contrib/opensolaris/cmd/zpool/zpool_vdev.c850
-rw-r--r--contrib/opensolaris/cmd/ztest/ztest.c3477
15 files changed, 17588 insertions, 0 deletions
diff --git a/contrib/opensolaris/cmd/zdb/zdb.8 b/contrib/opensolaris/cmd/zdb/zdb.8
new file mode 100644
index 0000000..2f1c541
--- /dev/null
+++ b/contrib/opensolaris/cmd/zdb/zdb.8
@@ -0,0 +1,97 @@
+'\" te
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\" Copyright (c) 2004, Sun Microsystems, Inc. All Rights Reserved.
+.TH zdb 1M "31 Oct 2005" "SunOS 5.11" "System Administration Commands"
+.SH NAME
+zdb \- ZFS debugger
+.SH SYNOPSIS
+.LP
+.nf
+\fBzdb\fR \fIpool\fR
+.fi
+
+.SH DESCRIPTION
+
+.LP
+The \fBzdb\fR command is used by support engineers to diagnose failures and gather statistics. Since the \fBZFS\fR file system is always consistent on disk and is self-repairing, \fBzdb\fR should only be run under the direction of a support engineer.
+.LP
+If no arguments are specified, \fBzdb\fR performs basic consistency checks on the pool and associated datasets, and reports any problems detected.
+.LP
+Any options supported by this command are internal to Sun and subject to change at any time.
+.SH EXIT STATUS
+
+.LP
+The following exit values are returned:
+.sp
+.ne 2
+.mk
+.na
+\fB\fB0\fR\fR
+.ad
+.RS 5n
+.rt
+The pool is consistent.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB1\fR\fR
+.ad
+.RS 5n
+.rt
+An error was detected.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB2\fR\fR
+.ad
+.RS 5n
+.rt
+Invalid command line options were specified.
+.RE
+
+.SH ATTRIBUTES
+
+.LP
+See \fBattributes\fR(5) for descriptions of the following attributes:
+.sp
+
+.sp
+.TS
+tab(:) box;
+cw(2.75i) |cw(2.75i)
+lw(2.75i) |lw(2.75i)
+.
+ATTRIBUTE TYPE:ATTRIBUTE VALUE
+_
+Availability:SUNWzfsu
+_
+Interface Stability:Unstable
+.TE
+
+.SH SEE ALSO
+
+.LP
+\fBzfs\fR(1M), \fBzpool\fR(1M), \fBattributes\fR(5)
diff --git a/contrib/opensolaris/cmd/zdb/zdb.c b/contrib/opensolaris/cmd/zdb/zdb.c
new file mode 100644
index 0000000..bb7a52e
--- /dev/null
+++ b/contrib/opensolaris/cmd/zdb/zdb.c
@@ -0,0 +1,2185 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <stdio.h>
+#include <stdio_ext.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_pool.h>
+#include <sys/dbuf.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/dmu_traverse.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+
+/* Program name; printed as the prefix of usage and fatal-error messages. */
+const char cmdname[] = "zdb";
+/*
+ * Per-option settings indexed by option character; the dump routines
+ * compare entries such as dump_opt['d'] against verbosity thresholds.
+ */
+uint8_t dump_opt[256];
+
+/* Signature shared by the per-object-type dump routines in object_viewer[]. */
+typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
+
+extern void dump_intent_log(zilog_t *);
+/* Explicit object numbers to dump; dump_dir() walks zopt_objects entries. */
+uint64_t *zopt_object = NULL;
+int zopt_objects = 0;
+/* Traversal flags that dump_indirect() hands to traverse_init(). */
+int zdb_advance = ADVANCE_PRE;
+/* Bookmark that dump_indirect() copies into the traverse handle's th_noread. */
+zbookmark_t zdb_noread = { 0, 0, ZB_NO_LEVEL, 0 };
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.
+ */
+/*
+ * Returns the string used as the $UMEM_DEBUG setting.  Declared with an
+ * explicit (void) parameter list so the definition is a real prototype,
+ * matching _umem_logging_init().
+ */
+const char *
+_umem_debug_init(void)
+{
+	return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+/* Returns the string used as the $UMEM_LOGGING setting. */
+const char *
+_umem_logging_init(void)
+{
+	static const char umem_logging[] = "fail,contents";
+
+	return (umem_logging);
+}
+
+/*
+ * Print the command synopsis and the per-option summary to stderr, then
+ * exit with status 1.  Does not return.
+ */
+static void
+usage(void)
+{
+	(void) fprintf(stderr,
+	    "Usage: %s [-udibcsvLU] [-O order] [-B os:obj:level:blkid] "
+	    "dataset [object...]\n"
+	    " %s -C [pool]\n"
+	    " %s -l dev\n"
+	    " %s -R vdev:offset:size:flags\n",
+	    cmdname, cmdname, cmdname, cmdname);
+
+	(void) fprintf(stderr, " -u uberblock\n");
+	(void) fprintf(stderr, " -d datasets\n");
+	(void) fprintf(stderr, " -C cached pool configuration\n");
+	(void) fprintf(stderr, " -i intent logs\n");
+	(void) fprintf(stderr, " -b block statistics\n");
+	(void) fprintf(stderr, " -c checksum all data blocks\n");
+	(void) fprintf(stderr, " -s report stats on zdb's I/O\n");
+	(void) fprintf(stderr, " -v verbose (applies to all others)\n");
+	(void) fprintf(stderr, " -l dump label contents\n");
+	(void) fprintf(stderr, " -L live pool (allows some errors)\n");
+	(void) fprintf(stderr, " -O [!]<pre|post|prune|data|holes> "
+	    "visitation order\n");
+	(void) fprintf(stderr, " -U use zpool.cache in /tmp\n");
+	(void) fprintf(stderr, " -B objset:object:level:blkid -- "
+	    "simulate bad block\n");
+	/* The trailing space keeps the concatenated words apart. */
+	(void) fprintf(stderr, " -R read and display block from a "
+	    "device\n");
+	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
+	    "to make only that option verbose\n");
+	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
+	exit(1);
+}
+
+/*
+ * Report an unrecoverable error and terminate.  Writes "<cmdname>: ",
+ * the printf-style message built from fmt and its arguments, and a
+ * newline to stderr, then exits with status 1.
+ */
+static void
+fatal(const char *fmt, ...)
+{
+	va_list args;
+
+	(void) fprintf(stderr, "%s: ", cmdname);
+
+	va_start(args, fmt);
+	(void) vfprintf(stderr, fmt, args);
+	va_end(args);
+
+	(void) fputc('\n', stderr);
+
+	exit(1);
+}
+
+/*
+ * Print every pair of `list' to stdout, one per line, indented by
+ * `indent' columns.  Strings and uint64 values are shown as name=value;
+ * nested nvlists are printed recursively with a deeper indent (+4 for a
+ * single nvlist, +8 for each element of an nvlist array).  Any other
+ * pair type is reported as a bad config type.
+ */
+static void
+dump_nvlist(nvlist_t *list, int indent)
+{
+	nvpair_t *pair;
+
+	for (pair = nvlist_next_nvpair(list, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(list, pair)) {
+		switch (nvpair_type(pair)) {
+		case DATA_TYPE_STRING: {
+			char *str;
+
+			VERIFY(nvpair_value_string(pair, &str) == 0);
+			(void) printf("%*s%s='%s'\n", indent, "",
+			    nvpair_name(pair), str);
+			break;
+		}
+
+		case DATA_TYPE_UINT64: {
+			uint64_t num;
+
+			VERIFY(nvpair_value_uint64(pair, &num) == 0);
+			(void) printf("%*s%s=%llu\n", indent, "",
+			    nvpair_name(pair), (u_longlong_t)num);
+			break;
+		}
+
+		case DATA_TYPE_NVLIST: {
+			nvlist_t *child;
+
+			VERIFY(nvpair_value_nvlist(pair, &child) == 0);
+			(void) printf("%*s%s\n", indent, "",
+			    nvpair_name(pair));
+			dump_nvlist(child, indent + 4);
+			break;
+		}
+
+		case DATA_TYPE_NVLIST_ARRAY: {
+			nvlist_t **children;
+			uint_t idx, nchildren;
+
+			VERIFY(nvpair_value_nvlist_array(pair, &children,
+			    &nchildren) == 0);
+
+			for (idx = 0; idx < nchildren; idx++) {
+				(void) printf("%*s%s[%u]\n", indent, "",
+				    nvpair_name(pair), idx);
+				dump_nvlist(children[idx], indent + 8);
+			}
+			break;
+		}
+
+		default:
+			(void) printf("bad config type %d for %s\n",
+			    nvpair_type(pair), nvpair_name(pair));
+			break;
+		}
+	}
+}
+
+/*
+ * Object viewer for a packed nvlist.  `data' points at a uint64_t giving
+ * the packed length; that many bytes are read from offset 0 of `object',
+ * unpacked, printed with dump_nvlist() at indent 8, and released.
+ */
+/* ARGSUSED */
+static void
+dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	size_t packed_len = *(uint64_t *)data;
+	char *buf;
+	nvlist_t *unpacked;
+
+	buf = umem_alloc(packed_len, UMEM_NOFAIL);
+	VERIFY(0 == dmu_read(os, object, 0, packed_len, buf));
+	VERIFY(nvlist_unpack(buf, packed_len, &unpacked, 0) == 0);
+	umem_free(buf, packed_len);
+
+	dump_nvlist(unpacked, 8);
+	nvlist_free(unpacked);
+}
+
+const char dump_zap_stars[] = "****************************************";
+const int dump_zap_width = sizeof (dump_zap_stars) - 1;
+
+/*
+ * Print one line per bucket of `histo', from the first non-empty bucket
+ * to the last, each followed by a bar of stars.  The bar is a suffix of
+ * dump_zap_stars whose length is scaled against the largest bucket (or
+ * against dump_zap_width when every bucket is smaller than that).
+ * Nothing is printed when all buckets are zero.
+ */
+static void
+dump_zap_histogram(uint64_t histo[ZAP_HISTOGRAM_SIZE])
+{
+	int bucket;
+	int first = ZAP_HISTOGRAM_SIZE - 1;
+	int last = 0;
+	uint64_t peak = 0;
+
+	for (bucket = 0; bucket < ZAP_HISTOGRAM_SIZE; bucket++) {
+		uint64_t count = histo[bucket];
+
+		if (count > peak)
+			peak = count;
+		if (count == 0)
+			continue;
+		if (bucket > last)
+			last = bucket;
+		if (bucket < first)
+			first = bucket;
+	}
+
+	if (peak < dump_zap_width)
+		peak = dump_zap_width;
+
+	for (bucket = first; bucket <= last; bucket++) {
+		/* Number of leading stars to skip for this bucket. */
+		uint64_t skip =
+		    (peak - histo[bucket]) * dump_zap_width / peak;
+
+		(void) printf("\t\t\t%u: %6llu %s\n", bucket,
+		    (u_longlong_t)histo[bucket], &dump_zap_stars[skip]);
+	}
+}
+
+/*
+ * Print the statistics zap_get_stats() reports for `object'.  A ZAP with
+ * an empty pointer table is summarised on one "microzap" line; otherwise
+ * the pointer-table fields, the counters and five histograms are
+ * printed.  Prints nothing if the statistics cannot be fetched.
+ */
+static void
+dump_zap_stats(objset_t *os, uint64_t object)
+{
+	zap_stats_t stats;
+
+	if (zap_get_stats(os, object, &stats) != 0)
+		return;
+
+	if (stats.zs_ptrtbl_len == 0) {
+		ASSERT(stats.zs_num_blocks == 1);
+		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
+		    (u_longlong_t)stats.zs_blocksize,
+		    (u_longlong_t)stats.zs_num_entries);
+		return;
+	}
+
+	(void) printf("\tFat ZAP stats:\n");
+
+	/* Pointer table. */
+	(void) printf("\t\tPointer table:\n");
+	(void) printf("\t\t\t%llu elements\n",
+	    (u_longlong_t)stats.zs_ptrtbl_len);
+	(void) printf("\t\t\tzt_blk: %llu\n",
+	    (u_longlong_t)stats.zs_ptrtbl_zt_blk);
+	(void) printf("\t\t\tzt_numblks: %llu\n",
+	    (u_longlong_t)stats.zs_ptrtbl_zt_numblks);
+	(void) printf("\t\t\tzt_shift: %llu\n",
+	    (u_longlong_t)stats.zs_ptrtbl_zt_shift);
+	(void) printf("\t\t\tzt_blks_copied: %llu\n",
+	    (u_longlong_t)stats.zs_ptrtbl_blks_copied);
+	(void) printf("\t\t\tzt_nextblk: %llu\n",
+	    (u_longlong_t)stats.zs_ptrtbl_nextblk);
+
+	/* Whole-object counters and header fields. */
+	(void) printf("\t\tZAP entries: %llu\n",
+	    (u_longlong_t)stats.zs_num_entries);
+	(void) printf("\t\tLeaf blocks: %llu\n",
+	    (u_longlong_t)stats.zs_num_leafs);
+	(void) printf("\t\tTotal blocks: %llu\n",
+	    (u_longlong_t)stats.zs_num_blocks);
+	(void) printf("\t\tzap_block_type: 0x%llx\n",
+	    (u_longlong_t)stats.zs_block_type);
+	(void) printf("\t\tzap_magic: 0x%llx\n",
+	    (u_longlong_t)stats.zs_magic);
+	(void) printf("\t\tzap_salt: 0x%llx\n",
+	    (u_longlong_t)stats.zs_salt);
+
+	/* Histograms. */
+	(void) printf("\t\tLeafs with 2^n pointers:\n");
+	dump_zap_histogram(stats.zs_leafs_with_2n_pointers);
+
+	(void) printf("\t\tBlocks with n*5 entries:\n");
+	dump_zap_histogram(stats.zs_blocks_with_n5_entries);
+
+	(void) printf("\t\tBlocks n/10 full:\n");
+	dump_zap_histogram(stats.zs_blocks_n_tenths_full);
+
+	(void) printf("\t\tEntries with n chunks:\n");
+	dump_zap_histogram(stats.zs_entries_using_n_chunks);
+
+	(void) printf("\t\tBuckets with n entries:\n");
+	dump_zap_histogram(stats.zs_buckets_with_n_entries);
+}
+
+/* Object viewer stub: the body is empty, so nothing is printed. */
+/*ARGSUSED*/
+static void
+dump_none(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*
+ * Object viewer stub for uint8[] data: the body is empty, so nothing is
+ * printed.  NOTE(review): unlike its sibling stubs this one is not
+ * declared static -- confirm whether another file references it.
+ */
+/*ARGSUSED*/
+void
+dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/* Object viewer stub for uint64[] data: the body is empty, so nothing is printed. */
+/*ARGSUSED*/
+static void
+dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*
+ * Object viewer for ZAP objects.  Prints the ZAP statistics, then one
+ * "name = value" line per attribute found by the cursor.  Attributes
+ * with no integers print an empty value; 1-byte values are printed as a
+ * string and 2/4/8-byte values as a space-separated list of numbers.
+ * `data' and `size' are unused.
+ */
+/*ARGSUSED*/
+static void
+dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	void *prop;
+	uint64_t i;	/* unsigned so the comparison with za_num_integers is exact */
+
+	dump_zap_stats(os, object);
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		size_t propsize;
+
+		(void) printf("\t\t%s = ", attr.za_name);
+		if (attr.za_num_integers == 0) {
+			(void) printf("\n");
+			continue;
+		}
+
+		propsize = attr.za_num_integers * attr.za_integer_length;
+		prop = umem_zalloc(propsize, UMEM_NOFAIL);
+
+		/*
+		 * Best effort: the lookup result is deliberately ignored.
+		 * The buffer is zero-filled, so a failed lookup prints
+		 * an empty string or zeros.
+		 */
+		(void) zap_lookup(os, object, attr.za_name,
+		    attr.za_integer_length, attr.za_num_integers, prop);
+
+		if (attr.za_integer_length == 1) {
+			/*
+			 * Bound the print by the buffer length so a value
+			 * stored without a terminating NUL cannot be read
+			 * past the end of the allocation.
+			 */
+			(void) printf("%.*s", (int)attr.za_num_integers,
+			    (char *)prop);
+		} else {
+			for (i = 0; i < attr.za_num_integers; i++) {
+				switch (attr.za_integer_length) {
+				case 2:
+					(void) printf("%u ",
+					    ((uint16_t *)prop)[i]);
+					break;
+				case 4:
+					(void) printf("%u ",
+					    ((uint32_t *)prop)[i]);
+					break;
+				case 8:
+					/* signed argument to match %lld */
+					(void) printf("%lld ",
+					    (long long)((int64_t *)prop)[i]);
+					break;
+				default:
+					break;
+				}
+			}
+		}
+		(void) printf("\n");
+		umem_free(prop, propsize);
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*
+ * Print every 64-bit entry of the space map object described by `smo',
+ * decoded either as a debug record (action, txg, pass) or as an
+ * allocate/free range translated through sm's shift and start.  The
+ * ranges are totalled along the way, and a warning is printed when the
+ * total disagrees with smo->smo_alloc.  Does nothing when the space map
+ * has no object.
+ */
+static void
+dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
+{
+	uint64_t total = 0;
+	uint64_t off, entry;
+	uint8_t shift = sm->sm_shift;
+	uint64_t base = sm->sm_start;
+	char *actions[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
+	    "INVALID", "INVALID", "INVALID", "INVALID" };
+
+	if (smo->smo_object == 0)
+		return;
+
+	/*
+	 * Print out the freelist entries in both encoded and decoded form.
+	 */
+	for (off = 0; off < smo->smo_objsize; off += sizeof (entry)) {
+		uint64_t index = off / sizeof (entry);
+		uint64_t start, run;
+
+		VERIFY(0 == dmu_read(os, smo->smo_object, off,
+		    sizeof (entry), &entry));
+
+		if (SM_DEBUG_DECODE(entry)) {
+			(void) printf("\t\t[%4llu] %s: txg %llu, pass %llu\n",
+			    (u_longlong_t)index,
+			    actions[SM_DEBUG_ACTION_DECODE(entry)],
+			    (u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
+			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
+			continue;
+		}
+
+		start = (SM_OFFSET_DECODE(entry) << shift) + base;
+		run = SM_RUN_DECODE(entry) << shift;
+
+		(void) printf("\t\t[%4llu] %c range:"
+		    " %08llx-%08llx size: %06llx\n",
+		    (u_longlong_t)index,
+		    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
+		    (u_longlong_t)start,
+		    (u_longlong_t)(start + run),
+		    (u_longlong_t)run);
+
+		if (SM_TYPE_DECODE(entry) == SM_ALLOC)
+			total += run;
+		else
+			total -= run;
+	}
+
+	if (total != smo->smo_alloc) {
+		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
+		    "with space map summary (%llu)\n",
+		    (u_longlong_t)smo->smo_alloc, (u_longlong_t)total);
+	}
+}
+
+/*
+ * Print one metaslab.  At dump_opt['d'] <= 5 this is a single table row
+ * (offset, space map object, free space); above that it is a descriptive
+ * line followed by the full contents of the metaslab's space map.
+ */
+static void
+dump_metaslab(metaslab_t *msp)
+{
+	/*
+	 * Sized like every other nicenum() buffer in this file: the
+	 * result is printed with %5s, so it needs room for five
+	 * characters plus the terminating NUL.
+	 */
+	char freebuf[6];
+	space_map_obj_t *smo = &msp->ms_smo;
+	vdev_t *vd = msp->ms_group->mg_vd;
+	spa_t *spa = vd->vdev_spa;
+
+	nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
+
+	if (dump_opt['d'] <= 5) {
+		(void) printf("\t%10llx %10llu %5s\n",
+		    (u_longlong_t)msp->ms_map.sm_start,
+		    (u_longlong_t)smo->smo_object,
+		    freebuf);
+		return;
+	}
+
+	(void) printf(
+	    "\tvdev %llu offset %08llx spacemap %4llu free %5s\n",
+	    (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start,
+	    (u_longlong_t)smo->smo_object, freebuf);
+
+	ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+
+	dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
+}
+
+/*
+ * Print the metaslabs of every child of the pool's root vdev.  Each
+ * child gets a heading with its id and description (fetched under the
+ * config lock), a column header when dump_opt['d'] <= 5, and then one
+ * dump_metaslab() call per metaslab.
+ */
+static void
+dump_metaslabs(spa_t *spa)
+{
+	vdev_t *root = spa->spa_root_vdev;
+	int child, slab;
+
+	(void) printf("\nMetaslabs:\n");
+
+	for (child = 0; child < root->vdev_children; child++) {
+		vdev_t *top = root->vdev_child[child];
+
+		spa_config_enter(spa, RW_READER, FTAG);
+		(void) printf("\n vdev %llu = %s\n\n",
+		    (u_longlong_t)top->vdev_id, vdev_description(top));
+		spa_config_exit(spa, FTAG);
+
+		if (dump_opt['d'] <= 5) {
+			(void) printf("\t%10s %10s %5s\n",
+			    "offset", "spacemap", "free");
+			(void) printf("\t%10s %10s %5s\n",
+			    "------", "--------", "----");
+		}
+
+		for (slab = 0; slab < top->vdev_ms_count; slab++) {
+			dump_metaslab(top->vdev_ms[slab]);
+		}
+
+		(void) printf("\n");
+	}
+}
+
+/*
+ * Print the dirty time log of `vd' and, recursively, of all of its
+ * children (each level indented four more columns).  A heading is
+ * printed when called with indent 0.  For leaf vdevs at
+ * dump_opt['d'] > 5 the on-disk DTL space map is dumped as well.
+ */
+static void
+dump_dtl(vdev_t *vd, int indent)
+{
+	avl_tree_t *tree = &vd->vdev_dtl_map.sm_root;
+	spa_t *spa = vd->vdev_spa;
+	space_seg_t *seg;
+	vdev_t *ancestor;
+	int child;
+
+	if (indent == 0)
+		(void) printf("\nDirty time logs:\n\n");
+
+	spa_config_enter(spa, RW_READER, FTAG);
+	(void) printf("\t%*s%s\n", indent, "", vdev_description(vd));
+	spa_config_exit(spa, FTAG);
+
+	seg = avl_first(tree);
+	while (seg != NULL) {
+		/*
+		 * Everything in this DTL must appear in all parent DTL unions.
+		 */
+		for (ancestor = vd; ancestor != NULL;
+		    ancestor = ancestor->vdev_parent) {
+			ASSERT(vdev_dtl_contains(&ancestor->vdev_dtl_map,
+			    seg->ss_start, seg->ss_end - seg->ss_start));
+		}
+
+		(void) printf("\t%*soutage [%llu,%llu] length %llu\n",
+		    indent, "",
+		    (u_longlong_t)seg->ss_start,
+		    (u_longlong_t)seg->ss_end - 1,
+		    (u_longlong_t)(seg->ss_end - seg->ss_start));
+
+		seg = AVL_NEXT(tree, seg);
+	}
+
+	(void) printf("\n");
+
+	if (dump_opt['d'] > 5 && vd->vdev_children == 0) {
+		dump_spacemap(vd->vdev_spa->spa_meta_objset, &vd->vdev_dtl,
+		    &vd->vdev_dtl_map);
+		(void) printf("\n");
+	}
+
+	for (child = 0; child < vd->vdev_children; child++) {
+		dump_dtl(vd->vdev_child[child], indent + 4);
+	}
+}
+
+/* Object viewer stub for DMU dnode objects: the body is empty, so nothing is printed. */
+/*ARGSUSED*/
+static void
+dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*
+ * Convert a block id at the given indirection level into an offset.
+ * A negative level returns blkid unchanged.  Otherwise blkid is shifted
+ * left by level * (dn_indblkshift - SPA_BLKPTRSHIFT), multiplied by
+ * dn_datablkszsec, and shifted left by SPA_MINBLOCKSHIFT.
+ */
+static uint64_t
+blkid2offset(dnode_phys_t *dnp, int level, uint64_t blkid)
+{
+	uint64_t scaled;
+
+	if (level < 0)
+		return (blkid);
+
+	scaled = blkid << (level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+	scaled *= dnp->dn_datablkszsec;
+
+	return (scaled << SPA_MINBLOCKSHIFT);
+}
+
+/*
+ * Format a short description of `bp' into `blkbuf': one
+ * "vdev:offset:asize " triple per DVA (all of them when `alldvas' is
+ * non-zero, otherwise only the first), followed by the logical and
+ * physical sizes, fill count and birth txg.  The caller must supply a
+ * buffer large enough for the result; no bound is checked here.
+ */
+static void
+sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas)
+{
+	dva_t *dva = bp->blk_dva;
+	int ndvas = alldvas ? BP_GET_NDVAS(bp) : 1;
+	char *cursor = blkbuf;
+	int d;
+
+	*cursor = '\0';
+
+	/* Append at the running end of the string instead of re-scanning. */
+	for (d = 0; d < ndvas; d++) {
+		cursor += sprintf(cursor, "%llu:%llx:%llx ",
+		    (u_longlong_t)DVA_GET_VDEV(&dva[d]),
+		    (u_longlong_t)DVA_GET_OFFSET(&dva[d]),
+		    (u_longlong_t)DVA_GET_ASIZE(&dva[d]));
+	}
+
+	(void) sprintf(cursor, "%llxL/%llxP F=%llu B=%llu",
+	    (u_longlong_t)BP_GET_LSIZE(bp),
+	    (u_longlong_t)BP_GET_PSIZE(bp),
+	    (u_longlong_t)bp->blk_fill,
+	    (u_longlong_t)bp->blk_birth);
+}
+
+/*
+ * Traversal callback installed by dump_indirect().  Prints one line per
+ * visited block: its offset, an "L<level>" marker placed in a column
+ * chosen by the level, and either "<hole>" or a compact rendering of the
+ * block pointer.  When the traversal reports a read error, the line
+ * instead begins with an error prefix naming the bookmark.
+ *
+ * Returns ERESTART when bc->bc_errno is set, otherwise 0.
+ * `spa' and `a' are unused.
+ */
+/* ARGSUSED */
+static int
+zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
+{
+ zbookmark_t *zb = &bc->bc_bookmark;
+ blkptr_t *bp = &bc->bc_blkptr;
+ void *data = bc->bc_data;
+ dnode_phys_t *dnp = bc->bc_dnode;
+ char blkbuf[BP_SPRINTF_LEN + 80];
+ int l;
+
+ /* Read failed: build the error prefix and skip the consistency checks. */
+ if (bc->bc_errno) {
+ (void) sprintf(blkbuf,
+ "Error %d reading <%llu, %llu, %lld, %llu>: ",
+ bc->bc_errno,
+ (u_longlong_t)zb->zb_objset,
+ (u_longlong_t)zb->zb_object,
+ (u_longlong_t)zb->zb_level,
+ (u_longlong_t)zb->zb_blkid);
+ goto out;
+ }
+
+ /* The block pointer's type and level must agree with the bookmark. */
+ if (zb->zb_level == -1) {
+ ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
+ ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
+ } else {
+ ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
+ ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
+ }
+
+ /*
+ * Indirect block: the blk_fill values of all born child pointers in
+ * the block's data must add up to this block's blk_fill.
+ */
+ if (zb->zb_level > 0) {
+ uint64_t fill = 0;
+ blkptr_t *bpx, *bpend;
+
+ for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx);
+ bpx < bpend; bpx++) {
+ if (bpx->blk_birth != 0) {
+ fill += bpx->blk_fill;
+ } else {
+ ASSERT(bpx->blk_fill == 0);
+ }
+ }
+ ASSERT3U(fill, ==, bp->blk_fill);
+ }
+
+ /*
+ * Level-0 block of a dnode object: the number of dnodes whose type is
+ * not DMU_OT_NONE must equal this block's blk_fill.
+ */
+ if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) {
+ uint64_t fill = 0;
+ dnode_phys_t *dnx, *dnend;
+
+ for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp)>>DNODE_SHIFT);
+ dnx < dnend; dnx++) {
+ if (dnx->dn_type != DMU_OT_NONE)
+ fill++;
+ }
+ ASSERT3U(fill, ==, bp->blk_fill);
+ }
+
+ /* Start the output line with the block's offset. */
+ (void) sprintf(blkbuf, "%16llx ",
+ (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid));
+
+ ASSERT(zb->zb_level >= 0);
+
+ /*
+ * One column per level from the top down to -1; only the column that
+ * matches this block's level gets the "L<level>" marker.
+ */
+ for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
+ if (l == zb->zb_level) {
+ (void) sprintf(blkbuf + strlen(blkbuf), "L%llx",
+ (u_longlong_t)zb->zb_level);
+ } else {
+ (void) sprintf(blkbuf + strlen(blkbuf), " ");
+ }
+ }
+
+out:
+ /* Append the block pointer (or a hole marker) to the prefix and print. */
+ if (bp->blk_birth == 0) {
+ (void) sprintf(blkbuf + strlen(blkbuf), "<hole>");
+ (void) printf("%s\n", blkbuf);
+ } else {
+ sprintf_blkptr_compact(blkbuf + strlen(blkbuf), bp,
+ dump_opt['d'] > 5 ? 1 : 0);
+ (void) printf("%s\n", blkbuf);
+ }
+
+ return (bc->bc_errno ? ERESTART : 0);
+}
+
+/*
+ * Print the block tree of `object' under an "Indirect blocks:" heading
+ * by traversing it with zdb_indirect_cb.  The traversal uses the global
+ * zdb_advance flags, with ADVANCE_DATA added for object 0, and is
+ * repeated for as long as traverse_more() returns EAGAIN.
+ * `data' and `size' are unused.
+ */
+/*ARGSUSED*/
+static void
+dump_indirect(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	traverse_handle_t *handle;
+	uint64_t objset = dmu_objset_id(os);
+	int flags;
+	int rc;
+
+	(void) printf("Indirect blocks:\n");
+
+	flags = (object == 0) ? (zdb_advance | ADVANCE_DATA) : zdb_advance;
+
+	handle = traverse_init(dmu_objset_spa(os), zdb_indirect_cb, NULL,
+	    flags, ZIO_FLAG_CANFAIL);
+	handle->th_noread = zdb_noread;
+
+	traverse_add_dnode(handle, 0, -1ULL, objset, object);
+
+	do {
+		rc = traverse_more(handle);
+	} while (rc == EAGAIN);
+
+	(void) printf("\n");
+
+	traverse_fini(handle);
+}
+
+/*
+ * Object viewer for a DSL directory.  `data' is the dsl_dir_phys_t to
+ * print (nothing is printed when it is NULL) and `size' is asserted to
+ * match that structure.  Byte counts are shown in nicenum() form.
+ */
+/*ARGSUSED*/
+static void
+dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dsl_dir_phys_t *dir = data;
+	time_t created;
+	char num[6];
+
+	if (dir == NULL)
+		return;
+
+	ASSERT(size == sizeof (*dir));
+
+	created = dir->dd_creation_time;
+
+	(void) printf("\t\tcreation_time = %s", ctime(&created));
+	(void) printf("\t\thead_dataset_obj = %llu\n",
+	    (u_longlong_t)dir->dd_head_dataset_obj);
+	(void) printf("\t\tparent_dir_obj = %llu\n",
+	    (u_longlong_t)dir->dd_parent_obj);
+	(void) printf("\t\tclone_parent_obj = %llu\n",
+	    (u_longlong_t)dir->dd_clone_parent_obj);
+	(void) printf("\t\tchild_dir_zapobj = %llu\n",
+	    (u_longlong_t)dir->dd_child_dir_zapobj);
+
+	/* Each size is formatted into the shared buffer just before use. */
+	nicenum(dir->dd_used_bytes, num);
+	(void) printf("\t\tused_bytes = %s\n", num);
+	nicenum(dir->dd_compressed_bytes, num);
+	(void) printf("\t\tcompressed_bytes = %s\n", num);
+	nicenum(dir->dd_uncompressed_bytes, num);
+	(void) printf("\t\tuncompressed_bytes = %s\n", num);
+	nicenum(dir->dd_quota, num);
+	(void) printf("\t\tquota = %s\n", num);
+	nicenum(dir->dd_reserved, num);
+	(void) printf("\t\treserved = %s\n", num);
+
+	(void) printf("\t\tprops_zapobj = %llu\n",
+	    (u_longlong_t)dir->dd_props_zapobj);
+}
+
+/*
+ * Object viewer for a DSL dataset.  `data' is the dsl_dataset_phys_t to
+ * print (nothing is printed when it is NULL) and `size' is asserted to
+ * match that structure.  Byte counts are shown in nicenum() form and the
+ * dataset's block pointer is rendered with sprintf_blkptr().
+ */
+/*ARGSUSED*/
+static void
+dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dsl_dataset_phys_t *ds = data;
+	time_t crtime;
+	char used[6], compressed[6], uncompressed[6], unique[6];
+	char blkbuf[BP_SPRINTF_LEN];
+
+	if (ds == NULL)
+		return;
+
+	ASSERT(size == sizeof (*ds));
+	crtime = ds->ds_creation_time;
+	nicenum(ds->ds_used_bytes, used);
+	nicenum(ds->ds_compressed_bytes, compressed);
+	nicenum(ds->ds_uncompressed_bytes, uncompressed);
+	nicenum(ds->ds_unique_bytes, unique);
+	sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ds->ds_bp);
+
+	/* The value printed is ds_dir_obj, so label it as the dir object. */
+	(void) printf("\t\tdir_obj = %llu\n",
+	    (u_longlong_t)ds->ds_dir_obj);
+	(void) printf("\t\tprev_snap_obj = %llu\n",
+	    (u_longlong_t)ds->ds_prev_snap_obj);
+	(void) printf("\t\tprev_snap_txg = %llu\n",
+	    (u_longlong_t)ds->ds_prev_snap_txg);
+	(void) printf("\t\tnext_snap_obj = %llu\n",
+	    (u_longlong_t)ds->ds_next_snap_obj);
+	(void) printf("\t\tsnapnames_zapobj = %llu\n",
+	    (u_longlong_t)ds->ds_snapnames_zapobj);
+	(void) printf("\t\tnum_children = %llu\n",
+	    (u_longlong_t)ds->ds_num_children);
+	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
+	(void) printf("\t\tcreation_txg = %llu\n",
+	    (u_longlong_t)ds->ds_creation_txg);
+	(void) printf("\t\tdeadlist_obj = %llu\n",
+	    (u_longlong_t)ds->ds_deadlist_obj);
+	(void) printf("\t\tused_bytes = %s\n", used);
+	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
+	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
+	(void) printf("\t\tunique = %s\n", unique);
+	(void) printf("\t\tfsid_guid = %llu\n",
+	    (u_longlong_t)ds->ds_fsid_guid);
+	(void) printf("\t\tguid = %llu\n",
+	    (u_longlong_t)ds->ds_guid);
+	(void) printf("\t\tflags = %llx\n",
+	    (u_longlong_t)ds->ds_flags);
+	(void) printf("\t\tbp = %s\n", blkbuf);
+}
+
+/*
+ * Print the block pointer list stored in `object' of `mos', labelled
+ * with `name'.  Nothing is printed below dump_opt['d'] 3 or for an empty
+ * list.  From level 3 a one-line summary is printed; from level 5 every
+ * entry is listed as well (with all DVAs above level 5).
+ */
+static void
+dump_bplist(objset_t *mos, uint64_t object, char *name)
+{
+	bplist_t list = { 0 };
+	blkptr_t entry;
+	uint64_t cursor = 0;
+	char total[6];
+	char comp[6];
+	char uncomp[6];
+
+	if (dump_opt['d'] < 3)
+		return;
+
+	VERIFY(0 == bplist_open(&list, mos, object));
+
+	if (bplist_empty(&list))
+		goto done;
+
+	nicenum(list.bpl_phys->bpl_bytes, total);
+	if (list.bpl_dbuf->db_size != sizeof (bplist_phys_t)) {
+		(void) printf("\n %s: %llu entries, %s\n",
+		    name, (u_longlong_t)list.bpl_phys->bpl_entries, total);
+	} else {
+		nicenum(list.bpl_phys->bpl_comp, comp);
+		nicenum(list.bpl_phys->bpl_uncomp, uncomp);
+		(void) printf("\n %s: %llu entries, %s (%s/%s comp)\n",
+		    name, (u_longlong_t)list.bpl_phys->bpl_entries,
+		    total, comp, uncomp);
+	}
+
+	if (dump_opt['d'] < 5)
+		goto done;
+
+	(void) printf("\n");
+
+	while (bplist_iterate(&list, &cursor, &entry) == 0) {
+		char blkbuf[BP_SPRINTF_LEN];
+
+		ASSERT(entry.blk_birth != 0);
+		sprintf_blkptr_compact(blkbuf, &entry,
+		    dump_opt['d'] > 5 ? 1 : 0);
+		(void) printf("\tItem %3llu: %s\n",
+		    (u_longlong_t)cursor - 1, blkbuf);
+	}
+
+done:
+	/* Single exit point for every path that opened the list. */
+	bplist_close(&list);
+}
+
+/*
+ * Object viewer for a ZFS znode.  `data' is the znode_phys_t to print.
+ * Below dump_opt['d'] 3 only the object's path is printed; otherwise the
+ * path is followed by its timestamps and the remaining znode fields.
+ * When the path cannot be resolved, a "???<object#N>" placeholder is
+ * printed in its place.
+ */
+/*ARGSUSED*/
+static void
+dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	znode_phys_t *zp = data;
+	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
+	time_t when;
+
+	ASSERT(size >= sizeof (znode_phys_t));
+
+	if (zfs_obj_to_path(os, object, path, sizeof (path)) != 0) {
+		(void) snprintf(path, sizeof (path), "\?\?\?<object#%llu>",
+		    (u_longlong_t)object);
+	}
+
+	if (dump_opt['d'] < 3) {
+		(void) printf("\t%s\n", path);
+		return;
+	}
+
+	(void) printf("\tpath %s\n", path);
+
+	/* Only element [0] of each timestamp pair is shown. */
+	when = (time_t)zp->zp_atime[0];
+	(void) printf("\tatime %s", ctime(&when));
+	when = (time_t)zp->zp_mtime[0];
+	(void) printf("\tmtime %s", ctime(&when));
+	when = (time_t)zp->zp_ctime[0];
+	(void) printf("\tctime %s", ctime(&when));
+	when = (time_t)zp->zp_crtime[0];
+	(void) printf("\tcrtime %s", ctime(&when));
+
+	(void) printf("\tgen %llu\n", (u_longlong_t)zp->zp_gen);
+	(void) printf("\tmode %llo\n", (u_longlong_t)zp->zp_mode);
+	(void) printf("\tsize %llu\n", (u_longlong_t)zp->zp_size);
+	(void) printf("\tparent %llu\n", (u_longlong_t)zp->zp_parent);
+	(void) printf("\tlinks %llu\n", (u_longlong_t)zp->zp_links);
+	(void) printf("\txattr %llu\n", (u_longlong_t)zp->zp_xattr);
+	(void) printf("\trdev 0x%016llx\n", (u_longlong_t)zp->zp_rdev);
+}
+
+/* Object viewer stub for ZFS ACL objects: the body is empty, so nothing is printed. */
+/*ARGSUSED*/
+static void
+dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/* Object viewer stub for DMU objset objects: the body is empty, so nothing is printed. */
+/*ARGSUSED*/
+static void
+dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*
+ * Dump routine for each DMU object type, indexed by the type number.
+ * dump_object() calls the entry for an object's bonus type (passing the
+ * bonus buffer) and the entry for the object's own type (passing NULL).
+ * The order of the entries must therefore match the numbering of the
+ * object types; the trailing comment on each line names the type.
+ */
+static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = {
+ dump_none, /* unallocated */
+ dump_zap, /* object directory */
+ dump_uint64, /* object array */
+ dump_none, /* packed nvlist */
+ dump_packed_nvlist, /* packed nvlist size */
+ dump_none, /* bplist */
+ dump_none, /* bplist header */
+ dump_none, /* SPA space map header */
+ dump_none, /* SPA space map */
+ dump_none, /* ZIL intent log */
+ dump_dnode, /* DMU dnode */
+ dump_dmu_objset, /* DMU objset */
+ dump_dsl_dir, /* DSL directory */
+ dump_zap, /* DSL directory child map */
+ dump_zap, /* DSL dataset snap map */
+ dump_zap, /* DSL props */
+ dump_dsl_dataset, /* DSL dataset */
+ dump_znode, /* ZFS znode */
+ dump_acl, /* ZFS ACL */
+ dump_uint8, /* ZFS plain file */
+ dump_zap, /* ZFS directory */
+ dump_zap, /* ZFS master node */
+ dump_zap, /* ZFS delete queue */
+ dump_uint8, /* zvol object */
+ dump_zap, /* zvol prop */
+ dump_uint8, /* other uint8[] */
+ dump_uint64, /* other uint64[] */
+ dump_zap, /* other ZAP */
+ dump_zap, /* persistent error log */
+ dump_uint8, /* SPA history */
+ dump_uint64, /* SPA history offsets */
+ dump_zap, /* Pool properties */
+};
+
+/*
+ * Print one object of objset `os'.
+ *
+ * A column header is printed first when *print_header is non-zero (the
+ * flag is then cleared).  Every object gets a one-line summary; more is
+ * added as `verbosity' rises:
+ *   > 3   a line describing the bonus buffer, if the object has one
+ *   >= 4  the type-specific viewers for the bonus and for the object
+ *         (and *print_header is set again so the next object re-prints
+ *         the column header)
+ *   >= 5  the indirect block tree and the list of data segments
+ *   >= 6  checksum and compression are shown even when inherited
+ *
+ * Object 0 is taken from the objset's meta dnode; any other object is
+ * held through its bonus buffer, and failure to hold it is fatal.
+ */
+static void
+dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
+{
+	dmu_buf_t *db = NULL;
+	dmu_object_info_t doi;
+	dnode_t *dn;
+	void *bonus = NULL;
+	size_t bsize = 0;
+	size_t auxlen;
+	char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], segsize[6];
+	char aux[50];
+	int error;
+
+	if (*print_header) {
+		(void) printf("\n Object lvl iblk dblk lsize"
+		    " asize type\n");
+		*print_header = 0;
+	}
+
+	if (object == 0) {
+		dn = os->os->os_meta_dnode;
+	} else {
+		error = dmu_bonus_hold(os, object, FTAG, &db);
+		if (error)
+			fatal("dmu_bonus_hold(%llu) failed, errno %u",
+			    (u_longlong_t)object, error);
+		bonus = db->db_data;
+		bsize = db->db_size;
+		dn = ((dmu_buf_impl_t *)db)->db_dnode;
+	}
+	dmu_object_info_from_dnode(dn, &doi);
+
+	nicenum(doi.doi_metadata_block_size, iblk);
+	nicenum(doi.doi_data_block_size, dblk);
+	nicenum(doi.doi_data_block_size * (doi.doi_max_block_offset + 1),
+	    lsize);
+	nicenum(doi.doi_physical_blks << 9, asize);
+	nicenum(doi.doi_bonus_size, bonus_size);
+
+	aux[0] = '\0';
+
+	/*
+	 * Each append is bounded by the space actually left in aux[],
+	 * not by the size of the whole array.
+	 */
+	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
+		auxlen = strlen(aux);
+		(void) snprintf(aux + auxlen, sizeof (aux) - auxlen,
+		    " (K=%s)", zio_checksum_table[doi.doi_checksum].ci_name);
+	}
+
+	if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
+		auxlen = strlen(aux);
+		(void) snprintf(aux + auxlen, sizeof (aux) - auxlen,
+		    " (Z=%s)", zio_compress_table[doi.doi_compress].ci_name);
+	}
+
+	(void) printf("%10lld %3u %5s %5s %5s %5s %s%s\n",
+	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk, lsize,
+	    asize, dmu_ot[doi.doi_type].ot_name, aux);
+
+	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
+		(void) printf("%10s %3s %5s %5s %5s %5s %s\n",
+		    "", "", "", "", bonus_size, "bonus",
+		    dmu_ot[doi.doi_bonus_type].ot_name);
+	}
+
+	if (verbosity >= 4) {
+		object_viewer[doi.doi_bonus_type](os, object, bonus, bsize);
+		object_viewer[doi.doi_type](os, object, NULL, 0);
+		*print_header = 1;
+	}
+
+	if (verbosity >= 5)
+		dump_indirect(os, object, NULL, 0);
+
+	if (verbosity >= 5) {
+		/*
+		 * Report the list of segments that comprise the object.
+		 */
+		uint64_t start = 0;
+		uint64_t end;
+		uint64_t blkfill = 1;
+		int minlvl = 1;
+
+		if (dn->dn_type == DMU_OT_DNODE) {
+			minlvl = 0;
+			blkfill = DNODES_PER_BLOCK;
+		}
+
+		for (;;) {
+			/* Find the start of the next segment... */
+			error = dnode_next_offset(dn, B_FALSE, &start, minlvl,
+			    blkfill, 0);
+			if (error)
+				break;
+			/* ...then where it ends. */
+			end = start;
+			error = dnode_next_offset(dn, B_TRUE, &end, minlvl,
+			    blkfill, 0);
+			nicenum(end - start, segsize);
+			(void) printf("\t\tsegment [%016llx, %016llx)"
+			    " size %5s\n", (u_longlong_t)start,
+			    (u_longlong_t)end, segsize);
+			if (error)
+				break;
+			start = end;
+		}
+	}
+
+	if (db != NULL)
+		dmu_buf_rele(db, FTAG);
+}
+
+/*
+ * Printable names for objset types, indexed by the type number;
+ * dump_dir() falls back to "UNKNOWN" for values outside this table.
+ */
+static char *objset_types[DMU_OST_NUMTYPES] = {
+ "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
+
+/*
+ * Print one objset: a summary line (name, type, id, creation txg,
+ * referenced bytes, object count and, at verbosity >= 4, the root block
+ * pointer), its intent log, and its deadlist when it has a dataset.
+ * At verbosity >= 2 the objects follow: either just those listed in
+ * zopt_object[], or object 0 and then every object that
+ * dmu_object_next() finds.  The verbosity is taken from dump_opt['d'].
+ *
+ * Exits via fatal() if the object walk ends with anything but ESRCH.
+ */
+/*ARGSUSED*/
+static void
+dump_dir(objset_t *os)
+{
+ dmu_objset_stats_t dds;
+ uint64_t object, object_count;
+ uint64_t refdbytes, usedobjs, scratch;
+ char numbuf[8];
+ char blkbuf[BP_SPRINTF_LEN];
+ char osname[MAXNAMELEN];
+ char *type = "UNKNOWN";
+ int verbosity = dump_opt['d'];
+ int print_header = 1;
+ int i, error;
+
+ dmu_objset_fast_stat(os, &dds);
+
+ if (dds.dds_type < DMU_OST_NUMTYPES)
+ type = objset_types[dds.dds_type];
+
+ /*
+ * The meta objset takes its numbers from the root block pointer and
+ * the pool's MOS directory; every other objset asks the DMU.
+ */
+ if (dds.dds_type == DMU_OST_META) {
+ dds.dds_creation_txg = TXG_INITIAL;
+ usedobjs = os->os->os_rootbp->blk_fill;
+ refdbytes =
+ os->os->os_spa->spa_dsl_pool->dp_mos_dir->dd_used_bytes;
+ } else {
+ dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
+ }
+
+ ASSERT3U(usedobjs, ==, os->os->os_rootbp->blk_fill);
+
+ nicenum(refdbytes, numbuf);
+
+ /* blkbuf becomes an optional ", rootbp ..." suffix for the summary. */
+ if (verbosity >= 4) {
+ (void) strcpy(blkbuf, ", rootbp ");
+ sprintf_blkptr(blkbuf + strlen(blkbuf),
+ BP_SPRINTF_LEN - strlen(blkbuf), os->os->os_rootbp);
+ } else {
+ blkbuf[0] = '\0';
+ }
+
+ dmu_objset_name(os, osname);
+
+ (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
+ "%s, %llu objects%s\n",
+ osname, type, (u_longlong_t)dmu_objset_id(os),
+ (u_longlong_t)dds.dds_creation_txg,
+ numbuf, (u_longlong_t)usedobjs, blkbuf);
+
+ dump_intent_log(dmu_objset_zil(os));
+
+ if (dmu_objset_ds(os) != NULL)
+ dump_bplist(dmu_objset_pool(os)->dp_meta_objset,
+ dmu_objset_ds(os)->ds_phys->ds_deadlist_obj, "Deadlist");
+
+ if (verbosity < 2)
+ return;
+
+ /* Explicit object list given: dump only those objects. */
+ if (zopt_objects != 0) {
+ for (i = 0; i < zopt_objects; i++)
+ dump_object(os, zopt_object[i], verbosity,
+ &print_header);
+ (void) printf("\n");
+ return;
+ }
+
+ /* Otherwise dump object 0, then walk every remaining object. */
+ dump_object(os, 0, verbosity, &print_header);
+ object_count = 1;
+
+ object = 0;
+ while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
+ dump_object(os, object, verbosity, &print_header);
+ object_count++;
+ }
+
+ ASSERT3U(object_count, ==, usedobjs);
+
+ (void) printf("\n");
+
+ /* Any error other than ESRCH from the walk is treated as fatal. */
+ if (error != ESRCH)
+ fatal("dmu_object_next() = %d", error);
+}
+
+/*
+ * Print the fields of an uberblock.  The root block pointer is included
+ * only when -u was given at verbosity 3 or higher.
+ */
+static void
+dump_uberblock(uberblock_t *ub)
+{
+	char bpstr[BP_SPRINTF_LEN];
+	time_t stamp = ub->ub_timestamp;
+
+	(void) printf("Uberblock\n\n");
+	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
+	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
+	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
+	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
+	/* The string form goes through localtime(), i.e. local time zone. */
+	(void) printf("\ttimestamp = %llu UTC = %s",
+	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&stamp)));
+
+	if (dump_opt['u'] >= 3) {
+		sprintf_blkptr(bpstr, BP_SPRINTF_LEN, &ub->ub_rootbp);
+		(void) printf("\trootbp = %s\n", bpstr);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Dump the cached configuration nvlist of the pool named 'pool'.  With
+ * pool == NULL every known pool is listed, each preceded by its name.
+ * The walk is done under spa_namespace_lock.
+ */
+static void
+dump_config(const char *pool)
+{
+	spa_t *cur;
+
+	mutex_enter(&spa_namespace_lock);
+	for (cur = spa_next(NULL); cur != NULL; cur = spa_next(cur)) {
+		if (pool == NULL)
+			(void) printf("%s\n", spa_name(cur));
+		else if (strcmp(pool, spa_name(cur)) != 0)
+			continue;
+
+		dump_nvlist(cur->spa_config, 4);
+	}
+	mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * Read each of the VDEV_LABELS labels from the device or file 'dev' and
+ * print the nvlist packed in its vdev_phys area.
+ *
+ * Failure to open or stat 'dev' prints a message and exits with status 1.
+ * A label that cannot be read or unpacked is reported and skipped.
+ */
+static void
+dump_label(const char *dev)
+{
+	int fd;
+	vdev_label_t label;
+	char *buf = label.vl_vdev_phys.vp_nvlist;
+	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
+	struct stat64 statbuf;
+	uint64_t psize;
+	int l;
+
+	if ((fd = open64(dev, O_RDONLY)) < 0) {
+		(void) printf("cannot open '%s': %s\n", dev, strerror(errno));
+		exit(1);
+	}
+
+	if (fstat64(fd, &statbuf) != 0) {
+		(void) printf("failed to stat '%s': %s\n", dev,
+		    strerror(errno));
+		(void) close(fd);
+		exit(1);
+	}
+
+	/* Label offsets are computed from the size rounded down to a label. */
+	psize = statbuf.st_size;
+	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
+
+	for (l = 0; l < VDEV_LABELS; l++) {
+
+		nvlist_t *config = NULL;
+
+		(void) printf("--------------------------------------------\n");
+		(void) printf("LABEL %d\n", l);
+		(void) printf("--------------------------------------------\n");
+
+		if (pread64(fd, &label, sizeof (label),
+		    vdev_label_offset(psize, l, 0)) != sizeof (label)) {
+			(void) printf("failed to read label %d\n", l);
+			continue;
+		}
+
+		if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
+			(void) printf("failed to unpack label %d\n", l);
+			continue;
+		}
+		dump_nvlist(config, 4);
+		nvlist_free(config);
+	}
+
+	/* The descriptor used to be leaked; release it once we are done. */
+	(void) close(fd);
+}
+
+/*
+ * dmu_objset_find() callback: open the named dataset read-only, dump it
+ * with dump_dir() and close it again.  Always returns 0, including when
+ * the open fails (a message is printed in that case).
+ */
+/*ARGSUSED*/
+static int
+dump_one_dir(char *dsname, void *arg)
+{
+	objset_t *os;
+
+	if (dmu_objset_open(dsname, DMU_OST_ANY,
+	    DS_MODE_STANDARD | DS_MODE_READONLY, &os) != 0) {
+		(void) printf("Could not open %s\n", dsname);
+		return (0);
+	}
+
+	dump_dir(os);
+	dmu_objset_close(os);
+
+	return (0);
+}
+
+/*
+ * Load the on-disk space map of every metaslab of every child of the
+ * root vdev into that metaslab's ms_allocmap[0].  Any load error is
+ * reported through fatal().
+ */
+static void
+zdb_space_map_load(spa_t *spa)
+{
+	vdev_t *root = spa->spa_root_vdev;
+	int vi, mi;
+
+	for (vi = 0; vi < root->vdev_children; vi++) {
+		vdev_t *top = root->vdev_child[vi];
+
+		for (mi = 0; mi < top->vdev_ms_count; mi++) {
+			metaslab_t *ms = top->vdev_ms[mi];
+			int err;
+
+			mutex_enter(&ms->ms_lock);
+			err = space_map_load(&ms->ms_allocmap[0], NULL,
+			    SM_ALLOC, &ms->ms_smo, spa->spa_meta_objset);
+			mutex_exit(&ms->ms_lock);
+
+			/* Note: the number printed is the vdev index. */
+			if (err != 0) {
+				fatal("%s bad space map #%d, error %d",
+				    spa->spa_name, vi, err);
+			}
+		}
+	}
+}
+
+/*
+ * Account for one traversed block pointer against the in-core space maps:
+ * every DVA of 'bp' is moved from its metaslab's ms_allocmap[0] to
+ * ms_freemap[0].  For a gang block the gang header is read from disk and
+ * each non-hole child block pointer is claimed recursively.
+ *
+ * Returns 0 on success, or:
+ * ENXIO - a DVA names an unknown top-level vdev or a metaslab index
+ * beyond vdev_ms_count
+ * EAGAIN - the range is already in the freemap (claimed before)
+ * ESTALE - the range is not in the allocmap
+ * other - the error returned by zio_wait() for the gang header read
+ *
+ * DVAs claimed before an error is detected are not rolled back.
+ */
+static int
+zdb_space_map_claim(spa_t *spa, blkptr_t *bp, zbookmark_t *zb)
+{
+ dva_t *dva = bp->blk_dva;
+ vdev_t *vd;
+ metaslab_t *msp;
+ space_map_t *allocmap, *freemap;
+ int error;
+ int d;
+ /* Private copy; it is rewritten below to address the gang header. */
+ blkptr_t blk = *bp;
+
+ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ uint64_t vdev = DVA_GET_VDEV(&dva[d]);
+ uint64_t offset = DVA_GET_OFFSET(&dva[d]);
+ uint64_t size = DVA_GET_ASIZE(&dva[d]);
+
+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
+ return (ENXIO);
+
+ /* The offset selects the metaslab; reject out-of-range indices. */
+ if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
+ return (ENXIO);
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ allocmap = &msp->ms_allocmap[0];
+ freemap = &msp->ms_freemap[0];
+
+ /* Prepare our copy of the bp in case we need to read GBHs */
+ if (DVA_GET_GANG(&dva[d])) {
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+ DVA_SET_ASIZE(&blk.blk_dva[d], size);
+ DVA_SET_GANG(&blk.blk_dva[d], 0);
+ }
+
+ mutex_enter(&msp->ms_lock);
+ if (space_map_contains(freemap, offset, size)) {
+ mutex_exit(&msp->ms_lock);
+ return (EAGAIN); /* allocated more than once */
+ }
+
+ if (!space_map_contains(allocmap, offset, size)) {
+ mutex_exit(&msp->ms_lock);
+ return (ESTALE); /* not allocated at all */
+ }
+
+ /* Move the range from "allocated" to "seen by the traversal". */
+ space_map_remove(allocmap, offset, size);
+ space_map_add(freemap, offset, size);
+
+ mutex_exit(&msp->ms_lock);
+ }
+
+ if (BP_IS_GANG(bp)) {
+ zio_gbh_phys_t gbh;
+ int g;
+
+ /* LINTED - compile time assert */
+ ASSERT(sizeof (zio_gbh_phys_t) == SPA_GANGBLOCKSIZE);
+
+ /* Turn the copy into a plain, uncompressed read of the header. */
+ BP_SET_CHECKSUM(&blk, ZIO_CHECKSUM_GANG_HEADER);
+ BP_SET_PSIZE(&blk, SPA_GANGBLOCKSIZE);
+ BP_SET_LSIZE(&blk, SPA_GANGBLOCKSIZE);
+ BP_SET_COMPRESS(&blk, ZIO_COMPRESS_OFF);
+ error = zio_wait(zio_read(NULL, spa, &blk, &gbh,
+ SPA_GANGBLOCKSIZE, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD, zb));
+ if (error)
+ return (error);
+ if (BP_SHOULD_BYTESWAP(&blk))
+ byteswap_uint64_array(&gbh, SPA_GANGBLOCKSIZE);
+ /* Children are claimed in order; the first hole ends the list. */
+ for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ if (BP_IS_HOLE(&gbh.zg_blkptr[g]))
+ break;
+ error = zdb_space_map_claim(spa, &gbh.zg_blkptr[g], zb);
+ if (error)
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * space_map_vacate() callback used by zdb_space_map_unload(): report one
+ * segment that is still present in a metaslab's ms_allocmap[0].  'sm'
+ * must point at that ms_allocmap[0] member, because the owning metaslab
+ * is recovered from it by subtracting the member offset.
+ */
+static void
+zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	char *base = (char *)sm - offsetof(metaslab_t, ms_allocmap[0]);
+	/* LINTED */
+	metaslab_t *owner = (metaslab_t *)base;
+
+	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
+	    (u_longlong_t)owner->ms_group->mg_vd->vdev_id,
+	    (u_longlong_t)start,
+	    (u_longlong_t)size);
+}
+
+/*
+ * Counterpart of zdb_space_map_load().  For every metaslab, whatever is
+ * still in ms_allocmap[0] is reported through zdb_leak() while the map
+ * is vacated, then the map is unloaded and ms_freemap[0] is emptied.
+ */
+static void
+zdb_space_map_unload(spa_t *spa)
+{
+	vdev_t *root = spa->spa_root_vdev;
+	int vi, mi;
+
+	for (vi = 0; vi < root->vdev_children; vi++) {
+		vdev_t *top = root->vdev_child[vi];
+
+		for (mi = 0; mi < top->vdev_ms_count; mi++) {
+			metaslab_t *ms = top->vdev_ms[mi];
+			space_map_t *alloc = &ms->ms_allocmap[0];
+
+			mutex_enter(&ms->ms_lock);
+			space_map_vacate(alloc, zdb_leak, alloc);
+			space_map_unload(alloc);
+			space_map_vacate(&ms->ms_freemap[0], NULL, NULL);
+			mutex_exit(&ms->ms_lock);
+		}
+	}
+}
+
+/*
+ * Re-read the uberblock through the root vdev and, if one was found
+ * (non-zero txg), install it as spa->spa_ubsync.
+ */
+static void
+zdb_refresh_ubsync(spa_t *spa)
+{
+	vdev_t *root = spa->spa_root_vdev;
+	uberblock_t fresh = { 0 };
+	zio_t *pio;
+
+	/* Reload the uberblock; the result of the I/O itself is ignored. */
+	pio = zio_root(spa, NULL, NULL,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+	vdev_uberblock_load(pio, root, &fresh);
+	(void) zio_wait(pio);
+
+	/* A txg of 0 means "fresh" was never filled in. */
+	if (fresh.ub_txg == 0)
+		return;
+
+	spa->spa_ubsync = fresh;
+}
+
+/*
+ * Verify that the sum of the sizes of all blocks in the pool adds up
+ * to the SPA's sa_alloc total.
+ */
+/* Running totals for one (level, type) bucket; see zdb_count_block(). */
+typedef struct zdb_blkstats {
+ uint64_t zb_asize; /* sum of BP_GET_ASIZE() */
+ uint64_t zb_lsize; /* sum of BP_GET_LSIZE() */
+ uint64_t zb_psize; /* sum of BP_GET_PSIZE() */
+ uint64_t zb_count; /* number of block pointers counted */
+} zdb_blkstats_t;
+
+/* Type slot used for blocks found on the deferred-free bplist. */
+#define DMU_OT_DEFERRED DMU_OT_NONE
+/* Extra type slot that accumulates all types. */
+#define DMU_OT_TOTAL DMU_OT_NUMTYPES
+
+/* Extra level slot that accumulates all levels. */
+#define ZB_TOTAL ZB_MAXLEVEL
+
+/* State shared between dump_block_stats() and its traversal callback. */
+typedef struct zdb_cb {
+ /* indexed [level][type]; the last row/column hold the totals */
+ zdb_blkstats_t zcb_type[ZB_TOTAL + 1][DMU_OT_TOTAL + 1];
+ uint64_t zcb_errors[256]; /* read-error counts, indexed by errno */
+ traverse_blk_cache_t *zcb_cache; /* supplies the bookmark for claims */
+ int zcb_readfails; /* read failures since the last good read */
+ int zcb_haderrors; /* set once any read error is counted */
+} zdb_cb_t;
+
+/*
+ * Add 'bp' to the statistics in 'zcb' and, unless -L was given, claim
+ * its space with zdb_space_map_claim().  Each block is counted in four
+ * buckets: {its own level, all levels} x {all types, 'type'}.  Any claim
+ * failure is reported through fatal().
+ */
+static void
+zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type)
+{
+	int levels[2];
+	int types[2];
+	int li, ti, error;
+
+	levels[0] = BP_GET_LEVEL(bp);
+	levels[1] = ZB_TOTAL;
+	types[0] = DMU_OT_TOTAL;
+	types[1] = type;
+
+	for (li = 0; li < 2; li++) {
+		for (ti = 0; ti < 2; ti++) {
+			zdb_blkstats_t *bucket =
+			    &zcb->zcb_type[levels[li]][types[ti]];
+
+			bucket->zb_asize += BP_GET_ASIZE(bp);
+			bucket->zb_lsize += BP_GET_LSIZE(bp);
+			bucket->zb_psize += BP_GET_PSIZE(bp);
+			bucket->zb_count++;
+		}
+	}
+
+	/* With -L the space maps were never loaded; nothing to claim. */
+	if (dump_opt['L'])
+		return;
+
+	error = zdb_space_map_claim(spa, bp, &zcb->zcb_cache->bc_bookmark);
+	if (error == 0)
+		return;
+
+	if (error == EAGAIN)
+		(void) fatal("double-allocation, bp=%p", bp);
+
+	if (error == ESTALE)
+		(void) fatal("reference to freed block, bp=%p", bp);
+
+	(void) fatal("fatal error %d in bp %p", error, bp);
+}
+
+/*
+ * Traversal callback for dump_block_stats(); 'arg' is the zdb_cb_t.
+ *
+ * On a read error the block is reported and either retried (EAGAIN is
+ * returned after refreshing the uberblock; only with -L and for the
+ * first ten consecutive failures) or skipped (ERESTART is returned and
+ * the error is tallied).  On success the block is optionally printed
+ * (-bbbb) and counted, and 0 is returned.
+ */
+static int
+zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+	zdb_cb_t *zcb = arg;
+	zbookmark_t *zb = &bc->bc_bookmark;
+	blkptr_t *bp = &bc->bc_blkptr;
+	dmu_object_type_t type = BP_GET_TYPE(bp);
+	char bpstr[BP_SPRINTF_LEN];
+
+	if (bc->bc_errno == 0) {
+		zcb->zcb_readfails = 0;
+
+		ASSERT(!BP_IS_HOLE(bp));
+
+		if (dump_opt['b'] >= 4) {
+			sprintf_blkptr(bpstr, BP_SPRINTF_LEN, bp);
+			(void) printf("objset %llu object %llu offset 0x%llx %s\n",
+			    (u_longlong_t)zb->zb_objset,
+			    (u_longlong_t)zb->zb_object,
+			    (u_longlong_t)blkid2offset(bc->bc_dnode,
+			    zb->zb_level, zb->zb_blkid),
+			    bpstr);
+		}
+
+		zdb_count_block(spa, zcb, bp, type);
+
+		return (0);
+	} else {
+		int verdict;
+
+		if (zcb->zcb_readfails++ < 10 && dump_opt['L']) {
+			zdb_refresh_ubsync(spa);
+			verdict = EAGAIN;
+		} else {
+			zcb->zcb_haderrors = 1;
+			zcb->zcb_errors[bc->bc_errno]++;
+			verdict = ERESTART;
+		}
+
+		/* bc_errno is non-zero here, so -bb is enough to show the bp. */
+		if (dump_opt['b'] >= 2)
+			sprintf_blkptr(bpstr, BP_SPRINTF_LEN, bp);
+		else
+			bpstr[0] = '\0';
+
+		(void) printf("zdb_blkptr_cb: Got error %d reading "
+		    "<%llu, %llu, %lld, %llx> %s -- %s\n",
+		    bc->bc_errno,
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)zb->zb_object,
+		    (u_longlong_t)zb->zb_level,
+		    (u_longlong_t)zb->zb_blkid,
+		    bpstr,
+		    verdict == EAGAIN ? "retrying" : "skipping");
+
+		return (verdict);
+	}
+}
+
+/*
+ * Traverse every block of the pool, accumulate per-level / per-type
+ * block statistics and print them.  Unless -L was given the space maps
+ * are loaded first, every block seen is claimed against them, and what
+ * remains afterwards is reported as leaked.
+ *
+ * Returns 0 when everything matched, 2 when the traversed size differs
+ * from spa_get_alloc() or no block was counted at all, and 3 when read
+ * errors were recorded during the traversal.
+ */
+static int
+dump_block_stats(spa_t *spa)
+{
+ traverse_handle_t *th;
+ zdb_cb_t zcb = { 0 };
+ traverse_blk_cache_t dummy_cache = { 0 };
+ zdb_blkstats_t *zb, *tzb;
+ uint64_t alloc, space;
+ int leaks = 0;
+ int advance = zdb_advance;
+ int flags;
+ int e;
+
+ /* Gives zdb_count_block() a bookmark for the deferred-free pass. */
+ zcb.zcb_cache = &dummy_cache;
+
+ if (dump_opt['c'])
+ advance |= ADVANCE_DATA;
+
+ advance |= ADVANCE_PRUNE | ADVANCE_ZIL;
+
+ (void) printf("\nTraversing all blocks to %sverify"
+ " nothing leaked ...\n",
+ dump_opt['c'] ? "verify checksums and " : "");
+
+ /*
+ * Load all space maps. As we traverse the pool, if we find a block
+ * that's not in its space map, that indicates a double-allocation,
+ * reference to a freed block, or an unclaimed block. Otherwise we
+ * remove the block from the space map. If the space maps are not
+ * empty when we're done, that indicates leaked blocks.
+ */
+ if (!dump_opt['L'])
+ zdb_space_map_load(spa);
+
+ /*
+ * If there's a deferred-free bplist, process that first.
+ */
+ if (spa->spa_sync_bplist_obj != 0) {
+ bplist_t *bpl = &spa->spa_sync_bplist;
+ blkptr_t blk;
+ uint64_t itor = 0;
+
+ VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset,
+ spa->spa_sync_bplist_obj));
+
+ while (bplist_iterate(bpl, &itor, &blk) == 0) {
+ if (dump_opt['b'] >= 4) {
+ char blkbuf[BP_SPRINTF_LEN];
+ sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk);
+ (void) printf("[%s] %s\n",
+ "deferred free", blkbuf);
+ }
+ zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
+ }
+
+ bplist_close(bpl);
+ }
+
+ /*
+ * Now traverse the pool. If we're reading all data to verify
+ * checksums, do a scrubbing read so that we validate all copies.
+ */
+ flags = ZIO_FLAG_CANFAIL;
+ if (advance & ADVANCE_DATA)
+ flags |= ZIO_FLAG_SCRUB;
+ th = traverse_init(spa, zdb_blkptr_cb, &zcb, advance, flags);
+ th->th_noread = zdb_noread;
+
+ traverse_add_pool(th, 0, spa_first_txg(spa) + TXG_CONCURRENT_STATES);
+
+ /* Keep going for as long as the traversal asks to be called again. */
+ while (traverse_more(th) == EAGAIN)
+ continue;
+
+ traverse_fini(th);
+
+ if (zcb.zcb_haderrors) {
+ (void) printf("\nError counts:\n\n");
+ (void) printf("\t%5s %s\n", "errno", "count");
+ for (e = 0; e < 256; e++) {
+ if (zcb.zcb_errors[e] != 0) {
+ (void) printf("\t%5d %llu\n",
+ e, (u_longlong_t)zcb.zcb_errors[e]);
+ }
+ }
+ }
+
+ /*
+ * Report any leaked segments.
+ */
+ if (!dump_opt['L'])
+ zdb_space_map_unload(spa);
+
+ if (dump_opt['L'])
+ (void) printf("\n\n *** Live pool traversal; "
+ "block counts are only approximate ***\n\n");
+
+ alloc = spa_get_alloc(spa);
+ space = spa_get_space(spa);
+
+ /* Grand total over all levels and all types. */
+ tzb = &zcb.zcb_type[ZB_TOTAL][DMU_OT_TOTAL];
+
+ if (tzb->zb_asize == alloc) {
+ (void) printf("\n\tNo leaks (block sum matches space"
+ " maps exactly)\n");
+ } else {
+ (void) printf("block traversal size %llu != alloc %llu "
+ "(leaked %lld)\n",
+ (u_longlong_t)tzb->zb_asize,
+ (u_longlong_t)alloc,
+ (u_longlong_t)(alloc - tzb->zb_asize));
+ leaks = 1;
+ }
+
+ /* Nothing counted: bail out before the averages divide by zb_count. */
+ if (tzb->zb_count == 0)
+ return (2);
+
+ (void) printf("\n");
+ (void) printf("\tbp count: %10llu\n",
+ (u_longlong_t)tzb->zb_count);
+ (void) printf("\tbp logical: %10llu\t avg: %6llu\n",
+ (u_longlong_t)tzb->zb_lsize,
+ (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
+ (void) printf("\tbp physical: %10llu\t avg:"
+ " %6llu\tcompression: %6.2f\n",
+ (u_longlong_t)tzb->zb_psize,
+ (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
+ (double)tzb->zb_lsize / tzb->zb_psize);
+ (void) printf("\tbp allocated: %10llu\t avg:"
+ " %6llu\tcompression: %6.2f\n",
+ (u_longlong_t)tzb->zb_asize,
+ (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
+ (double)tzb->zb_lsize / tzb->zb_asize);
+ (void) printf("\tSPA allocated: %10llu\tused: %5.2f%%\n",
+ (u_longlong_t)alloc, 100.0 * alloc / space);
+
+ /* Per-type table at -bb; per-level rows are added at -bbb. */
+ if (dump_opt['b'] >= 2) {
+ int l, t, level;
+ (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
+ "\t avg\t comp\t%%Total\tType\n");
+
+ for (t = 0; t <= DMU_OT_NUMTYPES; t++) {
+ char csize[6], lsize[6], psize[6], asize[6], avg[6];
+ char *typename;
+
+ typename = t == DMU_OT_DEFERRED ? "deferred free" :
+ t == DMU_OT_TOTAL ? "Total" : dmu_ot[t].ot_name;
+
+ if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
+ (void) printf("%6s\t%5s\t%5s\t%5s"
+ "\t%5s\t%5s\t%6s\t%s\n",
+ "-",
+ "-",
+ "-",
+ "-",
+ "-",
+ "-",
+ "-",
+ typename);
+ continue;
+ }
+
+ /* Highest level first; l == -1 stands for the all-levels row. */
+ for (l = ZB_TOTAL - 1; l >= -1; l--) {
+ level = (l == -1 ? ZB_TOTAL : l);
+ zb = &zcb.zcb_type[level][t];
+
+ if (zb->zb_asize == 0)
+ continue;
+
+ if (dump_opt['b'] < 3 && level != ZB_TOTAL)
+ continue;
+
+ /* An L0 row identical to the total row adds nothing. */
+ if (level == 0 && zb->zb_asize ==
+ zcb.zcb_type[ZB_TOTAL][t].zb_asize)
+ continue;
+
+ nicenum(zb->zb_count, csize);
+ nicenum(zb->zb_lsize, lsize);
+ nicenum(zb->zb_psize, psize);
+ nicenum(zb->zb_asize, asize);
+ nicenum(zb->zb_asize / zb->zb_count, avg);
+
+ (void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
+ "\t%5.2f\t%6.2f\t",
+ csize, lsize, psize, asize, avg,
+ (double)zb->zb_lsize / zb->zb_psize,
+ 100.0 * zb->zb_asize / tzb->zb_asize);
+
+ if (level == ZB_TOTAL)
+ (void) printf("%s\n", typename);
+ else
+ (void) printf(" L%d %s\n",
+ level, typename);
+ }
+ }
+ }
+
+ (void) printf("\n");
+
+ if (leaks)
+ return (2);
+
+ if (zcb.zcb_haderrors)
+ return (3);
+
+ return (0);
+}
+
+/*
+ * Produce the pool-level output selected by the -u, -d, -i, -b, -c and
+ * -s options.  If dump_block_stats() reports a non-zero status the
+ * process exits with it, after the pool statistics have been shown.
+ */
+static void
+dump_zpool(spa_t *spa)
+{
+	dsl_pool_t *dp = spa_get_dsl(spa);
+	int status = 0;
+
+	if (dump_opt['u'])
+		dump_uberblock(&spa->spa_uberblock);
+
+	if (dump_opt['d'] || dump_opt['i']) {
+		/* Meta objset first, then every dataset and snapshot. */
+		dump_dir(dp->dp_meta_objset);
+
+		if (dump_opt['d'] >= 3) {
+			dump_bplist(dp->dp_meta_objset,
+			    spa->spa_sync_bplist_obj, "Deferred frees");
+			dump_dtl(spa->spa_root_vdev, 0);
+			dump_metaslabs(spa);
+		}
+
+		(void) dmu_objset_find(spa->spa_name, dump_one_dir, NULL,
+		    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+	}
+
+	if (dump_opt['b'] || dump_opt['c'])
+		status = dump_block_stats(spa);
+
+	if (dump_opt['s'])
+		show_pool_stats(spa);
+
+	if (status != 0)
+		exit(status);
+}
+
+/*
+ * Option bits for zdb_read_block().  main() binds each one to a flag
+ * character through flagbits[] when -R is given; the character is noted
+ * next to each bit.
+ */
+#define ZDB_FLAG_CHECKSUM 0x0001 /* 'c' */
+#define ZDB_FLAG_DECOMPRESS 0x0002 /* 'd' */
+#define ZDB_FLAG_BSWAP 0x0004 /* 'e' */
+#define ZDB_FLAG_GBH 0x0008 /* 'g' */
+#define ZDB_FLAG_INDIRECT 0x0010 /* 'i' */
+#define ZDB_FLAG_PHYS 0x0020 /* 'p' */
+#define ZDB_FLAG_RAW 0x0040 /* 'r' */
+#define ZDB_FLAG_PRINT_BLKPTR 0x0080 /* 'b' */
+
+/* Flag character -> ZDB_FLAG_* bit; zdb_read_block() rejects entries of 0. */
+int flagbits[256];
+
+/*
+ * Print the decoded fields of a block pointer: each DVA, the sizes,
+ * byte order, type, birth txg, level, fill count, checksum and
+ * compression functions, and the checksum words.
+ *
+ * When ZDB_FLAG_BSWAP is set in 'flags' the block pointer is byteswapped
+ * in place before it is decoded, so the caller's copy is modified.
+ */
+static void
+zdb_print_blkptr(blkptr_t *bp, int flags)
+{
+ dva_t *dva = bp->blk_dva;
+ int d;
+
+ if (flags & ZDB_FLAG_BSWAP)
+ byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
+ /*
+ * Super-ick warning: This code is also duplicated in
+ * cmd/mdb/common/modules/zfs/zfs.c . Yeah, I hate code
+ * replication, too.
+ */
+ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ (void) printf("\tDVA[%d]: vdev_id %lld / %llx\n", d,
+ (longlong_t)DVA_GET_VDEV(&dva[d]),
+ (longlong_t)DVA_GET_OFFSET(&dva[d]));
+ (void) printf("\tDVA[%d]: GANG: %-5s GRID: %04llx\t"
+ "ASIZE: %llx\n", d,
+ DVA_GET_GANG(&dva[d]) ? "TRUE" : "FALSE",
+ (longlong_t)DVA_GET_GRID(&dva[d]),
+ (longlong_t)DVA_GET_ASIZE(&dva[d]));
+ /*
+ * Third line: the DVA in vdev:offset:size:flags form.
+ * NOTE(review): the non-gang, level != 0 case prints "d", the same
+ * letter as the compressed case; the flag list documented for
+ * zdb_read_block() uses 'i' for indirect blocks - confirm intent.
+ */
+ (void) printf("\tDVA[%d]: :%llu:%llx:%llx:%s%s%s%s\n", d,
+ (u_longlong_t)DVA_GET_VDEV(&dva[d]),
+ (longlong_t)DVA_GET_OFFSET(&dva[d]),
+ (longlong_t)BP_GET_PSIZE(bp),
+ BP_SHOULD_BYTESWAP(bp) ? "e" : "",
+ !DVA_GET_GANG(&dva[d]) && BP_GET_LEVEL(bp) != 0 ?
+ "d" : "",
+ DVA_GET_GANG(&dva[d]) ? "g" : "",
+ BP_GET_COMPRESS(bp) != 0 ? "d" : "");
+ }
+ (void) printf("\tLSIZE: %-16llx\t\tPSIZE: %llx\n",
+ (longlong_t)BP_GET_LSIZE(bp), (longlong_t)BP_GET_PSIZE(bp));
+ (void) printf("\tENDIAN: %6s\t\t\t\t\tTYPE: %s\n",
+ BP_GET_BYTEORDER(bp) ? "LITTLE" : "BIG",
+ dmu_ot[BP_GET_TYPE(bp)].ot_name);
+ (void) printf("\tBIRTH: %-16llx LEVEL: %-2llu\tFILL: %llx\n",
+ (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_GET_LEVEL(bp),
+ (u_longlong_t)bp->blk_fill);
+ (void) printf("\tCKFUNC: %-16s\t\tCOMP: %s\n",
+ zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
+ zio_compress_table[BP_GET_COMPRESS(bp)].ci_name);
+ (void) printf("\tCKSUM: %llx:%llx:%llx:%llx\n",
+ (u_longlong_t)bp->blk_cksum.zc_word[0],
+ (u_longlong_t)bp->blk_cksum.zc_word[1],
+ (u_longlong_t)bp->blk_cksum.zc_word[2],
+ (u_longlong_t)bp->blk_cksum.zc_word[3]);
+}
+
+/*
+ * Print 'nbps' consecutive block pointers starting at 'bp', each through
+ * zdb_print_blkptr() with the same 'flags'.
+ */
+static void
+zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
+{
+	blkptr_t *cur = bp;
+	int left = nbps;
+
+	while (left-- > 0)
+		zdb_print_blkptr(cur++, flags);
+}
+
+/*
+ * Treat 'buf' as a gang block header and print its SPA_GBH_NBLKPTRS
+ * block pointers.
+ */
+static void
+zdb_dump_gbh(void *buf, int flags)
+{
+	blkptr_t *first = (blkptr_t *)buf;
+
+	zdb_dump_indirect(first, SPA_GBH_NBLKPTRS, flags);
+}
+
+/*
+ * Write 'size' bytes of 'buf' unformatted to file descriptor 2, after
+ * byteswapping the buffer in place when ZDB_FLAG_BSWAP is set.
+ *
+ * write() may accept fewer bytes than requested or be interrupted, so
+ * the call is repeated until the whole buffer is out.  A real failure
+ * is reported instead of silently truncating the dump.
+ */
+static void
+zdb_dump_block_raw(void *buf, uint64_t size, int flags)
+{
+	char *next = buf;
+	uint64_t left = size;
+
+	if (flags & ZDB_FLAG_BSWAP)
+		byteswap_uint64_array(buf, size);
+
+	while (left > 0) {
+		ssize_t done = write(2, next, left);
+
+		if (done < 0 && errno == EINTR)
+			continue;
+		if (done <= 0) {
+			(void) printf("raw dump failed: %s\n",
+			    done < 0 ? strerror(errno) : "no progress");
+			return;
+		}
+		next += done;
+		left -= done;
+	}
+}
+
+/*
+ * Print 'size' bytes of 'buf' as a hex dump, two 64-bit words per line,
+ * followed by the same 16 bytes as characters ('.' for non-printable
+ * ones).  'label' is printed as a heading.  With ZDB_FLAG_BSWAP each
+ * word is shown byteswapped; the character column always shows the bytes
+ * as they are stored in 'buf'.
+ *
+ * Words are consumed in pairs, so 'size' is expected to be a multiple of
+ * 16 bytes.
+ */
+static void
+zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
+{
+	uint64_t *d = (uint64_t *)buf;
+	int nwords = size / sizeof (uint64_t);
+	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
+	int i, j;
+	char *hdr;
+	unsigned char *c;
+
+	if (do_bswap)
+		hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8";
+	else
+		hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f";
+
+	(void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr);
+
+	for (i = 0; i < nwords; i += 2) {
+		(void) printf("%06llx: %016llx %016llx ",
+		    (u_longlong_t)(i * sizeof (uint64_t)),
+		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
+		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
+
+		/*
+		 * isprint() is only defined for unsigned char values and EOF;
+		 * going through a plain (possibly signed) char would hand it
+		 * negative values for bytes >= 0x80.
+		 */
+		c = (unsigned char *)&d[i];
+		for (j = 0; j < 2 * sizeof (uint64_t); j++)
+			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
+		(void) printf("\n");
+	}
+}
+
+/*
+ * There are two acceptable formats:
+ *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
+ *	child[.child]*    - For example: 0.1.1
+ *
+ * The second form can be used to specify arbitrary vdevs anywhere
+ * in the hierarchy.  For example, in a pool with a mirror of
+ * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
+ *
+ * A leaf name matches a vdev when it equals the vdev's full path, the
+ * last component of that path, or the last component with a trailing
+ * "s0" removed.  Children without a path are searched recursively.
+ *
+ * Returns the matching vdev, or NULL if 'vdev' is NULL, a child index is
+ * out of range, or no leaf matches.
+ */
+static vdev_t *
+zdb_vdev_lookup(vdev_t *vdev, char *path)
+{
+	char *s, *p;
+	size_t plen;
+	int i;
+
+	if (vdev == NULL)
+		return (NULL);
+
+	/* First, assume the x.x.x.x format */
+	i = (int)strtoul(path, &s, 10);
+	if (s == path || (s && *s != '.' && *s != '\0'))
+		goto name;
+	if (i < 0 || i >= vdev->vdev_children)
+		return (NULL);
+
+	vdev = vdev->vdev_child[i];
+	if (*s == '\0')
+		return (vdev);
+	return (zdb_vdev_lookup(vdev, s+1));
+
+name:
+	for (i = 0; i < vdev->vdev_children; i++) {
+		vdev_t *vc = vdev->vdev_child[i];
+
+		if (vc->vdev_path == NULL) {
+			vc = zdb_vdev_lookup(vc, path);
+			if (vc == NULL)
+				continue;
+			else
+				return (vc);
+		}
+
+		if (strcmp(vc->vdev_path, path) == 0)
+			return (vc);
+
+		p = strrchr(vc->vdev_path, '/');
+		p = p ? p + 1 : vc->vdev_path;
+		if (strcmp(p, path) == 0)
+			return (vc);
+
+		/*
+		 * Slice-0 shorthand: "c1t0d0" names ".../c1t0d0s0".  The
+		 * suffix is only looked at when the component is long enough
+		 * to have one, and 'path' must be exactly the component minus
+		 * "s0" so that e.g. "c1t0d0s1" does not match "c1t0d0s0".
+		 */
+		plen = strlen(p);
+		if (plen > 2 && strcmp(p + plen - 2, "s0") == 0 &&
+		    strlen(path) == plen - 2 &&
+		    strncmp(p, path, plen - 2) == 0)
+			return (vc);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Read a block from a pool and print it out.  The syntax of the
+ * block descriptor is:
+ *
+ *	pool:vdev_specifier:offset:size[:flags]
+ *
+ *	pool           - The name of the pool you wish to read from
+ *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
+ *	offset         - offset, in hex, in bytes
+ *	size           - Amount of data to read, in hex, in bytes
+ *	flags          - A string of characters specifying options
+ *		 b: Decode a blkptr at given offset within block
+ *		*c: Calculate and display checksums
+ *		*d: Decompress data before dumping
+ *		 e: Byteswap data before dumping
+ *		*g: Display data as a gang block header
+ *		*i: Display as an indirect block
+ *		 p: Do I/O to physical offset
+ *		 r: Dump raw data to stderr (file descriptor 2)
+ *
+ *              * = not yet implemented
+ *
+ * 'spap' caches the open pool between calls: if *spap is NULL or names a
+ * different pool, that pool is closed and the requested one opened, and
+ * *spap is updated.  The caller closes the final pool.
+ *
+ * A bad descriptor, flag argument or vdev prints a message and returns
+ * without reading; failure to open the pool is fatal.
+ */
+static void
+zdb_read_block(char *thing, spa_t **spap)
+{
+	spa_t *spa = *spap;
+	int flags = 0;
+	uint64_t offset = 0, size = 0, blkptr_offset = 0;
+	zio_t *zio;
+	vdev_t *vd;
+	void *buf;
+	char *s, *p, *dup, *spa_name, *vdev, *flagstr;
+	int i, error, zio_flags;
+
+	/* strtok() writes into its argument, so work on a private copy. */
+	dup = strdup(thing);
+	if (dup == NULL)
+		fatal("out of memory parsing '%s'", thing);
+
+	s = strtok(dup, ":");
+	spa_name = s ? s : "";
+	s = strtok(NULL, ":");
+	vdev = s ? s : "";
+	s = strtok(NULL, ":");
+	offset = strtoull(s ? s : "", NULL, 16);
+	s = strtok(NULL, ":");
+	size = strtoull(s ? s : "", NULL, 16);
+	s = strtok(NULL, ":");
+	flagstr = s ? s : "";
+
+	s = NULL;
+	if (size == 0)
+		s = "size must not be zero";
+	if (!IS_P2ALIGNED(size, DEV_BSIZE))
+		s = "size must be a multiple of sector size";
+	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
+		s = "offset must be a multiple of sector size";
+	if (s) {
+		(void) printf("Invalid block specifier: %s - %s\n", thing, s);
+		free(dup);
+		return;
+	}
+
+	/*
+	 * flagstr is a single ':'-delimited token, so it is scanned
+	 * directly, one flag character at a time.
+	 */
+	for (i = 0; flagstr[i] != '\0'; i++) {
+		int bit = flagbits[(uchar_t)flagstr[i]];
+
+		if (bit == 0) {
+			(void) printf("***Invalid flag: %c\n",
+			    flagstr[i]);
+			continue;
+		}
+		flags |= bit;
+
+		/* If it's not something with an argument, keep going */
+		if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_DECOMPRESS |
+		    ZDB_FLAG_PRINT_BLKPTR)) == 0)
+			continue;
+
+		p = &flagstr[i + 1];
+		if (bit == ZDB_FLAG_PRINT_BLKPTR)
+			blkptr_offset = strtoull(p, &p, 16);
+		if (*p != ':' && *p != '\0') {
+			(void) printf("***Invalid flag arg: '%s'\n", flagstr);
+			free(dup);
+			return;
+		}
+
+		/*
+		 * Resume after the argument.  Without this the hex digits
+		 * of the blkptr offset would be re-read as flag characters
+		 * ('b', 'c', 'd' and 'e' are all valid hex digits).
+		 */
+		i = (int)(p - flagstr) - 1;
+	}
+
+	/* The blkptr to decode must lie entirely inside what we read. */
+	if ((flags & ZDB_FLAG_PRINT_BLKPTR) &&
+	    (blkptr_offset > size ||
+	    size - blkptr_offset < sizeof (blkptr_t))) {
+		(void) printf("***Invalid blkptr offset: %llx\n",
+		    (u_longlong_t)blkptr_offset);
+		free(dup);
+		return;
+	}
+
+	if (spa == NULL || spa->spa_name == NULL ||
+	    strcmp(spa->spa_name, spa_name)) {
+		if (spa && spa->spa_name)
+			spa_close(spa, (void *)zdb_read_block);
+		error = spa_open(spa_name, spap, (void *)zdb_read_block);
+		if (error)
+			fatal("Failed to open pool '%s': errno = %d\n",
+			    spa_name, error);
+		spa = *spap;
+	}
+
+	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
+	if (vd == NULL) {
+		(void) printf("***Invalid vdev: %s\n", vdev);
+		free(dup);
+		return;
+	} else {
+		if (vd->vdev_path)
+			(void) printf("Found vdev: %s\n", vd->vdev_path);
+		else
+			(void) printf("Found vdev type: %s\n",
+			    vd->vdev_ops->vdev_op_type);
+	}
+
+	buf = umem_alloc(size, UMEM_NOFAIL);
+
+	zio_flags = ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
+	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK;
+
+	if (flags & ZDB_FLAG_PHYS)
+		zio_flags |= ZIO_FLAG_PHYSICAL;
+
+	zio = zio_root(spa, NULL, NULL, 0);
+	/* XXX todo - cons up a BP so RAID-Z will be happy */
+	zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, buf, size,
+	    ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, zio_flags, NULL, NULL));
+	error = zio_wait(zio);
+
+	if (error) {
+		(void) printf("Read of %s failed, error: %d\n", thing, error);
+		goto out;
+	}
+
+	/* Exactly one presentation is used, in this order of precedence. */
+	if (flags & ZDB_FLAG_PRINT_BLKPTR)
+		zdb_print_blkptr((blkptr_t *)(void *)
+		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
+	else if (flags & ZDB_FLAG_RAW)
+		zdb_dump_block_raw(buf, size, flags);
+	else if (flags & ZDB_FLAG_INDIRECT)
+		zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
+		    flags);
+	else if (flags & ZDB_FLAG_GBH) {
+		/* zdb_dump_gbh() always decodes a full gang block header. */
+		if (size < SPA_GANGBLOCKSIZE)
+			(void) printf("***Block too small for a gang "
+			    "block header: %llx\n", (u_longlong_t)size);
+		else
+			zdb_dump_gbh(buf, flags);
+	} else
+		zdb_dump_block(thing, buf, size, flags);
+
+out:
+	umem_free(buf, size);
+	free(dup);
+}
+
+/*
+ * zdb entry point: parse the options, initialise the userland ZFS
+ * context, then dispatch on what was asked for:
+ *
+ *	-C with no operand	dump every cached pool configuration
+ *	-l device		dump the vdev labels of 'device'
+ *	-R pool:vdev:...	read and print raw blocks
+ *	pool or pool/dataset	dump the pool or dataset; any further
+ *				operands are object numbers to restrict
+ *				the dump to
+ *
+ * When no selecting option is given, every dump option except L, l and R
+ * is switched on.  Each -v raises the level of every enabled option.
+ */
+int
+main(int argc, char **argv)
+{
+	int i, c;
+	struct rlimit rl = { 1024, 1024 };
+	spa_t *spa;
+	objset_t *os = NULL;
+	char *endstr;
+	int dump_all = 1;
+	int verbose = 0;
+	int error;
+	int flag, set;
+
+	(void) setrlimit(RLIMIT_NOFILE, &rl);
+	(void) enable_extended_FILE_stdio(-1, -1);
+
+	dprintf_setup(&argc, argv);
+
+	while ((c = getopt(argc, argv, "udibcsvCLO:B:UlR")) != -1) {
+		switch (c) {
+		case 'u':
+		case 'd':
+		case 'i':
+		case 'b':
+		case 'c':
+		case 's':
+		case 'C':
+		case 'l':
+		case 'R':
+			dump_opt[c]++;
+			dump_all = 0;
+			break;
+		case 'L':
+			dump_opt[c]++;
+			break;
+		case 'O':
+			/* "-O name" sets a traversal flag, "-O !name" clears it. */
+			endstr = optarg;
+			if (endstr[0] == '!') {
+				endstr++;
+				set = 0;
+			} else {
+				set = 1;
+			}
+			if (strcmp(endstr, "post") == 0) {
+				flag = ADVANCE_PRE;
+				set = !set;
+			} else if (strcmp(endstr, "pre") == 0) {
+				flag = ADVANCE_PRE;
+			} else if (strcmp(endstr, "prune") == 0) {
+				flag = ADVANCE_PRUNE;
+			} else if (strcmp(endstr, "data") == 0) {
+				flag = ADVANCE_DATA;
+			} else if (strcmp(endstr, "holes") == 0) {
+				flag = ADVANCE_HOLES;
+			} else {
+				usage();
+			}
+			if (set)
+				zdb_advance |= flag;
+			else
+				zdb_advance &= ~flag;
+			break;
+		case 'B':
+			/*
+			 * objset, object, level and blkid, each separated by
+			 * one arbitrary character.  A field is only parsed
+			 * while there is input left, so a short argument no
+			 * longer makes the parser step past the terminating
+			 * NUL; missing fields keep their previous value.
+			 */
+			endstr = optarg;
+			zdb_noread.zb_objset = strtoull(endstr, &endstr, 0);
+			if (*endstr != '\0')
+				zdb_noread.zb_object =
+				    strtoull(endstr + 1, &endstr, 0);
+			if (*endstr != '\0')
+				zdb_noread.zb_level =
+				    strtol(endstr + 1, &endstr, 0);
+			if (*endstr != '\0')
+				zdb_noread.zb_blkid =
+				    strtoull(endstr + 1, &endstr, 16);
+			(void) printf("simulating bad block "
+			    "<%llu, %llu, %lld, %llx>\n",
+			    (u_longlong_t)zdb_noread.zb_objset,
+			    (u_longlong_t)zdb_noread.zb_object,
+			    (u_longlong_t)zdb_noread.zb_level,
+			    (u_longlong_t)zdb_noread.zb_blkid);
+			break;
+		case 'v':
+			verbose++;
+			break;
+		case 'U':
+			spa_config_dir = "/tmp";
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+
+	kernel_init(FREAD);
+
+	/*
+	 * Disable vdev caching.  If we don't do this, live pool traversal
+	 * won't make progress because it will never see disk updates.
+	 */
+	zfs_vdev_cache_size = 0;
+
+	for (c = 0; c < 256; c++) {
+		if (dump_all && c != 'L' && c != 'l' && c != 'R')
+			dump_opt[c] = 1;
+		if (dump_opt[c])
+			dump_opt[c] += verbose;
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		if (dump_opt['C']) {
+			dump_config(NULL);
+			return (0);
+		}
+		usage();
+	}
+
+	if (dump_opt['l']) {
+		dump_label(argv[0]);
+		return (0);
+	}
+
+	if (dump_opt['R']) {
+		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
+		flagbits['c'] = ZDB_FLAG_CHECKSUM;
+		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
+		flagbits['e'] = ZDB_FLAG_BSWAP;
+		flagbits['g'] = ZDB_FLAG_GBH;
+		flagbits['i'] = ZDB_FLAG_INDIRECT;
+		flagbits['p'] = ZDB_FLAG_PHYS;
+		flagbits['r'] = ZDB_FLAG_RAW;
+
+		/* zdb_read_block() keeps the pool open across operands. */
+		spa = NULL;
+		while (argv[0]) {
+			zdb_read_block(argv[0], &spa);
+			argv++;
+			argc--;
+		}
+		if (spa)
+			spa_close(spa, (void *)zdb_read_block);
+		return (0);
+	}
+
+	if (dump_opt['C'])
+		dump_config(argv[0]);
+
+	/* A '/' in the operand means a dataset rather than a whole pool. */
+	if (strchr(argv[0], '/') != NULL) {
+		error = dmu_objset_open(argv[0], DMU_OST_ANY,
+		    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+	} else {
+		error = spa_open(argv[0], &spa, FTAG);
+	}
+
+	if (error)
+		fatal("can't open %s: error %d", argv[0], error);
+
+	argv++;
+	if (--argc > 0) {
+		zopt_objects = argc;
+		zopt_object = calloc(zopt_objects, sizeof (uint64_t));
+		if (zopt_object == NULL)
+			fatal("out of memory for %d object numbers",
+			    zopt_objects);
+		for (i = 0; i < zopt_objects; i++) {
+			errno = 0;
+			zopt_object[i] = strtoull(argv[i], NULL, 0);
+			if (zopt_object[i] == 0 && errno != 0)
+				fatal("bad object number %s: %s",
+				    argv[i], strerror(errno));
+		}
+	}
+
+	if (os != NULL) {
+		dump_dir(os);
+		dmu_objset_close(os);
+	} else {
+		dump_zpool(spa);
+		spa_close(spa, FTAG);
+	}
+
+	kernel_fini();
+
+	return (0);
+}
diff --git a/contrib/opensolaris/cmd/zdb/zdb_il.c b/contrib/opensolaris/cmd/zdb/zdb_il.c
new file mode 100644
index 0000000..10dfe20
--- /dev/null
+++ b/contrib/opensolaris/cmd/zdb/zdb_il.c
@@ -0,0 +1,354 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Print intent log header and statistics.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+
+extern uint8_t dump_opt[256];
+
+/*
+ * Render the block pointer 'bp' into a string with sprintf_blkptr() and
+ * print it on one line, preceded by 'prefix'.
+ */
+static void
+print_log_bp(const blkptr_t *bp, const char *prefix)
+{
+	char bpstr[BP_SPRINTF_LEN];
+
+	sprintf_blkptr(bpstr, sizeof (bpstr), bp);
+	(void) printf("%s%s\n", prefix, bpstr);
+}
+
+/*
+ * Print a create-type log record: the entry name (and, for TX_SYMLINK,
+ * the string that follows it), the creation time, and the ids and
+ * attributes stored in the record.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_create(zilog_t *zilog, int txtype, lr_create_t *lr)
+{
+	time_t crtime = lr->lr_crtime[0];
+	/* The name string starts right behind the fixed-size record. */
+	char *name = (char *)(lr + 1);
+	/* A second string follows the name's terminating NUL. */
+	char *link = name + strlen(name) + 1;
+
+	if (txtype == TX_SYMLINK)
+		(void) printf("\t\t\t%s -> %s\n", name, link);
+	else
+		(void) printf("\t\t\t%s\n", name);
+
+	/* ctime() output already ends in a newline. */
+	(void) printf("\t\t\t%s", ctime(&crtime));
+	/* %llo is an unsigned conversion, so pass an unsigned value. */
+	(void) printf("\t\t\tdoid %llu, foid %llu, mode %llo\n",
+	    (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_foid,
+	    (u_longlong_t)lr->lr_mode);
+	(void) printf("\t\t\tuid %llu, gid %llu, gen %llu, rdev 0x%llx\n",
+	    (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid,
+	    (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev);
+}
+
+/*
+ * Print a remove-type log record: its doid field and the name string
+ * stored immediately after the fixed-size record.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr)
+{
+	char *name = (char *)(lr + 1);
+	u_longlong_t doid = (u_longlong_t)lr->lr_doid;
+
+	(void) printf("\t\t\tdoid %llu, name %s\n", doid, name);
+}
+
+/*
+ * Print a link log record: its doid and link_obj fields and the name
+ * string stored immediately after the fixed-size record.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_link(zilog_t *zilog, int txtype, lr_link_t *lr)
+{
+	char *name = (char *)(lr + 1);
+	u_longlong_t doid = (u_longlong_t)lr->lr_doid;
+	u_longlong_t link_obj = (u_longlong_t)lr->lr_link_obj;
+
+	(void) printf("\t\t\tdoid %llu, link_obj %llu, name %s\n",
+	    doid, link_obj, name);
+}
+
+/*
+ * Print a rename log record: the sdoid and tdoid fields, then the two
+ * NUL-terminated strings that follow the fixed-size record (printed as
+ * "src" and "tgt").
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr)
+{
+	char *src = (char *)(lr + 1);
+	/* The second string begins one byte past the first one's NUL. */
+	char *tgt = src + strlen(src) + 1;
+	u_longlong_t sdoid = (u_longlong_t)lr->lr_sdoid;
+	u_longlong_t tdoid = (u_longlong_t)lr->lr_tdoid;
+
+	(void) printf("\t\t\tsdoid %llu, tdoid %llu\n", sdoid, tdoid);
+	(void) printf("\t\t\tsrc %s tgt %s\n", src, tgt);
+}
+
+/*
+ * Print a write log record.  The summary line (foid, offset, length,
+ * blkoff) is always printed.  When the verbosity (the larger of the 'd'
+ * and 'i' options) is at least 5 the written bytes are dumped as well:
+ * the first 20 below verbosity 6, otherwise up to SPA_MAXBLOCKSIZE.
+ *
+ * A record whose length equals sizeof (lr_write_t) carries no inline
+ * data; in that case the data is read through the record's block
+ * pointer (or taken as zeroes when the pointer's birth txg is 0).
+ * Otherwise the data follows the record directly.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
+{
+	char *data, *dlimit;
+	blkptr_t *bp = &lr->lr_blkptr;
+	char buf[SPA_MAXBLOCKSIZE];
+	int verbose = MAX(dump_opt['d'], dump_opt['i']);
+	int error;
+
+	/* %llx is an unsigned conversion, so every argument is unsigned. */
+	(void) printf("\t\t\tfoid %llu, offset 0x%llx,"
+	    " length 0x%llx, blkoff 0x%llx\n",
+	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
+	    (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blkoff);
+
+	if (verbose < 5)
+		return;
+
+	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+		(void) printf("\t\t\thas blkptr, %s\n",
+		    bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
+		    "will claim" : "won't claim");
+		print_log_bp(bp, "\t\t\t");
+		if (bp->blk_birth == 0) {
+			bzero(buf, sizeof (buf));
+		} else {
+			zbookmark_t zb;
+
+			ASSERT3U(bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ==,
+			    dmu_objset_id(zilog->zl_os));
+
+			zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
+			zb.zb_object = 0;
+			zb.zb_level = -1;
+			zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+			error = zio_wait(zio_read(NULL, zilog->zl_spa,
+			    bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
+			    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
+			/* Nothing to dump if the block cannot be read. */
+			if (error)
+				return;
+		}
+		data = buf + lr->lr_blkoff;
+	} else {
+		data = (char *)(lr + 1);
+	}
+
+	dlimit = data + MIN(lr->lr_length,
+	    (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));
+
+	(void) printf("\t\t\t");
+	while (data < dlimit) {
+		/*
+		 * Treat each byte as unsigned: isprint() is only defined
+		 * for unsigned char values (and EOF), and a sign-extended
+		 * byte would otherwise print as FFFFFFxx with %X.
+		 */
+		unsigned char byte = (unsigned char)*data;
+
+		if (isprint(byte))
+			(void) printf("%c ", byte);
+		else
+			(void) printf("%2X", byte);
+		data++;
+	}
+	(void) printf("\n");
+}
+
+/*
+ * Print a truncate log record: its foid, offset and length fields.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_truncate(zilog_t *zilog, int txtype, lr_truncate_t *lr)
+{
+	/* %llx is an unsigned conversion, so the offset is passed unsigned. */
+	(void) printf("\t\t\tfoid %llu, offset 0x%llx, length 0x%llx\n",
+	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
+	    (u_longlong_t)lr->lr_length);
+}
+
+/*
+ * Print a setattr log record: the foid and attribute mask, followed by
+ * one line for every attribute whose AT_* bit is set in the mask.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_setattr(zilog_t *zilog, int txtype, lr_setattr_t *lr)
+{
+	/* Element 0 of each time pair is handed to ctime() below. */
+	time_t atime = (time_t)lr->lr_atime[0];
+	time_t mtime = (time_t)lr->lr_mtime[0];
+
+	(void) printf("\t\t\tfoid %llu, mask 0x%llx\n",
+	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask);
+
+	if (lr->lr_mask & AT_MODE) {
+		/* %llo is an unsigned conversion; pass an unsigned value. */
+		(void) printf("\t\t\tAT_MODE %llo\n",
+		    (u_longlong_t)lr->lr_mode);
+	}
+
+	if (lr->lr_mask & AT_UID) {
+		(void) printf("\t\t\tAT_UID %llu\n",
+		    (u_longlong_t)lr->lr_uid);
+	}
+
+	if (lr->lr_mask & AT_GID) {
+		(void) printf("\t\t\tAT_GID %llu\n",
+		    (u_longlong_t)lr->lr_gid);
+	}
+
+	if (lr->lr_mask & AT_SIZE) {
+		(void) printf("\t\t\tAT_SIZE %llu\n",
+		    (u_longlong_t)lr->lr_size);
+	}
+
+	/* ctime() output already ends in a newline. */
+	if (lr->lr_mask & AT_ATIME) {
+		(void) printf("\t\t\tAT_ATIME %llu.%09llu %s",
+		    (u_longlong_t)lr->lr_atime[0],
+		    (u_longlong_t)lr->lr_atime[1],
+		    ctime(&atime));
+	}
+
+	if (lr->lr_mask & AT_MTIME) {
+		(void) printf("\t\t\tAT_MTIME %llu.%09llu %s",
+		    (u_longlong_t)lr->lr_mtime[0],
+		    (u_longlong_t)lr->lr_mtime[1],
+		    ctime(&mtime));
+	}
+}
+
+/*
+ * Print an ACL log record: its foid and aclcnt fields.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_acl(zilog_t *zilog, int txtype, lr_acl_t *lr)
+{
+	u_longlong_t foid = (u_longlong_t)lr->lr_foid;
+	u_longlong_t aclcnt = (u_longlong_t)lr->lr_aclcnt;
+
+	(void) printf("\t\t\tfoid %llu, aclcnt %llu\n", foid, aclcnt);
+}
+
+/*
+ * Per-record-type printer.  Declared without a prototype because each
+ * printer takes a different record type as its third argument; callers
+ * invoke it as zri_print(zilog, txtype, lr).
+ */
+typedef void (*zil_prt_rec_func_t)();
+/*
+ * One entry per log record type: the routine that prints a record of
+ * that type, the label used in output, and a running count of records
+ * seen.
+ */
+typedef struct zil_rec_info {
+ zil_prt_rec_func_t zri_print;
+ char *zri_name;
+ uint64_t zri_count;
+} zil_rec_info_t;
+
+/*
+ * Table indexed by the record's lrc_txtype.  Slot 0 has no printer; its
+ * counter accumulates the total across all record types.
+ * NOTE(review): the row order is assumed to match the numeric TX_*
+ * values -- confirm against the TX_* definitions before reordering.
+ */
+static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
+ { NULL, "Total " },
+ { zil_prt_rec_create, "TX_CREATE " },
+ { zil_prt_rec_create, "TX_MKDIR " },
+ { zil_prt_rec_create, "TX_MKXATTR " },
+ { zil_prt_rec_create, "TX_SYMLINK " },
+ { zil_prt_rec_remove, "TX_REMOVE " },
+ { zil_prt_rec_remove, "TX_RMDIR " },
+ { zil_prt_rec_link, "TX_LINK " },
+ { zil_prt_rec_rename, "TX_RENAME " },
+ { zil_prt_rec_write, "TX_WRITE " },
+ { zil_prt_rec_truncate, "TX_TRUNCATE" },
+ { zil_prt_rec_setattr, "TX_SETATTR " },
+ { zil_prt_rec_acl, "TX_ACL " },
+};
+
+/*
+ * Record callback handed to zil_parse().  Prints a one-line summary of
+ * the record, calls the type-specific printer when the verbosity (the
+ * larger of the 'd' and 'i' options) is at least 3, and bumps both the
+ * per-type counter and the total kept in slot 0.
+ */
+/* ARGSUSED */
+static void
+print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg)
+{
+	int verbose = MAX(dump_opt['d'], dump_opt['i']);
+	int txtype = lr->lrc_txtype;
+	zil_rec_info_t *info;
+
+	ASSERT(txtype != 0 && (uint_t)txtype < TX_MAX_TYPE);
+	ASSERT(lr->lrc_txg);
+
+	info = &zil_rec_info[txtype];
+
+	(void) printf("\t\t%s len %6llu, txg %llu, seq %llu\n",
+	    info->zri_name,
+	    (u_longlong_t)lr->lrc_reclen,
+	    (u_longlong_t)lr->lrc_txg,
+	    (u_longlong_t)lr->lrc_seq);
+
+	/* Slot 0 has no printer, hence the txtype test. */
+	if (txtype != 0 && verbose >= 3)
+		info->zri_print(zilog, txtype, lr);
+
+	info->zri_count++;
+	zil_rec_info[0].zri_count++;
+}
+
+/*
+ * Block callback handed to zil_parse().  Silent unless the verbosity
+ * (the larger of the 'd' and 'i' options) exceeds 3.  Prints the
+ * block's sequence number and claim status; from verbosity 5 on the
+ * formatted block pointer is appended after a ", " separator.
+ */
+/* ARGSUSED */
+static void
+print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+	char bpstr[BP_SPRINTF_LEN];
+	int verbose = MAX(dump_opt['d'], dump_opt['i']);
+	char *status;
+
+	if (verbose <= 3)
+		return;
+
+	bpstr[0] = '\0';
+	if (verbose >= 5) {
+		size_t used;
+
+		(void) strcpy(bpstr, ", ");
+		used = strlen(bpstr);
+		sprintf_blkptr(bpstr + used, BP_SPRINTF_LEN - used, bp);
+	}
+
+	if (claim_txg != 0) {
+		status = "already claimed";
+	} else {
+		status = (bp->blk_birth >= spa_first_txg(zilog->zl_spa)) ?
+		    "will claim" : "won't claim";
+	}
+
+	(void) printf("\tBlock seqno %llu, %s%s\n",
+	    (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], status, bpstr);
+}
+
+/*
+ * Print the record counters kept in zil_rec_info[].  Prints nothing
+ * beyond an optional blank line (verbosity above 3) when no records
+ * were counted.  Types with a zero count are listed only when the
+ * verbosity is at least 3.  The count column is as wide as the total in
+ * slot 0 so the numbers line up.
+ */
+static void
+print_log_stats(int verbose)
+{
+	uint64_t total = zil_rec_info[0].zri_count;
+	uint64_t rest;
+	int i, w;
+
+	if (verbose > 3)
+		(void) printf("\n");
+
+	if (total == 0)
+		return;
+
+	/*
+	 * Column width is the number of decimal digits in the total.
+	 * Count digits by dividing the 64-bit total rather than by growing
+	 * an int power of ten, which would overflow for large totals.
+	 */
+	for (w = 1, rest = total / 10; rest != 0; rest /= 10)
+		w++;
+
+	for (i = 0; i < TX_MAX_TYPE; i++) {
+		if (zil_rec_info[i].zri_count || verbose >= 3) {
+			(void) printf("\t\t%s %*llu\n",
+			    zil_rec_info[i].zri_name, w,
+			    (u_longlong_t)zil_rec_info[i].zri_count);
+		}
+	}
+	(void) printf("\n");
+}
+
+/*
+ * Print the intent log header, then hand print_log_block() and
+ * print_log_record() to zil_parse() and finish with the per-type record
+ * counts.  Nothing is printed when the header's log block pointer has a
+ * birth txg of 0 or when the verbosity (the larger of the 'd' and 'i'
+ * options) is below 2.
+ */
+/* ARGSUSED */
+void
+dump_intent_log(zilog_t *zilog)
+{
+	const zil_header_t *zh = zilog->zl_header;
+	int verbose = MAX(dump_opt['d'], dump_opt['i']);
+	zil_rec_info_t *info;
+
+	if (zh->zh_log.blk_birth == 0 || verbose < 2)
+		return;
+
+	(void) printf("\n    ZIL header: claim_txg %llu, seq %llu\n",
+	    (u_longlong_t)zh->zh_claim_txg, (u_longlong_t)zh->zh_replay_seq);
+
+	if (verbose >= 4)
+		print_log_bp(&zh->zh_log, "\n\tfirst block: ");
+
+	/* Start every counter from zero for this log. */
+	for (info = zil_rec_info; info < &zil_rec_info[TX_MAX_TYPE]; info++)
+		info->zri_count = 0;
+
+	/* verbose is known to be at least 2 past the guard above. */
+	(void) printf("\n");
+	(void) zil_parse(zilog, print_log_block, print_log_record, NULL,
+	    zh->zh_claim_txg);
+	print_log_stats(verbose);
+}
diff --git a/contrib/opensolaris/cmd/zfs/zfs.8 b/contrib/opensolaris/cmd/zfs/zfs.8
new file mode 100644
index 0000000..f737ba7
--- /dev/null
+++ b/contrib/opensolaris/cmd/zfs/zfs.8
@@ -0,0 +1,1815 @@
+'\" te
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\" Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
+.TH zfs 1M "16 Mar 2007" "SunOS 5.11" "System Administration Commands"
+.SH NAME
+zfs \- configures ZFS file systems
+.SH SYNOPSIS
+.LP
+.nf
+\fBzfs\fR [\fB-?\fR]
+.fi
+.LP
+.nf
+\fBzfs\fR \fBcreate\fR [[\fB-o\fR property=\fIvalue\fR]]... \fIfilesystem\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBcreate\fR [\fB-s\fR] [\fB-b\fR \fIblocksize\fR] [[\fB-o\fR property=\fIvalue\fR]]... \fB-V\fR \fIsize\fR \fIvolume\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBdestroy\fR [\fB-rRf\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBclone\fR \fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBpromote\fR \fIfilesystem\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBrename\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+ [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR]
+.fi
+.LP
+.nf
+\fBzfs\fR \fBsnapshot\fR [\fB-r\fR] \fIfilesystem@name\fR|\fIvolume@name\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBrollback\fR [\fB-rRf\fR] \fIsnapshot\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBlist\fR [\fB-rH\fR] [\fB-o\fR \fIprop\fR[,\fIprop\fR]...] [\fB-t\fR \fItype\fR[,\fItype\fR]...]
+ [\fB-s\fR \fIprop\fR [\fB-s\fR \fIprop\fR]...] [\fB-S\fR \fIprop\fR [\fB-S\fR \fIprop\fR]...]
+ [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR|\fI/pathname\fR|.\fI/pathname\fR ...]
+.fi
+.LP
+.nf
+\fBzfs\fR \fBset\fR \fIproperty\fR=\fIvalue\fR \fIfilesystem\fR|\fIvolume\fR ...
+.fi
+.LP
+.nf
+\fBzfs\fR \fBget\fR [\fB-rHp\fR] [\fB-o\fR \fIfield\fR[,\fIfield\fR]...]
+ [\fB-s\fR \fIsource\fR[,\fIsource\fR]...] \fIall\fR | \fIproperty\fR[,\fIproperty\fR]...
+ \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...
+.fi
+.LP
+.nf
+\fBzfs\fR \fBinherit\fR [\fB-r\fR] \fIproperty\fR \fIfilesystem\fR|\fIvolume\fR ...
+.fi
+.LP
+.nf
+\fBzfs\fR \fBmount\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBmount\fR [\fB-o \fIoptions\fR\fR] [\fB-O\fR] \fB-a\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBmount\fR [\fB-o \fIoptions\fR\fR] [\fB-O\fR] \fIfilesystem\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBunmount\fR [\fB-f\fR] \fB-a\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBunmount\fR [\fB-f\fR] \fB\fIfilesystem\fR|\fImountpoint\fR\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBshare\fR \fB-a\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBshare\fR \fIfilesystem\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBunshare\fR [\fB-f\fR] \fB-a\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBunshare\fR [\fB-f\fR] \fB\fIfilesystem\fR|\fImountpoint\fR\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBsend\fR [\fB-i\fR \fIsnapshot1\fR] \fB\fIsnapshot2\fR\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBreceive\fR [\fB-vnF\fR ] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBreceive\fR [\fB-vnF\fR ] \fB-d\fR \fB\fIfilesystem\fR\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBjail\fR \fBjailid\fR \fB\fIfilesystem\fR\fR
+.fi
+.LP
+.nf
+\fBzfs\fR \fBunjail\fR \fBjailid\fR \fB\fIfilesystem\fR\fR
+.fi
+
+.SH DESCRIPTION
+
+.LP
+The \fBzfs\fR command configures \fBZFS\fR datasets within a \fBZFS\fR storage pool, as described in \fBzpool\fR(1M). A
+dataset is identified by a unique path within the \fBZFS\fR namespace. For example:
+.sp
+.in +2
+.nf
+pool/{filesystem,volume,snapshot}
+.fi
+.in -2
+.sp
+
+.LP
+where the maximum length of a dataset name is \fBMAXNAMELEN\fR (256 bytes).
+.LP
+A dataset can be one of the following:
+.sp
+.ne 2
+.mk
+.na
+\fB\fIfile system\fR\fR
+.ad
+.RS 15n
+.rt
+A standard \fBPOSIX\fR file system. \fBZFS\fR file systems can be mounted within the standard file system namespace and behave like any other file system.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fIvolume\fR\fR
+.ad
+.RS 15n
+.rt
+A logical volume exported as a raw or block device. This type of dataset should only be used under special circumstances. File systems are typically used in most environments. Volumes cannot be used in a non-global zone.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fIsnapshot\fR\fR
+.ad
+.RS 15n
+.rt
+A read-only version of a file system or volume at a given point in time. It is specified as \fIfilesystem@name\fR or \fIvolume@name\fR.
+.RE
+
+.SS ZFS File System Hierarchy
+
+.LP
+A \fBZFS\fR storage pool is a logical collection of devices that provide space for datasets. A storage pool is also the root of the \fBZFS\fR file system hierarchy.
+.LP
+The root of the pool can be accessed as a file system, such as mounting and unmounting, taking snapshots, and setting properties. The physical storage characteristics, however, are managed by the \fBzpool\fR(1M) command.
+.LP
+See \fBzpool\fR(1M) for more information on creating and administering pools.
+.SS Snapshots
+
+.LP
+A snapshot is a read-only copy of a file system or volume. Snapshots can be created extremely quickly, and initially consume no additional space within the pool. As data within the active dataset changes, the snapshot consumes more data than would otherwise be shared with the active dataset.
+.LP
+Snapshots can have arbitrary names. Snapshots of volumes can be cloned or rolled back, but cannot be accessed independently.
+.LP
+File system snapshots can be accessed under the ".zfs/snapshot" directory in the root of the file system. Snapshots are automatically mounted on demand and may be unmounted at regular intervals. The visibility of the ".zfs" directory can be controlled by the "snapdir"
+property.
+.SS Clones
+
+.LP
+A clone is a writable volume or file system whose initial contents are the same as another dataset. As with snapshots, creating a clone is nearly instantaneous, and initially consumes no additional space.
+.LP
+Clones can only be created from a snapshot. When a snapshot is cloned, it creates an implicit dependency between the parent and child. Even though the clone is created somewhere else in the dataset hierarchy, the original snapshot cannot be destroyed as long as a clone exists. The "origin"
+property exposes this dependency, and the \fBdestroy\fR command lists any such dependencies, if they exist.
+.LP
+The clone parent-child dependency relationship can be reversed by using the "\fBpromote\fR" subcommand. This causes the "origin" file system to become a clone of the specified file system, which makes it possible to destroy the file system that the clone
+was created from.
+.SS Mount Points
+
+.LP
+Creating a \fBZFS\fR file system is a simple operation, so the number of file systems per system will likely be numerous. To cope with this, \fBZFS\fR automatically manages mounting and unmounting file systems without the need to edit the \fB/etc/vfstab\fR file.
+All automatically managed file systems are mounted by \fBZFS\fR at boot time.
+.LP
+By default, file systems are mounted under \fB/\fIpath\fR\fR, where \fIpath\fR is the name of the file system in the \fBZFS\fR namespace. Directories are created and destroyed as needed.
+.LP
+A file system can also have a mount point set in the "mountpoint" property. This directory is created as needed, and \fBZFS\fR automatically mounts the file system when the "\fBzfs mount -a\fR" command is invoked (without editing \fB/etc/vfstab\fR). The mountpoint property can be inherited, so if \fBpool/home\fR has a mount point of \fB/export/stuff\fR, then \fBpool/home/user\fR automatically inherits a mount point of \fB/export/stuff/user\fR.
+.LP
+A file system mountpoint property of "none" prevents the file system from being mounted.
+.LP
+If needed, \fBZFS\fR file systems can also be managed with traditional tools (\fBmount\fR, \fBumount\fR, \fB/etc/vfstab\fR). If a file system's mount point is set to "legacy", \fBZFS\fR makes no attempt to manage
+the file system, and the administrator is responsible for mounting and unmounting the file system.
+.SS Zones
+
+.LP
+A \fBZFS\fR file system can be added to a non-global zone by using zonecfg's "\fBadd fs\fR" subcommand. A \fBZFS\fR file system that is added to a non-global zone must have its mountpoint property set to legacy.
+.LP
+The physical properties of an added file system are controlled by the global administrator. However, the zone administrator can create, modify, or destroy files within the added file system, depending on how the file system is mounted.
+.LP
+A dataset can also be delegated to a non-global zone by using zonecfg's "\fBadd dataset\fR" subcommand. You cannot delegate a dataset to one zone and the children of the same dataset to another zone. The zone administrator can change properties of the dataset or
+any of its children. However, the "quota" property is controlled by the global administrator.
+.LP
+A \fBZFS\fR volume can be added as a device to a non-global zone by using zonecfg's "\fBadd device\fR" subcommand. However, its physical properties can only be modified by the global administrator.
+.LP
+For more information about \fBzonecfg\fR syntax, see \fBzonecfg\fR(1M).
+.LP
+After a dataset is delegated to a non-global zone, the "zoned" property is automatically set. A zoned file system cannot be mounted in the global zone, since the zone administrator might have to set the mount point to an unacceptable value.
+.LP
+The global administrator can forcibly clear the "zoned" property, though this should be done with extreme care. The global administrator should verify that all the mount points are acceptable before clearing the property.
+.SS Native Properties
+
+.LP
+Properties are divided into two types, native properties and user defined properties. Native properties either export internal statistics or control \fBZFS\fR behavior. In addition, native properties are either editable or read-only. User properties have no effect on \fBZFS\fR behavior,
+but you can use them to annotate datasets in a way that is meaningful in your environment. For more information about user properties, see the "User Properties" section.
+.LP
+Every dataset has a set of properties that export statistics about the dataset as well as control various behavior. Properties are inherited from the parent unless overridden by the child. Snapshot properties can not be edited; they always inherit their inheritable properties. Properties
+that are not applicable to snapshots are not displayed.
+.LP
+The values of numeric properties can be specified using the following human-readable suffixes (for example, "k", "KB", "M", "Gb", etc, up to Z for zettabyte). The following are all valid (and equal) specifications:
+.sp
+.in +2
+.nf
+"1536M", "1.5g", "1.50GB".
+.fi
+.in -2
+.sp
+
+.LP
+The values of non-numeric properties are case sensitive and must be lowercase, except for "mountpoint" and "sharenfs".
+.LP
+The first set of properties consists of read-only statistics about the dataset. These properties cannot be set, nor are they inherited. Native properties apply to all dataset types unless otherwise noted.
+.sp
+.ne 2
+.mk
+.na
+\fBtype\fR
+.ad
+.RS 17n
+.rt
+The type of dataset: "filesystem", "volume", "snapshot", or "clone".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBcreation\fR
+.ad
+.RS 17n
+.rt
+The time this dataset was created.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBused\fR
+.ad
+.RS 17n
+.rt
+The amount of space consumed by this dataset and all its descendants. This is the value that is checked against this dataset's quota and reservation. The space used does not include this dataset's reservation, but does take into account the reservations of any descendant datasets.
+The amount of space that a dataset consumes from its parent, as well as the amount of space that will be freed if this dataset is recursively destroyed, is the greater of its space used and its reservation.
+.sp
+When snapshots (see the "Snapshots" section) are created, their space is initially shared between the snapshot and the file system, and possibly with previous snapshots. As the file system changes, space that was previously shared becomes unique to the snapshot, and counted in
+the snapshot's space used. Additionally, deleting snapshots can increase the amount of space unique to (and used by) other snapshots.
+.sp
+The amount of space used, available, or referenced does not take into account pending changes. Pending changes are generally accounted for within a few seconds. Committing a change to a disk using \fBfsync\fR(3c) or \fBO_SYNC\fR does not necessarily guarantee that the space usage information is updated immediately.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBavailable\fR
+.ad
+.RS 17n
+.rt
+The amount of space available to the dataset and all its children, assuming that there is no other activity in the pool. Because space is shared within a pool, availability can be limited by any number of factors, including physical pool size, quotas, reservations, or other datasets
+within the pool.
+.sp
+This property can also be referred to by its shortened column name, "avail".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBreferenced\fR
+.ad
+.RS 17n
+.rt
+The amount of data that is accessible by this dataset, which may or may not be shared with other datasets in the pool. When a snapshot or clone is created, it initially references the same amount of space as the file system or snapshot it was created from, since its contents are
+identical.
+.sp
+This property can also be referred to by its shortened column name, "refer".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBcompressratio\fR
+.ad
+.RS 17n
+.rt
+The compression ratio achieved for this dataset, expressed as a multiplier. Compression can be turned on by running "zfs set compression=on \fIdataset\fR". The default value is "off".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBmounted\fR
+.ad
+.RS 17n
+.rt
+For file systems, indicates whether the file system is currently mounted. This property can be either "yes" or "no".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBorigin\fR
+.ad
+.RS 17n
+.rt
+For cloned file systems or volumes, the snapshot from which the clone was created. The origin cannot be destroyed (even with the \fB-r\fR or \fB-f\fR options) so long as a clone exists.
+.RE
+
+.LP
+The following two properties can be set to control the way space is allocated between datasets. These properties are not inherited, but do affect their descendants.
+.sp
+.ne 2
+.mk
+.na
+\fBquota=\fIsize\fR | \fInone\fR\fR
+.ad
+.sp .6
+.RS 4n
+Limits the amount of space a dataset and its descendants can consume. This property enforces a hard limit on the amount of space used. This includes all space consumed by descendants, including file systems and snapshots. Setting a quota on a descendant of a dataset that already
+has a quota does not override the ancestor's quota, but rather imposes an additional limit.
+.sp
+Quotas cannot be set on volumes, as the "volsize" property acts as an implicit quota.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBreservation=\fIsize\fR | \fInone\fR\fR
+.ad
+.sp .6
+.RS 4n
+The minimum amount of space guaranteed to a dataset and its descendants. When the amount of space used is below this value, the dataset is treated as if it were taking up the amount of space specified by its reservation. Reservations are accounted for in the parent datasets' space
+used, and count against the parent datasets' quotas and reservations.
+.sp
+This property can also be referred to by its shortened column name, "reserv".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBvolsize=\fIsize\fR\fR
+.ad
+.sp .6
+.RS 4n
+For volumes, specifies the logical size of the volume. By default, creating a volume establishes a reservation of equal size. Any changes to \fBvolsize\fR are reflected in an equivalent change to the reservation. The \fBvolsize\fR can only be set to a
+multiple of \fBvolblocksize\fR, and cannot be zero.
+.sp
+The reservation is kept equal to the volume's logical size to prevent unexpected behavior for consumers. Without the reservation, the volume could run out of space, resulting in undefined behavior or data corruption, depending on how the volume is used. These effects can also occur when
+the volume size is changed while it is in use (particularly when shrinking the size). Extreme care should be used when adjusting the volume size.
+.sp
+Though not recommended, a "sparse volume" (also known as "thin provisioning") can be created by specifying the \fB-s\fR option to the "\fBzfs create -V\fR" command, or by changing the reservation after the volume has been created.
+A "sparse volume" is a volume where the reservation is less than the volume size. Consequently, writes to a sparse volume can fail with \fBENOSPC\fR when the pool is low on space. For a sparse volume, changes to \fBvolsize\fR are not reflected in the reservation.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBvolblocksize=\fIblocksize\fR\fR
+.ad
+.sp .6
+.RS 4n
+For volumes, specifies the block size of the volume. The \fBblocksize\fR cannot be changed once the volume has been written, so it should be set at volume creation time. The default \fBblocksize\fR for volumes is 8 Kbytes. Any power of 2 from 512 bytes
+to 128 Kbytes is valid.
+.sp
+This property can also be referred to by its shortened column name, "volblock".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBrecordsize=\fIsize\fR\fR
+.ad
+.sp .6
+.RS 4n
+Specifies a suggested block size for files in the file system. This property is designed solely for use with database workloads that access files in fixed-size records. \fBZFS\fR automatically tunes block sizes according to internal algorithms optimized for typical
+access patterns.
+.sp
+For databases that create very large files but access them in small random chunks, these algorithms may be suboptimal. Specifying a "recordsize" greater than or equal to the record size of the database can result in significant performance gains. Use of this property for general
+purpose file systems is strongly discouraged, and may adversely affect performance.
+.sp
+The size specified must be a power of two greater than or equal to 512 and less than or equal to 128 Kbytes.
+.sp
+Changing the file system's \fBrecordsize\fR only affects files created afterward; existing files are unaffected.
+.sp
+This property can also be referred to by its shortened column name, "recsize".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBmountpoint=\fIpath\fR | \fInone\fR | \fIlegacy\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls the mount point used for this file system. See the "Mount Points" section for more information on how this property is used.
+.sp
+When the mountpoint property is changed for a file system, the file system and any children that inherit the mount point are unmounted. If the new value is "legacy", then they remain unmounted. Otherwise, they are automatically remounted in the new location if the property was
+previously "legacy" or "none", or if they were mounted before the property was changed. In addition, any shared file systems are unshared and shared in the new location.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBsharenfs=\fIon\fR | \fIoff\fR | \fIopts\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether the file system is shared via \fBNFS\fR, and what options are used. A file system with a sharenfs property of "off" is managed through traditional tools such as \fBshare\fR(1M), \fBunshare\fR(1M), and \fBdfstab\fR(4). Otherwise, the file system is automatically shared and unshared with the "\fBzfs share\fR" and "\fBzfs unshare\fR" commands. If the property is set to "on", the \fBshare\fR(1M) command is invoked with no options. Otherwise, the \fBshare\fR(1M) command is invoked with options equivalent to the contents of this property.
+.sp
+When the "sharenfs" property is changed for a dataset, the dataset and any children inheriting the property are re-shared with the new options, only if the property was previously "off", or if they were shared before the property was changed. If the new property is "off",
+the file systems are unshared.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBshareiscsi=\fIon\fR | \fIoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Like the "sharenfs" property, "shareiscsi" indicates whether a \fBZFS\fR volume is exported as an \fBiSCSI\fR target. The acceptable values for this property are "on", "off", and "type=disk".
+The default value is "off". In the future, other target types might be supported. For example, "tape".
+.sp
+You might want to set "shareiscsi=on" for a file system so that all \fBZFS\fR volumes within the file system are shared by default. Setting this property on a file system has no direct effect, however.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBchecksum=\fIon\fR | \fIoff\fR | \fIfletcher2\fR | \fIfletcher4\fR | \fIsha256\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls the checksum used to verify data integrity. The default value is "on", which automatically selects an appropriate algorithm (currently, \fIfletcher2\fR, but this may change in future releases). The value "off" disables integrity
+checking on user data. Disabling checksums is NOT a recommended practice.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBcompression=\fIon\fR | \fIoff\fR | \fIlzjb\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls the compression algorithm used for this dataset. There is currently only one algorithm, "\fIlzjb\fR", though this may change in future releases. The default value is "off".
+.sp
+This property can also be referred to by its shortened column name "compress".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBatime=\fIon\fR | \fIoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether the access time for files is updated when they are read. Turning this property off avoids producing write traffic when reading files and can result in significant performance gains, though it might confuse mailers and other similar utilities. The default value
+is "on".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBdevices=\fIon\fR | \fIoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether device nodes can be opened on this file system. The default value is "on".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBexec=\fIon\fR | \fIoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether processes can be executed from within this file system. The default value is "on".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBsetuid=\fIon\fR | \fIoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether the set-\fBUID\fR bit is respected for the file system. The default value is "on".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBreadonly=\fIon\fR | \fIoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether this dataset can be modified. The default value is "off".
+.sp
+This property can also be referred to by its shortened column name, "rdonly".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBzoned=\fIon\fR | \fIoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether the dataset is managed from a non-global zone. See the "Zones" section for more information. The default value is "off".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBsnapdir=\fIhidden\fR | \fIvisible\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether the ".zfs" directory is hidden or visible in the root of the file system as discussed in the "Snapshots" section. The default value is "hidden".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBaclmode=\fBdiscard\fR | \fBgroupmask\fR | \fBpassthrough\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls how an \fBACL\fR is modified during \fBchmod\fR(2). A file system with an "aclmode" property of "\fBdiscard\fR"
+deletes all \fBACL\fR entries that do not represent the mode of the file. An "aclmode" property of "\fBgroupmask\fR" (the default) reduces user or group permissions. The permissions are reduced, such that they are no greater than the group permission
+bits, unless it is a user entry that has the same \fBUID\fR as the owner of the file or directory. In this case, the \fBACL\fR permissions are reduced so that they are no greater than owner permission bits. A file system with an "aclmode" property of "\fBpassthrough\fR" indicates that no changes will be made to the \fBACL\fR other than generating the necessary \fBACL\fR entries to represent the new mode of the file or directory.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBaclinherit=\fBdiscard\fR | \fBnoallow\fR | \fBsecure\fR | \fBpassthrough\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls how \fBACL\fR entries are inherited when files and directories are created. A file system with an "aclinherit" property of "\fBdiscard\fR" does not inherit any \fBACL\fR entries. A file system with an "aclinherit"
+property value of "\fBnoallow\fR" only inherits inheritable \fBACL\fR entries that specify "deny" permissions. The property value "\fBsecure\fR" (the default) removes the "\fBwrite_acl\fR" and "\fBwrite_owner\fR" permissions when the \fBACL\fR entry is inherited. A file system with an "aclinherit" property value of "\fBpassthrough\fR" inherits all inheritable \fBACL\fR entries without any modifications made to the \fBACL\fR entries when they are inherited.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBcanmount=\fBon\fR | \fBoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+If this property is set to "\fBoff\fR", the file system cannot be mounted, and is ignored by "\fBzfs mount -a\fR". This is similar to setting the "mountpoint" property to "\fBnone\fR", except
+that the dataset still has a normal "mountpoint" property which can be inherited. This allows datasets to be used solely as a mechanism to inherit properties. One use case is to have two logically separate datasets have the same mountpoint, so that the children of both datasets appear
+in the same directory, but may have different inherited characteristics. The default value is "\fBon\fR".
+.sp
+This property is not inherited.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBxattr=\fBon\fR | \fBoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether extended attributes are enabled for this file system. The default value is "\fBon\fR".
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBcopies=\fB1\fR | \fB2\fR | \fB3\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls the number of copies of data stored for this dataset. These copies are in addition to any redundancy provided by the pool (for example, mirroring or raid-z). The copies are stored on different disks if possible. The space used by multiple copies is charged to the associated
+file and dataset, changing the "used" property and counting against quotas and reservations.
+.sp
+Changing this property only affects newly-written data. Therefore, it is recommended that this property be set at file system creation time, using the "\fB-o\fR copies=" option.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBjailed=\fIon\fR | \fIoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether the dataset is managed from within a jail. The default value is "off".
+.RE
+
+.SS iscsioptions
+
+.LP
+This read-only property, which is hidden, is used by the \fBiSCSI\fR target daemon to store persistent information, such as the \fBIQN\fR. It cannot be viewed or modified using the \fBzfs\fR command. The contents are not intended for external consumers.
+.SS Temporary Mount Point Properties
+
+.LP
+When a file system is mounted, either through \fBmount\fR(1M) for legacy mounts or the "\fBzfs mount\fR" command for normal file systems,
+its mount options are set according to its properties. The correlation between properties and mount options is as follows:
+.sp
+.in +2
+.nf
+ PROPERTY MOUNT OPTION
+ devices devices/nodevices
+ exec exec/noexec
+ readonly ro/rw
+ setuid setuid/nosetuid
+ xattr xattr/noxattr
+.fi
+.in -2
+.sp
+
+.LP
+In addition, these options can be set on a per-mount basis using the \fB-o\fR option, without affecting the property that is stored on disk. The values specified on the command line override the values stored in the dataset. The \fBnosuid\fR option is an alias for "nodevices,nosetuid".
+These properties are reported as "temporary" by the "\fBzfs get\fR" command. If the properties are changed while the dataset is mounted, the new setting overrides any temporary settings.
+.SS User Properties
+
+.LP
+In addition to the standard native properties, \fBZFS\fR supports arbitrary user properties. User properties have no effect on \fBZFS\fR behavior, but applications or administrators can use them to annotate datasets.
+.LP
+User property names must contain a colon (":") character, to distinguish them from native properties. They might contain lowercase letters, numbers, and the following punctuation characters: colon (":"), dash ("-"), period ("."), and underscore
+("_"). The expected convention is that the property name is divided into two portions such as "\fImodule\fR:\fIproperty\fR", but this namespace is not enforced by \fBZFS\fR. User property names can be at most 256 characters,
+and cannot begin with a dash ("-").
+.LP
+When making programmatic use of user properties, it is strongly suggested to use a reversed \fBDNS\fR domain name for the \fImodule\fR component of property names to reduce the chance that two independently-developed packages use the same property name for
+different purposes. Property names beginning with "com.sun." are reserved for use by Sun Microsystems.
+.LP
+The values of user properties are arbitrary strings, are always inherited, and are never validated. All of the commands that operate on properties ("zfs list", "zfs get", "zfs set", etc.) can be used to manipulate both native properties and user properties.
+Use the "\fBzfs inherit\fR" command to clear a user property. If the property is not defined in any parent dataset, it is removed entirely. Property values are limited to 1024 characters.
+.SS Volumes as Swap or Dump Devices
+
+.LP
+To set up a swap area, create a \fBZFS\fR volume of a specific size and then enable swap on that device. For more information, see the EXAMPLES section.
+.LP
+Do not swap to a file on a \fBZFS\fR file system. A \fBZFS\fR swap file configuration is not supported.
+.LP
+Using a \fBZFS\fR volume as a dump device is not supported.
+.SH SUBCOMMANDS
+
+.LP
+All subcommands that modify state are logged persistently to the pool in their original form.
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs ?\fR\fR
+.ad
+.sp .6
+.RS 4n
+Displays a help message.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs create\fR [[\fB-o\fR property=value]...] \fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates a new \fBZFS\fR file system. The file system is automatically mounted according to the "mountpoint" property inherited from the parent.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR property=value\fR
+.ad
+.RS 21n
+.rt
+Sets the specified property as if "\fBzfs set property=value\fR" was invoked at the same time the dataset was created. Any editable \fBZFS\fR property can also be set at creation time. Multiple \fB-o\fR options can be specified. An
+error results if the same property is specified in multiple \fB-o\fR options.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs create\fR [\fB-s\fR] [\fB-b\fR \fIblocksize\fR] [[\fB-o\fR property=value]...] \fB-V\fR \fIsize\fR \fIvolume\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates a volume of the given size. The volume is exported as a block device in \fB/dev/zvol/{dsk,rdsk}/\fIpath\fR\fR, where \fIpath\fR is the name of the volume in the \fBZFS\fR namespace. The size represents
+the logical size as exported by the device. By default, a reservation of equal size is created.
+.sp
+\fIsize\fR is automatically rounded up to the nearest 128 Kbytes to ensure that the volume has an integral number of blocks regardless of \fIblocksize\fR.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-s\fR\fR
+.ad
+.RS 21n
+.rt
+Creates a sparse volume with no reservation. See "volsize" in the Native Properties section for more information about sparse volumes.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR property=value\fR
+.ad
+.RS 21n
+.rt
+Sets the specified property as if "\fBzfs set property=value\fR" was invoked at the same time the dataset was created. Any editable \fBZFS\fR property can also be set at creation time. Multiple \fB-o\fR options can be specified. An
+error results if the same property is specified in multiple \fB-o\fR options.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-b\fR \fIblocksize\fR\fR
+.ad
+.RS 21n
+.rt
+Equivalent to "\fB\fR\fB-o\fR \fBvolblocksize=\fIblocksize\fR\fR". If this option is specified in conjunction with "\fB\fR\fB-o\fR \fBvolblocksize\fR", the resulting
+behavior is undefined.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs destroy\fR [\fB-rRf\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
+.ad
+.sp .6
+.RS 4n
+Destroys the given dataset. By default, the command unshares any file systems that are currently shared, unmounts any file systems that are currently mounted, and refuses to destroy a dataset that has active dependents (children, snapshots, clones).
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.RS 6n
+.rt
+Recursively destroy all children. If a snapshot is specified, destroy all snapshots with this name in descendant file systems.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-R\fR\fR
+.ad
+.RS 6n
+.rt
+Recursively destroy all dependents, including cloned file systems outside the target hierarchy. If a snapshot is specified, destroy all snapshots with this name in descendant file systems.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-f\fR\fR
+.ad
+.RS 6n
+.rt
+Force an unmount of any file systems using the "\fBunmount -f\fR" command. This option has no effect on non-file systems or unmounted file systems.
+.RE
+
+Extreme care should be taken when applying either the \fB-r\fR or the \fB-R\fR options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs clone\fR \fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates a clone of the given snapshot. See the "Clones" section for details. The target dataset can be located anywhere in the \fBZFS\fR hierarchy, and is created as the same type as the original.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs promote\fR \fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Promotes a clone file system to no longer be dependent on its "origin" snapshot. This makes it possible to destroy the file system that the clone was created from. The clone parent-child dependency relationship is reversed, so that the "origin" file system
+becomes a clone of the specified file system.
+.sp
+The snapshot that was cloned, and any snapshots previous to this snapshot, are now owned by the promoted clone. The space they use moves from the "origin" file system to the promoted clone, so enough space must be available to accommodate these snapshots. No new space is consumed
+by this operation, but the space accounting is adjusted. The promoted clone must not have any conflicting snapshot names of its own. The "\fBrename\fR" subcommand can be used to rename any conflicting snapshots.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs rename\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
+.ad
+.sp .6
+.RS 4n
+Renames the given dataset. The new target can be located anywhere in the \fBZFS\fR hierarchy, with the exception of snapshots. Snapshots can only be renamed within the parent file system or volume. When renaming a snapshot, the parent file system of the snapshot does
+not need to be specified as part of the second argument. Renamed file systems can inherit new mount points, in which case they are unmounted and remounted at the new mount point.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs snapshot\fR [\fB-r\fR] \fIfilesystem@name\fR|\fIvolume@name\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates a snapshot with the given name. See the "Snapshots" section for details.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.RS 6n
+.rt
+Recursively create snapshots of all descendant datasets. Snapshots are taken atomically, so that all recursive snapshots correspond to the same moment in time.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs rollback\fR [\fB-rRf\fR] \fIsnapshot\fR\fR
+.ad
+.sp .6
+.RS 4n
+Roll back the given dataset to a previous snapshot. When a dataset is rolled back, all data that has changed since the snapshot is discarded, and the dataset reverts to the state at the time of the snapshot. By default, the command refuses to roll back to a snapshot other than
+the most recent one. In order to do so, all intermediate snapshots must be destroyed by specifying the \fB-r\fR option. The file system is unmounted and remounted, if necessary.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.RS 6n
+.rt
+Recursively destroy any snapshots more recent than the one specified.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-R\fR\fR
+.ad
+.RS 6n
+.rt
+Recursively destroy any more recent snapshots, as well as any clones of those snapshots.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-f\fR\fR
+.ad
+.RS 6n
+.rt
+Force an unmount of any file systems using the "\fBunmount -f\fR" command.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs\fR \fBlist\fR [\fB-rH\fR] [\fB-o\fR \fIprop\fR[,\fIprop\fR]...] [\fB-t\fR \fItype\fR[,\fItype\fR]...] [\fB-s\fR \fIprop\fR [\fB-s\fR \fIprop\fR]...] [\fB-S\fR \fIprop\fR [\fB-S\fR \fIprop\fR]...] [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR|\fI/pathname\fR|.\fI/pathname\fR ...]\fR
+.ad
+.sp .6
+.RS 4n
+Lists the property information for the given datasets in tabular form. If specified, you can list property information by the absolute pathname or the relative pathname. By default, all datasets are displayed and contain the following fields:
+.sp
+.in +2
+.nf
+name,used,available,referenced,mountpoint
+.fi
+.in -2
+.sp
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-H\fR\fR
+.ad
+.RS 11n
+.rt
+Used for scripting mode. Do not print headers and separate fields by a single tab instead of arbitrary whitespace.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.RS 11n
+.rt
+Recursively display any children of the dataset on the command line.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR \fIprop\fR\fR
+.ad
+.RS 11n
+.rt
+A comma-separated list of properties to display. The property must be one of the properties described in the "Native Properties" section, or the special value "name" to display the dataset name.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-s\fR \fIprop\fR\fR
+.ad
+.RS 11n
+.rt
+A property to use for sorting the output by column in ascending order based on the value of the property. The property must be one of the properties described in the "Properties" section, or the special value "name" to sort by the dataset name. Multiple
+properties can be specified at one time using multiple \fB-s\fR property options. Multiple \fB-s\fR options are evaluated from left to right in decreasing order of importance.
+.sp
+The following is a list of sorting criteria:
+.RS +4
+.TP
+.ie t \(bu
+.el o
+Numeric types sort in numeric order.
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+String types sort in alphabetical order.
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+If the sort property does not apply to a row's dataset, that row sorts to the literal bottom, regardless of the specified ordering.
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+If no sorting options are specified the existing behavior of "\fBzfs list\fR" is preserved.
+.RE
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-S\fR \fIprop\fR\fR
+.ad
+.RS 11n
+.rt
+Same as the \fB-s\fR option, but sorts by property in descending order.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-t\fR \fItype\fR\fR
+.ad
+.RS 11n
+.rt
+A comma-separated list of types to display, where "type" is one of "filesystem", "snapshot" or "volume". For example, specifying "\fB-t snapshot\fR" displays only snapshots.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs set\fR \fIproperty\fR=\fIvalue\fR \fIfilesystem\fR|\fIvolume\fR ...\fR
+.ad
+.sp .6
+.RS 4n
+Sets the property to the given value for each dataset. Only some properties can be edited. See the "Properties" section for more information on what properties can be set and acceptable values. Numeric values can be specified as exact values, or in a human-readable
+form with a suffix of "B", "K", "M", "G", "T", "P", "E", "Z" (for bytes, Kbytes, Mbytes, gigabytes, terabytes, petabytes, exabytes, or zettabytes, respectively). Properties cannot be set on snapshots.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs get\fR [\fB-rHp\fR] [\fB-o\fR \fIfield\fR[,\fIfield\fR]...] [\fB-s\fR \fIsource\fR[,\fIsource\fR]...] \fIall\fR | \fIproperty\fR[,\fIproperty\fR]... \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...\fR
+.ad
+.sp .6
+.RS 4n
+Displays properties for the given datasets. If no datasets are specified, then the command displays properties for all datasets on the system. For each property, the following columns are displayed:
+.sp
+.in +2
+.nf
+ name Dataset name
+ property Property name
+ value Property value
+ source Property source. Can either be local, default,
+ temporary, inherited, or none (-).
+.fi
+.in -2
+.sp
+
+All columns are displayed by default, though this can be controlled by using the \fB-o\fR option. This command takes a comma-separated list of properties as described in the "Native Properties" and "User Properties" sections.
+.sp
+The special value "all" can be used to display all properties for the given dataset.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.RS 13n
+.rt
+Recursively display properties for any children.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-H\fR\fR
+.ad
+.RS 13n
+.rt
+Display output in a form more easily parsed by scripts. Any headers are omitted, and fields are explicitly separated by a single tab instead of an arbitrary amount of space.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR \fIfield\fR\fR
+.ad
+.RS 13n
+.rt
+A comma-separated list of columns to display. "name,property,value,source" is the default value.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-s\fR \fIsource\fR\fR
+.ad
+.RS 13n
+.rt
+A comma-separated list of sources to display. Those properties coming from a source other than those in this list are ignored. Each source must be one of the following: "local,default,inherited,temporary,none". The default value is all sources.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-p\fR\fR
+.ad
+.RS 13n
+.rt
+Display numbers in parsable (exact) values.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs inherit\fR [\fB-r\fR] \fIproperty\fR \fIfilesystem\fR|\fIvolume\fR ...\fR
+.ad
+.sp .6
+.RS 4n
+Clears the specified property, causing it to be inherited from an ancestor. If no ancestor has the property set, then the default value is used. See the "Properties" section for a listing of default values, and details on which properties can be inherited.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.RS 6n
+.rt
+Recursively inherit the given property for all children.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs mount\fR\fR
+.ad
+.sp .6
+.RS 4n
+Displays all \fBZFS\fR file systems currently mounted.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs mount\fR [\fB-o\fR \fIopts\fR] [\fB-O\fR] \fB-a\fR\fR
+.ad
+.sp .6
+.RS 4n
+Mounts all available \fBZFS\fR file systems. Invoked automatically as part of the boot process.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR \fIopts\fR\fR
+.ad
+.RS 11n
+.rt
+An optional comma-separated list of mount options to use temporarily for the duration of the mount. See the "Temporary Mount Point Properties" section for details.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-O\fR\fR
+.ad
+.RS 11n
+.rt
+Perform an overlay mount. See \fBmount\fR(1M) for more information.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs mount\fR [\fB-o\fR \fIopts\fR] [\fB-O\fR] \fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Mounts a specific \fBZFS\fR file system. This is typically not necessary, as file systems are automatically mounted when they are created or the mountpoint property has changed. See the "Mount Points" section for details.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR \fIopts\fR\fR
+.ad
+.RS 11n
+.rt
+An optional comma-separated list of mount options to use temporarily for the duration of the mount. See the "Temporary Mount Point Properties" section for details.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-O\fR\fR
+.ad
+.RS 11n
+.rt
+Perform an overlay mount. See \fBmount\fR(1M) for more information.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs unmount\fR \fB-a\fR\fR
+.ad
+.sp .6
+.RS 4n
+Unmounts all currently mounted \fBZFS\fR file systems. Invoked automatically as part of the shutdown process.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs unmount\fR [\fB-f\fR] \fIfilesystem\fR|\fImountpoint\fR\fR
+.ad
+.sp .6
+.RS 4n
+Unmounts the given file system. The command can also be given a path to a \fBZFS\fR file system mount point on the system.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-f\fR\fR
+.ad
+.RS 6n
+.rt
+Forcefully unmount the file system, even if it is currently in use.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs share\fR \fB-a\fR\fR
+.ad
+.sp .6
+.RS 4n
+Shares all available \fBZFS\fR file systems. This is invoked automatically as part of the boot process.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs share\fR \fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Shares a specific \fBZFS\fR file system according to the "sharenfs" property. File systems are shared when the "sharenfs" property is set.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs unshare\fR \fB-a\fR\fR
+.ad
+.sp .6
+.RS 4n
+Unshares all currently shared \fBZFS\fR file systems. This is invoked automatically as part of the shutdown process.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs unshare\fR [\fB-F\fR] \fIfilesystem\fR|\fImountpoint\fR\fR
+.ad
+.sp .6
+.RS 4n
+Unshares the given file system. The command can also be given a path to a \fBZFS\fR file system shared on the system.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-F\fR\fR
+.ad
+.RS 6n
+.rt
+Forcefully unshare the file system, even if it is currently in use.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs send\fR [\fB-i\fR \fIsnapshot1\fR] \fIsnapshot2\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates a stream representation of snapshot2, which is written to standard output. The output can be redirected to a file or to a different system (for example, using \fBssh\fR(1)). By default, a full stream is generated.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-i\fR \fIsnapshot1\fR\fR
+.ad
+.RS 16n
+.rt
+Generate an incremental stream from \fIsnapshot1\fR to \fIsnapshot2\fR. The incremental source \fIsnapshot1\fR can be specified as the last component of the snapshot name (for example, the part after the "@"),
+and it is assumed to be from the same file system as \fIsnapshot2\fR.
+.RE
+
+.RE
+
+.LP
+The format of the stream is evolving. No backwards compatibility is guaranteed. You may not be able to receive your streams on future versions of \fBZFS\fR.
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs receive\fR [\fB-vnF\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
+.ad
+.br
+.na
+\fB\fBzfs receive\fR [\fB-vnF\fR] \fB-d\fR \fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates a snapshot whose contents are as specified in the stream provided on standard input. If a full stream is received, then a new file system is created as well. Streams are created using the "\fBzfs send\fR" subcommand, which by default creates a full
+stream. "\fBzfs recv\fR" can be used as an alias for "\fBzfs receive\fR".
+.sp
+If an incremental stream is received, then the destination file system must already exist, and its most recent snapshot must match the incremental stream's source. The destination file system and all of its child file systems are unmounted and cannot be accessed during the receive operation.
+.sp
+The name of the snapshot (and file system, if a full stream is received) that this subcommand creates depends on the argument type and the \fB-d\fR option.
+.sp
+If the argument is a snapshot name, the specified \fIsnapshot\fR is created. If the argument is a file system or volume name, a snapshot with the same name as the sent snapshot is created within the specified \fIfilesystem\fR or \fIvolume\fR.
+If the \fB-d\fR option is specified, the snapshot name is determined by appending the sent snapshot's name to the specified \fIfilesystem\fR. If the \fB-d\fR option is specified, any required file systems within the specified one are created.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-d\fR\fR
+.ad
+.RS 6n
+.rt
+Use the name of the sent snapshot to determine the name of the new snapshot as described in the paragraph above.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-v\fR\fR
+.ad
+.RS 6n
+.rt
+Print verbose information about the stream and the time required to perform the receive operation.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-n\fR\fR
+.ad
+.RS 6n
+.rt
+Do not actually receive the stream. This can be useful in conjunction with the \fB-v\fR option to determine what name the receive operation would use.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-F\fR\fR
+.ad
+.RS 6n
+.rt
+Force a rollback of the \fIfilesystem\fR to the most recent snapshot before performing the receive operation.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs jail\fR \fIjailid\fR \fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Attaches the given file system to the given jail. From now on this file system tree can be managed from within a jail if the "\fBjailed\fR" property has been set.
+To use this functionality, sysctl \fBsecurity.jail.enforce_statfs\fR should be set to 0 and sysctl \fBsecurity.jail.mount_allowed\fR should be set to 1.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs unjail\fR \fIjailid\fR \fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Detaches the given file system from the given jail.
+.RE
+
+.SH EXAMPLES
+.LP
+\fBExample 1 \fRCreating a ZFS File System Hierarchy
+
+.LP
+The following commands create a file system named "\fBpool/home\fR" and a file system named "\fBpool/home/bob\fR". The mount point "\fB/export/home\fR" is set for the parent file system, and automatically inherited
+by the child file system.
+.sp
+.in +2
+.nf
+# zfs create pool/home
+# zfs set mountpoint=/export/home pool/home
+# zfs create pool/home/bob
+.fi
+.in -2
+.sp
+.LP
+\fBExample 2 \fRCreating a ZFS Snapshot
+
+.LP
+The following command creates a snapshot named "yesterday". This snapshot is mounted on demand in the ".zfs/snapshot" directory at the root of the "\fBpool/home/bob\fR" file system.
+.sp
+.in +2
+.nf
+# zfs snapshot pool/home/bob@yesterday
+.fi
+.in -2
+.sp
+.LP
+\fBExample 3 \fRTaking and destroying multiple snapshots
+
+.LP
+The following command creates snapshots named "\fByesterday\fR" of "\fBpool/home\fR" and all of its descendant file systems. Each snapshot is mounted on demand in the ".zfs/snapshot" directory at the root of its file system. The
+second command destroys the newly created snapshots.
+.sp
+.in +2
+.nf
+\fB# zfs snapshot -r pool/home@yesterday\fR
+\fB# zfs destroy -r pool/home@yesterday\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 4 \fRTurning Off Compression
+
+.LP
+The following commands turn compression off for all file systems under "\fBpool/home\fR", but explicitly turn it on for "\fBpool/home/anne\fR".
+.sp
+.in +2
+.nf
+\fB# zfs set compression=off pool/home
+# zfs set compression=on pool/home/anne\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 5 \fRListing ZFS Datasets
+
+.LP
+The following command lists all active file systems and volumes in the system.
+.sp
+.in +2
+.nf
+\fB# zfs list\fR
+
+
+ NAME USED AVAIL REFER MOUNTPOINT
+ pool 100G 60G - /pool
+ pool/home 100G 60G - /export/home
+ pool/home/bob 40G 60G 40G /export/home/bob
+ pool/home/bob@yesterday 3M - 40G -
+ pool/home/anne 60G 60G 40G /export/home/anne
+.fi
+.in -2
+.sp
+.LP
+\fBExample 6 \fRSetting a Quota on a ZFS File System
+
+.LP
+The following command sets a quota of 50 Gbytes for "\fBpool/home/bob\fR".
+.sp
+.in +2
+.nf
+\fB# zfs set quota=50G pool/home/bob\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 7 \fRListing ZFS Properties
+
+.LP
+The following command lists all properties for "\fBpool/home/bob\fR".
+.sp
+.in +2
+.nf
+\fB# zfs get all pool/home/bob\fR
+
+
+ NAME PROPERTY VALUE SOURCE
+ pool/home/bob type filesystem -
+ pool/home/bob creation Fri Feb 23 14:20 2007 -
+ pool/home/bob used 24.5K -
+ pool/home/bob available 50.0G -
+ pool/home/bob referenced 24.5K -
+ pool/home/bob compressratio 1.00x -
+ pool/home/bob mounted yes -
+ pool/home/bob quota 50G local
+ pool/home/bob reservation none default
+ pool/home/bob recordsize 128K default
+ pool/home/bob mountpoint /pool/home/bob default
+ pool/home/bob sharenfs off default
+ pool/home/bob shareiscsi off default
+ pool/home/bob checksum on default
+ pool/home/bob compression off default
+ pool/home/bob atime on default
+ pool/home/bob devices on default
+ pool/home/bob exec on default
+ pool/home/bob setuid on default
+ pool/home/bob readonly off default
+ pool/home/bob zoned off default
+ pool/home/bob snapdir hidden default
+ pool/home/bob aclmode groupmask default
+ pool/home/bob aclinherit secure default
+ pool/home/bob canmount on default
+ pool/home/bob xattr on default
+
+
+.fi
+.in -2
+.sp
+
+.LP
+The following command gets a single property value.
+.sp
+.in +2
+.nf
+\fB# zfs get -H -o value compression pool/home/bob\fR
+on
+.fi
+.in -2
+.sp
+
+.LP
+The following command lists all properties with local settings for "\fBpool/home/bob\fR".
+.sp
+.in +2
+.nf
+\fB# zfs get -r -s local -o name,property,value all pool/home/bob\fR
+
+ NAME PROPERTY VALUE
+ pool compression on
+ pool/home checksum off
+.fi
+.in -2
+.sp
+.LP
+\fBExample 8 \fRRolling Back a ZFS File System
+
+.LP
+The following command reverts the contents of "\fBpool/home/anne\fR" to the snapshot named "\fByesterday\fR", deleting all intermediate snapshots.
+.sp
+.in +2
+.nf
+\fB# zfs rollback -r pool/home/anne@yesterday\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 9 \fRCreating a ZFS Clone
+
+.LP
+The following command creates a writable file system whose initial contents are the same as "\fBpool/home/bob@yesterday\fR".
+.sp
+.in +2
+.nf
+\fB# zfs clone pool/home/bob@yesterday pool/clone\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 10 \fRPromoting a ZFS Clone
+
+.LP
+The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming:
+.sp
+.in +2
+.nf
+\fB# zfs create pool/project/production\fR
+ populate /pool/project/production with data
+\fB# zfs snapshot pool/project/production@today
+# zfs clone pool/project/production@today pool/project/beta\fR
+ make changes to /pool/project/beta and test them
+\fB# zfs promote pool/project/beta
+# zfs rename pool/project/production pool/project/legacy
+# zfs rename pool/project/beta pool/project/production\fR
+ once the legacy version is no longer needed, it can be
+ destroyed
+\fB# zfs destroy pool/project/legacy\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 11 \fRInheriting ZFS Properties
+
+.LP
+The following command causes "\fBpool/home/bob\fR" and "\fBpool/home/anne\fR" to inherit the "checksum" property from their parent.
+.sp
+.in +2
+.nf
+\fB# zfs inherit checksum pool/home/bob pool/home/anne\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 12 \fRRemotely Replicating ZFS Data
+
+.LP
+The following commands send a full stream and then an incremental stream to a remote machine, restoring them into "\fBpoolB/received/fs@a\fR" and "\fBpoolB/received/fs@b\fR", respectively. "\fBpoolB\fR" must contain
+the file system "\fBpoolB/received\fR", and must not initially contain "\fBpoolB/received/fs\fR".
+.sp
+.in +2
+.nf
+# zfs send pool/fs@a | \e
+ ssh host zfs receive poolB/received/fs@a
+# zfs send -i a pool/fs@b | ssh host \e
+ zfs receive poolB/received/fs
+.fi
+.in -2
+.sp
+.LP
+\fBExample 13 \fRUsing the zfs receive -d Option
+
+.LP
+The following command sends a full stream of "\fBpoolA/fsA/fsB@snap\fR" to a remote machine, receiving it into "\fBpoolB/received/fsA/fsB@snap\fR". The "\fBfsA/fsB@snap\fR" portion of the received snapshot's name
+is determined from the name of the sent snapshot. "\fBpoolB\fR" must contain the file system "\fBpoolB/received\fR". If "\fBpoolB/received/fsA\fR" does not exist, it will be created as an empty file system.
+.sp
+.in +2
+.nf
+\fB# zfs send poolA/fsA/fsB@snap | \e
+ ssh host zfs receive -d poolB/received
+ \fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 14 \fRCreating a ZFS volume as a Swap Device
+
+.LP
+The following example shows how to create a 5-Gbyte ZFS volume and then add the volume as a swap device.
+.sp
+.in +2
+.nf
+\fB# zfs create -V 5gb tank/vol
+# swap -a /dev/zvol/dsk/tank/vol\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 15 \fRSetting User Properties
+
+.LP
+The following example sets the user defined "com.example:department" property for a dataset.
+.sp
+.in +2
+.nf
+\fB# zfs set com.example:department=12345 tank/accounting\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 16 \fRCreating a ZFS Volume as an iSCSI Target Device
+
+.LP
+The following example shows how to create a \fBZFS\fR volume as an \fBiSCSI\fR target.
+.sp
+.in +2
+.nf
+\fB# zfs create -V 2g pool/volumes/vol1
+# zfs set shareiscsi=on pool/volumes/vol1
+# iscsitadm list target\fR
+Target: pool/volumes/vol1
+iSCSI Name:
+iqn.1986-03.com.sun:02:7b4b02a6-3277-eb1b-e686-a24762c52a8c
+Connections: 0
+.fi
+.in -2
+.sp
+
+.LP
+After the \fBiSCSI\fR target is created, set up the \fBiSCSI\fR initiator. For more information about the Solaris \fBiSCSI\fR initiator, see the Solaris Administration Guide: Devices and File Systems.
+.SH EXIT STATUS
+
+.LP
+The following exit values are returned:
+.sp
+.ne 2
+.mk
+.na
+\fB\fB0\fR\fR
+.ad
+.RS 5n
+.rt
+Successful completion.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB1\fR\fR
+.ad
+.RS 5n
+.rt
+An error occurred.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB2\fR\fR
+.ad
+.RS 5n
+.rt
+Invalid command line options were specified.
+.RE
+
+.SH ATTRIBUTES
+
+.LP
+See \fBattributes\fR(5) for descriptions of the following attributes:
+.sp
+
+.sp
+.TS
+tab(@) box;
+cw(2.75i) |cw(2.75i)
+lw(2.75i) |lw(2.75i)
+.
+ATTRIBUTE TYPE@ATTRIBUTE VALUE
+_
+Availability@SUNWzfsu
+_
+Interface Stability@Evolving
+.TE
+
+.SH SEE ALSO
+
+.LP
+\fBssh\fR(1), \fBmount\fR(1M), \fBshare\fR(1M), \fBunshare\fR(1M), \fBzonecfg\fR(1M), \fBzpool\fR(1M), \fBchmod\fR(2), \fBstat\fR(2), \fBfsync\fR(3c), \fBdfstab\fR(4), \fBattributes\fR(5)
diff --git a/contrib/opensolaris/cmd/zfs/zfs_iter.c b/contrib/opensolaris/cmd/zfs/zfs_iter.c
new file mode 100644
index 0000000..eb6b8b1
--- /dev/null
+++ b/contrib/opensolaris/cmd/zfs/zfs_iter.c
@@ -0,0 +1,405 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <libintl.h>
+#include <libuutil.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+
+#include <libzfs.h>
+
+#include "zfs_util.h"
+#include "zfs_iter.h"
+
+/*
+ * This is a private interface used to gather up all the datasets specified on
+ * the command line so that we can iterate over them in order.
+ *
+ * First, we iterate over all filesystems, gathering them together into an
+ * AVL tree. We report errors for any explicitly specified datasets
+ * that we couldn't open.
+ *
+ * When finished, we have an AVL tree of ZFS handles. We go through and execute
+ * the provided callback for each one, passing whatever data the user supplied.
+ */
+
+/* One AVL tree entry: a dataset handle plus its libuutil tree linkage. */
+typedef struct zfs_node {
+ zfs_handle_t *zn_handle;
+ uu_avl_node_t zn_avlnode;
+} zfs_node_t;
+
+/* State handed to zfs_callback() while zfs_for_each() gathers datasets. */
+typedef struct callback_data {
+ /* tree that collects the matching datasets */
+ uu_avl_t *cb_avl;
+ /* non-zero to descend into children */
+ int cb_recurse;
+ /* mask of dataset types to collect */
+ zfs_type_t cb_types;
+ /* sort columns; passed to uu_avl_find() as the comparator's private data */
+ zfs_sort_column_t *cb_sortcol;
+ /* optional property list; expanded for each collected dataset if non-NULL */
+ zfs_proplist_t **cb_proplist;
+} callback_data_t;
+
+/* AVL pool used for zfs_node_t; created and destroyed by zfs_for_each(). */
+uu_avl_pool_t *avl_pool;
+
+/*
+ * Called for each dataset.  If the object is of an appropriate type, add it
+ * to the AVL tree and recurse over any children as necessary.
+ *
+ * This function takes over 'zhp': the handle is either stored in the tree
+ * (and closed later by zfs_for_each()) or closed here before returning.
+ *
+ * Returns 0 on success, or -1 if the property list could not be expanded
+ * for this dataset.
+ */
+int
+zfs_callback(zfs_handle_t *zhp, void *data)
+{
+	callback_data_t *cb = data;
+	int dontclose = 0;
+
+	/*
+	 * If this object is of the appropriate type, add it to the AVL tree.
+	 */
+	if (zfs_get_type(zhp) & cb->cb_types) {
+		uu_avl_index_t idx;
+		zfs_node_t *node = safe_malloc(sizeof (zfs_node_t));
+
+		node->zn_handle = zhp;
+		uu_avl_node_init(node, &node->zn_avlnode, avl_pool);
+		if (uu_avl_find(cb->cb_avl, node, cb->cb_sortcol,
+		    &idx) == NULL) {
+			if (cb->cb_proplist &&
+			    zfs_expand_proplist(zhp, cb->cb_proplist) != 0) {
+				/*
+				 * The node never reached the tree, so nobody
+				 * else will close the handle; do it here to
+				 * avoid leaking it.
+				 */
+				free(node);
+				zfs_close(zhp);
+				return (-1);
+			}
+			uu_avl_insert(cb->cb_avl, node, idx);
+			/* The tree now holds the handle. */
+			dontclose = 1;
+		} else {
+			/* An equal entry is already in the tree. */
+			free(node);
+		}
+	}
+
+	/*
+	 * Recurse if necessary.  Volumes are only descended into when
+	 * snapshots were requested.
+	 */
+	if (cb->cb_recurse && (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM ||
+	    (zfs_get_type(zhp) == ZFS_TYPE_VOLUME && (cb->cb_types &
+	    ZFS_TYPE_SNAPSHOT))))
+		(void) zfs_iter_children(zhp, zfs_callback, data);
+
+	if (!dontclose)
+		zfs_close(zhp);
+
+	return (0);
+}
+
+/*
+ * Append a sort column for property 'name' to the list at '*sc'.  'name' may
+ * be a native property or a user property.  Returns 0 on success and -1 if
+ * 'name' is neither.
+ */
+int
+zfs_add_sort_column(zfs_sort_column_t **sc, const char *name,
+    boolean_t reverse)
+{
+	zfs_prop_t prop = zfs_name_to_prop(name);
+	zfs_sort_column_t *entry;
+
+	/* Not a native property: it must at least be a user property name. */
+	if (prop == ZFS_PROP_INVAL && !zfs_prop_user(name))
+		return (-1);
+
+	entry = safe_malloc(sizeof (zfs_sort_column_t));
+	entry->sc_reverse = reverse;
+	entry->sc_prop = prop;
+
+	/* User properties are identified by name, so keep a private copy. */
+	if (prop == ZFS_PROP_INVAL) {
+		entry->sc_user_prop = safe_malloc(strlen(name) + 1);
+		(void) strcpy(entry->sc_user_prop, name);
+	}
+
+	/* The head element's sc_last always tracks the tail of the list. */
+	if (*sc != NULL) {
+		(*sc)->sc_last->sc_next = entry;
+		(*sc)->sc_last = entry;
+	} else {
+		entry->sc_last = entry;
+		*sc = entry;
+	}
+
+	return (0);
+}
+
+/*
+ * Release every element of a sort column list, including the copied user
+ * property names.
+ */
+void
+zfs_free_sort_columns(zfs_sort_column_t *sc)
+{
+	zfs_sort_column_t *next;
+
+	for (; sc != NULL; sc = next) {
+		/* Grab the link before the element goes away. */
+		next = sc->sc_next;
+		free(sc->sc_user_prop);
+		free(sc);
+	}
+}
+
+/*
+ * Default ordering of two zfs_node_t entries: alphabetical by dataset name,
+ * with a dataset sorted ahead of its own snapshots and snapshots of the same
+ * dataset ordered by creation transaction group.
+ *
+ * Returns a negative, zero or positive value.  Two entries naming the same
+ * dataset compare equal, which is what lets uu_avl_find() spot duplicates.
+ */
+/* ARGSUSED */
+static int
+zfs_compare(const void *larg, const void *rarg, void *unused)
+{
+	zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle;
+	zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle;
+	const char *lname = zfs_get_name(l);
+	const char *rname = zfs_get_name(r);
+	char *lat, *rat;
+	uint64_t lcreate, rcreate;
+	int ret;
+
+	lat = (char *)strchr(lname, '@');
+	rat = (char *)strchr(rname, '@');
+
+	/*
+	 * Temporarily cut both names at the '@' so that only the dataset
+	 * part is compared; the separators are put back before returning.
+	 */
+	if (lat != NULL)
+		*lat = '\0';
+	if (rat != NULL)
+		*rat = '\0';
+
+	ret = strcmp(lname, rname);
+	if (ret == 0 && (lat != NULL || rat != NULL)) {
+		/*
+		 * If we're comparing a dataset to one of its snapshots, we
+		 * always make the full dataset first.  When neither side is
+		 * a snapshot the names are identical and 'ret' stays 0.
+		 */
+		if (lat == NULL) {
+			ret = -1;
+		} else if (rat == NULL) {
+			ret = 1;
+		} else {
+			/*
+			 * If we have two snapshots from the same dataset, then
+			 * we want to sort them according to creation time.  We
+			 * use the hidden CREATETXG property to get an absolute
+			 * ordering of snapshots.
+			 */
+			lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG);
+			rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG);
+
+			if (lcreate < rcreate)
+				ret = -1;
+			else if (lcreate > rcreate)
+				ret = 1;
+		}
+	}
+
+	if (lat != NULL)
+		*lat = '@';
+	if (rat != NULL)
+		*rat = '@';
+
+	return (ret);
+}
+
+/*
+ * Sort datasets by specified columns.
+ *
+ * o Numeric types sort in ascending order.
+ * o String types sort in alphabetical order.
+ * o Types inappropriate for a row sort that row to the literal
+ *   bottom, regardless of the specified ordering.
+ *
+ * If no sort columns are specified, or two datasets compare equally
+ * across all specified columns, they are sorted alphabetically by name
+ * with snapshots grouped under their parents.
+ *
+ * 'larg' and 'rarg' are zfs_node_t pointers; 'data' is the head of the
+ * zfs_sort_column_t list (may be NULL).  Returns a negative, zero or
+ * positive value.
+ */
+static int
+zfs_sort(const void *larg, const void *rarg, void *data)
+{
+	zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle;
+	zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle;
+	zfs_sort_column_t *sc = (zfs_sort_column_t *)data;
+	zfs_sort_column_t *psc;
+
+	for (psc = sc; psc != NULL; psc = psc->sc_next) {
+		char lbuf[ZFS_MAXPROPLEN], rbuf[ZFS_MAXPROPLEN];
+		char *lstr, *rstr;
+		uint64_t lnum = 0, rnum = 0;
+		boolean_t lvalid, rvalid;
+		int ret = 0;
+
+		/*
+		 * We group the checks below the generic code.  If 'lstr' and
+		 * 'rstr' are non-NULL, then we do a string based comparison.
+		 * Otherwise, we compare 'lnum' and 'rnum'.
+		 */
+		lstr = rstr = NULL;
+		if (psc->sc_prop == ZFS_PROP_INVAL) {
+			/* User property: look the value up by name. */
+			nvlist_t *luser, *ruser;
+			nvlist_t *lval, *rval;
+
+			luser = zfs_get_user_props(l);
+			ruser = zfs_get_user_props(r);
+
+			lvalid = (nvlist_lookup_nvlist(luser,
+			    psc->sc_user_prop, &lval) == 0);
+			rvalid = (nvlist_lookup_nvlist(ruser,
+			    psc->sc_user_prop, &rval) == 0);
+
+			if (lvalid)
+				verify(nvlist_lookup_string(lval,
+				    ZFS_PROP_VALUE, &lstr) == 0);
+			if (rvalid)
+				verify(nvlist_lookup_string(rval,
+				    ZFS_PROP_VALUE, &rstr) == 0);
+
+		} else if (zfs_prop_is_string(psc->sc_prop)) {
+			lvalid = (zfs_prop_get(l, psc->sc_prop, lbuf,
+			    sizeof (lbuf), NULL, NULL, 0, B_TRUE) == 0);
+			rvalid = (zfs_prop_get(r, psc->sc_prop, rbuf,
+			    sizeof (rbuf), NULL, NULL, 0, B_TRUE) == 0);
+
+			lstr = lbuf;
+			rstr = rbuf;
+		} else {
+			lvalid = zfs_prop_valid_for_type(psc->sc_prop,
+			    zfs_get_type(l));
+			rvalid = zfs_prop_valid_for_type(psc->sc_prop,
+			    zfs_get_type(r));
+
+			if (lvalid)
+				(void) zfs_prop_get_numeric(l, psc->sc_prop,
+				    &lnum, NULL, NULL, 0);
+			if (rvalid)
+				(void) zfs_prop_get_numeric(r, psc->sc_prop,
+				    &rnum, NULL, NULL, 0);
+		}
+
+		/*
+		 * A dataset without this property sorts after one that has
+		 * it; if neither has it, this column decides nothing.
+		 */
+		if (!lvalid && !rvalid)
+			continue;
+		else if (!lvalid)
+			return (1);
+		else if (!rvalid)
+			return (-1);
+
+		/*
+		 * The numeric comparison must only run for numeric columns:
+		 * for string columns 'lnum' and 'rnum' were never fetched
+		 * and must not override the strcmp() result.
+		 */
+		if (lstr)
+			ret = strcmp(lstr, rstr);
+		else if (lnum < rnum)
+			ret = -1;
+		else if (lnum > rnum)
+			ret = 1;
+
+		if (ret != 0) {
+			if (psc->sc_reverse == B_TRUE)
+				ret = (ret < 0) ? 1 : -1;
+			return (ret);
+		}
+	}
+
+	/* All columns tied (or none given): fall back to name order. */
+	return (zfs_compare(larg, rarg, NULL));
+}
+
+/*
+ * Gather the datasets named by argv[0..argc-1] (or every dataset when argc
+ * is 0) into a sorted AVL tree, then invoke 'callback' on each in order.
+ *
+ *   recurse            also visit the children of each argument
+ *   types              mask of dataset types to pass to 'callback'
+ *   sortcol            sort column list, or NULL for the default name order
+ *   proplist           optional property list, expanded per dataset
+ *   callback, data     user callback and its private argument
+ *   args_can_be_paths  arguments are resolved with zfs_path_to_zhandle()
+ *                      instead of zfs_open()
+ *
+ * Returns 0 if everything succeeded and non-zero if any dataset could not
+ * be opened or any callback reported an error.  Exits the process if the
+ * AVL structures cannot be allocated.
+ */
+int
+zfs_for_each(int argc, char **argv, boolean_t recurse, zfs_type_t types,
+    zfs_sort_column_t *sortcol, zfs_proplist_t **proplist, zfs_iter_f callback,
+    void *data, boolean_t args_can_be_paths)
+{
+	callback_data_t cb;
+	int ret = 0;
+	zfs_node_t *node;
+	uu_avl_walk_t *walk;
+
+	avl_pool = uu_avl_pool_create("zfs_pool", sizeof (zfs_node_t),
+	    offsetof(zfs_node_t, zn_avlnode), zfs_sort, UU_DEFAULT);
+
+	if (avl_pool == NULL) {
+		(void) fprintf(stderr,
+		    gettext("internal error: out of memory\n"));
+		exit(1);
+	}
+
+	cb.cb_sortcol = sortcol;
+	cb.cb_recurse = recurse;
+	cb.cb_proplist = proplist;
+	cb.cb_types = types;
+	if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) {
+		(void) fprintf(stderr,
+		    gettext("internal error: out of memory\n"));
+		exit(1);
+	}
+
+	if (argc == 0) {
+		/*
+		 * If given no arguments, iterate over all datasets.
+		 */
+		cb.cb_recurse = 1;
+		ret = zfs_iter_root(g_zfs, zfs_callback, &cb);
+	} else {
+		int i;
+		zfs_handle_t *zhp;
+		zfs_type_t argtype;
+
+		/*
+		 * If we're recursive, then we always allow filesystems as
+		 * arguments.  If we also are interested in snapshots, then we
+		 * can take volumes as well.
+		 */
+		argtype = types;
+		if (recurse) {
+			argtype |= ZFS_TYPE_FILESYSTEM;
+			if (types & ZFS_TYPE_SNAPSHOT)
+				argtype |= ZFS_TYPE_VOLUME;
+		}
+
+		for (i = 0; i < argc; i++) {
+			if (args_can_be_paths) {
+				zhp = zfs_path_to_zhandle(g_zfs, argv[i],
+				    argtype);
+			} else {
+				zhp = zfs_open(g_zfs, argv[i], argtype);
+			}
+			/* zfs_callback() takes over the handle. */
+			if (zhp != NULL)
+				ret |= zfs_callback(zhp, &cb);
+			else
+				ret = 1;
+		}
+	}
+
+	/*
+	 * At this point we've got our AVL tree full of zfs handles, so iterate
+	 * over each one and execute the real user callback.
+	 */
+	for (node = uu_avl_first(cb.cb_avl); node != NULL;
+	    node = uu_avl_next(cb.cb_avl, node))
+		ret |= callback(node->zn_handle, data);
+
+	/*
+	 * Finally, clean up the AVL tree.
+	 */
+	if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) {
+		/* Terminate the line like the other out-of-memory messages. */
+		(void) fprintf(stderr,
+		    gettext("internal error: out of memory\n"));
+		exit(1);
+	}
+
+	while ((node = uu_avl_walk_next(walk)) != NULL) {
+		uu_avl_remove(cb.cb_avl, node);
+		zfs_close(node->zn_handle);
+		free(node);
+	}
+
+	uu_avl_walk_end(walk);
+	uu_avl_destroy(cb.cb_avl);
+	uu_avl_pool_destroy(avl_pool);
+
+	return (ret);
+}
diff --git a/contrib/opensolaris/cmd/zfs/zfs_iter.h b/contrib/opensolaris/cmd/zfs/zfs_iter.h
new file mode 100644
index 0000000..1f0ce8e
--- /dev/null
+++ b/contrib/opensolaris/cmd/zfs/zfs_iter.h
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef ZFS_ITER_H
+#define ZFS_ITER_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * One element of a singly linked list of sort columns.  Elements are added
+ * with zfs_add_sort_column() and the list is released with
+ * zfs_free_sort_columns().
+ */
+typedef struct zfs_sort_column {
+ /* next column, or NULL at the end of the list */
+ struct zfs_sort_column *sc_next;
+ /* tail of the list; kept current on the head element only */
+ struct zfs_sort_column *sc_last;
+ /* native property to sort by, or ZFS_PROP_INVAL for a user property */
+ zfs_prop_t sc_prop;
+ /* allocated copy of the user property name; set only for user properties */
+ char *sc_user_prop;
+ /* B_TRUE to invert the comparison result for this column */
+ boolean_t sc_reverse;
+} zfs_sort_column_t;
+
+int zfs_for_each(int, char **, boolean_t, zfs_type_t, zfs_sort_column_t *,
+ zfs_proplist_t **, zfs_iter_f, void *, boolean_t);
+int zfs_add_sort_column(zfs_sort_column_t **, const char *, boolean_t);
+void zfs_free_sort_columns(zfs_sort_column_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZFS_ITER_H */
diff --git a/contrib/opensolaris/cmd/zfs/zfs_main.c b/contrib/opensolaris/cmd/zfs/zfs_main.c
new file mode 100644
index 0000000..3327b2a
--- /dev/null
+++ b/contrib/opensolaris/cmd/zfs/zfs_main.c
@@ -0,0 +1,3233 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <libgen.h>
+#include <libintl.h>
+#include <libuutil.h>
+#include <locale.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <zone.h>
+#include <sys/mntent.h>
+#include <sys/mnttab.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+
+#include <libzfs.h>
+
+#include "zfs_iter.h"
+#include "zfs_util.h"
+
+libzfs_handle_t *g_zfs;
+
+static FILE *mnttab_file;
+
+static int zfs_do_clone(int argc, char **argv);
+static int zfs_do_create(int argc, char **argv);
+static int zfs_do_destroy(int argc, char **argv);
+static int zfs_do_get(int argc, char **argv);
+static int zfs_do_inherit(int argc, char **argv);
+static int zfs_do_list(int argc, char **argv);
+static int zfs_do_mount(int argc, char **argv);
+static int zfs_do_rename(int argc, char **argv);
+static int zfs_do_rollback(int argc, char **argv);
+static int zfs_do_set(int argc, char **argv);
+static int zfs_do_snapshot(int argc, char **argv);
+static int zfs_do_unmount(int argc, char **argv);
+static int zfs_do_share(int argc, char **argv);
+static int zfs_do_unshare(int argc, char **argv);
+static int zfs_do_send(int argc, char **argv);
+static int zfs_do_receive(int argc, char **argv);
+static int zfs_do_promote(int argc, char **argv);
+static int zfs_do_jail(int argc, char **argv);
+static int zfs_do_unjail(int argc, char **argv);
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.
+ */
+/* libumem hook: supplies the default value for the $UMEM_DEBUG setting. */
+const char *
+_umem_debug_init(void)
+{
+	static const char umem_debug_opts[] = "default,verbose";
+
+	return (umem_debug_opts);
+}
+
+/* libumem hook: supplies the default value for the $UMEM_LOGGING setting. */
+const char *
+_umem_logging_init(void)
+{
+	static const char umem_logging_opts[] = "fail,contents";
+
+	return (umem_logging_opts);
+}
+
+/*
+ * Identifies the usage text of one subcommand; get_usage() maps each value
+ * to its (translated) message.
+ */
+typedef enum {
+ HELP_CLONE,
+ HELP_CREATE,
+ HELP_DESTROY,
+ HELP_GET,
+ HELP_INHERIT,
+ HELP_JAIL,
+ HELP_UNJAIL,
+ HELP_LIST,
+ HELP_MOUNT,
+ HELP_PROMOTE,
+ HELP_RECEIVE,
+ HELP_RENAME,
+ HELP_ROLLBACK,
+ HELP_SEND,
+ HELP_SET,
+ HELP_SHARE,
+ HELP_SNAPSHOT,
+ HELP_UNMOUNT,
+ HELP_UNSHARE
+} zfs_help_t;
+
+/* One row of the master command table. */
+typedef struct zfs_command {
+ /* subcommand name; NULL marks a blank line in the usage output */
+ const char *name;
+ /* handler for the subcommand */
+ int (*func)(int argc, char **argv);
+ /* key for get_usage() */
+ zfs_help_t usage;
+} zfs_command_t;
+
+/*
+ * Master command table. Each ZFS command has a name, associated function, and
+ * usage message. The usage messages need to be internationalized, so we have
+ * to have a function to return the usage message based on a command index.
+ *
+ * These commands are organized according to how they are displayed in the usage
+ * message. An empty command (one with a NULL name) indicates an empty line in
+ * the generic usage message.
+ */
+static zfs_command_t command_table[] = {
+ { "create", zfs_do_create, HELP_CREATE },
+ { "destroy", zfs_do_destroy, HELP_DESTROY },
+ /* separator: usage() prints an empty line for a NULL name */
+ { NULL },
+ { "snapshot", zfs_do_snapshot, HELP_SNAPSHOT },
+ { "rollback", zfs_do_rollback, HELP_ROLLBACK },
+ { "clone", zfs_do_clone, HELP_CLONE },
+ { "promote", zfs_do_promote, HELP_PROMOTE },
+ { "rename", zfs_do_rename, HELP_RENAME },
+ { NULL },
+ { "list", zfs_do_list, HELP_LIST },
+ { NULL },
+ { "set", zfs_do_set, HELP_SET },
+ { "get", zfs_do_get, HELP_GET },
+ { "inherit", zfs_do_inherit, HELP_INHERIT },
+ { NULL },
+ { "mount", zfs_do_mount, HELP_MOUNT },
+ { NULL },
+ { "unmount", zfs_do_unmount, HELP_UNMOUNT },
+ { NULL },
+ { "share", zfs_do_share, HELP_SHARE },
+ { NULL },
+ { "unshare", zfs_do_unshare, HELP_UNSHARE },
+ { NULL },
+ { "send", zfs_do_send, HELP_SEND },
+ { "receive", zfs_do_receive, HELP_RECEIVE },
+ { NULL },
+ { "jail", zfs_do_jail, HELP_JAIL },
+ { "unjail", zfs_do_unjail, HELP_UNJAIL },
+};
+
+/* Number of rows in command_table, separators included. */
+#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0]))
+
+/*
+ * When non-NULL, usage() prints only this command's usage text; when NULL
+ * it prints the usage of every command in the table.
+ */
+zfs_command_t *current_command;
+
+/*
+ * Return the translated usage text for the subcommand identified by 'idx'.
+ * Every message starts with a tab and ends with a newline, and may span
+ * several lines.  Aborts if 'idx' is not one of the zfs_help_t values
+ * handled below.
+ */
+static const char *
+get_usage(zfs_help_t idx)
+{
+ switch (idx) {
+ case HELP_CLONE:
+ return (gettext("\tclone <snapshot> <filesystem|volume>\n"));
+ case HELP_CREATE:
+ return (gettext("\tcreate [[-o property=value] ... ] "
+ "<filesystem>\n"
+ "\tcreate [-s] [-b blocksize] [[-o property=value] ...]\n"
+ "\t -V <size> <volume>\n"));
+ case HELP_DESTROY:
+ return (gettext("\tdestroy [-rRf] "
+ "<filesystem|volume|snapshot>\n"));
+ case HELP_GET:
+ return (gettext("\tget [-rHp] [-o field[,field]...] "
+ "[-s source[,source]...]\n"
+ "\t <all | property[,property]...> "
+ "[filesystem|volume|snapshot] ...\n"));
+ case HELP_INHERIT:
+ return (gettext("\tinherit [-r] <property> "
+ "<filesystem|volume> ...\n"));
+ case HELP_JAIL:
+ return (gettext("\tjail <jailid> <filesystem>\n"));
+ case HELP_UNJAIL:
+ return (gettext("\tunjail <jailid> <filesystem>\n"));
+ case HELP_LIST:
+ return (gettext("\tlist [-rH] [-o property[,property]...] "
+ "[-t type[,type]...]\n"
+ "\t [-s property [-s property]...]"
+ " [-S property [-S property]...]\n"
+ "\t [filesystem|volume|snapshot] ...\n"));
+ case HELP_MOUNT:
+ return (gettext("\tmount\n"
+ "\tmount [-o opts] [-O] -a\n"
+ "\tmount [-o opts] [-O] <filesystem>\n"));
+ case HELP_PROMOTE:
+ return (gettext("\tpromote <clone filesystem>\n"));
+ case HELP_RECEIVE:
+ return (gettext("\treceive [-vnF] <filesystem|volume|"
+ "snapshot>\n"
+ "\treceive [-vnF] -d <filesystem>\n"));
+ case HELP_RENAME:
+ return (gettext("\trename <filesystem|volume|snapshot> "
+ "<filesystem|volume|snapshot>\n"));
+ case HELP_ROLLBACK:
+ return (gettext("\trollback [-rRf] <snapshot>\n"));
+ case HELP_SEND:
+ return (gettext("\tsend [-i <snapshot>] <snapshot>\n"));
+ case HELP_SET:
+ return (gettext("\tset <property=value> "
+ "<filesystem|volume> ...\n"));
+ case HELP_SHARE:
+ return (gettext("\tshare -a\n"
+ "\tshare <filesystem>\n"));
+ case HELP_SNAPSHOT:
+ return (gettext("\tsnapshot [-r] "
+ "<filesystem@name|volume@name>\n"));
+ case HELP_UNMOUNT:
+ return (gettext("\tunmount [-f] -a\n"
+ "\tunmount [-f] <filesystem|mountpoint>\n"));
+ case HELP_UNSHARE:
+ return (gettext("\tunshare [-f] -a\n"
+ "\tunshare [-f] <filesystem|mountpoint>\n"));
+ }
+
+ /* Only reached for a value outside the enumeration. */
+ abort();
+ /* NOTREACHED */
+}
+
+/*
+ * Allocation wrapper that never returns NULL: hands back 'size' bytes of
+ * zero-filled memory, or prints a diagnostic and exits with status 1 when
+ * the allocation fails.
+ */
+void *
+safe_malloc(size_t size)
+{
+	void *buf = calloc(1, size);
+
+	if (buf != NULL)
+		return (buf);
+
+	(void) fprintf(stderr, "internal error: out of memory\n");
+	exit(1);
+	/* NOTREACHED */
+}
+
+/*
+ * Callback routine that prints one row of the property table: the property
+ * name, whether it can be edited, whether it can be inherited, and its
+ * permitted values.  'cb' is the FILE stream to write to.  Always returns
+ * ZFS_PROP_CONT.
+ */
+static zfs_prop_t
+usage_prop_cb(zfs_prop_t prop, void *cb)
+{
+	FILE *out = cb;
+
+	(void) fprintf(out, "\t%-13s ", zfs_prop_to_name(prop));
+
+	/* EDIT column: read-only properties cannot be set. */
+	(void) fprintf(out, "%s", zfs_prop_readonly(prop) ? " NO " : " YES ");
+
+	/* INHERIT column. */
+	(void) fprintf(out, "%s",
+	    zfs_prop_inheritable(prop) ? " YES " : " NO ");
+
+	/* VALUES column: a dash when there is nothing to list. */
+	if (zfs_prop_values(prop) != NULL)
+		(void) fprintf(out, "%s\n", zfs_prop_values(prop));
+	else
+		(void) fprintf(out, "-\n");
+
+	return (ZFS_PROP_CONT);
+}
+
+/*
+ * Display usage message. If we're inside a command, display only the usage for
+ * that command. Otherwise, iterate over the entire command table and display
+ * a complete usage message.
+ *
+ * 'requested' is B_TRUE when the user asked for help: the text then goes to
+ * stdout and the exit status is 0. Otherwise it goes to stderr and the exit
+ * status is 2. This function does not return.
+ */
+static void
+usage(boolean_t requested)
+{
+ int i;
+ boolean_t show_properties = B_FALSE;
+ FILE *fp = requested ? stdout : stderr;
+
+ if (current_command == NULL) {
+
+ (void) fprintf(fp, gettext("usage: zfs command args ...\n"));
+ (void) fprintf(fp,
+ gettext("where 'command' is one of the following:\n\n"));
+
+ for (i = 0; i < NCOMMAND; i++) {
+ /* a NULL name is a separator row in command_table */
+ if (command_table[i].name == NULL)
+ (void) fprintf(fp, "\n");
+ else
+ (void) fprintf(fp, "%s",
+ get_usage(command_table[i].usage));
+ }
+
+ (void) fprintf(fp, gettext("\nEach dataset is of the form: "
+ "pool/[dataset/]*dataset[@name]\n"));
+ } else {
+ (void) fprintf(fp, gettext("usage:\n"));
+ (void) fprintf(fp, "%s", get_usage(current_command->usage));
+ }
+
+ /* The property table is only printed for the property-related commands. */
+ if (current_command != NULL &&
+ (strcmp(current_command->name, "set") == 0 ||
+ strcmp(current_command->name, "get") == 0 ||
+ strcmp(current_command->name, "inherit") == 0 ||
+ strcmp(current_command->name, "list") == 0))
+ show_properties = B_TRUE;
+
+ if (show_properties) {
+
+ (void) fprintf(fp,
+ gettext("\nThe following properties are supported:\n"));
+
+ (void) fprintf(fp, "\n\t%-13s %s %s %s\n\n",
+ "PROPERTY", "EDIT", "INHERIT", "VALUES");
+
+ /* Iterate over all properties */
+ (void) zfs_prop_iter(usage_prop_cb, fp, B_FALSE);
+
+ (void) fprintf(fp, gettext("\nSizes are specified in bytes "
+ "with standard units such as K, M, G, etc.\n"));
+ (void) fprintf(fp, gettext("\n\nUser-defined properties can "
+ "be specified by using a name containing a colon (:).\n"));
+ } else {
+ /*
+ * TRANSLATION NOTE:
+ * "zfs set|get" must not be localised this is the
+ * command name and arguments.
+ */
+ (void) fprintf(fp,
+ gettext("\nFor the property list, run: zfs set|get\n"));
+ }
+
+ /*
+ * See comments at end of main().
+ */
+ if (getenv("ZFS_ABORT") != NULL) {
+ (void) printf("dumping core by request\n");
+ abort();
+ }
+
+ exit(requested ? 0 : 2);
+}
+
+/*
+ * zfs clone <snapshot> <filesystem|volume>
+ *
+ * Given an existing snapshot, create a writable copy whose initial contents
+ * are the same as the source.  The newly created dataset maintains a
+ * dependency on the original; the original cannot be destroyed so long as
+ * the clone exists.
+ *
+ * Returns 0 on success and 1 on failure; argument errors end in usage().
+ */
+static int
+zfs_do_clone(int argc, char **argv)
+{
+	zfs_handle_t *snap;
+	zfs_handle_t *newfs;
+	int err;
+
+	/* This command takes no options at all. */
+	if (argc > 1 && argv[1][0] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(B_FALSE);
+	}
+
+	/* Exactly two operands are required: source and target. */
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing source dataset "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 3) {
+		(void) fprintf(stderr, gettext("missing target dataset "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 3) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/* The source has to be a snapshot. */
+	snap = zfs_open(g_zfs, argv[1], ZFS_TYPE_SNAPSHOT);
+	if (snap == NULL)
+		return (1);
+
+	/* Hand the actual work to libzfs. */
+	err = zfs_clone(snap, argv[2], NULL);
+	if (err != 0) {
+		zfs_close(snap);
+		return (1);
+	}
+
+	/* Mount and share the new dataset, then record the command. */
+	newfs = zfs_open(g_zfs, argv[2], ZFS_TYPE_ANY);
+	if (newfs != NULL) {
+		err = zfs_mount(newfs, NULL, 0);
+		if (err == 0)
+			err = zfs_share(newfs);
+		zfs_close(newfs);
+	}
+	zpool_log_history(g_zfs, argc, argv, argv[2], B_FALSE, B_FALSE);
+
+	zfs_close(snap);
+
+	return (err != 0);
+}
+
+/*
+ * zfs create [-o prop=value] ... fs
+ * zfs create [-s] [-b blocksize] [-o prop=value] ... -V size vol
+ *
+ * Create a new dataset.  This command can be used to create filesystems
+ * and volumes.  Snapshot creation is handled by 'zfs snapshot'.
+ * For volumes, the user must specify a size to be used.
+ *
+ * The '-s' flag applies only to volumes, and indicates that we should not try
+ * to set the reservation for this volume.  By default we set a reservation
+ * equal to the size for any volume.
+ *
+ * Returns 0 on success and 1 on failure (including the case where the
+ * dataset was created but could not be mounted or shared).  Usage errors
+ * end in usage().
+ */
+static int
+zfs_do_create(int argc, char **argv)
+{
+	zfs_type_t type = ZFS_TYPE_FILESYSTEM;
+	zfs_handle_t *zhp = NULL;
+	uint64_t volsize = 0;
+	int c;
+	boolean_t noreserve = B_FALSE;
+	int ret = 1;
+	nvlist_t *props = NULL;
+	uint64_t intval;
+	char *propname;
+	char *propval = NULL;
+	char *strval;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
+		(void) fprintf(stderr, gettext("internal error: "
+		    "out of memory\n"));
+		return (1);
+	}
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":V:b:so:")) != -1) {
+		switch (c) {
+		case 'V':
+			type = ZFS_TYPE_VOLUME;
+			if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
+				(void) fprintf(stderr, gettext("bad volume "
+				    "size '%s': %s\n"), optarg,
+				    libzfs_error_description(g_zfs));
+				goto error;
+			}
+
+			if (nvlist_add_uint64(props,
+			    zfs_prop_to_name(ZFS_PROP_VOLSIZE),
+			    intval) != 0) {
+				(void) fprintf(stderr, gettext("internal "
+				    "error: out of memory\n"));
+				goto error;
+			}
+			/* Remembered for the default reservation below. */
+			volsize = intval;
+			break;
+		case 'b':
+			if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
+				(void) fprintf(stderr, gettext("bad volume "
+				    "block size '%s': %s\n"), optarg,
+				    libzfs_error_description(g_zfs));
+				goto error;
+			}
+
+			if (nvlist_add_uint64(props,
+			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+			    intval) != 0) {
+				(void) fprintf(stderr, gettext("internal "
+				    "error: out of memory\n"));
+				goto error;
+			}
+			break;
+		case 'o':
+			propname = optarg;
+			if ((propval = strchr(propname, '=')) == NULL) {
+				(void) fprintf(stderr, gettext("missing "
+				    "'=' for -o option\n"));
+				goto error;
+			}
+			/* Split "name=value" in place for the lookups. */
+			*propval = '\0';
+			propval++;
+			if (nvlist_lookup_string(props, propname,
+			    &strval) == 0) {
+				(void) fprintf(stderr, gettext("property '%s' "
+				    "specified multiple times\n"), propname);
+				goto error;
+			}
+			if (nvlist_add_string(props, propname, propval) != 0) {
+				(void) fprintf(stderr, gettext("internal "
+				    "error: out of memory\n"));
+				goto error;
+			}
+			/*
+			 * The nvlist keeps its own copy of the name and
+			 * value, so put the '=' back right away.  Doing it
+			 * here, for every -o option, keeps the whole command
+			 * line intact for zpool_log_history() below.
+			 */
+			*(propval - 1) = '=';
+			break;
+		case 's':
+			noreserve = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing size "
+			    "argument\n"));
+			goto badusage;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			goto badusage;
+		}
+	}
+
+	if (noreserve && type != ZFS_TYPE_VOLUME) {
+		(void) fprintf(stderr, gettext("'-s' can only be used when "
+		    "creating a volume\n"));
+		goto badusage;
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc == 0) {
+		(void) fprintf(stderr, gettext("missing %s argument\n"),
+		    zfs_type_to_name(type));
+		goto badusage;
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		goto badusage;
+	}
+
+	/*
+	 * Volumes get a reservation equal to their size unless '-s' was
+	 * given or the user supplied a reservation with '-o'.
+	 */
+	if (type == ZFS_TYPE_VOLUME && !noreserve &&
+	    nvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_RESERVATION),
+	    &strval) != 0) {
+		if (nvlist_add_uint64(props,
+		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
+		    volsize) != 0) {
+			(void) fprintf(stderr, gettext("internal "
+			    "error: out of memory\n"));
+			nvlist_free(props);
+			return (1);
+		}
+	}
+
+	/* pass to libzfs */
+	if (zfs_create(g_zfs, argv[0], type, props) != 0)
+		goto error;
+
+	/* Log the command line as the user typed it (undo the optind shift). */
+	zpool_log_history(g_zfs, argc + optind, argv - optind, argv[0],
+	    B_FALSE, B_FALSE);
+
+	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_ANY)) == NULL)
+		goto error;
+
+	/*
+	 * Mount and/or share the new filesystem as appropriate.  We provide a
+	 * verbose error message to let the user know that their filesystem was
+	 * in fact created, even if we failed to mount or share it.
+	 */
+	if (zfs_mount(zhp, NULL, 0) != 0) {
+		(void) fprintf(stderr, gettext("filesystem successfully "
+		    "created, but not mounted\n"));
+		ret = 1;
+	} else if (zfs_share(zhp) != 0) {
+		(void) fprintf(stderr, gettext("filesystem successfully "
+		    "created, but not shared\n"));
+		ret = 1;
+	} else {
+		ret = 0;
+	}
+
+error:
+	if (zhp)
+		zfs_close(zhp);
+	nvlist_free(props);
+	return (ret);
+badusage:
+	nvlist_free(props);
+	usage(B_FALSE);
+	return (2);
+}
+
+/*
+ * zfs destroy [-rRf] <fs, snap, vol>
+ *
+ * -r Recursively destroy all children
+ * -R Recursively destroy all dependents, including clones
+ * -f Force unmounting of any dependents
+ *
+ * Destroys the given dataset. By default, it will unmount any filesystems,
+ * and refuse to destroy a dataset that has any dependents. A dependent can
+ * either be a child, or a clone of a child.
+ */
+typedef struct destroy_cbdata {
+ /* B_TRUE until the first blocking dependent has been reported */
+ boolean_t cb_first;
+ /* -f: unmount with MS_FORCE */
+ int cb_force;
+ /* -r or -R was given */
+ int cb_recurse;
+ /* set once a blocking dependent has been reported */
+ int cb_error;
+ /* NOTE(review): not referenced in the part of the file visible here */
+ int cb_needforce;
+ /* -R was given */
+ int cb_doclones;
+ /* destroy_snap_clones(): whether to close the handle it was passed */
+ boolean_t cb_closezhp;
+ /* the dataset named on the command line */
+ zfs_handle_t *cb_target;
+ /* snapshot name (the text after '@') for a recursive snapshot destroy */
+ char *cb_snapname;
+} destroy_cbdata_t;
+
+/*
+ * Check for any dependents based on the '-r' or '-R' flags.
+ *
+ * Called for each dependent of cb_target. Prints every dependent that
+ * prevents the destroy, preceded once by an explanatory header, and sets
+ * cb_error. Always closes 'zhp' and returns 0 so the iteration continues
+ * and all blockers are listed.
+ */
+static int
+destroy_check_dependent(zfs_handle_t *zhp, void *data)
+{
+ destroy_cbdata_t *cbp = data;
+ const char *tname = zfs_get_name(cbp->cb_target);
+ const char *name = zfs_get_name(zhp);
+
+ /* 'name' lies below the target if it is "<target>/..." or "<target>@..." */
+ if (strncmp(tname, name, strlen(tname)) == 0 &&
+ (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) {
+ /*
+ * This is a direct descendant, not a clone somewhere else in
+ * the hierarchy.
+ */
+ if (cbp->cb_recurse)
+ goto out;
+
+ if (cbp->cb_first) {
+ (void) fprintf(stderr, gettext("cannot destroy '%s': "
+ "%s has children\n"),
+ zfs_get_name(cbp->cb_target),
+ zfs_type_to_name(zfs_get_type(cbp->cb_target)));
+ (void) fprintf(stderr, gettext("use '-r' to destroy "
+ "the following datasets:\n"));
+ cbp->cb_first = B_FALSE;
+ cbp->cb_error = 1;
+ }
+
+ (void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+ } else {
+ /*
+ * This is a clone. We only report it when '-r' was specified
+ * or the target is a snapshot; otherwise it is skipped here.
+ */
+ if (!cbp->cb_recurse &&
+ zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT)
+ goto out;
+
+ if (cbp->cb_first) {
+ (void) fprintf(stderr, gettext("cannot destroy '%s': "
+ "%s has dependent clones\n"),
+ zfs_get_name(cbp->cb_target),
+ zfs_type_to_name(zfs_get_type(cbp->cb_target)));
+ (void) fprintf(stderr, gettext("use '-R' to destroy "
+ "the following datasets:\n"));
+ cbp->cb_first = B_FALSE;
+ cbp->cb_error = 1;
+ }
+
+ (void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+ }
+
+out:
+ zfs_close(zhp);
+ return (0);
+}
+
+/*
+ * Unmount and destroy one dataset.  The handle is closed in every case.
+ * Returns 0 on success (or when the dataset is a pool, which is skipped)
+ * and -1 as soon as unmounting or destroying fails.
+ */
+static int
+destroy_callback(zfs_handle_t *zhp, void *data)
+{
+	destroy_cbdata_t *cbp = data;
+	int rv = 0;
+
+	/*
+	 * Ignore pools (which we've already flagged as an error before
+	 * getting here).  A pool is a filesystem whose name has no '/'.
+	 */
+	if (strchr(zfs_get_name(zhp), '/') != NULL ||
+	    zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) {
+		int umount_flags = cbp->cb_force ? MS_FORCE : 0;
+
+		/*
+		 * Bail out on the first error.
+		 */
+		if (zfs_unmount(zhp, NULL, umount_flags) != 0 ||
+		    zfs_destroy(zhp) != 0)
+			rv = -1;
+	}
+
+	zfs_close(zhp);
+	return (rv);
+}
+
+/*
+ * For the dataset 'zhp' and, recursively, each of its child filesystems:
+ * if a snapshot named cb_snapname exists on it, destroy everything that
+ * depends on that snapshot using destroy_callback().
+ *
+ * The handle passed to the outermost call is left open for the caller
+ * (cb_closezhp is still B_FALSE then); handles received through the
+ * recursive zfs_iter_filesystems() calls are closed here.
+ *
+ * Returns 0 on success and non-zero if destroying a dependent failed.
+ */
+static int
+destroy_snap_clones(zfs_handle_t *zhp, void *arg)
+{
+ destroy_cbdata_t *cbp = arg;
+ char thissnap[MAXPATHLEN];
+ zfs_handle_t *szhp;
+ /* sample the flag now: it is set to B_TRUE below before recursing */
+ boolean_t closezhp = cbp->cb_closezhp;
+ int rv;
+
+ (void) snprintf(thissnap, sizeof (thissnap),
+ "%s@%s", zfs_get_name(zhp), cbp->cb_snapname);
+
+ /* the snapshot may not exist on this dataset, so open it quietly */
+ libzfs_print_on_error(g_zfs, B_FALSE);
+ szhp = zfs_open(g_zfs, thissnap, ZFS_TYPE_SNAPSHOT);
+ libzfs_print_on_error(g_zfs, B_TRUE);
+ if (szhp) {
+ /*
+ * Destroy any clones of this snapshot
+ */
+ if (zfs_iter_dependents(szhp, B_FALSE, destroy_callback,
+ cbp) != 0) {
+ zfs_close(szhp);
+ if (closezhp)
+ zfs_close(zhp);
+ return (-1);
+ }
+ zfs_close(szhp);
+ }
+
+ /* every handle seen from here on came from the iterator: close it */
+ cbp->cb_closezhp = B_TRUE;
+ rv = zfs_iter_filesystems(zhp, destroy_snap_clones, arg);
+ if (closezhp)
+ zfs_close(zhp);
+ return (rv);
+}
+/*
+ * Entry point for the 'destroy' subcommand. Options parsed here:
+ *
+ * -f sets cb_force (destroy_callback() then unmounts with MS_FORCE)
+ * -r sets cb_recurse
+ * -R sets cb_recurse and cb_doclones
+ *
+ * Exactly one dataset argument is required. With a recursive option and a
+ * name containing '@', the snapshot of that name is destroyed on the named
+ * dataset through zfs_destroy_snaps(); otherwise the dataset's dependents
+ * are checked and destroyed, followed by the dataset itself.
+ *
+ * Returns 0 on success and 1 on failure. A successful destroy is recorded
+ * with zpool_log_history().
+ */
+static int
+zfs_do_destroy(int argc, char **argv)
+{
+ destroy_cbdata_t cb = { 0 };
+ int c;
+ zfs_handle_t *zhp;
+ char *cp;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "frR")) != -1) {
+ switch (c) {
+ case 'f':
+ cb.cb_force = 1;
+ break;
+ case 'r':
+ cb.cb_recurse = 1;
+ break;
+ case 'R':
+ cb.cb_recurse = 1;
+ cb.cb_doclones = 1;
+ break;
+ case '?':
+ default:
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check number of arguments */
+ if (argc == 0) {
+ (void) fprintf(stderr, gettext("missing path argument\n"));
+ usage(B_FALSE);
+ }
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ /*
+ * If we are doing recursive destroy of a snapshot, then the
+ * named snapshot may not exist. Go straight to libzfs.
+ */
+ if (cb.cb_recurse && (cp = strchr(argv[0], '@'))) {
+ int ret;
+
+ *cp = '\0'; /* temporarily cut argv[0] at '@' to open the parent dataset */
+ if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_ANY)) == NULL)
+ return (1);
+ *cp = '@'; /* restore the full name */
+ cp++; /* cp now points at the bare snapshot name */
+
+ if (cb.cb_doclones) {
+ cb.cb_snapname = cp;
+ if (destroy_snap_clones(zhp, &cb) != 0) {
+ zfs_close(zhp);
+ return (1);
+ }
+ }
+
+ ret = zfs_destroy_snaps(zhp, cp);
+ zfs_close(zhp);
+ if (ret) {
+ (void) fprintf(stderr,
+ gettext("no snapshots destroyed\n"));
+ } else { /* undo the argc/argv shift so the whole command line is logged */
+ zpool_log_history(g_zfs, argc + optind, argv - optind,
+ argv[0], B_FALSE, B_FALSE);
+ }
+ return (ret != 0);
+ }
+
+
+ /* Open the given dataset */
+ if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_ANY)) == NULL)
+ return (1);
+
+ cb.cb_target = zhp;
+
+ /*
+ * Perform an explicit check for pools before going any further.
+ */
+ if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL &&
+ zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
+ (void) fprintf(stderr, gettext("cannot destroy '%s': "
+ "operation does not apply to pools\n"),
+ zfs_get_name(zhp));
+ (void) fprintf(stderr, gettext("use 'zfs destroy -r "
+ "%s' to destroy all datasets in the pool\n"),
+ zfs_get_name(zhp));
+ (void) fprintf(stderr, gettext("use 'zpool destroy %s' "
+ "to destroy the pool itself\n"), zfs_get_name(zhp));
+ zfs_close(zhp);
+ return (1);
+ }
+
+ /*
+ * Check for any dependents and/or clones.
+ */
+ cb.cb_first = B_TRUE;
+ if (!cb.cb_doclones && /* the check pass is skipped entirely for -R */
+ zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
+ &cb) != 0) {
+ zfs_close(zhp);
+ return (1);
+ }
+
+
+ if (cb.cb_error || /* only destroy dependents when no error was flagged */
+ zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0) {
+ zfs_close(zhp);
+ return (1);
+ }
+
+ /*
+ * Do the real thing. The callback will close the handle regardless of
+ * whether it succeeds or not.
+ */
+ if (destroy_callback(zhp, &cb) != 0)
+ return (1);
+
+ zpool_log_history(g_zfs, argc + optind, argv - optind, argv[0],
+ B_FALSE, B_FALSE);
+
+ return (0);
+}
+
+/*
+ * zfs get [-rHp] [-o field[,field]...] [-s source[,source]...]
+ * < all | property[,property]... > < fs | snap | vol > ...
+ *
+ * -r recurse over any child datasets
+ * -H scripted mode. Headers are stripped, and fields are separated
+ * by tabs instead of spaces.
+ * -o Set of fields to display. One of "name,property,value,source".
+ * Default is all four.
+ * -s Set of sources to allow. One of
+ * "local,default,inherited,temporary,none". Default is all
+ * five.
+ * -p Display values in parsable (literal) format.
+ *
+ * Prints properties for the given datasets. The user can control which
+ * columns to display as well as which property types to allow.
+ */
+
+/*
+ * Invoked to display the properties for a single dataset.
+ *
+ * 'data' is a libzfs_get_cbdata_t. Walks cb_proplist and prints one line
+ * per entry with libzfs_print_one_property(). Native properties (pl_prop !=
+ * ZFS_PROP_INVAL) are read with zfs_prop_get(); all other entries are looked
+ * up by pl_user_prop in the nvlist returned by zfs_get_user_props().
+ *
+ * A value that cannot be read is skipped when the entry has pl_all set and
+ * is otherwise printed as "-" with source ZFS_SRC_NONE; a native property
+ * that is not valid for any dataset type is reported on stderr instead.
+ *
+ * Always returns 0.
+ */
+static int
+get_callback(zfs_handle_t *zhp, void *data)
+{
+ char buf[ZFS_MAXPROPLEN];
+ zfs_source_t sourcetype;
+ char source[ZFS_MAXNAMELEN];
+ libzfs_get_cbdata_t *cbp = data;
+ nvlist_t *userprop = zfs_get_user_props(zhp);
+ zfs_proplist_t *pl = cbp->cb_proplist;
+ nvlist_t *propval;
+ char *strval;
+ char *sourceval;
+
+ for (; pl != NULL; pl = pl->pl_next) {
+ /*
+ * Skip the special fake placeholder. This will also skip over
+ * the name property when 'all' is specified.
+ */
+ if (pl->pl_prop == ZFS_PROP_NAME &&
+ pl == cbp->cb_proplist)
+ continue;
+
+ if (pl->pl_prop != ZFS_PROP_INVAL) {
+ if (zfs_prop_get(zhp, pl->pl_prop, buf,
+ sizeof (buf), &sourcetype, source,
+ sizeof (source),
+ cbp->cb_literal) != 0) {
+ if (pl->pl_all)
+ continue;
+ if (!zfs_prop_valid_for_type(pl->pl_prop,
+ ZFS_TYPE_ANY)) {
+ (void) fprintf(stderr,
+ gettext("No such property '%s'\n"),
+ zfs_prop_to_name(pl->pl_prop));
+ continue;
+ }
+ sourcetype = ZFS_SRC_NONE; /* lookup failed: print "-" with no source */
+ (void) strlcpy(buf, "-", sizeof (buf));
+ }
+
+ libzfs_print_one_property(zfs_get_name(zhp), cbp,
+ zfs_prop_to_name(pl->pl_prop),
+ buf, sourcetype, source);
+ } else {
+ if (nvlist_lookup_nvlist(userprop,
+ pl->pl_user_prop, &propval) != 0) {
+ if (pl->pl_all)
+ continue;
+ sourcetype = ZFS_SRC_NONE;
+ strval = "-";
+ } else {
+ verify(nvlist_lookup_string(propval,
+ ZFS_PROP_VALUE, &strval) == 0);
+ verify(nvlist_lookup_string(propval,
+ ZFS_PROP_SOURCE, &sourceval) == 0);
+
+ if (strcmp(sourceval, /* source equal to this dataset's name means local */
+ zfs_get_name(zhp)) == 0) {
+ sourcetype = ZFS_SRC_LOCAL;
+ } else {
+ sourcetype = ZFS_SRC_INHERITED;
+ (void) strlcpy(source,
+ sourceval, sizeof (source));
+ }
+ }
+
+ libzfs_print_one_property(zfs_get_name(zhp), cbp,
+ pl->pl_user_prop, strval, sourcetype,
+ source);
+ }
+ }
+
+ return (0);
+}
+/*
+ * Entry point for 'zfs get'. Parses -r, -H, -p, -o and -s, builds the
+ * property list from the first operand with zfs_get_proplist(), and then
+ * runs get_callback() over the remaining operands through zfs_for_each().
+ * Returns the zfs_for_each() result.
+ */
+static int
+zfs_do_get(int argc, char **argv)
+{
+ libzfs_get_cbdata_t cb = { 0 };
+ boolean_t recurse = B_FALSE;
+ int i, c;
+ char *value, *fields;
+ int ret;
+ zfs_proplist_t fake_name = { 0 }; /* stack-allocated list head prepended below */
+
+ /*
+ * Set up default columns and sources.
+ */
+ cb.cb_sources = ZFS_SRC_ALL;
+ cb.cb_columns[0] = GET_COL_NAME;
+ cb.cb_columns[1] = GET_COL_PROPERTY;
+ cb.cb_columns[2] = GET_COL_VALUE;
+ cb.cb_columns[3] = GET_COL_SOURCE;
+
+ /* check options */
+ while ((c = getopt(argc, argv, ":o:s:rHp")) != -1) {
+ switch (c) {
+ case 'p':
+ cb.cb_literal = B_TRUE;
+ break;
+ case 'r':
+ recurse = B_TRUE;
+ break;
+ case 'H':
+ cb.cb_scripted = B_TRUE;
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case 'o':
+ /*
+ * Process the set of columns to display. We zero out
+ * the structure to give us a blank slate.
+ */
+ bzero(&cb.cb_columns, sizeof (cb.cb_columns));
+ i = 0;
+ while (*optarg != '\0') {
+ static char *col_subopts[] =
+ { "name", "property", "value", "source",
+ NULL };
+
+ if (i == 4) { /* at most four columns are accepted */
+ (void) fprintf(stderr, gettext("too "
+ "many fields given to -o "
+ "option\n"));
+ usage(B_FALSE);
+ }
+
+ switch (getsubopt(&optarg, col_subopts, /* result is an index into col_subopts */
+ &value)) {
+ case 0:
+ cb.cb_columns[i++] = GET_COL_NAME;
+ break;
+ case 1:
+ cb.cb_columns[i++] = GET_COL_PROPERTY;
+ break;
+ case 2:
+ cb.cb_columns[i++] = GET_COL_VALUE;
+ break;
+ case 3:
+ cb.cb_columns[i++] = GET_COL_SOURCE;
+ break;
+ default:
+ (void) fprintf(stderr,
+ gettext("invalid column name "
+ "'%s'\n"), value);
+ usage(B_FALSE);
+ }
+ }
+ break;
+
+ case 's':
+ cb.cb_sources = 0; /* start empty and OR in each source named */
+ while (*optarg != '\0') {
+ static char *source_subopts[] = {
+ "local", "default", "inherited",
+ "temporary", "none", NULL };
+
+ switch (getsubopt(&optarg, source_subopts,
+ &value)) {
+ case 0:
+ cb.cb_sources |= ZFS_SRC_LOCAL;
+ break;
+ case 1:
+ cb.cb_sources |= ZFS_SRC_DEFAULT;
+ break;
+ case 2:
+ cb.cb_sources |= ZFS_SRC_INHERITED;
+ break;
+ case 3:
+ cb.cb_sources |= ZFS_SRC_TEMPORARY;
+ break;
+ case 4:
+ cb.cb_sources |= ZFS_SRC_NONE;
+ break;
+ default:
+ (void) fprintf(stderr,
+ gettext("invalid source "
+ "'%s'\n"), value);
+ usage(B_FALSE);
+ }
+ }
+ break;
+
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing property "
+ "argument\n"));
+ usage(B_FALSE);
+ }
+
+ fields = argv[0]; /* the first operand is the property list */
+
+ if (zfs_get_proplist(g_zfs, fields, &cb.cb_proplist) != 0)
+ usage(B_FALSE);
+
+ argc--; /* what remains are the dataset operands */
+ argv++;
+
+ /*
+ * As part of zfs_expand_proplist(), we keep track of the maximum column
+ * width for each property. For the 'NAME' (and 'SOURCE') columns, we
+ * need to know the maximum name length. However, the user likely did
+ * not specify 'name' as one of the properties to fetch, so we need to
+ * make sure we always include at least this property for
+ * print_get_headers() to work properly.
+ */
+ if (cb.cb_proplist != NULL) {
+ fake_name.pl_prop = ZFS_PROP_NAME;
+ fake_name.pl_width = strlen(gettext("NAME"));
+ fake_name.pl_next = cb.cb_proplist;
+ cb.cb_proplist = &fake_name;
+ }
+
+ cb.cb_first = B_TRUE;
+
+ /* run for each object */
+ ret = zfs_for_each(argc, argv, recurse, ZFS_TYPE_ANY, NULL,
+ &cb.cb_proplist, get_callback, &cb, B_FALSE);
+
+ if (cb.cb_proplist == &fake_name) /* fake_name is on the stack: free only what follows it */
+ zfs_free_proplist(fake_name.pl_next);
+ else
+ zfs_free_proplist(cb.cb_proplist);
+
+ return (ret);
+}
+
+/*
+ * inherit [-r] <property> <fs|vol> ...
+ *
+ * -r Recurse over all children
+ *
+ * For each dataset specified on the command line, inherit the given property
+ * from its parent. Inheriting a property at the pool level will cause it to
+ * use the default value. The '-r' flag will recurse over all children, and is
+ * useful for setting a property on a hierarchy-wide basis, regardless of any
+ * local modifications for each dataset.
+ */
+typedef struct inherit_cbdata {
+ char *cb_propname; /* name of the property passed to zfs_prop_inherit() */
+ boolean_t cb_any_successful; /* set once any dataset was updated successfully */
+} inherit_cbdata_t;
+
+static int
+inherit_callback(zfs_handle_t *zhp, void *data)
+{
+	inherit_cbdata_t *state = data;
+
+	/* Any failure of zfs_prop_inherit() is collapsed to 1. */
+	if (zfs_prop_inherit(zhp, state->cb_propname) != 0)
+		return (1);
+
+	/* Remember the success so that the caller logs the command. */
+	state->cb_any_successful = B_TRUE;
+	return (0);
+}
+
+static int
+zfs_do_inherit(int argc, char **argv)
+{
+	inherit_cbdata_t cb;
+	zfs_prop_t propid;
+	boolean_t descend = B_FALSE;
+	int opt;
+	int rv;
+
+	/* The only flag is -r; anything else is a usage error. */
+	while ((opt = getopt(argc, argv, "r")) != -1) {
+		if (opt == 'r') {
+			descend = B_TRUE;
+		} else {
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	/* A property name and at least one dataset are both required. */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing property argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing dataset argument\n"));
+		usage(B_FALSE);
+	}
+
+	/* Consume the property name; the rest is the dataset list. */
+	cb.cb_propname = argv[0];
+	argv++;
+	argc--;
+
+	propid = zfs_name_to_prop(cb.cb_propname);
+	if (propid == ZFS_PROP_INVAL) {
+		/* Not a native property, so it has to be a user property. */
+		if (!zfs_prop_user(cb.cb_propname)) {
+			(void) fprintf(stderr,
+			    gettext("invalid property '%s'\n"),
+			    cb.cb_propname);
+			usage(B_FALSE);
+		}
+	} else if (zfs_prop_readonly(propid)) {
+		(void) fprintf(stderr,
+		    gettext("%s property is read-only\n"), cb.cb_propname);
+		return (1);
+	} else if (!zfs_prop_inheritable(propid)) {
+		(void) fprintf(stderr,
+		    gettext("'%s' property cannot be inherited\n"),
+		    cb.cb_propname);
+		/* quota and reservation are cleared with 'zfs set' instead */
+		if (propid == ZFS_PROP_QUOTA || propid == ZFS_PROP_RESERVATION) {
+			(void) fprintf(stderr,
+			    gettext("use 'zfs set %s=none' to clear\n"),
+			    cb.cb_propname);
+		}
+		return (1);
+	}
+
+	cb.cb_any_successful = B_FALSE;
+
+	rv = zfs_for_each(argc, argv, descend,
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, NULL, NULL,
+	    inherit_callback, &cb, B_FALSE);
+
+	/*
+	 * Log the full command line: undo both the getopt shift and the
+	 * extra shift past the property name.
+	 */
+	if (cb.cb_any_successful) {
+		zpool_log_history(g_zfs, argc + optind + 1, argv - optind - 1,
+		    argv[0], B_FALSE, B_FALSE);
+	}
+
+	return (rv);
+}
+
+/*
+ * list [-rH] [-o property[,property]...] [-t type[,type]...]
+ * [-s property [-s property]...] [-S property [-S property]...]
+ * <dataset> ...
+ *
+ * -r Recurse over all children
+ * -H Scripted mode; elide headers and separate columns by tabs
+ * -o Control which fields to display.
+ * -t Control which object types to display.
+ * -s Specify sort columns, ascending order.
+ * -S Specify sort columns, descending order.
+ *
+ * When given no arguments, lists all filesystems in the system.
+ * Otherwise, list the specified datasets, optionally recursing down them if
+ * '-r' is specified.
+ */
+typedef struct list_cbdata {
+ boolean_t cb_first; /* B_TRUE until list_callback() has handled its first dataset */
+ boolean_t cb_scripted; /* -H: no header line, tab-separated columns */
+ zfs_proplist_t *cb_proplist; /* columns to print, in order */
+} list_cbdata_t;
+
+/*
+ * Given a list of columns to display, output appropriate headers for each one.
+ *
+ * Native properties use zfs_prop_column_name() and the justification given
+ * by zfs_prop_align_right(); user properties are printed as their upper-cased
+ * name, left-justified.  Each header is padded to pl_width, except that the
+ * last column is printed without padding when it is left-justified.
+ */
+static void
+print_header(zfs_proplist_t *pl)
+{
+	char headerbuf[ZFS_MAXPROPLEN];
+	const char *header;
+	size_t i;
+	boolean_t first = B_TRUE;
+	boolean_t right_justify;
+
+	for (; pl != NULL; pl = pl->pl_next) {
+		if (!first) {
+			(void) printf(" ");
+		} else {
+			first = B_FALSE;
+		}
+
+		right_justify = B_FALSE;
+		if (pl->pl_prop != ZFS_PROP_INVAL) {
+			header = zfs_prop_column_name(pl->pl_prop);
+			right_justify = zfs_prop_align_right(pl->pl_prop);
+		} else {
+			/*
+			 * Upper-case the user property name.  The copy is
+			 * bounded by the buffer size, and each character is
+			 * converted to unsigned char first because passing a
+			 * negative char value to toupper() is undefined.
+			 */
+			for (i = 0; i < sizeof (headerbuf) - 1 &&
+			    pl->pl_user_prop[i] != '\0'; i++) {
+				headerbuf[i] = (char)toupper(
+				    (unsigned char)pl->pl_user_prop[i]);
+			}
+			headerbuf[i] = '\0';
+			header = headerbuf;
+		}
+
+		if (pl->pl_next == NULL && !right_justify)
+			(void) printf("%s", header);
+		else if (right_justify)
+			(void) printf("%*s", pl->pl_width, header);
+		else
+			(void) printf("%-*s", pl->pl_width, header);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Print one output row for 'zhp': the value of every property in 'pl', in
+ * list order.  In scripted mode the values are tab-separated and unpadded;
+ * otherwise each value is padded to the column's pl_width, right- or
+ * left-justified to match the header.  Values that cannot be read are shown
+ * as "-".
+ */
+static void
+print_dataset(zfs_handle_t *zhp, zfs_proplist_t *pl, int scripted)
+{
+	char valbuf[ZFS_MAXPROPLEN];
+	nvlist_t *userprops = zfs_get_user_props(zhp);
+	nvlist_t *entry;
+	char *text;
+	boolean_t align_right;
+	boolean_t need_sep = B_FALSE;
+
+	while (pl != NULL) {
+		/* Separator goes between columns, never before the first. */
+		if (need_sep) {
+			if (scripted)
+				(void) printf("\t");
+			else
+				(void) printf(" ");
+		}
+		need_sep = B_TRUE;
+
+		align_right = B_FALSE;
+		if (pl->pl_prop == ZFS_PROP_INVAL) {
+			/* User property: value lives in the user nvlist. */
+			if (nvlist_lookup_nvlist(userprops,
+			    pl->pl_user_prop, &entry) == 0) {
+				verify(nvlist_lookup_string(entry,
+				    ZFS_PROP_VALUE, &text) == 0);
+			} else {
+				text = "-";
+			}
+		} else {
+			/* Native property. */
+			if (zfs_prop_get(zhp, pl->pl_prop, valbuf,
+			    sizeof (valbuf), NULL, NULL, 0, B_FALSE) == 0)
+				text = valbuf;
+			else
+				text = "-";
+
+			align_right = zfs_prop_align_right(pl->pl_prop);
+		}
+
+		/*
+		 * No width specifier in scripted mode, nor for a
+		 * left-justified final column.
+		 */
+		if (scripted || (pl->pl_next == NULL && !align_right))
+			(void) printf("%s", text);
+		else if (align_right)
+			(void) printf("%*s", pl->pl_width, text);
+		else
+			(void) printf("%-*s", pl->pl_width, text);
+
+		pl = pl->pl_next;
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * zfs_for_each() callback for 'zfs list': prints the header line ahead of
+ * the first dataset (unless in scripted mode) and then the dataset's row.
+ */
+static int
+list_callback(zfs_handle_t *zhp, void *data)
+{
+	list_cbdata_t *state = data;
+
+	if (state->cb_first && !state->cb_scripted)
+		print_header(state->cb_proplist);
+
+	/* From here on at least one dataset has been seen. */
+	state->cb_first = B_FALSE;
+
+	print_dataset(zhp, state->cb_proplist, state->cb_scripted);
+
+	return (0);
+}
+/*
+ * Entry point for 'zfs list'. Parses -o, -r, -H, -t, -s and -S, builds the
+ * column list with zfs_get_proplist() (default_fields when -o is absent),
+ * and prints every matching dataset through zfs_for_each() with
+ * list_callback(). Returns the zfs_for_each() result.
+ */
+static int
+zfs_do_list(int argc, char **argv)
+{
+ int c;
+ boolean_t recurse = B_FALSE;
+ boolean_t scripted = B_FALSE;
+ static char default_fields[] =
+ "name,used,available,referenced,mountpoint";
+ int types = ZFS_TYPE_ANY;
+ char *fields = NULL;
+ char *basic_fields = default_fields;
+ list_cbdata_t cb = { 0 };
+ char *value;
+ int ret;
+ char *type_subopts[] = { "filesystem", "volume", "snapshot", NULL };
+ zfs_sort_column_t *sortcol = NULL;
+
+ /* check options */
+ while ((c = getopt(argc, argv, ":o:rt:Hs:S:")) != -1) {
+ switch (c) {
+ case 'o':
+ fields = optarg;
+ break;
+ case 'r':
+ recurse = B_TRUE;
+ break;
+ case 'H':
+ scripted = B_TRUE;
+ break;
+ case 's':
+ if (zfs_add_sort_column(&sortcol, optarg,
+ B_FALSE) != 0) {
+ (void) fprintf(stderr,
+ gettext("invalid property '%s'\n"), optarg);
+ usage(B_FALSE);
+ }
+ break;
+ case 'S':
+ if (zfs_add_sort_column(&sortcol, optarg,
+ B_TRUE) != 0) {
+ (void) fprintf(stderr,
+ gettext("invalid property '%s'\n"), optarg);
+ usage(B_FALSE);
+ }
+ break;
+ case 't':
+ types = 0; /* -t replaces the default ZFS_TYPE_ANY mask */
+ while (*optarg != '\0') {
+ switch (getsubopt(&optarg, type_subopts, /* result is an index into type_subopts */
+ &value)) {
+ case 0:
+ types |= ZFS_TYPE_FILESYSTEM;
+ break;
+ case 1:
+ types |= ZFS_TYPE_VOLUME;
+ break;
+ case 2:
+ types |= ZFS_TYPE_SNAPSHOT;
+ break;
+ default:
+ (void) fprintf(stderr,
+ gettext("invalid type '%s'\n"),
+ value);
+ usage(B_FALSE);
+ }
+ }
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (fields == NULL) /* no -o given: fall back to default_fields */
+ fields = basic_fields;
+
+ /*
+ * If the user specifies '-o all', the zfs_get_proplist() doesn't
+ * normally include the name of the dataset. For 'zfs list', we always
+ * want this property to be first.
+ */
+ if (zfs_get_proplist(g_zfs, fields, &cb.cb_proplist) != 0)
+ usage(B_FALSE);
+
+ cb.cb_scripted = scripted;
+ cb.cb_first = B_TRUE;
+
+ ret = zfs_for_each(argc, argv, recurse, types, sortcol, &cb.cb_proplist,
+ list_callback, &cb, B_TRUE);
+
+ zfs_free_proplist(cb.cb_proplist);
+ zfs_free_sort_columns(sortcol);
+
+ if (ret == 0 && cb.cb_first) /* cb_first still set: list_callback() never ran */
+ (void) printf(gettext("no datasets available\n"));
+
+ return (ret);
+}
+
+/*
+ * zfs rename <fs | snap | vol> <fs | snap | vol>
+ *
+ * Renames the given dataset to another of the same type.
+ */
+/* ARGSUSED */
+static int
+zfs_do_rename(int argc, char **argv)
+{
+	zfs_handle_t *src;
+	int failed;
+
+	/* This subcommand takes no options at all. */
+	if (argc > 1 && argv[1][0] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(B_FALSE);
+	}
+
+	/* Exactly two operands: the source and the target name. */
+	if (argc < 2) {
+		(void) fprintf(stderr,
+		    gettext("missing source dataset argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 3) {
+		(void) fprintf(stderr,
+		    gettext("missing target dataset argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 3) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	src = zfs_open(g_zfs, argv[1], ZFS_TYPE_ANY);
+	if (src == NULL)
+		return (1);
+
+	/* Only a successful rename is recorded in the pool history. */
+	failed = (zfs_rename(src, argv[2]) != 0);
+	if (failed == 0)
+		zpool_log_history(g_zfs, argc, argv, argv[2], B_FALSE, B_FALSE);
+
+	zfs_close(src);
+	return (failed);
+}
+
+/*
+ * zfs promote <fs>
+ *
+ * Promotes the given clone fs to be the parent
+ */
+/* ARGSUSED */
+static int
+zfs_do_promote(int argc, char **argv)
+{
+	zfs_handle_t *clone;
+	int failed;
+
+	/* This subcommand takes no options at all. */
+	if (argc > 1 && argv[1][0] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(B_FALSE);
+	}
+
+	/* Exactly one operand: the clone to promote. */
+	if (argc < 2) {
+		(void) fprintf(stderr,
+		    gettext("missing clone filesystem argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/* Snapshots are not accepted here. */
+	clone = zfs_open(g_zfs, argv[1],
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+	if (clone == NULL)
+		return (1);
+
+	/* Only a successful promotion is recorded in the pool history. */
+	failed = (zfs_promote(clone) != 0);
+	if (failed == 0)
+		zpool_log_history(g_zfs, argc, argv, argv[1], B_FALSE, B_FALSE);
+
+	zfs_close(clone);
+	return (failed);
+}
+
+/*
+ * zfs rollback [-rfR] <snapshot>
+ *
+ * -r Delete any intervening snapshots before doing rollback
+ * -R Delete any snapshots and their clones
+ * -f Force unmount filesystems, even if they are in use.
+ *
+ * Given a filesystem, rollback to a specific snapshot, discarding any changes
+ * since then and making it the active dataset. If more recent snapshots exist,
+ * the command will complain unless the '-r' flag is given.
+ */
+typedef struct rollback_cbdata {
+ uint64_t cb_create; /* ZFS_PROP_CREATETXG of the snapshot being rolled back to */
+ boolean_t cb_first; /* B_TRUE until the first diagnostic header has been printed */
+ int cb_doclones; /* -R given: rollback_check() reports nothing */
+ char *cb_target; /* full name of the target snapshot (argv[0]) */
+ int cb_error; /* set to 1 when a blocking snapshot or clone was reported */
+ boolean_t cb_recurse; /* -r or -R given */
+ boolean_t cb_dependent; /* B_TRUE while rollback_check() is iterating a snapshot's dependents */
+} rollback_cbdata_t;
+
+/*
+ * Report any snapshots more recent than the one specified. Used when '-r' is
+ * not specified. We reuse this same callback for the snapshot dependents - if
+ * 'cb_dependent' is set, then this is a dependent and we should report it
+ * without checking the transaction group.
+ *
+ * When cb_doclones is set nothing is checked at all. Problems found are
+ * signalled through cb_error rather than the return value: the function
+ * returns -1 only when the nested zfs_iter_dependents() call fails, and 0
+ * otherwise. 'zhp' is closed on every path.
+ */
+static int
+rollback_check(zfs_handle_t *zhp, void *data)
+{
+ rollback_cbdata_t *cbp = data;
+
+ if (cbp->cb_doclones) {
+ zfs_close(zhp);
+ return (0);
+ }
+
+ if (!cbp->cb_dependent) {
+ if (strcmp(zfs_get_name(zhp), cbp->cb_target) != 0 && /* a snapshot other than the target, created after it */
+ zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT &&
+ zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) >
+ cbp->cb_create) {
+
+ if (cbp->cb_first && !cbp->cb_recurse) {
+ (void) fprintf(stderr, gettext("cannot "
+ "rollback to '%s': more recent snapshots "
+ "exist\n"),
+ cbp->cb_target);
+ (void) fprintf(stderr, gettext("use '-r' to "
+ "force deletion of the following "
+ "snapshots:\n"));
+ cbp->cb_first = 0;
+ cbp->cb_error = 1;
+ }
+
+ if (cbp->cb_recurse) {
+ cbp->cb_dependent = B_TRUE; /* nested callbacks take the dependent branch below */
+ if (zfs_iter_dependents(zhp, B_TRUE,
+ rollback_check, cbp) != 0) {
+ zfs_close(zhp);
+ return (-1);
+ }
+ cbp->cb_dependent = B_FALSE;
+ } else {
+ (void) fprintf(stderr, "%s\n",
+ zfs_get_name(zhp));
+ }
+ }
+ } else {
+ if (cbp->cb_first && cbp->cb_recurse) {
+ (void) fprintf(stderr, gettext("cannot rollback to "
+ "'%s': clones of previous snapshots exist\n"),
+ cbp->cb_target);
+ (void) fprintf(stderr, gettext("use '-R' to "
+ "force deletion of the following clones and "
+ "dependents:\n"));
+ cbp->cb_first = 0;
+ cbp->cb_error = 1;
+ }
+
+ (void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+ }
+
+ zfs_close(zhp);
+ return (0);
+}
+
+static int
+zfs_do_rollback(int argc, char **argv)
+{
+	rollback_cbdata_t cb = { 0 };
+	char parentname[ZFS_MAXNAMELEN];
+	zfs_handle_t *snap;
+	zfs_handle_t *parent;
+	char *at;
+	int force = 0;
+	int opt;
+	int rv;
+
+	/* -R implies -r and additionally allows clones to be removed. */
+	while ((opt = getopt(argc, argv, "rfR")) != -1) {
+		switch (opt) {
+		case 'R':
+			cb.cb_recurse = 1;
+			cb.cb_doclones = 1;
+			break;
+		case 'r':
+			cb.cb_recurse = 1;
+			break;
+		case 'f':
+			force = 1;
+			break;
+		case '?':
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	/* Exactly one operand: the snapshot to roll back to. */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing dataset argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT);
+	if (snap == NULL)
+		return (1);
+
+	/* The parent dataset is everything in front of the last '@'. */
+	(void) strlcpy(parentname, argv[0], sizeof (parentname));
+	at = strrchr(parentname, '@');
+	verify(at != NULL);
+	*at = '\0';
+
+	parent = zfs_open(g_zfs, parentname, ZFS_TYPE_ANY);
+	if (parent == NULL) {
+		zfs_close(snap);
+		return (1);
+	}
+
+	/*
+	 * Look for newer snapshots and/or clones, as governed by -r and -R.
+	 * A problem is signalled either by the iteration failing or by the
+	 * callback having set cb_error.
+	 */
+	cb.cb_target = argv[0];
+	cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
+	cb.cb_first = B_TRUE;
+	cb.cb_error = 0;
+
+	rv = zfs_iter_children(parent, rollback_check, &cb);
+	if (rv == 0)
+		rv = cb.cb_error;
+
+	if (rv == 0) {
+		/* Nothing in the way: roll the parent back and log it. */
+		rv = zfs_rollback(parent, snap, force);
+		if (rv == 0) {
+			zpool_log_history(g_zfs, argc + optind,
+			    argv - optind, argv[0], B_FALSE, B_FALSE);
+		}
+	}
+
+	zfs_close(snap);
+	zfs_close(parent);
+
+	return (rv != 0);
+}
+
+/*
+ * zfs set property=value { fs | snap | vol } ...
+ *
+ * Sets the given property for all datasets specified on the command line.
+ */
+typedef struct set_cbdata {
+ char *cb_propname; /* property name: the part of argv[1] before '=' */
+ char *cb_value; /* property value: the part of argv[1] after '=' */
+ boolean_t cb_any_successful; /* set once zfs_prop_set() succeeded for any dataset */
+} set_cbdata_t;
+
+static int
+set_callback(zfs_handle_t *zhp, void *data)
+{
+	set_cbdata_t *state = data;
+	int zerr;
+
+	if (zfs_prop_set(zhp, state->cb_propname, state->cb_value) == 0) {
+		state->cb_any_successful = B_TRUE;
+		return (0);
+	}
+
+	/*
+	 * For remount and reshare failures, tell the user that the property
+	 * itself may nevertheless have been set.
+	 */
+	zerr = libzfs_errno(g_zfs);
+	if (zerr == EZFS_MOUNTFAILED) {
+		(void) fprintf(stderr, gettext("property may be set "
+		    "but unable to remount filesystem\n"));
+	} else if (zerr == EZFS_SHARENFSFAILED) {
+		(void) fprintf(stderr, gettext("property may be set "
+		    "but unable to reshare filesystem\n"));
+	}
+
+	return (1);
+}
+
+static int
+zfs_do_set(int argc, char **argv)
+{
+	set_cbdata_t cb;
+	char *eq;
+	int rv;
+
+	/* This subcommand takes no options at all. */
+	if (argc > 1 && argv[1][0] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(B_FALSE);
+	}
+
+	/* A property=value pair and at least one dataset are required. */
+	if (argc < 2) {
+		(void) fprintf(stderr,
+		    gettext("missing property=value argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 3) {
+		(void) fprintf(stderr, gettext("missing dataset name\n"));
+		usage(B_FALSE);
+	}
+
+	/* Split argv[1] in place at the first '='. */
+	cb.cb_propname = argv[1];
+	eq = strchr(cb.cb_propname, '=');
+	cb.cb_value = eq;
+	if (eq == NULL) {
+		(void) fprintf(stderr,
+		    gettext("missing value in property=value argument\n"));
+		usage(B_FALSE);
+	}
+
+	*eq = '\0';
+	cb.cb_value = eq + 1;
+	cb.cb_any_successful = B_FALSE;
+
+	if (cb.cb_propname[0] == '\0') {
+		(void) fprintf(stderr,
+		    gettext("missing property in property=value argument\n"));
+		usage(B_FALSE);
+	}
+
+	rv = zfs_for_each(argc - 2, argv + 2, B_FALSE,
+	    ZFS_TYPE_ANY, NULL, NULL, set_callback, &cb, B_FALSE);
+
+	if (cb.cb_any_successful) {
+		/* Put the '=' back so the logged command line is intact. */
+		*eq = '=';
+		zpool_log_history(g_zfs, argc, argv, argv[2], B_FALSE, B_FALSE);
+	}
+
+	return (rv);
+}
+
+/*
+ * zfs snapshot [-r] <fs@snap>
+ *
+ * Creates a snapshot with the given name. While functionally equivalent to
+ * 'zfs create', it is a separate command to differentiate intent.
+ */
+static int
+zfs_do_snapshot(int argc, char **argv)
+{
+	int recursive = B_FALSE;
+	int ret;
+	/*
+	 * getopt() returns an int.  Storing it in a plain char breaks the
+	 * comparison against -1 on platforms where char is unsigned, so the
+	 * option loop would never terminate there.
+	 */
+	int c;
+
+	/* check options: -r snapshots descendent datasets as well */
+	while ((c = getopt(argc, argv, ":r")) != -1) {
+		switch (c) {
+		case 'r':
+			recursive = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments: exactly one snapshot name */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	ret = zfs_snapshot(g_zfs, argv[0], recursive);
+	if (ret && recursive)
+		(void) fprintf(stderr, gettext("no snapshots were created\n"));
+	if (!ret) {
+		/* undo the argc/argv shift so the full command is logged */
+		zpool_log_history(g_zfs, argc + optind, argv - optind, argv[0],
+		    B_FALSE, B_FALSE);
+	}
+
+	/* 0 on success, 1 on any failure */
+	return (ret != 0);
+}
+
+/*
+ * zfs send [-i <@snap>] <fs@snap>
+ *
+ * Send a backup stream to stdout.
+ */
+static int
+zfs_do_send(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	char *fromname = NULL;
+	char *at;
+	int opt;
+	int rc;
+
+	/* -i names the incremental source and may be given only once. */
+	while ((opt = getopt(argc, argv, ":i:")) != -1) {
+		switch (opt) {
+		case 'i':
+			if (fromname != NULL)
+				usage(B_FALSE);
+			fromname = optarg;
+			break;
+		case ':':
+			(void) fprintf(stderr,
+			    gettext("missing argument for '%c' option\n"),
+			    optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	/* Exactly one operand: the snapshot to send. */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/* Refuse to write the stream onto a terminal. */
+	if (isatty(STDOUT_FILENO)) {
+		(void) fprintf(stderr,
+		    gettext("Error: Stream can not be written to a terminal.\n"
+		    "You must redirect standard output.\n"));
+		return (1);
+	}
+
+	zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT);
+	if (zhp == NULL)
+		return (1);
+
+	/*
+	 * The incremental source may be given as "fs@snap" or "@snap".  When
+	 * a filesystem part is present it must match the one being sent;
+	 * either way only the short snapshot name is passed on.
+	 */
+	at = (fromname != NULL) ? strchr(fromname, '@') : NULL;
+	if (at != NULL) {
+		if (at != fromname &&
+		    strncmp(argv[0], fromname, at - fromname + 1) != 0) {
+			(void) fprintf(stderr,
+			    gettext("incremental source must be "
+			    "in same filesystem\n"));
+			usage(B_FALSE);
+		}
+
+		fromname = at + 1;
+		if (strchr(fromname, '@') != NULL ||
+		    strchr(fromname, '/') != NULL) {
+			(void) fprintf(stderr,
+			    gettext("invalid incremental source\n"));
+			usage(B_FALSE);
+		}
+	}
+
+	rc = zfs_send(zhp, fromname, STDOUT_FILENO);
+	zfs_close(zhp);
+
+	return (rc != 0);
+}
+
+/*
+ * zfs receive [-dnvF] <fs@snap>
+ *
+ * Restore a backup stream from stdin.
+ */
+static int
+zfs_do_receive(int argc, char **argv)
+{
+	boolean_t isprefix = B_FALSE;
+	boolean_t dryrun = B_FALSE;
+	boolean_t verbose = B_FALSE;
+	boolean_t force = B_FALSE;
+	int opt;
+	int rc;
+
+	/* Each flag simply switches on the matching zfs_receive() argument. */
+	while ((opt = getopt(argc, argv, ":dnvF")) != -1) {
+		switch (opt) {
+		case 'F':
+			force = B_TRUE;
+			break;
+		case 'v':
+			verbose = B_TRUE;
+			break;
+		case 'n':
+			dryrun = B_TRUE;
+			break;
+		case 'd':
+			isprefix = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr,
+			    gettext("missing argument for '%c' option\n"),
+			    optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	/* Exactly one operand: the name to receive into. */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/* Refuse to read the stream from a terminal. */
+	if (isatty(STDIN_FILENO)) {
+		(void) fprintf(stderr,
+		    gettext("Error: Backup stream can not be read "
+		    "from a terminal.\n"
+		    "You must redirect standard input.\n"));
+		return (1);
+	}
+
+	rc = zfs_receive(g_zfs, argv[0], isprefix, verbose, dryrun, force,
+	    STDIN_FILENO);
+
+	/* Only a successful receive is recorded in the pool history. */
+	if (rc == 0) {
+		zpool_log_history(g_zfs, argc + optind, argv - optind, argv[0],
+		    B_FALSE, B_FALSE);
+	}
+
+	return (rc != 0);
+}
+
+typedef struct get_all_cbdata {
+ zfs_handle_t **cb_handles; /* growable array of collected dataset handles */
+ size_t cb_alloc; /* number of slots allocated in cb_handles */
+ size_t cb_used; /* number of slots currently filled */
+ uint_t cb_types; /* mask of dataset types to collect */
+} get_all_cbdata_t;
+
+static int
+get_one_dataset(zfs_handle_t *zhp, void *data)
+{
+	get_all_cbdata_t *acc = data;
+	zfs_type_t kind = zfs_get_type(zhp);
+
+	/*
+	 * Filesystems can contain further datasets: descend into them first.
+	 */
+	if (kind == ZFS_TYPE_FILESYSTEM &&
+	    zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
+		zfs_close(zhp);
+		return (1);
+	}
+
+	/*
+	 * Datasets of a type the caller did not ask for are dropped.
+	 */
+	if (!(kind & acc->cb_types)) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	/*
+	 * Array full: start with 64 slots, then double on every growth,
+	 * copying the existing handles into the new array.
+	 */
+	if (acc->cb_used == acc->cb_alloc) {
+		zfs_handle_t **grown;
+
+		acc->cb_alloc = (acc->cb_alloc == 0) ? 64 : acc->cb_alloc * 2;
+		grown = safe_malloc(acc->cb_alloc * sizeof (void *));
+
+		if (acc->cb_handles != NULL) {
+			bcopy(acc->cb_handles, grown,
+			    acc->cb_used * sizeof (void *));
+			free(acc->cb_handles);
+		}
+
+		acc->cb_handles = grown;
+	}
+
+	/* The handle stays open; it now belongs to the array. */
+	acc->cb_handles[acc->cb_used] = zhp;
+	acc->cb_used++;
+
+	return (0);
+}
+
+static void
+get_all_datasets(uint_t types, zfs_handle_t ***dslist, size_t *count)
+{
+	get_all_cbdata_t acc = { 0 };
+
+	/* Collect every dataset of the requested types, starting at the root. */
+	acc.cb_types = types;
+	(void) zfs_iter_root(g_zfs, get_one_dataset, &acc);
+
+	/* Hand the array and its length to the caller. */
+	*count = acc.cb_used;
+	*dslist = acc.cb_handles;
+}
+
+/*
+ * Comparison function for an array of zfs_handle_t pointers: 'a' and 'b'
+ * each point at an array element, i.e. at a 'zfs_handle_t *'.
+ *
+ * Filesystems sort before every other dataset type.  Two filesystems are
+ * ordered by their mountpoint property; two non-filesystems are ordered by
+ * dataset name.
+ */
+static int
+dataset_cmp(const void *a, const void *b)
+{
+	zfs_handle_t **za = (zfs_handle_t **)a;
+	zfs_handle_t **zb = (zfs_handle_t **)b;
+	char mounta[MAXPATHLEN];
+	char mountb[MAXPATHLEN];
+	boolean_t gota, gotb;
+
+	if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0)
+		verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta,
+		    sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
+	if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0)
+		verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb,
+		    sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
+
+	if (gota && gotb)
+		return (strcmp(mounta, mountb));
+
+	if (gota)
+		return (-1);
+	if (gotb)
+		return (1);
+
+	/*
+	 * Neither is a filesystem: compare by name.  The handles must be
+	 * dereferenced first; 'a' and 'b' are pointers to the array slots,
+	 * not the handles themselves.
+	 */
+	return (strcmp(zfs_get_name(*za), zfs_get_name(*zb)));
+}
+
+/*
+ * Generic callback for sharing or mounting filesystems. Because the code is so
+ * similar, we have a common function with an extra parameter to determine which
+ * mode we are using.
+ */
+#define OP_SHARE 0x1 /* 'op' value selecting the "share" behaviour */
+#define OP_MOUNT 0x2 /* 'op' value selecting the "mount" behaviour */
+
+/*
+ * Share or mount a single dataset.
+ *
+ * zhp      - open handle; must be a filesystem or a volume (asserted).
+ * op       - OP_SHARE or OP_MOUNT.  Volumes only accept OP_SHARE (asserted).
+ * flags    - passed to zfs_mount() for OP_MOUNT; not used for OP_SHARE.
+ * explicit - B_TRUE when the user named this dataset.  A dataset that is
+ *            not eligible (wrong zone, legacy or missing mountpoint,
+ *            sharing turned off, canmount off, already mounted/shared) is
+ *            then reported on stderr and counted as a failure; with
+ *            B_FALSE such datasets are skipped silently.
+ * options  - mount option string for OP_MOUNT, or NULL.
+ *
+ * Returns 0 on success or silent skip, 1 on failure.
+ */
+static int
+share_mount_one(zfs_handle_t *zhp, int op, int flags, boolean_t explicit,
+ const char *options)
+{
+ char mountpoint[ZFS_MAXPROPLEN];
+ char shareopts[ZFS_MAXPROPLEN];
+ const char *cmdname = op == OP_SHARE ? "share" : "mount";
+ struct mnttab mnt;
+ uint64_t zoned, canmount;
+ zfs_type_t type = zfs_get_type(zhp);
+
+ assert(type & (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME));
+
+ if (type == ZFS_TYPE_FILESYSTEM) {
+ /*
+ * Check to make sure we can mount/share this dataset. If we
+ * are in the global zone and the filesystem is exported to a
+ * local zone, or if we are in a local zone and the
+ * filesystem is not exported, then it is an error.
+ */
+ zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
+
+ if (zoned && getzoneid() == GLOBAL_ZONEID) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot %s '%s': "
+ "dataset is exported to a local zone\n"), cmdname,
+ zfs_get_name(zhp));
+ return (1);
+
+ } else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot %s '%s': "
+ "permission denied\n"), cmdname,
+ zfs_get_name(zhp));
+ return (1);
+ }
+
+ /*
+ * Ignore any filesystems which don't apply to us. This
+ * includes those with a legacy mountpoint, or those with
+ * legacy share options.
+ */
+ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
+ sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
+ verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
+ sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
+ canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
+
+ /*
+ * NOTE(review): the property value tested here is "off", while the
+ * message printed calls it a "legacy share".
+ */
+ if (op == OP_SHARE && strcmp(shareopts, "off") == 0) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot share '%s': "
+ "legacy share\n"), zfs_get_name(zhp));
+ (void) fprintf(stderr, gettext("use share(1M) to "
+ "share this filesystem\n"));
+ return (1);
+ }
+
+ /*
+ * We cannot share or mount legacy filesystems. If the
+ * shareopts is non-legacy but the mountpoint is legacy, we
+ * treat it as a legacy share.
+ */
+ if (strcmp(mountpoint, "legacy") == 0) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot %s '%s': "
+ "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
+ (void) fprintf(stderr, gettext("use %s to "
+ "%s this filesystem\n"), op == OP_SHARE ?
+ "share(1M)" : "mount(1M)", cmdname);
+ return (1);
+ }
+
+ if (strcmp(mountpoint, "none") == 0) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot %s '%s': no "
+ "mountpoint set\n"), cmdname, zfs_get_name(zhp));
+ return (1);
+ }
+
+ if (!canmount) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot %s '%s': "
+ "'canmount' property is set to 'off'\n"), cmdname,
+ zfs_get_name(zhp));
+ return (1);
+ }
+
+ /*
+ * At this point, we have verified that the mountpoint and/or
+ * shareopts are appropriate for auto management. If the
+ * filesystem is already mounted or shared, return (failing
+ * for explicit requests); otherwise mount or share the
+ * filesystem.
+ */
+ switch (op) {
+ case OP_SHARE:
+ if (zfs_is_shared_nfs(zhp, NULL)) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot share "
+ "'%s': filesystem already shared\n"),
+ zfs_get_name(zhp));
+ return (1);
+ }
+
+ /*
+ * Mount the filesystem first if it is not mounted yet; this
+ * implicit mount uses no options and no flags.
+ */
+ if (!zfs_is_mounted(zhp, NULL) &&
+ zfs_mount(zhp, NULL, 0) != 0)
+ return (1);
+
+ if (zfs_share_nfs(zhp) != 0)
+ return (1);
+ break;
+
+ case OP_MOUNT:
+ /*
+ * Only mnt_mntopts is filled in before hasmntopt() is called; the
+ * other mnttab fields stay uninitialized.
+ * NOTE(review): presumably hasmntopt() reads nothing else - confirm.
+ */
+ if (options == NULL)
+ mnt.mnt_mntopts = "";
+ else
+ mnt.mnt_mntopts = (char *)options;
+
+ /* An already mounted filesystem is acceptable only for a remount. */
+ if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
+ zfs_is_mounted(zhp, NULL)) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot mount "
+ "'%s': filesystem already mounted\n"),
+ zfs_get_name(zhp));
+ return (1);
+ }
+
+ if (zfs_mount(zhp, options, flags) != 0)
+ return (1);
+ break;
+ }
+ } else {
+ /* Volumes can only be shared (via iSCSI), never mounted. */
+ assert(op == OP_SHARE);
+
+ /*
+ * Ignore any volumes that aren't shared.
+ */
+ verify(zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, shareopts,
+ sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
+
+ if (strcmp(shareopts, "off") == 0) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot share '%s': "
+ "'shareiscsi' property not set\n"),
+ zfs_get_name(zhp));
+ (void) fprintf(stderr, gettext("set 'shareiscsi' "
+ "property or use iscsitadm(1M) to share this "
+ "volume\n"));
+ return (1);
+ }
+
+ if (zfs_is_shared_iscsi(zhp)) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot share "
+ "'%s': volume already shared\n"),
+ zfs_get_name(zhp));
+ return (1);
+ }
+
+ if (zfs_share_iscsi(zhp) != 0)
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * Common implementation of 'zfs mount' and 'zfs share'.
+ *
+ * op is OP_MOUNT or OP_SHARE.  argc/argv are the subcommand's own argument
+ * vector, with argv[0] being the subcommand name as getopt() expects.
+ *
+ *	-a		operate on every dataset of the applicable type(s).
+ *			For OP_SHARE an optional 'nfs' or 'iscsi' operand
+ *			restricts the set to filesystems or volumes.
+ *	-o options	(OP_MOUNT only) option string given to zfs_mount().
+ *	-O		(OP_MOUNT only) accepted, but ignored with a warning.
+ *
+ * Without -a and without an operand, OP_MOUNT lists the currently mounted
+ * ZFS filesystems and OP_SHARE is a usage error.  Otherwise exactly one
+ * dataset operand is mounted or shared.
+ *
+ * Returns 0 on success, 1 if any dataset could not be processed or the
+ * mount table could not be read.
+ */
+static int
+share_mount(int op, int argc, char **argv)
+{
+	int do_all = 0;
+	int c, ret = 0;
+	const char *options = NULL;
+	int types = 0, flags = 0;
+
+	/* check options */
+	while ((c = getopt(argc, argv, op == OP_MOUNT ? ":ao:O" : "a"))
+	    != -1) {
+		switch (c) {
+		case 'a':
+			do_all = 1;
+			break;
+		case 'o':
+			options = optarg;
+			break;
+		case 'O':
+			warnx("no overlay mounts support on FreeBSD, ignoring");
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (do_all) {
+		zfs_handle_t **dslist = NULL;
+		size_t i, count = 0;
+
+		if (op == OP_MOUNT) {
+			types = ZFS_TYPE_FILESYSTEM;
+		} else if (argc > 0) {
+			if (strcmp(argv[0], "nfs") == 0) {
+				types = ZFS_TYPE_FILESYSTEM;
+			} else if (strcmp(argv[0], "iscsi") == 0) {
+				types = ZFS_TYPE_VOLUME;
+			} else {
+				(void) fprintf(stderr, gettext("share type "
+				    "must be 'nfs' or 'iscsi'\n"));
+				usage(B_FALSE);
+			}
+
+			argc--;
+			argv++;
+		} else {
+			types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
+		}
+
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+
+		get_all_datasets(types, &dslist, &count);
+
+		if (count == 0)
+			return (0);
+
+		/* Process the datasets in dataset_cmp() order. */
+		qsort(dslist, count, sizeof (void *), dataset_cmp);
+
+		for (i = 0; i < count; i++) {
+			if (share_mount_one(dslist[i], op, flags, B_FALSE,
+			    options) != 0)
+				ret = 1;
+			zfs_close(dslist[i]);
+		}
+
+		free(dslist);
+	} else if (argc == 0) {
+		struct statfs *sfs;
+		int i, n;
+
+		if (op == OP_SHARE) {
+			(void) fprintf(stderr, gettext("missing filesystem "
+			    "argument\n"));
+			usage(B_FALSE);
+		}
+
+		/*
+		 * When mount is given no arguments, go through the mount
+		 * table returned by getmntinfo() and display any active ZFS
+		 * mounts.  We hide any snapshots, since they are controlled
+		 * automatically.
+		 */
+		if ((n = getmntinfo(&sfs, MNT_WAIT)) == 0) {
+			/* Not being able to read the mount table is a failure. */
+			(void) fprintf(stderr, "getmntinfo(): %s\n",
+			    strerror(errno));
+			return (1);
+		}
+		for (i = 0; i < n; i++) {
+			if (strcmp(sfs[i].f_fstypename, MNTTYPE_ZFS) != 0 ||
+			    strchr(sfs[i].f_mntfromname, '@') != NULL)
+				continue;
+
+			(void) printf("%-30s  %s\n", sfs[i].f_mntfromname,
+			    sfs[i].f_mntonname);
+		}
+
+	} else {
+		zfs_handle_t *zhp;
+
+		/* Volumes may be named explicitly only when sharing. */
+		types = ZFS_TYPE_FILESYSTEM;
+		if (op == OP_SHARE)
+			types |= ZFS_TYPE_VOLUME;
+
+		if (argc > 1) {
+			(void) fprintf(stderr,
+			    gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+
+		if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) {
+			ret = 1;
+		} else {
+			ret = share_mount_one(zhp, op, flags, B_TRUE,
+			    options);
+			zfs_close(zhp);
+		}
+	}
+
+	return (ret);
+}
+
+/*
+ * zfs mount
+ * zfs mount [-o options] [-O] -a
+ * zfs mount [-o options] [-O] filesystem
+ *
+ * With no arguments, list the currently mounted ZFS filesystems. Otherwise
+ * mount all filesystems, or mount the given filesystem. The 'nfs' and
+ * 'iscsi' operands belong to 'zfs share -a' only; share_mount() rejects any
+ * operand after -a when mounting. All of the work is done by share_mount().
+ */
+static int
+zfs_do_mount(int argc, char **argv)
+{
+ return (share_mount(OP_MOUNT, argc, argv));
+}
+
+/*
+ * zfs share -a [nfs | iscsi]
+ * zfs share filesystem | volume
+ *
+ * Share all filesystems and volumes, or share the given dataset. With -a,
+ * 'nfs' restricts the operation to filesystems and 'iscsi' to volumes; with
+ * neither, both kinds are processed. All of the work is done by
+ * share_mount().
+ */
+static int
+zfs_do_share(int argc, char **argv)
+{
+ return (share_mount(OP_SHARE, argc, argv));
+}
+
+/*
+ * One mounted ZFS filesystem queued by unshare_unmount() for 'unmount -a' or
+ * 'unshare -a'. Nodes live in an AVL tree ordered by un_mountp.
+ */
+typedef struct unshare_unmount_node {
+ zfs_handle_t *un_zhp; /* open handle; closed when the node is freed */
+ char *un_mountp; /* strdup()ed mountpoint path; also the sort key */
+ uu_avl_node_t un_avlnode; /* linkage into the AVL tree */
+} unshare_unmount_node_t;
+
+/*
+ * AVL ordering callback for unshare_unmount_node_t: nodes are ordered by
+ * strcmp() of their mountpoint paths.  The third argument is not used.
+ */
+/* ARGSUSED */
+static int
+unshare_unmount_compare(const void *larg, const void *rarg, void *unused)
+{
+	const unshare_unmount_node_t *lhs = larg;
+	const unshare_unmount_node_t *rhs = rarg;
+	const char *lpath = lhs->un_mountp;
+	const char *rpath = rhs->un_mountp;
+
+	return (strcmp(lpath, rpath));
+}
+
+/*
+ * Convenience routine used by unshare_unmount() and manual_unmount().  Given
+ * an absolute path, find its entry in the mount table, verify that it is a
+ * ZFS filesystem, and unshare or unmount it as requested.
+ *
+ * op        - OP_SHARE to unshare, anything else to unmount.
+ * path      - mountpoint to look up.
+ * flags     - passed to zfs_unmount() / zfs_unmountall().
+ * is_manual - B_TRUE when invoked as the stand-alone umount helper; the
+ *             filesystem is then unmounted even if its mountpoint is
+ *             'legacy'.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
+{
+	const int unsharing = (op == OP_SHARE);
+	const char *cmdname = unsharing ? "unshare" : "unmount";
+	struct mnttab search = { 0 }, entry;
+	char property[ZFS_MAXPROPLEN];
+	zfs_handle_t *zhp;
+	int ret;
+
+	/*
+	 * Look the mountpoint up in the mount table.
+	 */
+	search.mnt_mountp = path;
+	rewind(mnttab_file);
+	if (getmntany(mnttab_file, &entry, &search) != 0) {
+		(void) fprintf(stderr, gettext("cannot %s '%s': not "
+		    "currently mounted\n"), cmdname, path);
+		return (1);
+	}
+
+	if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
+		(void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS "
+		    "filesystem\n"), cmdname, path);
+		return (1);
+	}
+
+	zhp = zfs_open(g_zfs, entry.mnt_special, ZFS_TYPE_FILESYSTEM);
+	if (zhp == NULL)
+		return (1);
+
+	/* Fetch whichever property governs the requested operation. */
+	verify(zfs_prop_get(zhp,
+	    unsharing ? ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
+	    property, sizeof (property), NULL, NULL, 0, B_FALSE) == 0);
+
+	if (unsharing) {
+		if (strcmp(property, "off") == 0) {
+			(void) fprintf(stderr, gettext("cannot unshare "
+			    "'%s': legacy share\n"), path);
+			(void) fprintf(stderr, gettext("use "
+			    "unshare(1M) to unshare this filesystem\n"));
+			ret = 1;
+		} else if (!zfs_is_shared_nfs(zhp, NULL)) {
+			(void) fprintf(stderr, gettext("cannot unshare '%s': "
+			    "not currently shared\n"), path);
+			ret = 1;
+		} else {
+			ret = zfs_unshareall_nfs(zhp);
+		}
+	} else if (is_manual) {
+		/* The stand-alone helper unmounts just this filesystem. */
+		ret = zfs_unmount(zhp, NULL, flags);
+	} else if (strcmp(property, "legacy") == 0) {
+		(void) fprintf(stderr, gettext("cannot unmount "
+		    "'%s': legacy mountpoint\n"),
+		    zfs_get_name(zhp));
+		(void) fprintf(stderr, gettext("use umount(1M) "
+		    "to unmount this filesystem\n"));
+		ret = 1;
+	} else {
+		ret = zfs_unmountall(zhp, flags);
+	}
+
+	zfs_close(zhp);
+
+	/* Collapse any non-zero libzfs result to 1. */
+	return (ret == 0 ? 0 : 1);
+}
+
+/*
+ * Common implementation of 'zfs unmount' and 'zfs unshare'.
+ *
+ * op is OP_MOUNT (unmount) or OP_SHARE (unshare).  argc/argv are the
+ * subcommand's own argument vector.
+ *
+ *	-a	operate on every mounted ZFS filesystem, skipping those whose
+ *		'mountpoint' is 'legacy' (OP_MOUNT) or whose 'sharenfs' is
+ *		'off' (OP_SHARE).  OP_SHARE then also unshares every volume.
+ *	-f	(OP_MOUNT only) forced unmount (MS_FORCE).
+ *
+ * Without -a exactly one operand is required.  An operand starting with '/'
+ * is a mountpoint and is handled by unshare_unmount_path(); anything else
+ * is taken to be a dataset name.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+unshare_unmount(int op, int argc, char **argv)
+{
+	int do_all = 0;
+	int flags = 0;
+	int ret = 0;
+	int types, c;
+	zfs_handle_t *zhp;
+	char property[ZFS_MAXPROPLEN];
+
+	/* check options */
+	while ((c = getopt(argc, argv, op == OP_SHARE ? "a" : "af")) != -1) {
+		switch (c) {
+		case 'a':
+			do_all = 1;
+			break;
+		case 'f':
+			flags = MS_FORCE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (do_all) {
+		/*
+		 * We could make use of zfs_for_each() to walk all datasets in
+		 * the system, but this would be very inefficient, especially
+		 * since we would have to linearly search the mount table for
+		 * each one.  Instead, do one pass through the mount table
+		 * looking for zfs entries and call zfs_unmount() for each one.
+		 *
+		 * Things get a little tricky if the administrator has created
+		 * mountpoints beneath other ZFS filesystems.  In this case, we
+		 * have to unmount the deepest filesystems first.  To accomplish
+		 * this, we place all the mountpoints in an AVL tree sorted by
+		 * mountpoint path (see unshare_unmount_compare()) and walk the
+		 * result in reverse, so that a path is always visited before
+		 * any path it is a prefix of.
+		 */
+		uu_avl_pool_t *pool;
+		uu_avl_t *tree;
+		unshare_unmount_node_t *node;
+		uu_avl_index_t idx;
+		uu_avl_walk_t *walk;
+		struct statfs *sfs;
+		int i, n;
+
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+
+		if ((pool = uu_avl_pool_create("unmount_pool",
+		    sizeof (unshare_unmount_node_t),
+		    offsetof(unshare_unmount_node_t, un_avlnode),
+		    unshare_unmount_compare,
+		    UU_DEFAULT)) == NULL) {
+			(void) fprintf(stderr, gettext("internal error: "
+			    "out of memory\n"));
+			exit(1);
+		}
+
+		if ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL) {
+			(void) fprintf(stderr, gettext("internal error: "
+			    "out of memory\n"));
+			exit(1);
+		}
+
+		if ((n = getmntinfo(&sfs, MNT_WAIT)) == 0) {
+			(void) fprintf(stderr, gettext("internal error: "
+			    "getmntinfo() failed\n"));
+			exit(1);
+		}
+		for (i = 0; i < n; i++) {
+
+			/* ignore non-ZFS entries */
+			if (strcmp(sfs[i].f_fstypename, MNTTYPE_ZFS) != 0)
+				continue;
+
+			/* ignore snapshots */
+			if (strchr(sfs[i].f_mntfromname, '@') != NULL)
+				continue;
+
+			if ((zhp = zfs_open(g_zfs, sfs[i].f_mntfromname,
+			    ZFS_TYPE_FILESYSTEM)) == NULL) {
+				ret = 1;
+				continue;
+			}
+
+			verify(zfs_prop_get(zhp, op == OP_SHARE ?
+			    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
+			    property, sizeof (property), NULL, NULL,
+			    0, B_FALSE) == 0);
+
+			/* Ignore legacy mounts and shares */
+			if ((op == OP_SHARE &&
+			    strcmp(property, "off") == 0) ||
+			    (op == OP_MOUNT &&
+			    strcmp(property, "legacy") == 0)) {
+				zfs_close(zhp);
+				continue;
+			}
+
+			/* The node takes ownership of the open handle. */
+			node = safe_malloc(sizeof (unshare_unmount_node_t));
+			node->un_zhp = zhp;
+
+			if ((node->un_mountp = strdup(sfs[i].f_mntonname)) ==
+			    NULL) {
+				(void) fprintf(stderr, gettext("internal error:"
+				    " out of memory\n"));
+				exit(1);
+			}
+
+			uu_avl_node_init(node, &node->un_avlnode, pool);
+
+			/* Drop the node if this mountpoint is already queued. */
+			if (uu_avl_find(tree, node, NULL, &idx) == NULL) {
+				uu_avl_insert(tree, node, idx);
+			} else {
+				zfs_close(node->un_zhp);
+				free(node->un_mountp);
+				free(node);
+			}
+		}
+
+		/*
+		 * Walk the AVL tree in reverse, unmounting each filesystem and
+		 * removing it from the AVL tree in the process.
+		 */
+		if ((walk = uu_avl_walk_start(tree,
+		    UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) {
+			(void) fprintf(stderr,
+			    gettext("internal error: out of memory\n"));
+			exit(1);
+		}
+
+		while ((node = uu_avl_walk_next(walk)) != NULL) {
+			uu_avl_remove(tree, node);
+
+			switch (op) {
+			case OP_SHARE:
+				if (zfs_unshare_nfs(node->un_zhp,
+				    node->un_mountp) != 0)
+					ret = 1;
+				break;
+
+			case OP_MOUNT:
+				if (zfs_unmount(node->un_zhp,
+				    node->un_mountp, flags) != 0)
+					ret = 1;
+				break;
+			}
+
+			zfs_close(node->un_zhp);
+			free(node->un_mountp);
+			free(node);
+		}
+
+		uu_avl_walk_end(walk);
+		uu_avl_destroy(tree);
+		uu_avl_pool_destroy(pool);
+
+		if (op == OP_SHARE) {
+			/*
+			 * Finally, unshare any volumes shared via iSCSI.
+			 */
+			zfs_handle_t **dslist = NULL;
+			size_t v, count = 0;
+
+			get_all_datasets(ZFS_TYPE_VOLUME, &dslist, &count);
+
+			if (count != 0) {
+				qsort(dslist, count, sizeof (void *),
+				    dataset_cmp);
+
+				for (v = 0; v < count; v++) {
+					if (zfs_unshare_iscsi(dslist[v]) != 0)
+						ret = 1;
+					zfs_close(dslist[v]);
+				}
+
+				free(dslist);
+			}
+		}
+	} else {
+		if (argc != 1) {
+			if (argc == 0)
+				(void) fprintf(stderr,
+				    gettext("missing filesystem argument\n"));
+			else
+				(void) fprintf(stderr,
+				    gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+
+		/*
+		 * We have an argument, but it may be a full path or a ZFS
+		 * filesystem.  Pass full paths off to unshare_unmount_path()
+		 * (shared by manual_unmount), otherwise open the filesystem
+		 * and unshare or unmount it here.
+		 */
+		if (argv[0][0] == '/')
+			return (unshare_unmount_path(op, argv[0],
+			    flags, B_FALSE));
+
+		/* Volumes may be named only when unsharing. */
+		types = ZFS_TYPE_FILESYSTEM;
+		if (op == OP_SHARE)
+			types |= ZFS_TYPE_VOLUME;
+
+		if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL)
+			return (1);
+
+		if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
+			verify(zfs_prop_get(zhp, op == OP_SHARE ?
+			    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, property,
+			    sizeof (property), NULL, NULL, 0, B_FALSE) == 0);
+
+			switch (op) {
+			case OP_SHARE:
+				if (strcmp(property, "off") == 0) {
+					(void) fprintf(stderr, gettext("cannot "
+					    "unshare '%s': legacy share\n"),
+					    zfs_get_name(zhp));
+					(void) fprintf(stderr, gettext("use "
+					    "unshare(1M) to unshare this "
+					    "filesystem\n"));
+					ret = 1;
+				} else if (!zfs_is_shared_nfs(zhp, NULL)) {
+					(void) fprintf(stderr, gettext("cannot "
+					    "unshare '%s': not currently "
+					    "shared\n"), zfs_get_name(zhp));
+					ret = 1;
+				} else if (zfs_unshareall_nfs(zhp) != 0) {
+					ret = 1;
+				}
+				break;
+
+			case OP_MOUNT:
+				if (strcmp(property, "legacy") == 0) {
+					(void) fprintf(stderr, gettext("cannot "
+					    "unmount '%s': legacy "
+					    "mountpoint\n"), zfs_get_name(zhp));
+					(void) fprintf(stderr, gettext("use "
+					    "umount(1M) to unmount this "
+					    "filesystem\n"));
+					ret = 1;
+				} else if (!zfs_is_mounted(zhp, NULL)) {
+					(void) fprintf(stderr, gettext("cannot "
+					    "unmount '%s': not currently "
+					    "mounted\n"),
+					    zfs_get_name(zhp));
+					ret = 1;
+				} else if (zfs_unmountall(zhp, flags) != 0) {
+					ret = 1;
+				}
+				break;
+			}
+		} else {
+			/* Only OP_SHARE lets zfs_open() return a volume. */
+			assert(op == OP_SHARE);
+
+			verify(zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, property,
+			    sizeof (property), NULL, NULL, 0, B_FALSE) == 0);
+
+			if (strcmp(property, "off") == 0) {
+				(void) fprintf(stderr, gettext("cannot unshare "
+				    "'%s': 'shareiscsi' property not set\n"),
+				    zfs_get_name(zhp));
+				(void) fprintf(stderr, gettext("set "
+				    "'shareiscsi' property or use "
+				    "iscsitadm(1M) to share this volume\n"));
+				ret = 1;
+			} else if (!zfs_is_shared_iscsi(zhp)) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "unshare '%s': not currently shared\n"),
+				    zfs_get_name(zhp));
+				ret = 1;
+			} else if (zfs_unshare_iscsi(zhp) != 0) {
+				ret = 1;
+			}
+		}
+
+		zfs_close(zhp);
+	}
+
+	return (ret);
+}
+
+/*
+ * zfs unmount [-f] -a
+ * zfs unmount [-f] filesystem | mountpoint
+ *
+ * Unmount all filesystems, or a specific ZFS filesystem given either by
+ * dataset name or by absolute mountpoint path. -f requests a forced
+ * unmount. All of the work is done by unshare_unmount().
+ */
+static int
+zfs_do_unmount(int argc, char **argv)
+{
+ return (unshare_unmount(OP_MOUNT, argc, argv));
+}
+
+/*
+ * zfs unshare -a
+ * zfs unshare filesystem | volume | mountpoint
+ *
+ * Unshare all filesystems and volumes, or a specific dataset given either
+ * by name or, for filesystems, by absolute mountpoint path. All of the work
+ * is done by unshare_unmount().
+ */
+static int
+zfs_do_unshare(int argc, char **argv)
+{
+ return (unshare_unmount(OP_SHARE, argc, argv));
+}
+
+/*
+ * Attach the given dataset to, or detach it from, the given jail.
+ *
+ * argv[0] is the subcommand name, argv[1] the jail id and argv[2] the
+ * filesystem; exactly these three arguments are required.  'attach' is
+ * handed to zfs_jail() unchanged (callers pass 1 to attach, 0 to detach).
+ *
+ * The jail id must be a complete, positive decimal number that fits in an
+ * int; anything else is reported as "invalid jailid".
+ *
+ * Returns 0 on success and 1 on failure.  A successful operation is
+ * recorded with zpool_log_history().
+ */
+static int
+do_jail(int argc, char **argv, int attach)
+{
+	zfs_handle_t *zhp;
+	char *end;
+	long val;
+	int jailid, ret;
+
+	/* check number of arguments */
+	if (argc < 3) {
+		(void) fprintf(stderr, gettext("missing argument(s)\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 3) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/*
+	 * atoi() cannot tell "0" from garbage and silently ignores trailing
+	 * junk, so parse with strtol() and validate the whole string.
+	 */
+	errno = 0;
+	val = strtol(argv[1], &end, 10);
+	if (end == argv[1] || *end != '\0' || errno == ERANGE ||
+	    val <= 0 || val != (long)(int)val) {
+		(void) fprintf(stderr, gettext("invalid jailid\n"));
+		usage(B_FALSE);
+	}
+	jailid = (int)val;
+
+	zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM);
+	if (zhp == NULL)
+		return (1);
+
+	ret = (zfs_jail(zhp, jailid, attach) != 0);
+
+	if (!ret)
+		zpool_log_history(g_zfs, argc, argv, argv[2], B_FALSE, B_FALSE);
+
+	zfs_close(zhp);
+	return (ret);
+}
+
+/*
+ * zfs jail jailid filesystem
+ *
+ * Attach the given dataset to the given jail. Argument checking and the
+ * actual work are done by do_jail(), called here with attach set to 1.
+ * Returns do_jail()'s result: 0 on success, 1 on failure.
+ */
+/* ARGSUSED */
+static int
+zfs_do_jail(int argc, char **argv)
+{
+
+ return (do_jail(argc, argv, 1));
+}
+
+/*
+ * zfs unjail jailid filesystem
+ *
+ * Detach the given dataset from the given jail. Argument checking and the
+ * actual work are done by do_jail(), called here with attach set to 0.
+ * Returns do_jail()'s result: 0 on success, 1 on failure.
+ */
+/* ARGSUSED */
+static int
+zfs_do_unjail(int argc, char **argv)
+{
+
+ return (do_jail(argc, argv, 0));
+}
+
+/*
+ * Called when invoked as /etc/fs/zfs/mount.  Do the mount if the mountpoint
+ * is 'legacy'.  Otherwise, complain that the user should be using
+ * 'zfs mount'.
+ *
+ * Expects, after the options, exactly two operands: the dataset and the
+ * path to mount it on.  -o supplies mount options; -O and -m are accepted
+ * but have no effect in this port.
+ *
+ * Returns 0 on success, 1 if the dataset cannot be opened or mounted, and
+ * 2 on a usage error.
+ */
+static int
+manual_mount(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	/*
+	 * Start with an empty string: the result of zfs_prop_get() below is
+	 * deliberately not checked, and the buffer must still hold a valid
+	 * string for strcmp() if the lookup fails.
+	 */
+	char mountpoint[ZFS_MAXPROPLEN] = { '\0' };
+	char mntopts[MNT_LINE_MAX] = { '\0' };
+	int ret;
+	int c;
+	int flags = 0;
+	char *dataset, *path;
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":mo:O")) != -1) {
+		switch (c) {
+		case 'o':
+			(void) strlcpy(mntopts, optarg, sizeof (mntopts));
+			break;
+		case 'O':
+#if 0	/* FreeBSD: No support for MS_OVERLAY. */
+			flags |= MS_OVERLAY;
+#endif
+			break;
+		case 'm':
+#if 0	/* FreeBSD: No support for MS_NOMNTTAB. */
+			flags |= MS_NOMNTTAB;
+#endif
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			(void) fprintf(stderr, gettext("usage: mount [-o opts] "
+			    "<path>\n"));
+			return (2);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check that we only have two arguments */
+	if (argc != 2) {
+		if (argc == 0)
+			(void) fprintf(stderr, gettext("missing dataset "
+			    "argument\n"));
+		else if (argc == 1)
+			(void) fprintf(stderr,
+			    gettext("missing mountpoint argument\n"));
+		else
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+		(void) fprintf(stderr, "usage: mount <dataset> <mountpoint>\n");
+		return (2);
+	}
+
+	dataset = argv[0];
+	path = argv[1];
+
+	/* try to open the dataset */
+	if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_FILESYSTEM)) == NULL)
+		return (1);
+
+	(void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
+	    sizeof (mountpoint), NULL, NULL, 0, B_FALSE);
+
+	/* check for legacy mountpoint and complain appropriately */
+	ret = 0;
+	if (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0) {
+		if (zmount(dataset, path, flags, MNTTYPE_ZFS,
+		    NULL, 0, mntopts, sizeof (mntopts)) != 0) {
+			(void) fprintf(stderr, gettext("mount failed: %s\n"),
+			    strerror(errno));
+			ret = 1;
+		}
+	} else {
+		(void) fprintf(stderr, gettext("filesystem '%s' cannot be "
+		    "mounted using 'mount -F zfs'\n"), dataset);
+		(void) fprintf(stderr, gettext("Use 'zfs set mountpoint=%s' "
+		    "instead.\n"), path);
+		(void) fprintf(stderr, gettext("If you must use 'mount -F zfs' "
+		    "or /etc/vfstab, use 'zfs set mountpoint=legacy'.\n"));
+		(void) fprintf(stderr, gettext("See zfs(1M) for more "
+		    "information.\n"));
+		ret = 1;
+	}
+
+	/* Release the handle on every path past a successful zfs_open(). */
+	zfs_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * Entry point used when the program is invoked as the stand-alone umount
+ * helper (/etc/fs/zfs/umount).  In contrast to manual_mount(), filesystems
+ * with a non-legacy mountpoint may be unmounted this way too, as this is
+ * the dominant administrative interface.
+ *
+ * Accepts -f (forced unmount) followed by exactly one path.  Returns 2 on a
+ * usage error, otherwise the result of unshare_unmount_path().
+ */
+static int
+manual_unmount(int argc, char **argv)
+{
+	int unmount_flags = 0;
+	int opt;
+
+	/* check options */
+	for (;;) {
+		opt = getopt(argc, argv, "f");
+		if (opt == -1)
+			break;
+
+		if (opt == 'f') {
+			unmount_flags = MS_FORCE;
+		} else if (opt == '?') {
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			(void) fprintf(stderr, gettext("usage: unmount [-f] "
+			    "<path>\n"));
+			return (2);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* exactly one operand: hand it to the shared path-based routine */
+	if (argc == 1)
+		return (unshare_unmount_path(OP_MOUNT, argv[0],
+		    unmount_flags, B_TRUE));
+
+	/* wrong number of operands */
+	if (argc == 0)
+		(void) fprintf(stderr, gettext("missing path "
+		    "argument\n"));
+	else
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+	(void) fprintf(stderr, gettext("usage: unmount [-f] <path>\n"));
+	return (2);
+}
+
+/*
+ * zpool_iter() callback for do_volcheck().  'data' points at a boolean_t:
+ * when it is true the pool's zvol links are created, otherwise they are
+ * removed.  Returns the result of the libzfs call that was made.
+ */
+static int
+volcheck(zpool_handle_t *zhp, void *data)
+{
+	const boolean_t creating = *(boolean_t *)data;
+	int rc;
+
+	if (creating)
+		rc = zpool_create_zvol_links(zhp);
+	else
+		rc = zpool_remove_zvol_links(zhp);
+
+	return (rc);
+}
+
+/*
+ * Visit every pool in the system via zpool_iter() and create (isinit true)
+ * or remove (isinit false) its /dev/zvol links.  Returns 0 if the iteration
+ * reported success and 1 otherwise.
+ */
+static int
+do_volcheck(boolean_t isinit)
+{
+	int err = zpool_iter(g_zfs, volcheck, &isinit);
+
+	if (err != 0)
+		return (1);
+	return (0);
+}
+
+/*
+ * Program entry point.
+ *
+ * Initializes libzfs and opens the mount table, then dispatches in one of
+ * three ways: as the stand-alone mount or umount helper when the program
+ * name is "mount" or "umount", to the hidden 'volinit'/'volfini' commands,
+ * or to the matching entry in command_table. Returns the value produced by
+ * the selected handler, or 1 if initialization fails.
+ */
+int
+main(int argc, char **argv)
+{
+ int ret;
+ int i;
+ char *progname;
+ char *cmdname;
+
+ (void) setlocale(LC_ALL, "");
+ (void) textdomain(TEXT_DOMAIN);
+
+ /* Silence getopt()'s own diagnostics; each command prints its own. */
+ opterr = 0;
+
+ if ((g_zfs = libzfs_init()) == NULL) {
+ (void) fprintf(stderr, gettext("internal error: failed to "
+ "initialize ZFS library\n"));
+ return (1);
+ }
+
+ libzfs_print_on_error(g_zfs, B_TRUE);
+
+ if ((mnttab_file = fopen(MNTTAB, "r")) == NULL) {
+ (void) fprintf(stderr, gettext("internal error: unable to "
+ "open %s\n"), MNTTAB);
+ return (1);
+ }
+
+ /*
+ * This command also doubles as the /etc/fs mount and unmount program.
+ * Determine if we should take this behavior based on argv[0].
+ */
+ progname = basename(argv[0]);
+ if (strcmp(progname, "mount") == 0) {
+ ret = manual_mount(argc, argv);
+ } else if (strcmp(progname, "umount") == 0) {
+ ret = manual_unmount(argc, argv);
+ } else {
+ /*
+ * Make sure the user has specified some command.
+ */
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing command\n"));
+ usage(B_FALSE);
+ }
+
+ cmdname = argv[1];
+
+ /*
+ * The 'umount' command is an alias for 'unmount'
+ */
+ if (strcmp(cmdname, "umount") == 0)
+ cmdname = "unmount";
+
+ /*
+ * The 'recv' command is an alias for 'receive'
+ */
+ if (strcmp(cmdname, "recv") == 0)
+ cmdname = "receive";
+
+ /*
+ * Special case '-?'
+ */
+ if (strcmp(cmdname, "-?") == 0)
+ usage(B_TRUE);
+
+ /*
+ * 'volinit' and 'volfini' do not appear in the usage message,
+ * so we have to special case them here.
+ *
+ * NOTE: these return directly, bypassing the fclose(),
+ * libzfs_fini() and ZFS_ABORT handling at the end of main().
+ */
+ if (strcmp(cmdname, "volinit") == 0)
+ return (do_volcheck(B_TRUE));
+ else if (strcmp(cmdname, "volfini") == 0)
+ return (do_volcheck(B_FALSE));
+
+ /*
+ * Run the appropriate command.
+ */
+ for (i = 0; i < NCOMMAND; i++) {
+ /* Entries with a NULL name are skipped. */
+ if (command_table[i].name == NULL)
+ continue;
+
+ if (strcmp(cmdname, command_table[i].name) == 0) {
+ current_command = &command_table[i];
+ /* Handlers see the subcommand name as their argv[0]. */
+ ret = command_table[i].func(argc - 1, argv + 1);
+ break;
+ }
+ }
+
+ /* The loop ran to completion, so no command matched. */
+ if (i == NCOMMAND) {
+ (void) fprintf(stderr, gettext("unrecognized "
+ "command '%s'\n"), cmdname);
+ usage(B_FALSE);
+ }
+ }
+
+ (void) fclose(mnttab_file);
+
+ libzfs_fini(g_zfs);
+
+ /*
+ * The 'ZFS_ABORT' environment variable causes us to dump core on exit
+ * for the purposes of running ::findleaks.
+ */
+ if (getenv("ZFS_ABORT") != NULL) {
+ (void) printf("dumping core by request\n");
+ abort();
+ }
+
+ return (ret);
+}
diff --git a/contrib/opensolaris/cmd/zfs/zfs_util.h b/contrib/opensolaris/cmd/zfs/zfs_util.h
new file mode 100644
index 0000000..c7f2f16
--- /dev/null
+++ b/contrib/opensolaris/cmd/zfs/zfs_util.h
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZFS_UTIL_H
+#define _ZFS_UTIL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <libzfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Allocate 'size' bytes; see the definition for the failure behaviour. */
+void *safe_malloc(size_t size);
+
+/*
+ * Library handle shared by the zfs(8) sources.  This is a declaration only:
+ * the object itself must be defined in exactly one .c file.  Defining it
+ * here would create a separate definition in every file that includes this
+ * header, which fails to link when common symbols are disabled.
+ */
+extern libzfs_handle_t *g_zfs;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_UTIL_H */
diff --git a/contrib/opensolaris/cmd/zpool/zpool.8 b/contrib/opensolaris/cmd/zpool/zpool.8
new file mode 100644
index 0000000..0fd5b71
--- /dev/null
+++ b/contrib/opensolaris/cmd/zpool/zpool.8
@@ -0,0 +1,1113 @@
+'\" te
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\" Copyright (c) 2006, Sun Microsystems, Inc. All Rights Reserved.
+.TH zpool 1M "14 Nov 2006" "SunOS 5.11" "System Administration Commands"
+.SH NAME
+zpool \- configures ZFS storage pools
+.SH SYNOPSIS
+.LP
+.nf
+\fBzpool\fR [\fB-?\fR]
+.fi
+.LP
+.nf
+\fBzpool create\fR [\fB-fn\fR] [\fB-R\fR \fIroot\fR] [\fB-m\fR \fImountpoint\fR] \fIpool\fR \fIvdev ...\fR
+.fi
+.LP
+.nf
+\fBzpool destroy\fR [\fB-f\fR] \fIpool\fR
+.fi
+.LP
+.nf
+\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR
+.fi
+.LP
+.nf
+\fBzpool remove\fR \fIpool\fR \fIvdev\fR
+.fi
+.LP
+.nf
+\fBzpool list\fR [\fB-H\fR] [\fB-o\fR \fIfield\fR[,\fIfield\fR]*] [\fIpool\fR] ...
+.fi
+.LP
+.nf
+\fBzpool iostat\fR [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR [\fIcount\fR]]
+.fi
+.LP
+.nf
+\fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ...
+.fi
+.LP
+.nf
+\fBzpool offline\fR [\fB-t\fR] \fIpool\fR \fIdevice\fR ...
+.fi
+.LP
+.nf
+\fBzpool online\fR \fIpool\fR \fIdevice\fR ...
+.fi
+.LP
+.nf
+\fBzpool clear\fR \fIpool\fR [\fIdevice\fR] ...
+.fi
+.LP
+.nf
+\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR
+.fi
+.LP
+.nf
+\fBzpool detach\fR \fIpool\fR \fIdevice\fR
+.fi
+.LP
+.nf
+\fBzpool replace\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR [\fInew_device\fR]
+.fi
+.LP
+.nf
+\fBzpool scrub\fR [\fB-s\fR] \fIpool\fR ...
+.fi
+.LP
+.nf
+\fBzpool export\fR [\fB-f\fR] \fIpool\fR
+.fi
+.LP
+.nf
+\fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR]
+.fi
+.LP
+.nf
+\fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR] [\fB-f\fR] [\fB-o \fIopts\fR\fR] [\fB-R \fR\fIroot\fR] \fIpool\fR | \fIid\fR
+ [\fInewpool\fR]
+.fi
+.LP
+.nf
+\fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR] [\fB-f\fR] [\fB-a\fR]
+.fi
+.LP
+.nf
+\fBzpool upgrade\fR
+.fi
+.LP
+.nf
+\fBzpool upgrade\fR \fB-v\fR
+.fi
+.LP
+.nf
+\fBzpool upgrade\fR [\fB-a\fR | \fIpool\fR]
+.fi
+.LP
+.nf
+\fBzpool history\fR [\fIpool\fR] ...
+.fi
+
+.SH DESCRIPTION
+
+.LP
+The \fBzpool\fR command configures \fBZFS\fR storage pools. A storage pool is a collection of devices that provides physical storage and data replication for \fBZFS\fR datasets.
+.LP
+All datasets within a storage pool share the same space. See \fBzfs\fR(1M) for information on managing datasets.
+.SS Virtual Devices (vdevs)
+
+.LP
+A "virtual device" describes a single device or a collection of devices organized according to certain performance and fault characteristics. The following virtual devices are supported:
+.sp
+.ne 2
+.mk
+.na
+\fBdisk\fR
+.ad
+.RS 10n
+.rt
+A block device, typically located under "/dev/dsk". \fBZFS\fR can use individual slices or partitions, though the recommended mode of operation is to use whole disks. A disk can be specified by a full path, or it can be a shorthand name (the relative portion
+of the path under "/dev/dsk"). A whole disk can be specified by omitting the slice or partition designation. For example, "c0t0d0" is equivalent to "/dev/dsk/c0t0d0s2". When given a whole disk, \fBZFS\fR automatically labels the disk, if necessary.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBfile\fR
+.ad
+.RS 10n
+.rt
+A regular file. The use of files as a backing store is strongly discouraged. It is designed primarily for experimental purposes, as the fault tolerance of a file is only as good as the file system of which it is a part. A file must be specified by a full path.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBmirror\fR
+.ad
+.RS 10n
+.rt
+A mirror of two or more devices. Data is replicated in an identical fashion across all components of a mirror. A mirror with \fIN\fR disks of size \fIX\fR can hold \fIX\fR bytes and can withstand (\fIN-1\fR)
+devices failing before data integrity is compromised.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBraidz\fR
+.ad
+.br
+.na
+\fBraidz1\fR
+.ad
+.br
+.na
+\fBraidz2\fR
+.ad
+.RS 10n
+.rt
+A variation on \fBRAID-5\fR that allows for better distribution of parity and eliminates the "\fBRAID-5\fR write hole" (in which data and parity become inconsistent after a power loss). Data and parity are striped across all disks within a \fBraidz\fR group.
+.sp
+A \fBraidz\fR group can have either single- or double-parity, meaning that the \fBraidz\fR group can sustain one or two failures respectively without losing any data. The \fBraidz1\fR \fBvdev\fR type specifies a single-parity \fBraidz\fR group
+and the \fBraidz2\fR \fBvdev\fR type specifies a double-parity \fBraidz\fR group. The \fBraidz\fR \fBvdev\fR type is an alias for \fBraidz1\fR.
+.sp
+A \fBraidz\fR group with \fIN\fR disks of size \fIX\fR with \fIP\fR parity disks can hold approximately (\fIN-P\fR)*\fIX\fR bytes and can withstand \fIP\fR device(s) failing before
+data integrity is compromised. The minimum number of devices in a \fBraidz\fR group is one more than the number of parity disks. The recommended number is between 3 and 9.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBspare\fR
+.ad
+.RS 10n
+.rt
+A special pseudo-\fBvdev\fR which keeps track of available hot spares for a pool. For more information, see the "Hot Spares" section.
+.RE
+
+.LP
+Virtual devices cannot be nested, so a mirror or \fBraidz\fR virtual device can only contain files or disks. Mirrors of mirrors (or other combinations) are not allowed.
+.LP
+A pool can have any number of virtual devices at the top of the configuration (known as "root vdevs"). Data is dynamically distributed across all top-level devices to balance data among devices. As new virtual devices are added, \fBZFS\fR automatically places data
+on the newly available devices.
+.LP
+Virtual devices are specified one at a time on the command line, separated by whitespace. The keywords "mirror" and "raidz" are used to distinguish where a group ends and another begins. For example, the following creates two root vdevs, each a mirror of two disks:
+.sp
+.in +2
+.nf
+\fB# zpool create mypool mirror c0t0d0 c0t1d0 mirror c1t0d0 c1t1d0\fR
+.fi
+.in -2
+.sp
+
+.SS Device Failure and Recovery
+
+.LP
+\fBZFS\fR supports a rich set of mechanisms for handling device failure and data corruption. All metadata and data is checksummed, and \fBZFS\fR automatically repairs bad data from a good copy when corruption is detected.
+.LP
+In order to take advantage of these features, a pool must make use of some form of redundancy, using either mirrored or \fBraidz\fR groups. While \fBZFS\fR supports running in a non-redundant configuration, where each root vdev is simply a disk or file, this is
+strongly discouraged. A single case of bit corruption can render some or all of your data unavailable.
+.LP
+A pool's health status is described by one of three states: online, degraded, or faulted. An online pool has all devices operating normally. A degraded pool is one in which one or more devices have failed, but the data is still available due to a redundant configuration. A faulted pool has
+one or more failed devices, and there is insufficient redundancy to replicate the missing data.
+.SS Hot Spares
+
+.LP
+\fBZFS\fR allows devices to be associated with pools as "hot spares". These devices are not actively used in the pool, but when an active device fails, it is automatically replaced by a hot spare. To create a pool with hot spares, specify a "spare" \fBvdev\fR with any number of devices. For example,
+.sp
+.in +2
+.nf
+# zpool create pool mirror c0d0 c1d0 spare c2d0 c3d0
+.fi
+.in -2
+.sp
+
+.LP
+Spares can be shared across multiple pools, and can be added with the "zpool add" command and removed with the "zpool remove" command. Once a spare replacement is initiated, a new "spare" \fBvdev\fR is created within the configuration that
+will remain there until the original device is replaced. At this point, the hot spare becomes available again if another device fails.
+.LP
+An in-progress spare replacement can be cancelled by detaching the hot spare. If the original faulted device is detached, then the hot spare assumes its place in the configuration, and is removed from the spare list of all active pools.
+.SS Alternate Root Pools
+
+.LP
+The "zpool create -R" and "zpool import -R" commands allow users to create and import a pool with a different root path. By default, whenever a pool is created or imported on a system, it is permanently added so that it is available whenever the system boots. For
+removable media, or when in recovery situations, this may not always be desirable. An alternate root pool does not persist on the system. Instead, it exists only until exported or the system is rebooted, at which point it will have to be imported again.
+.LP
+In addition, all mount points in the pool are prefixed with the given root, so a pool can be constrained to a particular area of the file system. This is most useful when importing unknown pools from removable media, as the mount points of any file systems cannot be trusted.
+.LP
+When creating an alternate root pool, the default mount point is "/", rather than the normal default "/\fIpool\fR".
+.SS Subcommands
+
+.LP
+All subcommands that modify state are logged persistently to the pool in their original form.
+.LP
+The \fBzpool\fR command provides subcommands to create and destroy storage pools, add capacity to storage pools, and provide information about the storage pools. The following subcommands are supported:
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool\fR \fB-?\fR\fR
+.ad
+.sp .6
+.RS 4n
+Displays a help message.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool create\fR [\fB-fn\fR] [\fB-R\fR \fIroot\fR] [\fB-m\fR \fImountpoint\fR] \fIpool\fR \fIvdev ...\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates a new storage pool containing the virtual devices specified on the command line. The pool name must begin with a letter, and can only contain alphanumeric characters as well as underscore ("_"), dash ("-"), and period ("."). The pool
+names "mirror", "raidz", and "spare" are reserved, as are names beginning with the pattern "c[0-9]". The \fBvdev\fR specification is described in the "Virtual Devices" section.
+.sp
+The command verifies that each device specified is accessible and not currently in use by another subsystem. There are some uses, such as being currently mounted, or specified as the dedicated dump device, that prevent a device from ever being used by \fBZFS\fR. Other uses,
+such as having a preexisting \fBUFS\fR file system, can be overridden with the \fB-f\fR option.
+.sp
+The command also checks that the replication strategy for the pool is consistent. An attempt to combine redundant and non-redundant storage in a single pool, or to mix disks and files, results in an error unless \fB-f\fR is specified. The use of differently sized devices within
+a single \fBraidz\fR or mirror group is also flagged as an error unless \fB-f\fR is specified.
+.sp
+Unless the \fB-R\fR option is specified, the default mount point is "/\fIpool\fR". The mount point must not exist or must be empty, or else the root dataset cannot be mounted. This can be overridden with the \fB-m\fR option.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-f\fR\fR
+.ad
+.RS 17n
+.rt
+Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-n\fR\fR
+.ad
+.RS 17n
+.rt
+Displays the configuration that would be used without actually creating the pool. The actual pool creation can still fail due to insufficient privileges or device sharing.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-R\fR \fIroot\fR\fR
+.ad
+.RS 17n
+.rt
+Creates the pool with an alternate \fIroot\fR. See the "Alternate Root Pools" section. The root dataset has its mount point set to "/" as part of this operation.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-m\fR \fImountpoint\fR\fR
+.ad
+.RS 17n
+.rt
+Sets the mount point for the root dataset. The default mount point is "/\fIpool\fR". The mount point must be an absolute path, "\fBlegacy\fR", or "\fBnone\fR". For more information on dataset mount
+points, see \fBzfs\fR(1M).
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool destroy\fR [\fB-f\fR] \fIpool\fR\fR
+.ad
+.sp .6
+.RS 4n
+Destroys the given pool, freeing up any devices for other use. This command tries to unmount any active datasets before destroying the pool.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-f\fR\fR
+.ad
+.RS 6n
+.rt
+Forces any active datasets contained within the pool to be unmounted.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev ...\fR\fR
+.ad
+.sp .6
+.RS 4n
+Adds the specified virtual devices to the given pool. The \fIvdev\fR specification is described in the "Virtual Devices" section. The behavior of the \fB-f\fR option, and the device checks performed are described in the "zpool create"
+subcommand.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-f\fR\fR
+.ad
+.RS 6n
+.rt
+Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-n\fR\fR
+.ad
+.RS 6n
+.rt
+Displays the configuration that would be used without actually adding the \fBvdev\fRs. The actual pool creation can still fail due to insufficient privileges or device sharing.
+.RE
+
+Do not add a disk that is currently configured as a quorum device to a zpool. Once a disk is in a zpool, that disk can then be configured as a quorum device.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool remove\fR \fIpool\fR \fIvdev\fR\fR
+.ad
+.sp .6
+.RS 4n
+Removes the given \fBvdev\fR from the pool. This command currently only supports removing hot spares. Devices which are part of a mirror can be removed using the "zpool detach" command. \fBRaidz\fR and top-level \fBvdevs\fR cannot
+be removed from a pool.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool list\fR [\fB-H\fR] [\fB-o\fR \fIfield\fR[,\fIfield\fR]*] [\fIpool\fR] ...\fR
+.ad
+.sp .6
+.RS 4n
+Lists the given pools along with a health status and space usage. When given no arguments, all pools in the system are listed.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-H\fR\fR
+.ad
+.RS 12n
+.rt
+Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR \fIfield\fR\fR
+.ad
+.RS 12n
+.rt
+Comma-separated list of fields to display. Each field must be one of:
+.sp
+.in +2
+.nf
+name Pool name
+size Total size
+used Amount of space used
+available Amount of space available
+capacity Percentage of pool space used
+health Health status
+.fi
+.in -2
+.sp
+
+The default is all fields.
+.RE
+
+This command reports actual physical space available to the storage pool. The physical space can be different from the total amount of space that any contained datasets can actually use. The amount of space used in a \fBraidz\fR configuration depends on the characteristics of
+the data being written. In addition, \fBZFS\fR reserves some space for internal accounting that the \fBzfs\fR(1M) command takes into account, but the \fBzpool\fR command does not. For non-full pools of a reasonable size, these effects should be invisible. For small pools, or pools that are close to being completely full, these discrepancies may become more noticeable.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool iostat\fR [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR [\fIcount\fR]]\fR
+.ad
+.sp .6
+.RS 4n
+Displays \fBI/O\fR statistics for the given pools. When given an interval, the statistics are printed every \fIinterval\fR seconds until \fBCtrl-C\fR is pressed. If no \fIpools\fR are specified, statistics for
+every pool in the system are shown. If \fIcount\fR is specified, the command exits after \fIcount\fR reports are printed.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-v\fR\fR
+.ad
+.RS 6n
+.rt
+Verbose statistics. Reports usage statistics for individual \fIvdevs\fR within the pool, in addition to the pool-wide statistics.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ...\fR
+.ad
+.sp .6
+.RS 4n
+Displays the detailed health status for the given pools. If no \fIpool\fR is specified, then the status of each pool in the system is displayed.
+.sp
+If a scrub or resilver is in progress, this command reports the percentage done and the estimated time to completion. Both of these are only approximate, because the amount of data in the pool and the other workloads on the system can change.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-x\fR\fR
+.ad
+.RS 6n
+.rt
+Only display status for pools that are exhibiting errors or are otherwise unavailable.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-v\fR\fR
+.ad
+.RS 6n
+.rt
+Displays verbose data error information, printing out a complete list of all data errors since the last complete pool scrub.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool offline\fR [\fB-t\fR] \fIpool\fR \fIdevice\fR ...\fR
+.ad
+.sp .6
+.RS 4n
+Takes the specified physical device offline. While the \fIdevice\fR is offline, no attempt is made to read or write to the device.
+.sp
+This command is not applicable to spares.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-t\fR\fR
+.ad
+.RS 6n
+.rt
+Temporary. Upon reboot, the specified physical device reverts to its previous state.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool online\fR \fIpool\fR \fIdevice\fR ...\fR
+.ad
+.sp .6
+.RS 4n
+Brings the specified physical device online.
+.sp
+This command is not applicable to spares.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool clear\fR \fIpool\fR [\fIdevice\fR] ...\fR
+.ad
+.sp .6
+.RS 4n
+Clears device errors in a pool. If no arguments are specified, all device errors within the pool are cleared. If one or more devices is specified, only those errors associated with the specified device or devices are cleared.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR\fR
+.ad
+.sp .6
+.RS 4n
+Attaches \fInew_device\fR to an existing \fBzpool\fR device. The existing device cannot be part of a \fBraidz\fR configuration. If \fIdevice\fR is not currently part of a mirrored configuration, \fIdevice\fR automatically
+transforms into a two-way mirror of \fIdevice\fR and \fInew_device\fR. If \fIdevice\fR is part of a two-way mirror, attaching \fInew_device\fR creates a three-way mirror, and so on. In either case, \fInew_device\fR begins to resilver immediately.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-f\fR\fR
+.ad
+.RS 6n
+.rt
+Forces use of \fInew_device\fR, even if it appears to be in use. Not all devices can be overridden in this manner.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool detach\fR \fIpool\fR \fIdevice\fR\fR
+.ad
+.sp .6
+.RS 4n
+Detaches \fIdevice\fR from a mirror. The operation is refused if there are no other valid replicas of the data.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool replace\fR [\fB-f\fR] \fIpool\fR \fIold_device\fR [\fInew_device\fR]\fR
+.ad
+.sp .6
+.RS 4n
+Replaces \fIold_device\fR with \fInew_device\fR. This is equivalent to attaching \fInew_device\fR, waiting for it to resilver, and then detaching \fIold_device\fR.
+.sp
+The size of \fInew_device\fR must be greater than or equal to the minimum size of all the devices in a mirror or \fBraidz\fR configuration.
+.sp
+If \fInew_device\fR is not specified, it defaults to \fIold_device\fR. This form of replacement is useful after an existing disk has failed and has been physically replaced. In this case, the new disk may have the same \fB/dev/dsk\fR path
+as the old device, even though it is actually a different disk. \fBZFS\fR recognizes this.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-f\fR\fR
+.ad
+.RS 6n
+.rt
+Forces use of \fInew_device\fR, even if it appears to be in use. Not all devices can be overridden in this manner.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool scrub\fR [\fB-s\fR] \fIpool\fR ...\fR
+.ad
+.sp .6
+.RS 4n
+Begins a scrub. The scrub examines all data in the specified pools to verify that it checksums correctly. For replicated (mirror or \fBraidz\fR) devices, \fBZFS\fR automatically repairs any damage discovered during the scrub. The "\fBzpool
+status\fR" command reports the progress of the scrub and summarizes the results of the scrub upon completion.
+.sp
+Scrubbing and resilvering are very similar operations. The difference is that resilvering only examines data that \fBZFS\fR knows to be out of date (for example, when attaching a new device to a mirror or replacing an existing device), whereas scrubbing examines all data to
+discover silent errors due to hardware faults or disk failure.
+.sp
+Because scrubbing and resilvering are \fBI/O\fR-intensive operations, \fBZFS\fR only allows one at a time. If a scrub is already in progress, the "\fBzpool scrub\fR" command terminates it and starts a new scrub. If a resilver is in progress, \fBZFS\fR does not allow a scrub to be started until the resilver completes.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-s\fR\fR
+.ad
+.RS 6n
+.rt
+Stop scrubbing.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool export\fR [\fB-f\fR] \fIpool\fR ...\fR
+.ad
+.sp .6
+.RS 4n
+Exports the given pools from the system. All devices are marked as exported, but are still considered in use by other subsystems. The devices can be moved between systems (even those of different endianness) and imported as long as a sufficient number of devices are present.
+.sp
+Before exporting the pool, all datasets within the pool are unmounted.
+.sp
+For pools to be portable, you must give the \fBzpool\fR command whole disks, not just slices, so that \fBZFS\fR can label the disks with portable \fBEFI\fR labels. Otherwise, disk drivers on platforms of different endianness will not recognize the disks.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-f\fR\fR
+.ad
+.RS 6n
+.rt
+Forcefully unmount all datasets, using the "\fBunmount -f\fR" command.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR]\fR
+.ad
+.sp .6
+.RS 4n
+Lists pools available to import. If the \fB-d\fR option is not specified, this command searches for devices in "/dev/dsk". The \fB-d\fR option can be specified multiple times, and all directories are searched. If the device appears to be part of
+an exported pool, this command displays a summary of the pool with the name of the pool, a numeric identifier, as well as the \fIvdev\fR layout and current health of the device for each device or file. Destroyed pools, pools that were previously destroyed with the "\fBzpool destroy\fR" command, are not listed unless the \fB-D\fR option is specified.
+.sp
+The numeric identifier is unique, and can be used instead of the pool name when multiple exported pools of the same name are available.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-d\fR \fIdir\fR\fR
+.ad
+.RS 10n
+.rt
+Searches for devices or files in \fIdir\fR. The \fB-d\fR option can be specified multiple times.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-D\fR\fR
+.ad
+.RS 10n
+.rt
+Lists destroyed pools only.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR] [\fB-f\fR] [\fB-o\fR \fIopts\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR | \fIid\fR [\fInewpool\fR]\fR
+.ad
+.sp .6
+.RS 4n
+Imports a specific pool. A pool can be identified by its name or the numeric identifier. If \fInewpool\fR is specified, the pool is imported using the name \fInewpool\fR. Otherwise, it is imported with the same name as its exported name.
+.sp
+If a device is removed from a system without running "\fBzpool export\fR" first, the device appears as potentially active. It cannot be determined if this was a failed export, or whether the device is really in use from another host. To import a pool in this state,
+the \fB-f\fR option is required.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-d\fR \fIdir\fR\fR
+.ad
+.RS 11n
+.rt
+Searches for devices or files in \fIdir\fR. The \fB-d\fR option can be specified multiple times.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-D\fR\fR
+.ad
+.RS 11n
+.rt
+Imports destroyed pool. The \fB-f\fR option is also required.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-f\fR\fR
+.ad
+.RS 11n
+.rt
+Forces import, even if the pool appears to be potentially active.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR \fIopts\fR\fR
+.ad
+.RS 11n
+.rt
+Comma-separated list of mount options to use when mounting datasets within the pool. See \fBzfs\fR(1M) for a description of dataset properties and mount
+options.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-R\fR \fIroot\fR\fR
+.ad
+.RS 11n
+.rt
+Imports pool(s) with an alternate \fIroot\fR. See the "Alternate Root Pools" section.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR] [\fB-f\fR] [\fB-a\fR]\fR
+.ad
+.sp .6
+.RS 4n
+Imports all pools found in the search directories. Identical to the previous command, except that all pools with a sufficient number of devices available are imported. Destroyed pools, pools that were previously destroyed with the "\fBzpool destroy\fR" command,
+will not be imported unless the \fB-D\fR option is specified.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-d\fR \fIdir\fR\fR
+.ad
+.RS 10n
+.rt
+Searches for devices or files in \fIdir\fR. The \fB-d\fR option can be specified multiple times.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-D\fR\fR
+.ad
+.RS 10n
+.rt
+Imports destroyed pools only. The \fB-f\fR option is also required.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-f\fR\fR
+.ad
+.RS 10n
+.rt
+Forces import, even if the pool appears to be potentially active.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool upgrade\fR\fR
+.ad
+.sp .6
+.RS 4n
+Displays all pools formatted using a different \fBZFS\fR on-disk version. Older versions can continue to be used, but some features may not be available. These pools can be upgraded using "\fBzpool upgrade -a\fR". Pools that are formatted with
+a more recent version are also displayed, although these pools will be inaccessible on the system.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool upgrade\fR \fB-v\fR\fR
+.ad
+.sp .6
+.RS 4n
+Displays \fBZFS\fR versions supported by the current software. The current \fBZFS\fR versions and all previous supported versions are displayed, along with an explanation of the features provided with each version.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool upgrade\fR [\fB-a\fR | \fIpool\fR]\fR
+.ad
+.sp .6
+.RS 4n
+Upgrades the given pool to the latest on-disk version. Once this is done, the pool will no longer be accessible on systems running older versions of the software.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-a\fR\fR
+.ad
+.RS 6n
+.rt
+Upgrades all pools.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzpool history\fR [\fIpool\fR] ...\fR
+.ad
+.sp .6
+.RS 4n
+Displays the command history of the specified pools (or all pools if no pool is specified).
+.RE
+
+.SH EXAMPLES
+.LP
+\fBExample 1 \fRCreating a RAID-Z Storage Pool
+
+.LP
+The following command creates a pool with a single \fBraidz\fR root \fIvdev\fR that consists of six disks.
+.sp
+.in +2
+.nf
+\fB# zpool create tank raidz c0t0d0 c0t1d0 c0t2d0 c0t3d0 c0t4d0 c0t5d0\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 2 \fRCreating a Mirrored Storage Pool
+
+.LP
+The following command creates a pool with two mirrors, where each mirror contains two disks.
+.sp
+.in +2
+.nf
+\fB# zpool create tank mirror c0t0d0 c0t1d0 mirror c0t2d0 c0t3d0\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 3 \fRCreating a ZFS Storage Pool by Using Slices
+
+.LP
+The following command creates an unmirrored pool using two disk slices.
+.sp
+.in +2
+.nf
+\fB# zpool create tank /dev/dsk/c0t0d0s1 c0t1d0s4\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 4 \fRCreating a ZFS Storage Pool by Using Files
+
+.LP
+The following command creates an unmirrored pool using files. While not recommended, a pool based on files can be useful for experimental purposes.
+.sp
+.in +2
+.nf
+\fB# zpool create tank /path/to/file/a /path/to/file/b\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 5 \fRAdding a Mirror to a ZFS Storage Pool
+
+.LP
+The following command adds two mirrored disks to the pool "\fItank\fR", assuming the pool is already made up of two-way mirrors. The additional space is immediately available to any datasets within the pool.
+.sp
+.in +2
+.nf
+\fB# zpool add tank mirror c1t0d0 c1t1d0\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 6 \fRListing Available ZFS Storage Pools
+
+.LP
+The following command lists all available pools on the system. In this case, the pool \fIzion\fR is faulted due to a missing device.
+.LP
+The results from this command are similar to the following:
+.sp
+.in +2
+.nf
+\fB# zpool list\fR
+ NAME SIZE USED AVAIL CAP HEALTH ALTROOT
+ pool 67.5G 2.92M 67.5G 0% ONLINE -
+ tank 67.5G 2.92M 67.5G 0% ONLINE -
+ zion - - - 0% FAULTED -
+.fi
+.in -2
+.sp
+.LP
+\fBExample 7 \fRDestroying a ZFS Storage Pool
+
+.LP
+The following command destroys the pool "\fItank\fR" and any datasets contained within.
+.sp
+.in +2
+.nf
+\fB# zpool destroy -f tank\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 8 \fRExporting a ZFS Storage Pool
+
+.LP
+The following command exports the devices in pool \fItank\fR so that they can be relocated or later imported.
+.sp
+.in +2
+.nf
+\fB# zpool export tank\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 9 \fRImporting a ZFS Storage Pool
+
+.LP
+The following command displays available pools, and then imports the pool "tank" for use on the system.
+.LP
+The results from this command are similar to the following:
+.sp
+.in +2
+.nf
+\fB# zpool import\fR
+ pool: tank
+ id: 15451357997522795478
+state: ONLINE
+action: The pool can be imported using its name or numeric identifier.
+config:
+
+ tank ONLINE
+ mirror ONLINE
+ c1t2d0 ONLINE
+ c1t3d0 ONLINE
+
+\fB# zpool import tank\fR
+.fi
+.in -2
+.sp
+.LP
+\fBExample 10 \fRUpgrading All ZFS Storage Pools to the Current Version
+
+.LP
+The following command upgrades all ZFS Storage pools to the current version of the software.
+.sp
+.in +2
+.nf
+\fB# zpool upgrade -a\fR
+This system is currently running ZFS version 2.
+.fi
+.in -2
+.sp
+.LP
+\fBExample 11 \fRManaging Hot Spares
+
+.LP
+The following command creates a new pool with an available hot spare:
+.sp
+.in +2
+.nf
+\fB# zpool create tank mirror c0t0d0 c0t1d0 spare c0t2d0\fR
+.fi
+.in -2
+.sp
+
+.LP
+If one of the disks were to fail, the pool would be reduced to the degraded state. The failed device can be replaced using the following command:
+.sp
+.in +2
+.nf
+\fB# zpool replace tank c0t0d0 c0t3d0\fR
+.fi
+.in -2
+.sp
+
+.LP
+Once the data has been resilvered, the spare is automatically removed and is made available should another device fail. The hot spare can be permanently removed from the pool using the following command:
+.sp
+.in +2
+.nf
+\fB# zpool remove tank c0t2d0\fR
+.fi
+.in -2
+.sp
+
+.SH EXIT STATUS
+
+.LP
+The following exit values are returned:
+.sp
+.ne 2
+.mk
+.na
+\fB\fB0\fR\fR
+.ad
+.RS 5n
+.rt
+Successful completion.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB1\fR\fR
+.ad
+.RS 5n
+.rt
+An error occurred.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB2\fR\fR
+.ad
+.RS 5n
+.rt
+Invalid command line options were specified.
+.RE
+
+.SH ATTRIBUTES
+
+.LP
+See \fBattributes\fR(5) for descriptions of the following attributes:
+.sp
+
+.sp
+.TS
+tab(:) box;
+cw(2.75i) |cw(2.75i)
+lw(2.75i) |lw(2.75i)
+.
+ATTRIBUTE TYPE:ATTRIBUTE VALUE
+_
+Availability:SUNWzfsu
+_
+Interface Stability:Evolving
+.TE
+
+.SH SEE ALSO
+
+.LP
+\fBzfs\fR(1M), \fBattributes\fR(5)
diff --git a/contrib/opensolaris/cmd/zpool/zpool_iter.c b/contrib/opensolaris/cmd/zpool/zpool_iter.c
new file mode 100644
index 0000000..f724179
--- /dev/null
+++ b/contrib/opensolaris/cmd/zpool/zpool_iter.c
@@ -0,0 +1,245 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <solaris.h>
+#include <libintl.h>
+#include <libuutil.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+
+#include <libzfs.h>
+
+#include "zpool_util.h"
+
+/*
+ * Private interface for iterating over pools specified on the command line.
+ * Most consumers will call for_each_pool, but in order to support iostat, we
+ * allow fine-grained control through the zpool_list_t interface.
+ */
+
+typedef struct zpool_node {
+ zpool_handle_t *zn_handle;
+ uu_avl_node_t zn_avlnode;
+ int zn_mark;
+} zpool_node_t;
+
+struct zpool_list {
+ boolean_t zl_findall;
+ uu_avl_t *zl_avl;
+ uu_avl_pool_t *zl_pool;
+};
+
+/* ARGSUSED */
+static int
+zpool_compare(const void *larg, const void *rarg, void *unused)
+{
+ zpool_handle_t *l = ((zpool_node_t *)larg)->zn_handle;
+ zpool_handle_t *r = ((zpool_node_t *)rarg)->zn_handle;
+ const char *lname = zpool_get_name(l);
+ const char *rname = zpool_get_name(r);
+
+ return (strcmp(lname, rname));
+}
+
+/*
+ * Callback function for pool_list_get(). Adds the given pool to the AVL tree
+ * of known pools.
+ */
+static int
+add_pool(zpool_handle_t *zhp, void *data)
+{
+ zpool_list_t *zlp = data;
+ zpool_node_t *node = safe_malloc(sizeof (zpool_node_t));
+ uu_avl_index_t idx;
+
+ node->zn_handle = zhp;
+ uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool);
+ if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) {
+ uu_avl_insert(zlp->zl_avl, node, idx);
+ } else {
+ zpool_close(zhp);
+ free(node);
+ return (-1);
+ }
+
+ return (0);
+}
+
+/*
+ * Create a list of pools based on the given arguments. If we're given no
+ * arguments, then iterate over all pools in the system and add them to the AVL
+ * tree. Otherwise, add only those pools explicitly specified on the command
+ * line.
+ */
+zpool_list_t *
+pool_list_get(int argc, char **argv, zpool_proplist_t **proplist, int *err)
+{
+ zpool_list_t *zlp;
+
+ zlp = safe_malloc(sizeof (zpool_list_t));
+
+ zlp->zl_pool = uu_avl_pool_create("zfs_pool", sizeof (zpool_node_t),
+ offsetof(zpool_node_t, zn_avlnode), zpool_compare, UU_DEFAULT);
+
+ if (zlp->zl_pool == NULL)
+ zpool_no_memory();
+
+ if ((zlp->zl_avl = uu_avl_create(zlp->zl_pool, NULL,
+ UU_DEFAULT)) == NULL)
+ zpool_no_memory();
+
+ if (argc == 0) {
+ (void) zpool_iter(g_zfs, add_pool, zlp);
+ zlp->zl_findall = B_TRUE;
+ } else {
+ int i;
+
+ for (i = 0; i < argc; i++) {
+ zpool_handle_t *zhp;
+
+ if ((zhp = zpool_open_canfail(g_zfs,
+ argv[i])) != NULL && add_pool(zhp, zlp) == 0) {
+ if (proplist &&
+ zpool_expand_proplist(zhp, proplist) != 0)
+ *err = B_TRUE;
+ } else
+ *err = B_TRUE;
+ }
+ }
+
+ return (zlp);
+}
+
+/*
+ * Search for any new pools, adding them to the list. We only add pools when no
+ * pool names were given on the command line. Otherwise, we keep the list fixed
+ * as those that were explicitly specified.
+ */
+void
+pool_list_update(zpool_list_t *zlp)
+{
+ if (zlp->zl_findall)
+ (void) zpool_iter(g_zfs, add_pool, zlp);
+}
+
+/*
+ * Iterate over all pools in the list, executing the callback for each
+ */
+int
+pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func,
+ void *data)
+{
+ zpool_node_t *node, *next_node;
+ int ret = 0;
+
+ for (node = uu_avl_first(zlp->zl_avl); node != NULL; node = next_node) {
+ next_node = uu_avl_next(zlp->zl_avl, node);
+ if (zpool_get_state(node->zn_handle) != POOL_STATE_UNAVAIL ||
+ unavail)
+ ret |= func(node->zn_handle, data);
+ }
+
+ return (ret);
+}
+
+/*
+ * Remove the given pool from the list. When running iostat, we want to remove
+ * those pools that no longer exist.
+ */
+void
+pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp)
+{
+ zpool_node_t search, *node;
+
+ search.zn_handle = zhp;
+ if ((node = uu_avl_find(zlp->zl_avl, &search, NULL, NULL)) != NULL) {
+ uu_avl_remove(zlp->zl_avl, node);
+ zpool_close(node->zn_handle);
+ free(node);
+ }
+}
+
+/*
+ * Free all the handles associated with this list.
+ *
+ * Each node is unlinked from the AVL tree, its pool handle is closed and the
+ * node itself is freed.  Afterwards the tree, the AVL node pool and the list
+ * structure are destroyed, so 'zlp' must not be used again by the caller.
+ *
+ * If the tree walk cannot be started, a diagnostic is written to stderr and
+ * the process exits with status 1.
+ */
+void
+pool_list_free(zpool_list_t *zlp)
+{
+	uu_avl_walk_t *walk;
+	zpool_node_t *node;
+
+	if ((walk = uu_avl_walk_start(zlp->zl_avl, UU_WALK_ROBUST)) == NULL) {
+		/*
+		 * Terminate the message with a newline so that it does not
+		 * run into whatever the shell prints next.
+		 */
+		(void) fprintf(stderr,
+		    gettext("internal error: out of memory\n"));
+		exit(1);
+	}
+
+	/*
+	 * Nodes are removed while the walk is in progress; presumably this is
+	 * why UU_WALK_ROBUST was requested above (confirm against libuutil).
+	 */
+	while ((node = uu_avl_walk_next(walk)) != NULL) {
+		uu_avl_remove(zlp->zl_avl, node);
+		zpool_close(node->zn_handle);
+		free(node);
+	}
+
+	uu_avl_walk_end(walk);
+	uu_avl_destroy(zlp->zl_avl);
+	uu_avl_pool_destroy(zlp->zl_pool);
+
+	free(zlp);
+}
+
+/*
+ * Returns the number of elements in the pool list.
+ */
+int
+pool_list_count(zpool_list_t *zlp)
+{
+ return (uu_avl_numnodes(zlp->zl_avl));
+}
+
+/*
+ * High level function which iterates over all pools given on the command line,
+ * using the pool_list_* interfaces.
+ */
+int
+for_each_pool(int argc, char **argv, boolean_t unavail,
+ zpool_proplist_t **proplist, zpool_iter_f func, void *data)
+{
+ zpool_list_t *list;
+ int ret = 0;
+
+ if ((list = pool_list_get(argc, argv, proplist, &ret)) == NULL)
+ return (1);
+
+ if (pool_list_iter(list, unavail, func, data) != 0)
+ ret = 1;
+
+ pool_list_free(list);
+
+ return (ret);
+}
diff --git a/contrib/opensolaris/cmd/zpool/zpool_main.c b/contrib/opensolaris/cmd/zpool/zpool_main.c
new file mode 100644
index 0000000..e8dc64a
--- /dev/null
+++ b/contrib/opensolaris/cmd/zpool/zpool_main.c
@@ -0,0 +1,3567 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <solaris.h>
+#include <assert.h>
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <libintl.h>
+#include <libuutil.h>
+#include <locale.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <priv.h>
+#include <sys/time.h>
+#include <sys/fs/zfs.h>
+
+#include <sys/stat.h>
+
+#include <libzfs.h>
+
+#include "zpool_util.h"
+
+static int zpool_do_create(int, char **);
+static int zpool_do_destroy(int, char **);
+
+static int zpool_do_add(int, char **);
+static int zpool_do_remove(int, char **);
+
+static int zpool_do_list(int, char **);
+static int zpool_do_iostat(int, char **);
+static int zpool_do_status(int, char **);
+
+static int zpool_do_online(int, char **);
+static int zpool_do_offline(int, char **);
+static int zpool_do_clear(int, char **);
+
+static int zpool_do_attach(int, char **);
+static int zpool_do_detach(int, char **);
+static int zpool_do_replace(int, char **);
+
+static int zpool_do_scrub(int, char **);
+
+static int zpool_do_import(int, char **);
+static int zpool_do_export(int, char **);
+
+static int zpool_do_upgrade(int, char **);
+
+static int zpool_do_history(int, char **);
+
+static int zpool_do_get(int, char **);
+static int zpool_do_set(int, char **);
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.
+ */
+const char *
+_umem_debug_init(void)
+{
+ return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+ return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+
+typedef enum {
+ HELP_ADD,
+ HELP_ATTACH,
+ HELP_CLEAR,
+ HELP_CREATE,
+ HELP_DESTROY,
+ HELP_DETACH,
+ HELP_EXPORT,
+ HELP_HISTORY,
+ HELP_IMPORT,
+ HELP_IOSTAT,
+ HELP_LIST,
+ HELP_OFFLINE,
+ HELP_ONLINE,
+ HELP_REPLACE,
+ HELP_REMOVE,
+ HELP_SCRUB,
+ HELP_STATUS,
+ HELP_UPGRADE,
+ HELP_GET,
+ HELP_SET
+} zpool_help_t;
+
+
+typedef struct zpool_command {
+ const char *name;
+ int (*func)(int, char **);
+ zpool_help_t usage;
+} zpool_command_t;
+
+/*
+ * Master command table. Each ZFS command has a name, associated function, and
+ * usage message. The usage messages need to be internationalized, so we have
+ * to have a function to return the usage message based on a command index.
+ *
+ * These commands are organized according to how they are displayed in the usage
+ * message. An empty command (one with a NULL name) indicates an empty line in
+ * the generic usage message.
+ */
+static zpool_command_t command_table[] = {
+ { "create", zpool_do_create, HELP_CREATE },
+ { "destroy", zpool_do_destroy, HELP_DESTROY },
+ { NULL },
+ { "add", zpool_do_add, HELP_ADD },
+ { "remove", zpool_do_remove, HELP_REMOVE },
+ { NULL },
+ { "list", zpool_do_list, HELP_LIST },
+ { "iostat", zpool_do_iostat, HELP_IOSTAT },
+ { "status", zpool_do_status, HELP_STATUS },
+ { NULL },
+ { "online", zpool_do_online, HELP_ONLINE },
+ { "offline", zpool_do_offline, HELP_OFFLINE },
+ { "clear", zpool_do_clear, HELP_CLEAR },
+ { NULL },
+ { "attach", zpool_do_attach, HELP_ATTACH },
+ { "detach", zpool_do_detach, HELP_DETACH },
+ { "replace", zpool_do_replace, HELP_REPLACE },
+ { NULL },
+ { "scrub", zpool_do_scrub, HELP_SCRUB },
+ { NULL },
+ { "import", zpool_do_import, HELP_IMPORT },
+ { "export", zpool_do_export, HELP_EXPORT },
+ { "upgrade", zpool_do_upgrade, HELP_UPGRADE },
+ { NULL },
+ { "history", zpool_do_history, HELP_HISTORY },
+ { "get", zpool_do_get, HELP_GET },
+ { "set", zpool_do_set, HELP_SET },
+};
+
+#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0]))
+
+zpool_command_t *current_command;
+
+static const char *
+get_usage(zpool_help_t idx) {
+ switch (idx) {
+ case HELP_ADD:
+ return (gettext("\tadd [-fn] <pool> <vdev> ...\n"));
+ case HELP_ATTACH:
+ return (gettext("\tattach [-f] <pool> <device> "
+ "<new_device>\n"));
+ case HELP_CLEAR:
+ return (gettext("\tclear <pool> [device]\n"));
+ case HELP_CREATE:
+ return (gettext("\tcreate [-fn] [-R root] [-m mountpoint] "
+ "<pool> <vdev> ...\n"));
+ case HELP_DESTROY:
+ return (gettext("\tdestroy [-f] <pool>\n"));
+ case HELP_DETACH:
+ return (gettext("\tdetach <pool> <device>\n"));
+ case HELP_EXPORT:
+ return (gettext("\texport [-f] <pool> ...\n"));
+ case HELP_HISTORY:
+ return (gettext("\thistory [<pool>]\n"));
+ case HELP_IMPORT:
+ return (gettext("\timport [-d dir] [-D]\n"
+ "\timport [-d dir] [-D] [-f] [-o opts] [-R root] -a\n"
+ "\timport [-d dir] [-D] [-f] [-o opts] [-R root ]"
+ " <pool | id> [newpool]\n"));
+ case HELP_IOSTAT:
+ return (gettext("\tiostat [-v] [pool] ... [interval "
+ "[count]]\n"));
+ case HELP_LIST:
+ return (gettext("\tlist [-H] [-o field[,field]*] "
+ "[pool] ...\n"));
+ case HELP_OFFLINE:
+ return (gettext("\toffline [-t] <pool> <device> ...\n"));
+ case HELP_ONLINE:
+ return (gettext("\tonline <pool> <device> ...\n"));
+ case HELP_REPLACE:
+ return (gettext("\treplace [-f] <pool> <device> "
+ "[new_device]\n"));
+ case HELP_REMOVE:
+ return (gettext("\tremove <pool> <device>\n"));
+ case HELP_SCRUB:
+ return (gettext("\tscrub [-s] <pool> ...\n"));
+ case HELP_STATUS:
+ return (gettext("\tstatus [-vx] [pool] ...\n"));
+ case HELP_UPGRADE:
+ return (gettext("\tupgrade\n"
+ "\tupgrade -v\n"
+ "\tupgrade <-a | pool>\n"));
+ case HELP_GET:
+ return (gettext("\tget <all | property[,property]...> "
+ "<pool> ...\n"));
+ case HELP_SET:
+ return (gettext("\tset <property=value> <pool> \n"));
+ }
+
+ abort();
+ /* NOTREACHED */
+}
+
+/*
+ * Fields available for 'zpool list'.
+ */
+typedef enum {
+ ZPOOL_FIELD_NAME,
+ ZPOOL_FIELD_SIZE,
+ ZPOOL_FIELD_USED,
+ ZPOOL_FIELD_AVAILABLE,
+ ZPOOL_FIELD_CAPACITY,
+ ZPOOL_FIELD_HEALTH,
+ ZPOOL_FIELD_ROOT
+} zpool_field_t;
+
+#define MAX_FIELDS 10
+
+typedef struct column_def {
+ const char *cd_title;
+ size_t cd_width;
+ enum {
+ left_justify,
+ right_justify
+ } cd_justify;
+} column_def_t;
+
+static column_def_t column_table[] = {
+ { "NAME", 20, left_justify },
+ { "SIZE", 6, right_justify },
+ { "USED", 6, right_justify },
+ { "AVAIL", 6, right_justify },
+ { "CAP", 5, right_justify },
+ { "HEALTH", 9, left_justify },
+ { "ALTROOT", 15, left_justify }
+};
+
+static char *column_subopts[] = {
+ "name",
+ "size",
+ "used",
+ "available",
+ "capacity",
+ "health",
+ "root",
+ NULL
+};
+
+/*
+ * Callback routine that will print out a pool property value.
+ */
+static zpool_prop_t
+print_prop_cb(zpool_prop_t prop, void *cb)
+{
+ FILE *fp = cb;
+
+ (void) fprintf(fp, "\t%-13s ", zpool_prop_to_name(prop));
+
+ if (zpool_prop_values(prop) == NULL)
+ (void) fprintf(fp, "-\n");
+ else
+ (void) fprintf(fp, "%s\n", zpool_prop_values(prop));
+
+ return (ZFS_PROP_CONT);
+}
+
+/*
+ * Display usage message. If we're inside a command, display only the usage for
+ * that command. Otherwise, iterate over the entire command table and display
+ * a complete usage message.
+ *
+ * 'requested' selects both the output stream and the exit status: B_TRUE
+ * prints to stdout and exits 0, B_FALSE prints to stderr and exits 2.
+ * This function does not return.
+ */
+void
+usage(boolean_t requested)
+{
+	int i;
+	FILE *fp = requested ? stdout : stderr;
+
+	if (current_command == NULL) {
+		(void) fprintf(fp, gettext("usage: zpool command args ...\n"));
+		(void) fprintf(fp,
+		    gettext("where 'command' is one of the following:\n\n"));
+
+		/* A NULL name in the table marks a blank separator line. */
+		for (i = 0; i < NCOMMAND; i++) {
+			if (command_table[i].name == NULL)
+				(void) fprintf(fp, "\n");
+			else
+				(void) fprintf(fp, "%s",
+				    get_usage(command_table[i].usage));
+		}
+	} else {
+		(void) fprintf(fp, gettext("usage:\n"));
+		(void) fprintf(fp, "%s", get_usage(current_command->usage));
+
+		/* 'list' additionally documents the accepted -o fields. */
+		if (strcmp(current_command->name, "list") == 0) {
+			(void) fprintf(fp, gettext("\nwhere 'field' is one "
+			    "of the following:\n\n"));
+
+			for (i = 0; column_subopts[i] != NULL; i++)
+				(void) fprintf(fp, "\t%s\n", column_subopts[i]);
+		}
+	}
+
+	/* 'set' and 'get' additionally list every pool property. */
+	if (current_command != NULL &&
+	    ((strcmp(current_command->name, "set") == 0) ||
+	    (strcmp(current_command->name, "get") == 0))) {
+
+		(void) fprintf(fp,
+		    gettext("\nthe following properties are supported:\n"));
+
+		(void) fprintf(fp, "\n\t%-13s  %s\n\n",
+		    "PROPERTY", "VALUES");
+
+		/* Iterate over all properties */
+		(void) zpool_prop_iter(print_prop_cb, fp, B_FALSE);
+	}
+
+	/*
+	 * See comments at end of main().
+	 */
+	if (getenv("ZFS_ABORT") != NULL) {
+		(void) printf("dumping core by request\n");
+		abort();
+	}
+
+	exit(requested ? 0 : 2);
+}
+
+const char *
+state_to_health(int vs_state)
+{
+ switch (vs_state) {
+ case VDEV_STATE_CLOSED:
+ case VDEV_STATE_CANT_OPEN:
+ case VDEV_STATE_OFFLINE:
+ return (dgettext(TEXT_DOMAIN, "FAULTED"));
+ case VDEV_STATE_DEGRADED:
+ return (dgettext(TEXT_DOMAIN, "DEGRADED"));
+ case VDEV_STATE_HEALTHY:
+ return (dgettext(TEXT_DOMAIN, "ONLINE"));
+ }
+
+ return (dgettext(TEXT_DOMAIN, "UNKNOWN"));
+}
+
+const char *
+state_to_name(vdev_stat_t *vs)
+{
+ switch (vs->vs_state) {
+ case VDEV_STATE_CLOSED:
+ case VDEV_STATE_CANT_OPEN:
+ if (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)
+ return (gettext("FAULTED"));
+ else
+ return (gettext("UNAVAIL"));
+ case VDEV_STATE_OFFLINE:
+ return (gettext("OFFLINE"));
+ case VDEV_STATE_DEGRADED:
+ return (gettext("DEGRADED"));
+ case VDEV_STATE_HEALTHY:
+ return (gettext("ONLINE"));
+ }
+
+ return (gettext("UNKNOWN"));
+}
+
+void
+print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ char *vname;
+
+ if (name != NULL)
+ (void) printf("\t%*s%s\n", indent, "", name);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ return;
+
+ for (c = 0; c < children; c++) {
+ vname = zpool_vdev_name(g_zfs, zhp, child[c]);
+ print_vdev_tree(zhp, vname, child[c], indent + 2);
+ free(vname);
+ }
+}
+
+/*
+ * zpool add [-fn] <pool> <vdev> ...
+ *
+ * -f Force addition of devices, even if they appear in use
+ * -n Do not add the devices, but display the resulting layout if
+ * they were to be added.
+ *
+ * Adds the given vdevs to 'pool'. As with create, the bulk of this work is
+ * handled by get_vdev_spec(), which constructs the nvlist needed to pass to
+ * libzfs.
+ */
+int
+zpool_do_add(int argc, char **argv)
+{
+ boolean_t force = B_FALSE;
+ boolean_t dryrun = B_FALSE;
+ int c;
+ nvlist_t *nvroot;
+ char *poolname;
+ int ret;
+ zpool_handle_t *zhp;
+ nvlist_t *config;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "fn")) != -1) {
+ switch (c) {
+ case 'f':
+ force = B_TRUE;
+ break;
+ case 'n':
+ dryrun = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* get pool name and check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name argument\n"));
+ usage(B_FALSE);
+ }
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing vdev specification\n"));
+ usage(B_FALSE);
+ }
+
+ poolname = argv[0];
+
+ argc--;
+ argv++;
+
+ if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+ return (1);
+
+ if ((config = zpool_get_config(zhp, NULL)) == NULL) {
+ (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"),
+ poolname);
+ zpool_close(zhp);
+ return (1);
+ }
+
+ /* pass off to get_vdev_spec for processing */
+ nvroot = make_root_vdev(config, force, !force, B_FALSE, argc, argv);
+ if (nvroot == NULL) {
+ zpool_close(zhp);
+ return (1);
+ }
+
+ if (dryrun) {
+ nvlist_t *poolnvroot;
+
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &poolnvroot) == 0);
+
+ (void) printf(gettext("would update '%s' to the following "
+ "configuration:\n"), zpool_get_name(zhp));
+
+ print_vdev_tree(zhp, poolname, poolnvroot, 0);
+ print_vdev_tree(zhp, NULL, nvroot, 0);
+
+ ret = 0;
+ } else {
+ ret = (zpool_add(zhp, nvroot) != 0);
+ if (!ret) {
+ zpool_log_history(g_zfs, argc + 1 + optind,
+ argv - 1 - optind, poolname, B_TRUE, B_FALSE);
+ }
+ }
+
+ nvlist_free(nvroot);
+ zpool_close(zhp);
+
+ return (ret);
+}
+
+/*
+ * zpool remove <pool> <vdev>
+ *
+ * Removes the given vdev from the pool.  Currently, this only supports removing
+ * spares from the pool.  Eventually, we'll want to support removing leaf vdevs
+ * (as an alias for 'detach') as well as toplevel vdevs.
+ *
+ * argv[0] is the subcommand name; the pool and device follow it.  Returns 0 on
+ * success and 1 if the pool cannot be opened or the removal fails.  A missing
+ * pool or device argument is reported through usage(), which exits.
+ */
+int
+zpool_do_remove(int argc, char **argv)
+{
+	char *poolname;
+	int ret;
+	zpool_handle_t *zhp;
+
+	/* This command takes no options; just step over the command name. */
+	argc--;
+	argv++;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing device\n"));
+		usage(B_FALSE);
+	}
+
+	poolname = argv[0];
+
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+		return (1);
+
+	ret = (zpool_vdev_remove(zhp, argv[1]) != 0);
+	if (!ret) {
+		/* Undo the adjustment above so the full command is logged. */
+		zpool_log_history(g_zfs, ++argc, --argv, poolname, B_TRUE,
+		    B_FALSE);
+	}
+
+	/* Release the handle on every path, as the other subcommands do. */
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool create [-fn] [-R root] [-m mountpoint] <pool> <dev> ...
+ *
+ *	-f	Force creation, even if devices appear in use
+ *	-n	Do not create the pool, but display the resulting layout if it
+ *		were to be created.
+ *	-R	Create a pool under an alternate root
+ *	-m	Set default mountpoint for the root dataset.  By default it's
+ *		'/<pool>'
+ *
+ * Creates the named pool according to the given vdev specification.  The
+ * bulk of the vdev processing is done by make_root_vdev().  Once we get the
+ * nvlist back from make_root_vdev(), we either print out the contents
+ * (if '-n' was specified), or pass it to libzfs to do the creation.
+ *
+ * Returns 0 on success (or after a dry run) and non-zero on failure.  The
+ * vdev nvlist is freed on every path once it has been built.
+ */
+int
+zpool_do_create(int argc, char **argv)
+{
+	boolean_t force = B_FALSE;
+	boolean_t dryrun = B_FALSE;
+	int c;
+	nvlist_t *nvroot;
+	char *poolname;
+	int ret;
+	char *altroot = NULL;
+	char *mountpoint = NULL;
+	nvlist_t **child;
+	uint_t children;
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":fnR:m:")) != -1) {
+		switch (c) {
+		case 'f':
+			force = B_TRUE;
+			break;
+		case 'n':
+			dryrun = B_TRUE;
+			break;
+		case 'R':
+			altroot = optarg;
+			break;
+		case 'm':
+			mountpoint = optarg;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing vdev specification\n"));
+		usage(B_FALSE);
+	}
+
+	poolname = argv[0];
+
+	/*
+	 * As a special case, check for use of '/' in the name, and direct the
+	 * user to use 'zfs create' instead.
+	 */
+	if (strchr(poolname, '/') != NULL) {
+		(void) fprintf(stderr, gettext("cannot create '%s': invalid "
+		    "character '/' in pool name\n"), poolname);
+		(void) fprintf(stderr, gettext("use 'zfs create' to "
+		    "create a dataset\n"));
+		return (1);
+	}
+
+	/* pass off to make_root_vdev for bulk processing */
+	nvroot = make_root_vdev(NULL, force, !force, B_FALSE, argc - 1,
+	    argv + 1);
+	if (nvroot == NULL)
+		return (1);
+
+	/* make_root_vdev() allows 0 toplevel children if there are spares */
+	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0);
+	if (children == 0) {
+		(void) fprintf(stderr, gettext("invalid vdev "
+		    "specification: at least one toplevel vdev must be "
+		    "specified\n"));
+		/* nvroot is ours from here on; release it like later exits. */
+		nvlist_free(nvroot);
+		return (1);
+	}
+
+
+	if (altroot != NULL && altroot[0] != '/') {
+		(void) fprintf(stderr, gettext("invalid alternate root '%s': "
+		    "must be an absolute path\n"), altroot);
+		nvlist_free(nvroot);
+		return (1);
+	}
+
+	/*
+	 * Check the validity of the mountpoint and direct the user to use the
+	 * '-m' mountpoint option if it looks like it's in use.
+	 */
+	if (mountpoint == NULL ||
+	    (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 &&
+	    strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) {
+		char buf[MAXPATHLEN];
+		struct stat64 statbuf;
+
+		if (mountpoint && mountpoint[0] != '/') {
+			(void) fprintf(stderr, gettext("invalid mountpoint "
+			    "'%s': must be an absolute path, 'legacy', or "
+			    "'none'\n"), mountpoint);
+			nvlist_free(nvroot);
+			return (1);
+		}
+
+		/* Build the path the root dataset would be mounted on. */
+		if (mountpoint == NULL) {
+			if (altroot != NULL)
+				(void) snprintf(buf, sizeof (buf), "%s/%s",
+				    altroot, poolname);
+			else
+				(void) snprintf(buf, sizeof (buf), "/%s",
+				    poolname);
+		} else {
+			if (altroot != NULL)
+				(void) snprintf(buf, sizeof (buf), "%s%s",
+				    altroot, mountpoint);
+			else
+				(void) snprintf(buf, sizeof (buf), "%s",
+				    mountpoint);
+		}
+
+		/*
+		 * A link count other than 2 is taken to mean the path is not
+		 * an empty directory.
+		 */
+		if (stat64(buf, &statbuf) == 0 &&
+		    statbuf.st_nlink != 2) {
+			if (mountpoint == NULL)
+				(void) fprintf(stderr, gettext("default "
+				    "mountpoint '%s' exists and is not "
+				    "empty\n"), buf);
+			else
+				(void) fprintf(stderr, gettext("mountpoint "
+				    "'%s' exists and is not empty\n"), buf);
+			(void) fprintf(stderr, gettext("use '-m' "
+			    "option to provide a different default\n"));
+			nvlist_free(nvroot);
+			return (1);
+		}
+	}
+
+
+	if (dryrun) {
+		/*
+		 * For a dry run invocation, print out a basic message and run
+		 * through all the vdevs in the list and print out in an
+		 * appropriate hierarchy.
+		 */
+		(void) printf(gettext("would create '%s' with the "
+		    "following layout:\n\n"), poolname);
+
+		print_vdev_tree(NULL, poolname, nvroot, 0);
+
+		ret = 0;
+	} else {
+		ret = 1;
+		/*
+		 * Hand off to libzfs.
+		 */
+		if (zpool_create(g_zfs, poolname, nvroot, altroot) == 0) {
+			zfs_handle_t *pool = zfs_open(g_zfs, poolname,
+			    ZFS_TYPE_FILESYSTEM);
+			if (pool != NULL) {
+				if (mountpoint != NULL)
+					verify(zfs_prop_set(pool,
+					    zfs_prop_to_name(
+					    ZFS_PROP_MOUNTPOINT),
+					    mountpoint) == 0);
+				if (zfs_mount(pool, NULL, 0) == 0)
+					ret = zfs_share_nfs(pool);
+				zfs_close(pool);
+			}
+			/* Undo the optind adjustment to log the full command. */
+			zpool_log_history(g_zfs, argc + optind, argv - optind,
+			    poolname, B_TRUE, B_TRUE);
+		} else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) {
+			(void) fprintf(stderr, gettext("pool name may have "
+			    "been omitted\n"));
+		}
+	}
+
+	nvlist_free(nvroot);
+
+	return (ret);
+}
+
+/*
+ * zpool destroy [-f] <pool>
+ *
+ * 	-f	Forcefully unmount any datasets
+ *
+ * Destroy the given pool.  Automatically unmounts any datasets in the pool.
+ *
+ * Returns 0 on success and 1 if the pool cannot be opened, its datasets cannot
+ * be unmounted, or the destroy itself fails.  Bad options or a wrong argument
+ * count are reported through usage(), which exits.
+ */
+int
+zpool_do_destroy(int argc, char **argv)
+{
+	boolean_t force = B_FALSE;
+	int c;
+	char *pool;
+	zpool_handle_t *zhp;
+	int ret;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "f")) != -1) {
+		switch (c) {
+		case 'f':
+			force = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	pool = argv[0];
+
+	if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) {
+		/*
+		 * As a special case, check for use of '/' in the name, and
+		 * direct the user to use 'zfs destroy' instead.
+		 */
+		if (strchr(pool, '/') != NULL)
+			(void) fprintf(stderr, gettext("use 'zfs destroy' to "
+			    "destroy a dataset\n"));
+		return (1);
+	}
+
+	if (zpool_disable_datasets(zhp, force) != 0) {
+		(void) fprintf(stderr, gettext("could not destroy '%s': "
+		    "could not unmount datasets\n"), zpool_get_name(zhp));
+		/* Do not leak the handle on this early exit. */
+		zpool_close(zhp);
+		return (1);
+	}
+
+	/*
+	 * The history record is written before the destroy is attempted;
+	 * optind is added back so the complete command line is logged.
+	 */
+	zpool_log_history(g_zfs, argc + optind, argv - optind, pool, B_TRUE,
+	    B_FALSE);
+
+	ret = (zpool_destroy(zhp) != 0);
+
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool export [-f] <pool> ...
+ *
+ * -f Forcefully unmount datasets
+ *
+ * Export the given pools. By default, the command will attempt to cleanly
+ * unmount any active datasets within the pool. If the '-f' flag is specified,
+ * then the datasets will be forcefully unmounted.
+ */
+int
+zpool_do_export(int argc, char **argv)
+{
+ boolean_t force = B_FALSE;
+ int c;
+ zpool_handle_t *zhp;
+ int ret;
+ int i;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "f")) != -1) {
+ switch (c) {
+ case 'f':
+ force = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool argument\n"));
+ usage(B_FALSE);
+ }
+
+ ret = 0;
+ for (i = 0; i < argc; i++) {
+ if ((zhp = zpool_open_canfail(g_zfs, argv[i])) == NULL) {
+ ret = 1;
+ continue;
+ }
+
+ if (zpool_disable_datasets(zhp, force) != 0) {
+ ret = 1;
+ zpool_close(zhp);
+ continue;
+ }
+
+ zpool_log_history(g_zfs, argc + optind, argv - optind, argv[i],
+ B_TRUE, B_FALSE);
+
+ if (zpool_export(zhp) != 0)
+ ret = 1;
+
+ zpool_close(zhp);
+ }
+
+ return (ret);
+}
+
+/*
+ * Given a vdev configuration, determine the maximum width needed for the device
+ * name column.
+ */
+static int
+max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max)
+{
+ char *name = zpool_vdev_name(g_zfs, zhp, nv);
+ nvlist_t **child;
+ uint_t c, children;
+ int ret;
+
+ if (strlen(name) + depth > max)
+ max = strlen(name) + depth;
+
+ free(name);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++)
+ if ((ret = max_width(zhp, child[c], depth + 2,
+ max)) > max)
+ max = ret;
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++)
+ if ((ret = max_width(zhp, child[c], depth + 2,
+ max)) > max)
+ max = ret;
+ }
+
+
+ return (max);
+}
+
+
+/*
+ * Print the configuration of an exported pool. Iterate over all vdevs in the
+ * pool, printing out the name and status for each one.
+ */
+void
+print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ vdev_stat_t *vs;
+ char *type, *vname;
+
+ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+ if (strcmp(type, VDEV_TYPE_MISSING) == 0)
+ return;
+
+ verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+ (uint64_t **)&vs, &c) == 0);
+
+ (void) printf("\t%*s%-*s", depth, "", namewidth - depth, name);
+
+ if (vs->vs_aux != 0) {
+ (void) printf(" %-8s ", state_to_name(vs));
+
+ switch (vs->vs_aux) {
+ case VDEV_AUX_OPEN_FAILED:
+ (void) printf(gettext("cannot open"));
+ break;
+
+ case VDEV_AUX_BAD_GUID_SUM:
+ (void) printf(gettext("missing device"));
+ break;
+
+ case VDEV_AUX_NO_REPLICAS:
+ (void) printf(gettext("insufficient replicas"));
+ break;
+
+ case VDEV_AUX_VERSION_NEWER:
+ (void) printf(gettext("newer version"));
+ break;
+
+ default:
+ (void) printf(gettext("corrupted data"));
+ break;
+ }
+ } else {
+ (void) printf(" %s", state_to_name(vs));
+ }
+ (void) printf("\n");
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ return;
+
+ for (c = 0; c < children; c++) {
+ vname = zpool_vdev_name(g_zfs, NULL, child[c]);
+ print_import_config(vname, child[c],
+ namewidth, depth + 2);
+ free(vname);
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+ &child, &children) != 0)
+ return;
+
+ (void) printf(gettext("\tspares\n"));
+ for (c = 0; c < children; c++) {
+ vname = zpool_vdev_name(g_zfs, NULL, child[c]);
+ (void) printf("\t %s\n", vname);
+ free(vname);
+ }
+}
+
+/*
+ * Display the import status of a single pool found by the import scan.
+ *
+ * 'config' is the pool's configuration nvlist; it must contain the pool name,
+ * GUID, state and vdev tree (each lookup is verify()'d).  The output consists
+ * of the pool name, numeric id and health, a "status:" line derived from
+ * zpool_import_status(), an "action:" line chosen from the root vdev state,
+ * an optional "see:" URL when a message ID is available, and the vdev
+ * configuration printed by print_import_config().
+ */
+static void
+show_import(nvlist_t *config)
+{
+	uint64_t pool_state;
+	vdev_stat_t *vs;
+	char *name;
+	uint64_t guid;
+	char *msgid;
+	nvlist_t *nvroot;
+	int reason;
+	const char *health;
+	uint_t vsc;
+	int namewidth;
+
+	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+	    &name) == 0);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    &guid) == 0);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+	    &pool_state) == 0);
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+
+	/* The root vdev's stats carry the overall state of the pool. */
+	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+	    (uint64_t **)&vs, &vsc) == 0);
+	health = state_to_health(vs->vs_state);
+
+	reason = zpool_import_status(config, &msgid);
+
+	(void) printf(gettext(" pool: %s\n"), name);
+	(void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid);
+	(void) printf(gettext(" state: %s"), health);
+	if (pool_state == POOL_STATE_DESTROYED)
+		(void) printf(gettext(" (DESTROYED)"));
+	(void) printf("\n");
+
+	switch (reason) {
+	case ZPOOL_STATUS_MISSING_DEV_R:
+	case ZPOOL_STATUS_MISSING_DEV_NR:
+	case ZPOOL_STATUS_BAD_GUID_SUM:
+		(void) printf(gettext("status: One or more devices are missing "
+		    "from the system.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_LABEL_R:
+	case ZPOOL_STATUS_CORRUPT_LABEL_NR:
+		(void) printf(gettext("status: One or more devices contains "
+		    "corrupted data.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_DATA:
+		(void) printf(gettext("status: The pool data is corrupted.\n"));
+		break;
+
+	case ZPOOL_STATUS_OFFLINE_DEV:
+		(void) printf(gettext("status: One or more devices "
+		    "are offlined.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_POOL:
+		(void) printf(gettext("status: The pool metadata is "
+		    "corrupted.\n"));
+		break;
+
+	case ZPOOL_STATUS_VERSION_OLDER:
+		(void) printf(gettext("status: The pool is formatted using an "
+		    "older on-disk version.\n"));
+		break;
+
+	case ZPOOL_STATUS_VERSION_NEWER:
+		(void) printf(gettext("status: The pool is formatted using an "
+		    "incompatible version.\n"));
+		break;
+
+	default:
+		/*
+		 * No other status can be seen when importing pools.
+		 */
+		assert(reason == ZPOOL_STATUS_OK);
+		break;
+	}
+
+	/*
+	 * Print out an action according to the overall state of the pool.
+	 */
+	if (vs->vs_state == VDEV_STATE_HEALTHY) {
+		if (reason == ZPOOL_STATUS_VERSION_OLDER)
+			(void) printf(gettext("action: The pool can be "
+			    "imported using its name or numeric identifier, "
+			    "though\n\tsome features will not be available "
+			    "without an explicit 'zpool upgrade'.\n"));
+		else
+			(void) printf(gettext("action: The pool can be "
+			    "imported using its name or numeric "
+			    "identifier.\n"));
+	} else if (vs->vs_state == VDEV_STATE_DEGRADED) {
+		(void) printf(gettext("action: The pool can be imported "
+		    "despite missing or damaged devices. The\n\tfault "
+		    "tolerance of the pool may be compromised if imported.\n"));
+	} else {
+		switch (reason) {
+		case ZPOOL_STATUS_VERSION_NEWER:
+			(void) printf(gettext("action: The pool cannot be "
+			    "imported. Access the pool on a system running "
+			    "newer\n\tsoftware, or recreate the pool from "
+			    "backup.\n"));
+			break;
+		case ZPOOL_STATUS_MISSING_DEV_R:
+		case ZPOOL_STATUS_MISSING_DEV_NR:
+		case ZPOOL_STATUS_BAD_GUID_SUM:
+			(void) printf(gettext("action: The pool cannot be "
+			    "imported. Attach the missing\n\tdevices and try "
+			    "again.\n"));
+			break;
+		default:
+			(void) printf(gettext("action: The pool cannot be "
+			    "imported due to damaged devices or data.\n"));
+			break;
+		}
+	}
+
+	/*
+	 * If the state is "closed" or "can't open", and the aux state
+	 * is "corrupt data", add a hint about the flags needed to import a
+	 * destroyed pool or one that was not exported.
+	 */
+	if (((vs->vs_state == VDEV_STATE_CLOSED) ||
+	    (vs->vs_state == VDEV_STATE_CANT_OPEN)) &&
+	    (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) {
+		if (pool_state == POOL_STATE_DESTROYED)
+			(void) printf(gettext("\tThe pool was destroyed, "
+			    "but can be imported using the '-Df' flags.\n"));
+		else if (pool_state != POOL_STATE_EXPORTED)
+			(void) printf(gettext("\tThe pool may be active "
+			    "on another system, but can be imported using\n\t"
+			    "the '-f' flag.\n"));
+	}
+
+	if (msgid != NULL)
+		(void) printf(gettext(" see: http://www.sun.com/msg/%s\n"),
+		    msgid);
+
+	(void) printf(gettext("config:\n\n"));
+
+	/* The name column is never narrower than 10 characters. */
+	namewidth = max_width(NULL, nvroot, 0, 0);
+	if (namewidth < 10)
+		namewidth = 10;
+	print_import_config(name, nvroot, namewidth, 0);
+
+	if (reason == ZPOOL_STATUS_BAD_GUID_SUM) {
+		(void) printf(gettext("\n\tAdditional devices are known to "
+		    "be part of this pool, though their\n\texact "
+		    "configuration cannot be determined.\n"));
+	}
+}
+
+/*
+ * Import a single pool described by 'config' and mount its datasets.
+ *
+ * The import is refused if the pool's on-disk version is newer than
+ * ZFS_VERSION, or if the pool was not cleanly exported and 'force' is not
+ * set.  Otherwise the real work is delegated to zpool_import(); on success
+ * the command line (argc/argv) is recorded in the pool history under the
+ * final pool name ('newname' when given) and the pool's datasets are enabled
+ * with 'mntopts'.  Returns 0 on success and 1 on any failure.
+ */
+static int
+do_import(nvlist_t *config, const char *newname, const char *mntopts,
+    const char *altroot, int force, int argc, char **argv)
+{
+	zpool_handle_t *pool;
+	char *pool_name;
+	uint64_t pool_state;
+	uint64_t pool_version;
+	int status;
+
+	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+	    &pool_name) == 0);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+	    &pool_state) == 0);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+	    &pool_version) == 0);
+
+	/* A pool written by newer software can never be imported. */
+	if (pool_version > ZFS_VERSION) {
+		(void) fprintf(stderr, gettext("cannot import '%s': pool "
+		    "is formatted using a newer ZFS version\n"), pool_name);
+		return (1);
+	}
+
+	/* A pool that was not exported needs an explicit override. */
+	if (!force && pool_state != POOL_STATE_EXPORTED) {
+		(void) fprintf(stderr, gettext("cannot import '%s': pool "
+		    "may be in use from other system\n"), pool_name);
+		(void) fprintf(stderr, gettext("use '-f' to import anyway\n"));
+		return (1);
+	}
+
+	if (zpool_import(g_zfs, config, newname, altroot) != 0)
+		return (1);
+
+	/* From here on the pool is known by its new name, if one was given. */
+	if (newname != NULL)
+		pool_name = (char *)newname;
+
+	zpool_log_history(g_zfs, argc, argv, pool_name, B_TRUE, B_FALSE);
+
+	pool = zpool_open(g_zfs, pool_name);
+	verify(pool != NULL);
+
+	status = (zpool_enable_datasets(pool, mntopts, 0) != 0) ? 1 : 0;
+	zpool_close(pool);
+
+	return (status);
+}
+
+/*
+ * zpool import [-d dir] [-D]
+ * import [-o opts] [-R root] [-D] [-d dir] [-f] -a
+ * import [-o opts] [-R root] [-D] [-d dir] [-f] <pool | id> [newpool]
+ *
+ * -d Scan in a specific directory, other than /dev. More than
+ * one directory can be specified using multiple '-d' options.
+ *
+ * -D Scan for previously destroyed pools or import all or only
+ * specified destroyed pools.
+ *
+ * -o Mount options; handed to do_import() as 'mntopts'.
+ *
+ * -R Temporarily import the pool, with all mountpoints relative to
+ * the given root. The pool will remain exported when the machine
+ * is rebooted.
+ *
+ * -f Force import, even if it appears that the pool is active.
+ *
+ * -a Import all pools found.
+ *
+ * The import command scans for pools to import, and imports pools based on
+ * pool name and GUID. The pool can also be renamed as part of the import
+ * process.
+ *
+ * Returns 0 on success and 1 if the scan or any import failed.
+ */
+int
+zpool_do_import(int argc, char **argv)
+{
+ char **searchdirs = NULL;
+ int nsearch = 0;
+ int c;
+ int err;
+ nvlist_t *pools;
+ boolean_t do_all = B_FALSE;
+ boolean_t do_destroyed = B_FALSE;
+ char *altroot = NULL;
+ char *mntopts = NULL;
+ boolean_t do_force = B_FALSE;
+ nvpair_t *elem;
+ nvlist_t *config;
+ /* searchguid, searchname and found_config are only set when argc != 0 */
+ uint64_t searchguid;
+ char *searchname;
+ nvlist_t *found_config;
+ boolean_t first;
+ uint64_t pool_state;
+
+ /* check options */
+ while ((c = getopt(argc, argv, ":Dfd:R:ao:")) != -1) {
+ switch (c) {
+ case 'a':
+ do_all = B_TRUE;
+ break;
+ case 'd':
+ /* grow the search directory array by one slot per '-d' */
+ if (searchdirs == NULL) {
+ searchdirs = safe_malloc(sizeof (char *));
+ } else {
+ char **tmp = safe_malloc((nsearch + 1) *
+ sizeof (char *));
+ bcopy(searchdirs, tmp, nsearch *
+ sizeof (char *));
+ free(searchdirs);
+ searchdirs = tmp;
+ }
+ searchdirs[nsearch++] = optarg;
+ break;
+ case 'D':
+ do_destroyed = B_TRUE;
+ break;
+ case 'f':
+ do_force = B_TRUE;
+ break;
+ case 'o':
+ mntopts = optarg;
+ break;
+ case 'R':
+ altroot = optarg;
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* no '-d' given: fall back to scanning "/dev" */
+ if (searchdirs == NULL) {
+ searchdirs = safe_malloc(sizeof (char *));
+ searchdirs[0] = "/dev";
+ nsearch = 1;
+ }
+
+ /* check argument count */
+ if (do_all) {
+ if (argc != 0) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+ } else {
+ if (argc > 2) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ /*
+ * Check for the SYS_CONFIG privilege. We do this explicitly
+ * here because otherwise any attempt to discover pools will
+ * silently fail.
+ */
+ if (argc == 0 && !priv_ineffect(PRIV_SYS_CONFIG)) {
+ (void) fprintf(stderr, gettext("cannot "
+ "discover pools: permission denied\n"));
+ free(searchdirs);
+ return (1);
+ }
+ }
+
+ if ((pools = zpool_find_import(g_zfs, nsearch, searchdirs)) == NULL) {
+ free(searchdirs);
+ return (1);
+ }
+
+ /*
+ * We now have a list of all available pools in the given directories.
+ * Depending on the arguments given, we do one of the following:
+ *
+ * <none> Iterate through all pools and display information about
+ * each one.
+ *
+ * -a Iterate through all pools and try to import each one.
+ *
+ * <id> Find the pool that corresponds to the given GUID/pool
+ * name and import that one.
+ *
+ * -D Above options applies only to destroyed pools.
+ */
+ if (argc != 0) {
+ char *endptr;
+
+ /*
+ * An argument that parses completely as a decimal number is
+ * treated as a GUID (searchname stays NULL); anything else is
+ * treated as a pool name.
+ */
+ errno = 0;
+ searchguid = strtoull(argv[0], &endptr, 10);
+ if (errno != 0 || *endptr != '\0')
+ searchname = argv[0];
+ else
+ searchname = NULL;
+ found_config = NULL;
+ }
+
+ err = 0;
+ elem = NULL;
+ first = B_TRUE;
+ while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
+
+ verify(nvpair_value_nvlist(elem, &config) == 0);
+
+ /* '-D' selects only destroyed pools; without it they are skipped */
+ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ &pool_state) == 0);
+ if (!do_destroyed && pool_state == POOL_STATE_DESTROYED)
+ continue;
+ if (do_destroyed && pool_state != POOL_STATE_DESTROYED)
+ continue;
+
+ if (argc == 0) {
+ /* blank line between pools when only listing them */
+ if (first)
+ first = B_FALSE;
+ else if (!do_all)
+ (void) printf("\n");
+
+ /*
+ * argc/argv were advanced past the options above;
+ * undo that so do_import() receives the full command
+ * line for the history log.
+ */
+ if (do_all)
+ err |= do_import(config, NULL, mntopts,
+ altroot, do_force, argc + optind,
+ argv - optind);
+ else
+ show_import(config);
+ } else if (searchname != NULL) {
+ char *name;
+
+ /*
+ * We are searching for a pool based on name.
+ */
+ verify(nvlist_lookup_string(config,
+ ZPOOL_CONFIG_POOL_NAME, &name) == 0);
+
+ if (strcmp(name, searchname) == 0) {
+ /* a second match makes the name ambiguous */
+ if (found_config != NULL) {
+ (void) fprintf(stderr, gettext(
+ "cannot import '%s': more than "
+ "one matching pool\n"), searchname);
+ (void) fprintf(stderr, gettext(
+ "import by numeric ID instead\n"));
+ err = B_TRUE;
+ }
+ found_config = config;
+ }
+ } else {
+ uint64_t guid;
+
+ /*
+ * Search for a pool by guid.
+ */
+ verify(nvlist_lookup_uint64(config,
+ ZPOOL_CONFIG_POOL_GUID, &guid) == 0);
+
+ if (guid == searchguid)
+ found_config = config;
+ }
+ }
+
+ /*
+ * If we were searching for a specific pool, verify that we found a
+ * pool, and then do the import.
+ */
+ if (argc != 0 && err == 0) {
+ if (found_config == NULL) {
+ (void) fprintf(stderr, gettext("cannot import '%s': "
+ "no such pool available\n"), argv[0]);
+ err = B_TRUE;
+ } else {
+ /* argv[1], when present, is the new pool name */
+ err |= do_import(found_config, argc == 1 ? NULL :
+ argv[1], mntopts, altroot, do_force, argc + optind,
+ argv - optind);
+ }
+ }
+
+ /*
+ * If we were just looking for pools, report an error if none were
+ * found.
+ */
+ if (argc == 0 && first)
+ (void) fprintf(stderr,
+ gettext("no pools available to import\n"));
+
+ nvlist_free(pools);
+ free(searchdirs);
+
+ return (err ? 1 : 0);
+}
+
+/*
+ * State shared by the 'zpool iostat' pool iterator callbacks
+ * (refresh_iostat, get_namewidth and print_iostat).
+ */
+typedef struct iostat_cbdata {
+ /* pool list; refresh_iostat() removes pools reported missing from it */
+ zpool_list_t *cb_list;
+ /* non-zero when '-v' was given: per-vdev statistics are printed */
+ int cb_verbose;
+ /* pass counter, incremented by zpool_do_iostat() before each printout */
+ int cb_iteration;
+ /* width of the name column, computed by get_namewidth() */
+ int cb_namewidth;
+} iostat_cbdata_t;
+
+/*
+ * Print the dashed rule used in iostat output: one '-' per character of the
+ * name column followed by the fixed rule for the statistics columns.
+ */
+static void
+print_iostat_separator(iostat_cbdata_t *cb)
+{
+	int remaining = cb->cb_namewidth;
+
+	while (remaining-- > 0)
+		(void) printf("-");
+
+	(void) printf(" ----- ----- ----- ----- ----- -----\n");
+}
+
+/*
+ * Print the two-line iostat column header, padded to the current name column
+ * width, followed by a separator rule.
+ */
+static void
+print_iostat_header(iostat_cbdata_t *cb)
+{
+	int width = cb->cb_namewidth;
+
+	/* group titles, then the individual column titles */
+	(void) printf("%*s capacity operations bandwidth\n", width, "");
+	(void) printf("%-*s used avail read write read write\n", width,
+	    "pool");
+
+	print_iostat_separator(cb);
+}
+
+/*
+ * Print one statistic column: 'value' is rendered by zfs_nicenum() and
+ * right-justified in a field of five characters.
+ */
+void
+print_one_stat(uint64_t value)
+{
+	char text[64];
+
+	zfs_nicenum(value, text, sizeof (text));
+
+	(void) printf(" %5s", text);
+}
+
+/*
+ * Print one line of statistics for the vdev described by 'newnv', labelled
+ * 'name' and indented by 'depth' columns.  Operation and bandwidth figures
+ * are the difference between the stats in 'newnv' and those in 'oldnv',
+ * scaled by the elapsed time between the two timestamps; a NULL 'oldnv'
+ * stands for all-zero previous stats.  In verbose mode the function then
+ * recurses into the children of 'newnv' with the depth increased by two.
+ * 'name' must not be NULL.
+ */
+void
+print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
+    nvlist_t *newnv, iostat_cbdata_t *cb, int depth)
+{
+	nvlist_t **prev_kids, **cur_kids;
+	uint_t n, nkids;
+	vdev_stat_t *prev, *cur;
+	vdev_stat_t blank = { 0 };
+	uint64_t elapsed;
+	double rate;
+	char *kid_name;
+
+	/* Without an old config, measure against zeroed statistics. */
+	prev = &blank;
+	if (oldnv != NULL) {
+		verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_STATS,
+		    (uint64_t **)&prev, &n) == 0);
+	}
+
+	verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_STATS,
+	    (uint64_t **)&cur, &n) == 0);
+
+	/* Name column: pad to the column width unless the name overflows. */
+	if (strlen(name) + depth > cb->cb_namewidth) {
+		(void) printf("%*s%s", depth, "", name);
+	} else {
+		(void) printf("%*s%s%*s", depth, "", name,
+		    (int)(cb->cb_namewidth - strlen(name) - depth), "");
+	}
+
+	elapsed = cur->vs_timestamp - prev->vs_timestamp;
+	rate = (elapsed == 0) ? 1.0 : (double)NANOSEC / elapsed;
+
+	/* only toplevel vdevs have capacity stats */
+	if (cur->vs_space != 0) {
+		print_one_stat(cur->vs_alloc);
+		print_one_stat(cur->vs_space - cur->vs_alloc);
+	} else {
+		(void) printf(" - -");
+	}
+
+	/* operations: read, write */
+	print_one_stat((uint64_t)(rate * (cur->vs_ops[ZIO_TYPE_READ] -
+	    prev->vs_ops[ZIO_TYPE_READ])));
+	print_one_stat((uint64_t)(rate * (cur->vs_ops[ZIO_TYPE_WRITE] -
+	    prev->vs_ops[ZIO_TYPE_WRITE])));
+
+	/* bandwidth: read, write */
+	print_one_stat((uint64_t)(rate * (cur->vs_bytes[ZIO_TYPE_READ] -
+	    prev->vs_bytes[ZIO_TYPE_READ])));
+	print_one_stat((uint64_t)(rate * (cur->vs_bytes[ZIO_TYPE_WRITE] -
+	    prev->vs_bytes[ZIO_TYPE_WRITE])));
+
+	(void) printf("\n");
+
+	/* Children are only shown in verbose mode, and only if there are any. */
+	if (!cb->cb_verbose ||
+	    nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN,
+	    &cur_kids, &nkids) != 0)
+		return;
+
+	if (oldnv != NULL &&
+	    nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN,
+	    &prev_kids, &n) != 0)
+		return;
+
+	for (n = 0; n < nkids; n++) {
+		kid_name = zpool_vdev_name(g_zfs, zhp, cur_kids[n]);
+		print_vdev_stats(zhp, kid_name,
+		    (oldnv != NULL) ? prev_kids[n] : NULL,
+		    cur_kids[n], cb, depth + 2);
+		free(kid_name);
+	}
+}
+
+/*
+ * Pool iterator callback: refresh the statistics of 'zhp'.  A pool that
+ * zpool_refresh_stats() reports as missing is dropped from the iostat pool
+ * list.  Returns 0 on success and -1 if the refresh itself failed.
+ */
+static int
+refresh_iostat(zpool_handle_t *zhp, void *data)
+{
+	iostat_cbdata_t *state = data;
+	boolean_t gone;
+
+	if (zpool_refresh_stats(zhp, &gone) == 0) {
+		/* the pool has disappeared: stop tracking it and carry on */
+		if (gone)
+			pool_list_remove(state->cb_list, zhp);
+		return (0);
+	}
+
+	return (-1);
+}
+
+/*
+ * Pool iterator callback: print the iostat line(s) for 'zhp'.  On the first
+ * iteration the previous configuration is ignored, so the figures are
+ * computed against zero rather than against an earlier sample.  Always
+ * returns 0.
+ */
+int
+print_iostat(zpool_handle_t *zhp, void *data)
+{
+	iostat_cbdata_t *state = data;
+	nvlist_t *prev_cfg, *cur_cfg;
+	nvlist_t *prev_root = NULL;
+	nvlist_t *cur_root;
+
+	cur_cfg = zpool_get_config(zhp, &prev_cfg);
+
+	/* there is no meaningful earlier sample on the very first pass */
+	if (state->cb_iteration == 1)
+		prev_cfg = NULL;
+
+	verify(nvlist_lookup_nvlist(cur_cfg, ZPOOL_CONFIG_VDEV_TREE,
+	    &cur_root) == 0);
+
+	if (prev_cfg != NULL) {
+		verify(nvlist_lookup_nvlist(prev_cfg, ZPOOL_CONFIG_VDEV_TREE,
+		    &prev_root) == 0);
+	}
+
+	/* statistics for the pool itself (and its vdevs when verbose) */
+	print_vdev_stats(zhp, zpool_get_name(zhp), prev_root, cur_root,
+	    state, 0);
+
+	if (state->cb_verbose)
+		print_iostat_separator(state);
+
+	return (0);
+}
+
+/*
+ * Pool iterator callback: widen cb->cb_namewidth, if necessary, so that the
+ * name column can hold 'zhp'.  In verbose mode the width comes from
+ * max_width() over the pool's vdev tree; otherwise it is the length of the
+ * pool name.
+ *
+ * The caller resets cb_namewidth to 0 and then invokes this callback once per
+ * pool, expecting the maximum over all pools.  The width is therefore only
+ * ever increased here; assigning it unconditionally would leave the column
+ * sized for whichever pool happened to be visited last.  Always returns 0.
+ */
+int
+get_namewidth(zpool_handle_t *zhp, void *data)
+{
+	iostat_cbdata_t *cb = data;
+	nvlist_t *config, *nvroot;
+	int width;
+
+	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
+		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+		    &nvroot) == 0);
+		if (!cb->cb_verbose)
+			width = (int)strlen(zpool_get_name(zhp));
+		else
+			width = max_width(zhp, nvroot, 0, 0);
+
+		/* keep the widest value seen so far */
+		if (width > cb->cb_namewidth)
+			cb->cb_namewidth = width;
+	}
+
+	/*
+	 * The width must fall into the range [10,38]. The upper limit is the
+	 * maximum we can have and still fit in 80 columns.
+	 */
+	if (cb->cb_namewidth < 10)
+		cb->cb_namewidth = 10;
+	if (cb->cb_namewidth > 38)
+		cb->cb_namewidth = 38;
+
+	return (0);
+}
+
+/*
+ * zpool iostat [-v] [pool] ... [interval [count]]
+ *
+ * -v Display statistics for individual vdevs
+ *
+ * This command can be tricky because we want to be able to deal with pool
+ * creation/destruction as well as vdev configuration changes. The bulk of this
+ * processing is handled by the pool_list_* routines in zpool_iter.c. We rely
+ * on pool_list_update() to detect the addition of new pools. Configuration
+ * changes are all handled within libzfs.
+ *
+ * Trailing arguments that start with a digit are parsed as the interval and
+ * the count.  Without an interval the statistics are printed once; with an
+ * interval but no count the loop runs until no pools remain.  Returns 1 if
+ * the pool list could not be built or is empty, otherwise the status that
+ * pool_list_get() stored in 'ret'.
+ */
+int
+zpool_do_iostat(int argc, char **argv)
+{
+	int c;
+	int ret;
+	int npools;
+	unsigned long interval = 0, count = 0;
+	zpool_list_t *list;
+	boolean_t verbose = B_FALSE;
+	iostat_cbdata_t cb;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "v")) != -1) {
+		switch (c) {
+		case 'v':
+			verbose = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/*
+	 * Determine if the last argument is an integer or a pool name.  The
+	 * character is converted to unsigned char first because isdigit() is
+	 * only defined for EOF and values representable as unsigned char.
+	 */
+	if (argc > 0 && isdigit((unsigned char)argv[argc - 1][0])) {
+		char *end;
+
+		errno = 0;
+		interval = strtoul(argv[argc - 1], &end, 10);
+
+		if (*end == '\0' && errno == 0) {
+			if (interval == 0) {
+				(void) fprintf(stderr, gettext("interval "
+				    "cannot be zero\n"));
+				usage(B_FALSE);
+			}
+
+			/*
+			 * Ignore the last parameter
+			 */
+			argc--;
+		} else {
+			/*
+			 * If this is not a valid number, just plow on. The
+			 * user will get a more informative error message later
+			 * on.
+			 */
+			interval = 0;
+		}
+	}
+
+	/*
+	 * If the last remaining argument is also an integer, then we have
+	 * both an interval and a count: the number parsed above was really
+	 * the count, and this one is the interval.
+	 */
+	if (argc > 0 && isdigit((unsigned char)argv[argc - 1][0])) {
+		char *end;
+
+		errno = 0;
+		count = interval;
+		interval = strtoul(argv[argc - 1], &end, 10);
+
+		if (*end == '\0' && errno == 0) {
+			if (interval == 0) {
+				(void) fprintf(stderr, gettext("interval "
+				    "cannot be zero\n"));
+				usage(B_FALSE);
+			}
+
+			/*
+			 * Ignore the last parameter
+			 */
+			argc--;
+		} else {
+			interval = 0;
+		}
+	}
+
+	/*
+	 * Construct the list of all interesting pools.
+	 */
+	ret = 0;
+	if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL)
+		return (1);
+
+	if (pool_list_count(list) == 0 && argc != 0) {
+		pool_list_free(list);
+		return (1);
+	}
+
+	if (pool_list_count(list) == 0 && interval == 0) {
+		pool_list_free(list);
+		(void) fprintf(stderr, gettext("no pools available\n"));
+		return (1);
+	}
+
+	/*
+	 * Enter the main iostat loop.
+	 */
+	cb.cb_list = list;
+	cb.cb_verbose = verbose;
+	cb.cb_iteration = 0;
+	cb.cb_namewidth = 0;
+
+	for (;;) {
+		pool_list_update(list);
+
+		if ((npools = pool_list_count(list)) == 0)
+			break;
+
+		/*
+		 * Refresh all statistics. This is done as an explicit step
+		 * before calculating the maximum name width, so that any
+		 * configuration changes are properly accounted for.
+		 */
+		(void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb);
+
+		/*
+		 * Iterate over all pools to determine the maximum width
+		 * for the pool / device name column across all pools.
+		 */
+		cb.cb_namewidth = 0;
+		(void) pool_list_iter(list, B_FALSE, get_namewidth, &cb);
+
+		/*
+		 * If it's the first time, or verbose mode, print the header.
+		 */
+		if (++cb.cb_iteration == 1 || verbose)
+			print_iostat_header(&cb);
+
+		(void) pool_list_iter(list, B_FALSE, print_iostat, &cb);
+
+		/*
+		 * If there's more than one pool, and we're not in verbose mode
+		 * (which prints a separator for us), then print a separator.
+		 */
+		if (npools > 1 && !verbose)
+			print_iostat_separator(&cb);
+
+		if (verbose)
+			(void) printf("\n");
+
+		/*
+		 * Flush the output so that redirection to a file isn't buffered
+		 * indefinitely.
+		 */
+		(void) fflush(stdout);
+
+		if (interval == 0)
+			break;
+
+		if (count != 0 && --count == 0)
+			break;
+
+		(void) sleep(interval);
+	}
+
+	pool_list_free(list);
+
+	return (ret);
+}
+
+/*
+ * State passed to list_callback() by 'zpool list'.
+ */
+typedef struct list_cbdata {
+ /* set by '-H': no header, fields separated by a single tab */
+ boolean_t cb_scripted;
+ /* true until list_callback() has been called for the first pool */
+ boolean_t cb_first;
+ /* selected columns, as indices into column_table */
+ int cb_fields[MAX_FIELDS];
+ /* number of valid entries in cb_fields */
+ int cb_fieldcount;
+} list_cbdata_t;
+
+/*
+ * Given a list of columns to display, output appropriate headers for each one.
+ *
+ * 'fields' holds 'count' indices into column_table.  Each title is padded to
+ * its column width, except the last one, which is printed at its natural
+ * length so the line carries no trailing padding.
+ */
+void
+print_header(int *fields, size_t count)
+{
+	size_t i;
+	int width;
+	column_def_t *col;
+	const char *fmt;
+
+	for (i = 0; i < count; i++) {
+		col = &column_table[fields[i]];
+		if (i != 0)
+			(void) printf(" ");
+		if (col->cd_justify == left_justify)
+			fmt = "%-*s";
+		else
+			fmt = "%*s";
+
+		/* the '*' field width of printf() must be passed as an int */
+		if (i == count - 1)
+			width = (int)strlen(col->cd_title);
+		else
+			width = (int)col->cd_width;
+
+		(void) printf(fmt, width, col->cd_title);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Pool iterator callback for 'zpool list': print one row for 'zhp'.
+ *
+ * 'data' is the list_cbdata_t describing the selected columns.  The header is
+ * printed before the first row unless scripted mode is active.  For a pool in
+ * state POOL_STATE_UNAVAIL no configuration is read: the space and root
+ * columns show "-" and the health column shows "FAULTED".  In scripted mode
+ * fields are separated by a tab and not padded; otherwise every field except
+ * the last is padded to its column width.  Always returns 0.
+ */
+int
+list_callback(zpool_handle_t *zhp, void *data)
+{
+	list_cbdata_t *cbp = data;
+	nvlist_t *config;
+	int i;
+	int width;
+	char buf[ZPOOL_MAXNAMELEN];
+	uint64_t total = 0;
+	uint64_t used = 0;
+	const char *fmt;
+	column_def_t *col;
+
+	if (cbp->cb_first) {
+		if (!cbp->cb_scripted)
+			print_header(cbp->cb_fields, cbp->cb_fieldcount);
+		cbp->cb_first = B_FALSE;
+	}
+
+	/* config == NULL marks an unavailable pool for the cases below */
+	if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
+		config = NULL;
+	} else {
+		config = zpool_get_config(zhp, NULL);
+		total = zpool_get_space_total(zhp);
+		used = zpool_get_space_used(zhp);
+	}
+
+	for (i = 0; i < cbp->cb_fieldcount; i++) {
+		if (i != 0) {
+			if (cbp->cb_scripted)
+				(void) printf("\t");
+			else
+				(void) printf(" ");
+		}
+
+		col = &column_table[cbp->cb_fields[i]];
+
+		switch (cbp->cb_fields[i]) {
+		case ZPOOL_FIELD_NAME:
+			(void) strlcpy(buf, zpool_get_name(zhp), sizeof (buf));
+			break;
+
+		case ZPOOL_FIELD_SIZE:
+			if (config == NULL)
+				(void) strlcpy(buf, "-", sizeof (buf));
+			else
+				zfs_nicenum(total, buf, sizeof (buf));
+			break;
+
+		case ZPOOL_FIELD_USED:
+			if (config == NULL)
+				(void) strlcpy(buf, "-", sizeof (buf));
+			else
+				zfs_nicenum(used, buf, sizeof (buf));
+			break;
+
+		case ZPOOL_FIELD_AVAILABLE:
+			if (config == NULL)
+				(void) strlcpy(buf, "-", sizeof (buf));
+			else
+				zfs_nicenum(total - used, buf, sizeof (buf));
+			break;
+
+		case ZPOOL_FIELD_CAPACITY:
+			if (config == NULL) {
+				(void) strlcpy(buf, "-", sizeof (buf));
+			} else {
+				/* avoid dividing by zero for an empty pool */
+				uint64_t capacity = (total == 0 ? 0 :
+				    (used * 100 / total));
+				(void) snprintf(buf, sizeof (buf), "%llu%%",
+				    (u_longlong_t)capacity);
+			}
+			break;
+
+		case ZPOOL_FIELD_HEALTH:
+			if (config == NULL) {
+				(void) strlcpy(buf, "FAULTED", sizeof (buf));
+			} else {
+				nvlist_t *nvroot;
+				vdev_stat_t *vs;
+				uint_t vsc;
+
+				verify(nvlist_lookup_nvlist(config,
+				    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+				verify(nvlist_lookup_uint64_array(nvroot,
+				    ZPOOL_CONFIG_STATS, (uint64_t **)&vs,
+				    &vsc) == 0);
+				(void) strlcpy(buf, state_to_name(vs),
+				    sizeof (buf));
+			}
+			break;
+
+		case ZPOOL_FIELD_ROOT:
+			if (config == NULL)
+				(void) strlcpy(buf, "-", sizeof (buf));
+			else if (zpool_get_root(zhp, buf, sizeof (buf)) != 0)
+				(void) strlcpy(buf, "-", sizeof (buf));
+			break;
+
+		default:
+			/* never print an uninitialised or stale buffer */
+			(void) strlcpy(buf, "-", sizeof (buf));
+			break;
+		}
+
+		if (cbp->cb_scripted)
+			(void) printf("%s", buf);
+		else {
+			if (col->cd_justify == left_justify)
+				fmt = "%-*s";
+			else
+				fmt = "%*s";
+
+			/* the '*' field width of printf() must be an int */
+			if (i == cbp->cb_fieldcount - 1)
+				width = (int)strlen(buf);
+			else
+				width = (int)col->cd_width;
+
+			(void) printf(fmt, width, buf);
+		}
+	}
+
+	(void) printf("\n");
+
+	return (0);
+}
+
+/*
+ * zpool list [-H] [-o field[,field]*] [pool] ...
+ *
+ * -H Scripted mode. Don't display headers, and separate fields by
+ * a single tab.
+ * -o List of fields to display. Defaults to all fields, or
+ * "name,size,used,available,capacity,health,root"
+ *
+ * Print one row per pool via list_callback(), healthy or not, showing the
+ * selected space and health columns.  When no pool names were given and no
+ * pool was listed, "no pools available" is printed and 0 is returned;
+ * otherwise the result of for_each_pool() is returned.
+ */
+int
+zpool_do_list(int argc, char **argv)
+{
+	static char default_fields[] =
+	    "name,size,used,available,capacity,health,root";
+	list_cbdata_t cb = { 0 };
+	char *remaining = default_fields;
+	char *bad;
+	int opt;
+	int status;
+
+	/* check options */
+	while ((opt = getopt(argc, argv, ":Ho:")) != -1) {
+		switch (opt) {
+		case 'H':
+			cb.cb_scripted = B_TRUE;
+			break;
+		case 'o':
+			remaining = optarg;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* translate the comma separated field list into column indices */
+	for (; *remaining != '\0'; cb.cb_fieldcount++) {
+		if (cb.cb_fieldcount == MAX_FIELDS) {
+			(void) fprintf(stderr, gettext("too many "
+			    "properties given to -o option\n"));
+			usage(B_FALSE);
+		}
+
+		cb.cb_fields[cb.cb_fieldcount] = getsubopt(&remaining,
+		    column_subopts, &bad);
+		if (cb.cb_fields[cb.cb_fieldcount] == -1) {
+			(void) fprintf(stderr, gettext("invalid property "
+			    "'%s'\n"), bad);
+			usage(B_FALSE);
+		}
+	}
+
+	cb.cb_first = B_TRUE;
+
+	status = for_each_pool(argc, argv, B_TRUE, NULL, list_callback, &cb);
+
+	/* pools were named explicitly, or at least one row was printed */
+	if (argc != 0 || !cb.cb_first)
+		return (status);
+
+	(void) printf(gettext("no pools available\n"));
+	return (0);
+}
+
+/*
+ * Depth-first search of the vdev tree rooted at 'nv' for a leaf whose
+ * ZPOOL_CONFIG_PATH equals 'name'.  A leading _PATH_DEV prefix is ignored on
+ * both sides of the comparison.  Returns the matching leaf nvlist, or NULL
+ * if there is none.
+ */
+static nvlist_t *
+zpool_get_vdev_by_name(nvlist_t *nv, char *name)
+{
+	const size_t devlen = sizeof (_PATH_DEV) - 1;
+	nvlist_t **kids;
+	uint_t i, nkids;
+	nvlist_t *hit;
+	char *path;
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) == 0) {
+		/* interior vdev: the first child subtree that matches wins */
+		for (i = 0; i < nkids; i++) {
+			hit = zpool_get_vdev_by_name(kids[i], name);
+			if (hit != NULL)
+				return (hit);
+		}
+		return (NULL);
+	}
+
+	/* leaf vdev: compare the device paths without the _PATH_DEV prefix */
+	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
+	if (strncmp(name, _PATH_DEV, devlen) == 0)
+		name += devlen;
+	if (strncmp(path, _PATH_DEV, devlen) == 0)
+		path += devlen;
+
+	return (strcmp(name, path) == 0 ? nv : NULL);
+}
+
+/*
+ * Common implementation of 'zpool attach' and 'zpool replace'.
+ *
+ * Expects [-f] <pool> <device> [new_device] in argc/argv. When 'replacing'
+ * is non-zero the new device may be omitted, in which case <device> is used
+ * as its own replacement; otherwise a missing new device is a usage error.
+ * Returns 1 if the pool cannot be opened, has no configuration, or the new
+ * vdev specification cannot be built; otherwise returns the result of
+ * zpool_vdev_attach(). The command is logged to the pool history only when
+ * the attach succeeds.
+ */
+static int
+zpool_do_attach_or_replace(int argc, char **argv, int replacing)
+{
+ boolean_t force = B_FALSE;
+ int c;
+ nvlist_t *nvroot;
+ char *poolname, *old_disk, *new_disk;
+ zpool_handle_t *zhp;
+ nvlist_t *config;
+ int ret;
+ int log_argc;
+ char **log_argv;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "f")) != -1) {
+ switch (c) {
+ case 'f':
+ force = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ /* remember the full command line for the history log before shifting */
+ log_argc = argc;
+ log_argv = argv;
+ argc -= optind;
+ argv += optind;
+
+ /* get pool name and check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name argument\n"));
+ usage(B_FALSE);
+ }
+
+ poolname = argv[0];
+
+ if (argc < 2) {
+ (void) fprintf(stderr,
+ gettext("missing <device> specification\n"));
+ usage(B_FALSE);
+ }
+
+ old_disk = argv[1];
+
+ /*
+ * After this branch argv[0] is the new device (which is the old device
+ * itself when replacing in place) and argc counts it plus anything
+ * that follows.
+ */
+ if (argc < 3) {
+ if (!replacing) {
+ (void) fprintf(stderr,
+ gettext("missing <new_device> specification\n"));
+ usage(B_FALSE);
+ }
+ new_disk = old_disk;
+ argc -= 1;
+ argv += 1;
+ } else {
+ new_disk = argv[2];
+ argc -= 2;
+ argv += 2;
+ }
+
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+ return (1);
+
+ if ((config = zpool_get_config(zhp, NULL)) == NULL) {
+ (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"),
+ poolname);
+ zpool_close(zhp);
+ return (1);
+ }
+
+ /* build the vdev description for the single new device */
+ nvroot = make_root_vdev(config, force, B_FALSE, replacing, argc, argv);
+ if (nvroot == NULL) {
+ zpool_close(zhp);
+ return (1);
+ }
+
+ ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing);
+
+ if (!ret) {
+ zpool_log_history(g_zfs, log_argc, log_argv, poolname, B_TRUE,
+ B_FALSE);
+ }
+
+ nvlist_free(nvroot);
+ zpool_close(zhp);
+
+ return (ret);
+}
+
+/*
+ * zpool replace [-f] <pool> <device> [new_device]
+ *
+ * -f Force attach, even if <new_device> appears to be in use.
+ *
+ * Replace <device> with <new_device>.  When <new_device> is omitted,
+ * <device> is used as its own replacement.
+ */
+/* ARGSUSED */
+int
+zpool_do_replace(int argc, char **argv)
+{
+	int status;
+
+	status = zpool_do_attach_or_replace(argc, argv, B_TRUE);
+
+	return (status);
+}
+
+/*
+ * zpool attach [-f] <pool> <device> <new_device>
+ *
+ * -f Force attach, even if <new_device> appears to be in use.
+ *
+ * Attach <new_device> to the mirror containing <device>. If <device> is not
+ * part of a mirror, then <device> will be transformed into a mirror of
+ * <device> and <new_device>. In either case, <new_device> will begin life
+ * with a DTL of [0, now], and will immediately begin to resilver itself.
+ *
+ * This is the non-replacing mode of zpool_do_attach_or_replace().
+ */
+int
+zpool_do_attach(int argc, char **argv)
+{
+	int status;
+
+	status = zpool_do_attach_or_replace(argc, argv, B_FALSE);
+
+	return (status);
+}
+
+/*
+ * zpool detach [-f] <pool> <device>
+ *
+ * -f Force detach of <device>, even if DTLs argue against it
+ * (not supported yet)
+ *
+ * Detach a device from a mirror. The operation will be refused if <device>
+ * is the last device in the mirror, or if the DTLs indicate that this device
+ * has the only valid copy of some data.
+ *
+ * Returns 1 if the pool cannot be opened, otherwise the result of
+ * zpool_vdev_detach().  The command is logged to the pool history only on
+ * success.
+ */
+/* ARGSUSED */
+int
+zpool_do_detach(int argc, char **argv)
+{
+	int c;
+	char *poolname, *path;
+	zpool_handle_t *zhp;
+	int ret;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "f")) != -1) {
+		switch (c) {
+		case 'f':
+		case '?':
+			/*
+			 * '-f' is accepted by getopt() but not implemented,
+			 * so it is rejected like an unknown option.  optopt
+			 * is only set when getopt() reports an error, so the
+			 * recognised option has to be taken from 'c'.
+			 */
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    c == '?' ? optopt : c);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc < 2) {
+		(void) fprintf(stderr,
+		    gettext("missing <device> specification\n"));
+		usage(B_FALSE);
+	}
+
+	poolname = argv[0];
+	path = argv[1];
+
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+		return (1);
+
+	ret = zpool_vdev_detach(zhp, path);
+
+	if (!ret) {
+		/* undo the option shift so the full command line is logged */
+		zpool_log_history(g_zfs, argc + optind, argv - optind, poolname,
+		    B_TRUE, B_FALSE);
+	}
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool online <pool> <device> ...
+ *
+ * Bring each named device of <pool> online.  Returns 0 if every device was
+ * brought online and 1 if the pool could not be opened or any device failed.
+ * The command is logged to the pool history only if all devices succeeded.
+ */
+/* ARGSUSED */
+int
+zpool_do_online(int argc, char **argv)
+{
+	int c, i;
+	char *poolname;
+	zpool_handle_t *zhp;
+	int ret = 0;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "t")) != -1) {
+		switch (c) {
+		case 't':
+		case '?':
+			/*
+			 * '-t' is accepted by getopt() but rejected here like
+			 * an unknown option.  optopt is only set when
+			 * getopt() reports an error, so the recognised option
+			 * has to be taken from 'c'.
+			 */
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    c == '?' ? optopt : c);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing device name\n"));
+		usage(B_FALSE);
+	}
+
+	poolname = argv[0];
+
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+		return (1);
+
+	/* keep going after a failure so every device is attempted */
+	for (i = 1; i < argc; i++) {
+		if (zpool_vdev_online(zhp, argv[i]) == 0)
+			(void) printf(gettext("Bringing device %s online\n"),
+			    argv[i]);
+		else
+			ret = 1;
+	}
+
+	if (!ret) {
+		/* undo the option shift so the full command line is logged */
+		zpool_log_history(g_zfs, argc + optind, argv - optind, poolname,
+		    B_TRUE, B_FALSE);
+	}
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool offline [-ft] <pool> <device> ...
+ *
+ * -f Force the device into the offline state, even if doing
+ * so would appear to compromise pool availability.
+ * (not supported yet)
+ *
+ * -t Only take the device off-line temporarily. The offline
+ * state will not be persistent across reboots.
+ *
+ * Returns 0 if every device was taken offline and 1 if the pool could not be
+ * opened or any device failed.  The command is logged to the pool history
+ * only if all devices succeeded.
+ */
+/* ARGSUSED */
+int
+zpool_do_offline(int argc, char **argv)
+{
+	int c, i;
+	char *poolname;
+	zpool_handle_t *zhp;
+	int ret = 0;
+	boolean_t istmp = B_FALSE;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "ft")) != -1) {
+		switch (c) {
+		case 't':
+			istmp = B_TRUE;
+			break;
+		case 'f':
+		case '?':
+			/*
+			 * '-f' is accepted by getopt() but not implemented,
+			 * so it is rejected like an unknown option.  optopt
+			 * is only set when getopt() reports an error, so the
+			 * recognised option has to be taken from 'c'.
+			 */
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    c == '?' ? optopt : c);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing device name\n"));
+		usage(B_FALSE);
+	}
+
+	poolname = argv[0];
+
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+		return (1);
+
+	/* keep going after a failure so every device is attempted */
+	for (i = 1; i < argc; i++) {
+		if (zpool_vdev_offline(zhp, argv[i], istmp) == 0)
+			(void) printf(gettext("Bringing device %s offline\n"),
+			    argv[i]);
+		else
+			ret = 1;
+	}
+
+	if (!ret) {
+		/* undo the option shift so the full command line is logged */
+		zpool_log_history(g_zfs, argc + optind, argv - optind, poolname,
+		    B_TRUE, B_FALSE);
+	}
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool clear <pool> [device]
+ *
+ * Clear the errors recorded for a whole pool, or for a single device of it
+ * when one is named.  argv[0] is the subcommand itself; no options are
+ * parsed.  Returns 0 on success and 1 if the pool cannot be opened or the
+ * clear fails; only a successful clear is logged to the pool history.
+ */
+int
+zpool_do_clear(int argc, char **argv)
+{
+	zpool_handle_t *pool;
+	char *pool_name;
+	char *vdev = NULL;
+	int status;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 3) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	pool_name = argv[1];
+	if (argc == 3)
+		vdev = argv[2];
+
+	pool = zpool_open(g_zfs, pool_name);
+	if (pool == NULL)
+		return (1);
+
+	if (zpool_clear(pool, vdev) == 0) {
+		zpool_log_history(g_zfs, argc, argv, pool_name, B_TRUE,
+		    B_FALSE);
+		status = 0;
+	} else {
+		status = 1;
+	}
+
+	zpool_close(pool);
+
+	return (status);
+}
+
+/*
+ * State passed to scrub_callback() by 'zpool scrub'.
+ */
+typedef struct scrub_cbdata {
+ /* scrub type handed to zpool_scrub(); POOL_SCRUB_NONE when '-s' is given */
+ int cb_type;
+ /* unshifted command line, recorded in the pool history on success */
+ int cb_argc;
+ char **cb_argv;
+} scrub_cbdata_t;
+
+/*
+ * Pool iterator callback for 'zpool scrub': apply the scrub request stored
+ * in 'data' to 'zhp'.  Unavailable pools are reported and skipped.  Returns
+ * 0 on success, in which case the command is logged to the pool history,
+ * and 1 otherwise.
+ */
+int
+scrub_callback(zpool_handle_t *zhp, void *data)
+{
+	scrub_cbdata_t *args = data;
+
+	if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) {
+		if (zpool_scrub(zhp, args->cb_type) != 0)
+			return (1);
+
+		zpool_log_history(g_zfs, args->cb_argc, args->cb_argv,
+		    zpool_get_name(zhp), B_TRUE, B_FALSE);
+		return (0);
+	}
+
+	/*
+	 * Ignore faulted pools.
+	 */
+	(void) fprintf(stderr, gettext("cannot scrub '%s': pool is "
+	    "currently unavailable\n"), zpool_get_name(zhp));
+	return (1);
+}
+
+/*
+ * zpool scrub [-s] <pool> ...
+ *
+ *	-s	Stop.  Stops any in-progress scrub.
+ *
+ * Runs scrub_callback() on every named pool.  Without '-s' the scrub type
+ * is POOL_SCRUB_EVERYTHING; with '-s' it is POOL_SCRUB_NONE.
+ */
+int
+zpool_do_scrub(int argc, char **argv)
+{
+	scrub_cbdata_t cb;
+	int opt;
+
+	cb.cb_type = POOL_SCRUB_EVERYTHING;
+
+	/* check options */
+	for (;;) {
+		opt = getopt(argc, argv, "s");
+		if (opt == -1)
+			break;
+
+		if (opt == 's') {
+			cb.cb_type = POOL_SCRUB_NONE;
+		} else if (opt == '?') {
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	/*
+	 * Capture the complete command line for history logging before the
+	 * options are stripped off below.
+	 */
+	cb.cb_argc = argc;
+	cb.cb_argv = argv;
+
+	argv += optind;
+	argc -= optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+
+	return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb));
+}
+
+typedef struct status_cbdata { /* state shared by zpool_do_status() and status_callback() */
+ int cb_count; /* number of pools status_callback() has been called for */
+ boolean_t cb_allpools; /* set when no pool names were given on the command line */
+ boolean_t cb_verbose; /* '-v': print the full error log */
+ boolean_t cb_explain; /* '-x': only report pools with problems */
+ boolean_t cb_first; /* B_TRUE until the first pool has produced output */
+} status_cbdata_t;
+
+/*
+ * Print out detailed scrub status.
+ *
+ * Reads the vdev statistics attached to 'nvroot' and prints one of three
+ * things: "none requested" when no scrub has been run, a completion line
+ * for a finished or stopped scrub/resilver, or a progress line with an
+ * estimate of the time remaining for one that is still running.
+ */
+void
+print_scrub_status(nvlist_t *nvroot)
+{
+	vdev_stat_t *stats;
+	uint_t nstats;
+	time_t began, finished, current;
+	uint64_t done, total, mins;
+	double fraction_done;
+	char *what;
+
+	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+	    (uint64_t **)&stats, &nstats) == 0);
+
+	/*
+	 * If there's never been a scrub, there's not much to say.
+	 */
+	if (stats->vs_scrub_end == 0 &&
+	    stats->vs_scrub_type == POOL_SCRUB_NONE) {
+		(void) printf(gettext("none requested\n"));
+		return;
+	}
+
+	if (stats->vs_scrub_type == POOL_SCRUB_RESILVER)
+		what = "resilver";
+	else
+		what = "scrub";
+
+	began = stats->vs_scrub_start;
+	finished = stats->vs_scrub_end;
+	current = time(NULL);
+	done = stats->vs_scrub_examined;
+	total = stats->vs_alloc;
+
+	/* The string returned by ctime() supplies the trailing newline. */
+	if (finished != 0) {
+		(void) printf(gettext("%s %s with %llu errors on %s"),
+		    what, stats->vs_scrub_complete ? "completed" : "stopped",
+		    (u_longlong_t)stats->vs_scrub_errors, ctime(&finished));
+		return;
+	}
+
+	/* Clamp so the fraction is never zero and never exceeds one. */
+	if (done == 0)
+		done = 1;
+	if (done > total)
+		total = done;
+
+	fraction_done = (double)done / total;
+	mins = (uint64_t)((current - began) *
+	    (1 - fraction_done) / fraction_done / 60);
+
+	(void) printf(gettext("%s in progress, %.2f%% done, %lluh%um to go\n"),
+	    what, 100 * fraction_done,
+	    (u_longlong_t)(mins / 60), (uint_t)(mins % 60));
+}
+
+typedef struct spare_cbdata { /* search state for find_spare() */
+ uint64_t cb_guid; /* GUID of the vdev being searched for */
+ zpool_handle_t *cb_zhp; /* set by find_spare() to the pool containing cb_guid */
+} spare_cbdata_t;
+
+/*
+ * Recursively search the vdev tree rooted at 'nv' for a vdev whose GUID
+ * equals 'search'.  Returns B_TRUE when one is found, B_FALSE otherwise.
+ */
+static boolean_t
+find_vdev(nvlist_t *nv, uint64_t search)
+{
+	nvlist_t **kids;
+	uint_t nkids, i;
+	uint64_t guid;
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
+	    guid == search)
+		return (B_TRUE);
+
+	/* A vdev without a children array has nothing further to search. */
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) != 0)
+		return (B_FALSE);
+
+	for (i = 0; i < nkids; i++) {
+		if (find_vdev(kids[i], search))
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * zpool_iter() callback used to locate the pool that contains a given
+ * vdev.  When the pool's vdev tree holds cb_guid, the handle is stored in
+ * cb_zhp and left open for the caller, and 1 is returned.  Otherwise the
+ * handle is closed and 0 is returned.
+ */
+static int
+find_spare(zpool_handle_t *zhp, void *data)
+{
+	spare_cbdata_t *search = data;
+	nvlist_t *config;
+	nvlist_t *nvroot;
+
+	config = zpool_get_config(zhp, NULL);
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+
+	if (!find_vdev(nvroot, search->cb_guid)) {
+		zpool_close(zhp);
+		return (0);
+	}
+
+	search->cb_zhp = zhp;
+	return (1);
+}
+
+/*
+ * Print out configuration state as requested by status_callback.
+ *
+ * Prints one line for the vdev 'nv', labelled 'name' and indented by
+ * 'depth' columns inside a name column 'namewidth' wide, then recurses
+ * into each child with depth + 2.  Every line shows the vdev state; the
+ * read/write/checksum error counts follow unless 'isspare' is set.  The
+ * line ends with at most one note: the previous path of a device flagged
+ * not-present, a description of a non-zero vs_aux code, or the amount of
+ * data repaired/resilvered on a leaf vdev.
+ *
+ * For a vdev with vs_aux == VDEV_AUX_SPARED the pools are searched with
+ * find_spare() to report which pool is using the spare; the handle that
+ * search returns is closed here.
+ */
+void
+print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
+ int namewidth, int depth, boolean_t isspare)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ vdev_stat_t *vs;
+ char rbuf[6], wbuf[6], cbuf[6], repaired[7];
+ char *vname;
+ uint64_t notpresent;
+ spare_cbdata_t cb;
+ const char *state;
+
+ verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+ (uint64_t **)&vs, &c) == 0);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ children = 0;
+
+ state = state_to_name(vs);
+ if (isspare) {
+ /*
+ * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for
+ * online drives.
+ */
+ if (vs->vs_aux == VDEV_AUX_SPARED)
+ state = "INUSE";
+ else if (vs->vs_state == VDEV_STATE_HEALTHY)
+ state = "AVAIL";
+ }
+
+ (void) printf("\t%*s%-*s %-8s", depth, "", namewidth - depth,
+ name, state);
+
+ if (!isspare) {
+ zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
+ zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
+ zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf));
+ (void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
+ }
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+ &notpresent) == 0) {
+ char *path;
+ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
+ (void) printf(" was %s", path);
+ } else if (vs->vs_aux != 0) {
+ (void) printf(" ");
+
+ switch (vs->vs_aux) {
+ case VDEV_AUX_OPEN_FAILED:
+ (void) printf(gettext("cannot open"));
+ break;
+
+ case VDEV_AUX_BAD_GUID_SUM:
+ (void) printf(gettext("missing device"));
+ break;
+
+ case VDEV_AUX_NO_REPLICAS:
+ (void) printf(gettext("insufficient replicas"));
+ break;
+
+ case VDEV_AUX_VERSION_NEWER:
+ (void) printf(gettext("newer version"));
+ break;
+
+ case VDEV_AUX_SPARED:
+ verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+ &cb.cb_guid) == 0);
+ if (zpool_iter(g_zfs, find_spare, &cb) == 1) {
+ if (strcmp(zpool_get_name(cb.cb_zhp),
+ zpool_get_name(zhp)) == 0)
+ (void) printf(gettext("currently in "
+ "use"));
+ else
+ (void) printf(gettext("in use by "
+ "pool '%s'"),
+ zpool_get_name(cb.cb_zhp));
+ zpool_close(cb.cb_zhp);
+ } else {
+ (void) printf(gettext("currently in use"));
+ }
+ break;
+
+ default:
+ (void) printf(gettext("corrupted data"));
+ break;
+ }
+ } else if (vs->vs_scrub_repaired != 0 && children == 0) {
+ /*
+ * Report bytes resilvered/repaired on leaf devices.
+ */
+ zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired));
+ (void) printf(gettext(" %s %s"), repaired,
+ (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
+ "resilvered" : "repaired");
+ }
+
+ (void) printf("\n");
+
+ for (c = 0; c < children; c++) {
+ vname = zpool_vdev_name(g_zfs, zhp, child[c]);
+ print_status_config(zhp, vname, child[c],
+ namewidth, depth + 2, isspare);
+ free(vname);
+ }
+}
+
+/*
+ * Print the path of every object recorded in the pool's persistent error
+ * log, one per line.  If the log cannot be fetched, a one-line notice is
+ * printed instead.
+ */
+static void
+print_error_log(zpool_handle_t *zhp)
+{
+	nvlist_t *errlist;
+	nvpair_t *pair;
+	char *path;
+	size_t pathlen = MAXPATHLEN * 2;
+
+	if (zpool_get_errlog(zhp, &errlist) != 0) {
+		(void) printf("errors: List of errors unavailable "
+		    "(insufficient privileges)\n");
+		return;
+	}
+
+	(void) printf("errors: Permanent errors have been "
+	    "detected in the following files:\n\n");
+
+	/* One buffer is reused for every entry in the log. */
+	path = safe_malloc(pathlen);
+
+	for (pair = nvlist_next_nvpair(errlist, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(errlist, pair)) {
+		nvlist_t *entry;
+		uint64_t dsobj, obj;
+
+		verify(nvpair_value_nvlist(pair, &entry) == 0);
+		verify(nvlist_lookup_uint64(entry, ZPOOL_ERR_DATASET,
+		    &dsobj) == 0);
+		verify(nvlist_lookup_uint64(entry, ZPOOL_ERR_OBJECT,
+		    &obj) == 0);
+		zpool_obj_to_path(zhp, dsobj, obj, path, pathlen);
+		(void) printf("%7s %s\n", "", path);
+	}
+
+	free(path);
+	nvlist_free(errlist);
+}
+
+/*
+ * Print the "spares" section of the status output: a heading followed by
+ * one print_status_config() entry per hot spare.  Prints nothing when the
+ * pool has no spares.
+ */
+static void
+print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares,
+    int namewidth)
+{
+	uint_t n;
+
+	if (nspares == 0)
+		return;
+
+	(void) printf(gettext("\tspares\n"));
+
+	for (n = 0; n < nspares; n++) {
+		/* zpool_vdev_name() hands back a string we must free. */
+		char *label = zpool_vdev_name(g_zfs, zhp, spares[n]);
+
+		print_status_config(zhp, label, spares[n],
+		    namewidth, 2, B_TRUE);
+		free(label);
+	}
+}
+
+/*
+ * Display a summary of pool status. Displays a summary such as:
+ *
+ * pool: tank
+ * status: DEGRADED
+ * reason: One or more devices ...
+ * see: http://www.sun.com/msg/ZFS-xxxx-01
+ * config:
+ * mirror DEGRADED
+ * c1t0d0 OK
+ * c2t0d0 UNAVAIL
+ *
+ * 'data' points to a status_cbdata_t.  When cb_explain is set ('-x'), a
+ * pool whose status is ZPOOL_STATUS_OK is skipped, or reported with a
+ * single "is healthy" line if it was named explicitly.  When cb_verbose is
+ * set ('-v'), the files named in the error log are listed instead of only
+ * the error count.  A blank line separates the output of successive pools.
+ * Always returns 0.
+ *
+ * NOTE(review): 'config' is passed to nvlist_lookup_nvlist() under verify()
+ * before the later 'config != NULL' test, so the "configuration cannot be
+ * determined" branch looks unreachable -- confirm.
+ */
+int
+status_callback(zpool_handle_t *zhp, void *data)
+{
+ status_cbdata_t *cbp = data;
+ nvlist_t *config, *nvroot;
+ char *msgid;
+ int reason;
+ const char *health;
+ uint_t c;
+ vdev_stat_t *vs;
+
+ config = zpool_get_config(zhp, NULL);
+ reason = zpool_get_status(zhp, &msgid);
+
+ cbp->cb_count++;
+
+ /*
+ * If we were given 'zpool status -x', only report those pools with
+ * problems.
+ */
+ if (reason == ZPOOL_STATUS_OK && cbp->cb_explain) {
+ if (!cbp->cb_allpools) {
+ (void) printf(gettext("pool '%s' is healthy\n"),
+ zpool_get_name(zhp));
+ if (cbp->cb_first)
+ cbp->cb_first = B_FALSE;
+ }
+ return (0);
+ }
+
+ if (cbp->cb_first)
+ cbp->cb_first = B_FALSE;
+ else
+ (void) printf("\n");
+
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+ (uint64_t **)&vs, &c) == 0);
+ health = state_to_name(vs);
+
+ (void) printf(gettext(" pool: %s\n"), zpool_get_name(zhp));
+ (void) printf(gettext(" state: %s\n"), health);
+
+ switch (reason) {
+ case ZPOOL_STATUS_MISSING_DEV_R:
+ (void) printf(gettext("status: One or more devices could not "
+ "be opened. Sufficient replicas exist for\n\tthe pool to "
+ "continue functioning in a degraded state.\n"));
+ (void) printf(gettext("action: Attach the missing device and "
+ "online it using 'zpool online'.\n"));
+ break;
+
+ case ZPOOL_STATUS_MISSING_DEV_NR:
+ (void) printf(gettext("status: One or more devices could not "
+ "be opened. There are insufficient\n\treplicas for the "
+ "pool to continue functioning.\n"));
+ (void) printf(gettext("action: Attach the missing device and "
+ "online it using 'zpool online'.\n"));
+ break;
+
+ case ZPOOL_STATUS_CORRUPT_LABEL_R:
+ (void) printf(gettext("status: One or more devices could not "
+ "be used because the label is missing or\n\tinvalid. "
+ "Sufficient replicas exist for the pool to continue\n\t"
+ "functioning in a degraded state.\n"));
+ (void) printf(gettext("action: Replace the device using "
+ "'zpool replace'.\n"));
+ break;
+
+ case ZPOOL_STATUS_CORRUPT_LABEL_NR:
+ (void) printf(gettext("status: One or more devices could not "
+ "be used because the label is missing \n\tor invalid. "
+ "There are insufficient replicas for the pool to "
+ "continue\n\tfunctioning.\n"));
+ (void) printf(gettext("action: Destroy and re-create the pool "
+ "from a backup source.\n"));
+ break;
+
+ case ZPOOL_STATUS_FAILING_DEV:
+ (void) printf(gettext("status: One or more devices has "
+ "experienced an unrecoverable error. An\n\tattempt was "
+ "made to correct the error. Applications are "
+ "unaffected.\n"));
+ (void) printf(gettext("action: Determine if the device needs "
+ "to be replaced, and clear the errors\n\tusing "
+ "'zpool clear' or replace the device with 'zpool "
+ "replace'.\n"));
+ break;
+
+ case ZPOOL_STATUS_OFFLINE_DEV:
+ (void) printf(gettext("status: One or more devices has "
+ "been taken offline by the administrator.\n\tSufficient "
+ "replicas exist for the pool to continue functioning in "
+ "a\n\tdegraded state.\n"));
+ (void) printf(gettext("action: Online the device using "
+ "'zpool online' or replace the device with\n\t'zpool "
+ "replace'.\n"));
+ break;
+
+ case ZPOOL_STATUS_RESILVERING:
+ (void) printf(gettext("status: One or more devices is "
+ "currently being resilvered. The pool will\n\tcontinue "
+ "to function, possibly in a degraded state.\n"));
+ (void) printf(gettext("action: Wait for the resilver to "
+ "complete.\n"));
+ break;
+
+ case ZPOOL_STATUS_CORRUPT_DATA:
+ (void) printf(gettext("status: One or more devices has "
+ "experienced an error resulting in data\n\tcorruption. "
+ "Applications may be affected.\n"));
+ (void) printf(gettext("action: Restore the file in question "
+ "if possible. Otherwise restore the\n\tentire pool from "
+ "backup.\n"));
+ break;
+
+ case ZPOOL_STATUS_CORRUPT_POOL:
+ (void) printf(gettext("status: The pool metadata is corrupted "
+ "and the pool cannot be opened.\n"));
+ (void) printf(gettext("action: Destroy and re-create the pool "
+ "from a backup source.\n"));
+ break;
+
+ case ZPOOL_STATUS_VERSION_OLDER:
+ (void) printf(gettext("status: The pool is formatted using an "
+ "older on-disk format. The pool can\n\tstill be used, but "
+ "some features are unavailable.\n"));
+ (void) printf(gettext("action: Upgrade the pool using 'zpool "
+ "upgrade'. Once this is done, the\n\tpool will no longer "
+ "be accessible on older software versions.\n"));
+ break;
+
+ case ZPOOL_STATUS_VERSION_NEWER:
+ (void) printf(gettext("status: The pool has been upgraded to a "
+ "newer, incompatible on-disk version.\n\tThe pool cannot "
+ "be accessed on this system.\n"));
+ (void) printf(gettext("action: Access the pool from a system "
+ "running more recent software, or\n\trestore the pool from "
+ "backup.\n"));
+ break;
+
+ default:
+ /*
+ * The remaining errors can't actually be generated, yet.
+ */
+ assert(reason == ZPOOL_STATUS_OK);
+ }
+
+ if (msgid != NULL)
+ (void) printf(gettext(" see: http://www.sun.com/msg/%s\n"),
+ msgid);
+
+ if (config != NULL) {
+ int namewidth;
+ uint64_t nerr;
+ nvlist_t **spares;
+ uint_t nspares;
+
+
+ (void) printf(gettext(" scrub: "));
+ print_scrub_status(nvroot);
+
+ namewidth = max_width(zhp, nvroot, 0, 0);
+ if (namewidth < 10)
+ namewidth = 10;
+
+ (void) printf(gettext("config:\n\n"));
+ (void) printf(gettext("\t%-*s %-8s %5s %5s %5s\n"), namewidth,
+ "NAME", "STATE", "READ", "WRITE", "CKSUM");
+ print_status_config(zhp, zpool_get_name(zhp), nvroot,
+ namewidth, 0, B_FALSE);
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0)
+ print_spares(zhp, spares, nspares, namewidth);
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
+ &nerr) == 0) {
+ nvlist_t *nverrlist = NULL;
+
+ /*
+ * If the approximate error count is small, get a
+ * precise count by fetching the entire log and
+ * uniquifying the results.
+ */
+ if (nerr < 100 && !cbp->cb_verbose &&
+ zpool_get_errlog(zhp, &nverrlist) == 0) {
+ nvpair_t *elem;
+
+ elem = NULL;
+ nerr = 0;
+ while ((elem = nvlist_next_nvpair(nverrlist,
+ elem)) != NULL) {
+ nerr++;
+ }
+ }
+ nvlist_free(nverrlist);
+
+ (void) printf("\n");
+
+ if (nerr == 0)
+ (void) printf(gettext("errors: No known data "
+ "errors\n"));
+ else if (!cbp->cb_verbose)
+ (void) printf(gettext("errors: %llu data "
+ "errors, use '-v' for a list\n"),
+ (u_longlong_t)nerr);
+ else
+ print_error_log(zhp);
+ }
+ } else {
+ (void) printf(gettext("config: The configuration cannot be "
+ "determined.\n"));
+ }
+
+ return (0);
+}
+
+/*
+ * zpool status [-vx] [pool] ...
+ *
+ *	-v	Display complete error logs
+ *	-x	Display only pools with potential problems
+ *
+ * Reports the health of the named pools, or of every pool when none is
+ * named.  Returns the result of the pool iteration.
+ */
+int
+zpool_do_status(int argc, char **argv)
+{
+	status_cbdata_t cb = { 0 };
+	int opt;
+	int ret;
+
+	/* check options */
+	while ((opt = getopt(argc, argv, "vx")) != -1) {
+		if (opt == 'v') {
+			cb.cb_verbose = B_TRUE;
+		} else if (opt == 'x') {
+			cb.cb_explain = B_TRUE;
+		} else if (opt == '?') {
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	cb.cb_first = B_TRUE;
+
+	/* With no pool names on the command line, every pool is reported. */
+	if (argc == 0)
+		cb.cb_allpools = B_TRUE;
+
+	ret = for_each_pool(argc, argv, B_TRUE, NULL, status_callback, &cb);
+
+	if (argc == 0 && cb.cb_count == 0) {
+		(void) printf(gettext("no pools available\n"));
+	} else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) {
+		/* '-x' was given and no pool produced any output. */
+		(void) printf(gettext("all pools are healthy\n"));
+	}
+
+	return (ret);
+}
+
+typedef struct upgrade_cbdata { /* state shared by zpool_do_upgrade() and its callbacks */
+ int cb_all; /* '-a': upgrade every out-of-date pool */
+ int cb_first; /* true until upgrade_cb() has reported or upgraded a pool */
+ int cb_newer; /* when set, upgrade_cb() lists pools newer than ZFS_VERSION */
+ int cb_argc; /* argument count of the command, for history logging */
+ char **cb_argv; /* argument vector of the command, for history logging */
+} upgrade_cbdata_t;
+
+/*
+ * zpool_iter() callback for 'zpool upgrade' when no pool is named.
+ *
+ * With cb_newer clear, a pool older than ZFS_VERSION is either listed
+ * (cb_all clear) or upgraded and logged (cb_all set).  With cb_newer set,
+ * a pool newer than ZFS_VERSION is listed.  A table heading is printed
+ * before the first listed pool.  The pool handle is always closed before
+ * returning.  Returns the result of zpool_upgrade() when an upgrade was
+ * attempted, and 0 otherwise.
+ */
+static int
+upgrade_cb(zpool_handle_t *zhp, void *arg)
+{
+	upgrade_cbdata_t *cbp = arg;
+	nvlist_t *config;
+	uint64_t version;
+	int older, newer;
+	int ret = 0;
+
+	config = zpool_get_config(zhp, NULL);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+	    &version) == 0);
+
+	older = (!cbp->cb_newer && version < ZFS_VERSION);
+	newer = (cbp->cb_newer && version > ZFS_VERSION);
+
+	if (older && cbp->cb_all) {
+		cbp->cb_first = B_FALSE;
+		ret = zpool_upgrade(zhp);
+		if (ret == 0) {
+			zpool_log_history(g_zfs, cbp->cb_argc,
+			    cbp->cb_argv, zpool_get_name(zhp), B_TRUE,
+			    B_FALSE);
+			(void) printf(gettext("Successfully upgraded "
+			    "'%s'\n"), zpool_get_name(zhp));
+		}
+	} else if (older) {
+		if (cbp->cb_first) {
+			(void) printf(gettext("The following pools are "
+			    "out of date, and can be upgraded. After "
+			    "being\nupgraded, these pools will no "
+			    "longer be accessible by older software "
+			    "versions.\n\n"));
+			(void) printf(gettext("VER POOL\n"));
+			(void) printf(gettext("--- ------------\n"));
+			cbp->cb_first = B_FALSE;
+		}
+
+		(void) printf("%2llu %s\n", (u_longlong_t)version,
+		    zpool_get_name(zhp));
+	} else if (newer) {
+		assert(!cbp->cb_all);
+
+		if (cbp->cb_first) {
+			(void) printf(gettext("The following pools are "
+			    "formatted using a newer software version and\n"
+			    "cannot be accessed on the current system.\n\n"));
+			(void) printf(gettext("VER POOL\n"));
+			(void) printf(gettext("--- ------------\n"));
+			cbp->cb_first = B_FALSE;
+		}
+
+		(void) printf("%2llu %s\n", (u_longlong_t)version,
+		    zpool_get_name(zhp));
+	}
+
+	zpool_close(zhp);
+	return (ret);
+}
+
+/*
+ * for_each_pool() callback that upgrades a single named pool.  A pool
+ * already at ZFS_VERSION is reported as such and left untouched.  A
+ * successful upgrade is logged to the pool history and announced.
+ * Returns 0 on success and 1 if zpool_upgrade() fails.
+ */
+/* ARGSUSED */
+static int
+upgrade_one(zpool_handle_t *zhp, void *data)
+{
+	upgrade_cbdata_t *cbp = data;
+	nvlist_t *config;
+	uint64_t version;
+
+	config = zpool_get_config(zhp, NULL);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+	    &version) == 0);
+
+	if (version == ZFS_VERSION) {
+		(void) printf(gettext("Pool '%s' is already formatted "
+		    "using the current version.\n"), zpool_get_name(zhp));
+		return (0);
+	}
+
+	if (zpool_upgrade(zhp) != 0)
+		return (1);
+
+	zpool_log_history(g_zfs, cbp->cb_argc, cbp->cb_argv,
+	    zpool_get_name(zhp), B_TRUE, B_FALSE);
+	(void) printf(gettext("Successfully upgraded '%s' "
+	    "from version %llu to version %llu\n"), zpool_get_name(zhp),
+	    (u_longlong_t)version, (u_longlong_t)ZFS_VERSION);
+
+	return (0);
+}
+
+/*
+ * zpool upgrade
+ * zpool upgrade -v
+ * zpool upgrade <-a | pool>
+ *
+ * With no arguments, display downrev'd ZFS pool available for upgrade.
+ * Individual pools can be upgraded by specifying the pool, and '-a' will
+ * upgrade all pools.  With '-v', the on-disk versions supported by this
+ * software are listed instead; '-v' cannot be combined with '-a' or with
+ * pool names, and '-a' cannot be combined with pool names.
+ *
+ * Returns 0 on success, or the non-zero result of the pool iteration.
+ */
+int
+zpool_do_upgrade(int argc, char **argv)
+{
+	int c;
+	upgrade_cbdata_t cb = { 0 };
+	int ret = 0;
+	boolean_t showversions = B_FALSE;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "av")) != -1) {
+		switch (c) {
+		case 'a':
+			cb.cb_all = B_TRUE;
+			break;
+		case 'v':
+			showversions = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	/* Keep the complete command line for history logging. */
+	cb.cb_argc = argc;
+	cb.cb_argv = argv;
+	argc -= optind;
+	argv += optind;
+
+	if (showversions) {
+		if (cb.cb_all || argc != 0) {
+			(void) fprintf(stderr, gettext("-v option is "
+			    "incompatible with other arguments\n"));
+			usage(B_FALSE);
+		}
+	} else if (cb.cb_all) {
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("-a option is "
+			    "incompatible with other arguments\n"));
+			usage(B_FALSE);
+		}
+	}
+
+	/*
+	 * Cast explicitly so the argument matches %llu regardless of how
+	 * ZFS_VERSION is defined, as upgrade_one() already does.
+	 */
+	(void) printf(gettext("This system is currently running ZFS version "
+	    "%llu.\n\n"), (u_longlong_t)ZFS_VERSION);
+	cb.cb_first = B_TRUE;
+	if (showversions) {
+		(void) printf(gettext("The following versions are "
+		    "supported:\n\n"));
+		(void) printf(gettext("VER DESCRIPTION\n"));
+		(void) printf("--- -----------------------------------------"
+		    "---------------\n");
+		(void) printf(gettext(" 1 Initial ZFS version\n"));
+		(void) printf(gettext(" 2 Ditto blocks "
+		    "(replicated metadata)\n"));
+		(void) printf(gettext(" 3 Hot spares and double parity "
+		    "RAID-Z\n"));
+		(void) printf(gettext(" 4 zpool history\n"));
+		(void) printf(gettext(" 5 Compression using the gzip "
+		    "algorithm\n"));
+		/*
+		 * Terminate the last table row like the others so that the
+		 * leading "\n" of the next message yields a blank line.
+		 */
+		(void) printf(gettext(" 6 bootfs pool property\n"));
+		(void) printf(gettext("\nFor more information on a particular "
+		    "version, including supported releases, see:\n\n"));
+		(void) printf("http://www.opensolaris.org/os/community/zfs/"
+		    "version/N\n\n");
+		(void) printf(gettext("Where 'N' is the version number.\n"));
+	} else if (argc == 0) {
+		int notfound;
+
+		/* First pass: list (or, with -a, upgrade) older pools. */
+		ret = zpool_iter(g_zfs, upgrade_cb, &cb);
+		notfound = cb.cb_first;
+
+		/* Second pass, listing only: pools newer than this system. */
+		if (!cb.cb_all && ret == 0) {
+			if (!cb.cb_first)
+				(void) printf("\n");
+			cb.cb_first = B_TRUE;
+			cb.cb_newer = B_TRUE;
+			ret = zpool_iter(g_zfs, upgrade_cb, &cb);
+			if (!cb.cb_first) {
+				notfound = B_FALSE;
+				(void) printf("\n");
+			}
+		}
+
+		if (ret == 0) {
+			if (notfound)
+				(void) printf(gettext("All pools are formatted "
+				    "using this version.\n"));
+			else if (!cb.cb_all)
+				(void) printf(gettext("Use 'zpool upgrade -v' "
+				    "for a list of available versions and "
+				    "their associated\nfeatures.\n"));
+		}
+	} else {
+		ret = for_each_pool(argc, argv, B_FALSE, NULL,
+		    upgrade_one, &cb);
+	}
+
+	return (ret);
+}
+
+/*
+ * Print out the command history for a specific pool.
+ *
+ * Clears the boolean_t that 'data' points to, to record that a pool was
+ * visited.  Every history record that carries a timestamp is printed as
+ * "<date>.<time> <command>" in local time; records without a timestamp
+ * are skipped.  Returns the result of zpool_get_history().
+ */
+static int
+get_history_one(zpool_handle_t *zhp, void *data)
+{
+	nvlist_t *nvhis;
+	nvlist_t **records;
+	uint_t numrecords;
+	int ret, i;
+
+	*(boolean_t *)data = B_FALSE;
+
+	(void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp));
+
+	ret = zpool_get_history(zhp, &nvhis);
+	if (ret != 0)
+		return (ret);
+
+	verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD,
+	    &records, &numrecords) == 0);
+
+	for (i = 0; i < numrecords; i++) {
+		uint64_t dst_time;
+		char *cmdstr;
+		time_t tsec;
+		struct tm t;
+		char tbuf[30];
+
+		if (nvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME,
+		    &dst_time) != 0)
+			continue;
+
+		verify(nvlist_lookup_string(records[i], ZPOOL_HIST_CMD,
+		    &cmdstr) == 0);
+		tsec = dst_time;
+		(void) localtime_r(&tsec, &t);
+		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
+		(void) printf("%s %s\n", tbuf, cmdstr);
+	}
+
+	(void) printf("\n");
+	nvlist_free(nvhis);
+
+	return (ret);
+}
+
+/*
+ * zpool history <pool>
+ *
+ * Displays the history of commands that modified pools.  When no pool is
+ * named and the iteration visits none, "no pools available" is printed
+ * and 0 is returned; otherwise the iteration result is returned.
+ */
+int
+zpool_do_history(int argc, char **argv)
+{
+	/* get_history_one() clears this for every pool it is called on. */
+	boolean_t nothing_seen = B_TRUE;
+	int ret;
+
+	argv += optind;
+	argc -= optind;
+
+	ret = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one,
+	    &nothing_seen);
+
+	if (argc != 0 || nothing_seen != B_TRUE)
+		return (ret);
+
+	(void) printf(gettext("no pools available\n"));
+	return (0);
+}
+
+/*
+ * for_each_pool() callback for 'zpool get': print each property on the
+ * requested list for one pool.  A property whose value cannot be fetched
+ * is silently skipped.  Always returns 0.
+ */
+static int
+get_callback(zpool_handle_t *zhp, void *data)
+{
+	libzfs_get_cbdata_t *cbp = (libzfs_get_cbdata_t *)data;
+	zpool_proplist_t *entry;
+	zfs_source_t srctype;
+	char value[MAXNAMELEN];
+
+	for (entry = cbp->cb_proplist; entry != NULL; entry = entry->pl_next) {
+		/*
+		 * Skip the special fake placeholder.
+		 */
+		int placeholder = (entry == cbp->cb_proplist &&
+		    entry->pl_prop == ZFS_PROP_NAME);
+
+		if (!placeholder && zpool_get_prop(zhp, entry->pl_prop,
+		    value, sizeof (value), &srctype) == 0) {
+			libzfs_print_one_property(zpool_get_name(zhp), cbp,
+			    zpool_prop_to_name(entry->pl_prop), value,
+			    srctype, NULL);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Handler for 'zpool get'.  argv[1] is parsed as the property list and
+ * the remaining arguments name the pools to query.  The output has name,
+ * property, value and source columns.  Returns the result of the pool
+ * iteration.
+ */
+int
+zpool_do_get(int argc, char **argv)
+{
+	libzfs_get_cbdata_t cb = { 0 };
+	zpool_proplist_t name_entry = { 0 };
+	int ret;
+
+	if (argc < 3)
+		usage(B_FALSE);
+
+	cb.cb_first = B_TRUE;
+	cb.cb_sources = ZFS_SRC_ALL;
+	cb.cb_columns[0] = GET_COL_NAME;
+	cb.cb_columns[1] = GET_COL_PROPERTY;
+	cb.cb_columns[2] = GET_COL_VALUE;
+	cb.cb_columns[3] = GET_COL_SOURCE;
+
+	if (zpool_get_proplist(g_zfs, argv[1], &cb.cb_proplist) != 0)
+		usage(B_FALSE);
+
+	/*
+	 * Put a stack-allocated NAME entry at the head of a non-empty list;
+	 * get_callback() skips it when printing.
+	 */
+	if (cb.cb_proplist != NULL) {
+		name_entry.pl_prop = ZFS_PROP_NAME;
+		name_entry.pl_width = strlen(gettext("NAME"));
+		name_entry.pl_next = cb.cb_proplist;
+		cb.cb_proplist = &name_entry;
+	}
+
+	ret = for_each_pool(argc - 2, argv + 2, B_TRUE, &cb.cb_proplist,
+	    get_callback, &cb);
+
+	/* The fake head lives on the stack; free only what follows it. */
+	zfs_free_proplist(cb.cb_proplist == &name_entry ?
+	    name_entry.pl_next : cb.cb_proplist);
+
+	return (ret);
+}
+
+typedef struct set_cbdata { /* argument block handed to set_callback() */
+ char *cb_propname; /* property name passed to zpool_set_prop() */
+ char *cb_value; /* property value passed to zpool_set_prop() */
+ boolean_t cb_any_successful; /* set once zpool_set_prop() has succeeded for a pool */
+} set_cbdata_t;
+
+/*
+ * Per-pool worker for 'zpool set': apply the property/value pair held in
+ * the set_cbdata_t to the pool, noting in cb_any_successful when it
+ * succeeds.  Returns the result of zpool_set_prop().
+ */
+int
+set_callback(zpool_handle_t *zhp, void *data)
+{
+	set_cbdata_t *setp = (set_cbdata_t *)data;
+	int err = zpool_set_prop(zhp, setp->cb_propname, setp->cb_value);
+
+	if (err == 0)
+		setp->cb_any_successful = B_TRUE;
+
+	return (err);
+}
+
+/*
+ * Handler for 'zpool set'.  Expects exactly two arguments after the
+ * command name: a "property=value" string and a pool name.  The string is
+ * split in place at the first '=' while the property is applied, and is
+ * restored before the command is logged to the pool history.  Returns the
+ * result of the pool iteration.
+ */
+int
+zpool_do_set(int argc, char **argv)
+{
+	set_cbdata_t cb = { 0 };
+	char *eq;
+	int err;
+
+	if (argc > 1 && argv[1][0] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(B_FALSE);
+	}
+
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing property=value "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc < 3) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 3) {
+		(void) fprintf(stderr, gettext("too many pool names\n"));
+		usage(B_FALSE);
+	}
+
+	cb.cb_propname = argv[1];
+	eq = strchr(cb.cb_propname, '=');
+	if (eq == NULL) {
+		(void) fprintf(stderr, gettext("missing value in "
+		    "property=value argument\n"));
+		usage(B_FALSE);
+	}
+
+	/* Split "property=value" into two strings in place. */
+	*eq = '\0';
+	cb.cb_value = eq + 1;
+
+	err = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL,
+	    set_callback, &cb);
+
+	if (cb.cb_any_successful) {
+		/* Rejoin the argument so the history shows it as typed. */
+		*eq = '=';
+		zpool_log_history(g_zfs, argc, argv, argv[2], B_FALSE, B_FALSE);
+	}
+
+	return (err);
+}
+
+/*
+ * Look 'command' up in command_table.  On a match the table index is
+ * stored in '*idx' and 0 is returned; otherwise 1 is returned and '*idx'
+ * is left untouched.  Table entries with a NULL name are skipped.
+ */
+static int
+find_command_idx(char *command, int *idx)
+{
+	int n = 0;
+
+	while (n < NCOMMAND) {
+		if (command_table[n].name != NULL &&
+		    strcmp(command, command_table[n].name) == 0) {
+			*idx = n;
+			return (0);
+		}
+		n++;
+	}
+
+	return (1);
+}
+
+/*
+ * Program entry point.  Initializes the locale and the libzfs handle,
+ * looks argv[1] up in the command table and runs the matching handler
+ * with the command name as its argv[0].  The undocumented 'freeze'
+ * command is handled separately.  Returns the handler's result, or 1 if
+ * libzfs cannot be initialized.
+ */
+int
+main(int argc, char **argv)
+{
+	int ret = 0;
+	int i;
+	char *cmdname;
+	int found = 0;
+
+	(void) setlocale(LC_ALL, "");
+	(void) textdomain(TEXT_DOMAIN);
+
+	if ((g_zfs = libzfs_init()) == NULL) {
+		(void) fprintf(stderr, gettext("internal error: failed to "
+		    "initialize ZFS library\n"));
+		return (1);
+	}
+
+	libzfs_print_on_error(g_zfs, B_TRUE);
+
+	/* The command handlers print their own getopt() diagnostics. */
+	opterr = 0;
+
+	/*
+	 * Make sure the user has specified some command.
+	 */
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing command\n"));
+		usage(B_FALSE);
+	}
+
+	cmdname = argv[1];
+
+	/*
+	 * Special case '-?'
+	 */
+	if (strcmp(cmdname, "-?") == 0)
+		usage(B_TRUE);
+
+	/*
+	 * Run the appropriate command.
+	 */
+	if (find_command_idx(cmdname, &i) == 0) {
+		current_command = &command_table[i];
+		ret = command_table[i].func(argc - 1, argv + 1);
+		found++;
+	}
+
+	/*
+	 * 'freeze' is a vile debugging abomination, so we treat it as such.
+	 */
+	if (strcmp(cmdname, "freeze") == 0 && argc == 3) {
+		char buf[16384];
+		int fd;
+
+		/*
+		 * The pool name comes straight from the command line, so
+		 * make sure it fits before copying it into the buffer.
+		 */
+		if (strlen(argv[2]) >= sizeof (buf)) {
+			(void) fprintf(stderr,
+			    gettext("pool name is too long\n"));
+			return (1);
+		}
+
+		if ((fd = open(ZFS_DEV, O_RDWR)) < 0) {
+			(void) fprintf(stderr, gettext("cannot open %s\n"),
+			    ZFS_DEV);
+			return (1);
+		}
+
+		(void) strcpy(buf, argv[2]);
+		return (!!ioctl(fd, ZFS_IOC_POOL_FREEZE, buf));
+	}
+
+	if (!found) {
+		(void) fprintf(stderr, gettext("unrecognized "
+		    "command '%s'\n"), cmdname);
+		usage(B_FALSE);
+	}
+
+	libzfs_fini(g_zfs);
+
+	/*
+	 * The 'ZFS_ABORT' environment variable causes us to dump core on exit
+	 * for the purposes of running ::findleaks.
+	 */
+	if (getenv("ZFS_ABORT") != NULL) {
+		(void) printf("dumping core by request\n");
+		abort();
+	}
+
+	return (ret);
+}
diff --git a/contrib/opensolaris/cmd/zpool/zpool_util.c b/contrib/opensolaris/cmd/zpool/zpool_util.c
new file mode 100644
index 0000000..8eb9c81
--- /dev/null
+++ b/contrib/opensolaris/cmd/zpool/zpool_util.c
@@ -0,0 +1,79 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <errno.h>
+#include <libgen.h>
+#include <libintl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+
+#include "zpool_util.h"
+
+/*
+ * Allocate 'size' bytes of zero-filled memory.  If the allocation fails,
+ * an error is printed to stderr and the program exits with status 1, so
+ * callers never see a NULL return.
+ */
+void *
+safe_malloc(size_t size)
+{
+	void *buf = calloc(1, size);
+
+	if (buf == NULL) {
+		(void) fprintf(stderr, "internal error: out of memory\n");
+		exit(1);
+	}
+
+	return (buf);
+}
+
+/*
+ * Duplicate 'str' with strdup().  If the allocation fails, an error is
+ * printed to stderr and the program exits with status 1, so callers never
+ * see a NULL return.
+ */
+char *
+safe_strdup(const char *str)
+{
+	char *copy = strdup(str);
+
+	if (copy == NULL) {
+		(void) fprintf(stderr, "internal error: out of memory\n");
+		exit(1);
+	}
+
+	return (copy);
+}
+
+/*
+ * Display an out of memory error message and terminate the current
+ * program with exit status 1.  The caller is expected to have just seen an
+ * allocation fail: the function asserts that errno is ENOMEM.  Unlike
+ * safe_malloc() and safe_strdup(), the message is passed through gettext().
+ */
+void
+zpool_no_memory(void)
+{
+ assert(errno == ENOMEM);
+ (void) fprintf(stderr,
+ gettext("internal error: out of memory\n"));
+ exit(1);
+}
diff --git a/contrib/opensolaris/cmd/zpool/zpool_util.h b/contrib/opensolaris/cmd/zpool/zpool_util.h
new file mode 100644
index 0000000..cb05bda
--- /dev/null
+++ b/contrib/opensolaris/cmd/zpool/zpool_util.h
@@ -0,0 +1,72 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef ZPOOL_UTIL_H
+#define ZPOOL_UTIL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <libnvpair.h>
+#include <libzfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Basic utility functions
+ *
+ * safe_malloc() returns zero-filled memory and safe_strdup() returns a
+ * copy of its argument; both print an error and exit(1) instead of
+ * returning NULL.  zpool_no_memory() prints the same kind of error and
+ * exits.
+ */
+void *safe_malloc(size_t);
+char *safe_strdup(const char *);
+void zpool_no_memory(void);
+
+/*
+ * Virtual device functions
+ */
+nvlist_t *make_root_vdev(nvlist_t *poolconfig, int force, int check_rep,
+ boolean_t isreplace, int argc, char **argv);
+
+/*
+ * Pool list functions
+ *
+ * for_each_pool() takes an argument count and vector of pool names, an
+ * 'unavail' flag, an optional property list, and a callback with its
+ * argument; callers in zpool_main.c pass a zero count to cover all pools.
+ */
+int for_each_pool(int, char **, boolean_t unavail, zpool_proplist_t **,
+ zpool_iter_f, void *);
+
+typedef struct zpool_list zpool_list_t;
+
+zpool_list_t *pool_list_get(int, char **, zpool_proplist_t **, int *);
+void pool_list_update(zpool_list_t *);
+int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *);
+void pool_list_free(zpool_list_t *);
+int pool_list_count(zpool_list_t *);
+void pool_list_remove(zpool_list_t *, zpool_handle_t *);
+
+libzfs_handle_t *g_zfs; /* libzfs handle; main() assigns it from
+ * libzfs_init().  NOTE(review): this is a tentative definition in a
+ * header rather than an extern declaration, so it relies on the linker
+ * merging common symbols -- confirm before building with -fno-common. */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZPOOL_UTIL_H */
diff --git a/contrib/opensolaris/cmd/zpool/zpool_vdev.c b/contrib/opensolaris/cmd/zpool/zpool_vdev.c
new file mode 100644
index 0000000..de07723
--- /dev/null
+++ b/contrib/opensolaris/cmd/zpool/zpool_vdev.c
@@ -0,0 +1,850 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Functions to convert between a list of vdevs and an nvlist representing the
+ * configuration. Each entry in the list can be one of:
+ *
+ * Device vdevs
+ * disk=(path=..., devid=...)
+ * file=(path=...)
+ *
+ * Group vdevs
+ * raidz[1|2]=(...)
+ * mirror=(...)
+ *
+ * Hot spares
+ *
+ * While the underlying implementation supports it, group vdevs cannot contain
+ * other group vdevs. All userland verification of devices is contained within
+ * this file. If successful, the nvlist returned can be passed directly to the
+ * kernel; we've done as much verification as possible in userland.
+ *
+ * Hot spares are a special case, and passed down as an array of disk vdevs, at
+ * the same level as the root of the vdev tree.
+ *
+ * The main entry point of this file is 'make_root_vdev'. The function
+ * performs several passes:
+ *
+ * 1. Construct the vdev specification. Performs syntax validation and
+ * makes sure each device is valid.
+ * 2. Check for devices in use. Using libgeom, makes sure that no
+ * devices are also in use. Some can be overridden using the 'force'
+ * flag, others cannot.
+ * 3. Check for replication errors if the 'force' flag is not specified.
+ * Validates that the replication level is consistent across the
+ * entire pool.
+ */
+
+#include <assert.h>
+#include <devid.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libintl.h>
+#include <libnvpair.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <paths.h>
+#include <sys/stat.h>
+#include <sys/disk.h>
+#include <sys/mntent.h>
+#include <libgeom.h>
+
+#include <libzfs.h>
+
+#include "zpool_util.h"
+
+/*
+ * For any given vdev specification, we can have multiple errors. The
+ * vdev_error() function keeps track of whether we have seen an error yet, and
+ * prints out a header if it's the first error we've seen.
+ */
+boolean_t error_seen;
+boolean_t is_force;
+
+/*PRINTFLIKE1*/
+/*
+ * Report one problem with the vdev specification on stderr.  The first call
+ * also prints a banner whose second line depends on whether '-f' (is_force)
+ * was given; later calls print only the formatted message.
+ */
+static void
+vdev_error(const char *fmt, ...)
+{
+	va_list args;
+
+	/* The banner goes out only ahead of the very first reported error. */
+	if (error_seen == B_FALSE) {
+		error_seen = B_TRUE;
+
+		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
+		if (is_force) {
+			(void) fprintf(stderr, gettext("the following errors "
+			    "must be manually repaired:\n"));
+		} else {
+			(void) fprintf(stderr, gettext("use '-f' to override "
+			    "the following errors:\n"));
+		}
+	}
+
+	va_start(args, fmt);
+	(void) vfprintf(stderr, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Validate a GEOM provider.
+ *
+ * 'name' may be given with or without the _PATH_DEV prefix.  The provider is
+ * looked up in the GEOM tree and its access mode string ("r#w#e#") is parsed.
+ *
+ * Returns 0 if the provider exists and has neither write nor exclusive
+ * openers.  Returns -1, after reporting through vdev_error(), if the GEOM
+ * tree cannot be obtained, the provider does not exist, or it is in use.
+ *
+ * 'force' and 'isspare' are accepted but not consulted at present.
+ */
+static int
+check_provider(const char *name, boolean_t force, boolean_t isspare)
+{
+	struct gmesh mesh;
+	struct gclass *mp;
+	struct ggeom *gp;
+	struct gprovider *pp;
+	int rv;
+
+	/* XXX: What to do with isspare? */
+
+	if (strncmp(name, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
+		name += sizeof(_PATH_DEV) - 1;
+
+	/*
+	 * Handle failure explicitly instead of with assert(): assert()
+	 * compiles away under NDEBUG, and walking a mesh that was never
+	 * filled in would be undefined behavior.
+	 */
+	rv = geom_gettree(&mesh);
+	if (rv != 0) {
+		vdev_error("cannot get GEOM tree\n");
+		return (-1);
+	}
+
+	/*
+	 * Each LIST_FOREACH leaves 'pp' NULL when it runs to completion, so
+	 * 'pp' is non-NULL after the loops only if we jumped out on a match.
+	 */
+	pp = NULL;
+	LIST_FOREACH(mp, &mesh.lg_class, lg_class) {
+		LIST_FOREACH(gp, &mp->lg_geom, lg_geom) {
+			LIST_FOREACH(pp, &gp->lg_provider, lg_provider) {
+				if (strcmp(pp->lg_name, name) == 0)
+					goto out;
+			}
+		}
+	}
+out:
+	rv = -1;
+	if (pp == NULL)
+		vdev_error("no such provider %s\n", name);
+	else {
+		int acr, acw, ace;
+
+		VERIFY(sscanf(pp->lg_mode, "r%dw%de%d", &acr, &acw, &ace) == 3);
+		/* Readers are tolerated; writers or exclusive openers are not. */
+		if (acw == 0 && ace == 0)
+			rv = 0;
+		else
+			vdev_error("%s is in use (%s)\n", name, pp->lg_mode);
+	}
+	geom_deletetree(&mesh);
+	return (rv);
+}
+
+/*
+ * Return B_TRUE if 'name' can be opened read-only and answers the
+ * DIOCGMEDIASIZE ioctl, B_FALSE otherwise.  The reported media size itself
+ * is not used.
+ */
+static boolean_t
+is_provider(const char *name)
+{
+	off_t mediasize;
+	boolean_t answers;
+	int fd;
+
+	if ((fd = open(name, O_RDONLY)) == -1)
+		return (B_FALSE);
+
+	answers = (ioctl(fd, DIOCGMEDIASIZE, &mediasize) == -1) ?
+	    B_FALSE : B_TRUE;
+	close(fd);
+
+	return (answers);
+}
+/*
+ * Create a leaf vdev.  Determine if this is a GEOM provider.
+ * Valid forms for a leaf vdev are:
+ *
+ *	/dev/xxx	Complete path to a GEOM provider
+ *	xxx		Shorthand for /dev/xxx
+ *
+ * Returns a newly allocated nvlist describing the vdev, or NULL (after
+ * printing a message to stderr) if the resulting path does not fit in
+ * MAXPATHLEN or does not name a GEOM provider.
+ */
+nvlist_t *
+make_leaf_vdev(const char *arg)
+{
+	char path[MAXPATHLEN];
+	nvlist_t *vdev = NULL;
+	const char *type = NULL;
+	boolean_t toolong;
+
+	if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) {
+		toolong = (strlcpy(path, arg, sizeof (path)) >=
+		    sizeof (path)) ? B_TRUE : B_FALSE;
+	} else {
+		int n = snprintf(path, sizeof (path), "%s%s", _PATH_DEV, arg);
+
+		toolong = (n < 0 || (size_t)n >= sizeof (path)) ?
+		    B_TRUE : B_FALSE;
+	}
+
+	/*
+	 * A silently truncated path could name a different device than the
+	 * one the user asked for, so refuse it outright.
+	 */
+	if (toolong) {
+		(void) fprintf(stderr, gettext("cannot use '%s': path too "
+		    "long\n"), arg);
+		return (NULL);
+	}
+
+	if (is_provider(path))
+		type = VDEV_TYPE_DISK;
+	else {
+		(void) fprintf(stderr, gettext("cannot use '%s': must be a "
+		    "GEOM provider\n"), path);
+		return (NULL);
+	}
+
+	/*
+	 * Finally, we have the complete device or file, and we know that it is
+	 * acceptable to use.  Construct the nvlist to describe this vdev.  All
+	 * vdevs have a 'path' element, and devices also have a 'devid' element.
+	 */
+	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
+	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
+	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
+	if (strcmp(type, VDEV_TYPE_DISK) == 0)
+		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
+		    (uint64_t)B_FALSE) == 0);
+
+	return (vdev);
+}
+
+/*
+ * Go through and verify the replication level of the pool is consistent.
+ * Performs the following checks:
+ *
+ * For the new spec, verifies that devices in mirrors and raidz are the
+ * same size.
+ *
+ * If the current configuration already has inconsistent replication
+ * levels, ignore any other potential problems in the new spec.
+ *
+ * Otherwise, make sure that the current spec (if there is one) and the new
+ * spec have consistent replication levels.
+ */
+typedef struct replication_level {
+ /* ZPOOL_CONFIG_TYPE string of the toplevel vdev */
+ char *zprl_type;
+ /* number of children; 1 when the toplevel vdev is itself a leaf */
+ uint64_t zprl_children;
+ /* ZPOOL_CONFIG_NPARITY for raidz vdevs, 0 for everything else */
+ uint64_t zprl_parity;
+} replication_level_t;
+
+/*
+ * Given a list of toplevel vdevs, return the current replication level.  If
+ * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
+ * an error message will be displayed for each self-inconsistent vdev;
+ * otherwise the function returns NULL at the first inconsistency.
+ *
+ * A non-NULL result is allocated with safe_malloc() and describes the last
+ * toplevel vdev examined; the caller releases it with free().
+ */
+replication_level_t *
+get_replication(nvlist_t *nvroot, boolean_t fatal)
+{
+	nvlist_t **top;
+	uint_t t, toplevels;
+	nvlist_t **child;
+	uint_t c, children;
+	nvlist_t *nv;
+	char *type;
+	replication_level_t lastrep, rep, *ret;
+	boolean_t dontreport;
+
+	ret = safe_malloc(sizeof (replication_level_t));
+
+	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &top, &toplevels) == 0);
+
+	/*
+	 * Start from a defined value so that the final '*ret = rep' never
+	 * copies indeterminate data when there are no toplevel vdevs.
+	 */
+	rep.zprl_type = NULL;
+	rep.zprl_children = 0;
+	rep.zprl_parity = 0;
+
+	lastrep.zprl_type = NULL;
+	for (t = 0; t < toplevels; t++) {
+		nv = top[t];
+
+		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+
+		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+		    &child, &children) != 0) {
+			/*
+			 * This is a 'file' or 'disk' vdev.
+			 */
+			rep.zprl_type = type;
+			rep.zprl_children = 1;
+			rep.zprl_parity = 0;
+		} else {
+			uint64_t vdev_size;
+
+			/*
+			 * This is a mirror or RAID-Z vdev.  Go through and make
+			 * sure the contents are all the same (files vs. disks),
+			 * keeping track of the number of elements in the
+			 * process.
+			 *
+			 * We also check that the size of each vdev (if it can
+			 * be determined) is the same.
+			 */
+			rep.zprl_type = type;
+			rep.zprl_children = 0;
+
+			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
+				verify(nvlist_lookup_uint64(nv,
+				    ZPOOL_CONFIG_NPARITY,
+				    &rep.zprl_parity) == 0);
+				assert(rep.zprl_parity != 0);
+			} else {
+				rep.zprl_parity = 0;
+			}
+
+			/*
+			 * The 'dontreport' variable indicates that we've
+			 * already reported an error for this spec, so don't
+			 * bother doing it again.
+			 */
+			type = NULL;
+			dontreport = B_FALSE;
+			vdev_size = -1ULL;
+			for (c = 0; c < children; c++) {
+				nvlist_t *cnv = child[c];
+				char *path;
+				struct stat64 statbuf;
+				uint64_t size = -1ULL;
+				char *childtype;
+				int fd, err;
+
+				rep.zprl_children++;
+
+				verify(nvlist_lookup_string(cnv,
+				    ZPOOL_CONFIG_TYPE, &childtype) == 0);
+
+				/*
+				 * If this is a replacing or spare vdev, then
+				 * get the real first child of the vdev.
+				 */
+				if (strcmp(childtype,
+				    VDEV_TYPE_REPLACING) == 0 ||
+				    strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
+					nvlist_t **rchild;
+					uint_t rchildren;
+
+					verify(nvlist_lookup_nvlist_array(cnv,
+					    ZPOOL_CONFIG_CHILDREN, &rchild,
+					    &rchildren) == 0);
+					assert(rchildren == 2);
+					cnv = rchild[0];
+
+					verify(nvlist_lookup_string(cnv,
+					    ZPOOL_CONFIG_TYPE,
+					    &childtype) == 0);
+				}
+
+				verify(nvlist_lookup_string(cnv,
+				    ZPOOL_CONFIG_PATH, &path) == 0);
+
+				/*
+				 * If we have a raidz/mirror that combines disks
+				 * with files, report it as an error.
+				 */
+				if (!dontreport && type != NULL &&
+				    strcmp(type, childtype) != 0) {
+					free(ret);
+					ret = NULL;
+					if (fatal)
+						vdev_error(gettext(
+						    "mismatched replication "
+						    "level: %s contains both "
+						    "files and devices\n"),
+						    rep.zprl_type);
+					else
+						return (NULL);
+					dontreport = B_TRUE;
+				}
+
+				/*
+				 * According to stat(2), the value of 'st_size'
+				 * is undefined for block devices and character
+				 * devices.  But there is no effective way to
+				 * determine the real size in userland.
+				 *
+				 * Instead, we'll take advantage of an
+				 * implementation detail of spec_size().  If the
+				 * device is currently open, then we (should)
+				 * return a valid size.
+				 *
+				 * If we still don't get a valid size (indicated
+				 * by a size of 0 or MAXOFFSET_T), then ignore
+				 * this device altogether.
+				 */
+				if ((fd = open(path, O_RDONLY)) >= 0) {
+					err = fstat64(fd, &statbuf);
+					(void) close(fd);
+				} else {
+					err = stat64(path, &statbuf);
+				}
+
+				if (err != 0 || statbuf.st_size == 0)
+					continue;
+
+				size = statbuf.st_size;
+
+				/*
+				 * Also check the size of each device.  If they
+				 * differ, then report an error.
+				 */
+				if (!dontreport && vdev_size != -1ULL &&
+				    size != vdev_size) {
+					free(ret);
+					ret = NULL;
+					if (fatal)
+						vdev_error(gettext(
+						    "%s contains devices of "
+						    "different sizes\n"),
+						    rep.zprl_type);
+					else
+						return (NULL);
+					dontreport = B_TRUE;
+				}
+
+				type = childtype;
+				vdev_size = size;
+			}
+		}
+
+		/*
+		 * At this point, we have the replication of the last toplevel
+		 * vdev in 'rep'.  Compare it to 'lastrep' to see if it's
+		 * different.
+		 */
+		if (lastrep.zprl_type != NULL) {
+			if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) {
+				free(ret);
+				ret = NULL;
+				if (fatal)
+					vdev_error(gettext(
+					    "mismatched replication level: "
+					    "both %s and %s vdevs are "
+					    "present\n"),
+					    lastrep.zprl_type, rep.zprl_type);
+				else
+					return (NULL);
+			} else if (lastrep.zprl_parity != rep.zprl_parity) {
+				free(ret);
+				ret = NULL;
+				/*
+				 * uint64_t is not guaranteed to be
+				 * 'unsigned long long', so cast the values
+				 * to match the %llu conversions.
+				 */
+				if (fatal)
+					vdev_error(gettext(
+					    "mismatched replication level: "
+					    "both %llu and %llu device parity "
+					    "%s vdevs are present\n"),
+					    (unsigned long long)
+					    lastrep.zprl_parity,
+					    (unsigned long long)
+					    rep.zprl_parity,
+					    rep.zprl_type);
+				else
+					return (NULL);
+			} else if (lastrep.zprl_children != rep.zprl_children) {
+				free(ret);
+				ret = NULL;
+				if (fatal)
+					vdev_error(gettext(
+					    "mismatched replication level: "
+					    "both %llu-way and %llu-way %s "
+					    "vdevs are present\n"),
+					    (unsigned long long)
+					    lastrep.zprl_children,
+					    (unsigned long long)
+					    rep.zprl_children,
+					    rep.zprl_type);
+				else
+					return (NULL);
+			}
+		}
+		lastrep = rep;
+	}
+
+	if (ret != NULL)
+		*ret = rep;
+
+	return (ret);
+}
+
+/*
+ * Check the replication level of the vdev spec against the current pool.  Calls
+ * get_replication() to make sure the new spec is self-consistent.  If the
+ * current pool's replication level is already inconsistent, we ignore any
+ * errors.  Otherwise, report any difference between the two.
+ *
+ * 'config' is the existing pool configuration, or NULL for a new pool.
+ * Returns 0 if the new spec is acceptable and -1 (after reporting through
+ * vdev_error()) otherwise.
+ */
+int
+check_replication(nvlist_t *config, nvlist_t *newroot)
+{
+	replication_level_t *current = NULL, *new;
+	nvlist_t **child;
+	uint_t children;
+	int ret;
+
+	/*
+	 * If we have a current pool configuration, check to see if it's
+	 * self-consistent.  If not, simply return success.
+	 */
+	if (config != NULL) {
+		nvlist_t *nvroot;
+
+		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+		    &nvroot) == 0);
+		if ((current = get_replication(nvroot, B_FALSE)) == NULL)
+			return (0);
+	}
+
+	/*
+	 * A spec made up solely of hot spares has no toplevel vdevs and
+	 * therefore no replication level to compare.  Bail out here, before
+	 * get_replication() could hand back a level that was never filled in.
+	 */
+	verify(nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0);
+	if (children == 0) {
+		free(current);
+		return (0);
+	}
+
+	/*
+	 * Get the replication level of the new vdev spec, reporting any
+	 * inconsistencies found.
+	 */
+	if ((new = get_replication(newroot, B_TRUE)) == NULL) {
+		free(current);
+		return (-1);
+	}
+
+	/*
+	 * Check to see if the new vdev spec matches the replication level of
+	 * the current pool.  The uint64_t fields are cast to match %llu.
+	 */
+	ret = 0;
+	if (current != NULL) {
+		if (strcmp(current->zprl_type, new->zprl_type) != 0) {
+			vdev_error(gettext(
+			    "mismatched replication level: pool uses %s "
+			    "and new vdev is %s\n"),
+			    current->zprl_type, new->zprl_type);
+			ret = -1;
+		} else if (current->zprl_parity != new->zprl_parity) {
+			vdev_error(gettext(
+			    "mismatched replication level: pool uses %llu "
+			    "device parity and new vdev uses %llu\n"),
+			    (unsigned long long)current->zprl_parity,
+			    (unsigned long long)new->zprl_parity);
+			ret = -1;
+		} else if (current->zprl_children != new->zprl_children) {
+			vdev_error(gettext(
+			    "mismatched replication level: pool uses %llu-way "
+			    "%s and new vdev uses %llu-way %s\n"),
+			    (unsigned long long)current->zprl_children,
+			    current->zprl_type,
+			    (unsigned long long)new->zprl_children,
+			    new->zprl_type);
+			ret = -1;
+		}
+	}
+
+	free(new);
+	free(current);
+
+	return (ret);
+}
+
+/*
+ * Determine if the given path is a hot spare within the given configuration.
+ *
+ * The device at 'path' must be reported in use with state POOL_STATE_SPARE
+ * and carry a readable label; its label GUID is then compared against every
+ * entry of the ZPOOL_CONFIG_SPARES array in the vdev tree of 'config'.
+ */
+static boolean_t
+is_spare(nvlist_t *config, const char *path)
+{
+	pool_state_t state;
+	boolean_t inuse, labeled;
+	char *poolname = NULL;
+	nvlist_t *label, *nvroot, **spares;
+	uint64_t guid, spareguid;
+	uint_t idx, nspares;
+	int fd;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return (B_FALSE);
+
+	/* The label is read only if the device really is an in-use spare. */
+	labeled = (zpool_in_use(g_zfs, fd, &state, &poolname, &inuse) == 0 &&
+	    inuse &&
+	    state == POOL_STATE_SPARE &&
+	    zpool_read_label(fd, &label) == 0) ? B_TRUE : B_FALSE;
+
+	free(poolname);
+	(void) close(fd);
+
+	if (!labeled)
+		return (B_FALSE);
+
+	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
+	nvlist_free(label);
+
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+
+	/* No spares array in the pool means the device cannot be one. */
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+	    &spares, &nspares) != 0)
+		return (B_FALSE);
+
+	idx = 0;
+	while (idx < nspares) {
+		verify(nvlist_lookup_uint64(spares[idx],
+		    ZPOOL_CONFIG_GUID, &spareguid) == 0);
+		if (spareguid == guid)
+			return (B_TRUE);
+		idx++;
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Go through and find any devices that are in use.  Leaf disk vdevs are
+ * handed to check_provider(); interior vdevs are walked recursively, first
+ * through their children and then through their spares.
+ *
+ * Returns 0 if nothing is in use, or the first non-zero result reported for
+ * a leaf.
+ */
+int
+check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
+    int isspare)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	char *type, *path;
+	int ret = 0;	/* leaves that are not disks are not checked */
+
+	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0) {
+
+		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
+
+		/*
+		 * As a generic check, we look to see if this is a replace of a
+		 * hot spare within the same pool.  If so, we allow it
+		 * regardless of what check_provider() or zpool_in_use() says.
+		 */
+		if (isreplacing && is_spare(config, path))
+			return (0);
+
+		if (strcmp(type, VDEV_TYPE_DISK) == 0)
+			ret = check_provider(path, force, isspare);
+
+		return (ret);
+	}
+
+	for (c = 0; c < children; c++)
+		if ((ret = check_in_use(config, child[c], force,
+		    isreplacing, B_FALSE)) != 0)
+			return (ret);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+	    &child, &children) == 0)
+		for (c = 0; c < children; c++)
+			if ((ret = check_in_use(config, child[c], force,
+			    isreplacing, B_TRUE)) != 0)
+				return (ret);
+
+	return (0);
+}
+
+/*
+ * Map a command-line keyword to the vdev type of the group it introduces.
+ * Returns NULL if 'type' is not a grouping keyword.  On a match, and if
+ * 'mindev' is non-NULL, the minimum number of member devices for that
+ * grouping is stored through it.
+ */
+const char *
+is_grouping(const char *type, int *mindev)
+{
+	static const struct {
+		const char *keyword;
+		int min_devices;
+		const char *vdev_type;
+	} groupings[] = {
+		{ "raidz",	2,	VDEV_TYPE_RAIDZ },
+		{ "raidz1",	2,	VDEV_TYPE_RAIDZ },
+		{ "raidz2",	3,	VDEV_TYPE_RAIDZ },
+		{ "mirror",	2,	VDEV_TYPE_MIRROR },
+		{ "spare",	1,	VDEV_TYPE_SPARE },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof (groupings) / sizeof (groupings[0]); i++) {
+		if (strcmp(type, groupings[i].keyword) != 0)
+			continue;
+		if (mindev != NULL)
+			*mindev = groupings[i].min_devices;
+		return (groupings[i].vdev_type);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Construct a syntactically valid vdev specification,
+ * and ensure that all devices and files exist and can be opened.
+ * Note: we don't bother freeing anything in the error paths
+ * because the program is just going to exit anyway.
+ *
+ * 'argv' is consumed left to right: a grouping keyword (see is_grouping())
+ * collects the following arguments as its leaves until the next grouping
+ * keyword; any other argument is a toplevel leaf vdev.
+ *
+ * Returns a root nvlist of type VDEV_TYPE_ROOT carrying the toplevel vdevs
+ * under ZPOOL_CONFIG_CHILDREN and, if any were given, the hot spares under
+ * ZPOOL_CONFIG_SPARES.  Returns NULL after printing a message to stderr if
+ * the specification is invalid.
+ */
+nvlist_t *
+construct_spec(int argc, char **argv)
+{
+ nvlist_t *nvroot, *nv, **top, **spares;
+ int t, toplevels, mindev, nspares;
+ const char *type;
+
+ top = NULL;
+ toplevels = 0;
+ spares = NULL;
+ nspares = 0;
+
+ while (argc > 0) {
+ nv = NULL;
+
+ /*
+ * If it's a mirror or raidz, the subsequent arguments are
+ * its leaves -- until we encounter the next mirror or raidz.
+ */
+ if ((type = is_grouping(argv[0], &mindev)) != NULL) {
+ nvlist_t **child = NULL;
+ int c, children = 0;
+
+ if (strcmp(type, VDEV_TYPE_SPARE) == 0 &&
+ spares != NULL) {
+ (void) fprintf(stderr, gettext("invalid vdev "
+ "specification: 'spare' can be "
+ "specified only once\n"));
+ return (NULL);
+ }
+
+ /*
+ * Gather leaves starting after the keyword; on exit 'c' is
+ * the number of arguments consumed, keyword included.
+ */
+ for (c = 1; c < argc; c++) {
+ if (is_grouping(argv[c], NULL) != NULL)
+ break;
+ children++;
+ child = realloc(child,
+ children * sizeof (nvlist_t *));
+ if (child == NULL)
+ zpool_no_memory();
+ if ((nv = make_leaf_vdev(argv[c])) == NULL)
+ return (NULL);
+ child[children - 1] = nv;
+ }
+
+ if (children < mindev) {
+ (void) fprintf(stderr, gettext("invalid vdev "
+ "specification: %s requires at least %d "
+ "devices\n"), argv[0], mindev);
+ return (NULL);
+ }
+
+ argc -= c;
+ argv += c;
+
+ if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
+ /*
+ * Spares are kept aside and attached to the root below;
+ * they do not count as a toplevel vdev.
+ */
+ spares = child;
+ nspares = children;
+ continue;
+ } else {
+ verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
+ 0) == 0);
+ verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
+ type) == 0);
+ if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
+ /*
+ * is_grouping() reports mindev 2 for raidz/raidz1
+ * and 3 for raidz2, so mindev - 1 is the parity.
+ */
+ verify(nvlist_add_uint64(nv,
+ ZPOOL_CONFIG_NPARITY,
+ mindev - 1) == 0);
+ }
+ verify(nvlist_add_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, child,
+ children) == 0);
+
+ /*
+ * The leaves are released right after being added, so
+ * nvlist_add_nvlist_array() presumably stores copies.
+ */
+ for (c = 0; c < children; c++)
+ nvlist_free(child[c]);
+ free(child);
+ }
+ } else {
+ /*
+ * We have a device. Pass off to make_leaf_vdev() to
+ * construct the appropriate nvlist describing the vdev.
+ */
+ if ((nv = make_leaf_vdev(argv[0])) == NULL)
+ return (NULL);
+ argc--;
+ argv++;
+ }
+
+ /* Append the group or leaf just built to the toplevel list. */
+ toplevels++;
+ top = realloc(top, toplevels * sizeof (nvlist_t *));
+ if (top == NULL)
+ zpool_no_memory();
+ top[toplevels - 1] = nv;
+ }
+
+ /* A spec consisting solely of spares is accepted. */
+ if (toplevels == 0 && nspares == 0) {
+ (void) fprintf(stderr, gettext("invalid vdev "
+ "specification: at least one toplevel vdev must be "
+ "specified\n"));
+ return (NULL);
+ }
+
+ /*
+ * Finally, create nvroot and add all top-level vdevs to it.
+ */
+ verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
+ verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) == 0);
+ verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ top, toplevels) == 0);
+ if (nspares != 0)
+ verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ spares, nspares) == 0);
+
+ /* Release the per-vdev nvlists and the arrays that held them. */
+ for (t = 0; t < toplevels; t++)
+ nvlist_free(top[t]);
+ for (t = 0; t < nspares; t++)
+ nvlist_free(spares[t]);
+ if (spares)
+ free(spares);
+ free(top);
+
+ return (nvroot);
+}
+
+/*
+ * Get and validate the contents of the given vdev specification.  This ensures
+ * that the nvlist returned is well-formed, that all the devices exist, and that
+ * they are not currently in use by any other known consumer.  The 'poolconfig'
+ * parameter is the current configuration of the pool when adding devices to an
+ * existing pool, and is used to perform additional checks, such as changing the
+ * replication level of the pool.  It can be 'NULL' to indicate that this is a
+ * new pool.  The 'force' flag controls whether devices should be forcefully
+ * added, even if they appear in use.
+ *
+ * Returns the new root nvlist, or NULL if any of the passes fails.
+ */
+nvlist_t *
+make_root_vdev(nvlist_t *poolconfig, int force, int check_rep,
+    boolean_t isreplacing, int argc, char **argv)
+{
+	nvlist_t *newroot;
+
+	/* vdev_error() picks its banner based on this. */
+	is_force = force;
+
+	/*
+	 * Pass 1: build the specification.  Success means the syntax is valid
+	 * and every device could be opened.
+	 */
+	newroot = construct_spec(argc, argv);
+	if (newroot == NULL)
+		return (NULL);
+
+	/*
+	 * Pass 2: make sure no device is shared with another subsystem.  This
+	 * runs even with 'force' set, because some uses (such as a dedicated
+	 * dump device) cannot be overridden even by '-f'.
+	 *
+	 * Pass 3 (only when 'check_rep' is set): compare the replication level
+	 * of the new vdevs against the existing pool spec, if any, and report
+	 * any errors found.
+	 */
+	if (check_in_use(poolconfig, newroot, force, isreplacing,
+	    B_FALSE) != 0 ||
+	    (check_rep && check_replication(poolconfig, newroot) != 0)) {
+		nvlist_free(newroot);
+		newroot = NULL;
+	}
+
+	return (newroot);
+}
diff --git a/contrib/opensolaris/cmd/ztest/ztest.c b/contrib/opensolaris/cmd/ztest/ztest.c
new file mode 100644
index 0000000..45f5f8c
--- /dev/null
+++ b/contrib/opensolaris/cmd/ztest/ztest.c
@@ -0,0 +1,3477 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * The objective of this program is to provide a DMU/ZAP/SPA stress test
+ * that runs entirely in userland, is easy to use, and easy to extend.
+ *
+ * The overall design of the ztest program is as follows:
+ *
+ * (1) For each major functional area (e.g. adding vdevs to a pool,
+ * creating and destroying datasets, reading and writing objects, etc)
+ * we have a simple routine to test that functionality. These
+ * individual routines do not have to do anything "stressful".
+ *
+ * (2) We turn these simple functionality tests into a stress test by
+ * running them all in parallel, with as many threads as desired,
+ * and spread across as many datasets, objects, and vdevs as desired.
+ *
+ * (3) While all this is happening, we inject faults into the pool to
+ * verify that self-healing data really works.
+ *
+ * (4) Every time we open a dataset, we change its checksum and compression
+ * functions. Thus even individual objects vary from block to block
+ * in which checksum they use and whether they're compressed.
+ *
+ * (5) To verify that we never lose on-disk consistency after a crash,
+ * we run the entire test in a child of the main process.
+ * At random times, the child self-immolates with a SIGKILL.
+ * This is the software equivalent of pulling the power cord.
+ * The parent then runs the test again, using the existing
+ * storage pool, as many times as desired.
+ *
+ * (6) To verify that we don't have future leaks or temporal incursions,
+ * many of the functional tests record the transaction group number
+ * as part of their data. When reading old data, they verify that
+ * the transaction group number is less than the current, open txg.
+ * If you add a new test, please do this if applicable.
+ *
+ * When run with no arguments, ztest runs for about five minutes and
+ * produces no output if successful. To get a little bit of information,
+ * specify -V. To get more information, specify -VV, and so on.
+ *
+ * To turn this into an overnight stress test, use -T to specify run time.
+ *
+ * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
+ * to increase the pool capacity, fanout, and overall stress level.
+ *
+ * The -N(okill) option will suppress kills, so each child runs to completion.
+ * This can be useful when you're trying to distinguish temporal incursions
+ * from plain old race conditions.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/txg.h>
+#include <sys/zap.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_objset.h>
+#include <sys/poll.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/spa_impl.h>
+#include <sys/dsl_prop.h>
+#include <sys/refcount.h>
+#include <stdio.h>
+#include <stdio_ext.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <umem.h>
+#include <dlfcn.h>
+#include <ctype.h>
+#include <math.h>
+#include <errno.h>
+#include <sys/fs/zfs.h>
+
+static char cmdname[] = "ztest";
+static char *zopt_pool = cmdname;
+static char *progname;
+
+static uint64_t zopt_vdevs = 5;
+static uint64_t zopt_vdevtime;
+static int zopt_ashift = SPA_MINBLOCKSHIFT;
+static int zopt_mirrors = 2;
+static int zopt_raidz = 4;
+static int zopt_raidz_parity = 1;
+static size_t zopt_vdev_size = SPA_MINDEVSIZE;
+static int zopt_datasets = 7;
+static int zopt_threads = 23;
+static uint64_t zopt_passtime = 60; /* 60 seconds */
+static uint64_t zopt_killrate = 70; /* 70% kill rate */
+static int zopt_verbose = 0;
+static int zopt_init = 1;
+static char *zopt_dir = "/tmp";
+static uint64_t zopt_time = 300; /* 5 minutes */
+static int zopt_maxfaults;
+
+typedef struct ztest_args {
+ char *za_pool;
+ objset_t *za_os;
+ zilog_t *za_zilog;
+ thread_t za_thread;
+ uint64_t za_instance;
+ uint64_t za_random;
+ uint64_t za_diroff;
+ uint64_t za_diroff_shared;
+ uint64_t za_zil_seq;
+ hrtime_t za_start;
+ hrtime_t za_stop;
+ hrtime_t za_kill;
+ traverse_handle_t *za_th;
+} ztest_args_t;
+
+typedef void ztest_func_t(ztest_args_t *);
+
+/*
+ * Note: these aren't static because we want dladdr() to work.
+ */
+ztest_func_t ztest_dmu_read_write;
+ztest_func_t ztest_dmu_write_parallel;
+ztest_func_t ztest_dmu_object_alloc_free;
+ztest_func_t ztest_zap;
+ztest_func_t ztest_zap_parallel;
+ztest_func_t ztest_traverse;
+ztest_func_t ztest_dsl_prop_get_set;
+ztest_func_t ztest_dmu_objset_create_destroy;
+ztest_func_t ztest_dmu_snapshot_create_destroy;
+ztest_func_t ztest_spa_create_destroy;
+ztest_func_t ztest_fault_inject;
+ztest_func_t ztest_vdev_attach_detach;
+ztest_func_t ztest_vdev_LUN_growth;
+ztest_func_t ztest_vdev_add_remove;
+ztest_func_t ztest_scrub;
+ztest_func_t ztest_spa_rename;
+
+typedef struct ztest_info {
+ ztest_func_t *zi_func; /* test function */
+ uint64_t *zi_interval; /* execute every <interval> seconds */
+ uint64_t zi_calls; /* per-pass count */
+ uint64_t zi_call_time; /* per-pass time */
+ uint64_t zi_call_total; /* cumulative total */
+ uint64_t zi_call_target; /* target cumulative total */
+} ztest_info_t;
+
+uint64_t zopt_always = 0; /* all the time */
+uint64_t zopt_often = 1; /* every second */
+uint64_t zopt_sometimes = 10; /* every 10 seconds */
+uint64_t zopt_rarely = 60; /* every 60 seconds */
+
+ztest_info_t ztest_info[] = {
+ { ztest_dmu_read_write, &zopt_always },
+ { ztest_dmu_write_parallel, &zopt_always },
+ { ztest_dmu_object_alloc_free, &zopt_always },
+ { ztest_zap, &zopt_always },
+ { ztest_zap_parallel, &zopt_always },
+ { ztest_traverse, &zopt_often },
+ { ztest_dsl_prop_get_set, &zopt_sometimes },
+ { ztest_dmu_objset_create_destroy, &zopt_sometimes },
+ { ztest_dmu_snapshot_create_destroy, &zopt_rarely },
+ { ztest_spa_create_destroy, &zopt_sometimes },
+ { ztest_fault_inject, &zopt_sometimes },
+ { ztest_spa_rename, &zopt_rarely },
+ { ztest_vdev_attach_detach, &zopt_rarely },
+ { ztest_vdev_LUN_growth, &zopt_rarely },
+ { ztest_vdev_add_remove, &zopt_vdevtime },
+ { ztest_scrub, &zopt_vdevtime },
+};
+
+#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
+
+#define ZTEST_SYNC_LOCKS 16
+
+/*
+ * Stuff we need to share writably between parent and child.
+ */
+typedef struct ztest_shared {
+ mutex_t zs_vdev_lock;
+ rwlock_t zs_name_lock;
+ uint64_t zs_vdev_primaries;
+ uint64_t zs_enospc_count;
+ hrtime_t zs_start_time;
+ hrtime_t zs_stop_time;
+ uint64_t zs_alloc;
+ uint64_t zs_space;
+ uint64_t zs_txg;
+ ztest_info_t zs_info[ZTEST_FUNCS];
+ mutex_t zs_sync_lock[ZTEST_SYNC_LOCKS];
+ uint64_t zs_seq[ZTEST_SYNC_LOCKS];
+} ztest_shared_t;
+
+typedef struct ztest_block_tag {
+ uint64_t bt_objset;
+ uint64_t bt_object;
+ uint64_t bt_offset;
+ uint64_t bt_txg;
+ uint64_t bt_thread;
+ uint64_t bt_seq;
+} ztest_block_tag_t;
+
+static char ztest_dev_template[] = "%s/%s.%llua";
+static ztest_shared_t *ztest_shared;
+
+static int ztest_random_fd;
+static int ztest_dump_core = 1;
+
+extern uint64_t zio_gang_bang;
+extern uint16_t zio_zil_fail_shift;
+
+#define ZTEST_DIROBJ 1
+#define ZTEST_MICROZAP_OBJ 2
+#define ZTEST_FATZAP_OBJ 3
+
+#define ZTEST_DIROBJ_BLOCKSIZE (1 << 10)
+#define ZTEST_DIRSIZE 256
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.  Each returns the string that would otherwise be
+ * supplied through the environment variable named in its trailing comment.
+ */
+const char *
+_umem_debug_init(void)
+{
+	return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+	return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+
+#define FATAL_MSG_SZ 1024
+
+char *fatal_msg;
+
+/*
+ * Print "ztest: <formatted message>" to stderr and terminate.  If 'do_perror'
+ * is non-zero, ": <strerror(errno)>" is appended, using the errno value seen
+ * on entry.  The process calls abort() when ztest_dump_core is set and
+ * exit(3) otherwise, so this function never returns.
+ *
+ * The message is truncated to FATAL_MSG_SZ bytes rather than overflowing the
+ * buffer.
+ */
+static void
+fatal(int do_perror, char *message, ...)
+{
+	va_list args;
+	int save_errno = errno;
+	char buf[FATAL_MSG_SZ];
+	size_t len;
+
+	(void) fflush(stdout);
+
+	(void) snprintf(buf, sizeof (buf), "ztest: ");
+	len = strlen(buf);
+
+	va_start(args, message);
+	/* LINTED */
+	(void) vsnprintf(buf + len, sizeof (buf) - len, message, args);
+	va_end(args);
+
+	if (do_perror) {
+		len = strlen(buf);
+		(void) snprintf(buf + len, sizeof (buf) - len,
+		    ": %s", strerror(save_errno));
+	}
+	(void) fprintf(stderr, "%s\n", buf);
+	/*
+	 * 'buf' is on the stack, but we never return from here, so the
+	 * pointer is still meaningful when inspecting a core dump.
+	 */
+	fatal_msg = buf;		/* to ease debugging */
+	if (ztest_dump_core)
+		abort();
+	exit(3);
+}
+
+/*
+ * Convert a size suffix into a bit shift: "" -> 0, "B" -> 0, "K" -> 10,
+ * "M" -> 20, and so on through "BKMGTPEZ".  The letter may be followed by a
+ * single 'B' (e.g. "KB") and is matched case-insensitively.  Anything else
+ * is reported through fatal(), which does not return.
+ */
+static int
+str2shift(const char *buf)
+{
+	const char *ends = "BKMGTPEZ";
+	size_t nends = strlen(ends);
+	size_t i;
+
+	if (buf[0] == '\0')
+		return (0);
+
+	/*
+	 * toupper() is only defined for values representable as unsigned
+	 * char (or EOF), so cast before calling it on a plain char.
+	 */
+	for (i = 0; i < nends; i++) {
+		if (toupper((unsigned char)buf[0]) == ends[i])
+			break;
+	}
+	if (i == nends)
+		fatal(0, "invalid bytes suffix: %s", buf);
+	if (buf[1] == '\0' ||
+	    (toupper((unsigned char)buf[1]) == 'B' && buf[2] == '\0')) {
+		return (10 * (int)i);
+	}
+	fatal(0, "invalid bytes suffix: %s", buf);
+	return (-1);
+}
+
+/*
+ * Parse a number with an optional size suffix (see str2shift()), such as
+ * "512", "64k" or "1.5G", and return the value in bytes.  Malformed input or
+ * a result that does not fit in 64 bits is reported through fatal().
+ */
+static uint64_t
+nicenumtoull(const char *buf)
+{
+	char *end;
+	uint64_t val;
+
+	val = strtoull(buf, &end, 0);
+	if (end == buf) {
+		fatal(0, "bad numeric value: %s", buf);
+	} else if (end[0] == '.') {
+		/* Fractional input: redo the parse in floating point. */
+		double fval = strtod(buf, &end);
+		fval *= pow(2, str2shift(end));
+		/*
+		 * UINT64_MAX rounds up to 2^64 as a double, and 2^64 itself
+		 * cannot be converted to uint64_t, so reject with '>='.
+		 */
+		if (fval >= (double)UINT64_MAX)
+			fatal(0, "value too large: %s", buf);
+		val = (uint64_t)fval;
+	} else {
+		int shift = str2shift(end);
+		/* Shifting back must reproduce the value: no bits lost. */
+		if (shift >= 64 || (val << shift) >> shift != val)
+			fatal(0, "value too large: %s", buf);
+		val <<= shift;
+	}
+	return (val);
+}
+
+/*
+ * Print the option summary, with the current default of each option, to
+ * stdout and exit with status 1.  Each argument below is tagged with the
+ * option letter whose default it supplies, in the same order as the
+ * conversions in the format string.
+ */
+static void
+usage(void)
+{
+ char nice_vdev_size[10];
+ char nice_gang_bang[10];
+
+ /* Render the two size-valued defaults through nicenum() for display. */
+ nicenum(zopt_vdev_size, nice_vdev_size);
+ nicenum(zio_gang_bang, nice_gang_bang);
+
+ (void) printf("Usage: %s\n"
+ "\t[-v vdevs (default: %llu)]\n"
+ "\t[-s size_of_each_vdev (default: %s)]\n"
+ "\t[-a alignment_shift (default: %d) (use 0 for random)]\n"
+ "\t[-m mirror_copies (default: %d)]\n"
+ "\t[-r raidz_disks (default: %d)]\n"
+ "\t[-R raidz_parity (default: %d)]\n"
+ "\t[-d datasets (default: %d)]\n"
+ "\t[-t threads (default: %d)]\n"
+ "\t[-g gang_block_threshold (default: %s)]\n"
+ "\t[-i initialize pool i times (default: %d)]\n"
+ "\t[-k kill percentage (default: %llu%%)]\n"
+ "\t[-p pool_name (default: %s)]\n"
+ "\t[-f file directory for vdev files (default: %s)]\n"
+ "\t[-V(erbose)] (use multiple times for ever more blather)\n"
+ "\t[-E(xisting)] (use existing pool instead of creating new one)\n"
+ "\t[-T time] total run time (default: %llu sec)\n"
+ "\t[-P passtime] time per pass (default: %llu sec)\n"
+ "\t[-z zil failure rate (default: fail every 2^%llu allocs)]\n"
+ "",
+ cmdname,
+ (u_longlong_t)zopt_vdevs, /* -v */
+ nice_vdev_size, /* -s */
+ zopt_ashift, /* -a */
+ zopt_mirrors, /* -m */
+ zopt_raidz, /* -r */
+ zopt_raidz_parity, /* -R */
+ zopt_datasets, /* -d */
+ zopt_threads, /* -t */
+ nice_gang_bang, /* -g */
+ zopt_init, /* -i */
+ (u_longlong_t)zopt_killrate, /* -k */
+ zopt_pool, /* -p */
+ zopt_dir, /* -f */
+ (u_longlong_t)zopt_time, /* -T */
+ (u_longlong_t)zopt_passtime, /* -P */
+ (u_longlong_t)zio_zil_fail_shift); /* -z */
+ exit(1);
+}
+
+/*
+ * Produce a value in [0, range) from 64 bits read off ztest_random_fd.
+ * A range of zero returns 0 without touching the descriptor; a read that
+ * does not deliver all eight bytes is fatal.
+ */
+static uint64_t
+ztest_random(uint64_t range)
+{
+	uint64_t bits;
+	uint64_t result = 0;
+
+	if (range != 0) {
+		if (read(ztest_random_fd, &bits, sizeof (bits)) !=
+		    sizeof (bits))
+			fatal(1, "short read from /dev/urandom");
+		result = bits % range;
+	}
+
+	return (result);
+}
+
+/*
+ * Note that an operation ran out of space: emit a debug message naming
+ * the operation ("<unknown>" when s is NULL) and bump the shared ENOSPC
+ * counter.
+ */
+static void
+ztest_record_enospc(char *s)
+{
+	char *what = s;
+
+	if (what == NULL)
+		what = "<unknown>";
+
+	dprintf("ENOSPC doing: %s\n", what);
+	ztest_shared->zs_enospc_count++;
+}
+
+/*
+ * Parse the command line into the zopt_* / zio_* globals.
+ *
+ * Each option is handled in two steps: the first switch converts the
+ * argument of every numeric option with nicenumtoull(), and the second
+ * switch stores the (possibly clamped) result.  An unrecognized option
+ * calls usage(), which exits.  After the loop a few derived settings are
+ * computed from the final option values.
+ */
+static void
+process_options(int argc, char **argv)
+{
+ int opt;
+ uint64_t value;
+
+ /* Remember program name. */
+ progname = argv[0];
+
+ /* By default, test gang blocks for blocks 32K and greater */
+ zio_gang_bang = 32 << 10;
+
+ /* Default value, fail every 32nd allocation */
+ zio_zil_fail_shift = 5;
+
+ while ((opt = getopt(argc, argv,
+ "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:z:")) != EOF) {
+ value = 0;
+ /* Step 1: options taking a number share one conversion. */
+ switch (opt) {
+ case 'v':
+ case 's':
+ case 'a':
+ case 'm':
+ case 'r':
+ case 'R':
+ case 'd':
+ case 't':
+ case 'g':
+ case 'i':
+ case 'k':
+ case 'T':
+ case 'P':
+ case 'z':
+ value = nicenumtoull(optarg);
+ }
+ /* Step 2: store the value, clamping where a bound applies. */
+ switch (opt) {
+ case 'v':
+ zopt_vdevs = value;
+ break;
+ case 's':
+ zopt_vdev_size = MAX(SPA_MINDEVSIZE, value);
+ break;
+ case 'a':
+ zopt_ashift = value;
+ break;
+ case 'm':
+ zopt_mirrors = value;
+ break;
+ case 'r':
+ zopt_raidz = MAX(1, value);
+ break;
+ case 'R':
+ /* parity is restricted to 1 or 2 */
+ zopt_raidz_parity = MIN(MAX(value, 1), 2);
+ break;
+ case 'd':
+ zopt_datasets = MAX(1, value);
+ break;
+ case 't':
+ zopt_threads = MAX(1, value);
+ break;
+ case 'g':
+ zio_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value);
+ break;
+ case 'i':
+ zopt_init = value;
+ break;
+ case 'k':
+ zopt_killrate = value;
+ break;
+ case 'p':
+ zopt_pool = strdup(optarg);
+ break;
+ case 'f':
+ zopt_dir = strdup(optarg);
+ break;
+ case 'V':
+ zopt_verbose++;
+ break;
+ case 'E':
+ /* use the existing pool: skip initialization */
+ zopt_init = 0;
+ break;
+ case 'T':
+ zopt_time = value;
+ break;
+ case 'P':
+ zopt_passtime = MAX(1, value);
+ break;
+ case 'z':
+ zio_zil_fail_shift = MIN(value, 16);
+ break;
+ case '?':
+ default:
+ usage();
+ break;
+ }
+ }
+
+ /* Parity can never reach the number of raidz disks. */
+ zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1);
+
+ /* Split the total run time evenly across the vdevs (guarding against 0). */
+ zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time / zopt_vdevs : UINT64_MAX);
+ zopt_maxfaults = MAX(zopt_mirrors, 1) * (zopt_raidz_parity + 1) - 1;
+}
+
+/*
+ * Choose the alignment shift for a new vdev: the configured zopt_ashift
+ * when it is nonzero, otherwise SPA_MINBLOCKSHIFT plus a random 0..2.
+ */
+static uint64_t
+ztest_get_ashift(void)
+{
+	if (zopt_ashift != 0)
+		return (zopt_ashift);
+
+	return (SPA_MINBLOCKSHIFT + ztest_random(3));
+}
+
+/*
+ * Build the nvlist describing one file vdev and return it; the caller owns
+ * the list and must nvlist_free() it.
+ *
+ * With size == 0 no file is created and the path is "/dev/bogus".
+ * Otherwise the next index from ztest_shared->zs_vdev_primaries is
+ * formatted through ztest_dev_template, and that file is created (or
+ * truncated) and extended to 'size' bytes.  Failure to open or size the
+ * file is fatal.
+ */
+static nvlist_t *
+make_vdev_file(size_t size)
+{
+	char dev_name[MAXPATHLEN];
+	uint64_t vdev;
+	uint64_t ashift = ztest_get_ashift();
+	int fd;
+	nvlist_t *file;
+
+	if (size == 0) {
+		(void) snprintf(dev_name, sizeof (dev_name), "%s",
+		    "/dev/bogus");
+	} else {
+		vdev = ztest_shared->zs_vdev_primaries++;
+		/* Bounded, since zopt_dir and zopt_pool come from argv. */
+		(void) snprintf(dev_name, sizeof (dev_name),
+		    ztest_dev_template, zopt_dir, zopt_pool, vdev);
+
+		fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666);
+		if (fd == -1)
+			fatal(1, "can't open %s", dev_name);
+		if (ftruncate(fd, size) != 0)
+			fatal(1, "can't ftruncate %s", dev_name);
+		(void) close(fd);
+	}
+
+	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
+	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, dev_name) == 0);
+	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
+
+	return (file);
+}
+
+/*
+ * Build a raidz vdev description with r file children of the given size.
+ * When r is below 2 a single file vdev is returned instead.  The caller
+ * frees the returned nvlist.
+ */
+static nvlist_t *
+make_vdev_raidz(size_t size, int r)
+{
+	nvlist_t *raidz;
+	nvlist_t **kids;
+	size_t kids_sz;
+	int i;
+
+	if (r < 2)
+		return (make_vdev_file(size));
+
+	kids_sz = r * sizeof (nvlist_t *);
+	kids = umem_alloc(kids_sz, UMEM_NOFAIL);
+
+	for (i = 0; i < r; i++)
+		kids[i] = make_vdev_file(size);
+
+	VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_RAIDZ) == 0);
+	VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
+	    zopt_raidz_parity) == 0);
+	VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
+	    kids, r) == 0);
+
+	/* The children are released here once added to the parent. */
+	for (i = 0; i < r; i++)
+		nvlist_free(kids[i]);
+
+	umem_free(kids, kids_sz);
+
+	return (raidz);
+}
+
+/*
+ * Build a mirror vdev description with m children, each produced by
+ * make_vdev_raidz(size, r).  When m is below 1 the result is just that
+ * single raidz/file vdev.  The caller frees the returned nvlist.
+ */
+static nvlist_t *
+make_vdev_mirror(size_t size, int r, int m)
+{
+	nvlist_t *mirror;
+	nvlist_t **kids;
+	size_t kids_sz;
+	int i;
+
+	if (m < 1)
+		return (make_vdev_raidz(size, r));
+
+	kids_sz = m * sizeof (nvlist_t *);
+	kids = umem_alloc(kids_sz, UMEM_NOFAIL);
+
+	for (i = 0; i < m; i++)
+		kids[i] = make_vdev_raidz(size, r);
+
+	VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_MIRROR) == 0);
+	VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
+	    kids, m) == 0);
+
+	/* The children are released here once added to the parent. */
+	for (i = 0; i < m; i++)
+		nvlist_free(kids[i]);
+
+	umem_free(kids, kids_sz);
+
+	return (mirror);
+}
+
+/*
+ * Build a root vdev description with t top-level children, each produced
+ * by make_vdev_mirror(size, r, m).  t must be positive.  The caller frees
+ * the returned nvlist.
+ */
+static nvlist_t *
+make_vdev_root(size_t size, int r, int m, int t)
+{
+	nvlist_t *root;
+	nvlist_t **kids;
+	size_t kids_sz;
+	int i;
+
+	ASSERT(t > 0);
+
+	kids_sz = t * sizeof (nvlist_t *);
+	kids = umem_alloc(kids_sz, UMEM_NOFAIL);
+
+	for (i = 0; i < t; i++)
+		kids[i] = make_vdev_mirror(size, r, m);
+
+	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
+	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
+	    kids, t) == 0);
+
+	/* The children are released here once added to the parent. */
+	for (i = 0; i < t; i++)
+		nvlist_free(kids[i]);
+
+	umem_free(kids, kids_sz);
+
+	return (root);
+}
+
+/*
+ * Give 'object' a random data block size (a power of two with shift in
+ * SPA_MINBLOCKSHIFT..SPA_MAXBLOCKSHIFT) and a random indirect block shift
+ * (DN_MIN_INDBLKSHIFT..DN_MAX_INDBLKSHIFT) within transaction tx.  Any
+ * error from dmu_object_set_blocksize() is fatal.
+ */
+static void
+ztest_set_random_blocksize(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+	int bs = SPA_MINBLOCKSHIFT +
+	    ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1);
+	int ibs = DN_MIN_INDBLKSHIFT +
+	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1);
+	int error;
+
+	error = dmu_object_set_blocksize(os, object, 1ULL << bs, ibs, tx);
+	if (error) {
+		char osname[300];
+		dmu_objset_name(os, osname);
+		/* %llu needs an explicit cast: uint64_t may be 'long'. */
+		fatal(0, "dmu_object_set_blocksize('%s', %llu, %d, %d) = %d",
+		    osname, (u_longlong_t)object, 1 << bs, ibs, error);
+	}
+}
+
+/*
+ * Pick a random checksum function index, retrying while the chosen table
+ * entry has ci_zbt set, and substituting ZIO_CHECKSUM_ON for
+ * ZIO_CHECKSUM_OFF.
+ */
+static uint8_t
+ztest_random_checksum(void)
+{
+	uint8_t pick;
+
+	for (;;) {
+		pick = ztest_random(ZIO_CHECKSUM_FUNCTIONS);
+		if (!zio_checksum_table[pick].ci_zbt)
+			break;
+	}
+
+	return (pick == ZIO_CHECKSUM_OFF ? ZIO_CHECKSUM_ON : pick);
+}
+
+/*
+ * Pick a random index below ZIO_COMPRESS_FUNCTIONS.
+ */
+static uint8_t
+ztest_random_compress(void)
+{
+	uint8_t pick = (uint8_t)ztest_random(ZIO_COMPRESS_FUNCTIONS);
+
+	return (pick);
+}
+
+/*
+ * Argument block handed to the ztest_replay_*() functions.
+ */
+typedef struct ztest_replay {
+ objset_t *zr_os; /* objset the replayed operation is applied to */
+ uint64_t zr_assign; /* passed to dmu_tx_assign() by the replay functions */
+} ztest_replay_t;
+
+/*
+ * Replay handler for a create record: claim object lr->lr_doid in the
+ * replay objset, using lr->lr_mode as the object type.
+ *
+ * Returns the dmu_tx_assign() error if the transaction cannot be
+ * assigned, otherwise the result of dmu_object_claim() (asserted to be 0).
+ */
+static int
+ztest_replay_create(ztest_replay_t *zr, lr_create_t *lr, boolean_t byteswap)
+{
+	objset_t *objset = zr->zr_os;
+	dmu_tx_t *txn;
+	int err;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	txn = dmu_tx_create(objset);
+	dmu_tx_hold_bonus(txn, DMU_NEW_OBJECT);
+	err = dmu_tx_assign(txn, zr->zr_assign);
+	if (err != 0) {
+		dmu_tx_abort(txn);
+		return (err);
+	}
+
+	err = dmu_object_claim(objset, lr->lr_doid, lr->lr_mode, 0,
+	    DMU_OT_NONE, 0, txn);
+	ASSERT3U(err, ==, 0);
+	dmu_tx_commit(txn);
+
+	if (zopt_verbose >= 5) {
+		char dsname[MAXNAMELEN];
+
+		dmu_objset_name(objset, dsname);
+		(void) printf("replay create of %s object %llu"
+		    " in txg %llu = %d\n",
+		    dsname, (u_longlong_t)lr->lr_doid,
+		    (u_longlong_t)zr->zr_assign, err);
+	}
+
+	return (err);
+}
+
+/*
+ * Replay handler for a remove record: free object lr->lr_doid in the
+ * replay objset.
+ *
+ * Returns the dmu_tx_assign() error if the transaction cannot be
+ * assigned, otherwise the result of dmu_object_free().
+ */
+static int
+ztest_replay_remove(ztest_replay_t *zr, lr_remove_t *lr, boolean_t byteswap)
+{
+	objset_t *objset = zr->zr_os;
+	dmu_tx_t *txn;
+	int err;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	txn = dmu_tx_create(objset);
+	dmu_tx_hold_free(txn, lr->lr_doid, 0, DMU_OBJECT_END);
+
+	err = dmu_tx_assign(txn, zr->zr_assign);
+	if (err != 0) {
+		dmu_tx_abort(txn);
+		return (err);
+	}
+
+	err = dmu_object_free(objset, lr->lr_doid, txn);
+	dmu_tx_commit(txn);
+
+	return (err);
+}
+
+/*
+ * Replay function table passed to zil_replay().  The position of each
+ * entry corresponds to the transaction type named in its comment; only
+ * the create and remove types have handlers, every other slot is NULL.
+ */
+zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
+ NULL, /* 0 no such transaction type */
+ ztest_replay_create, /* TX_CREATE */
+ NULL, /* TX_MKDIR */
+ NULL, /* TX_MKXATTR */
+ NULL, /* TX_SYMLINK */
+ ztest_replay_remove, /* TX_REMOVE */
+ NULL, /* TX_RMDIR */
+ NULL, /* TX_LINK */
+ NULL, /* TX_RENAME */
+ NULL, /* TX_WRITE */
+ NULL, /* TX_TRUNCATE */
+ NULL, /* TX_SETATTR */
+ NULL, /* TX_ACL */
+};
+
+/*
+ * Verify that we can't destroy an active pool, create an existing pool,
+ * or create a pool with a bad vdev spec.
+ *
+ * Each step checks for one specific errno; any other result (including
+ * success) is reported through fatal().  za->za_pool names the pool that
+ * is expected to exist already.
+ */
+void
+ztest_spa_create_destroy(ztest_args_t *za)
+{
+ int error;
+ spa_t *spa;
+ nvlist_t *nvroot;
+
+ /*
+ * Attempt to create using a bad file.
+ * (A size of 0 makes make_vdev_file() use the path "/dev/bogus".)
+ */
+ nvroot = make_vdev_root(0, 0, 0, 1);
+ error = spa_create("ztest_bad_file", nvroot, NULL);
+ nvlist_free(nvroot);
+ if (error != ENOENT)
+ fatal(0, "spa_create(bad_file) = %d", error);
+
+ /*
+ * Attempt to create using a bad mirror.
+ */
+ nvroot = make_vdev_root(0, 0, 2, 1);
+ error = spa_create("ztest_bad_mirror", nvroot, NULL);
+ nvlist_free(nvroot);
+ if (error != ENOENT)
+ fatal(0, "spa_create(bad_mirror) = %d", error);
+
+ /*
+ * Attempt to create an existing pool. It shouldn't matter
+ * what's in the nvroot; we should fail with EEXIST.
+ */
+ (void) rw_rdlock(&ztest_shared->zs_name_lock);
+ nvroot = make_vdev_root(0, 0, 0, 1);
+ error = spa_create(za->za_pool, nvroot, NULL);
+ nvlist_free(nvroot);
+ if (error != EEXIST)
+ fatal(0, "spa_create(whatever) = %d", error);
+
+ /* Hold the pool open so that the destroy below must report EBUSY. */
+ error = spa_open(za->za_pool, &spa, FTAG);
+ if (error)
+ fatal(0, "spa_open() = %d", error);
+
+ error = spa_destroy(za->za_pool);
+ if (error != EBUSY)
+ fatal(0, "spa_destroy() = %d", error);
+
+ spa_close(spa, FTAG);
+ (void) rw_unlock(&ztest_shared->zs_name_lock);
+}
+
+/*
+ * Verify that vdev_add() works as expected.
+ *
+ * Adds one new top-level vdev, built from the configured size, raidz and
+ * mirror settings, to the pool behind za->za_os.  ENOSPC is recorded and
+ * tolerated; any other error is fatal.
+ */
+void
+ztest_vdev_add_remove(ztest_args_t *za)
+{
+ spa_t *spa = dmu_objset_spa(za->za_os);
+ uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
+ nvlist_t *nvroot;
+ int error;
+
+ if (zopt_verbose >= 6)
+ (void) printf("adding vdev\n");
+
+ (void) mutex_lock(&ztest_shared->zs_vdev_lock);
+
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ /*
+ * make_vdev_file() numbers its files from zs_vdev_primaries, so start
+ * the counter just past the leaves of the existing top-level vdevs.
+ */
+ ztest_shared->zs_vdev_primaries =
+ spa->spa_root_vdev->vdev_children * leaves;
+
+ spa_config_exit(spa, FTAG);
+
+ nvroot = make_vdev_root(zopt_vdev_size, zopt_raidz, zopt_mirrors, 1);
+ error = spa_vdev_add(spa, nvroot);
+ nvlist_free(nvroot);
+
+ (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+
+ if (error == ENOSPC)
+ ztest_record_enospc("spa_vdev_add");
+ else if (error != 0)
+ fatal(0, "spa_vdev_add() = %d", error);
+
+ if (zopt_verbose >= 6)
+ (void) printf("spa_vdev_add = %d, as expected\n", error);
+}
+
+/*
+ * Depth-first search of the vdev tree rooted at vd for a vdev whose path
+ * matches 'path'.  Returns the matching vdev, or NULL if there is none.
+ */
+static vdev_t *
+vdev_lookup_by_path(vdev_t *vd, const char *path)
+{
+	int i;
+
+	if (vd->vdev_path != NULL) {
+		if (vd->vdev_wholedisk != 1) {
+			if (strcmp(path, vd->vdev_path) == 0)
+				return (vd);
+		} else {
+			/*
+			 * For whole disks, the internal path has 's0', but the
+			 * path passed in by the user doesn't.
+			 */
+			size_t plen = strlen(path);
+
+			if (plen == strlen(vd->vdev_path) - 2 &&
+			    strncmp(path, vd->vdev_path, plen) == 0)
+				return (vd);
+		}
+	}
+
+	for (i = 0; i < vd->vdev_children; i++) {
+		vdev_t *found = vdev_lookup_by_path(vd->vdev_child[i], path);
+
+		if (found != NULL)
+			return (found);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Verify that we can attach and detach devices.
+ *
+ * Picks a random leaf of a random top-level vdev, works out which error
+ * (if any) an attach or replace of a companion file should produce, then
+ * performs spa_vdev_attach() and calls fatal() if the outcome differs.
+ * The whole operation runs under ztest_shared->zs_vdev_lock.
+ */
+void
+ztest_vdev_attach_detach(ztest_args_t *za)
+{
+ spa_t *spa = dmu_objset_spa(za->za_os);
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *oldvd, *newvd, *pvd;
+ nvlist_t *root, *file;
+ uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
+ uint64_t leaf, top;
+ uint64_t ashift = ztest_get_ashift();
+ size_t oldsize, newsize;
+ char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
+ int replacing;
+ int error, expected_error;
+ int fd;
+
+ (void) mutex_lock(&ztest_shared->zs_vdev_lock);
+
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ /*
+ * Decide whether to do an attach or a replace.
+ */
+ replacing = ztest_random(2);
+
+ /*
+ * Pick a random top-level vdev.
+ */
+ top = ztest_random(rvd->vdev_children);
+
+ /*
+ * Pick a random leaf within it.
+ */
+ leaf = ztest_random(leaves);
+
+ /*
+ * Generate the path to this leaf. The filename will end with 'a'.
+ * We'll alternate replacements with a filename that ends with 'b'.
+ */
+ (void) snprintf(oldpath, sizeof (oldpath),
+ ztest_dev_template, zopt_dir, zopt_pool, top * leaves + leaf);
+
+ bcopy(oldpath, newpath, MAXPATHLEN);
+
+ /*
+ * If the 'a' file isn't part of the pool, the 'b' file must be.
+ */
+ if (vdev_lookup_by_path(rvd, oldpath) == NULL)
+ oldpath[strlen(oldpath) - 1] = 'b';
+ else
+ newpath[strlen(newpath) - 1] = 'b';
+
+ /*
+ * Now oldpath represents something that's already in the pool,
+ * and newpath is the thing we'll try to attach.
+ */
+ oldvd = vdev_lookup_by_path(rvd, oldpath);
+ newvd = vdev_lookup_by_path(rvd, newpath);
+ ASSERT(oldvd != NULL);
+ pvd = oldvd->vdev_parent;
+
+ /*
+ * Make newsize a little bigger or smaller than oldsize.
+ * If it's smaller, the attach should fail.
+ * If it's larger, and we're doing a replace,
+ * we should get dynamic LUN growth when we're done.
+ * (The divisor is 9, 10 or 11, i.e. 10/9, equal, or 10/11 of oldsize.)
+ */
+ oldsize = vdev_get_rsize(oldvd);
+ newsize = 10 * oldsize / (9 + ztest_random(3));
+
+ /*
+ * If pvd is not a mirror or root, the attach should fail with ENOTSUP,
+ * unless it's a replace; in that case any non-replacing parent is OK.
+ *
+ * If newvd is already part of the pool, it should fail with EBUSY.
+ *
+ * If newvd is too small, it should fail with EOVERFLOW.
+ *
+ * If the new ashift exceeds the top-level vdev's, expect EDOM.
+ */
+ if (newvd != NULL)
+ expected_error = EBUSY;
+ else if (pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_root_ops &&
+ (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
+ expected_error = ENOTSUP;
+ else if (newsize < oldsize)
+ expected_error = EOVERFLOW;
+ else if (ashift > oldvd->vdev_top->vdev_ashift)
+ expected_error = EDOM;
+ else
+ expected_error = 0;
+
+ /*
+ * If newvd isn't already part of the pool, create it.
+ */
+ if (newvd == NULL) {
+ fd = open(newpath, O_RDWR | O_CREAT | O_TRUNC, 0666);
+ if (fd == -1)
+ fatal(1, "can't open %s", newpath);
+ if (ftruncate(fd, newsize) != 0)
+ fatal(1, "can't ftruncate %s", newpath);
+ (void) close(fd);
+ }
+
+ spa_config_exit(spa, FTAG);
+
+ /*
+ * Build the nvlist describing newpath.
+ */
+ VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
+ VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, newpath) == 0);
+ VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
+
+ VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
+ VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
+ &file, 1) == 0);
+
+ /*
+ * NOTE(review): oldvd is dereferenced here after spa_config_exit();
+ * presumably zs_vdev_lock keeps it valid -- confirm.
+ */
+ error = spa_vdev_attach(spa, oldvd->vdev_guid, root, replacing);
+
+ nvlist_free(file);
+ nvlist_free(root);
+
+ /*
+ * If our parent was the replacing vdev, but the replace completed,
+ * then instead of failing with ENOTSUP we may either succeed,
+ * fail with ENODEV, or fail with EOVERFLOW.
+ */
+ if (expected_error == ENOTSUP &&
+ (error == 0 || error == ENODEV || error == EOVERFLOW))
+ expected_error = error;
+
+ /*
+ * If someone grew the LUN, the replacement may be too small.
+ */
+ if (error == EOVERFLOW)
+ expected_error = error;
+
+ if (error != expected_error) {
+ fatal(0, "attach (%s, %s, %d) returned %d, expected %d",
+ oldpath, newpath, replacing, error, expected_error);
+ }
+
+ (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+}
+
+/*
+ * Verify that dynamic LUN growth works as expected.
+ *
+ * Picks a random leaf vdev file and, if it can be opened and is still
+ * smaller than twice zopt_vdev_size, extends it by a random amount of up
+ * to 1/32 of its current size.  Runs under ztest_shared->zs_vdev_lock.
+ * A file that cannot be opened is silently skipped.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_LUN_growth(ztest_args_t *za)
+{
+	spa_t *spa = dmu_objset_spa(za->za_os);
+	char dev_name[MAXPATHLEN];
+	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
+	uint64_t vdev;
+	size_t fsize;
+	int fd;
+
+	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
+
+	/*
+	 * Pick a random leaf vdev.
+	 */
+	spa_config_enter(spa, RW_READER, FTAG);
+	vdev = ztest_random(spa->spa_root_vdev->vdev_children * leaves);
+	spa_config_exit(spa, FTAG);
+
+	/* Bounded, since zopt_dir and zopt_pool come from argv. */
+	(void) snprintf(dev_name, sizeof (dev_name), ztest_dev_template,
+	    zopt_dir, zopt_pool, vdev);
+
+	if ((fd = open(dev_name, O_RDWR)) != -1) {
+		/*
+		 * Determine the size.
+		 */
+		fsize = lseek(fd, 0, SEEK_END);
+
+		/*
+		 * If it's less than 2x the original size, grow by around 3%.
+		 */
+		if (fsize < 2 * zopt_vdev_size) {
+			size_t newsize = fsize + ztest_random(fsize / 32);
+			(void) ftruncate(fd, newsize);
+			if (zopt_verbose >= 6) {
+				(void) printf("%s grew from %lu to %lu bytes\n",
+				    dev_name, (ulong_t)fsize, (ulong_t)newsize);
+			}
+		}
+		(void) close(fd);
+	}
+
+	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+}
+
+/*
+ * Creation callback supplied to dmu_objset_create(): claims the three
+ * fixed objects (ZTEST_DIROBJ, ZTEST_MICROZAP_OBJ and ZTEST_FATZAP_OBJ)
+ * in the new objset within transaction tx.  'arg' is unused.  Any failure
+ * trips VERIFY.
+ */
+/* ARGSUSED */
+static void
+ztest_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
+{
+ /*
+ * Create the directory object.
+ */
+ VERIFY(dmu_object_claim(os, ZTEST_DIROBJ,
+ DMU_OT_UINT64_OTHER, ZTEST_DIROBJ_BLOCKSIZE,
+ DMU_OT_UINT64_OTHER, sizeof (ztest_block_tag_t), tx) == 0);
+
+ /* Create the two ZAP objects. */
+ VERIFY(zap_create_claim(os, ZTEST_MICROZAP_OBJ,
+ DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
+
+ VERIFY(zap_create_claim(os, ZTEST_FATZAP_OBJ,
+ DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
+}
+
+/*
+ * Callback supplied to dmu_objset_find(): sanity-check the named dataset
+ * and then destroy it.  'arg' is unused.  Always returns 0; failures are
+ * caught only by the ASSERT macros.
+ */
+/* ARGSUSED */
+static int
+ztest_destroy_cb(char *name, void *arg)
+{
+ objset_t *os;
+ dmu_object_info_t doi;
+ int error;
+
+ /*
+ * Verify that the dataset contains a directory object.
+ */
+ error = dmu_objset_open(name, DMU_OST_OTHER,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+ ASSERT3U(error, ==, 0);
+ error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
+ if (error != ENOENT) {
+ /* We could have crashed in the middle of destroying it */
+ ASSERT3U(error, ==, 0);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_UINT64_OTHER);
+ ASSERT3S(doi.doi_physical_blks, >=, 0);
+ }
+ dmu_objset_close(os);
+
+ /*
+ * Destroy the dataset.
+ */
+ error = dmu_objset_destroy(name);
+ ASSERT3U(error, ==, 0);
+ return (0);
+}
+
+/*
+ * Verify that dmu_objset_{create,destroy,open,close} work as expected.
+ */
+/*
+ * Log the creation of 'object' to the intent log.
+ *
+ * Builds a TX_CREATE itx whose record is followed by the name
+ * "ZOBJ_<object>" plus a random amount (below ZIL_MAX_BLKSZ) of zeroed
+ * padding, assigns it to transaction tx, and returns the value of
+ * zil_itx_assign().  'mode' is stored in lr_mode.
+ */
+static uint64_t
+ztest_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t object, int mode)
+{
+	itx_t *itx;
+	lr_create_t *lr;
+	size_t namesize;
+	/* "ZOBJ_" + up to 20 decimal digits + NUL needs 26 bytes. */
+	char name[32];
+
+	(void) snprintf(name, sizeof (name), "ZOBJ_%llu",
+	    (u_longlong_t)object);
+	namesize = strlen(name) + 1;
+
+	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize +
+	    ztest_random(ZIL_MAX_BLKSZ));
+	lr = (lr_create_t *)&itx->itx_lr;
+	/* Zero everything after the fixed record before filling it in. */
+	bzero(lr + 1, lr->lr_common.lrc_reclen - sizeof (*lr));
+	lr->lr_doid = object;
+	lr->lr_foid = 0;
+	lr->lr_mode = mode;
+	lr->lr_uid = 0;
+	lr->lr_gid = 0;
+	lr->lr_gen = dmu_tx_get_txg(tx);
+	lr->lr_crtime[0] = time(NULL);
+	lr->lr_crtime[1] = 0;
+	lr->lr_rdev = 0;
+	bcopy(name, (char *)(lr + 1), namesize);
+
+	return (zil_itx_assign(zilog, itx, tx));
+}
+
+/*
+ * Exercise dataset create, open, close and destroy on a per-instance
+ * temporary dataset named "<pool>/<pool>_temp_<instance>".
+ *
+ * Any leftover copy from an earlier run is optionally replayed and then
+ * destroyed.  A fresh dataset is created and populated with a random
+ * number of objects, each logged to the intent log.  Duplicate creation
+ * and the open-mode compatibility rules are checked, and the dataset is
+ * destroyed again.  ENOSPC on creation is recorded and ends the test
+ * early; every other unexpected result is fatal.  Runs with
+ * zs_name_lock held as reader.
+ */
+void
+ztest_dmu_objset_create_destroy(ztest_args_t *za)
+{
+	int error;
+	objset_t *os = NULL;
+	char name[100];
+	int mode, basemode, expected_error;
+	zilog_t *zilog;
+	uint64_t seq;
+	uint64_t objects;
+	ztest_replay_t zr;
+
+	(void) rw_rdlock(&ztest_shared->zs_name_lock);
+	(void) snprintf(name, sizeof (name), "%s/%s_temp_%llu",
+	    za->za_pool, za->za_pool, (u_longlong_t)za->za_instance);
+
+	basemode = DS_MODE_LEVEL(za->za_instance);
+	if (basemode == DS_MODE_NONE)
+		basemode++;
+
+	/*
+	 * If this dataset exists from a previous run, process its replay log
+	 * half of the time.  If we don't replay it, then dmu_objset_destroy()
+	 * (invoked from ztest_destroy_cb() below) should just throw it away.
+	 */
+	if (ztest_random(2) == 0 &&
+	    dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_PRIMARY, &os) == 0) {
+		zr.zr_os = os;
+		zil_replay(os, &zr, &zr.zr_assign, ztest_replay_vector);
+		dmu_objset_close(os);
+	}
+
+	/*
+	 * There may be an old instance of the dataset we're about to
+	 * create lying around from a previous run.  If so, destroy it
+	 * and all of its snapshots.
+	 */
+	(void) dmu_objset_find(name, ztest_destroy_cb, NULL,
+	    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+
+	/*
+	 * Verify that the destroyed dataset is no longer in the namespace.
+	 */
+	error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os);
+	if (error != ENOENT)
+		fatal(1, "dmu_objset_open(%s) found destroyed dataset %p",
+		    name, os);
+
+	/*
+	 * Verify that we can create a new dataset.
+	 */
+	error = dmu_objset_create(name, DMU_OST_OTHER, NULL, ztest_create_cb,
+	    NULL);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc("dmu_objset_create");
+			(void) rw_unlock(&ztest_shared->zs_name_lock);
+			return;
+		}
+		fatal(0, "dmu_objset_create(%s) = %d", name, error);
+	}
+
+	error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os);
+	if (error) {
+		fatal(0, "dmu_objset_open(%s) = %d", name, error);
+	}
+
+	/*
+	 * Open the intent log for it.
+	 */
+	zilog = zil_open(os, NULL);
+
+	/*
+	 * Put a random number of objects in there.
+	 */
+	objects = ztest_random(20);
+	seq = 0;
+	while (objects-- != 0) {
+		/*
+		 * Start from a defined value: if the tx cannot be assigned
+		 * no object is allocated, yet zil_commit() below may still
+		 * be called with this variable.
+		 */
+		uint64_t object = 0;
+		dmu_tx_t *tx = dmu_tx_create(os);
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, sizeof (name));
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+		} else {
+			object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
+			    DMU_OT_NONE, 0, tx);
+			ztest_set_random_blocksize(os, object, tx);
+			seq = ztest_log_create(zilog, tx, object,
+			    DMU_OT_UINT64_OTHER);
+			dmu_write(os, object, 0, sizeof (name), name, tx);
+			dmu_tx_commit(tx);
+		}
+		if (ztest_random(5) == 0) {
+			zil_commit(zilog, seq, object);
+		}
+		if (ztest_random(100) == 0) {
+			error = zil_suspend(zilog);
+			if (error == 0) {
+				zil_resume(zilog);
+			}
+		}
+	}
+
+	/*
+	 * Verify that we cannot create an existing dataset.
+	 */
+	error = dmu_objset_create(name, DMU_OST_OTHER, NULL, NULL, NULL);
+	if (error != EEXIST)
+		fatal(0, "created existing dataset, error = %d", error);
+
+	/*
+	 * Verify that multiple dataset opens are allowed, but only when
+	 * the new access mode is compatible with the base mode.
+	 * We use a mixture of typed and typeless opens, and when the
+	 * open succeeds, verify that the discovered type is correct.
+	 */
+	for (mode = DS_MODE_STANDARD; mode < DS_MODE_LEVELS; mode++) {
+		objset_t *os2;
+		error = dmu_objset_open(name, DMU_OST_OTHER, mode, &os2);
+		expected_error = (basemode + mode < DS_MODE_LEVELS) ? 0 : EBUSY;
+		if (error != expected_error)
+			fatal(0, "dmu_objset_open('%s') = %d, expected %d",
+			    name, error, expected_error);
+		if (error == 0)
+			dmu_objset_close(os2);
+	}
+
+	zil_close(zilog);
+	dmu_objset_close(os);
+
+	error = dmu_objset_destroy(name);
+	if (error)
+		fatal(0, "dmu_objset_destroy(%s) = %d", name, error);
+
+	(void) rw_unlock(&ztest_shared->zs_name_lock);
+}
+
+/*
+ * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
+ *
+ * Destroys and then re-takes the snapshot "<objset>@<instance>".  A
+ * missing snapshot on destroy and an existing one on create are both
+ * tolerated; ENOSPC on create is recorded; anything else is fatal.
+ */
+void
+ztest_dmu_snapshot_create_destroy(ztest_args_t *za)
+{
+	objset_t *os = za->za_os;
+	char osname[MAXNAMELEN];
+	char snapname[100];
+	char *shortname;
+	int error;
+
+	(void) rw_rdlock(&ztest_shared->zs_name_lock);
+
+	dmu_objset_name(os, osname);
+	(void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname,
+	    (u_longlong_t)za->za_instance);
+
+	error = dmu_objset_destroy(snapname);
+	if (error != 0 && error != ENOENT)
+		fatal(0, "dmu_objset_destroy() = %d", error);
+
+	/* The snapshot call wants only the part after the '@'. */
+	shortname = strchr(snapname, '@') + 1;
+	error = dmu_objset_snapshot(osname, shortname, FALSE);
+	switch (error) {
+	case 0:
+	case EEXIST:
+		break;
+	case ENOSPC:
+		ztest_record_enospc("dmu_take_snapshot");
+		break;
+	default:
+		fatal(0, "dmu_take_snapshot() = %d", error);
+		break;
+	}
+
+	(void) rw_unlock(&ztest_shared->zs_name_lock);
+}
+
+#define ZTEST_TRAVERSE_BLOCKS 1000
+
+/*
+ * Per-block callback registered with traverse_init() by ztest_traverse();
+ * 'arg' is the ztest_args_t of the calling test instance.
+ *
+ * Sanity-checks the block described by bc against its bookmark and dnode.
+ * Returns 0 after a successful check, ERESTART if the cache entry carries
+ * an error, and occasionally EINTR (odd instances only) to abort the
+ * traversal.
+ */
+static int
+ztest_blk_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+ ztest_args_t *za = arg;
+ zbookmark_t *zb = &bc->bc_bookmark;
+ blkptr_t *bp = &bc->bc_blkptr;
+ dnode_phys_t *dnp = bc->bc_dnode;
+ traverse_handle_t *th = za->za_th;
+ uint64_t size = BP_GET_LSIZE(bp);
+
+ /*
+ * Level -1 indicates the objset_phys_t or something in its intent log.
+ */
+ if (zb->zb_level == -1) {
+ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ ASSERT3U(zb->zb_object, ==, 0);
+ ASSERT3U(zb->zb_blkid, ==, 0);
+ ASSERT3U(size, ==, sizeof (objset_phys_t));
+ /* a new objset restarts the intent-log sequence check */
+ za->za_zil_seq = 0;
+ } else if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
+ ASSERT3U(zb->zb_object, ==, 0);
+ /* intent-log block ids must be strictly increasing */
+ ASSERT3U(zb->zb_blkid, >, za->za_zil_seq);
+ za->za_zil_seq = zb->zb_blkid;
+ } else {
+ ASSERT3U(zb->zb_object, !=, 0); /* lr_write_t */
+ }
+
+ return (0);
+ }
+
+ ASSERT(dnp != NULL);
+
+ if (bc->bc_errno)
+ return (ERESTART);
+
+ /*
+ * Once in a while, abort the traverse. We only do this to odd
+ * instance numbers to ensure that even ones can run to completion.
+ */
+ if ((za->za_instance & 1) && ztest_random(10000) == 0)
+ return (EINTR);
+
+ /* A birth txg of 0 is only expected when holes were requested. */
+ if (bp->blk_birth == 0) {
+ ASSERT(th->th_advance & ADVANCE_HOLES);
+ return (0);
+ }
+
+ /* Without ADVANCE_DATA, level-0 blocks in this cache slot carry no data. */
+ if (zb->zb_level == 0 && !(th->th_advance & ADVANCE_DATA) &&
+ bc == &th->th_cache[ZB_DN_CACHE][0]) {
+ ASSERT(bc->bc_data == NULL);
+ return (0);
+ }
+
+ ASSERT(bc->bc_data != NULL);
+
+ /*
+ * This is an expensive question, so don't ask it too often.
+ * (Roughly one callback in 256 compares against an arc_tryread copy.)
+ */
+ if (((za->za_random ^ th->th_callbacks) & 0xff) == 0) {
+ void *xbuf = umem_alloc(size, UMEM_NOFAIL);
+ if (arc_tryread(spa, bp, xbuf) == 0) {
+ ASSERT(bcmp(bc->bc_data, xbuf, size) == 0);
+ }
+ umem_free(xbuf, size);
+ }
+
+ /* Indirect blocks must match the dnode's indirect block size. */
+ if (zb->zb_level > 0) {
+ ASSERT3U(size, ==, 1ULL << dnp->dn_indblkshift);
+ return (0);
+ }
+
+ /* Data blocks must match the dnode's data block size. */
+ ASSERT(zb->zb_level == 0);
+ ASSERT3U(size, ==, dnp->dn_datablkszsec << DEV_BSHIFT);
+
+ return (0);
+}
+
+/*
+ * Verify that live pool traversal works.
+ *
+ * The traversal handle is kept in za->za_th across calls.  If there is
+ * none, a new one is created with a random combination of ADVANCE_*
+ * flags.  Each call then advances the traversal by a bounded number of
+ * callbacks.  When traverse_more() returns anything other than EAGAIN the
+ * handle is torn down; a result other than 0 or EINTR is fatal.
+ */
+void
+ztest_traverse(ztest_args_t *za)
+{
+ spa_t *spa = dmu_objset_spa(za->za_os);
+ traverse_handle_t *th = za->za_th;
+ int rc, advance;
+ uint64_t cbstart, cblimit;
+
+ if (th == NULL) {
+ advance = 0;
+
+ /* Each flag is enabled independently with probability 1/2. */
+ if (ztest_random(2) == 0)
+ advance |= ADVANCE_PRE;
+
+ if (ztest_random(2) == 0)
+ advance |= ADVANCE_PRUNE;
+
+ if (ztest_random(2) == 0)
+ advance |= ADVANCE_DATA;
+
+ if (ztest_random(2) == 0)
+ advance |= ADVANCE_HOLES;
+
+ if (ztest_random(2) == 0)
+ advance |= ADVANCE_ZIL;
+
+ th = za->za_th = traverse_init(spa, ztest_blk_cb, za, advance,
+ ZIO_FLAG_CANFAIL);
+
+ traverse_add_pool(th, 0, -1ULL);
+ }
+
+ /* Allow fewer callbacks per call when data blocks are being visited. */
+ advance = th->th_advance;
+ cbstart = th->th_callbacks;
+ cblimit = cbstart + ((advance & ADVANCE_DATA) ? 100 : 1000);
+
+ while ((rc = traverse_more(th)) == EAGAIN && th->th_callbacks < cblimit)
+ continue;
+
+ /* Note: ADVANCE_ZIL is not reflected in this message. */
+ if (zopt_verbose >= 5)
+ (void) printf("traverse %s%s%s%s %llu blocks to "
+ "<%llu, %llu, %lld, %llx>%s\n",
+ (advance & ADVANCE_PRE) ? "pre" : "post",
+ (advance & ADVANCE_PRUNE) ? "|prune" : "",
+ (advance & ADVANCE_DATA) ? "|data" : "",
+ (advance & ADVANCE_HOLES) ? "|holes" : "",
+ (u_longlong_t)(th->th_callbacks - cbstart),
+ (u_longlong_t)th->th_lastcb.zb_objset,
+ (u_longlong_t)th->th_lastcb.zb_object,
+ (u_longlong_t)th->th_lastcb.zb_level,
+ (u_longlong_t)th->th_lastcb.zb_blkid,
+ rc == 0 ? " [done]" :
+ rc == EINTR ? " [aborted]" :
+ rc == EAGAIN ? "" :
+ strerror(rc));
+
+ /* EAGAIN means there is more to do; keep the handle for the next call. */
+ if (rc != EAGAIN) {
+ if (rc != 0 && rc != EINTR)
+ fatal(0, "traverse_more(%p) = %d", th, rc);
+ traverse_fini(th);
+ za->za_th = NULL;
+ }
+}
+
+/*
+ * Verify that dmu_object_{alloc,free} work as expected.
+ *
+ * A "batch object", whose number is stored in ZTEST_DIROBJ at
+ * za->za_diroff, holds the object numbers of the current batch (batchsize
+ * entries).  Each call:
+ *   1. creates the batch object if the directory slot is still zero;
+ *   2. validates and frees every object left in the batch by a previous
+ *      call;
+ *   3. allocates and immediately frees a random number of objects;
+ *   4. allocates a new batch, filling each object's bonus buffer and the
+ *      word at endoff with known patterns for the next call to verify.
+ * A transaction that cannot be assigned is recorded via
+ * ztest_record_enospc() and ends the test early; validation failures are
+ * fatal.
+ */
+void
+ztest_dmu_object_alloc_free(ztest_args_t *za)
+{
+	objset_t *os = za->za_os;
+	dmu_buf_t *db;
+	dmu_tx_t *tx;
+	uint64_t batchobj, object, batchsize, endoff, temp;
+	int b, c, error, bonuslen;
+	dmu_object_info_t doi;
+	char osname[MAXNAMELEN];
+
+	dmu_objset_name(os, osname);
+
+	endoff = -8ULL;
+	batchsize = 2;
+
+	/*
+	 * Create a batch object if necessary, and record it in the directory.
+	 */
+	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
+	    sizeof (uint64_t), &batchobj));
+	if (batchobj == 0) {
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
+		    sizeof (uint64_t));
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("create a batch object");
+			dmu_tx_abort(tx);
+			return;
+		}
+		batchobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
+		    DMU_OT_NONE, 0, tx);
+		ztest_set_random_blocksize(os, batchobj, tx);
+		dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
+		    sizeof (uint64_t), &batchobj, tx);
+		dmu_tx_commit(tx);
+	}
+
+	/*
+	 * Destroy the previous batch of objects.
+	 */
+	for (b = 0; b < batchsize; b++) {
+		VERIFY(0 == dmu_read(os, batchobj, b * sizeof (uint64_t),
+		    sizeof (uint64_t), &object));
+		if (object == 0)
+			continue;
+		/*
+		 * Read and validate contents.
+		 * We expect byte n of the bonus buffer to be
+		 * (uint8_t)(n + bonuslen).
+		 */
+		VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
+
+		dmu_object_info_from_db(db, &doi);
+		ASSERT(doi.doi_type == DMU_OT_UINT64_OTHER);
+		ASSERT(doi.doi_bonus_type == DMU_OT_PLAIN_OTHER);
+		ASSERT3S(doi.doi_physical_blks, >=, 0);
+
+		bonuslen = db->db_size;
+
+		for (c = 0; c < bonuslen; c++) {
+			if (((uint8_t *)db->db_data)[c] !=
+			    (uint8_t)(c + bonuslen)) {
+				fatal(0,
+				    "bad bonus: %s, obj %llu, off %d: %u != %u",
+				    osname, (u_longlong_t)object, c,
+				    ((uint8_t *)db->db_data)[c],
+				    (uint8_t)(c + bonuslen));
+			}
+		}
+
+		dmu_buf_rele(db, FTAG);
+
+		/*
+		 * We expect the word at endoff to be our object number.
+		 */
+		VERIFY(0 == dmu_read(os, object, endoff,
+		    sizeof (uint64_t), &temp));
+
+		if (temp != object) {
+			fatal(0, "bad data in %s, got %llu, expected %llu",
+			    osname, (u_longlong_t)temp, (u_longlong_t)object);
+		}
+
+		/*
+		 * Destroy old object and clear batch entry.
+		 */
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_write(tx, batchobj,
+		    b * sizeof (uint64_t), sizeof (uint64_t));
+		dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("free object");
+			dmu_tx_abort(tx);
+			return;
+		}
+		error = dmu_object_free(os, object, tx);
+		if (error) {
+			fatal(0, "dmu_object_free('%s', %llu) = %d",
+			    osname, (u_longlong_t)object, error);
+		}
+		object = 0;
+
+		dmu_object_set_checksum(os, batchobj,
+		    ztest_random_checksum(), tx);
+		dmu_object_set_compress(os, batchobj,
+		    ztest_random_compress(), tx);
+
+		/* object is now 0, so this clears the batch slot. */
+		dmu_write(os, batchobj, b * sizeof (uint64_t),
+		    sizeof (uint64_t), &object, tx);
+
+		dmu_tx_commit(tx);
+	}
+
+	/*
+	 * Before creating the new batch of objects, generate a bunch of churn.
+	 */
+	for (b = ztest_random(100); b > 0; b--) {
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("churn objects");
+			dmu_tx_abort(tx);
+			return;
+		}
+		object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
+		    DMU_OT_NONE, 0, tx);
+		ztest_set_random_blocksize(os, object, tx);
+		error = dmu_object_free(os, object, tx);
+		if (error) {
+			fatal(0, "dmu_object_free('%s', %llu) = %d",
+			    osname, (u_longlong_t)object, error);
+		}
+		dmu_tx_commit(tx);
+	}
+
+	/*
+	 * Create a new batch of objects with randomly chosen
+	 * blocksizes and record them in the batch directory.
+	 */
+	for (b = 0; b < batchsize; b++) {
+		uint32_t va_blksize;
+		u_longlong_t va_nblocks;
+
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_write(tx, batchobj, b * sizeof (uint64_t),
+		    sizeof (uint64_t));
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, endoff,
+		    sizeof (uint64_t));
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("create batchobj");
+			dmu_tx_abort(tx);
+			return;
+		}
+		bonuslen = (int)ztest_random(dmu_bonus_max()) + 1;
+
+		object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
+		    DMU_OT_PLAIN_OTHER, bonuslen, tx);
+
+		ztest_set_random_blocksize(os, object, tx);
+
+		dmu_object_set_checksum(os, object,
+		    ztest_random_checksum(), tx);
+		dmu_object_set_compress(os, object,
+		    ztest_random_compress(), tx);
+
+		dmu_write(os, batchobj, b * sizeof (uint64_t),
+		    sizeof (uint64_t), &object, tx);
+
+		/*
+		 * Write to both the bonus buffer and the regular data.
+		 */
+		VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
+		ASSERT3U(bonuslen, ==, db->db_size);
+
+		dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
+		ASSERT3S(va_nblocks, >=, 0);
+
+		dmu_buf_will_dirty(db, tx);
+
+		/*
+		 * See comments above regarding the contents of
+		 * the bonus buffer and the word at endoff.
+		 */
+		for (c = 0; c < db->db_size; c++)
+			((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
+
+		dmu_buf_rele(db, FTAG);
+
+		/*
+		 * Write to a large offset to increase indirection.
+		 */
+		dmu_write(os, object, endoff, sizeof (uint64_t), &object, tx);
+
+		dmu_tx_commit(tx);
+	}
+}
+
+/*
+ * Verify that dmu_{read,write} work as expected.
+ */
+typedef struct bufwad {
+ uint64_t bw_index; /* index (n + i) this bufwad was written for */
+ uint64_t bw_txg; /* txg of the tx that last wrote this bufwad */
+ uint64_t bw_data; /* random value >= 1 when written; zeroed when freed */
+} bufwad_t;
+
+typedef struct dmu_read_write_dir {
+ uint64_t dd_packobj; /* object number of the dense packobj */
+ uint64_t dd_bigobj; /* object number of the sparse bigobj */
+ uint64_t dd_chunk; /* bigobj chunk size in bytes; 0 means not yet set up */
+} dmu_read_write_dir_t;
+
+/*
+ * Exercise dmu_read(), dmu_write() and dmu_free_range() on a pair of
+ * objects whose contents must always agree.
+ *
+ * za->za_os is the objset under test and za->za_diroff is the offset in
+ * ZTEST_DIROBJ of this test's dmu_read_write_dir_t record.  A failed
+ * dmu_tx_assign() is handed to ztest_record_enospc() and ends the call
+ * early; any inconsistency found in the objects is reported via fatal().
+ */
+void
+ztest_dmu_read_write(ztest_args_t *za)
+{
+	objset_t *os = za->za_os;
+	dmu_read_write_dir_t dd;
+	dmu_tx_t *tx;
+	int i, freeit, error;
+	uint64_t n, s, txg;
+	bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
+	uint64_t packoff, packsize, bigoff, bigsize;
+	uint64_t regions = 997;
+	uint64_t stride = 123456789ULL;
+	uint64_t width = 40;
+	int free_percent = 5;
+
+	/*
+	 * This test uses two objects, packobj and bigobj, that are always
+	 * updated together (i.e. in the same tx) so that their contents are
+	 * in sync and can be compared.  Their contents relate to each other
+	 * in a simple way: packobj is a dense array of 'bufwad' structures,
+	 * while bigobj is a sparse array of the same bufwads.  Specifically,
+	 * for any index n, there are three bufwads that should be identical:
+	 *
+	 *	packobj, at offset n * sizeof (bufwad_t)
+	 *	bigobj, at the head of the nth chunk
+	 *	bigobj, at the tail of the nth chunk
+	 *
+	 * The chunk size is arbitrary.  It doesn't have to be a power of two,
+	 * and it doesn't have any relation to the object blocksize.
+	 * The only requirement is that it can hold at least two bufwads.
+	 *
+	 * Normally, we write the bufwad to each of these locations.
+	 * However, free_percent of the time we instead write zeroes to
+	 * packobj and perform a dmu_free_range() on bigobj.  By comparing
+	 * bigobj to packobj, we can verify that the DMU is correctly
+	 * tracking which parts of an object are allocated and free,
+	 * and that the contents of the allocated blocks are correct.
+	 */
+
+	/*
+	 * Read the directory info.  If it's the first time, set things up.
+	 */
+	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
+	    sizeof (dd), &dd));
+	if (dd.dd_chunk == 0) {
+		ASSERT(dd.dd_packobj == 0);
+		ASSERT(dd.dd_bigobj == 0);
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("create r/w directory");
+			dmu_tx_abort(tx);
+			return;
+		}
+
+		dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
+		    DMU_OT_NONE, 0, tx);
+		dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
+		    DMU_OT_NONE, 0, tx);
+		dd.dd_chunk = (1000 + ztest_random(1000)) * sizeof (uint64_t);
+
+		ztest_set_random_blocksize(os, dd.dd_packobj, tx);
+		ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
+
+		dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
+		    tx);
+		dmu_tx_commit(tx);
+	}
+
+	/*
+	 * Prefetch a random chunk of the big object.
+	 * Our aim here is to get some async reads in flight
+	 * for blocks that we may free below; the DMU should
+	 * handle this race correctly.
+	 */
+	n = ztest_random(regions) * stride + ztest_random(width);
+	s = 1 + ztest_random(2 * width - 1);
+	dmu_prefetch(os, dd.dd_bigobj, n * dd.dd_chunk, s * dd.dd_chunk);
+
+	/*
+	 * Pick a random index and compute the offsets into packobj and bigobj.
+	 */
+	n = ztest_random(regions) * stride + ztest_random(width);
+	s = 1 + ztest_random(width - 1);
+
+	packoff = n * sizeof (bufwad_t);
+	packsize = s * sizeof (bufwad_t);
+
+	bigoff = n * dd.dd_chunk;
+	bigsize = s * dd.dd_chunk;
+
+	packbuf = umem_alloc(packsize, UMEM_NOFAIL);
+	bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
+
+	/*
+	 * free_percent of the time, free a range of bigobj rather than
+	 * overwriting it.
+	 */
+	freeit = (ztest_random(100) < free_percent);
+
+	/*
+	 * Read the current contents of our objects.
+	 */
+	error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf);
+	ASSERT3U(error, ==, 0);
+	error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf);
+	ASSERT3U(error, ==, 0);
+
+	/*
+	 * Get a tx for the mods to both packobj and bigobj.
+	 */
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
+
+	if (freeit)
+		dmu_tx_hold_free(tx, dd.dd_bigobj, bigoff, bigsize);
+	else
+		dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
+
+	error = dmu_tx_assign(tx, TXG_WAIT);
+
+	if (error) {
+		ztest_record_enospc("dmu r/w range");
+		dmu_tx_abort(tx);
+		umem_free(packbuf, packsize);
+		umem_free(bigbuf, bigsize);
+		return;
+	}
+
+	txg = dmu_tx_get_txg(tx);
+
+	/*
+	 * For each index from n to n + s, verify that the existing bufwad
+	 * in packobj matches the bufwads at the head and tail of the
+	 * corresponding chunk in bigobj.  Then update all three bufwads
+	 * with the new values we want to write out.
+	 */
+	for (i = 0; i < s; i++) {
+		/* LINTED */
+		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
+		/* LINTED */
+		bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
+		/* LINTED */
+		bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
+
+		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
+		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
+
+		/*
+		 * Every argument consumed by a %llx conversion is cast to
+		 * u_longlong_t: 'i' is an int and the others are uint64_t,
+		 * neither of which is guaranteed to be passed the way
+		 * fatal()'s varargs formatter expects for %llx.
+		 */
+		if (pack->bw_txg > txg)
+			fatal(0, "future leak: got %llx, open txg is %llx",
+			    (u_longlong_t)pack->bw_txg, (u_longlong_t)txg);
+
+		if (pack->bw_data != 0 && pack->bw_index != n + i)
+			fatal(0, "wrong index: got %llx, wanted %llx+%llx",
+			    (u_longlong_t)pack->bw_index, (u_longlong_t)n,
+			    (u_longlong_t)i);
+
+		if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
+
+		if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
+
+		if (freeit) {
+			bzero(pack, sizeof (bufwad_t));
+		} else {
+			pack->bw_index = n + i;
+			pack->bw_txg = txg;
+			/* never 0, so a written bufwad differs from a freed one */
+			pack->bw_data = 1 + ztest_random(-2ULL);
+		}
+		*bigH = *pack;
+		*bigT = *pack;
+	}
+
+	/*
+	 * We've verified all the old bufwads, and made new ones.
+	 * Now write them out.
+	 */
+	dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
+
+	if (freeit) {
+		if (zopt_verbose >= 6) {
+			(void) printf("freeing offset %llx size %llx"
+			    " txg %llx\n",
+			    (u_longlong_t)bigoff,
+			    (u_longlong_t)bigsize,
+			    (u_longlong_t)txg);
+		}
+		VERIFY(0 == dmu_free_range(os, dd.dd_bigobj, bigoff,
+		    bigsize, tx));
+	} else {
+		if (zopt_verbose >= 6) {
+			(void) printf("writing offset %llx size %llx"
+			    " txg %llx\n",
+			    (u_longlong_t)bigoff,
+			    (u_longlong_t)bigsize,
+			    (u_longlong_t)txg);
+		}
+		dmu_write(os, dd.dd_bigobj, bigoff, bigsize, bigbuf, tx);
+	}
+
+	dmu_tx_commit(tx);
+
+	/*
+	 * Sanity check the stuff we just wrote.
+	 */
+	{
+		void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
+		void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
+
+		VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
+		    packsize, packcheck));
+		VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
+		    bigsize, bigcheck));
+
+		ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
+		ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
+
+		umem_free(packcheck, packsize);
+		umem_free(bigcheck, bigsize);
+	}
+
+	umem_free(packbuf, packsize);
+	umem_free(bigbuf, bigsize);
+}
+
+/*
+ * Check the block tag stored in the bonus buffer of ZTEST_DIROBJ in 'os'.
+ * If a write record is present, the txg it carries must not be later than
+ * 'txg'; a later txg is reported through fatal().
+ */
+void
+ztest_dmu_check_future_leak(objset_t *os, uint64_t txg)
+{
+	dmu_buf_t *db;
+	ztest_block_tag_t rbt;
+
+	if (zopt_verbose >= 3) {
+		char osname[MAXNAMELEN];
+
+		dmu_objset_name(os, osname);
+		(void) printf("checking %s for future leaks in txg %lld...\n",
+		    osname, (u_longlong_t)txg);
+	}
+
+	/*
+	 * Make sure that, if there is a write record in the bonus buffer
+	 * of the ZTEST_DIROBJ, the txg for this record is <= the
+	 * last synced txg of the pool.
+	 */
+	VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db));
+	ASSERT3U(db->db_size, ==, sizeof (rbt));
+	bcopy(db->db_data, &rbt, db->db_size);
+
+	/* A zero bt_objset is treated as "no write record present". */
+	if (rbt.bt_objset != 0) {
+		ASSERT3U(rbt.bt_objset, ==, dmu_objset_id(os));
+		ASSERT3U(rbt.bt_object, ==, ZTEST_DIROBJ);
+		ASSERT3U(rbt.bt_offset, ==, -1ULL);
+		if (rbt.bt_txg > txg) {
+			/* cast so the arguments match the %llx conversions */
+			fatal(0,
+			    "future leak: got %llx, last synced txg is %llx",
+			    (u_longlong_t)rbt.bt_txg, (u_longlong_t)txg);
+		}
+	}
+	dmu_buf_rele(db, FTAG);
+}
+
+void
+ztest_dmu_write_parallel(ztest_args_t *za)
+{
+ objset_t *os = za->za_os;
+ dmu_tx_t *tx;
+ dmu_buf_t *db;
+ int i, b, error, do_free, bs;
+ uint64_t off, txg_how, txg;
+ mutex_t *lp;
+ char osname[MAXNAMELEN];
+ char iobuf[SPA_MAXBLOCKSIZE];
+ ztest_block_tag_t rbt, wbt;
+
+ dmu_objset_name(os, osname);
+ bs = ZTEST_DIROBJ_BLOCKSIZE;
+
+ /*
+ * Have multiple threads write to large offsets in ZTEST_DIROBJ
+ * to verify that having multiple threads writing to the same object
+ * in parallel doesn't cause any trouble.
+ * Also do parallel writes to the bonus buffer on occasion.
+ *
+ * Each of the 50 passes picks a random sync-lock slot b and does
+ * one of three things: frees the block at the slot's offset,
+ * rewrites the bonus buffer with a new block tag, or writes a
+ * block tag into the block and then (when ztest_random(2) == 0)
+ * pushes it out with dmu_sync() and reads it back through
+ * zio_read() to compare against what was written.
+ */
+ for (i = 0; i < 50; i++) {
+ b = ztest_random(ZTEST_SYNC_LOCKS);
+ lp = &ztest_shared->zs_sync_lock[b];
+
+ do_free = (ztest_random(4) == 0);
+
+ off = za->za_diroff_shared + ((uint64_t)b << SPA_MAXBLOCKSHIFT);
+
+ if (ztest_random(4) == 0) {
+ /*
+ * Do the bonus buffer instead of a regular block.
+ */
+ do_free = 0;
+ off = -1ULL; /* sentinel tested below: bonus buffer, not a block offset */
+ }
+
+ tx = dmu_tx_create(os);
+
+ if (off == -1ULL)
+ dmu_tx_hold_bonus(tx, ZTEST_DIROBJ);
+ else if (do_free)
+ dmu_tx_hold_free(tx, ZTEST_DIROBJ, off, bs);
+ else
+ dmu_tx_hold_write(tx, ZTEST_DIROBJ, off, bs);
+
+ txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT;
+ error = dmu_tx_assign(tx, txg_how);
+ if (error) {
+ if (error == ERESTART) {
+ ASSERT(txg_how == TXG_NOWAIT);
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ continue; /* give up on this pass and start the next one */
+ }
+ dmu_tx_abort(tx);
+ ztest_record_enospc("dmu write parallel");
+ return;
+ }
+ txg = dmu_tx_get_txg(tx);
+
+ if (do_free) {
+ (void) mutex_lock(lp);
+ VERIFY(0 == dmu_free_range(os, ZTEST_DIROBJ, off,
+ bs, tx));
+ (void) mutex_unlock(lp);
+ dmu_tx_commit(tx);
+ continue;
+ }
+
+ wbt.bt_objset = dmu_objset_id(os);
+ wbt.bt_object = ZTEST_DIROBJ;
+ wbt.bt_offset = off;
+ wbt.bt_txg = txg;
+ wbt.bt_thread = za->za_instance;
+
+ if (off == -1ULL) {
+ wbt.bt_seq = 0;
+ VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ,
+ FTAG, &db));
+ ASSERT3U(db->db_size, ==, sizeof (wbt));
+ bcopy(db->db_data, &rbt, db->db_size);
+ if (rbt.bt_objset != 0) {
+ ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
+ ASSERT3U(rbt.bt_object, ==, wbt.bt_object);
+ ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset);
+ ASSERT3U(rbt.bt_txg, <=, wbt.bt_txg);
+ }
+ dmu_buf_will_dirty(db, tx);
+ bcopy(&wbt, db->db_data, db->db_size);
+ dmu_buf_rele(db, FTAG);
+ dmu_tx_commit(tx);
+ continue;
+ }
+
+ (void) mutex_lock(lp);
+
+ wbt.bt_seq = ztest_shared->zs_seq[b]++; /* per-slot sequence number, bumped under lp */
+
+ dmu_write(os, ZTEST_DIROBJ, off, sizeof (wbt), &wbt, tx);
+
+ (void) mutex_unlock(lp);
+
+ if (ztest_random(100) == 0)
+ (void) poll(NULL, 0, 1); /* open dn_notxholds window */
+
+ dmu_tx_commit(tx);
+
+ if (ztest_random(1000) == 0)
+ txg_wait_synced(dmu_objset_pool(os), txg);
+
+ if (ztest_random(2) == 0) {
+ blkptr_t blk = { 0 };
+ uint64_t blkoff;
+ zbookmark_t zb;
+
+ (void) mutex_lock(lp);
+ blkoff = P2ALIGN_TYPED(off, bs, uint64_t);
+ error = dmu_buf_hold(os,
+ ZTEST_DIROBJ, blkoff, FTAG, &db);
+ if (error) {
+ dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n",
+ osname, ZTEST_DIROBJ, blkoff, error);
+ (void) mutex_unlock(lp);
+ continue;
+ }
+ blkoff = off - blkoff; /* from here on: offset of the tag within its block */
+ error = dmu_sync(NULL, db, &blk, txg, NULL, NULL);
+ dmu_buf_rele(db, FTAG);
+ (void) mutex_unlock(lp);
+ if (error) {
+ dprintf("dmu_sync(%s, %d, %llx) = %d\n",
+ osname, ZTEST_DIROBJ, off, error);
+ continue;
+ }
+
+ if (blk.blk_birth == 0) { /* concurrent free */
+ continue;
+ }
+ txg_suspend(dmu_objset_pool(os));
+
+ ASSERT(blk.blk_fill == 1);
+ ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
+ ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
+ ASSERT3U(BP_GET_LSIZE(&blk), ==, bs);
+
+ /*
+ * Read the block that dmu_sync() returned to
+ * make sure its contents match what we wrote.
+ * We do this while still txg_suspend()ed to ensure
+ * that the block can't be reused before we read it.
+ */
+ zb.zb_objset = dmu_objset_id(os);
+ zb.zb_object = ZTEST_DIROBJ;
+ zb.zb_level = 0;
+ zb.zb_blkid = off / bs;
+ error = zio_wait(zio_read(NULL, dmu_objset_spa(os),
+ &blk, iobuf, bs, NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb));
+ ASSERT(error == 0);
+
+ txg_resume(dmu_objset_pool(os));
+
+ bcopy(&iobuf[blkoff], &rbt, sizeof (rbt));
+
+ if (rbt.bt_objset == 0) /* concurrent free */
+ continue;
+
+ ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
+ ASSERT3U(rbt.bt_object, ==, wbt.bt_object);
+ ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset);
+
+ /*
+ * The semantic of dmu_sync() is that we always
+ * push the most recent version of the data,
+ * so in the face of concurrent updates we may
+ * see a newer version of the block. That's OK.
+ */
+ ASSERT3U(rbt.bt_txg, >=, wbt.bt_txg);
+ if (rbt.bt_thread == wbt.bt_thread)
+ ASSERT3U(rbt.bt_seq, ==, wbt.bt_seq);
+ else
+ ASSERT3U(rbt.bt_seq, >, wbt.bt_seq);
+ }
+ }
+}
+
+/*
+ * Verify that zap_{create,destroy,add,remove,update} work as expected.
+ */
+#define ZTEST_ZAP_MIN_INTS 1 /* lower bound on the integer count of a prop_* entry */
+#define ZTEST_ZAP_MAX_INTS 4 /* size of ztest_zap()'s value[] and modulus applied to the object number */
+#define ZTEST_ZAP_MAX_PROPS 1000 /* property numbers are drawn with ztest_random() of this range */
+
+/*
+ * Exercise zap_create(), zap_add(), zap_length(), zap_lookup(),
+ * zap_update(), zap_remove() and zap_destroy() on a ZAP object private to
+ * this test slot.
+ *
+ * The object number is kept in ZTEST_DIROBJ at za->za_diroff; 0 there
+ * means the object has to be created first.  A failed dmu_tx_assign() is
+ * handed to ztest_record_enospc() and ends the call early; unexpected ZAP
+ * errors are reported through fatal().
+ */
+void
+ztest_zap(ztest_args_t *za)
+{
+	objset_t *os = za->za_os;
+	uint64_t object;
+	uint64_t txg, last_txg;
+	uint64_t value[ZTEST_ZAP_MAX_INTS];
+	uint64_t zl_ints, zl_intsize, prop;
+	int i, ints;
+	int iters = 100;
+	dmu_tx_t *tx;
+	char propname[100], txgname[100];
+	int error;
+	char osname[MAXNAMELEN];
+	char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
+
+	dmu_objset_name(os, osname);
+
+	/*
+	 * Create a new object if necessary, and record it in the directory.
+	 */
+	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
+	    sizeof (uint64_t), &object));
+
+	if (object == 0) {
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
+		    sizeof (uint64_t));
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("create zap test obj");
+			dmu_tx_abort(tx);
+			return;
+		}
+
+		/*
+		 * zap_create() hands back the new object number rather than
+		 * an error code, so success is checked on 'object' itself.
+		 * ('error' is known to be 0 here; testing it again after
+		 * the call could never fire.)
+		 */
+		object = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx);
+		ASSERT(object != 0);
+		dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
+		    sizeof (uint64_t), &object, tx);
+		/*
+		 * Generate a known hash collision, and verify that
+		 * we can lookup and remove both entries.
+		 */
+		for (i = 0; i < 2; i++) {
+			value[i] = i;
+			error = zap_add(os, object, hc[i], sizeof (uint64_t),
+			    1, &value[i], tx);
+			ASSERT3U(error, ==, 0);
+		}
+		for (i = 0; i < 2; i++) {
+			error = zap_add(os, object, hc[i], sizeof (uint64_t),
+			    1, &value[i], tx);
+			ASSERT3U(error, ==, EEXIST);
+			error = zap_length(os, object, hc[i],
+			    &zl_intsize, &zl_ints);
+			ASSERT3U(error, ==, 0);
+			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+			ASSERT3U(zl_ints, ==, 1);
+		}
+		for (i = 0; i < 2; i++) {
+			error = zap_remove(os, object, hc[i], tx);
+			ASSERT3U(error, ==, 0);
+		}
+
+		dmu_tx_commit(tx);
+	}
+
+	/* The number of integers per prop_* entry is fixed per object. */
+	ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
+
+	while (--iters >= 0) {
+		prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
+		(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
+		(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
+		bzero(value, sizeof (value));
+		last_txg = 0;
+
+		/*
+		 * If these zap entries already exist, validate their contents.
+		 */
+		error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
+		if (error == 0) {
+			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+			ASSERT3U(zl_ints, ==, 1);
+
+			error = zap_lookup(os, object, txgname, zl_intsize,
+			    zl_ints, &last_txg);
+
+			ASSERT3U(error, ==, 0);
+
+			error = zap_length(os, object, propname, &zl_intsize,
+			    &zl_ints);
+
+			ASSERT3U(error, ==, 0);
+			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+			ASSERT3U(zl_ints, ==, ints);
+
+			error = zap_lookup(os, object, propname, zl_intsize,
+			    zl_ints, value);
+
+			ASSERT3U(error, ==, 0);
+
+			for (i = 0; i < ints; i++) {
+				ASSERT3U(value[i], ==, last_txg + object + i);
+			}
+		} else {
+			ASSERT3U(error, ==, ENOENT);
+		}
+
+		/*
+		 * Atomically update two entries in our zap object.
+		 * The first is named txg_%llu, and contains the txg
+		 * in which the property was last updated.  The second
+		 * is named prop_%llu, and the nth element of its value
+		 * should be txg + object + n.
+		 */
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_zap(tx, object, TRUE, NULL);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("create zap entry");
+			dmu_tx_abort(tx);
+			return;
+		}
+		txg = dmu_tx_get_txg(tx);
+
+		if (last_txg > txg)
+			fatal(0, "zap future leak: old %llu new %llu",
+			    (u_longlong_t)last_txg, (u_longlong_t)txg);
+
+		for (i = 0; i < ints; i++)
+			value[i] = txg + object + i;
+
+		error = zap_update(os, object, txgname, sizeof (uint64_t),
+		    1, &txg, tx);
+		if (error)
+			fatal(0, "zap_update('%s', %llu, '%s') = %d",
+			    osname, (u_longlong_t)object, txgname, error);
+
+		error = zap_update(os, object, propname, sizeof (uint64_t),
+		    ints, value, tx);
+		if (error)
+			fatal(0, "zap_update('%s', %llu, '%s') = %d",
+			    osname, (u_longlong_t)object, propname, error);
+
+		dmu_tx_commit(tx);
+
+		/*
+		 * Remove a random pair of entries.
+		 */
+		prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
+		(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
+		(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
+
+		error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
+
+		if (error == ENOENT)
+			continue;
+
+		ASSERT3U(error, ==, 0);
+
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_zap(tx, object, TRUE, NULL);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("remove zap entry");
+			dmu_tx_abort(tx);
+			return;
+		}
+		error = zap_remove(os, object, txgname, tx);
+		if (error)
+			fatal(0, "zap_remove('%s', %llu, '%s') = %d",
+			    osname, (u_longlong_t)object, txgname, error);
+
+		error = zap_remove(os, object, propname, tx);
+		if (error)
+			fatal(0, "zap_remove('%s', %llu, '%s') = %d",
+			    osname, (u_longlong_t)object, propname, error);
+
+		dmu_tx_commit(tx);
+	}
+
+	/*
+	 * Once in a while, destroy the object.
+	 */
+	if (ztest_random(100) != 0)
+		return;
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
+	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		ztest_record_enospc("destroy zap object");
+		dmu_tx_abort(tx);
+		return;
+	}
+	error = zap_destroy(os, object, tx);
+	if (error)
+		fatal(0, "zap_destroy('%s', %llu) = %d",
+		    osname, (u_longlong_t)object, error);
+
+	/* Record 0 in the directory so the next call creates a new object. */
+	object = 0;
+	dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
+	    &object, tx);
+	dmu_tx_commit(tx);
+}
+
+void
+ztest_zap_parallel(ztest_args_t *za)
+{
+ objset_t *os = za->za_os;
+ uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
+ int iters = 100;
+ dmu_tx_t *tx;
+ int i, namelen, error;
+ char name[20], string_value[20];
+ void *data;
+
+ while (--iters >= 0) { /* 100 random operations per call on the two shared ZAP objects */
+ /*
+ * Generate a random name of the form 'xxx.....' where each
+ * x is a random printable character and the dots are dots.
+ * There are 94 such characters, and the name length goes from
+ * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
+ * (namelen counts the terminating NUL, which is why it can
+ * reach sizeof (name).)
+ */
+ namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
+
+ for (i = 0; i < 3; i++)
+ name[i] = '!' + ztest_random('~' - '!' + 1);
+ for (; i < namelen - 1; i++)
+ name[i] = '.';
+ name[i] = '\0';
+
+ if (ztest_random(2) == 0)
+ object = ZTEST_MICROZAP_OBJ;
+ else
+ object = ZTEST_FATZAP_OBJ;
+
+ if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) {
+ wsize = sizeof (txg);
+ wc = 1;
+ data = &txg; /* value is one 64-bit word: the txg */
+ } else {
+ wsize = 1;
+ wc = namelen;
+ data = string_value; /* value is namelen single bytes: a copy of the name */
+ }
+
+ count = -1ULL;
+ VERIFY(zap_count(os, object, &count) == 0);
+ ASSERT(count != -1ULL);
+
+ /*
+ * Select an operation: length, lookup, add, update, remove.
+ * The last three (i >= 2) modify the object and so run inside
+ * an assigned tx; length and lookup run with tx == NULL.
+ */
+ i = ztest_random(5);
+
+ if (i >= 2) {
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, object, TRUE, NULL);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ ztest_record_enospc("zap parallel");
+ dmu_tx_abort(tx);
+ return;
+ }
+ txg = dmu_tx_get_txg(tx);
+ bcopy(name, string_value, namelen);
+ } else {
+ tx = NULL;
+ txg = 0;
+ bzero(string_value, namelen);
+ }
+
+ switch (i) {
+
+ case 0:
+ error = zap_length(os, object, name, &zl_wsize, &zl_wc);
+ if (error == 0) {
+ ASSERT3U(wsize, ==, zl_wsize);
+ ASSERT3U(wc, ==, zl_wc);
+ } else {
+ ASSERT3U(error, ==, ENOENT);
+ }
+ break;
+
+ case 1:
+ error = zap_lookup(os, object, name, wsize, wc, data);
+ if (error == 0) {
+ if (data == string_value &&
+ bcmp(name, data, namelen) != 0)
+ fatal(0, "name '%s' != val '%s' len %d",
+ name, data, namelen);
+ } else {
+ ASSERT3U(error, ==, ENOENT);
+ }
+ break;
+
+ case 2:
+ error = zap_add(os, object, name, wsize, wc, data, tx);
+ ASSERT(error == 0 || error == EEXIST);
+ break;
+
+ case 3:
+ VERIFY(zap_update(os, object, name, wsize, wc,
+ data, tx) == 0);
+ break;
+
+ case 4:
+ error = zap_remove(os, object, name, tx);
+ ASSERT(error == 0 || error == ENOENT);
+ break;
+ }
+
+ if (tx != NULL)
+ dmu_tx_commit(tx);
+ }
+}
+
+/*
+ * Set the "checksum" and then the "compression" property of za->za_os to
+ * a randomly chosen value (or back to inherited), read each one back with
+ * dsl_prop_get(), and optionally print the result.  The whole sequence
+ * runs with zs_name_lock held for reading.
+ */
+void
+ztest_dsl_prop_get_set(ztest_args_t *za)
+{
+	objset_t *os = za->za_os;
+	char setpoint[MAXPATHLEN];
+	char osname[MAXNAMELEN];
+	const char *prop, *valname;
+	uint64_t value;
+	int error, inherit, i;
+
+	(void) rw_rdlock(&ztest_shared->zs_name_lock);
+
+	dmu_objset_name(os, osname);
+
+	/* Pass 0 handles "checksum", pass 1 handles "compression". */
+	for (i = 0; i < 2; i++) {
+		switch (i) {
+		case 0:
+			prop = "checksum";
+			value = ztest_random_checksum();
+			inherit = (value == ZIO_CHECKSUM_INHERIT);
+			break;
+		default:
+			prop = "compression";
+			value = ztest_random_compress();
+			inherit = (value == ZIO_COMPRESS_INHERIT);
+			break;
+		}
+
+		/* When inheriting, the property is set with zero integers. */
+		error = dsl_prop_set(osname, prop, sizeof (value),
+		    !inherit, &value);
+
+		/* Out of space: note it and stop, but still drop the lock. */
+		if (error == ENOSPC) {
+			ztest_record_enospc("dsl_prop_set");
+			break;
+		}
+
+		ASSERT3U(error, ==, 0);
+
+		VERIFY3U(dsl_prop_get(osname, prop, sizeof (value),
+		    1, &value, setpoint), ==, 0);
+
+		valname = (i == 0) ?
+		    zio_checksum_table[value].ci_name :
+		    zio_compress_table[value].ci_name;
+
+		if (zopt_verbose >= 6) {
+			(void) printf("%s %s = %s for '%s'\n",
+			    osname, prop, valname, setpoint);
+		}
+	}
+
+	(void) rw_unlock(&ztest_shared->zs_name_lock);
+}
+
+/*
+ * Walk the vdev tree rooted at 'vd', children first, and store the given
+ * fault-injection mode, mask and argument in every vdev that has a path.
+ */
+static void
+ztest_error_setup(vdev_t *vd, int mode, int mask, uint64_t arg)
+{
+	int child = 0;
+
+	while (child < vd->vdev_children) {
+		ztest_error_setup(vd->vdev_child[child], mode, mask, arg);
+		child++;
+	}
+
+	/* vdevs without a path are left untouched */
+	if (vd->vdev_path == NULL)
+		return;
+
+	vd->vdev_fault_mode = mode;
+	vd->vdev_fault_mask = mask;
+	vd->vdev_fault_arg = arg;
+}
+
+/*
+ * Inject random faults into the on-disk data.
+ */
+void
+ztest_fault_inject(ztest_args_t *za)
+{
+	int fd;
+	uint64_t offset;
+	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
+	uint64_t bad = 0x1990c0ffeedecadeULL;
+	uint64_t top, leaf;
+	char path0[MAXPATHLEN];
+	char pathrand[MAXPATHLEN];
+	size_t fsize;
+	spa_t *spa = dmu_objset_spa(za->za_os);
+	int bshift = SPA_MAXBLOCKSHIFT + 2;	/* don't scrog all labels */
+	int iters = 1000;
+	vdev_t *vd0;
+	uint64_t guid0 = 0;
+
+	/*
+	 * We can't inject faults when we have no fault tolerance.
+	 */
+	if (zopt_maxfaults == 0)
+		return;
+
+	ASSERT(leaves >= 2);
+
+	/*
+	 * Pick a random top-level vdev.
+	 */
+	spa_config_enter(spa, RW_READER, FTAG);
+	top = ztest_random(spa->spa_root_vdev->vdev_children);
+	spa_config_exit(spa, FTAG);
+
+	/*
+	 * Pick a random leaf.
+	 */
+	leaf = ztest_random(leaves);
+
+	/*
+	 * Generate paths to leaf 0 of this top-level vdev and to the
+	 * random leaf we selected.  We'll induce transient I/O errors
+	 * and random online/offline activity on leaf 0, and we'll write
+	 * random garbage to the randomly chosen leaf.
+	 */
+	(void) snprintf(path0, sizeof (path0),
+	    ztest_dev_template, zopt_dir, zopt_pool, top * leaves + 0);
+	(void) snprintf(pathrand, sizeof (pathrand),
+	    ztest_dev_template, zopt_dir, zopt_pool, top * leaves + leaf);
+
+	dprintf("damaging %s and %s\n", path0, pathrand);
+
+	spa_config_enter(spa, RW_READER, FTAG);
+
+	/*
+	 * If we can tolerate two or more faults, make vd0 fail randomly.
+	 */
+	vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
+	if (vd0 != NULL && zopt_maxfaults >= 2) {
+		guid0 = vd0->vdev_guid;
+		ztest_error_setup(vd0, VDEV_FAULT_COUNT,
+		    (1U << ZIO_TYPE_READ) | (1U << ZIO_TYPE_WRITE), 100);
+	}
+
+	spa_config_exit(spa, FTAG);
+
+	/*
+	 * If we can tolerate two or more faults, randomly online/offline vd0.
+	 */
+	if (zopt_maxfaults >= 2 && guid0 != 0) {
+		if (ztest_random(10) < 6)
+			(void) vdev_offline(spa, guid0, B_TRUE);
+		else
+			(void) vdev_online(spa, guid0);
+	}
+
+	/*
+	 * We have at least single-fault tolerance, so inject data corruption.
+	 */
+	fd = open(pathrand, O_RDWR);
+
+	if (fd == -1)	/* we hit a gap in the device namespace */
+		return;
+
+	/*
+	 * lseek() reports failure as -1, which would otherwise be stored
+	 * in the unsigned fsize as an enormous bogus file size.  Treat it
+	 * like a device we could not open.
+	 */
+	fsize = lseek(fd, 0, SEEK_END);
+	if (fsize == (size_t)-1) {
+		(void) close(fd);
+		return;
+	}
+
+	while (--iters != 0) {
+		/*
+		 * Choose an 8-byte-aligned offset in the first half of one
+		 * of this leaf's (1 << bshift)-sized slots.
+		 */
+		offset = ztest_random(fsize / (leaves << bshift)) *
+		    (leaves << bshift) + (leaf << bshift) +
+		    (ztest_random(1ULL << (bshift - 1)) & -8ULL);
+
+		if (offset >= fsize)
+			continue;
+
+		if (zopt_verbose >= 6)
+			(void) printf("injecting bad word into %s,"
+			    " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
+
+		if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
+			fatal(1, "can't inject bad word at 0x%llx in %s",
+			    (u_longlong_t)offset, pathrand);
+	}
+
+	(void) close(fd);
+}
+
+/*
+ * Scrub the pool.
+ */
+/*
+ * Request a scrub of everything in the pool behind za->za_os, then request
+ * it again one second later.  The results of spa_scrub() are ignored.
+ */
+void
+ztest_scrub(ztest_args_t *za)
+{
+	spa_t *pool = dmu_objset_spa(za->za_os);
+	int pass;
+
+	for (pass = 0; pass < 2; pass++) {
+		(void) spa_scrub(pool, POOL_SCRUB_EVERYTHING, B_FALSE);
+
+		/* wait a second, then force a restart */
+		if (pass == 0)
+			(void) poll(NULL, 0, 1000);
+	}
+}
+
+/*
+ * Rename the pool to a different name and then rename it back.
+ */
+/*
+ * Rename the pool za->za_pool to "<name>_tmp" and back again, checking
+ * after each step which name it can be opened under and that it is still
+ * the spa_t behind za->za_os.  zs_name_lock is held for writing throughout.
+ */
+void
+ztest_spa_rename(ztest_args_t *za)
+{
+	char *oldname, *newname;
+	size_t newsize;
+	spa_t *spa;
+	int error;
+
+	(void) rw_wrlock(&ztest_shared->zs_name_lock);
+
+	/* Room for the old name, the four-character suffix and the NUL. */
+	oldname = za->za_pool;
+	newsize = strlen(oldname) + 5;
+	newname = umem_alloc(newsize, UMEM_NOFAIL);
+	(void) snprintf(newname, newsize, "%s_tmp", oldname);
+
+	/* Step 1: rename to the temporary name. */
+	if ((error = spa_rename(oldname, newname)) != 0)
+		fatal(0, "spa_rename('%s', '%s') = %d", oldname,
+		    newname, error);
+
+	/* The old name must no longer resolve. */
+	if ((error = spa_open(oldname, &spa, FTAG)) != ENOENT)
+		fatal(0, "spa_open('%s') = %d", oldname, error);
+
+	/* The new name must resolve to the very same spa_t. */
+	if ((error = spa_open(newname, &spa, FTAG)) != 0)
+		fatal(0, "spa_open('%s') = %d", newname, error);
+
+	ASSERT(spa == dmu_objset_spa(za->za_os));
+	spa_close(spa, FTAG);
+
+	/* Step 2: rename back to the original name. */
+	if ((error = spa_rename(newname, oldname)) != 0)
+		fatal(0, "spa_rename('%s', '%s') = %d", newname,
+		    oldname, error);
+
+	/* And the original name must work again. */
+	if ((error = spa_open(oldname, &spa, FTAG)) != 0)
+		fatal(0, "spa_open('%s') = %d", oldname, error);
+
+	ASSERT(spa == dmu_objset_spa(za->za_os));
+	spa_close(spa, FTAG);
+
+	umem_free(newname, newsize);
+
+	(void) rw_unlock(&ztest_shared->zs_name_lock);
+}
+
+
+/*
+ * Completely obliterate one disk.
+ */
+static void
+ztest_obliterate_one_disk(uint64_t vdev)
+{
+	int fd;
+	char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN];
+	size_t fsize;
+
+	/* Nothing is done unless at least two faults can be tolerated. */
+	if (zopt_maxfaults < 2)
+		return;
+
+	/* Bounded formatting: the expanded template must not overrun dev_name. */
+	(void) snprintf(dev_name, sizeof (dev_name), ztest_dev_template,
+	    zopt_dir, zopt_pool, vdev);
+	(void) snprintf(copy_name, MAXPATHLEN, "%s.old", dev_name);
+
+	fd = open(dev_name, O_RDWR);
+
+	if (fd == -1)
+		fatal(1, "can't open %s", dev_name);
+
+	/*
+	 * Determine the size.
+	 */
+	fsize = lseek(fd, 0, SEEK_END);
+
+	(void) close(fd);
+
+	/*
+	 * Rename the old device to dev_name.old (useful for debugging).
+	 */
+	VERIFY(rename(dev_name, copy_name) == 0);
+
+	/*
+	 * Create a new one of the same size.
+	 */
+	VERIFY((fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666)) >= 0);
+	VERIFY(ftruncate(fd, fsize) == 0);
+	(void) close(fd);
+}
+
+/*
+ * Ask the pool to attach device number 'vdev' in place of the vdev that
+ * currently has the same path (spa_vdev_attach() with its last argument
+ * B_TRUE).  EBUSY, ENOTSUP, ENODEV and EDOM are accepted outcomes; any
+ * other error is reported through fatal().
+ */
+static void
+ztest_replace_one_disk(spa_t *spa, uint64_t vdev)
+{
+	char dev_name[MAXPATHLEN];
+	nvlist_t *file, *root;
+	int error;
+	uint64_t guid;
+	uint64_t ashift = ztest_get_ashift();
+	vdev_t *vd;
+
+	/* Bounded formatting: the expanded template must not overrun dev_name. */
+	(void) snprintf(dev_name, sizeof (dev_name), ztest_dev_template,
+	    zopt_dir, zopt_pool, vdev);
+
+	/*
+	 * Build the nvlist describing dev_name.
+	 */
+	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
+	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, dev_name) == 0);
+	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
+
+	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
+	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
+	    &file, 1) == 0);
+
+	/* Look up the guid of the existing vdev; 0 if the path is unknown. */
+	spa_config_enter(spa, RW_READER, FTAG);
+	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, dev_name)) == NULL)
+		guid = 0;
+	else
+		guid = vd->vdev_guid;
+	spa_config_exit(spa, FTAG);
+
+	error = spa_vdev_attach(spa, guid, root, B_TRUE);
+	if (error != 0 &&
+	    error != EBUSY &&
+	    error != ENOTSUP &&
+	    error != ENODEV &&
+	    error != EDOM)
+		fatal(0, "spa_vdev_attach(in-place) = %d", error);
+
+	nvlist_free(file);
+	nvlist_free(root);
+}
+
+/*
+ * Run "zdb -bc" on 'pool' through popen(), echo its output when
+ * zopt_verbose >= 3, and report a non-zero exit status or a fatal signal
+ * through fatal().
+ */
+static void
+ztest_verify_blocks(char *pool)
+{
+	int status;
+	char zdb[MAXPATHLEN + MAXNAMELEN + 20];
+	char zbuf[1024];
+	char *bin;
+	FILE *fp;
+
+	if (realpath(progname, zdb) == NULL)
+		assert(!"realpath() failed");
+
+	/* zdb lives in /usr/sbin, while ztest lives in /usr/bin */
+	bin = strstr(zdb, "/usr/bin/");
+	if (bin == NULL)
+		bin = zdb;
+
+	/*
+	 * Overwrite the resolved path from 'bin' onward with the zdb
+	 * command line, limited to the space left in zdb[] so that a long
+	 * path or pool name cannot write past the end of the buffer.
+	 */
+	(void) snprintf(bin, sizeof (zdb) - (size_t)(bin - zdb),
+	    "/usr/sbin/zdb -bc%s%s -U -O %s %s",
+	    zopt_verbose >= 3 ? "s" : "",
+	    zopt_verbose >= 4 ? "v" : "",
+	    ztest_random(2) == 0 ? "pre" : "post", pool);
+
+	if (zopt_verbose >= 5)
+		(void) printf("Executing %s\n", strstr(zdb, "zdb "));
+
+	fp = popen(zdb, "r");
+	assert(fp != NULL);
+
+	/* Drain zdb's output; it is only echoed when verbose enough. */
+	while (fgets(zbuf, sizeof (zbuf), fp) != NULL)
+		if (zopt_verbose >= 3)
+			(void) printf("%s", zbuf);
+
+	status = pclose(fp);
+
+	if (status == 0)
+		return;
+
+	ztest_dump_core = 0;
+	if (WIFEXITED(status))
+		fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status));
+	else
+		fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status));
+}
+
+/*
+ * Walk every pool returned by spa_next() while holding
+ * spa_namespace_lock.  At zopt_verbose >= 6 the given header and each
+ * pool name are printed; otherwise the walk is silent.
+ */
+static void
+ztest_walk_pool_directory(char *header)
+{
+	spa_t *spa;
+
+	if (zopt_verbose >= 6)
+		(void) printf("%s\n", header);
+
+	mutex_enter(&spa_namespace_lock);
+	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
+		if (zopt_verbose >= 6)
+			(void) printf("\t%s\n", spa_name(spa));
+	}
+	mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * Export the pool 'oldname' and import it again as 'newname', checking
+ * along the way that a second import fails with EEXIST, that the old name
+ * is gone, and that the pool opened under the new name has the same guid.
+ * Any existing pool called 'newname' is destroyed first.  Every unexpected
+ * result is reported through fatal().
+ */
+static void
+ztest_spa_import_export(char *oldname, char *newname)
+{
+	nvlist_t *config;
+	uint64_t pool_guid;
+	spa_t *spa;
+	int error;
+
+	if (zopt_verbose >= 4) {
+		(void) printf("import/export: old = %s, new = %s\n",
+		    oldname, newname);
+	}
+
+	/*
+	 * Clean up from previous runs.
+	 */
+	(void) spa_destroy(newname);
+
+	/*
+	 * Get the pool's configuration and guid.
+	 */
+	error = spa_open(oldname, &spa, FTAG);
+	if (error)
+		fatal(0, "spa_open('%s') = %d", oldname, error);
+
+	pool_guid = spa_guid(spa);
+	spa_close(spa, FTAG);
+
+	ztest_walk_pool_directory("pools before export");
+
+	/*
+	 * Export it.
+	 */
+	error = spa_export(oldname, &config);
+	if (error)
+		fatal(0, "spa_export('%s') = %d", oldname, error);
+
+	ztest_walk_pool_directory("pools after export");
+
+	/*
+	 * Import it under the new name.
+	 */
+	error = spa_import(newname, config, NULL);
+	if (error)
+		fatal(0, "spa_import('%s') = %d", newname, error);
+
+	ztest_walk_pool_directory("pools after import");
+
+	/*
+	 * Try to import it again -- should fail with EEXIST.
+	 */
+	error = spa_import(newname, config, NULL);
+	if (error != EEXIST)
+		fatal(0, "spa_import('%s') twice", newname);
+
+	/*
+	 * Try to import it under a different name -- should fail with EEXIST.
+	 * The message names oldname because that is the name this import
+	 * was attempted with.
+	 */
+	error = spa_import(oldname, config, NULL);
+	if (error != EEXIST)
+		fatal(0, "spa_import('%s') under multiple names", oldname);
+
+	/*
+	 * Verify that the pool is no longer visible under the old name.
+	 * Again, report the name that was actually passed to spa_open().
+	 */
+	error = spa_open(oldname, &spa, FTAG);
+	if (error != ENOENT)
+		fatal(0, "spa_open('%s') = %d", oldname, error);
+
+	/*
+	 * Verify that we can open and close the pool using the new name.
+	 */
+	error = spa_open(newname, &spa, FTAG);
+	if (error)
+		fatal(0, "spa_open('%s') = %d", newname, error);
+	ASSERT(pool_guid == spa_guid(spa));
+	spa_close(spa, FTAG);
+
+	nvlist_free(config);
+}
+
+/*
+ * Body of each worker thread started by ztest_run().
+ *
+ * 'arg' is the thread's ztest_args_t.  Until za_stop is reached the thread
+ * repeatedly picks a random entry of zs_info[], skips it if that function
+ * is already ahead of its requested call rate, and otherwise runs
+ * ztest_dmu_write_parallel() followed by the function itself, accounting
+ * the call count and elapsed time in the shared ztest_info_t.
+ *
+ * Once za_kill has passed, the thread records the current txg and the
+ * pool's alloc/space figures in the shared area and sends SIGKILL to its
+ * own process.
+ *
+ * The loop also ends early once zs_enospc_count exceeds 10.
+ * Always returns NULL.
+ */
+static void *
+ztest_thread(void *arg)
+{
+	ztest_args_t *za = arg;
+	ztest_shared_t *zs = ztest_shared;
+	hrtime_t now, functime;
+	ztest_info_t *zi;
+	int f;
+
+	while ((now = gethrtime()) < za->za_stop) {
+		/*
+		 * See if it's time to force a crash.
+		 * spa_namespace_lock is taken here and not released on
+		 * this path: the SIGKILL below is aimed at our own process.
+		 */
+		if (now > za->za_kill) {
+			dmu_tx_t *tx;
+			uint64_t txg;
+
+			mutex_enter(&spa_namespace_lock);
+			tx = dmu_tx_create(za->za_os);
+			VERIFY(0 == dmu_tx_assign(tx, TXG_NOWAIT));
+			txg = dmu_tx_get_txg(tx);
+			dmu_tx_commit(tx);
+			zs->zs_txg = txg;
+			if (zopt_verbose >= 3)
+				(void) printf(
+				    "killing process after txg %lld\n",
+				    (u_longlong_t)txg);
+			txg_wait_synced(dmu_objset_pool(za->za_os), txg);
+			zs->zs_alloc = spa_get_alloc(dmu_objset_spa(za->za_os));
+			zs->zs_space = spa_get_space(dmu_objset_spa(za->za_os));
+			(void) kill(getpid(), SIGKILL);
+		}
+
+		/*
+		 * Pick a random function.
+		 */
+		f = ztest_random(ZTEST_FUNCS);
+		zi = &zs->zs_info[f];
+
+		/*
+		 * Decide whether to call it, based on the requested frequency:
+		 * skip it when the fraction of its call target already used
+		 * exceeds the fraction of the total run time that has elapsed.
+		 */
+		if (zi->zi_call_target == 0 ||
+		    (double)zi->zi_call_total / zi->zi_call_target >
+		    (double)(now - zs->zs_start_time) / (zopt_time * NANOSEC))
+			continue;
+
+		atomic_add_64(&zi->zi_calls, 1);
+		atomic_add_64(&zi->zi_call_total, 1);
+
+		za->za_diroff = (za->za_instance * ZTEST_FUNCS + f) *
+		    ZTEST_DIRSIZE;
+		za->za_diroff_shared = (1ULL << 63);
+
+		ztest_dmu_write_parallel(za);
+
+		zi->zi_func(za);
+
+		functime = gethrtime() - now;
+
+		atomic_add_64(&zi->zi_call_time, functime);
+
+		if (zopt_verbose >= 4) {
+			Dl_info dli;
+			const char *fname = "(unknown)";
+
+			/*
+			 * dladdr() can fail outright, or succeed without
+			 * finding a symbol; only use dli_sname when it is
+			 * known to be a valid string.
+			 */
+			if (dladdr((void *)zi->zi_func, &dli) != 0 &&
+			    dli.dli_sname != NULL)
+				fname = dli.dli_sname;
+			(void) printf("%6.2f sec in %s\n",
+			    (double)functime / NANOSEC, fname);
+		}
+
+		/*
+		 * If we're getting ENOSPC with some regularity, stop.
+		 */
+		if (zs->zs_enospc_count > 10)
+			break;
+	}
+
+	return (NULL);
+}
+
+/*
+ * Kick off threads to run tests on all datasets in parallel.
+ *
+ * This is one complete pass over the pool named 'pool'; main() calls it
+ * in a forked child.  In order, it:
+ *   - initializes the locks kept in the shared ztest_shared_t;
+ *   - obliterates one disk, verifies the pool's blocks, then replaces
+ *     that disk;
+ *   - optionally exports the pool and imports it under another name,
+ *     and back again;
+ *   - probes dmu_object_info() on the meta objset around each power of 2;
+ *   - starts up to zopt_threads threads running ztest_thread(), spread
+ *     over zopt_datasets datasets, and waits for all of them;
+ *   - records the pool's alloc/space figures in the shared area;
+ *   - if any ENOSPC was counted, destroys one randomly chosen dataset;
+ *   - issues prefetches on the meta objset and closes the pool.
+ * Unexpected errors are reported through fatal().
+ */
+static void
+ztest_run(char *pool)
+{
+ int t, d, error;
+ ztest_shared_t *zs = ztest_shared;
+ ztest_args_t *za;
+ spa_t *spa;
+ char name[100]; /* scratch buffer for pool and dataset names */
+
+ (void) _mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL);
+ (void) rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL);
+
+ for (t = 0; t < ZTEST_SYNC_LOCKS; t++)
+ (void) _mutex_init(&zs->zs_sync_lock[t], USYNC_THREAD, NULL);
+
+ /*
+ * Destroy one disk before we even start.
+ * It's mirrored, so everything should work just fine.
+ * This makes us exercise fault handling very early in spa_load().
+ */
+ ztest_obliterate_one_disk(0);
+
+ /*
+ * Verify that the sum of the sizes of all blocks in the pool
+ * equals the SPA's allocated space total.
+ */
+ ztest_verify_blocks(pool);
+
+ /*
+ * Kick off a replacement of the disk we just obliterated.
+ * This happens inside its own kernel_init()/kernel_fini() pair;
+ * kernel_init() is then called again for the rest of the pass.
+ */
+ kernel_init(FREAD | FWRITE);
+ error = spa_open(pool, &spa, FTAG);
+ if (error)
+ fatal(0, "spa_open(%s) = %d", pool, error);
+ ztest_replace_one_disk(spa, 0);
+ if (zopt_verbose >= 5)
+ show_pool_stats(spa);
+ spa_close(spa, FTAG);
+ kernel_fini();
+
+ kernel_init(FREAD | FWRITE);
+
+ /*
+ * Verify that we can export the pool and reimport it under a
+ * different name.
+ * Only done when ztest_random(2) returns 0.  The second call moves
+ * the pool back from "<pool>_import" to its original name.
+ */
+ if (ztest_random(2) == 0) {
+ (void) snprintf(name, 100, "%s_import", pool);
+ ztest_spa_import_export(pool, name);
+ ztest_spa_import_export(name, pool);
+ }
+
+ /*
+ * Verify that we can loop over all pools.
+ */
+ mutex_enter(&spa_namespace_lock);
+ for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
+ if (zopt_verbose > 3) {
+ (void) printf("spa_next: found %s\n", spa_name(spa));
+ }
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ /*
+ * Open our pool.
+ * This reference is held until the spa_close() at the very end.
+ */
+ error = spa_open(pool, &spa, FTAG);
+ if (error)
+ fatal(0, "spa_open() = %d", error);
+
+ /*
+ * Verify that we can safely inquire about any object,
+ * whether it's allocated or not. To make it interesting,
+ * we probe a 5-wide window around each power of two.
+ * This hits all edge cases, including zero and the max.
+ */
+ for (t = 0; t < 64; t++) {
+ for (d = -5; d <= 5; d++) {
+ error = dmu_object_info(spa->spa_meta_objset,
+ (1ULL << t) + d, NULL);
+ ASSERT(error == 0 || error == ENOENT ||
+ error == EINVAL);
+ }
+ }
+
+ /*
+ * Now kick off all the tests that run in parallel.
+ * The start, stop and kill times are computed once in za[0] and
+ * copied to every thread.  The stop time is capped at zs_stop_time.
+ * The kill time equals the stop time unless
+ * ztest_random(100) < zopt_killrate, in which case it is moved
+ * earlier by ztest_random(zopt_passtime * NANOSEC).
+ */
+ zs->zs_enospc_count = 0;
+
+ za = umem_zalloc(zopt_threads * sizeof (ztest_args_t), UMEM_NOFAIL);
+
+ if (zopt_verbose >= 4)
+ (void) printf("starting main threads...\n");
+
+ za[0].za_start = gethrtime();
+ za[0].za_stop = za[0].za_start + zopt_passtime * NANOSEC;
+ za[0].za_stop = MIN(za[0].za_stop, zs->zs_stop_time);
+ za[0].za_kill = za[0].za_stop;
+ if (ztest_random(100) < zopt_killrate)
+ za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC);
+
+ for (t = 0; t < zopt_threads; t++) {
+ d = t % zopt_datasets; /* thread t works on dataset d */
+ if (t < zopt_datasets) { /* first thread on dataset d sets it up; here d == t */
+ ztest_replay_t zr;
+ int test_future = FALSE;
+ (void) rw_rdlock(&ztest_shared->zs_name_lock);
+ (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
+ error = dmu_objset_create(name, DMU_OST_OTHER, NULL,
+ ztest_create_cb, NULL);
+ if (error == EEXIST) {
+ test_future = TRUE; /* dataset already existed before this pass */
+ } else if (error != 0) {
+ if (error == ENOSPC) {
+ zs->zs_enospc_count++;
+ (void) rw_unlock(
+ &ztest_shared->zs_name_lock);
+ break; /* start no more threads; those already started are joined below */
+ }
+ fatal(0, "dmu_objset_create(%s) = %d",
+ name, error);
+ }
+ error = dmu_objset_open(name, DMU_OST_OTHER,
+ DS_MODE_STANDARD, &za[d].za_os);
+ if (error)
+ fatal(0, "dmu_objset_open('%s') = %d",
+ name, error);
+ (void) rw_unlock(&ztest_shared->zs_name_lock);
+ if (test_future && ztest_shared->zs_txg > 0)
+ ztest_dmu_check_future_leak(za[d].za_os,
+ ztest_shared->zs_txg);
+ zr.zr_os = za[d].za_os;
+ zil_replay(zr.zr_os, &zr, &zr.zr_assign,
+ ztest_replay_vector);
+ za[d].za_zilog = zil_open(za[d].za_os, NULL);
+ }
+ za[t].za_pool = spa_strdup(pool);
+ za[t].za_os = za[d].za_os; /* later threads share dataset d's objset and zilog */
+ za[t].za_zilog = za[d].za_zilog;
+ za[t].za_instance = t;
+ za[t].za_random = ztest_random(-1ULL);
+ za[t].za_start = za[0].za_start;
+ za[t].za_stop = za[0].za_stop;
+ za[t].za_kill = za[0].za_kill;
+
+ error = thr_create(0, 0, ztest_thread, &za[t], THR_BOUND,
+ &za[t].za_thread);
+ if (error)
+ fatal(0, "can't create thread %d: error %d",
+ t, error);
+ }
+ ztest_shared->zs_txg = 0;
+
+ while (--t >= 0) { /* on entry t is the number of threads that were started */
+ error = thr_join(za[t].za_thread, NULL, NULL);
+ if (error)
+ fatal(0, "thr_join(%d) = %d", t, error);
+ if (za[t].za_th)
+ traverse_fini(za[t].za_th);
+ if (t < zopt_datasets) { /* only the thread that opened the dataset closes it */
+ zil_close(za[t].za_zilog);
+ dmu_objset_close(za[t].za_os);
+ }
+ spa_strfree(za[t].za_pool);
+ }
+
+ umem_free(za, zopt_threads * sizeof (ztest_args_t));
+
+ if (zopt_verbose >= 3)
+ show_pool_stats(spa);
+
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ zs->zs_alloc = spa_get_alloc(spa); /* published in the shared area; main() prints these */
+ zs->zs_space = spa_get_space(spa);
+
+ /*
+ * Did we have out-of-space errors? If so, destroy a random objset.
+ */
+ if (zs->zs_enospc_count != 0) {
+ (void) rw_rdlock(&ztest_shared->zs_name_lock);
+ (void) snprintf(name, 100, "%s/%s_%d", pool, pool,
+ (int)ztest_random(zopt_datasets));
+ if (zopt_verbose >= 3)
+ (void) printf("Destroying %s to free up space\n", name);
+ (void) dmu_objset_find(name, ztest_destroy_cb, NULL,
+ DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+ (void) rw_unlock(&ztest_shared->zs_name_lock);
+ }
+
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ /*
+ * Right before closing the pool, kick off a bunch of async I/O;
+ * spa_close() should wait for it to complete.
+ */
+ for (t = 1; t < 50; t++)
+ dmu_prefetch(spa->spa_meta_objset, t, 0, 1 << 15);
+
+ spa_close(spa, FTAG);
+
+ kernel_fini();
+}
+
+/*
+ * Render a time interval as text in 'timebuf'.
+ *
+ * 't' is divided by NANOSEC to obtain whole seconds, which are then split
+ * into days, hours, minutes and seconds.  Only the units from the largest
+ * non-zero one downwards are printed, giving strings shaped like
+ * "1d02h03m04s", "2h03m04s", "3m04s" or "4s".
+ *
+ * No buffer length is passed in, so the output cannot be bounded here;
+ * the caller must supply a buffer large enough for the result.
+ */
+void
+print_time(hrtime_t t, char *timebuf)
+{
+	hrtime_t s = t / NANOSEC;
+	hrtime_t m = s / 60;
+	hrtime_t h = m / 60;
+	hrtime_t d = h / 24;
+
+	s -= m * 60;
+	m -= h * 60;
+	h -= d * 24;
+
+	/*
+	 * Every branch below writes timebuf.  The values are cast to
+	 * u_longlong_t so the arguments have exactly the type that the
+	 * %llu conversions expect, as done for %llu elsewhere in this file.
+	 */
+	if (d)
+		(void) sprintf(timebuf,
+		    "%llud%02lluh%02llum%02llus",
+		    (u_longlong_t)d, (u_longlong_t)h,
+		    (u_longlong_t)m, (u_longlong_t)s);
+	else if (h)
+		(void) sprintf(timebuf, "%lluh%02llum%02llus",
+		    (u_longlong_t)h, (u_longlong_t)m, (u_longlong_t)s);
+	else if (m)
+		(void) sprintf(timebuf, "%llum%02llus",
+		    (u_longlong_t)m, (u_longlong_t)s);
+	else
+		(void) sprintf(timebuf, "%llus", (u_longlong_t)s);
+}
+
+/*
+ * (Re)create the storage pool named 'pool'.
+ *
+ * Any existing pool of that name is destroyed, a new one is created on a
+ * vdev tree built by make_vdev_root() from the zopt_vdev_size, zopt_raidz
+ * and zopt_mirrors settings, and the new pool is opened and closed once
+ * (printing its stats at verbosity 3 or higher).  The whole sequence runs
+ * between kernel_init() and kernel_fini().  A failure to create or open
+ * the pool is reported through fatal().
+ */
+static void
+ztest_init(char *pool)
+{
+	nvlist_t *vdev_tree;
+	spa_t *newpool;
+	int rc;
+
+	kernel_init(FREAD | FWRITE);
+
+	/* Start from a clean slate: drop any previous pool of this name. */
+	(void) spa_destroy(pool);
+	ztest_shared->zs_vdev_primaries = 0;
+
+	/* The vdev tree is only needed for the duration of spa_create(). */
+	vdev_tree = make_vdev_root(zopt_vdev_size, zopt_raidz,
+	    zopt_mirrors, 1);
+	rc = spa_create(pool, vdev_tree, NULL);
+	nvlist_free(vdev_tree);
+	if (rc != 0) {
+		fatal(0, "spa_create() = %d", rc);
+	}
+
+	/* Make sure the freshly created pool can be opened. */
+	rc = spa_open(pool, &newpool, FTAG);
+	if (rc != 0) {
+		fatal(0, "spa_open() = %d", rc);
+	}
+
+	if (zopt_verbose >= 3) {
+		show_pool_stats(newpool);
+	}
+
+	spa_close(newpool, FTAG);
+
+	kernel_fini();
+}
+
+/*
+ * ztest driver.
+ *
+ * Parses the options, maps an anonymous shared ztest_shared_t so that
+ * results survive the death of a child, runs ztest_init() zopt_init times,
+ * and then forks one child per pass to run ztest_run() until zs_stop_time
+ * is reached.  A child is expected either to exit with status 0 or to die
+ * from SIGKILL; SIGKILL deaths are counted as kills.
+ *
+ * Exit status:
+ *   0  all passes finished and the final ztest_verify_blocks() returned
+ *   2  a child exited with a non-zero code
+ *   3  a child died from a signal other than SIGKILL
+ *   4  waitpid() reported a status that is neither an exit nor a signal
+ * Failures of mmap() and fork() are reported through fatal().
+ */
+int
+main(int argc, char **argv)
+{
+	int kills = 0;
+	int iters = 0;
+	int i, f;
+	ztest_shared_t *zs;
+	ztest_info_t *zi;
+	char timebuf[100];
+	char numbuf[6];
+
+	(void) setvbuf(stdout, NULL, _IOLBF, 0);
+
+	/* Override location of zpool.cache */
+	spa_config_dir = "/tmp";
+
+	ztest_random_fd = open("/dev/urandom", O_RDONLY);
+
+	process_options(argc, argv);
+
+	argc -= optind;
+	argv += optind;
+
+	dprintf_setup(&argc, argv);
+
+	/*
+	 * Blow away any existing copy of zpool.cache
+	 */
+	if (zopt_init != 0)
+		(void) remove("/tmp/zpool.cache");
+
+	/*
+	 * The shared area lives in a MAP_SHARED anonymous mapping so that
+	 * what a child writes to it is visible here after the child is gone.
+	 * Check the mapping before publishing it: mmap() reports failure
+	 * with MAP_FAILED, which must never be dereferenced.
+	 */
+	zs = (void *)mmap(0,
+	    P2ROUNDUP(sizeof (ztest_shared_t), getpagesize()),
+	    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+	if (zs == MAP_FAILED)
+		fatal(1, "mmap failed");
+	ztest_shared = zs;
+
+	if (zopt_verbose >= 1) {
+		(void) printf("%llu vdevs, %d datasets, %d threads,"
+		    " %llu seconds...\n",
+		    (u_longlong_t)zopt_vdevs, zopt_datasets, zopt_threads,
+		    (u_longlong_t)zopt_time);
+	}
+
+	/*
+	 * Create and initialize our storage pool.
+	 */
+	for (i = 1; i <= zopt_init; i++) {
+		bzero(zs, sizeof (ztest_shared_t));
+		if (zopt_verbose >= 3 && zopt_init != 1)
+			(void) printf("ztest_init(), pass %d\n", i);
+		ztest_init(zopt_pool);
+	}
+
+	/*
+	 * Initialize the call targets for each function.
+	 * An interval of zero means "no limit" (UINT64_MAX); otherwise the
+	 * target is the number of intervals that fit in the total run time.
+	 */
+	for (f = 0; f < ZTEST_FUNCS; f++) {
+		zi = &zs->zs_info[f];
+
+		*zi = ztest_info[f];
+
+		if (*zi->zi_interval == 0)
+			zi->zi_call_target = UINT64_MAX;
+		else
+			zi->zi_call_target = zopt_time / *zi->zi_interval;
+	}
+
+	zs->zs_start_time = gethrtime();
+	zs->zs_stop_time = zs->zs_start_time + zopt_time * NANOSEC;
+
+	/*
+	 * Run the tests in a loop.  These tests include fault injection
+	 * to verify that self-healing data works, and forced crashes
+	 * to verify that we never lose on-disk consistency.
+	 */
+	while (gethrtime() < zs->zs_stop_time) {
+		int status;
+		pid_t pid;
+		char *tmp;
+
+		/*
+		 * Initialize the workload counters for each function.
+		 */
+		for (f = 0; f < ZTEST_FUNCS; f++) {
+			zi = &zs->zs_info[f];
+			zi->zi_calls = 0;
+			zi->zi_call_time = 0;
+		}
+
+		pid = fork();
+
+		if (pid == -1)
+			fatal(1, "fork failed");
+
+		if (pid == 0) {	/* child */
+			struct rlimit rl = { 1024, 1024 };
+			(void) setrlimit(RLIMIT_NOFILE, &rl);
+			(void) enable_extended_FILE_stdio(-1, -1);
+			ztest_run(zopt_pool);
+			exit(0);
+		}
+
+		while (waitpid(pid, &status, 0) != pid)
+			continue;
+
+		if (WIFEXITED(status)) {
+			if (WEXITSTATUS(status) != 0) {
+				(void) fprintf(stderr,
+				    "child exited with code %d\n",
+				    WEXITSTATUS(status));
+				exit(2);
+			}
+		} else if (WIFSIGNALED(status)) {
+			/* SIGKILL is the expected, self-inflicted crash. */
+			if (WTERMSIG(status) != SIGKILL) {
+				(void) fprintf(stderr,
+				    "child died with signal %d\n",
+				    WTERMSIG(status));
+				exit(3);
+			}
+			kills++;
+		} else {
+			(void) fprintf(stderr, "something strange happened "
+			    "to child\n");
+			exit(4);
+		}
+
+		iters++;
+
+		if (zopt_verbose >= 1) {
+			hrtime_t now = gethrtime();
+
+			now = MIN(now, zs->zs_stop_time);
+			print_time(zs->zs_stop_time - now, timebuf);
+			nicenum(zs->zs_space, numbuf);
+
+			(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
+			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
+			    iters,
+			    WIFEXITED(status) ? "Complete" : "SIGKILL",
+			    (u_longlong_t)zs->zs_enospc_count,
+			    100.0 * zs->zs_alloc / zs->zs_space,
+			    numbuf,
+			    100.0 * (now - zs->zs_start_time) /
+			    (zopt_time * NANOSEC), timebuf);
+		}
+
+		if (zopt_verbose >= 2) {
+			(void) printf("\nWorkload summary:\n\n");
+			(void) printf("%7s %9s   %s\n",
+			    "Calls", "Time", "Function");
+			(void) printf("%7s %9s   %s\n",
+			    "-----", "----", "--------");
+			for (f = 0; f < ZTEST_FUNCS; f++) {
+				Dl_info dli;
+				const char *fname = "(unknown)";
+
+				zi = &zs->zs_info[f];
+				print_time(zi->zi_call_time, timebuf);
+				/*
+				 * dladdr() can fail outright, or succeed
+				 * without finding a symbol; only use
+				 * dli_sname when it is a valid string.
+				 */
+				if (dladdr((void *)zi->zi_func, &dli) != 0 &&
+				    dli.dli_sname != NULL)
+					fname = dli.dli_sname;
+				(void) printf("%7llu %9s   %s\n",
+				    (u_longlong_t)zi->zi_calls, timebuf,
+				    fname);
+			}
+			(void) printf("\n");
+		}
+
+		/*
+		 * It's possible that we killed a child during a rename test, in
+		 * which case we'll have a 'ztest_tmp' pool lying around instead
+		 * of 'ztest'.  Do a blind rename in case this happened.
+		 * The buffer holds zopt_pool, the 4 characters of "_tmp" and
+		 * the terminating NUL, which is also what strlen(tmp) + 1
+		 * yields when it is freed.
+		 */
+		tmp = umem_alloc(strlen(zopt_pool) + 5, UMEM_NOFAIL);
+		(void) strcpy(tmp, zopt_pool);
+		(void) strcat(tmp, "_tmp");
+		kernel_init(FREAD | FWRITE);
+		(void) spa_rename(tmp, zopt_pool);
+		kernel_fini();
+		umem_free(tmp, strlen(tmp) + 1);
+	}
+
+	ztest_verify_blocks(zopt_pool);
+
+	if (zopt_verbose >= 1) {
+		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
+		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
+	}
+
+	return (0);
+}
OpenPOWER on IntegriCloud