84 files changed, 6036 insertions, 2750 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 860b33c..2813924 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -462,6 +462,7 @@ static arc_state_t	*arc_l2c_only;
 
 static int		arc_no_grow;	/* Don't try to grow cache size */
 static uint64_t		arc_tempreserve;
+static uint64_t		arc_loaned_bytes;
 static uint64_t		arc_meta_used;
 static uint64_t		arc_meta_limit;
 static uint64_t		arc_meta_max = 0;
@@ -511,7 +512,7 @@ struct arc_buf_hdr {
 	/* immutable */
 	arc_buf_contents_t	b_type;
 	uint64_t		b_size;
-	spa_t			*b_spa;
+	uint64_t		b_spa;
 
 	/* protected by arc state mutex */
 	arc_state_t		*b_state;
@@ -533,9 +534,9 @@ static arc_buf_hdr_t arc_eviction_hdr;
 static void arc_get_data_buf(arc_buf_t *buf);
 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 static int arc_evict_needed(arc_buf_contents_t type);
-static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
+static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 
-static boolean_t l2arc_write_eligible(spa_t *spa, arc_buf_hdr_t *ab);
+static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 
 #define	GHOST_STATE(state)	\
 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
@@ -761,9 +762,8 @@ static void l2arc_hdr_stat_add(void);
 static void l2arc_hdr_stat_remove(void);
 
 static uint64_t
-buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
+buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
-	uintptr_t spav = (uintptr_t)spa;
 	uint8_t *vdva = (uint8_t *)dva;
 	uint64_t crc = -1ULL;
 	int i;
@@ -773,7 +773,7 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
 	for (i = 0; i < sizeof (dva_t); i++)
 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 
-	crc ^= (spav>>8) ^ birth;
+	crc ^= (spa>>8) ^ birth;
 
 	return (crc);
 }
@@ -789,7 +789,7 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
 	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 
 static arc_buf_hdr_t *
-buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
+buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 {
 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
@@ -1345,7 +1345,7 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
 	ASSERT(BUF_EMPTY(hdr));
 	hdr->b_size = size;
 	hdr->b_type = type;
-	hdr->b_spa = spa;
+	hdr->b_spa = spa_guid(spa);
 	hdr->b_state = arc_anon;
 	hdr->b_arc_access = 0;
 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
@@ -1364,6 +1364,41 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
 	return (buf);
 }
 
+static char *arc_onloan_tag = "onloan";
+
+/*
+ * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
+ * flight data by arc_tempreserve_space() until they are "returned". Loaned
+ * buffers must be returned to the arc before they can be used by the DMU or
+ * freed.
+ */
+arc_buf_t *
+arc_loan_buf(spa_t *spa, int size)
+{
+	arc_buf_t *buf;
+
+	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
+
+	atomic_add_64(&arc_loaned_bytes, size);
+	return (buf);
+}
+
+/*
+ * Return a loaned arc buffer to the arc.
+ */
+void
+arc_return_buf(arc_buf_t *buf, void *tag)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+
+	ASSERT(hdr->b_state == arc_anon);
+	ASSERT(buf->b_data != NULL);
+	VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
+	VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
+
+	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
+}
+
 static arc_buf_t *
 arc_buf_clone(arc_buf_t *from)
 {
@@ -1661,7 +1696,7 @@ arc_buf_size(arc_buf_t *buf)
  * It may also return without evicting as much space as requested.
  */
 static void *
-arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
+arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
     arc_buf_contents_t type)
 {
 	arc_state_t *evicted_state;
@@ -1830,12 +1865,12 @@ evict_start:
 		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
 			int64_t todelete =
 			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
-			arc_evict_ghost(arc_mru_ghost, NULL, todelete);
+			arc_evict_ghost(arc_mru_ghost, 0, todelete);
 		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
 			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
 			    arc_mru_ghost->arcs_size +
 			    arc_mfu_ghost->arcs_size - arc_c);
-			arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
+			arc_evict_ghost(arc_mfu_ghost, 0, todelete);
 		}
 	}
 	if (stolen)
@@ -1849,7 +1884,7 @@ evict_start:
  * bytes.  Destroy the buffers that are removed.
  */
 static void
-arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
+arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
 {
 	arc_buf_hdr_t *ab, *ab_prev;
 	list_t *list, *list_start;
@@ -1955,13 +1990,13 @@ arc_adjust(void)
 
 	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
 		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
-		(void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
+		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
 		adjustment -= delta;
 	}
 
 	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
 		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
-		(void) arc_evict(arc_mru, NULL, delta, FALSE,
+		(void) arc_evict(arc_mru, 0, delta, FALSE,
 		    ARC_BUFC_METADATA);
 	}
 
@@ -1973,14 +2008,14 @@ arc_adjust(void)
 
 	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
 		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
-		(void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
+		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
 		adjustment -= delta;
 	}
 
 	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
 		int64_t delta = MIN(adjustment,
 		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
-		(void) arc_evict(arc_mfu, NULL, delta, FALSE,
+		(void) arc_evict(arc_mfu, 0, delta, FALSE,
 		    ARC_BUFC_METADATA);
 	}
 
@@ -1992,7 +2027,7 @@ arc_adjust(void)
 
 	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
 		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
-		arc_evict_ghost(arc_mru_ghost, NULL, delta);
+		arc_evict_ghost(arc_mru_ghost, 0, delta);
 	}
 
 	adjustment =
@@ -2000,7 +2035,7 @@ arc_adjust(void)
 
 	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
 		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
-		arc_evict_ghost(arc_mfu_ghost, NULL, delta);
+		arc_evict_ghost(arc_mfu_ghost, 0, delta);
 	}
 }
 
@@ -2044,29 +2079,34 @@ restart:
 void
 arc_flush(spa_t *spa)
 {
+	uint64_t guid = 0;
+
+	if (spa)
+		guid = spa_guid(spa);
+
 	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
-		(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA);
+		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
 		if (spa)
 			break;
 	}
 	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
-		(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA);
+		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
 		if (spa)
 			break;
 	}
 	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
-		(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA);
+		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
 		if (spa)
 			break;
 	}
 	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
-		(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA);
+		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
 		if (spa)
 			break;
 	}
 
-	arc_evict_ghost(arc_mru_ghost, spa, -1);
-	arc_evict_ghost(arc_mfu_ghost, spa, -1);
+	arc_evict_ghost(arc_mru_ghost, guid, -1);
+	arc_evict_ghost(arc_mfu_ghost, guid, -1);
 
 	mutex_enter(&arc_reclaim_thr_lock);
 	arc_do_user_evicts();
@@ -2463,7 +2503,7 @@ arc_get_data_buf(arc_buf_t *buf)
 		state =  (arc_mru->arcs_lsize[type] >= size &&
 		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
 	}
-	if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
+	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
 		if (type == ARC_BUFC_METADATA) {
 			buf->b_data = zio_buf_alloc(size);
 			arc_space_consume(size, ARC_SPACE_DATA);
@@ -2673,7 +2713,7 @@ arc_read_done(zio_t *zio)
 	 * reason for it not to be found is if we were freed during the
 	 * read.
 	 */
-	found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
+	found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
 	    &hash_lock);
 
 	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
@@ -2817,9 +2857,10 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
 	arc_buf_t *buf;
 	kmutex_t *hash_lock;
 	zio_t *rzio;
+	uint64_t guid = spa_guid(spa);
 
 top:
-	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+	hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
 	if (hdr && hdr->b_datacnt > 0) {
 
 		*arc_flags |= ARC_CACHED;
@@ -2842,7 +2883,7 @@ top:
 				acb->acb_private = private;
 				if (pio != NULL)
 					acb->acb_zio_dummy = zio_null(pio,
-					    spa, NULL, NULL, zio_flags);
+					    spa, NULL, NULL, NULL, zio_flags);
 
 				ASSERT(acb->acb_done != NULL);
 				acb->acb_next = hdr->b_acb;
@@ -3084,9 +3125,10 @@ arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
 {
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_mtx;
+	uint64_t guid = spa_guid(spa);
 	int rc = 0;
 
-	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
+	hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
 
 	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
 		arc_buf_t *buf = hdr->b_buf;
@@ -3254,7 +3296,7 @@ arc_release(arc_buf_t *buf, void *tag)
 		arc_buf_hdr_t *nhdr;
 		arc_buf_t **bufp;
 		uint64_t blksz = hdr->b_size;
-		spa_t *spa = hdr->b_spa;
+		uint64_t spa = hdr->b_spa;
 		arc_buf_contents_t type = hdr->b_type;
 		uint32_t flags = hdr->b_flags;
 
@@ -3539,12 +3581,13 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 	arc_buf_hdr_t *ab;
 	kmutex_t *hash_lock;
 	zio_t	*zio;
+	uint64_t guid = spa_guid(spa);
 
 	/*
 	 * If this buffer is in the cache, release it, so it
 	 * can be re-used.
 	 */
-	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+	ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
 	if (ab != NULL) {
 		/*
 		 * The checksum of blocks to free is not always
@@ -3607,10 +3650,9 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 }
 
 static int
-arc_memory_throttle(uint64_t reserve, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
 {
 #ifdef _KERNEL
-	uint64_t inflight_data = arc_anon->arcs_size;
 	uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count);
 	static uint64_t page_load = 0;
 	static uint64_t last_txg = 0;
@@ -3674,6 +3716,7 @@ int
 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
 {
 	int error;
+	uint64_t anon_size;
 
 #ifdef ZFS_DEBUG
 	/*
@@ -3690,11 +3733,18 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
 		return (ENOMEM);
 
 	/*
+	 * Don't count loaned bufs as in flight dirty data to prevent long
+	 * network delays from blocking transactions that are ready to be
+	 * assigned to a txg.
+	 */
+	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
+
+	/*
 	 * Writes will, almost always, require additional memory allocations
 	 * in order to compress/encrypt/etc the data.  We therefor need to
 	 * make sure that there is sufficient available memory for this.
 	 */
-	if (error = arc_memory_throttle(reserve, txg))
+	if (error = arc_memory_throttle(reserve, anon_size, txg))
 		return (error);
 
 	/*
@@ -3704,8 +3754,9 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
 	 * Note: if two requests come in concurrently, we might let them
 	 * both succeed, when one of them should fail.  Not a huge deal.
 	 */
-	if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
-	    arc_anon->arcs_size > arc_c / 4) {
+
+	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
+	    anon_size > arc_c / 4) {
 		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
 		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
 		    arc_tempreserve>>10,
@@ -3959,6 +4010,8 @@ arc_fini(void)
 
 	buf_fini();
 
+	ASSERT(arc_loaned_bytes == 0);
+
 	mutex_destroy(&arc_lowmem_lock);
 #ifdef _KERNEL
 	if (arc_event_lowmem != NULL)
@@ -4103,7 +4156,7 @@ arc_fini(void)
  */
 
 static boolean_t
-l2arc_write_eligible(spa_t *spa, arc_buf_hdr_t *ab)
+l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
 {
 	/*
 	 * A buffer is *not* eligible for the L2ARC if it:
@@ -4112,7 +4165,7 @@ l2arc_write_eligible(spa_t *spa, arc_buf_hdr_t *ab)
 	 * 3. has an I/O in progress (it may be an incomplete read).
 	 * 4. is flagged not eligible (zfs property).
 	 */
-	if (ab->b_spa != spa) {
+	if (ab->b_spa != spa_guid) {
 		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
 		return (B_FALSE);
 	}
@@ -4399,11 +4452,15 @@ l2arc_read_done(zio_t *zio)
 		 * storage now.  If there *is* a waiter, the caller must
 		 * issue the i/o in a context where it's OK to block.
 		 */
-		if (zio->io_waiter == NULL)
-			zio_nowait(zio_read(zio->io_parent,
-			    cb->l2rcb_spa, &cb->l2rcb_bp,
+		if (zio->io_waiter == NULL) {
+			zio_t *pio = zio_unique_parent(zio);
+
+			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
+
+			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
 			    buf->b_data, zio->io_size, arc_read_done, buf,
 			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
+		}
 	}
 
 	kmem_free(cb, sizeof (l2arc_read_callback_t));
@@ -4600,6 +4657,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 	boolean_t have_lock, full;
 	l2arc_write_callback_t *cb;
 	zio_t *pio, *wzio;
+	uint64_t guid = spa_guid(spa);
 	int try;
 
 	ASSERT(dev->l2ad_vdev != NULL);
@@ -4661,7 +4719,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 				break;
 			}
 
-			if (!l2arc_write_eligible(spa, ab)) {
+			if (!l2arc_write_eligible(guid, ab)) {
 				mutex_exit(hash_lock);
 				continue;
 			}
@@ -5001,7 +5059,7 @@ l2arc_fini(void)
 void
 l2arc_start(void)
 {
-	if (!(spa_mode & FWRITE))
+	if (!(spa_mode_global & FWRITE))
 		return;
 
 	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
@@ -5011,7 +5069,7 @@ l2arc_start(void)
 void
 l2arc_stop(void)
 {
-	if (!(spa_mode & FWRITE))
+	if (!(spa_mode_global & FWRITE))
 		return;
 
 	mutex_enter(&l2arc_feed_thr_lock);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index fe50ecf..cf983e2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -327,7 +327,7 @@ dbuf_verify(dmu_buf_impl_t *db)
 		if (db->db_parent == dn->dn_dbuf) {
 			/* db is pointed to by the dnode */
 			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
-			if (db->db.db_object == DMU_META_DNODE_OBJECT)
+			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
 				ASSERT(db->db_parent == NULL);
 			else
 				ASSERT(db->db_parent != NULL);
@@ -899,15 +899,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	 * Shouldn't dirty a regular buffer in syncing context.  Private
 	 * objects may be dirtied in syncing context, but only if they
 	 * were already pre-dirtied in open context.
-	 * XXX We may want to prohibit dirtying in syncing context even
-	 * if they did pre-dirty.
 	 */
 	ASSERT(!dmu_tx_is_syncing(tx) ||
 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
-	    dn->dn_object == DMU_META_DNODE_OBJECT ||
-	    dn->dn_objset->os_dsl_dataset == NULL ||
-	    dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));
-
+	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
+	    dn->dn_objset->os_dsl_dataset == NULL);
 	/*
 	 * We make this assert for private objects as well, but after we
 	 * check if we're already dirty.  They are allowed to re-dirty
@@ -965,7 +961,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	/*
 	 * Only valid if not already dirty.
 	 */
-	ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+	ASSERT(dn->dn_object == 0 ||
+	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
 
 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
@@ -977,15 +974,13 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
 	/*
 	 * We should only be dirtying in syncing context if it's the
-	 * mos, a spa os, or we're initializing the os.  However, we are
-	 * allowed to dirty in syncing context provided we already
-	 * dirtied it in open context.  Hence we must make this
-	 * assertion only if we're not already dirty.
+	 * mos or we're initializing the os or it's a special object.
+	 * However, we are allowed to dirty in syncing context provided
+	 * we already dirtied it in open context.  Hence we must make
+	 * this assertion only if we're not already dirty.
 	 */
-	ASSERT(!dmu_tx_is_syncing(tx) ||
-	    os->os_dsl_dataset == NULL ||
-	    !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
-	    !BP_IS_HOLE(os->os_rootbp));
+	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
+	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
 	ASSERT(db->db.db_size != 0);
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
@@ -1285,6 +1280,68 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
 }
 
 /*
+ * Directly assign a provided arc buf to a given dbuf if it's not referenced
+ * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
+ */
+void
+dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
+{
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
+	ASSERT(db->db_blkid != DB_BONUS_BLKID);
+	ASSERT(db->db_level == 0);
+	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
+	ASSERT(buf != NULL);
+	ASSERT(arc_buf_size(buf) == db->db.db_size);
+	ASSERT(tx->tx_txg != 0);
+
+	arc_return_buf(buf, db);
+	ASSERT(arc_released(buf));
+
+	mutex_enter(&db->db_mtx);
+
+	while (db->db_state == DB_READ || db->db_state == DB_FILL)
+		cv_wait(&db->db_changed, &db->db_mtx);
+
+	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
+
+	if (db->db_state == DB_CACHED &&
+	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
+		mutex_exit(&db->db_mtx);
+		(void) dbuf_dirty(db, tx);
+		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
+		VERIFY(arc_buf_remove_ref(buf, db) == 1);
+		return;
+	}
+
+	if (db->db_state == DB_CACHED) {
+		dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+		ASSERT(db->db_buf != NULL);
+		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
+			ASSERT(dr->dt.dl.dr_data == db->db_buf);
+			if (!arc_released(db->db_buf)) {
+				ASSERT(dr->dt.dl.dr_override_state ==
+				    DR_OVERRIDDEN);
+				arc_release(db->db_buf, db);
+			}
+			dr->dt.dl.dr_data = buf;
+			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
+		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
+			arc_release(db->db_buf, db);
+			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
+		}
+		db->db_buf = NULL;
+	}
+	ASSERT(db->db_buf == NULL);
+	dbuf_set_data(db, buf);
+	db->db_state = DB_FILL;
+	mutex_exit(&db->db_mtx);
+	(void) dbuf_dirty(db, tx);
+	dbuf_fill_done(db, tx);
+}
+
+/*
  * "Clear" the contents of this dbuf.  This will mark the dbuf
  * EVICTING and clear *most* of its references.  Unfortunetely,
  * when we are not holding the dn_dbufs_mtx, we can't clear the
@@ -1827,6 +1884,19 @@ dmu_buf_get_user(dmu_buf_t *db_fake)
 	return (db->db_user_ptr);
 }
 
+boolean_t
+dmu_buf_freeable(dmu_buf_t *dbuf)
+{
+	boolean_t res = B_FALSE;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+
+	if (db->db_blkptr)
+		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
+		    db->db_blkptr->blk_birth);
+
+	return (res);
+}
+
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index 1152781..133343b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -82,6 +82,8 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{	byteswap_uint64_array,	TRUE,	"FUID table size"	},
 	{	zap_byteswap,		TRUE,	"DSL dataset next clones"},
 	{	zap_byteswap,		TRUE,	"scrub work queue"	},
+	{	zap_byteswap,		TRUE,	"ZFS user/group used"	},
+	{	zap_byteswap,		TRUE,	"ZFS user/group quota"	},
 };
 
 int
@@ -177,22 +179,22 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
  * whose dnodes are in the same block.
  */
 static int
-dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
-    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
+    int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 {
 	dsl_pool_t *dp = NULL;
 	dmu_buf_t **dbp;
 	uint64_t blkid, nblks, i;
-	uint32_t flags;
+	uint32_t dbuf_flags;
 	int err;
 	zio_t *zio;
 	hrtime_t start;
 
 	ASSERT(length <= DMU_MAX_ACCESS);
 
-	flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
-	if (length > zfetch_array_rd_sz)
-		flags |= DB_RF_NOPREFETCH;
+	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
+	if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
+		dbuf_flags |= DB_RF_NOPREFETCH;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift) {
@@ -230,7 +232,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
 		/* initiate async i/o */
 		if (read) {
 			rw_exit(&dn->dn_struct_rwlock);
-			(void) dbuf_read(db, zio, flags);
+			(void) dbuf_read(db, zio, dbuf_flags);
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		}
 		dbp[i] = &db->db;
@@ -282,7 +284,7 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
 		return (err);
 
 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
-	    numbufsp, dbpp);
+	    numbufsp, dbpp, DMU_READ_PREFETCH);
 
 	dnode_rele(dn, FTAG);
 
@@ -297,7 +299,7 @@ dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
 	int err;
 
 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
-	    numbufsp, dbpp);
+	    numbufsp, dbpp, DMU_READ_PREFETCH);
 
 	return (err);
 }
@@ -434,7 +436,8 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
 	object_size = align == 1 ? dn->dn_datablksz :
 	    (dn->dn_maxblkid + 1) << dn->dn_datablkshift;
 
-	if (trunc || (end = offset + length) > object_size)
+	end = offset + length;
+	if (trunc || end > object_size)
 		end = object_size;
 	if (end <= offset)
 		return (0);
@@ -442,6 +445,7 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
 
 	while (length) {
 		start = end;
+		/* assert(offset <= start) */
 		err = get_next_chunk(dn, &start, offset);
 		if (err)
 			return (err);
@@ -532,7 +536,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
 
 int
 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
-    void *buf)
+    void *buf, uint32_t flags)
 {
 	dnode_t *dn;
 	dmu_buf_t **dbp;
@@ -562,7 +566,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 		 * to be reading in parallel.
 		 */
 		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
-		    TRUE, FTAG, &numbufs, &dbp);
+		    TRUE, FTAG, &numbufs, &dbp, flags);
 		if (err)
 			break;
 
@@ -771,9 +775,6 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx);
 
-		if (err)
-			break;
-
 		offset += tocpy;
 		size -= tocpy;
 	}
@@ -783,6 +784,58 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 #endif	/* !__FreeBSD__ */
 #endif	/* _KERNEL */
 
+/*
+ * Allocate a loaned anonymous arc buffer.
+ */
+arc_buf_t *
+dmu_request_arcbuf(dmu_buf_t *handle, int size)
+{
+	dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+
+	return (arc_loan_buf(dn->dn_objset->os_spa, size));
+}
+
+/*
+ * Free a loaned arc buffer.
+ */
+void
+dmu_return_arcbuf(arc_buf_t *buf)
+{
+	arc_return_buf(buf, FTAG);
+	VERIFY(arc_buf_remove_ref(buf, FTAG) == 1);
+}
+
+/*
+ * When possible directly assign passed loaned arc buffer to a dbuf.
+ * If this is not possible copy the contents of passed arc buf via
+ * dmu_write().
+ */
+void
+dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+	dmu_buf_impl_t *db;
+	uint32_t blksz = (uint32_t)arc_buf_size(buf);
+	uint64_t blkid;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	blkid = dbuf_whichblock(dn, offset);
+	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
+	rw_exit(&dn->dn_struct_rwlock);
+
+	if (offset == db->db.db_offset && blksz == db->db.db_size) {
+		dbuf_assign_arcbuf(db, buf, tx);
+		dbuf_rele(db, FTAG);
+	} else {
+		dbuf_rele(db, FTAG);
+		ASSERT(dn->dn_objset->os.os == dn->dn_objset);
+		dmu_write(&dn->dn_objset->os, dn->dn_object, offset, blksz,
+		    buf->b_data, tx);
+		dmu_return_arcbuf(buf);
+	}
+}
+
 typedef struct {
 	dbuf_dirty_record_t	*dr;
 	dmu_sync_cb_t		*done;
@@ -794,14 +847,20 @@ static void
 dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
 {
 	blkptr_t *bp = zio->io_bp;
+	dmu_sync_arg_t *in = varg;
+	dbuf_dirty_record_t *dr = in->dr;
+	dmu_buf_impl_t *db = dr->dr_dbuf;
 
 	if (!BP_IS_HOLE(bp)) {
-		dmu_sync_arg_t *in = varg;
-		dbuf_dirty_record_t *dr = in->dr;
-		dmu_buf_impl_t *db = dr->dr_dbuf;
 		ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type);
 		ASSERT(BP_GET_LEVEL(bp) == 0);
 		bp->blk_fill = 1;
+	} else {
+		/*
+		 * dmu_sync() can compress a block of zeros to a null blkptr
+		 * but the block size still needs to be passed through to replay
+		 */
+		BP_SET_LSIZE(bp, db->db.db_size);
 	}
 }
 
@@ -817,6 +876,8 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 	mutex_enter(&db->db_mtx);
 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
 	dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
+	if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
+		BP_ZERO(&dr->dt.dl.dr_overridden_by);
 	dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
 	cv_broadcast(&db->db_changed);
 	mutex_exit(&db->db_mtx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
index c9e00d5..2678b83 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -164,10 +164,15 @@ dmu_objset_byteswap(void *buf, size_t size)
 {
 	objset_phys_t *osp = buf;
 
-	ASSERT(size == sizeof (objset_phys_t));
+	ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
 	dnode_byteswap(&osp->os_meta_dnode);
 	byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
 	osp->os_type = BSWAP_64(osp->os_type);
+	osp->os_flags = BSWAP_64(osp->os_flags);
+	if (size == sizeof (objset_phys_t)) {
+		dnode_byteswap(&osp->os_userused_dnode);
+		dnode_byteswap(&osp->os_groupused_dnode);
+	}
 }
 
 int
@@ -210,12 +215,30 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
 				err = EIO;
 			return (err);
 		}
+
+		/* Increase the blocksize if we are permitted. */
+		if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
+		    arc_buf_size(osi->os_phys_buf) < sizeof (objset_phys_t)) {
+			arc_buf_t *buf = arc_buf_alloc(spa,
+			    sizeof (objset_phys_t), &osi->os_phys_buf,
+			    ARC_BUFC_METADATA);
+			bzero(buf->b_data, sizeof (objset_phys_t));
+			bcopy(osi->os_phys_buf->b_data, buf->b_data,
+			    arc_buf_size(osi->os_phys_buf));
+			(void) arc_buf_remove_ref(osi->os_phys_buf,
+			    &osi->os_phys_buf);
+			osi->os_phys_buf = buf;
+		}
+
 		osi->os_phys = osi->os_phys_buf->b_data;
+		osi->os_flags = osi->os_phys->os_flags;
 	} else {
-		osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t),
+		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
+		    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
+		osi->os_phys_buf = arc_buf_alloc(spa, size,
 		    &osi->os_phys_buf, ARC_BUFC_METADATA);
 		osi->os_phys = osi->os_phys_buf->b_data;
-		bzero(osi->os_phys, sizeof (objset_phys_t));
+		bzero(osi->os_phys, size);
 	}
 
 	/*
@@ -276,6 +299,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
 
 	osi->os_meta_dnode = dnode_special_open(osi,
 	    &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
+	if (arc_buf_size(osi->os_phys_buf) >= sizeof (objset_phys_t)) {
+		osi->os_userused_dnode = dnode_special_open(osi,
+		    &osi->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT);
+		osi->os_groupused_dnode = dnode_special_open(osi,
+		    &osi->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT);
+	}
 
 	/*
 	 * We should be the only thread trying to do this because we
@@ -456,13 +485,15 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg)
 	os.os = osi;
 	(void) dmu_objset_evict_dbufs(&os);
 
-	ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
-	ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
-	ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL);
-
 	dnode_special_close(osi->os_meta_dnode);
+	if (osi->os_userused_dnode) {
+		dnode_special_close(osi->os_userused_dnode);
+		dnode_special_close(osi->os_groupused_dnode);
+	}
 	zil_free(osi->os_zil);
 
+	ASSERT3P(list_head(&osi->os_dnodes), ==, NULL);
+
 	VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
 	mutex_destroy(&osi->os_lock);
 	mutex_destroy(&osi->os_obj_lock);
@@ -520,6 +551,10 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
 	ASSERT(type != DMU_OST_ANY);
 	ASSERT(type < DMU_OST_NUMTYPES);
 	osi->os_phys->os_type = type;
+	if (dmu_objset_userused_enabled(osi)) {
+		osi->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+		osi->os_flags = osi->os_phys->os_flags;
+	}
 
 	dsl_dataset_dirty(ds, tx);
 
@@ -704,13 +739,33 @@ struct snaparg {
 	char *snapname;
 	char failed[MAXPATHLEN];
 	boolean_t checkperms;
-	list_t objsets;
+	nvlist_t *props;
 };
 
-struct osnode {
-	list_node_t node;
-	objset_t *os;
-};
+static int
+snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	objset_t *os = arg1;
+	struct snaparg *sn = arg2;
+
+	/* The props have already been checked by zfs_check_userprops(). */
+
+	return (dsl_dataset_snapshot_check(os->os->os_dsl_dataset,
+	    sn->snapname, tx));
+}
+
+static void
+snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	objset_t *os = arg1;
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+	struct snaparg *sn = arg2;
+
+	dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx);
+
+	if (sn->props)
+		dsl_props_set_sync(ds->ds_prev, sn->props, cr, tx);
+}
 
 static int
 dmu_objset_snapshot_one(char *name, void *arg)
@@ -747,13 +802,8 @@ dmu_objset_snapshot_one(char *name, void *arg)
 	 */
 	err = zil_suspend(dmu_objset_zil(os));
 	if (err == 0) {
-		struct osnode *osn;
-		dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check,
-		    dsl_dataset_snapshot_sync, os->os->os_dsl_dataset,
-		    sn->snapname, 3);
-		osn = kmem_alloc(sizeof (struct osnode), KM_SLEEP);
-		osn->os = os;
-		list_insert_tail(&sn->objsets, osn);
+		dsl_sync_task_create(sn->dstg, snapshot_check,
+		    snapshot_sync, os, sn, 3);
 	} else {
 		dmu_objset_close(os);
 	}
@@ -762,11 +812,11 @@ dmu_objset_snapshot_one(char *name, void *arg)
 }
 
 int
-dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
+dmu_objset_snapshot(char *fsname, char *snapname,
+    nvlist_t *props, boolean_t recursive)
 {
 	dsl_sync_task_t *dst;
-	struct osnode *osn;
-	struct snaparg sn = { 0 };
+	struct snaparg sn;
 	spa_t *spa;
 	int err;
 
@@ -778,8 +828,7 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
 
 	sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 	sn.snapname = snapname;
-	list_create(&sn.objsets, sizeof (struct osnode),
-	    offsetof(struct osnode, node));
+	sn.props = props;
 
 	if (recursive) {
 		sn.checkperms = B_TRUE;
@@ -790,27 +839,19 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
 		err = dmu_objset_snapshot_one(fsname, &sn);
 	}
 
-	if (err)
-		goto out;
-
-	err = dsl_sync_task_group_wait(sn.dstg);
+	if (err == 0)
+		err = dsl_sync_task_group_wait(sn.dstg);
 
 	for (dst = list_head(&sn.dstg->dstg_tasks); dst;
 	    dst = list_next(&sn.dstg->dstg_tasks, dst)) {
-		dsl_dataset_t *ds = dst->dst_arg1;
+		objset_t *os = dst->dst_arg1;
+		dsl_dataset_t *ds = os->os->os_dsl_dataset;
 		if (dst->dst_err)
 			dsl_dataset_name(ds, sn.failed);
+		zil_resume(dmu_objset_zil(os));
+		dmu_objset_close(os);
 	}
 
-out:
-	while (osn = list_head(&sn.objsets)) {
-		list_remove(&sn.objsets, osn);
-		zil_resume(dmu_objset_zil(osn->os));
-		dmu_objset_close(osn->os);
-		kmem_free(osn, sizeof (struct osnode));
-	}
-	list_destroy(&sn.objsets);
-
 	if (err)
 		(void) strcpy(fsname, sn.failed);
 	dsl_sync_task_group_destroy(sn.dstg);
@@ -819,7 +860,7 @@ out:
 }
 
 static void
-dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx)
+dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
@@ -827,14 +868,20 @@ dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx)
 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
 		ASSERT(dn->dn_dbuf->db_data_pending);
 		/*
-		 * Initialize dn_zio outside dnode_sync()
-		 * to accomodate meta-dnode
+		 * Initialize dn_zio outside dnode_sync() because the
+		 * meta-dnode needs to set it ouside dnode_sync().
 		 */
 		dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
 		ASSERT(dn->dn_zio);
 
 		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
 		list_remove(list, dn);
+
+		if (newlist) {
+			(void) dnode_add_ref(dn, newlist);
+			list_insert_tail(newlist, dn);
+		}
+
 		dnode_sync(dn, tx);
 	}
 }
@@ -853,9 +900,12 @@ ready(zio_t *zio, arc_buf_t *abuf, void *arg)
 	ASSERT(BP_GET_LEVEL(bp) == 0);
 
 	/*
-	 * Update rootbp fill count.
+	 * Update rootbp fill count: it should be the number of objects
+	 * allocated in the object set (not counting the "special"
+	 * objects that are stored in the objset_phys_t -- the meta
+	 * dnode and user/group accounting objects).
 	 */
-	bp->blk_fill = 1;	/* count the meta-dnode */
+	bp->blk_fill = 0;
 	for (int i = 0; i < dnp->dn_nblkptr; i++)
 		bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
 
@@ -878,6 +928,7 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
 	writeprops_t wp = { 0 };
 	zio_t *zio;
 	list_t *list;
+	list_t *newlist = NULL;
 	dbuf_dirty_record_t *dr;
 
 	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
@@ -915,20 +966,41 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
 	}
 
 	arc_release(os->os_phys_buf, &os->os_phys_buf);
+
 	zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os),
 	    tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os,
 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 
 	/*
-	 * Sync meta-dnode - the parent IO for the sync is the root block
+	 * Sync special dnodes - the parent IO for the sync is the root block
 	 */
 	os->os_meta_dnode->dn_zio = zio;
 	dnode_sync(os->os_meta_dnode, tx);
 
+	os->os_phys->os_flags = os->os_flags;
+
+	if (os->os_userused_dnode &&
+	    os->os_userused_dnode->dn_type != DMU_OT_NONE) {
+		os->os_userused_dnode->dn_zio = zio;
+		dnode_sync(os->os_userused_dnode, tx);
+		os->os_groupused_dnode->dn_zio = zio;
+		dnode_sync(os->os_groupused_dnode, tx);
+	}
+
 	txgoff = tx->tx_txg & TXG_MASK;
 
-	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx);
-	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx);
+	if (dmu_objset_userused_enabled(os)) {
+		newlist = &os->os_synced_dnodes;
+		/*
+		 * We must create the list here because it uses the
+		 * dn_dirty_link[] of this txg.
+		 */
+		list_create(newlist, sizeof (dnode_t),
+		    offsetof(dnode_t, dn_dirty_link[txgoff]));
+	}
+
+	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
+	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
 
 	list = &os->os_meta_dnode->dn_dirty_records[txgoff];
 	while (dr = list_head(list)) {
@@ -945,6 +1017,146 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
 	zio_nowait(zio);
 }
 
+static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
+
+void
+dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
+{
+	used_cbs[ost] = cb;
+}
+
+boolean_t
+dmu_objset_userused_enabled(objset_impl_t *os)
+{
+	return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
+	    used_cbs[os->os_phys->os_type] &&
+	    os->os_userused_dnode);
+}
+
+void
+dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	list_t *list = &os->os_synced_dnodes;
+	static const char zerobuf[DN_MAX_BONUSLEN] = {0};
+
+	ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
+
+	while (dn = list_head(list)) {
+		dmu_object_type_t bonustype;
+
+		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
+		ASSERT(dn->dn_oldphys);
+		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
+		    dn->dn_phys->dn_flags &
+		    DNODE_FLAG_USERUSED_ACCOUNTED);
+
+		/* Allocate the user/groupused objects if necessary. */
+		if (os->os_userused_dnode->dn_type == DMU_OT_NONE) {
+			VERIFY(0 == zap_create_claim(&os->os,
+			    DMU_USERUSED_OBJECT,
+			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
+			VERIFY(0 == zap_create_claim(&os->os,
+			    DMU_GROUPUSED_OBJECT,
+			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
+		}
+
+		/*
+		 * If the object was not previously
+		 * accounted, pretend that it was free.
+		 */
+		if (!(dn->dn_oldphys->dn_flags &
+		    DNODE_FLAG_USERUSED_ACCOUNTED)) {
+			bzero(dn->dn_oldphys, sizeof (dnode_phys_t));
+		}
+
+		/*
+		 * If the object was freed, use the previous bonustype.
+		 */
+		bonustype = dn->dn_phys->dn_bonustype ?
+		    dn->dn_phys->dn_bonustype : dn->dn_oldphys->dn_bonustype;
+		ASSERT(dn->dn_phys->dn_type != 0 ||
+		    (bcmp(DN_BONUS(dn->dn_phys), zerobuf,
+		    DN_MAX_BONUSLEN) == 0 &&
+		    DN_USED_BYTES(dn->dn_phys) == 0));
+		ASSERT(dn->dn_oldphys->dn_type != 0 ||
+		    (bcmp(DN_BONUS(dn->dn_oldphys), zerobuf,
+		    DN_MAX_BONUSLEN) == 0 &&
+		    DN_USED_BYTES(dn->dn_oldphys) == 0));
+		used_cbs[os->os_phys->os_type](&os->os, bonustype,
+		    DN_BONUS(dn->dn_oldphys), DN_BONUS(dn->dn_phys),
+		    DN_USED_BYTES(dn->dn_oldphys),
+		    DN_USED_BYTES(dn->dn_phys), tx);
+
+		/*
+		 * The mutex is needed here for interlock with dnode_allocate.
+		 */
+		mutex_enter(&dn->dn_mtx);
+		zio_buf_free(dn->dn_oldphys, sizeof (dnode_phys_t));
+		dn->dn_oldphys = NULL;
+		mutex_exit(&dn->dn_mtx);
+
+		list_remove(list, dn);
+		dnode_rele(dn, list);
+	}
+}
+
+boolean_t
+dmu_objset_userspace_present(objset_t *os)
+{
+	return (os->os->os_phys->os_flags &
+	    OBJSET_FLAG_USERACCOUNTING_COMPLETE);
+}
+
+int
+dmu_objset_userspace_upgrade(objset_t *os)
+{
+	uint64_t obj;
+	int err = 0;
+
+	if (dmu_objset_userspace_present(os))
+		return (0);
+	if (!dmu_objset_userused_enabled(os->os))
+		return (ENOTSUP);
+	if (dmu_objset_is_snapshot(os))
+		return (EINVAL);
+
+	/*
+	 * We simply need to mark every object dirty, so that it will be
+	 * synced out and now accounted.  If this is called
+	 * concurrently, or if we already did some work before crashing,
+	 * that's fine, since we track each object's accounted state
+	 * independently.
+	 */
+
+	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
+		dmu_tx_t *tx;
+		dmu_buf_t *db;
+		int objerr;
+
+		if (issig(JUSTLOOKING) && issig(FORREAL))
+			return (EINTR);
+
+		objerr = dmu_bonus_hold(os, obj, FTAG, &db);
+		if (objerr)
+			continue;
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_bonus(tx, obj);
+		objerr = dmu_tx_assign(tx, TXG_WAIT);
+		if (objerr) {
+			dmu_tx_abort(tx);
+			continue;
+		}
+		dmu_buf_will_dirty(db, tx);
+		dmu_buf_rele(db, FTAG);
+		dmu_tx_commit(tx);
+	}
+
+	os->os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+	txg_wait_synced(dmu_objset_pool(os), 0);
+	return (0);
+}
+
 void
 dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
@@ -978,6 +1190,8 @@ dmu_objset_stats(objset_t *os, nvlist_t *nv)
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
 	    os->os->os_phys->os_type);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
+	    dmu_objset_userspace_present(os));
 }
 
 int
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
index 6effae8..ed5afb4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -180,7 +180,9 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
 	if (issig(JUSTLOOKING) && issig(FORREAL))
 		return (EINTR);
 
-	if (bp == NULL && zb->zb_object == 0) {
+	if (zb->zb_object != 0 && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
+		return (0);
+	} else if (bp == NULL && zb->zb_object == 0) {
 		uint64_t span = BP_SPAN(dnp, zb->zb_level);
 		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
 		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
index ef0284d..89cbfad 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
@@ -64,6 +64,9 @@ struct traverse_data {
 	void *td_arg;
 };
 
+static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
+    arc_buf_t *buf, uint64_t objset, uint64_t object);
+
 /* ARGSUSED */
 static void
 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
@@ -119,7 +122,7 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh)
 	 * We only want to visit blocks that have been claimed but not yet
 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
 	 */
-	if (claim_txg == 0 && (spa_mode & FWRITE))
+	if (claim_txg == 0 && spa_writeable(td->td_spa))
 		return;
 
 	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
@@ -189,7 +192,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		uint32_t flags = ARC_WAIT;
-		int i, j;
+		int i;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 
 		err = arc_read(NULL, td->td_spa, bp, pbuf,
@@ -201,20 +204,15 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 		/* recursively visitbp() blocks below this */
 		dnp = buf->b_data;
 		for (i = 0; i < epb && err == 0; i++, dnp++) {
-			for (j = 0; j < dnp->dn_nblkptr; j++) {
-				SET_BOOKMARK(&czb, zb->zb_objset,
-				    zb->zb_blkid * epb + i,
-				    dnp->dn_nlevels - 1, j);
-				err = traverse_visitbp(td, dnp, buf,
-				    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
-				if (err)
-					break;
-			}
+			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+			    zb->zb_blkid * epb + i);
+			if (err)
+				break;
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		uint32_t flags = ARC_WAIT;
 		objset_phys_t *osp;
-		int j;
+		dnode_phys_t *dnp;
 
 		err = arc_read_nolock(NULL, td->td_spa, bp,
 		    arc_getbuf_func, &buf,
@@ -225,14 +223,17 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 		osp = buf->b_data;
 		traverse_zil(td, &osp->os_zil_header);
 
-		for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
-			SET_BOOKMARK(&czb, zb->zb_objset, 0,
-			    osp->os_meta_dnode.dn_nlevels - 1, j);
-			err = traverse_visitbp(td, &osp->os_meta_dnode, buf,
-			    (blkptr_t *)&osp->os_meta_dnode.dn_blkptr[j],
-			    &czb);
-			if (err)
-				break;
+		dnp = &osp->os_meta_dnode;
+		err = traverse_dnode(td, dnp, buf, zb->zb_objset, 0);
+		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+			dnp = &osp->os_userused_dnode;
+			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+			    DMU_USERUSED_OBJECT);
+		}
+		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+			dnp = &osp->os_groupused_dnode;
+			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+			    DMU_GROUPUSED_OBJECT);
 		}
 	}
 
@@ -245,6 +246,23 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 	return (err);
 }
 
+static int
+traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
+    arc_buf_t *buf, uint64_t objset, uint64_t object)
+{
+	int j, err = 0;
+	zbookmark_t czb;
+
+	for (j = 0; j < dnp->dn_nblkptr; j++) {
+		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
+		err = traverse_visitbp(td, dnp, buf,
+		    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
+		if (err)
+			break;
+	}
+	return (err);
+}
+
 /* ARGSUSED */
 static int
 traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
index bfa5699..b6a5cdb 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -160,6 +160,41 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 	return (err);
 }
 
+static void
+dmu_tx_count_indirects(dmu_tx_hold_t *txh, dmu_buf_impl_t *db,
+    boolean_t freeable, dmu_buf_impl_t **history)
+{
+	int i = db->db_level + 1;
+	dnode_t *dn = db->db_dnode;
+
+	if (i >= dn->dn_nlevels)
+		return;
+
+	db = db->db_parent;
+	if (db == NULL) {
+		uint64_t lvls = dn->dn_nlevels - i;
+
+		txh->txh_space_towrite += lvls << dn->dn_indblkshift;
+		return;
+	}
+
+	if (db != history[i]) {
+		dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+		uint64_t space = 1ULL << dn->dn_indblkshift;
+
+		freeable = (db->db_blkptr && (freeable ||
+		    dsl_dataset_block_freeable(ds, db->db_blkptr->blk_birth)));
+		if (freeable)
+			txh->txh_space_tooverwrite += space;
+		else
+			txh->txh_space_towrite += space;
+		if (db->db_blkptr)
+			txh->txh_space_tounref += space;
+		history[i] = db;
+		dmu_tx_count_indirects(txh, db, freeable, history);
+	}
+}
+
 /* ARGSUSED */
 static void
 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
@@ -177,17 +212,26 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 	min_ibs = DN_MIN_INDBLKSHIFT;
 	max_ibs = DN_MAX_INDBLKSHIFT;
 
+	if (dn) {
+		dmu_buf_impl_t *last[DN_MAX_LEVELS];
+		int nlvls = dn->dn_nlevels;
+		int delta;
 
-	/*
-	 * For i/o error checking, read the first and last level-0
-	 * blocks (if they are not aligned), and all the level-1 blocks.
-	 */
+		/*
+		 * For i/o error checking, read the first and last level-0
+		 * blocks (if they are not aligned), and all the level-1 blocks.
+		 */
 
-	if (dn) {
 		if (dn->dn_maxblkid == 0) {
-			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
-			if (err)
-				goto out;
+			delta = dn->dn_datablksz;
+			start = (off < dn->dn_datablksz) ? 0 : 1;
+			end = (off+len <= dn->dn_datablksz) ? 0 : 1;
+			if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
+				err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+				if (err)
+					goto out;
+				delta -= off;
+			}
 		} else {
 			zio_t *zio = zio_root(dn->dn_objset->os_spa,
 			    NULL, NULL, ZIO_FLAG_CANFAIL);
@@ -211,10 +255,9 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 			}
 
 			/* level-1 blocks */
-			if (dn->dn_nlevels > 1) {
-				start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-				end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-				for (i = start+1; i < end; i++) {
+			if (nlvls > 1) {
+				int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+				for (i = (start>>shft)+1; i < end>>shft; i++) {
 					err = dmu_tx_check_ioerr(zio, dn, 1, i);
 					if (err)
 						goto out;
@@ -224,20 +267,70 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 			err = zio_wait(zio);
 			if (err)
 				goto out;
+			delta = P2NPHASE(off, dn->dn_datablksz);
 		}
-	}
 
-	/*
-	 * If there's more than one block, the blocksize can't change,
-	 * so we can make a more precise estimate.  Alternatively,
-	 * if the dnode's ibs is larger than max_ibs, always use that.
-	 * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
-	 * the code will still work correctly on existing pools.
-	 */
-	if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
-		min_ibs = max_ibs = dn->dn_indblkshift;
-		if (dn->dn_datablkshift != 0)
+		if (dn->dn_maxblkid > 0) {
+			/*
+			 * The blocksize can't change,
+			 * so we can make a more precise estimate.
+			 */
+			ASSERT(dn->dn_datablkshift != 0);
 			min_bs = max_bs = dn->dn_datablkshift;
+			min_ibs = max_ibs = dn->dn_indblkshift;
+		} else if (dn->dn_indblkshift > max_ibs) {
+			/*
+			 * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
+			 * the code will still work correctly on older pools.
+			 */
+			min_ibs = max_ibs = dn->dn_indblkshift;
+		}
+
+		/*
+		 * If this write is not off the end of the file
+		 * we need to account for overwrites/unref.
+		 */
+		if (start <= dn->dn_maxblkid)
+			bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS);
+		while (start <= dn->dn_maxblkid) {
+			spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
+			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+			dmu_buf_impl_t *db;
+
+			rw_enter(&dn->dn_struct_rwlock, RW_READER);
+			db = dbuf_hold_level(dn, 0, start, FTAG);
+			rw_exit(&dn->dn_struct_rwlock);
+			if (db->db_blkptr && dsl_dataset_block_freeable(ds,
+			    db->db_blkptr->blk_birth)) {
+				dprintf_bp(db->db_blkptr, "can free old%s", "");
+				txh->txh_space_tooverwrite += dn->dn_datablksz;
+				txh->txh_space_tounref += dn->dn_datablksz;
+				dmu_tx_count_indirects(txh, db, TRUE, last);
+			} else {
+				txh->txh_space_towrite += dn->dn_datablksz;
+				if (db->db_blkptr)
+					txh->txh_space_tounref +=
+					    bp_get_dasize(spa, db->db_blkptr);
+				dmu_tx_count_indirects(txh, db, FALSE, last);
+			}
+			dbuf_rele(db, FTAG);
+			if (++start > end) {
+				/*
+				 * Account for new indirects appearing
+				 * before this IO gets assigned into a txg.
+				 */
+				bits = 64 - min_bs;
+				epbs = min_ibs - SPA_BLKPTRSHIFT;
+				for (bits -= epbs * (nlvls - 1);
+				    bits >= 0; bits -= epbs)
+					txh->txh_fudge += 1ULL << max_ibs;
+				goto out;
+			}
+			off += delta;
+			if (len >= delta)
+				len -= delta;
+			delta = dn->dn_datablksz;
+		}
 	}
 
 	/*
@@ -260,20 +353,22 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
 		start >>= epbs;
 		end >>= epbs;
-		/*
-		 * If we increase the number of levels of indirection,
-		 * we'll need new blkid=0 indirect blocks.  If start == 0,
-		 * we're already accounting for that blocks; and if end == 0,
-		 * we can't increase the number of levels beyond that.
-		 */
-		if (start != 0 && end != 0)
-			txh->txh_space_towrite += 1ULL << max_ibs;
+		ASSERT3U(end, >=, start);
 		txh->txh_space_towrite += (end - start + 1) << max_ibs;
+		if (start != 0) {
+			/*
+			 * We also need a new blkid=0 indirect block
+			 * to reference any existing file data.
+			 */
+			txh->txh_space_towrite += 1ULL << max_ibs;
+		}
 	}
 
-	ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS);
-
 out:
+	if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
+	    2 * DMU_MAX_ACCESS)
+		err = EFBIG;
+
 	if (err)
 		txh->txh_tx->tx_err = err;
 }
@@ -290,6 +385,7 @@ dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 	    dn->dn_dbuf->db_blkptr->blk_birth)) {
 		txh->txh_space_tooverwrite += space;
+		txh->txh_space_tounref += space;
 	} else {
 		txh->txh_space_towrite += space;
 		if (dn && dn->dn_dbuf->db_blkptr)
@@ -533,7 +629,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 }
 
 void
-dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 {
 	dmu_tx_hold_t *txh;
 	dnode_t *dn;
@@ -601,12 +697,8 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
 		}
 	}
 
-	/*
-	 * 3 blocks overwritten: target leaf, ptrtbl block, header block
-	 * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks
-	 */
-	dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz,
-	    (3 + (add ? 3 : 0)) << dn->dn_datablkshift);
+	err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add,
+	    &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 
 	/*
 	 * If the modified blocks are scattered to the four winds,
@@ -614,7 +706,10 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
 	 */
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 	for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
-		txh->txh_space_towrite += 3 << dn->dn_indblkshift;
+		if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
+			txh->txh_space_towrite += 3 << dn->dn_indblkshift;
+		else
+			txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
 }
 
 void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
index f0b4080..f9661d6 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
@@ -156,7 +156,7 @@ dnode_verify(dnode_t *dn)
 	}
 	if (dn->dn_phys->dn_type != DMU_OT_NONE)
 		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
-	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL);
+	ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
 	if (dn->dn_dbuf != NULL) {
 		ASSERT3P(dn->dn_phys, ==,
 		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
@@ -320,6 +320,7 @@ dnode_destroy(dnode_t *dn)
 	}
 	ASSERT(NULL == list_head(&dn->dn_dbufs));
 #endif
+	ASSERT(dn->dn_oldphys == NULL);
 
 	mutex_enter(&os->os_lock);
 	list_remove(&os->os_dnodes, dn);
@@ -550,6 +551,22 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
 	 */
 	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0);
 
+	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
+		dn = (object == DMU_USERUSED_OBJECT) ?
+		    os->os_userused_dnode : os->os_groupused_dnode;
+		if (dn == NULL)
+			return (ENOENT);
+		type = dn->dn_type;
+		if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
+			return (ENOENT);
+		if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
+			return (EEXIST);
+		DNODE_VERIFY(dn);
+		(void) refcount_add(&dn->dn_holds, tag);
+		*dnp = dn;
+		return (0);
+	}
+
 	if (object == 0 || object >= DN_MAX_OBJECT)
 		return (EINVAL);
 
@@ -608,7 +625,8 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
 	type = dn->dn_type;
 	if (dn->dn_free_txg ||
 	    ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
-	    ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) {
+	    ((flag & DNODE_MUST_BE_FREE) &&
+	    (type != DMU_OT_NONE || dn->dn_oldphys))) {
 		mutex_exit(&dn->dn_mtx);
 		dbuf_rele(db, FTAG);
 		return (type == DMU_OT_NONE ? ENOENT : EEXIST);
@@ -673,8 +691,10 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
 	objset_impl_t *os = dn->dn_objset;
 	uint64_t txg = tx->tx_txg;
 
-	if (dn->dn_object == DMU_META_DNODE_OBJECT)
+	if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+		dsl_dataset_dirty(os->os_dsl_dataset, tx);
 		return;
+	}
 
 	DNODE_VERIFY(dn);
 
@@ -1270,7 +1290,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 	dprintf("probing object %llu offset %llx level %d of %u\n",
 	    dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
 
-	hole = flags & DNODE_FIND_HOLE;
+	hole = ((flags & DNODE_FIND_HOLE) != 0);
 	inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
 	ASSERT(txg == 0 || !hole);
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
index 1b729e3..3bf0c81 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
@@ -506,9 +506,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
 
 /*
  * Write out the dnode's dirty buffers.
- *
- * NOTE: The dnode is kept in memory by being dirty.  Once the
- * dirty bit is cleared, it may be evicted.  Beware of this!
  */
 void
 dnode_sync(dnode_t *dn, dmu_tx_t *tx)
@@ -517,20 +514,33 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 	dnode_phys_t *dnp = dn->dn_phys;
 	int txgoff = tx->tx_txg & TXG_MASK;
 	list_t *list = &dn->dn_dirty_records[txgoff];
+	static const dnode_phys_t zerodn = { 0 };
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
+	ASSERT(dnp->dn_type != DMU_OT_NONE ||
+	    bcmp(dnp, &zerodn, DNODE_SIZE) == 0);
 	DNODE_VERIFY(dn);
 
 	ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
 
+	if (dmu_objset_userused_enabled(dn->dn_objset) &&
+	    !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+		ASSERT(dn->dn_oldphys == NULL);
+		dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t));
+		*dn->dn_oldphys = *dn->dn_phys; /* struct assignment */
+		dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
+	} else {
+		/* Once we account for it, we should always account for it. */
+		ASSERT(!(dn->dn_phys->dn_flags &
+		    DNODE_FLAG_USERUSED_ACCOUNTED));
+	}
+
 	mutex_enter(&dn->dn_mtx);
 	if (dn->dn_allocated_txg == tx->tx_txg) {
 		/* The dnode is newly allocated or reallocated */
 		if (dnp->dn_type == DMU_OT_NONE) {
 			/* this is a first alloc, not a realloc */
-			/* XXX shouldn't the phys already be zeroed? */
-			bzero(dnp, DNODE_CORE_SIZE);
 			dnp->dn_nlevels = 1;
 			dnp->dn_nblkptr = dn->dn_nblkptr;
 		}
@@ -628,7 +638,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 
 	dbuf_sync_list(list, tx);
 
-	if (dn->dn_object != DMU_META_DNODE_OBJECT) {
+	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 		ASSERT3P(list_head(list), ==, NULL);
 		dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
 	}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
index 622fa5d..ac9d67f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -229,7 +229,7 @@ dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
 	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
 }
 
-int
+boolean_t
 dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
 {
 	return (blk_birth > dsl_dataset_prev_snap_txg(ds));
@@ -525,7 +525,15 @@ dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
 			rw_enter(&dp->dp_config_rwlock, RW_READER);
 			return (ENOENT);
 		}
+		/*
+		 * The dp_config_rwlock lives above the ds_lock. And
+		 * we need to check DSL_DATASET_IS_DESTROYED() while
+		 * holding the ds_lock, so we have to drop and reacquire
+		 * the ds_lock here.
+		 */
+		mutex_exit(&ds->ds_lock);
 		rw_enter(&dp->dp_config_rwlock, RW_READER);
+		mutex_enter(&ds->ds_lock);
 	}
 	mutex_exit(&ds->ds_lock);
 	return (0);
@@ -981,6 +989,27 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
 		(void) dmu_free_object(os, obj);
 	}
 
+	/*
+	 * We need to sync out all in-flight IO before we try to evict
+	 * (the dataset evict func is trying to clear the cached entries
+	 * for this dataset in the ARC).
+	 */
+	txg_wait_synced(dd->dd_pool, 0);
+
+	/*
+	 * If we managed to free all the objects in open
+	 * context, the user space accounting should be zero.
+	 */
+	if (ds->ds_phys->ds_bp.blk_fill == 0 &&
+	    dmu_objset_userused_enabled(os->os)) {
+		uint64_t count;
+
+		ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
+		    count == 0);
+		ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 ||
+		    count == 0);
+	}
+
 	dmu_objset_close(os);
 	if (err != ESRCH)
 		goto out;
@@ -1065,7 +1094,6 @@ dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
 	return (ds->ds_user_ptr);
 }
 
-
 blkptr_t *
 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
 {
@@ -1445,6 +1473,33 @@ dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
 	cv_destroy(&arg.cv);
 }
 
+static void
+remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
+{
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	uint64_t count;
+	int err;
+
+	ASSERT(ds->ds_phys->ds_num_children >= 2);
+	err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
+	/*
+	 * The err should not be ENOENT, but a bug in a previous version
+	 * of the code could cause upgrade_clones_cb() to not set
+	 * ds_next_snap_obj when it should, leading to a missing entry.
+	 * If we knew that the pool was created after
+	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
+	 * ENOENT.  However, at least we can check that we don't have
+	 * too many entries in the next_clones_obj even after failing to
+	 * remove this one.
+	 */
+	if (err != ENOENT) {
+		VERIFY3U(err, ==, 0);
+	}
+	ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
+	    &count));
+	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
+}
+
 void
 dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 {
@@ -1495,8 +1550,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
 		if (after_branch_point &&
 		    ds_prev->ds_phys->ds_next_clones_obj != 0) {
-			VERIFY(0 == zap_remove_int(mos,
-			    ds_prev->ds_phys->ds_next_clones_obj, obj, tx));
+			remove_from_next_clones(ds_prev, obj, tx);
 			if (ds->ds_phys->ds_next_snap_obj != 0) {
 				VERIFY(0 == zap_add_int(mos,
 				    ds_prev->ds_phys->ds_next_clones_obj,
@@ -1852,8 +1906,8 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 			    ds->ds_prev->ds_phys->ds_creation_txg);
 			ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
 		} else if (next_clones_obj != 0) {
-			VERIFY3U(0, ==, zap_remove_int(mos,
-			    next_clones_obj, dsphys->ds_next_snap_obj, tx));
+			remove_from_next_clones(ds->ds_prev,
+			    dsphys->ds_next_snap_obj, tx);
 			VERIFY3U(0, ==, zap_add_int(mos,
 			    next_clones_obj, dsobj, tx));
 		}
@@ -1962,6 +2016,9 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
 	if (ds->ds_phys->ds_next_snap_obj) {
 		stat->dds_is_snapshot = B_TRUE;
 		stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
+	} else {
+		stat->dds_is_snapshot = B_FALSE;
+		stat->dds_num_clones = 0;
 	}
 
 	/* clone origin is really a dsl_dir thing... */
@@ -1973,6 +2030,8 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
 		    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
 		dsl_dataset_name(ods, stat->dds_origin);
 		dsl_dataset_drop_ref(ods, FTAG);
+	} else {
+		stat->dds_origin[0] = '\0';
 	}
 	rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
 }
@@ -2439,9 +2498,7 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 
 	/* change the origin's next clone */
 	if (origin_ds->ds_phys->ds_next_clones_obj) {
-		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
-		    origin_ds->ds_phys->ds_next_clones_obj,
-		    origin_ds->ds_phys->ds_next_snap_obj, tx));
+		remove_from_next_clones(origin_ds, snap->ds->ds_object, tx);
 		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
 		    origin_ds->ds_phys->ds_next_clones_obj,
 		    oldnext_obj, tx));
@@ -3039,12 +3096,8 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 	dsl_dataset_t *ds = arg1;
 	uint64_t *reservationp = arg2;
 	uint64_t new_reservation = *reservationp;
-	int64_t delta;
 	uint64_t unique;
 
-	if (new_reservation > INT64_MAX)
-		return (EOVERFLOW);
-
 	if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
 	    SPA_VERSION_REFRESERVATION)
 		return (ENOTSUP);
@@ -3061,15 +3114,18 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 
 	mutex_enter(&ds->ds_lock);
 	unique = dsl_dataset_unique(ds);
-	delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved);
 	mutex_exit(&ds->ds_lock);
 
-	if (delta > 0 &&
-	    delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
-		return (ENOSPC);
-	if (delta > 0 && ds->ds_quota > 0 &&
-	    new_reservation > ds->ds_quota)
-		return (ENOSPC);
+	if (MAX(unique, new_reservation) > MAX(unique, ds->ds_reserved)) {
+		uint64_t delta = MAX(unique, new_reservation) -
+		    MAX(unique, ds->ds_reserved);
+
+		if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
+			return (ENOSPC);
+		if (ds->ds_quota > 0 &&
+		    new_reservation > ds->ds_quota)
+			return (ENOSPC);
+	}
 
 	return (0);
 }
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
index 96b5005..2f312ae 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -226,24 +226,11 @@ dsl_dir_namelen(dsl_dir_t *dd)
 	return (result);
 }
 
-int
-dsl_dir_is_private(dsl_dir_t *dd)
-{
-	int rv = FALSE;
-
-	if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent))
-		rv = TRUE;
-	if (dataset_name_hidden(dd->dd_myname))
-		rv = TRUE;
-	return (rv);
-}
-
-
 static int
 getcomponent(const char *path, char *component, const char **nextp)
 {
 	char *p;
-	if (path == NULL)
+	if ((path == NULL) || (path[0] == '\0'))
 		return (ENOENT);
 	/* This would be a good place to reserve some namespace... */
 	p = strpbrk(path, "/@");
@@ -1076,10 +1063,6 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 	uint64_t *reservationp = arg2;
 	uint64_t new_reservation = *reservationp;
 	uint64_t used, avail;
-	int64_t delta;
-
-	if (new_reservation > INT64_MAX)
-		return (EOVERFLOW);
 
 	/*
 	 * If we are doing the preliminary check in open context, the
@@ -1090,8 +1073,6 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 
 	mutex_enter(&dd->dd_lock);
 	used = dd->dd_phys->dd_used_bytes;
-	delta = MAX(used, new_reservation) -
-	    MAX(used, dd->dd_phys->dd_reserved);
 	mutex_exit(&dd->dd_lock);
 
 	if (dd->dd_parent) {
@@ -1101,11 +1082,17 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 		avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
 	}
 
-	if (delta > 0 && delta > avail)
-		return (ENOSPC);
-	if (delta > 0 && dd->dd_phys->dd_quota > 0 &&
-	    new_reservation > dd->dd_phys->dd_quota)
-		return (ENOSPC);
+	if (MAX(used, new_reservation) > MAX(used, dd->dd_phys->dd_reserved)) {
+		uint64_t delta = MAX(used, new_reservation) -
+		    MAX(used, dd->dd_phys->dd_reserved);
+
+		if (delta > avail)
+			return (ENOSPC);
+		if (dd->dd_phys->dd_quota > 0 &&
+		    new_reservation > dd->dd_phys->dd_quota)
+			return (ENOSPC);
+	}
+
 	return (0);
 }
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
index e5823c5..0f00bc96 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -133,14 +133,15 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 			goto out;
 		err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
 		    FTAG, &ds);
+		if (err == 0) {
+			err = dsl_dataset_hold_obj(dp,
+			     ds->ds_phys->ds_prev_snap_obj, dp,
+			     &dp->dp_origin_snap);
+			dsl_dataset_rele(ds, FTAG);
+		}
+		dsl_dir_close(dd, dp);
 		if (err)
 			goto out;
-		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
-		    dp, &dp->dp_origin_snap);
-		if (err)
-			goto out;
-		dsl_dataset_rele(ds, FTAG);
-		dsl_dir_close(dd, dp);
 	}
 
 	/* get scrub status */
@@ -303,23 +304,51 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 
 	dp->dp_read_overhead = 0;
 	start = gethrtime();
+
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
-		if (!list_link_active(&ds->ds_synced_link))
-			list_insert_tail(&dp->dp_synced_datasets, ds);
-		else
-			dmu_buf_rele(ds->ds_dbuf, ds);
+		/*
+		 * We must not sync any non-MOS datasets twice, because
+		 * we may have taken a snapshot of them.  However, we
+		 * may sync newly-created datasets on pass 2.
+		 */
+		ASSERT(!list_link_active(&ds->ds_synced_link));
+		list_insert_tail(&dp->dp_synced_datasets, ds);
 		dsl_dataset_sync(ds, zio, tx);
 	}
 	DTRACE_PROBE(pool_sync__1setup);
-
 	err = zio_wait(zio);
+
 	write_time = gethrtime() - start;
 	ASSERT(err == 0);
 	DTRACE_PROBE(pool_sync__2rootzio);
 
-	while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
+	for (ds = list_head(&dp->dp_synced_datasets); ds;
+	    ds = list_next(&dp->dp_synced_datasets, ds))
+		dmu_objset_do_userquota_callbacks(ds->ds_user_ptr, tx);
+
+	/*
+	 * Sync the datasets again to push out the changes due to
+	 * userquota updates.  This must be done before we process the
+	 * sync tasks, because that could cause a snapshot of a dataset
+	 * whose ds_bp will be rewritten when we do this 2nd sync.
+	 */
+	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+		ASSERT(list_link_active(&ds->ds_synced_link));
+		dmu_buf_rele(ds->ds_dbuf, ds);
+		dsl_dataset_sync(ds, zio, tx);
+	}
+	err = zio_wait(zio);
+
+	while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) {
+		/*
+		 * No more sync tasks should have been added while we
+		 * were syncing.
+		 */
+		ASSERT(spa_sync_pass(dp->dp_spa) == 1);
 		dsl_sync_task_group_sync(dstg, tx);
+	}
 	DTRACE_PROBE(pool_sync__3task);
 
 	start = gethrtime();
@@ -574,6 +603,7 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
 	ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);
 
 	if (prev->ds_phys->ds_next_clones_obj == 0) {
+		dmu_buf_will_dirty(prev->ds_dbuf, tx);
 		prev->ds_phys->ds_next_clones_obj =
 		    zap_create(dp->dp_meta_objset,
 		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
@@ -593,8 +623,8 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dp->dp_origin_snap != NULL);
 
-	(void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
-	    tx, DS_FIND_CHILDREN);
+	VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
+	    tx, DS_FIND_CHILDREN));
 }
 
 void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
index 212acbb..d064932 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
@@ -416,6 +414,34 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 }
 
 void
+dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = arg1;
+	nvlist_t *nvl = arg2;
+	nvpair_t *elem = NULL;
+
+	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+		struct prop_set_arg psa;
+
+		psa.name = nvpair_name(elem);
+
+		if (nvpair_type(elem) == DATA_TYPE_STRING) {
+			VERIFY(nvpair_value_string(elem,
+			    (char **)&psa.buf) == 0);
+			psa.intsz = 1;
+			psa.numints = strlen(psa.buf) + 1;
+		} else {
+			uint64_t intval;
+			VERIFY(nvpair_value_uint64(elem, &intval) == 0);
+			psa.intsz = sizeof (intval);
+			psa.numints = 1;
+			psa.buf = &intval;
+		}
+		dsl_prop_set_sync(ds, &psa, cr, tx);
+	}
+}
+
+void
 dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
     cred_t *cr, dmu_tx_t *tx)
 {
@@ -471,6 +497,43 @@ dsl_prop_set(const char *dsname, const char *propname,
 	return (err);
 }
 
+int
+dsl_props_set(const char *dsname, nvlist_t *nvl)
+{
+	dsl_dataset_t *ds;
+	nvpair_t *elem = NULL;
+	int err;
+
+	/*
+	 * Do these checks before the syncfunc, since it can't fail.
+	 */
+	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+		if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN)
+			return (ENAMETOOLONG);
+		if (nvpair_type(elem) == DATA_TYPE_STRING) {
+			char *valstr;
+			VERIFY(nvpair_value_string(elem, &valstr) == 0);
+			if (strlen(valstr) >= ZAP_MAXVALUELEN)
+				return (E2BIG);
+		}
+	}
+
+	if (err = dsl_dataset_hold(dsname, FTAG, &ds))
+		return (err);
+
+	if (dsl_dataset_is_snapshot(ds) &&
+	    spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) {
+		dsl_dataset_rele(ds, FTAG);
+		return (ENOTSUP);
+	}
+
+	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+	    NULL, dsl_props_set_sync, ds, nvl, 2);
+
+	dsl_dataset_rele(ds, FTAG);
+	return (err);
+}
+
 /*
  * Iterate over all properties for this dataset and return them in an nvlist.
  */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c
index 84561ab..d11f106 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -45,6 +45,8 @@ typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
 
 static scrub_cb_t dsl_pool_scrub_clean_cb;
 static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
+static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
+    uint64_t objset, uint64_t object);
 
 int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */
 int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */
@@ -95,6 +97,9 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 			    ESC_ZFS_RESILVER_START);
 			dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
 			    tx->tx_txg);
+		} else {
+			spa_event_notify(dp->dp_spa, NULL,
+			    ESC_ZFS_SCRUB_START);
 		}
 
 		/* zero out the scrub stats in all vdev_stat_t's */
@@ -212,8 +217,9 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	 */
 	vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
 	    *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
-	if (dp->dp_scrub_min_txg && *completep)
-		spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH);
+	if (*completep)
+		spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ?
+		    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
 	spa_errlog_rotate(dp->dp_spa);
 
 	/*
@@ -402,7 +408,7 @@ traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
 	 * We only want to visit blocks that have been claimed but not yet
 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
 	 */
-	if (claim_txg == 0 && (spa_mode & FWRITE))
+	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
 		return;
 
 	zilog = zil_alloc(dp->dp_meta_objset, zh);
@@ -420,9 +426,6 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
 	int err;
 	arc_buf_t *buf = NULL;
 
-	if (bp->blk_birth == 0)
-		return;
-
 	if (bp->blk_birth <= dp->dp_scrub_min_txg)
 		return;
 
@@ -482,7 +485,7 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		uint32_t flags = ARC_WAIT;
 		dnode_phys_t *child_dnp;
-		int i, j;
+		int i;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 
 		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
@@ -497,20 +500,12 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
 		child_dnp = buf->b_data;
 
 		for (i = 0; i < epb; i++, child_dnp++) {
-			for (j = 0; j < child_dnp->dn_nblkptr; j++) {
-				zbookmark_t czb;
-
-				SET_BOOKMARK(&czb, zb->zb_objset,
-				    zb->zb_blkid * epb + i,
-				    child_dnp->dn_nlevels - 1, j);
-				scrub_visitbp(dp, child_dnp, buf,
-				    &child_dnp->dn_blkptr[j], &czb);
-			}
+			scrub_visitdnode(dp, child_dnp, buf, zb->zb_objset,
+			    zb->zb_blkid * epb + i);
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		uint32_t flags = ARC_WAIT;
 		objset_phys_t *osp;
-		int j;
 
 		err = arc_read_nolock(NULL, dp->dp_spa, bp,
 		    arc_getbuf_func, &buf,
@@ -526,13 +521,13 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
 
 		traverse_zil(dp, &osp->os_zil_header);
 
-		for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
-			zbookmark_t czb;
-
-			SET_BOOKMARK(&czb, zb->zb_objset, 0,
-			    osp->os_meta_dnode.dn_nlevels - 1, j);
-			scrub_visitbp(dp, &osp->os_meta_dnode, buf,
-			    &osp->os_meta_dnode.dn_blkptr[j], &czb);
+		scrub_visitdnode(dp, &osp->os_meta_dnode,
+		    buf, zb->zb_objset, 0);
+		if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+			scrub_visitdnode(dp, &osp->os_userused_dnode,
+			    buf, zb->zb_objset, 0);
+			scrub_visitdnode(dp, &osp->os_groupused_dnode,
+			    buf, zb->zb_objset, 0);
 		}
 	}
 
@@ -542,6 +537,21 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
 }
 
 static void
+scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
+    uint64_t objset, uint64_t object)
+{
+	int j;
+
+	for (j = 0; j < dnp->dn_nblkptr; j++) {
+		zbookmark_t czb;
+
+		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
+		scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
+	}
+
+}
+
+static void
 scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
 {
 	zbookmark_t zb;
@@ -688,17 +698,34 @@ scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx)
 		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
 	}
 	if (ds->ds_phys->ds_num_children > 1) {
-		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+		boolean_t usenext = B_FALSE;
+		if (ds->ds_phys->ds_next_clones_obj != 0) {
+			uint64_t count;
+			/*
+			 * A bug in a previous version of the code could
+			 * cause upgrade_clones_cb() to not set
+			 * ds_next_snap_obj when it should, leading to a
+			 * missing entry.  Therefore we can only use the
+			 * next_clones_obj when its count is correct.
+			 */
+			int err = zap_count(dp->dp_meta_objset,
+			    ds->ds_phys->ds_next_clones_obj, &count);
+			if (err == 0 &&
+			    count == ds->ds_phys->ds_num_children - 1)
+				usenext = B_TRUE;
+		}
+
+		if (usenext) {
+			VERIFY(zap_join(dp->dp_meta_objset,
+			    ds->ds_phys->ds_next_clones_obj,
+			    dp->dp_scrub_queue_obj, tx) == 0);
+		} else {
 			struct enqueue_clones_arg eca;
 			eca.tx = tx;
 			eca.originobj = ds->ds_object;
 
 			(void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
 			    NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
-		} else {
-			VERIFY(zap_join(dp->dp_meta_objset,
-			    ds->ds_phys->ds_next_clones_obj,
-			    dp->dp_scrub_queue_obj, tx) == 0);
 		}
 	}
 
@@ -751,6 +778,7 @@ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
 void
 dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
+	spa_t *spa = dp->dp_spa;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	boolean_t complete = B_TRUE;
@@ -758,8 +786,10 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
 		return;
 
-	/* If the spa is not fully loaded, don't bother. */
-	if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE)
+	/*
+	 * If the pool is not loaded, or is trying to unload, leave it alone.
+	 */
+	if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa))
 		return;
 
 	if (dp->dp_scrub_restart) {
@@ -768,13 +798,13 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 		dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
 	}
 
-	if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
+	if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
 		/*
 		 * We must have resumed after rebooting; reset the vdev
 		 * stats to know that we're doing a scrub (although it
 		 * will think we're just starting now).
 		 */
-		vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev,
+		vdev_scrub_stat_update(spa->spa_root_vdev,
 		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
 		    POOL_SCRUB_EVERYTHING, B_FALSE);
 	}
@@ -782,7 +812,7 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 	dp->dp_scrub_pausing = B_FALSE;
 	dp->dp_scrub_start_time = lbolt64;
 	dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
-	dp->dp_spa->spa_scrub_active = B_TRUE;
+	spa->spa_scrub_active = B_TRUE;
 
 	if (dp->dp_scrub_bookmark.zb_objset == 0) {
 		/* First do the MOS & ORIGIN */
@@ -790,8 +820,8 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 		if (dp->dp_scrub_pausing)
 			goto out;
 
-		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
-			VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
+		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
+			VERIFY(0 == dmu_objset_find_spa(spa,
 			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
 		} else {
 			scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
@@ -841,15 +871,13 @@ out:
 	VERIFY(0 == zap_update(dp->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
-	    &dp->dp_spa->spa_scrub_errors, tx));
+	    &spa->spa_scrub_errors, tx));
 
 	/* XXX this is scrub-clean specific */
-	mutex_enter(&dp->dp_spa->spa_scrub_lock);
-	while (dp->dp_spa->spa_scrub_inflight > 0) {
-		cv_wait(&dp->dp_spa->spa_scrub_io_cv,
-		    &dp->dp_spa->spa_scrub_lock);
-	}
-	mutex_exit(&dp->dp_spa->spa_scrub_lock);
+	mutex_enter(&spa->spa_scrub_lock);
+	while (spa->spa_scrub_inflight > 0)
+		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+	mutex_exit(&spa->spa_scrub_lock);
 }
 
 void
@@ -931,13 +959,17 @@ static int
 dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
     const blkptr_t *bp, const zbookmark_t *zb)
 {
-	size_t size = BP_GET_LSIZE(bp);
-	int d;
+	size_t size = BP_GET_PSIZE(bp);
 	spa_t *spa = dp->dp_spa;
 	boolean_t needs_io;
-	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
+	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
 	int zio_priority;
 
+	ASSERT(bp->blk_birth > dp->dp_scrub_min_txg);
+
+	if (bp->blk_birth >= dp->dp_scrub_max_txg)
+		return (0);
+
 	count_block(dp->dp_blkstats, bp);
 
 	if (dp->dp_scrub_isresilver == 0) {
@@ -956,7 +988,7 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
 	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
-	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
 		vdev_t *vd = vdev_lookup_top(spa,
 		    DVA_GET_VDEV(&bp->blk_dva[d]));
 
@@ -974,16 +1006,17 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
 			if (DVA_GET_GANG(&bp->blk_dva[d])) {
 				/*
 				 * Gang members may be spread across multiple
-				 * vdevs, so the best we can do is look at the
-				 * pool-wide DTL.
+				 * vdevs, so the best estimate we have is the
+				 * scrub range, which has already been checked.
 				 * XXX -- it would be better to change our
-				 * allocation policy to ensure that this can't
-				 * happen.
+				 * allocation policy to ensure that all
+				 * gang members reside on the same vdev.
 				 */
-				vd = spa->spa_root_vdev;
+				needs_io = B_TRUE;
+			} else {
+				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
+				    bp->blk_birth, 1);
 			}
-			needs_io = vdev_dtl_contains(&vd->vdev_dtl_map,
-			    bp->blk_birth, 1);
 		}
 	}
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index 47f8f5f..d216154 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -36,18 +36,35 @@ uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 
 /*
+ * Minimum size which forces the dynamic allocator to change
+ * it's allocation strategy. Once the space map cannot satisfy
+ * an allocation of this size then it switches to using more
+ * aggressive strategy (i.e search by size rather than offset).
+ */
+uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
+
+/*
+ * The minimum free space, in percent, which must be available
+ * in a space map to continue allocations in a first-fit fashion.
+ * Once the space_map's free space drops below this level we dynamically
+ * switch to using best-fit allocations.
+ */
+int metaslab_df_free_pct = 30;
+
+/*
  * ==========================================================================
  * Metaslab classes
  * ==========================================================================
  */
 metaslab_class_t *
-metaslab_class_create(void)
+metaslab_class_create(space_map_ops_t *ops)
 {
 	metaslab_class_t *mc;
 
 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
 
 	mc->mc_rotor = NULL;
+	mc->mc_ops = ops;
 
 	return (mc);
 }
@@ -202,30 +219,14 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 }
 
 /*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
+ * This is a helper function that can be used by the allocator to find
+ * a suitable block to allocate. This will search the specified AVL
+ * tree looking for a block that matches the specified criteria.
  */
-static void
-metaslab_ff_load(space_map_t *sm)
-{
-	ASSERT(sm->sm_ppd == NULL);
-	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
-}
-
-static void
-metaslab_ff_unload(space_map_t *sm)
-{
-	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
-	sm->sm_ppd = NULL;
-}
-
 static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+    uint64_t align)
 {
-	avl_tree_t *t = &sm->sm_root;
-	uint64_t align = size & -size;
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
 	space_seg_t *ss, ssearch;
 	avl_index_t where;
 
@@ -254,7 +255,37 @@ metaslab_ff_alloc(space_map_t *sm, uint64_t size)
 		return (-1ULL);
 
 	*cursor = 0;
-	return (metaslab_ff_alloc(sm, size));
+	return (metaslab_block_picker(t, cursor, size, align));
+}
+
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static void
+metaslab_ff_load(space_map_t *sm)
+{
+	ASSERT(sm->sm_ppd == NULL);
+	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+	sm->sm_pp_root = NULL;
+}
+
+static void
+metaslab_ff_unload(space_map_t *sm)
+{
+	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
+	sm->sm_ppd = NULL;
+}
+
+static uint64_t
+metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+{
+	avl_tree_t *t = &sm->sm_root;
+	uint64_t align = size & -size;
+	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+
+	return (metaslab_block_picker(t, cursor, size, align));
 }
 
 /* ARGSUSED */
@@ -276,9 +307,136 @@ static space_map_ops_t metaslab_ff_ops = {
 	metaslab_ff_unload,
 	metaslab_ff_alloc,
 	metaslab_ff_claim,
-	metaslab_ff_free
+	metaslab_ff_free,
+	NULL	/* maxsize */
+};
+
+/*
+ * Dynamic block allocator -
+ * Uses the first fit allocation scheme until space get low and then
+ * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
+ * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ */
+
+uint64_t
+metaslab_df_maxsize(space_map_t *sm)
+{
+	avl_tree_t *t = sm->sm_pp_root;
+	space_seg_t *ss;
+
+	if (t == NULL || (ss = avl_last(t)) == NULL)
+		return (0ULL);
+
+	return (ss->ss_end - ss->ss_start);
+}
+
+static int
+metaslab_df_seg_compare(const void *x1, const void *x2)
+{
+	const space_seg_t *s1 = x1;
+	const space_seg_t *s2 = x2;
+	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
+	uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+
+	if (ss_size1 < ss_size2)
+		return (-1);
+	if (ss_size1 > ss_size2)
+		return (1);
+
+	if (s1->ss_start < s2->ss_start)
+		return (-1);
+	if (s1->ss_start > s2->ss_start)
+		return (1);
+
+	return (0);
+}
+
+static void
+metaslab_df_load(space_map_t *sm)
+{
+	space_seg_t *ss;
+
+	ASSERT(sm->sm_ppd == NULL);
+	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+
+	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+	avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
+	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
+
+	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+		avl_add(sm->sm_pp_root, ss);
+}
+
+static void
+metaslab_df_unload(space_map_t *sm)
+{
+	void *cookie = NULL;
+
+	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
+	sm->sm_ppd = NULL;
+
+	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
+		/* tear down the tree */
+	}
+
+	avl_destroy(sm->sm_pp_root);
+	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
+	sm->sm_pp_root = NULL;
+}
+
+static uint64_t
+metaslab_df_alloc(space_map_t *sm, uint64_t size)
+{
+	avl_tree_t *t = &sm->sm_root;
+	uint64_t align = size & -size;
+	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+	uint64_t max_size = metaslab_df_maxsize(sm);
+	int free_pct = sm->sm_space * 100 / sm->sm_size;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+	if (max_size < size)
+		return (-1ULL);
+
+	/*
+	 * If we're running low on space switch to using the size
+	 * sorted AVL tree (best-fit).
+	 */
+	if (max_size < metaslab_df_alloc_threshold ||
+	    free_pct < metaslab_df_free_pct) {
+		t = sm->sm_pp_root;
+		*cursor = 0;
+	}
+
+	return (metaslab_block_picker(t, cursor, size, 1ULL));
+}
+
+/* ARGSUSED */
+static void
+metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	/* No need to update cursor */
+}
+
+/* ARGSUSED */
+static void
+metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	/* No need to update cursor */
+}
+
+static space_map_ops_t metaslab_df_ops = {
+	metaslab_df_load,
+	metaslab_df_unload,
+	metaslab_df_alloc,
+	metaslab_df_claim,
+	metaslab_df_free,
+	metaslab_df_maxsize
 };
 
+space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+
 /*
  * ==========================================================================
  * Metaslabs
@@ -414,20 +572,28 @@ metaslab_weight(metaslab_t *msp)
 }
 
 static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
 {
 	space_map_t *sm = &msp->ms_map;
+	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
-		int error = space_map_load(sm, &metaslab_ff_ops,
-		    SM_FREE, &msp->ms_smo,
+		int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
 		    msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
 		if (error) {
 			metaslab_group_sort(msp->ms_group, msp, 0);
 			return (error);
 		}
+
+		/*
+		 * If we were able to load the map then make sure
+		 * that this map is still able to satisfy our request.
+		 */
+		if (msp->ms_weight < size)
+			return (ENOSPC);
+
 		metaslab_group_sort(msp->ms_group, msp,
 		    msp->ms_weight | activation_weight);
 	}
@@ -636,11 +802,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 	int i;
 
 	activation_weight = METASLAB_WEIGHT_PRIMARY;
-	for (i = 0; i < d; i++)
-		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id)
+	for (i = 0; i < d; i++) {
+		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 			activation_weight = METASLAB_WEIGHT_SECONDARY;
+			break;
+		}
+	}
 
 	for (;;) {
+		boolean_t was_active;
+
 		mutex_enter(&mg->mg_lock);
 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
 			if (msp->ms_weight < size) {
@@ -648,6 +819,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 				return (-1ULL);
 			}
 
+			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
 				break;
 
@@ -673,7 +845,9 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 		 * another thread may have changed the weight while we
 		 * were blocked on the metaslab lock.
 		 */
-		if (msp->ms_weight < size) {
+		if (msp->ms_weight < size || (was_active &&
+		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
+		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
@@ -686,7 +860,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 			continue;
 		}
 
-		if (metaslab_activate(msp, activation_weight) != 0) {
+		if (metaslab_activate(msp, activation_weight, size) != 0) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
@@ -720,6 +894,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	vdev_t *vd;
 	int dshift = 3;
 	int all_zero;
+	int zio_lock = B_FALSE;
+	boolean_t allocatable;
 	uint64_t offset = -1ULL;
 	uint64_t asize;
 	uint64_t distance;
@@ -778,11 +954,20 @@ top:
 	all_zero = B_TRUE;
 	do {
 		vd = mg->mg_vd;
+
 		/*
 		 * Don't allocate from faulted devices.
 		 */
-		if (!vdev_allocatable(vd))
+		if (zio_lock) {
+			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
+			allocatable = vdev_allocatable(vd);
+			spa_config_exit(spa, SCL_ZIO, FTAG);
+		} else {
+			allocatable = vdev_allocatable(vd);
+		}
+		if (!allocatable)
 			goto next;
+
 		/*
 		 * Avoid writing single-copy data to a failing vdev
 		 */
@@ -858,6 +1043,12 @@ next:
 		goto top;
 	}
 
+	if (!allocatable && !zio_lock) {
+		dshift = 3;
+		zio_lock = B_TRUE;
+		goto top;
+	}
+
 	bzero(&dva[d], sizeof (dva_t));
 
 	return (ENOSPC);
@@ -938,7 +1129,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 
 	mutex_enter(&msp->ms_lock);
 
-	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
+	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
 		mutex_exit(&msp->ms_lock);
 		return (error);
@@ -946,7 +1137,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 
 	space_map_claim(&msp->ms_map, offset, size);
 
-	if (spa_mode & FWRITE) {	/* don't dirty if we're zdb(1M) */
+	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
 		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
 		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index b8925e3..cb6f413 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -70,16 +70,44 @@ TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
     "Check hostid on import?");
 
-int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
-	/*	ISSUE	INTR					*/
-	{	1,	1	},	/* ZIO_TYPE_NULL	*/
-	{	1,	8	},	/* ZIO_TYPE_READ	*/
-	{	8,	1	},	/* ZIO_TYPE_WRITE	*/
-	{	1,	1	},	/* ZIO_TYPE_FREE	*/
-	{	1,	1	},	/* ZIO_TYPE_CLAIM	*/
-	{	1,	1	},	/* ZIO_TYPE_IOCTL	*/
+enum zti_modes {
+	zti_mode_fixed,			/* value is # of threads (min 1) */
+	zti_mode_online_percent,	/* value is % of online CPUs */
+	zti_mode_tune,			/* fill from zio_taskq_tune_* */
+	zti_nmodes
 };
 
+#define	ZTI_THREAD_FIX(n)	{ zti_mode_fixed, (n) }
+#define	ZTI_THREAD_PCT(n)	{ zti_mode_online_percent, (n) }
+#define	ZTI_THREAD_TUNE		{ zti_mode_tune, 0 }
+
+#define	ZTI_THREAD_ONE		ZTI_THREAD_FIX(1)
+
+typedef struct zio_taskq_info {
+	const char *zti_name;
+	struct {
+		enum zti_modes zti_mode;
+		uint_t zti_value;
+	} zti_nthreads[ZIO_TASKQ_TYPES];
+} zio_taskq_info_t;
+
+static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
+				"issue",		"intr"
+};
+
+const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = {
+	/*			ISSUE			INTR		*/
+	{ "spa_zio_null",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
+	{ "spa_zio_read",	{ ZTI_THREAD_FIX(8),	ZTI_THREAD_TUNE } },
+	{ "spa_zio_write",	{ ZTI_THREAD_TUNE,	ZTI_THREAD_FIX(8) } },
+	{ "spa_zio_free",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
+	{ "spa_zio_claim",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
+	{ "spa_zio_ioctl",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
+};
+
+enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
+uint_t zio_taskq_tune_value = 80;	/* #threads = 80% of # online CPUs */
+
 static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 
@@ -117,38 +145,38 @@ spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
 static void
 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 {
-	uint64_t size = spa_get_space(spa);
-	uint64_t used = spa_get_alloc(spa);
+	uint64_t size;
+	uint64_t used;
 	uint64_t cap, version;
 	zprop_source_t src = ZPROP_SRC_NONE;
 	spa_config_dirent_t *dp;
 
 	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 
-	/*
-	 * readonly properties
-	 */
-	spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
-	spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
-	spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
-	spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src);
-
-	cap = (size == 0) ? 0 : (used * 100 / size);
-	spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
+	if (spa->spa_root_vdev != NULL) {
+		size = spa_get_space(spa);
+		used = spa_get_alloc(spa);
+		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
+		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
+		spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
+		spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
+		    size - used, src);
+
+		cap = (size == 0) ? 0 : (used * 100 / size);
+		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
+
+		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
+		    spa->spa_root_vdev->vdev_state, src);
+
+		version = spa_version(spa);
+		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
+			src = ZPROP_SRC_DEFAULT;
+		else
+			src = ZPROP_SRC_LOCAL;
+		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
+	}
 
 	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
-	spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
-	    spa->spa_root_vdev->vdev_state, src);
-
-	/*
-	 * settable properties that are not stored in the pool property object.
-	 */
-	version = spa_version(spa);
-	if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
-		src = ZPROP_SRC_DEFAULT;
-	else
-		src = ZPROP_SRC_LOCAL;
-	spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
 
 	if (spa->spa_root != NULL)
 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
@@ -313,6 +341,11 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
 			break;
 
 		case ZPOOL_PROP_BOOTFS:
+			/*
+			 * If the pool version is less than SPA_VERSION_BOOTFS,
+			 * or the pool is still being created (version == 0),
+			 * the bootfs property cannot be set.
+			 */
 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
 				error = ENOTSUP;
 				break;
@@ -419,16 +452,60 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
 	return (error);
 }
 
+void
+spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
+{
+	char *cachefile;
+	spa_config_dirent_t *dp;
+
+	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
+	    &cachefile) != 0)
+		return;
+
+	dp = kmem_alloc(sizeof (spa_config_dirent_t),
+	    KM_SLEEP);
+
+	if (cachefile[0] == '\0')
+		dp->scd_path = spa_strdup(spa_config_path);
+	else if (strcmp(cachefile, "none") == 0)
+		dp->scd_path = NULL;
+	else
+		dp->scd_path = spa_strdup(cachefile);
+
+	list_insert_head(&spa->spa_config_list, dp);
+	if (need_sync)
+		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+}
+
 int
 spa_prop_set(spa_t *spa, nvlist_t *nvp)
 {
 	int error;
+	nvpair_t *elem;
+	boolean_t need_sync = B_FALSE;
+	zpool_prop_t prop;
 
 	if ((error = spa_prop_validate(spa, nvp)) != 0)
 		return (error);
 
-	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
-	    spa, nvp, 3));
+	elem = NULL;
+	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
+		if ((prop = zpool_name_to_prop(
+		    nvpair_name(elem))) == ZPROP_INVAL)
+			return (EINVAL);
+
+		if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
+			continue;
+
+		need_sync = B_TRUE;
+		break;
+	}
+
+	if (need_sync)
+		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
+		    spa, nvp, 3));
+	else
+		return (0);
 }
 
 /*
@@ -493,21 +570,57 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
  * Activate an uninitialized pool.
  */
 static void
-spa_activate(spa_t *spa)
+spa_activate(spa_t *spa, int mode)
 {
-
 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 
 	spa->spa_state = POOL_STATE_ACTIVE;
+	spa->spa_mode = mode;
 
-	spa->spa_normal_class = metaslab_class_create();
-	spa->spa_log_class = metaslab_class_create();
+	spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
+	spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
 
 	for (int t = 0; t < ZIO_TYPES; t++) {
+		const zio_taskq_info_t *ztip = &zio_taskqs[t];
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
-			spa->spa_zio_taskq[t][q] = taskq_create("spa_zio",
-			    zio_taskq_threads[t][q], maxclsyspri, 50,
-			    INT_MAX, TASKQ_PREPOPULATE);
+			enum zti_modes mode = ztip->zti_nthreads[q].zti_mode;
+			uint_t value = ztip->zti_nthreads[q].zti_value;
+			char name[32];
+
+			(void) snprintf(name, sizeof (name),
+			    "%s_%s", ztip->zti_name, zio_taskq_types[q]);
+
+			if (mode == zti_mode_tune) {
+				mode = zio_taskq_tune_mode;
+				value = zio_taskq_tune_value;
+				if (mode == zti_mode_tune)
+					mode = zti_mode_online_percent;
+			}
+
+			switch (mode) {
+			case zti_mode_fixed:
+				ASSERT3U(value, >=, 1);
+				value = MAX(value, 1);
+
+				spa->spa_zio_taskq[t][q] = taskq_create(name,
+				    value, maxclsyspri, 50, INT_MAX,
+				    TASKQ_PREPOPULATE);
+				break;
+
+			case zti_mode_online_percent:
+				spa->spa_zio_taskq[t][q] = taskq_create(name,
+				    value, maxclsyspri, 50, INT_MAX,
+				    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
+				break;
+
+			case zti_mode_tune:
+			default:
+				panic("unrecognized mode for "
+				    "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) "
+				    "in spa_activate()",
+				    t, q, mode, value);
+				break;
+			}
 		}
 	}
 
@@ -536,7 +649,7 @@ spa_deactivate(spa_t *spa)
 	ASSERT(spa->spa_sync_on == B_FALSE);
 	ASSERT(spa->spa_dsl_pool == NULL);
 	ASSERT(spa->spa_root_vdev == NULL);
-
+	ASSERT(spa->spa_async_zio_root == NULL);
 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
 
 	txg_list_destroy(&spa->spa_vdev_txg_list);
@@ -642,15 +755,10 @@ spa_unload(spa_t *spa)
 	/*
 	 * Wait for any outstanding async I/O to complete.
 	 */
-	mutex_enter(&spa->spa_async_root_lock);
-	while (spa->spa_async_root_count != 0)
-		cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock);
-	mutex_exit(&spa->spa_async_root_lock);
-
-	/*
-	 * Drop and purge level 2 cache
-	 */
-	spa_l2cache_drop(spa);
+	if (spa->spa_async_zio_root != NULL) {
+		(void) zio_wait(spa->spa_async_zio_root);
+		spa->spa_async_zio_root = NULL;
+	}
 
 	/*
 	 * Close the dsl pool.
@@ -660,6 +768,13 @@ spa_unload(spa_t *spa)
 		spa->spa_dsl_pool = NULL;
 	}
 
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+	/*
+	 * Drop and purge level 2 cache
+	 */
+	spa_l2cache_drop(spa);
+
 	/*
 	 * Close all vdevs.
 	 */
@@ -694,6 +809,8 @@ spa_unload(spa_t *spa)
 	spa->spa_l2cache.sav_count = 0;
 
 	spa->spa_async_suspended = 0;
+
+	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
@@ -783,6 +900,7 @@ spa_load_spares(spa_t *spa)
 		}
 
 		vd->vdev_top = vd;
+		vd->vdev_aux = &spa->spa_spares;
 
 		if (vdev_open(vd) != 0)
 			continue;
@@ -905,12 +1023,9 @@ spa_load_l2cache(spa_t *spa)
 
 		vd = oldvdevs[i];
 		if (vd != NULL) {
-			if ((spa_mode & FWRITE) &&
-			    spa_l2cache_exists(vd->vdev_guid, &pool) &&
-			    pool != 0ULL &&
-			    l2arc_vdev_present(vd)) {
+			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+			    pool != 0ULL && l2arc_vdev_present(vd))
 				l2arc_remove_vdev(vd);
-			}
 			(void) vdev_close(vd);
 			spa_l2cache_remove(vd);
 		}
@@ -959,7 +1074,8 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
 	dmu_buf_rele(db, FTAG);
 
 	packed = kmem_alloc(nvsize, KM_SLEEP);
-	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
+	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
+	    DMU_READ_PREFETCH);
 	if (error == 0)
 		error = nvlist_unpack(packed, nvsize, value, 0);
 	kmem_free(packed, nvsize);
@@ -1026,8 +1142,16 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
 	uint64_t pool_guid;
 	uint64_t version;
 	uint64_t autoreplace = 0;
+	int orig_mode = spa->spa_mode;
 	char *ereport = FM_EREPORT_ZFS_POOL;
 
+	/*
+	 * If this is an untrusted config, access the pool in read-only mode.
+	 * This prevents things like resilvering recently removed devices.
+	 */
+	if (!mosconfig)
+		spa->spa_mode = FREAD;
+
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	spa->spa_load_state = state;
@@ -1057,6 +1181,12 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
 	spa->spa_load_guid = pool_guid;
 
 	/*
+	 * Create "The Godfather" zio to hold all async IOs
+	 */
+	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+
+	/*
 	 * Parse the configuration into a vdev tree.  We explicitly set the
 	 * value that will be returned by spa_version() since parsing the
 	 * configuration requires knowing the version number.
@@ -1082,13 +1212,17 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
 		goto out;
 
 	/*
-	 * Validate the labels for all leaf vdevs.  We need to grab the config
-	 * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER.
+	 * We need to validate the vdev labels against the configuration that
+	 * we have in hand, which is dependent on the setting of mosconfig. If
+	 * mosconfig is true then we're validating the vdev labels based on
+	 * that config. Otherwise, we're validating against the cached config
+	 * (zpool.cache) that was read when we loaded the zfs module, and then
+	 * later we will recursively call spa_load() and validate against
+	 * the vdev config.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = vdev_validate(rvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
-
 	if (error != 0)
 		goto out;
 
@@ -1192,7 +1326,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
 		spa_config_set(spa, newconfig);
 		spa_unload(spa);
 		spa_deactivate(spa);
-		spa_activate(spa);
+		spa_activate(spa, orig_mode);
 
 		return (spa_load(spa, newconfig, state, B_TRUE));
 	}
@@ -1384,10 +1518,11 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
 		goto out;
 	}
 
-	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
+	if (spa_writeable(spa)) {
 		dmu_tx_t *tx;
 		int need_update = B_FALSE;
-		int c;
+
+		ASSERT(state != SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * Claim log blocks that haven't been committed yet.
@@ -1410,12 +1545,15 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
 		/*
 		 * If the config cache is stale, or we have uninitialized
 		 * metaslabs (see spa_vdev_add()), then update the config.
+		 *
+		 * If spa_load_verbatim is true, trust the current
+		 * in-core spa_config and update the disk labels.
 		 */
 		if (config_cache_txg != spa->spa_config_txg ||
-		    state == SPA_LOAD_IMPORT)
+		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim)
 			need_update = B_TRUE;
 
-		for (c = 0; c < rvd->vdev_children; c++)
+		for (int c = 0; c < rvd->vdev_children; c++)
 			if (rvd->vdev_child[c]->vdev_ms_array == 0)
 				need_update = B_TRUE;
 
@@ -1483,7 +1621,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
 	}
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 
-		spa_activate(spa);
+		spa_activate(spa, spa_mode_global);
 
 		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
 
@@ -1586,6 +1724,8 @@ spa_add_spares(spa_t *spa, nvlist_t *config)
 	uint_t vsc;
 	uint64_t pool;
 
+	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
 	if (spa->spa_spares.sav_count == 0)
 		return;
 
@@ -1633,11 +1773,11 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
 	vdev_stat_t *vs;
 	uint_t vsc;
 
+	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
 	if (spa->spa_l2cache.sav_count == 0)
 		return;
 
-	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
 	VERIFY(nvlist_lookup_nvlist(config,
 	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
@@ -1671,8 +1811,6 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
 			vdev_get_stats(vd, vs);
 		}
 	}
-
-	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 int
@@ -1684,16 +1822,27 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
 	*config = NULL;
 	error = spa_open_common(name, &spa, FTAG, config);
 
-	if (spa && *config != NULL) {
-		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
-		    spa_get_errlog_size(spa)) == 0);
+	if (spa != NULL) {
+		/*
+		 * This still leaves a window of inconsistency where the spares
+		 * or l2cache devices could change and the config would be
+		 * self-inconsistent.
+		 */
+		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
-		if (spa_suspended(spa))
+		if (*config != NULL) {
 			VERIFY(nvlist_add_uint64(*config,
-			    ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0);
+			    ZPOOL_CONFIG_ERRCOUNT,
+			    spa_get_errlog_size(spa)) == 0);
 
-		spa_add_spares(spa, *config);
-		spa_add_l2cache(spa, *config);
+			if (spa_suspended(spa))
+				VERIFY(nvlist_add_uint64(*config,
+				    ZPOOL_CONFIG_SUSPENDED,
+				    spa->spa_failmode) == 0);
+
+			spa_add_spares(spa, *config);
+			spa_add_l2cache(spa, *config);
+		}
 	}
 
 	/*
@@ -1715,8 +1864,10 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
 		}
 	}
 
-	if (spa != NULL)
+	if (spa != NULL) {
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_close(spa, FTAG);
+	}
 
 	return (error);
 }
@@ -1887,11 +2038,9 @@ spa_l2cache_drop(spa_t *spa)
 		vd = sav->sav_vdevs[i];
 		ASSERT(vd != NULL);
 
-		if ((spa_mode & FWRITE) &&
-		    spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL &&
-		    l2arc_vdev_present(vd)) {
+		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+		    pool != 0ULL && l2arc_vdev_present(vd))
 			l2arc_remove_vdev(vd);
-		}
 		if (vd->vdev_isl2cache)
 			spa_l2cache_remove(vd);
 		vdev_clear_stats(vd);
@@ -1932,12 +2081,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	spa = spa_add(pool, altroot);
-	spa_activate(spa);
+	spa_activate(spa, spa_mode_global);
 
 	spa->spa_uberblock.ub_txg = txg - 1;
 
 	if (props && (error = spa_prop_validate(spa, props))) {
-		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
@@ -1952,6 +2100,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 	spa->spa_ubsync = spa->spa_uberblock;
 
 	/*
+	 * Create "The Godfather" zio to hold all async IOs
+	 */
+	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+
+	/*
 	 * Create the root vdev.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
@@ -2069,8 +2223,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
-	if (props)
+	if (props != NULL) {
+		spa_configfile_set(spa, props, B_FALSE);
 		spa_sync_props(spa, props, CRED(), tx);
+	}
 
 	dmu_tx_commit(tx);
 
@@ -2095,148 +2251,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 	return (0);
 }
 
-/*
- * Import the given pool into the system.  We set up the necessary spa_t and
- * then call spa_load() to do the dirty work.
- */
-static int
-spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
-    boolean_t isroot, boolean_t allowfaulted)
-{
-	spa_t *spa;
-	char *altroot = NULL;
-	int error, loaderr;
-	nvlist_t *nvroot;
-	nvlist_t **spares, **l2cache;
-	uint_t nspares, nl2cache;
-
-	/*
-	 * If a pool with this name exists, return failure.
-	 */
-	mutex_enter(&spa_namespace_lock);
-	if (spa_lookup(pool) != NULL) {
-		mutex_exit(&spa_namespace_lock);
-		return (EEXIST);
-	}
-
-	/*
-	 * Create and initialize the spa structure.
-	 */
-	(void) nvlist_lookup_string(props,
-	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
-	spa = spa_add(pool, altroot);
-	spa_activate(spa);
-
-	if (allowfaulted)
-		spa->spa_import_faulted = B_TRUE;
-	spa->spa_is_root = isroot;
-
-	/*
-	 * Pass off the heavy lifting to spa_load().
-	 * Pass TRUE for mosconfig (unless this is a root pool) because
-	 * the user-supplied config is actually the one to trust when
-	 * doing an import.
-	 */
-	loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot);
-
-	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-	/*
-	 * Toss any existing sparelist, as it doesn't have any validity anymore,
-	 * and conflicts with spa_has_spare().
-	 */
-	if (!isroot && spa->spa_spares.sav_config) {
-		nvlist_free(spa->spa_spares.sav_config);
-		spa->spa_spares.sav_config = NULL;
-		spa_load_spares(spa);
-	}
-	if (!isroot && spa->spa_l2cache.sav_config) {
-		nvlist_free(spa->spa_l2cache.sav_config);
-		spa->spa_l2cache.sav_config = NULL;
-		spa_load_l2cache(spa);
-	}
-
-	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
-	    &nvroot) == 0);
-	if (error == 0)
-		error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE);
-	if (error == 0)
-		error = spa_validate_aux(spa, nvroot, -1ULL,
-		    VDEV_ALLOC_L2CACHE);
-	spa_config_exit(spa, SCL_ALL, FTAG);
-
-	if (error != 0 || (props && (error = spa_prop_set(spa, props)))) {
-		if (loaderr != 0 && loaderr != EINVAL && allowfaulted) {
-			/*
-			 * If we failed to load the pool, but 'allowfaulted' is
-			 * set, then manually set the config as if the config
-			 * passed in was specified in the cache file.
-			 */
-			error = 0;
-			spa->spa_import_faulted = B_FALSE;
-			if (spa->spa_config == NULL)
-				spa->spa_config = spa_config_generate(spa,
-				    NULL, -1ULL, B_TRUE);
-			spa_unload(spa);
-			spa_deactivate(spa);
-			spa_config_sync(spa, B_FALSE, B_TRUE);
-		} else {
-			spa_unload(spa);
-			spa_deactivate(spa);
-			spa_remove(spa);
-		}
-		mutex_exit(&spa_namespace_lock);
-		return (error);
-	}
-
-	/*
-	 * Override any spares and level 2 cache devices as specified by
-	 * the user, as these may have correct device names/devids, etc.
-	 */
-	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
-	    &spares, &nspares) == 0) {
-		if (spa->spa_spares.sav_config)
-			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
-			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
-		else
-			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
-			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
-		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
-		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
-		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-		spa_load_spares(spa);
-		spa_config_exit(spa, SCL_ALL, FTAG);
-		spa->spa_spares.sav_sync = B_TRUE;
-	}
-	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
-	    &l2cache, &nl2cache) == 0) {
-		if (spa->spa_l2cache.sav_config)
-			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
-			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
-		else
-			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
-			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
-		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
-		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
-		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-		spa_load_l2cache(spa);
-		spa_config_exit(spa, SCL_ALL, FTAG);
-		spa->spa_l2cache.sav_sync = B_TRUE;
-	}
-
-	if (spa_mode & FWRITE) {
-		/*
-		 * Update the config cache to include the newly-imported pool.
-		 */
-		spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot);
-	}
-
-	spa->spa_import_faulted = B_FALSE;
-	mutex_exit(&spa_namespace_lock);
-
-	return (0);
-}
-
-#if defined(sun)
+#ifdef sun
 #ifdef _KERNEL
 /*
  * Build a "root" vdev for a top level vdev read in from a rootpool
@@ -2372,11 +2387,11 @@ spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf)
 		char *cdevid, *cpath;
 		uint64_t tmptxg;
 
+		cpath = NULL;
+		cdevid = NULL;
 		if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH,
-		    &cpath) != 0)
-			return (EINVAL);
-		if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_DEVID,
-		    &cdevid) != 0)
+		    &cpath) != 0 && nvlist_lookup_string(child[c],
+		    ZPOOL_CONFIG_DEVID, &cdevid) != 0)
 			return (EINVAL);
 		if ((spa_check_rootconf(cpath, cdevid, NULL,
 		    &tmptxg) == 0) && (tmptxg > txg)) {
@@ -2414,6 +2429,7 @@ spa_import_rootpool(char *devpath, char *devid)
 	nvlist_t *conf = NULL;
 	char *pname;
 	int error;
+	spa_t *spa;
 
 	/*
 	 * Get the vdev pathname and configuation from the most
@@ -2429,18 +2445,24 @@ spa_import_rootpool(char *devpath, char *devid)
 
 	VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0);
 
-	/*
-	 * We specify 'allowfaulted' for this to be treated like spa_open()
-	 * instead of spa_import().  This prevents us from marking vdevs as
-	 * persistently unavailable, and generates FMA ereports as if it were a
-	 * pool open, not import.
-	 */
-	error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE);
-	if (error == EEXIST)
-		error = 0;
+	mutex_enter(&spa_namespace_lock);
+	if ((spa = spa_lookup(pname)) != NULL) {
+		/*
+		 * Remove the existing root pool from the namespace so that we
+		 * can replace it with the correct config we just read in.
+		 */
+		spa_remove(spa);
+	}
+
+	spa = spa_add(pname, NULL);
+	spa->spa_is_root = B_TRUE;
+	spa->spa_load_verbatim = B_TRUE;
+
+	VERIFY(nvlist_dup(conf, &spa->spa_config, 0) == 0);
+	mutex_exit(&spa_namespace_lock);
 
 	nvlist_free(conf);
-	return (error);
+	return (0);
 
 msg_out:
 	cmn_err(CE_NOTE, "\n"
@@ -2453,23 +2475,170 @@ msg_out:
 	return (error);
 }
 #endif
-#endif
+#endif	/* sun */
 
 /*
- * Import a non-root pool into the system.
+ * Take a pool and insert it into the namespace as if it had been loaded at
+ * boot.
  */
 int
-spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
+spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
 {
-	return (spa_import_common(pool, config, props, B_FALSE, B_FALSE));
+	spa_t *spa;
+	char *altroot = NULL;
+
+	mutex_enter(&spa_namespace_lock);
+	if (spa_lookup(pool) != NULL) {
+		mutex_exit(&spa_namespace_lock);
+		return (EEXIST);
+	}
+
+	(void) nvlist_lookup_string(props,
+	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+	spa = spa_add(pool, altroot);
+
+	spa->spa_load_verbatim = B_TRUE;
+
+	VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+
+	if (props != NULL)
+		spa_configfile_set(spa, props, B_FALSE);
+
+	spa_config_sync(spa, B_FALSE, B_TRUE);
+
+	mutex_exit(&spa_namespace_lock);
+
+	return (0);
 }
 
+/*
+ * Import a non-root pool into the system.
+ */
 int
-spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props)
+spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
 {
-	return (spa_import_common(pool, config, props, B_FALSE, B_TRUE));
-}
+	spa_t *spa;
+	char *altroot = NULL;
+	int error;
+	nvlist_t *nvroot;
+	nvlist_t **spares, **l2cache;
+	uint_t nspares, nl2cache;
+
+	/*
+	 * If a pool with this name exists, return failure.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	if ((spa = spa_lookup(pool)) != NULL) {
+		mutex_exit(&spa_namespace_lock);
+		return (EEXIST);
+	}
+
+	/*
+	 * Create and initialize the spa structure.
+	 */
+	(void) nvlist_lookup_string(props,
+	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+	spa = spa_add(pool, altroot);
+	spa_activate(spa, spa_mode_global);
+
+	/*
+	 * Don't start async tasks until we know everything is healthy.
+	 */
+	spa_async_suspend(spa);
+
+	/*
+	 * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
+	 * because the user-supplied config is actually the one to trust when
+	 * doing an import.
+	 */
+	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
 
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+	/*
+	 * Toss any existing sparelist, as it doesn't have any validity
+	 * anymore, and conflicts with spa_has_spare().
+	 */
+	if (spa->spa_spares.sav_config) {
+		nvlist_free(spa->spa_spares.sav_config);
+		spa->spa_spares.sav_config = NULL;
+		spa_load_spares(spa);
+	}
+	if (spa->spa_l2cache.sav_config) {
+		nvlist_free(spa->spa_l2cache.sav_config);
+		spa->spa_l2cache.sav_config = NULL;
+		spa_load_l2cache(spa);
+	}
+
+	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+	if (error == 0)
+		error = spa_validate_aux(spa, nvroot, -1ULL,
+		    VDEV_ALLOC_SPARE);
+	if (error == 0)
+		error = spa_validate_aux(spa, nvroot, -1ULL,
+		    VDEV_ALLOC_L2CACHE);
+	spa_config_exit(spa, SCL_ALL, FTAG);
+
+	if (props != NULL)
+		spa_configfile_set(spa, props, B_FALSE);
+
+	if (error != 0 || (props && spa_writeable(spa) &&
+	    (error = spa_prop_set(spa, props)))) {
+		spa_unload(spa);
+		spa_deactivate(spa);
+		spa_remove(spa);
+		mutex_exit(&spa_namespace_lock);
+		return (error);
+	}
+
+	spa_async_resume(spa);
+
+	/*
+	 * Override any spares and level 2 cache devices as specified by
+	 * the user, as these may have correct device names/devids, etc.
+	 */
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+	    &spares, &nspares) == 0) {
+		if (spa->spa_spares.sav_config)
+			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
+			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
+		else
+			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
+			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
+		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
+		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+		spa_load_spares(spa);
+		spa_config_exit(spa, SCL_ALL, FTAG);
+		spa->spa_spares.sav_sync = B_TRUE;
+	}
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+	    &l2cache, &nl2cache) == 0) {
+		if (spa->spa_l2cache.sav_config)
+			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
+			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
+		else
+			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
+			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
+		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
+		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
+		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+		spa_load_l2cache(spa);
+		spa_config_exit(spa, SCL_ALL, FTAG);
+		spa->spa_l2cache.sav_sync = B_TRUE;
+	}
+
+	if (spa_writeable(spa)) {
+		/*
+		 * Update the config cache to include the newly-imported pool.
+		 */
+		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+	}
+
+	mutex_exit(&spa_namespace_lock);
+
+	return (0);
+}
 
 /*
  * This (illegal) pool name is used when temporarily importing a spa_t in order
@@ -2497,7 +2666,7 @@ spa_tryimport(nvlist_t *tryconfig)
 	 */
 	mutex_enter(&spa_namespace_lock);
 	spa = spa_add(TRYIMPORT_NAME, NULL);
-	spa_activate(spa);
+	spa_activate(spa, FREAD);
 
 	/*
 	 * Pass off the heavy lifting to spa_load().
@@ -2553,8 +2722,10 @@ spa_tryimport(nvlist_t *tryconfig)
 		/*
 		 * Add the list of hot spares and level 2 cache devices.
 		 */
+		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_add_spares(spa, config);
 		spa_add_l2cache(spa, config);
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 
 	spa_unload(spa);
@@ -2583,7 +2754,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
 	if (oldconfig)
 		*oldconfig = NULL;
 
-	if (!(spa_mode & FWRITE))
+	if (!(spa_mode_global & FWRITE))
 		return (EROFS);
 
 	mutex_enter(&spa_namespace_lock);
@@ -2718,7 +2889,7 @@ int
 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 {
 	uint64_t txg;
-	int c, error;
+	int error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *tvd;
 	nvlist_t **spares, **l2cache;
@@ -2757,7 +2928,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 	/*
 	 * Transfer each new top-level vdev from vd to rvd.
 	 */
-	for (c = 0; c < vd->vdev_children; c++) {
+	for (int c = 0; c < vd->vdev_children; c++) {
 		tvd = vd->vdev_child[c];
 		vdev_remove_child(vd, tvd);
 		tvd->vdev_id = rvd->vdev_children;
@@ -2965,13 +3136,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 	 */
 	open_txg = txg + TXG_CONCURRENT_STATES - 1;
 
-	mutex_enter(&newvd->vdev_dtl_lock);
-	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
-	    open_txg - TXG_INITIAL + 1);
-	mutex_exit(&newvd->vdev_dtl_lock);
+	vdev_dtl_dirty(newvd, DTL_MISSING,
+	    TXG_INITIAL, open_txg - TXG_INITIAL + 1);
 
-	if (newvd->vdev_isspare)
+	if (newvd->vdev_isspare) {
 		spa_spare_activate(newvd);
+		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
+	}
+
 	oldvdpath = spa_strdup(oldvd->vdev_path);
 	newvdpath = spa_strdup(newvd->vdev_path);
 	newvd_isspare = newvd->vdev_isspare;
@@ -3012,10 +3184,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
  * is a replacing vdev.
  */
 int
-spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
+spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 {
 	uint64_t txg;
-	int c, t, error;
+	int error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *pvd, *cvd, *tvd;
 	boolean_t unspare = B_FALSE;
@@ -3035,6 +3207,22 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
 	pvd = vd->vdev_parent;
 
 	/*
+	 * If the parent/child relationship is not as expected, don't do it.
+	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
+	 * vdev that's replacing B with C.  The user's intent in replacing
+	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
+	 * the replace by detaching C, the expected behavior is to end up
+	 * M(A,B).  But suppose that right after deciding to detach C,
+	 * the replacement of B completes.  We would have M(A,C), and then
+	 * ask to detach C, which would leave us with just A -- not what
+	 * the user wanted.  To prevent this, we make sure that the
+	 * parent/child relationship hasn't changed -- in this example,
+	 * that C's parent is still the replacing vdev R.
+	 */
+	if (pvd->vdev_guid != pguid && pguid != 0)
+		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+	/*
 	 * If replace_done is specified, only remove this device if it's
 	 * the first child of a replacing vdev.  For the 'spare' vdev, either
 	 * disk can be removed.
@@ -3060,36 +3248,13 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	/*
-	 * If there's only one replica, you can't detach it.
+	 * If this device has the only valid copy of some data,
+	 * we cannot safely detach it.
 	 */
-	if (pvd->vdev_children <= 1)
+	if (vdev_dtl_required(vd))
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
-	/*
-	 * If all siblings have non-empty DTLs, this device may have the only
-	 * valid copy of the data, which means we cannot safely detach it.
-	 *
-	 * XXX -- as in the vdev_offline() case, we really want a more
-	 * precise DTL check.
-	 */
-	for (c = 0; c < pvd->vdev_children; c++) {
-		uint64_t dirty;
-
-		cvd = pvd->vdev_child[c];
-		if (cvd == vd)
-			continue;
-		if (vdev_is_dead(cvd))
-			continue;
-		mutex_enter(&cvd->vdev_dtl_lock);
-		dirty = cvd->vdev_dtl_map.sm_space |
-		    cvd->vdev_dtl_scrub.sm_space;
-		mutex_exit(&cvd->vdev_dtl_lock);
-		if (!dirty)
-			break;
-	}
-
-	if (c == pvd->vdev_children)
-		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+	ASSERT(pvd->vdev_children >= 2);
 
 	/*
 	 * If we are detaching the second disk from a replacing vdev, then
@@ -3115,7 +3280,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
 	 * active spare list for the pool.
 	 */
 	if (pvd->vdev_ops == &vdev_spare_ops &&
-	    vd->vdev_id == 0)
+	    vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
 		unspare = B_TRUE;
 
 	/*
@@ -3141,14 +3306,18 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
 
 	/*
 	 * If we need to remove the remaining child from the list of hot spares,
-	 * do it now, marking the vdev as no longer a spare in the process.  We
-	 * must do this before vdev_remove_parent(), because that can change the
-	 * GUID if it creates a new toplevel GUID.
+	 * do it now, marking the vdev as no longer a spare in the process.
+	 * We must do this before vdev_remove_parent(), because that can
+	 * change the GUID if it creates a new toplevel GUID.  For a similar
+	 * reason, we must remove the spare now, in the same txg as the detach;
+	 * otherwise someone could attach a new sibling, change the GUID, and
+	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
 	 */
 	if (unspare) {
 		ASSERT(cvd->vdev_isspare);
 		spa_spare_remove(cvd);
 		unspare_guid = cvd->vdev_guid;
+		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
 	}
 
 	/*
@@ -3186,7 +3355,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
 	 * But first make sure we're not on any *other* txg's DTL list, to
 	 * prevent vd from being accessed after it's freed.
 	 */
-	for (t = 0; t < TXG_SIZE; t++)
+	for (int t = 0; t < TXG_SIZE; t++)
 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
 	vd->vdev_detached = B_TRUE;
 	vdev_dirty(tvd, VDD_DTL, vd, txg);
@@ -3201,11 +3370,14 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
 	 * list of every other pool.
 	 */
 	if (unspare) {
+		spa_t *myspa = spa;
 		spa = NULL;
 		mutex_enter(&spa_namespace_lock);
 		while ((spa = spa_next(spa)) != NULL) {
 			if (spa->spa_state != POOL_STATE_ACTIVE)
 				continue;
+			if (spa == myspa)
+				continue;
 			spa_open_ref(spa, FTAG);
 			mutex_exit(&spa_namespace_lock);
 			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
@@ -3269,10 +3441,12 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 	vdev_t *vd;
 	nvlist_t **spares, **l2cache, *nv;
 	uint_t nspares, nl2cache;
-	uint64_t txg;
+	uint64_t txg = 0;
 	int error = 0;
+	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
 
-	txg = spa_vdev_enter(spa);
+	if (!locked)
+		txg = spa_vdev_enter(spa);
 
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
@@ -3315,7 +3489,10 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 		error = ENOENT;
 	}
 
-	return (spa_vdev_exit(spa, NULL, txg, error));
+	if (!locked)
+		return (spa_vdev_exit(spa, NULL, txg, error));
+
+	return (error);
 }
 
 /*
@@ -3341,13 +3518,9 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
 		oldvd = vd->vdev_child[0];
 		newvd = vd->vdev_child[1];
 
-		mutex_enter(&newvd->vdev_dtl_lock);
-		if (newvd->vdev_dtl_map.sm_space == 0 &&
-		    newvd->vdev_dtl_scrub.sm_space == 0) {
-			mutex_exit(&newvd->vdev_dtl_lock);
+		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
+		    !vdev_dtl_required(oldvd))
 			return (oldvd);
-		}
-		mutex_exit(&newvd->vdev_dtl_lock);
 	}
 
 	/*
@@ -3357,15 +3530,12 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
 		newvd = vd->vdev_child[0];
 		oldvd = vd->vdev_child[1];
 
-		mutex_enter(&newvd->vdev_dtl_lock);
 		if (newvd->vdev_unspare &&
-		    newvd->vdev_dtl_map.sm_space == 0 &&
-		    newvd->vdev_dtl_scrub.sm_space == 0) {
+		    vdev_dtl_empty(newvd, DTL_MISSING) &&
+		    !vdev_dtl_required(oldvd)) {
 			newvd->vdev_unspare = 0;
-			mutex_exit(&newvd->vdev_dtl_lock);
 			return (oldvd);
 		}
-		mutex_exit(&newvd->vdev_dtl_lock);
 	}
 
 	return (NULL);
@@ -3374,92 +3544,84 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
 static void
 spa_vdev_resilver_done(spa_t *spa)
 {
-	vdev_t *vd;
-	vdev_t *pvd;
-	uint64_t guid;
-	uint64_t pguid = 0;
+	vdev_t *vd, *pvd, *ppvd;
+	uint64_t guid, sguid, pguid, ppguid;
 
-	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
+		pvd = vd->vdev_parent;
+		ppvd = pvd->vdev_parent;
 		guid = vd->vdev_guid;
+		pguid = pvd->vdev_guid;
+		ppguid = ppvd->vdev_guid;
+		sguid = 0;
 		/*
 		 * If we have just finished replacing a hot spared device, then
 		 * we need to detach the parent's first child (the original hot
 		 * spare) as well.
 		 */
-		pvd = vd->vdev_parent;
-		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
-		    pvd->vdev_id == 0) {
+		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
-			ASSERT(pvd->vdev_parent->vdev_children == 2);
-			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
+			ASSERT(ppvd->vdev_children == 2);
+			sguid = ppvd->vdev_child[1]->vdev_guid;
 		}
-		spa_config_exit(spa, SCL_CONFIG, FTAG);
-		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
+		spa_config_exit(spa, SCL_ALL, FTAG);
+		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
 			return;
-		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
+		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
 			return;
-		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	}
 
-	spa_config_exit(spa, SCL_CONFIG, FTAG);
+	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
- * Update the stored path for this vdev.  Dirty the vdev configuration, relying
- * on spa_vdev_enter/exit() to synchronize the labels and cache.
+ * Update the stored path or FRU for this vdev.  Dirty the vdev configuration,
+ * relying on spa_vdev_enter/exit() to synchronize the labels and cache.
  */
 int
-spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
+spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
+    boolean_t ispath)
 {
 	vdev_t *vd;
 	uint64_t txg;
 
 	txg = spa_vdev_enter(spa);
 
-	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) {
-		/*
-		 * Determine if this is a reference to a hot spare device.  If
-		 * it is, update the path manually as there is no associated
-		 * vdev_t that can be synced to disk.
-		 */
-		nvlist_t **spares;
-		uint_t i, nspares;
-
-		if (spa->spa_spares.sav_config != NULL) {
-			VERIFY(nvlist_lookup_nvlist_array(
-			    spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
-			    &spares, &nspares) == 0);
-			for (i = 0; i < nspares; i++) {
-				uint64_t theguid;
-				VERIFY(nvlist_lookup_uint64(spares[i],
-				    ZPOOL_CONFIG_GUID, &theguid) == 0);
-				if (theguid == guid) {
-					VERIFY(nvlist_add_string(spares[i],
-					    ZPOOL_CONFIG_PATH, newpath) == 0);
-					spa_load_spares(spa);
-					spa->spa_spares.sav_sync = B_TRUE;
-					return (spa_vdev_exit(spa, NULL, txg,
-					    0));
-				}
-			}
-		}
-
+	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENOENT));
-	}
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
-	spa_strfree(vd->vdev_path);
-	vd->vdev_path = spa_strdup(newpath);
+	if (ispath) {
+		spa_strfree(vd->vdev_path);
+		vd->vdev_path = spa_strdup(value);
+	} else {
+		if (vd->vdev_fru != NULL)
+			spa_strfree(vd->vdev_fru);
+		vd->vdev_fru = spa_strdup(value);
+	}
 
 	vdev_config_dirty(vd->vdev_top);
 
 	return (spa_vdev_exit(spa, NULL, txg, 0));
 }
 
+int
+spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
+{
+	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
+}
+
+int
+spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
+{
+	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
+}
+
 /*
  * ==========================================================================
  * SPA Scrubbing
@@ -3510,7 +3672,17 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
 	if (vd->vdev_remove_wanted) {
 		vd->vdev_remove_wanted = 0;
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
-		vdev_clear(spa, vd);
+
+		/*
+		 * We want to clear the stats, but we don't want to do a full
+		 * vdev_clear() as that will cause us to throw away
+		 * degraded/faulted state as well as attempt to reopen the
+		 * device, all of which is a waste.
+		 */
+		vd->vdev_stat.vs_read_errors = 0;
+		vd->vdev_stat.vs_write_errors = 0;
+		vd->vdev_stat.vs_checksum_errors = 0;
+
 		vdev_state_dirty(vd->vdev_top);
 	}
 
@@ -3789,7 +3961,6 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	zpool_prop_t prop;
 	const char *propname;
 	zprop_type_t proptype;
-	spa_config_dirent_t *dp;
 
 	mutex_enter(&spa->spa_props_lock);
 
@@ -3822,23 +3993,8 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 
 		case ZPOOL_PROP_CACHEFILE:
 			/*
-			 * 'cachefile' is a non-persistent property, but note
-			 * an async request that the config cache needs to be
-			 * udpated.
+			 * 'cachefile' is also a non-persisitent property.
 			 */
-			VERIFY(nvpair_value_string(elem, &strval) == 0);
-
-			dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP);
-
-			if (strval[0] == '\0')
-				dp->scd_path = spa_strdup(spa_config_path);
-			else if (strcmp(strval, "none") == 0)
-				dp->scd_path = NULL;
-			else
-				dp->scd_path = spa_strdup(strval);
-
-			list_insert_head(&spa->spa_config_list, dp);
-			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 			break;
 		default:
 			/*
@@ -3939,9 +4095,22 @@ spa_sync(spa_t *spa, uint64_t txg)
 	 * into config changes that go out with this transaction group.
 	 */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-	while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
-		vdev_state_clean(vd);
-		vdev_config_dirty(vd);
+	while (list_head(&spa->spa_state_dirty_list) != NULL) {
+		/*
+		 * We need the write lock here because, for aux vdevs,
+		 * calling vdev_config_dirty() modifies sav_config.
+		 * This is ugly and will become unnecessary when we
+		 * eliminate the aux vdev wart by integrating all vdevs
+		 * into the root vdev tree.
+		 */
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
+		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
+			vdev_state_clean(vd);
+			vdev_config_dirty(vd);
+		}
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 	}
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
@@ -4175,7 +4344,7 @@ spa_evict_all(void)
 }
 
 vdev_t *
-spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache)
+spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
 {
 	vdev_t *vd;
 	int i;
@@ -4183,12 +4352,18 @@ spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache)
 	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
 		return (vd);
 
-	if (l2cache) {
+	if (aux) {
 		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
 			vd = spa->spa_l2cache.sav_vdevs[i];
 			if (vd->vdev_guid == guid)
 				return (vd);
 		}
+
+		for (i = 0; i < spa->spa_spares.sav_count; i++) {
+			vd = spa->spa_spares.sav_vdevs[i];
+			if (vd->vdev_guid == guid)
+				return (vd);
+		}
 	}
 
 	return (NULL);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
index 51770fc..34050ef 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
@@ -212,6 +212,9 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
+	if (rootdir == NULL || !(spa_mode_global & FWRITE))
+		return;
+
 	/*
 	 * Iterate over all cachefiles for the pool, past or present.  When the
 	 * cachefile is changed, the new one is pushed onto this list, allowing
@@ -386,23 +389,12 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
 }
 
 /*
- * For a pool that's not currently a booting rootpool, update all disk labels,
- * generate a fresh config based on the current in-core state, and sync the
- * global config cache.
- */
-void
-spa_config_update(spa_t *spa, int what)
-{
-	spa_config_update_common(spa, what, FALSE);
-}
-
-/*
  * Update all disk labels, generate a fresh config based on the current
  * in-core state, and sync the global config cache (do not sync the config
  * cache if this is a booting rootpool).
  */
 void
-spa_config_update_common(spa_t *spa, int what, boolean_t isroot)
+spa_config_update(spa_t *spa, int what)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t txg;
@@ -440,9 +432,9 @@ spa_config_update_common(spa_t *spa, int what, boolean_t isroot)
 	/*
 	 * Update the global config cache to reflect the new mosconfig.
 	 */
-	if (!isroot)
+	if (!spa->spa_is_root)
 		spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);
 
 	if (what == SPA_CONFIG_UPDATE_POOL)
-		spa_config_update_common(spa, SPA_CONFIG_UPDATE_VDEVS, isroot);
+		spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
 }
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
index e5c395f..e1ae491 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Routines to manage the on-disk persistent error log.
  *
@@ -61,8 +59,8 @@
  * lowercase hexidecimal numbers that don't overflow.
  */
 #ifdef _KERNEL
-static uint64_t
-_strtonum(char *str, char **nptr)
+uint64_t
+_strtonum(const char *str, char **nptr)
 {
 	uint64_t val = 0;
 	char c;
@@ -82,7 +80,8 @@ _strtonum(char *str, char **nptr)
 		str++;
 	}
 
-	*nptr = str;
+	if (nptr)
+		*nptr = (char *)str;
 
 	return (val);
 }
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
index de520d3..b403ccb 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
@@ -20,12 +20,10 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zap.h>
@@ -127,12 +125,12 @@ spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
 	firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
 
 	if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
-	    buf)) != 0)
+	    buf, DMU_READ_PREFETCH)) != 0)
 		return (err);
 	if (firstread != sizeof (reclen)) {
 		if ((err = dmu_read(mos, spa->spa_history,
 		    shpp->sh_pool_create_len, sizeof (reclen) - firstread,
-		    buf + firstread)) != 0)
+		    buf + firstread, DMU_READ_PREFETCH)) != 0)
 			return (err);
 	}
 
@@ -381,10 +379,11 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
 		return (0);
 	}
 
-	err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf);
+	err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf,
+	    DMU_READ_PREFETCH);
 	if (leftover && err == 0) {
 		err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
-		    leftover, buf + read_len);
+		    leftover, buf + read_len, DMU_READ_PREFETCH);
 	}
 	mutex_exit(&spa->spa_history_lock);
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
index 5735d31..89e0301 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -230,7 +230,7 @@ static kmutex_t spa_l2cache_lock;
 static avl_tree_t spa_l2cache_avl;
 
 kmem_cache_t *spa_buffer_pool;
-int spa_mode;
+int spa_mode_global;
 
 #ifdef ZFS_DEBUG
 /* Everything except dprintf is on by default in debug builds */
@@ -429,7 +429,6 @@ spa_add(const char *name, const char *altroot)
 	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
 
 	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&spa->spa_async_root_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -438,7 +437,6 @@ spa_add(const char *name, const char *altroot)
 	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
-	cv_init(&spa->spa_async_root_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
 
@@ -512,12 +510,10 @@ spa_remove(spa_t *spa)
 	spa_config_lock_destroy(spa);
 
 	cv_destroy(&spa->spa_async_cv);
-	cv_destroy(&spa->spa_async_root_cv);
 	cv_destroy(&spa->spa_scrub_io_cv);
 	cv_destroy(&spa->spa_suspend_cv);
 
 	mutex_destroy(&spa->spa_async_lock);
-	mutex_destroy(&spa->spa_async_root_lock);
 	mutex_destroy(&spa->spa_scrub_lock);
 	mutex_destroy(&spa->spa_errlog_lock);
 	mutex_destroy(&spa->spa_errlist_lock);
@@ -884,8 +880,10 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
 		txg_wait_synced(spa->spa_dsl_pool, txg);
 
 	if (vd != NULL) {
-		ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0);
+		ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0);
+		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 		vdev_free(vd);
+		spa_config_exit(spa, SCL_ALL, spa);
 	}
 
 	/*
@@ -916,6 +914,15 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
 
 	spa_config_exit(spa, SCL_STATE_ALL, spa);
 
+	/*
+	 * If anything changed, wait for it to sync.  This ensures that,
+	 * from the system administrator's perspective, zpool(1M) commands
+	 * are synchronous.  This is important for things like zpool offline:
+	 * when the command completes, you expect no further I/O from ZFS.
+	 */
+	if (vd != NULL)
+		txg_wait_synced(spa->spa_dsl_pool, 0);
+
 	return (error);
 }
 
@@ -1118,6 +1125,37 @@ zfs_panic_recover(const char *fmt, ...)
 }
 
 /*
+ * This is a stripped-down version of strtoull, suitable only for converting
+ * lowercase hexidecimal numbers that don't overflow.
+ */
+uint64_t
+zfs_strtonum(const char *str, char **nptr)
+{
+	uint64_t val = 0;
+	char c;
+	int digit;
+
+	while ((c = *str) != '\0') {
+		if (c >= '0' && c <= '9')
+			digit = c - '0';
+		else if (c >= 'a' && c <= 'f')
+			digit = 10 + c - 'a';
+		else
+			break;
+
+		val *= 16;
+		val += digit;
+
+		str++;
+	}
+
+	if (nptr)
+		*nptr = (char *)str;
+
+	return (val);
+}
+
+/*
  * ==========================================================================
  * Accessor functions
  * ==========================================================================
@@ -1355,7 +1393,7 @@ spa_init(int mode)
 	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
 	    offsetof(spa_aux_t, aux_avl));
 
-	spa_mode = mode;
+	spa_mode_global = mode;
 
 	refcount_sysinit();
 	unique_init();
@@ -1412,3 +1450,15 @@ spa_is_root(spa_t *spa)
 {
 	return (spa->spa_is_root);
 }
+
+boolean_t
+spa_writeable(spa_t *spa)
+{
+	return (!!(spa->spa_mode & FWRITE));
+}
+
+int
+spa_mode(spa_t *spa)
+{
+	return (spa->spa_mode);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
index 0f247c0..75b55d5 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -116,12 +116,23 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
 
 	if (merge_before && merge_after) {
 		avl_remove(&sm->sm_root, ss_before);
+		if (sm->sm_pp_root) {
+			avl_remove(sm->sm_pp_root, ss_before);
+			avl_remove(sm->sm_pp_root, ss_after);
+		}
 		ss_after->ss_start = ss_before->ss_start;
 		kmem_free(ss_before, sizeof (*ss_before));
+		ss = ss_after;
 	} else if (merge_before) {
 		ss_before->ss_end = end;
+		if (sm->sm_pp_root)
+			avl_remove(sm->sm_pp_root, ss_before);
+		ss = ss_before;
 	} else if (merge_after) {
 		ss_after->ss_start = start;
+		if (sm->sm_pp_root)
+			avl_remove(sm->sm_pp_root, ss_after);
+		ss = ss_after;
 	} else {
 		ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
 		ss->ss_start = start;
@@ -129,6 +140,9 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
 		avl_insert(&sm->sm_root, ss, where);
 	}
 
+	if (sm->sm_pp_root)
+		avl_add(sm->sm_pp_root, ss);
+
 	sm->sm_space += size;
 }
 
@@ -163,12 +177,17 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
 	left_over = (ss->ss_start != start);
 	right_over = (ss->ss_end != end);
 
+	if (sm->sm_pp_root)
+		avl_remove(sm->sm_pp_root, ss);
+
 	if (left_over && right_over) {
 		newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
 		newseg->ss_start = end;
 		newseg->ss_end = ss->ss_end;
 		ss->ss_end = start;
 		avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
+		if (sm->sm_pp_root)
+			avl_add(sm->sm_pp_root, newseg);
 	} else if (left_over) {
 		ss->ss_end = start;
 	} else if (right_over) {
@@ -176,12 +195,16 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
 	} else {
 		avl_remove(&sm->sm_root, ss);
 		kmem_free(ss, sizeof (*ss));
+		ss = NULL;
 	}
 
+	if (sm->sm_pp_root && ss != NULL)
+		avl_add(sm->sm_pp_root, ss);
+
 	sm->sm_space -= size;
 }
 
-int
+boolean_t
 space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
 {
 	avl_index_t where;
@@ -221,59 +244,10 @@ space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
 {
 	space_seg_t *ss;
 
-	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
-		func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
-}
-
-void
-space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	avl_tree_t *t = &sm->sm_root;
-	avl_index_t where;
-	space_seg_t *ss, search;
-	uint64_t end = start + size;
-	uint64_t rm_start, rm_end;
-
 	ASSERT(MUTEX_HELD(sm->sm_lock));
 
-	search.ss_start = start;
-	search.ss_end = start;
-
-	for (;;) {
-		ss = avl_find(t, &search, &where);
-
-		if (ss == NULL)
-			ss = avl_nearest(t, where, AVL_AFTER);
-
-		if (ss == NULL || ss->ss_start >= end)
-			break;
-
-		rm_start = MAX(ss->ss_start, start);
-		rm_end = MIN(ss->ss_end, end);
-
-		space_map_remove(sm, rm_start, rm_end - rm_start);
-	}
-}
-
-/*
- * Replace smd with the union of smd and sms.
- */
-void
-space_map_union(space_map_t *smd, space_map_t *sms)
-{
-	avl_tree_t *t = &sms->sm_root;
-	space_seg_t *ss;
-
-	ASSERT(MUTEX_HELD(smd->sm_lock));
-
-	/*
-	 * For each source segment, remove any intersections with the
-	 * destination, then add the source segment to the destination.
-	 */
-	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
-		space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
-		space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
-	}
+	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+		func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
 }
 
 /*
@@ -337,7 +311,8 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
 		    smo->smo_object, offset, size);
 
 		mutex_exit(sm->sm_lock);
-		error = dmu_read(os, smo->smo_object, offset, size, entry_map);
+		error = dmu_read(os, smo->smo_object, offset, size, entry_map,
+		    DMU_READ_PREFETCH);
 		mutex_enter(sm->sm_lock);
 		if (error != 0)
 			break;
@@ -391,6 +366,15 @@ space_map_unload(space_map_t *sm)
 }
 
 uint64_t
+space_map_maxsize(space_map_t *sm)
+{
+	if (sm->sm_loaded && sm->sm_ops != NULL)
+		return (sm->sm_ops->smop_max(sm));
+	else
+		return (-1ULL);
+}
+
+uint64_t
 space_map_alloc(space_map_t *sm, uint64_t size)
 {
 	uint64_t start;
@@ -505,3 +489,131 @@ space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
 	smo->smo_objsize = 0;
 	smo->smo_alloc = 0;
 }
+
+/*
+ * Space map reference trees.
+ *
+ * A space map is a collection of integers.  Every integer is either
+ * in the map, or it's not.  A space map reference tree generalizes
+ * the idea: it allows its members to have arbitrary reference counts,
+ * as opposed to the implicit reference count of 0 or 1 in a space map.
+ * This representation comes in handy when computing the union or
+ * intersection of multiple space maps.  For example, the union of
+ * N space maps is the subset of the reference tree with refcnt >= 1.
+ * The intersection of N space maps is the subset with refcnt >= N.
+ *
+ * [It's very much like a Fourier transform.  Unions and intersections
+ * are hard to perform in the 'space map domain', so we convert the maps
+ * into the 'reference count domain', where it's trivial, then invert.]
+ *
+ * vdev_dtl_reassess() uses computations of this form to determine
+ * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
+ * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
+ * has an outage wherever refcnt >= vdev_children.
+ */
+static int
+space_map_ref_compare(const void *x1, const void *x2)
+{
+	const space_ref_t *sr1 = x1;
+	const space_ref_t *sr2 = x2;
+
+	if (sr1->sr_offset < sr2->sr_offset)
+		return (-1);
+	if (sr1->sr_offset > sr2->sr_offset)
+		return (1);
+
+	if (sr1 < sr2)
+		return (-1);
+	if (sr1 > sr2)
+		return (1);
+
+	return (0);
+}
+
+void
+space_map_ref_create(avl_tree_t *t)
+{
+	avl_create(t, space_map_ref_compare,
+	    sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
+}
+
+void
+space_map_ref_destroy(avl_tree_t *t)
+{
+	space_ref_t *sr;
+	void *cookie = NULL;
+
+	while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
+		kmem_free(sr, sizeof (*sr));
+
+	avl_destroy(t);
+}
+
+static void
+space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
+{
+	space_ref_t *sr;
+
+	sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
+	sr->sr_offset = offset;
+	sr->sr_refcnt = refcnt;
+
+	avl_add(t, sr);
+}
+
+void
+space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
+	int64_t refcnt)
+{
+	space_map_ref_add_node(t, start, refcnt);
+	space_map_ref_add_node(t, end, -refcnt);
+}
+
+/*
+ * Convert (or add) a space map into a reference tree.
+ */
+void
+space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt)
+{
+	space_seg_t *ss;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+
+	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+		space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt);
+}
+
+/*
+ * Convert a reference tree into a space map.  The space map will contain
+ * all members of the reference tree for which refcnt >= minref.
+ */
+void
+space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref)
+{
+	uint64_t start = -1ULL;
+	int64_t refcnt = 0;
+	space_ref_t *sr;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+
+	space_map_vacate(sm, NULL, NULL);
+
+	for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
+		refcnt += sr->sr_refcnt;
+		if (refcnt >= minref) {
+			if (start == -1ULL) {
+				start = sr->sr_offset;
+			}
+		} else {
+			if (start != -1ULL) {
+				uint64_t end = sr->sr_offset;
+				ASSERT(start <= end);
+				if (end > start)
+					space_map_add(sm, start, end - start);
+				start = -1ULL;
+			}
+		}
+	}
+	ASSERT(refcnt == 0);
+	ASSERT(start == -1ULL);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
index 0a39d19..f52851d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
@@ -85,6 +85,8 @@ void *arc_data_buf_alloc(uint64_t space);
 void arc_data_buf_free(void *buf, uint64_t space);
 arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
     arc_buf_contents_t type);
+arc_buf_t *arc_loan_buf(spa_t *spa, int size);
+void arc_return_buf(arc_buf_t *buf, void *tag);
 void arc_buf_add_ref(arc_buf_t *buf, void *tag);
 int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
 int arc_buf_size(arc_buf_t *buf);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
index b27d89f..7e2754d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -262,6 +262,7 @@ void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
+void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 
 void dbuf_clear(dmu_buf_impl_t *db);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
index 7befe96..08c30c8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -61,6 +61,7 @@ struct zbookmark;
 struct spa;
 struct nvlist;
 struct objset_impl;
+struct arc_buf;
 struct file;
 
 typedef struct objset objset_t;
@@ -116,6 +117,8 @@ typedef enum dmu_object_type {
 	DMU_OT_FUID_SIZE,		/* FUID table size UINT64 */
 	DMU_OT_NEXT_CLONES,		/* ZAP */
 	DMU_OT_SCRUB_QUEUE,		/* ZAP */
+	DMU_OT_USERGROUP_USED,		/* ZAP */
+	DMU_OT_USERGROUP_QUOTA,		/* ZAP */
 	DMU_OT_NUMTYPES
 } dmu_object_type_t;
 
@@ -158,6 +161,9 @@ void zfs_znode_byteswap(void *buf, size_t size);
 #define	DMU_MAX_ACCESS (10<<20) /* 10MB */
 #define	DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
 
+#define	DMU_USERUSED_OBJECT	(-1ULL)
+#define	DMU_GROUPUSED_OBJECT	(-2ULL)
+
 /*
  * Public routines to create, destroy, open, and close objsets.
  */
@@ -173,7 +179,8 @@ int dmu_objset_create(const char *name, dmu_objset_type_t type,
 int dmu_objset_destroy(const char *name);
 int dmu_snapshots_destroy(char *fsname, char *snapname);
 int dmu_objset_rollback(objset_t *os);
-int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive);
+int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props,
+    boolean_t recursive);
 int dmu_objset_rename(const char *name, const char *newname,
     boolean_t recursive);
 int dmu_objset_find(char *name, int func(char *, void *), void *arg,
@@ -400,6 +407,11 @@ void *dmu_buf_get_user(dmu_buf_t *db);
 void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
 
 /*
+ * Tells if the given dbuf is freeable.
+ */
+boolean_t dmu_buf_freeable(dmu_buf_t *);
+
+/*
  * You must create a transaction, then hold the objects which you will
  * (or might) modify as part of this transaction.  Then you must assign
  * the transaction to a transaction group.  Once the transaction has
@@ -424,7 +436,7 @@ dmu_tx_t *dmu_tx_create(objset_t *os);
 void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
 void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
     uint64_t len);
-void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name);
+void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
 void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_abort(dmu_tx_t *tx);
 int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
@@ -447,8 +459,10 @@ int dmu_free_object(objset_t *os, uint64_t object);
  * Canfail routines will return 0 on success, or an errno if there is a
  * nonrecoverable I/O error.
  */
+#define	DMU_READ_PREFETCH	0 /* prefetch */
+#define	DMU_READ_NO_PREFETCH	1 /* don't prefetch */
 int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
-	void *buf);
+	void *buf, uint32_t flags);
 void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	const void *buf, dmu_tx_t *tx);
 int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
@@ -456,6 +470,10 @@ int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
     dmu_tx_t *tx);
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, struct page *pp, dmu_tx_t *tx);
+struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
+void dmu_return_arcbuf(struct arc_buf *buf);
+void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
+    dmu_tx_t *tx);
 
 extern int zfs_prefetch_disable;
 
@@ -562,6 +580,12 @@ extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
     int maxlen, boolean_t *conflict);
 extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp);
+
+typedef void objset_used_cb_t(objset_t *os, dmu_object_type_t bonustype,
+    void *oldbonus, void *newbonus, uint64_t oldused, uint64_t newused,
+    dmu_tx_t *tx);
+extern void dmu_objset_register_type(dmu_objset_type_t ost,
+    objset_used_cb_t *cb);
 extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
 extern void *dmu_objset_get_user(objset_t *os);
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
index 1d65727..a8022d2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
@@ -42,12 +42,20 @@ struct dsl_dataset;
 struct dmu_tx;
 struct objset_impl;
 
+#define	OBJSET_PHYS_SIZE 2048
+#define	OBJSET_OLD_PHYS_SIZE 1024
+
+#define	OBJSET_FLAG_USERACCOUNTING_COMPLETE	(1ULL<<0)
+
 typedef struct objset_phys {
 	dnode_phys_t os_meta_dnode;
 	zil_header_t os_zil_header;
 	uint64_t os_type;
-	char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) -
-	    sizeof (uint64_t)];
+	uint64_t os_flags;
+	char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 -
+	    sizeof (zil_header_t) - sizeof (uint64_t)*2];
+	dnode_phys_t os_userused_dnode;
+	dnode_phys_t os_groupused_dnode;
 } objset_phys_t;
 
 struct objset {
@@ -62,6 +70,8 @@ typedef struct objset_impl {
 	arc_buf_t *os_phys_buf;
 	objset_phys_t *os_phys;
 	dnode_t *os_meta_dnode;
+	dnode_t *os_userused_dnode;
+	dnode_t *os_groupused_dnode;
 	zilog_t *os_zil;
 	objset_t os;
 	uint8_t os_checksum;	/* can change, under dsl_dir's locks */
@@ -74,6 +84,8 @@ typedef struct objset_impl {
 	struct dmu_tx *os_synctx; /* XXX sketchy */
 	blkptr_t *os_rootbp;
 	zil_header_t os_zil_header;
+	list_t os_synced_dnodes;
+	uint64_t os_flags;
 
 	/* Protected by os_obj_lock */
 	kmutex_t os_obj_lock;
@@ -92,6 +104,7 @@ typedef struct objset_impl {
 } objset_impl_t;
 
 #define	DMU_META_DNODE_OBJECT	0
+#define	DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
 
 #define	DMU_OS_IS_L2CACHEABLE(os)				\
 	((os)->os_secondary_cache == ZFS_CACHE_ALL ||		\
@@ -106,7 +119,8 @@ int dmu_objset_create(const char *name, dmu_objset_type_t type,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
 int dmu_objset_destroy(const char *name);
 int dmu_objset_rollback(objset_t *os);
-int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive);
+int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props,
+    boolean_t recursive);
 void dmu_objset_stats(objset_t *os, nvlist_t *nv);
 void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
 void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
@@ -127,6 +141,10 @@ objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
 int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
     objset_impl_t **osip);
 void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
+void dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx);
+boolean_t dmu_objset_userused_enabled(objset_impl_t *os);
+int dmu_objset_userspace_upgrade(objset_t *os);
+boolean_t dmu_objset_userspace_present(objset_t *os);
 
 #ifdef	__cplusplus
 }
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
index be9e569..48e4da8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
@@ -98,7 +98,8 @@ enum dnode_dirtycontext {
 };
 
 /* Is dn_used in bytes?  if not, it's in multiples of SPA_MINBLOCKSIZE */
-#define	DNODE_FLAG_USED_BYTES	(1<<0)
+#define	DNODE_FLAG_USED_BYTES		(1<<0)
+#define	DNODE_FLAG_USERUSED_ACCOUNTED	(1<<1)
 
 typedef struct dnode_phys {
 	uint8_t dn_type;		/* dmu_object_type_t */
@@ -131,10 +132,7 @@ typedef struct dnode {
 	 */
 	krwlock_t dn_struct_rwlock;
 
-	/*
-	 * Our link on dataset's dd_dnodes list.
-	 * Protected by dd_accounting_mtx.
-	 */
+	/* Our link on dn_objset->os_dnodes list; protected by os_lock.  */
 	list_node_t dn_link;
 
 	/* immutable: */
@@ -191,6 +189,9 @@ typedef struct dnode {
 	/* parent IO for current sync write */
 	zio_t *dn_zio;
 
+	/* used in syncing context */
+	dnode_phys_t *dn_oldphys;
+
 	/* holds prefetch structure */
 	struct zfetch	dn_zfetch;
 } dnode_t;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
index 8665aec..a1c2896 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -195,7 +195,7 @@ void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
 void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
 int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
     dmu_tx_t *tx);
-int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
+boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
 uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
 
 void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
index a29e44e..b064c92 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_DSL_DELEG_H
 #define	_SYS_DSL_DELEG_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/dmu.h>
 #include <sys/dsl_pool.h>
 #include <sys/zfs_context.h>
@@ -51,6 +49,10 @@ extern "C" {
 #define	ZFS_DELEG_PERM_ALLOW		"allow"
 #define	ZFS_DELEG_PERM_USERPROP		"userprop"
 #define	ZFS_DELEG_PERM_VSCAN		"vscan"
+#define	ZFS_DELEG_PERM_USERQUOTA	"userquota"
+#define	ZFS_DELEG_PERM_GROUPQUOTA	"groupquota"
+#define	ZFS_DELEG_PERM_USERUSED		"userused"
+#define	ZFS_DELEG_PERM_GROUPUSED	"groupused"
 
 /*
  * Note: the names of properties that are marked delegatable are also
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
index 86b9636..56d0638 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -107,7 +107,6 @@ int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
     const char *tail, void *tag, dsl_dir_t **);
 void dsl_dir_name(dsl_dir_t *dd, char *buf);
 int dsl_dir_namelen(dsl_dir_t *dd);
-int dsl_dir_is_private(dsl_dir_t *dd);
 uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
     const char *name, dmu_tx_t *tx);
 dsl_checkfunc_t dsl_dir_destroy_check;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
index ef1b904..d8da295 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
index d66caa8..26018a4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
@@ -19,18 +19,17 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_DSL_PROP_H
 #define	_SYS_DSL_PROP_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/dmu.h>
 #include <sys/dsl_pool.h>
 #include <sys/zfs_context.h>
+#include <sys/dsl_synctask.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -66,8 +65,10 @@ int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname,
 int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname,
     int intsz, int numints, void *buf, char *setpoint);
 
+dsl_syncfunc_t dsl_props_set_sync;
 int dsl_prop_set(const char *ddname, const char *propname,
     int intsz, int numints, const void *buf);
+int dsl_props_set(const char *dsname, nvlist_t *nvl);
 void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
     cred_t *cr, dmu_tx_t *tx);
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
index 1c9d89e..5d3e11c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -39,6 +39,8 @@ extern "C" {
 typedef struct metaslab_class metaslab_class_t;
 typedef struct metaslab_group metaslab_group_t;
 
+extern space_map_ops_t *zfs_metaslab_ops;
+
 extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
     uint64_t start, uint64_t size, uint64_t txg);
 extern void metaslab_fini(metaslab_t *msp);
@@ -55,7 +57,7 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
     boolean_t now);
 extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
 
-extern metaslab_class_t *metaslab_class_create(void);
+extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops);
 extern void metaslab_class_destroy(metaslab_class_t *mc);
 extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
 extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
index 5980cbc..d67dea7 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_METASLAB_IMPL_H
 #define	_SYS_METASLAB_IMPL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/metaslab.h>
 #include <sys/space_map.h>
 #include <sys/vdev.h>
@@ -41,6 +39,7 @@ extern "C" {
 struct metaslab_class {
 	metaslab_group_t	*mc_rotor;
 	uint64_t		mc_allocated;
+	space_map_ops_t		*mc_ops;
 };
 
 struct metaslab_group {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
index 1cfa7ec..f54a5dc 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -208,8 +208,8 @@ typedef struct blkptr {
 #define	DVA_SET_GANG(dva, x)	BF64_SET((dva)->dva_word[1], 63, 1, x)
 
 #define	BP_GET_LSIZE(bp)	\
-	(BP_IS_HOLE(bp) ? 0 : \
-	BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1))
+	BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
+
 #define	BP_SET_LSIZE(bp, x)	\
 	BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
 
@@ -329,7 +329,7 @@ extern int spa_check_rootconf(char *devpath, char *devid,
 extern boolean_t spa_rootdev_validate(nvlist_t *nv);
 extern int spa_import_rootpool(char *devpath, char *devid);
 extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props);
-extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *);
+extern int spa_import_verbatim(const char *, nvlist_t *, nvlist_t *);
 extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
 extern int spa_destroy(char *pool);
 extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
@@ -352,9 +352,11 @@ extern void spa_inject_delref(spa_t *spa);
 extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
 extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
     int replacing);
-extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done);
+extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
+    int replace_done);
 extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
+extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
 
 /* spare state (which is global across all pools) */
 extern void spa_spare_add(vdev_t *vd);
@@ -476,6 +478,10 @@ extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
 extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp);
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
+extern boolean_t spa_writeable(spa_t *spa);
+extern int spa_mode(spa_t *spa);
+extern uint64_t zfs_strtonum(const char *str, char **nptr);
+#define	strtonum(str, nptr)	zfs_strtonum((str), (nptr))
 
 /* history logging */
 typedef enum history_log_type {
@@ -529,6 +535,7 @@ extern void spa_boot_init();
 extern int spa_prop_set(spa_t *spa, nvlist_t *nvp);
 extern int spa_prop_get(spa_t *spa, nvlist_t **nvp);
 extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
+extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
 
 /* asynchronous event notification */
 extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name);
@@ -546,7 +553,7 @@ _NOTE(CONSTCOND) } while (0)
 #define	dprintf_bp(bp, fmt, ...)
 #endif
 
-extern int spa_mode;			/* mode, e.g. FREAD | FWRITE */
+extern int spa_mode_global;			/* mode, e.g. FREAD | FWRITE */
 
 #ifdef	__cplusplus
 }
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
index 8aeb414..f3124b1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -105,6 +105,7 @@ struct spa {
 	int		spa_inject_ref;		/* injection references */
 	uint8_t		spa_sync_on;		/* sync threads are running */
 	spa_load_state_t spa_load_state;	/* current load operation */
+	boolean_t	spa_load_verbatim;	/* load the given config? */
 	taskq_t		*spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
 	dsl_pool_t	*spa_dsl_pool;
 	metaslab_class_t *spa_normal_class;	/* normal data class */
@@ -141,9 +142,6 @@ struct spa {
 	int		spa_async_suspended;	/* async tasks suspended */
 	kcondvar_t	spa_async_cv;		/* wait for thread_exit() */
 	uint16_t	spa_async_tasks;	/* async task mask */
-	kmutex_t	spa_async_root_lock;	/* protects async root count */
-	uint64_t	spa_async_root_count;	/* number of async root zios */
-	kcondvar_t	spa_async_root_cv;	/* notify when count == 0 */
 	char		*spa_root;		/* alternate root directory */
 	uint64_t	spa_ena;		/* spa-wide ereport ENA */
 	boolean_t	spa_last_open_failed;	/* true if last open faled */
@@ -163,13 +161,14 @@ struct spa {
 	uint64_t	spa_failmode;		/* failure mode for the pool */
 	uint64_t	spa_delegation;		/* delegation on/off */
 	list_t		spa_config_list;	/* previous cache file(s) */
+	zio_t		*spa_async_zio_root;	/* root of all async I/O */
 	zio_t		*spa_suspend_zio_root;	/* root of all suspended I/O */
 	kmutex_t	spa_suspend_lock;	/* protects suspend_zio_root */
 	kcondvar_t	spa_suspend_cv;		/* notification of resume */
 	uint8_t		spa_suspended;		/* pool is suspended */
-	boolean_t	spa_import_faulted;	/* allow faulted vdevs */
 	boolean_t	spa_is_root;		/* pool is root */
 	int		spa_minref;		/* num refs when first opened */
+	int		spa_mode;		/* FREAD | FWRITE */
 	spa_log_state_t spa_log_state;		/* log state */
 	/*
 	 * spa_refcnt & spa_config_lock must be the last elements
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
index db9daef..a682bbd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_SPACE_MAP_H
 #define	_SYS_SPACE_MAP_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/avl.h>
 #include <sys/dmu.h>
 
@@ -48,16 +46,24 @@ typedef struct space_map {
 	uint8_t		sm_loading;	/* map loading? */
 	kcondvar_t	sm_load_cv;	/* map load completion */
 	space_map_ops_t	*sm_ops;	/* space map block picker ops vector */
+	avl_tree_t	*sm_pp_root;	/* picker-private AVL tree */
 	void		*sm_ppd;	/* picker-private data */
 	kmutex_t	*sm_lock;	/* pointer to lock that protects map */
 } space_map_t;
 
 typedef struct space_seg {
 	avl_node_t	ss_node;	/* AVL node */
+	avl_node_t	ss_pp_node;	/* AVL picker-private node */
 	uint64_t	ss_start;	/* starting offset of this segment */
 	uint64_t	ss_end;		/* ending offset (non-inclusive) */
 } space_seg_t;
 
+typedef struct space_ref {
+	avl_node_t	sr_node;	/* AVL node */
+	uint64_t	sr_offset;	/* offset (start or end) */
+	int64_t		sr_refcnt;	/* associated reference count */
+} space_ref_t;
+
 typedef struct space_map_obj {
 	uint64_t	smo_object;	/* on-disk space map object */
 	uint64_t	smo_objsize;	/* size of the object */
@@ -70,6 +76,7 @@ struct space_map_ops {
 	uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size);
 	void	(*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
 	void	(*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
+	uint64_t (*smop_max)(space_map_t *sm);
 };
 
 /*
@@ -133,13 +140,12 @@ extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
 extern void space_map_destroy(space_map_t *sm);
 extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
 extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
-extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size);
+extern boolean_t space_map_contains(space_map_t *sm,
+    uint64_t start, uint64_t size);
 extern void space_map_vacate(space_map_t *sm,
     space_map_func_t *func, space_map_t *mdest);
 extern void space_map_walk(space_map_t *sm,
     space_map_func_t *func, space_map_t *mdest);
-extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_union(space_map_t *smd, space_map_t *sms);
 
 extern void space_map_load_wait(space_map_t *sm);
 extern int space_map_load(space_map_t *sm, space_map_ops_t *ops,
@@ -149,12 +155,22 @@ extern void space_map_unload(space_map_t *sm);
 extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size);
 extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size);
 extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size);
+extern uint64_t space_map_maxsize(space_map_t *sm);
 
 extern void space_map_sync(space_map_t *sm, uint8_t maptype,
     space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx);
 extern void space_map_truncate(space_map_obj_t *smo,
     objset_t *os, dmu_tx_t *tx);
 
+extern void space_map_ref_create(avl_tree_t *t);
+extern void space_map_ref_destroy(avl_tree_t *t);
+extern void space_map_ref_add_seg(avl_tree_t *t,
+    uint64_t start, uint64_t end, int64_t refcnt);
+extern void space_map_ref_add_map(avl_tree_t *t,
+    space_map_t *sm, int64_t refcnt);
+extern void space_map_ref_generate_map(avl_tree_t *t,
+    space_map_t *sm, int64_t minref);
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
index 55a0dd5..b49df8a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_UBERBLOCK_IMPL_H
 #define	_SYS_UBERBLOCK_IMPL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/uberblock.h>
 
 #ifdef	__cplusplus
@@ -35,6 +33,11 @@ extern "C" {
 #endif
 
 /*
+ * For zdb use and debugging purposes only
+ */
+extern uint64_t ub_max_txg;
+
+/*
  * The uberblock version is incremented whenever an incompatible on-disk
  * format change is made to the SPA, DMU, or ZAP.
  *
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
index c070d6f..b8313a9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
@@ -36,6 +36,14 @@
 extern "C" {
 #endif
 
+typedef enum vdev_dtl_type {
+	DTL_MISSING,	/* 0% replication: no copies of the data */
+	DTL_PARTIAL,	/* less than 100% replication: some copies missing */
+	DTL_SCRUB,	/* unable to fully repair during scrub/resilver */
+	DTL_OUTAGE,	/* temporarily missing (used to attempt detach) */
+	DTL_TYPES
+} vdev_dtl_type_t;
+
 extern boolean_t zfs_nocacheflush;
 
 extern int vdev_open(vdev_t *);
@@ -50,10 +58,14 @@ extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
 extern boolean_t vdev_is_bootable(vdev_t *vd);
 extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
 extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
-extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size);
-extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size);
+extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
+    uint64_t txg, uint64_t size);
+extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
+    uint64_t txg, uint64_t size);
+extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
 extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
     int scrub_done);
+extern boolean_t vdev_dtl_required(vdev_t *vd);
 extern boolean_t vdev_resilver_needed(vdev_t *vd,
     uint64_t *minp, uint64_t *maxp);
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
index 7e24ede..1406d15 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -123,8 +123,7 @@ struct vdev {
 	vdev_t		*vdev_parent;	/* parent vdev			*/
 	vdev_t		**vdev_child;	/* array of children		*/
 	uint64_t	vdev_children;	/* number of children		*/
-	space_map_t	vdev_dtl_map;	/* dirty time log in-core state	*/
-	space_map_t	vdev_dtl_scrub;	/* DTL for scrub repair writes	*/
+	space_map_t	vdev_dtl[DTL_TYPES]; /* in-core dirty time logs	*/
 	vdev_stat_t	vdev_stat;	/* virtual device statistics	*/
 
 	/*
@@ -149,7 +148,7 @@ struct vdev {
 	 * Leaf vdev state.
 	 */
 	uint64_t	vdev_psize;	/* physical device capacity	*/
-	space_map_obj_t	vdev_dtl;	/* dirty time log on-disk state	*/
+	space_map_obj_t	vdev_dtl_smo;	/* dirty time log space map obj	*/
 	txg_node_t	vdev_dtl_node;	/* per-txg dirty DTL linkage	*/
 	uint64_t	vdev_wholedisk;	/* true if this is a whole disk */
 	uint64_t	vdev_offline;	/* persistent offline state	*/
@@ -160,6 +159,7 @@ struct vdev {
 	char		*vdev_path;	/* vdev path (if any)		*/
 	char		*vdev_devid;	/* vdev devid (if any)		*/
 	char		*vdev_physpath;	/* vdev device path (if any)	*/
+	char		*vdev_fru;	/* physical FRU location	*/
 	uint64_t	vdev_not_present; /* not present during import	*/
 	uint64_t	vdev_unspare;	/* unspare when resilvering done */
 	hrtime_t	vdev_last_try;	/* last reopen time		*/
@@ -189,8 +189,9 @@ struct vdev {
 	kmutex_t	vdev_probe_lock; /* protects vdev_probe_zio	*/
 };
 
-#define	VDEV_SKIP_SIZE		(8 << 10)
-#define	VDEV_BOOT_HEADER_SIZE	(8 << 10)
+#define	VDEV_PAD_SIZE		(8 << 10)
+/* 2 padding areas (vl_pad1 and vl_pad2) to skip */
+#define	VDEV_SKIP_SIZE		VDEV_PAD_SIZE * 2
 #define	VDEV_PHYS_SIZE		(112 << 10)
 #define	VDEV_UBERBLOCK_RING	(128 << 10)
 
@@ -202,26 +203,14 @@ struct vdev {
 	offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
 #define	VDEV_UBERBLOCK_SIZE(vd)		(1ULL << VDEV_UBERBLOCK_SHIFT(vd))
 
-/* ZFS boot block */
-#define	VDEV_BOOT_MAGIC		0x2f5b007b10cULL
-#define	VDEV_BOOT_VERSION	1		/* version number	*/
-
-typedef struct vdev_boot_header {
-	uint64_t	vb_magic;		/* VDEV_BOOT_MAGIC	*/
-	uint64_t	vb_version;		/* VDEV_BOOT_VERSION	*/
-	uint64_t	vb_offset;		/* start offset	(bytes) */
-	uint64_t	vb_size;		/* size (bytes)		*/
-	char		vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)];
-} vdev_boot_header_t;
-
 typedef struct vdev_phys {
 	char		vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
 	zio_block_tail_t vp_zbt;
 } vdev_phys_t;
 
 typedef struct vdev_label {
-	char		vl_pad[VDEV_SKIP_SIZE];			/*   8K	*/
-	vdev_boot_header_t vl_boot_header;			/*   8K	*/
+	char		vl_pad1[VDEV_PAD_SIZE];			/*  8K */
+	char		vl_pad2[VDEV_PAD_SIZE];			/*  8K */
 	vdev_phys_t	vl_vdev_phys;				/* 112K	*/
 	char		vl_uberblock[VDEV_UBERBLOCK_RING];	/* 128K	*/
 } vdev_label_t;							/* 256K total */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
index f88cc06..ea3a0f6 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
@@ -186,6 +186,9 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
     matchtype_t mt, char *realname, int rn_len,
     boolean_t *normalization_conflictp);
 
+int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
+    int add, uint64_t *towrite, uint64_t *tooverwrite);
+
 /*
  * Create an attribute with the given name and value.
  *
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
index 0dc02ab..c86bb16 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_ZAP_IMPL_H
 #define	_SYS_ZAP_IMPL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/zap.h>
 #include <sys/zfs_context.h>
 #include <sys/avl.h>
@@ -195,6 +193,8 @@ int fzap_count(zap_t *zap, uint64_t *count);
 int fzap_lookup(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     char *realname, int rn_len, boolean_t *normalization_conflictp);
+int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
+    uint64_t *tooverwrite);
 int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx);
 int fzap_update(zap_name_t *zn,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
index f87823c..3607e1f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -113,8 +113,6 @@ typedef struct zfs_acl_phys {
 	uint8_t		z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
 } zfs_acl_phys_t;
 
-
-
 typedef struct acl_ops {
 	uint32_t	(*ace_mask_get) (void *acep); /* get  access mask */
 	void 		(*ace_mask_set) (void *acep,
@@ -160,12 +158,21 @@ typedef struct zfs_acl {
 	zfs_acl_node_t	*z_curr_node;	/* current node iterator is handling */
 	list_t		z_acl;		/* chunks of ACE data */
 	acl_ops_t	z_ops;		/* ACL operations */
-	boolean_t	z_has_fuids;	/* FUIDs present in ACL? */
 } zfs_acl_t;
 
 #define	ACL_DATA_ALLOCED	0x1
 #define	ZFS_ACL_SIZE(aclcnt)	(sizeof (ace_t) * (aclcnt))
 
+struct zfs_fuid_info;
+
+typedef struct zfs_acl_ids {
+	uint64_t		z_fuid;		/* file owner fuid */
+	uint64_t		z_fgid;		/* file group owner fuid */
+	uint64_t		z_mode;		/* mode to set on create */
+	zfs_acl_t		*z_aclp;	/* ACL to create with file */
+	struct zfs_fuid_info 	*z_fuidp;	/* for tracking fuids for log */
+} zfs_acl_ids_t;
+
 /*
  * Property values for acl_mode and acl_inherit.
  *
@@ -182,11 +189,12 @@ typedef struct zfs_acl {
 
 struct znode;
 struct zfsvfs;
-struct zfs_fuid_info;
 
 #ifdef _KERNEL
-void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *,
-    dmu_tx_t *, cred_t *, zfs_acl_t *, zfs_fuid_info_t **);
+int zfs_acl_ids_create(struct znode *, int, vattr_t *,
+    cred_t *, vsecattr_t *, zfs_acl_ids_t *);
+void zfs_acl_ids_free(zfs_acl_ids_t *);
+boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *);
 int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *);
 int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *);
 void zfs_acl_rele(void *);
@@ -201,9 +209,9 @@ int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
 int zfs_zaccess_rename(struct znode *, struct znode *,
     struct znode *, struct znode *, cred_t *cr);
 void zfs_acl_free(zfs_acl_t *);
-int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, zfs_acl_t **);
-int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *,
-    struct zfs_fuid_info **, dmu_tx_t *);
+int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *,
+    struct zfs_fuid_info **, zfs_acl_t **);
+int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *);
 
 #endif
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
index 0dd8f4f..952bb24 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
@@ -134,4 +134,6 @@ extern struct mtx zfs_debug_mtx;
 	}								\
 } while (0)
 
+#define	sys_shutdown	rebooting
+
 #endif	/* _SYS_ZFS_CONTEXT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
index 905e8dd..25348d6 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_ZFS_CTLDIR_H
 #define	_ZFS_CTLDIR_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/vnode.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_znode.h>
@@ -63,6 +61,7 @@ int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
 
 #define	ZFSCTL_INO_ROOT		0x1
 #define	ZFSCTL_INO_SNAPDIR	0x2
+#define	ZFSCTL_INO_SHARES	0x3
 
 #ifdef	__cplusplus
 }
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
index 0dbb3c5..bd2c938 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
@@ -49,7 +49,6 @@ extern "C" {
 /* mknode flags */
 #define	IS_ROOT_NODE	0x01		/* create a root node */
 #define	IS_XATTR	0x02		/* create an extended attribute node */
-#define	IS_REPLAY	0x04		/* we are replaying intent log */
 
 extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
     int, int *, pathname_t *);
@@ -60,7 +59,7 @@ extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
 extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *,
     pathname_t *);
 extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
-    uint_t, znode_t **, int, zfs_acl_t *, zfs_fuid_info_t **);
+    uint_t, znode_t **, int, zfs_acl_ids_t *);
 extern void zfs_rmnode(znode_t *);
 extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
 extern boolean_t zfs_dirempty(znode_t *);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
index 8d73b41..c035707 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_FS_ZFS_FUID_H
 #define	_SYS_FS_ZFS_FUID_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #ifdef _KERNEL
 #include <sys/kidmap.h>
@@ -51,11 +49,11 @@ typedef enum {
  * Estimate space needed for one more fuid table entry.
  * for now assume its current size + 1K
  */
-#define	FUID_SIZE_ESTIMATE(z) (z->z_fuid_size + (SPA_MINBLOCKSIZE << 1))
+#define	FUID_SIZE_ESTIMATE(z) ((z)->z_fuid_size + (SPA_MINBLOCKSIZE << 1))
 
-#define	FUID_INDEX(x)	(x >> 32)
-#define	FUID_RID(x)	(x & 0xffffffff)
-#define	FUID_ENCODE(idx, rid) ((idx << 32) | rid)
+#define	FUID_INDEX(x)	((x) >> 32)
+#define	FUID_RID(x)	((x) & 0xffffffff)
+#define	FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid))
 /*
  * FUIDs cause problems for the intent log
  * we need to replay the creation of the FUID,
@@ -104,17 +102,23 @@ struct znode;
 extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t);
 extern void zfs_fuid_destroy(zfsvfs_t *);
 extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t,
-    dmu_tx_t *, cred_t *, zfs_fuid_info_t **);
+    cred_t *, zfs_fuid_info_t **);
 extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t,
-    dmu_tx_t *, zfs_fuid_info_t **);
-extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, uid_t *uid,
-    uid_t *gid);
+    zfs_fuid_info_t **);
+extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr,
+    uid_t *uid, uid_t *gid);
 extern zfs_fuid_info_t *zfs_fuid_info_alloc(void);
-extern void zfs_fuid_info_free();
+extern void zfs_fuid_info_free(zfs_fuid_info_t *);
 extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *);
+void zfs_fuid_sync(zfsvfs_t *, dmu_tx_t *);
+extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain,
+    char **retdomain, boolean_t addok);
+extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx);
+extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
 #endif
 
 char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t);
+void zfs_fuid_avl_tree_create(avl_tree_t *, avl_tree_t *);
 uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *);
 void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *);
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
index 05a21c8..15a4a76 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -169,6 +169,13 @@ typedef struct zfs_cmd {
 	zinject_record_t zc_inject_record;
 } zfs_cmd_t;
 
+typedef struct zfs_useracct {
+	char zu_domain[256];
+	uid_t zu_rid;
+	uint32_t zu_pad;
+	uint64_t zu_space;
+} zfs_useracct_t;
+
 #define	ZVOL_MAX_MINOR	(1 << 16)
 #define	ZFS_MIN_MINOR	(ZVOL_MAX_MINOR + 1)
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
index 8d53c02..163a800 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_FS_ZFS_VFSOPS_H
 #define	_SYS_FS_ZFS_VFSOPS_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/list.h>
 #include <sys/vfs.h>
 #include <sys/zil.h>
@@ -47,13 +45,13 @@ struct zfsvfs {
 	uint64_t	z_root;		/* id of root znode */
 	uint64_t	z_unlinkedobj;	/* id of unlinked zapobj */
 	uint64_t	z_max_blksz;	/* maximum block size for files */
-	uint64_t	z_assign;	/* TXG_NOWAIT or set by zil_replay() */
 	uint64_t	z_fuid_obj;	/* fuid table object number */
 	uint64_t	z_fuid_size;	/* fuid table size */
 	avl_tree_t	z_fuid_idx;	/* fuid tree keyed by index */
 	avl_tree_t	z_fuid_domain;	/* fuid tree keyed by domain */
 	krwlock_t	z_fuid_lock;	/* fuid lock */
 	boolean_t	z_fuid_loaded;	/* fuid tables are loaded */
+	boolean_t	z_fuid_dirty;   /* need to sync fuid table ? */
 	struct zfs_fuid_info	*z_fuid_replay; /* fuid info for replay */
 	zilog_t		*z_log;		/* intent log pointer */
 	uint_t		z_acl_mode;	/* acl chmod/mode behavior */
@@ -72,8 +70,13 @@ struct zfsvfs {
 	boolean_t	z_issnap;	/* true if this is a snapshot */
 	boolean_t	z_vscan;	/* virus scan on/off */
 	boolean_t	z_use_fuids;	/* version allows fuids */
-	kmutex_t	z_online_recv_lock; /* recv in prog grabs as WRITER */
+	boolean_t	z_replay;	/* set during ZIL replay */
+	kmutex_t	z_online_recv_lock; /* held while recv in progress */
 	uint64_t	z_version;	/* ZPL version */
+	uint64_t	z_shares_dir;	/* hidden shares dir */
+	kmutex_t	z_lock;
+	uint64_t	z_userquota_obj;
+	uint64_t	z_groupquota_obj;
 #define	ZFS_OBJ_MTX_SZ	64
 	kmutex_t	z_hold_mtx[ZFS_OBJ_MTX_SZ];	/* znode hold locks */
 };
@@ -131,6 +134,17 @@ extern int zfs_super_owner;
 
 extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode);
 extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode);
+extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+    const char *domain, uint64_t rid, uint64_t *valuep);
+extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+    uint64_t *cookiep, void *vbuf, uint64_t *bufsizep);
+extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+    const char *domain, uint64_t rid, uint64_t quota);
+extern boolean_t zfs_usergroup_overquota(zfsvfs_t *zfsvfs,
+    boolean_t isgroup, uint64_t fuid);
+extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers);
+extern int zfsvfs_create(const char *name, int mode, zfsvfs_t **zvp);
+extern void zfsvfs_free(zfsvfs_t *zfsvfs);
 
 #ifdef	__cplusplus
 }
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
index f91bc90..47072fb 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -100,6 +100,7 @@ extern "C" {
 #define	ZFS_ROOT_OBJ		"ROOT"
 #define	ZPL_VERSION_STR		"VERSION"
 #define	ZFS_FUID_TABLES		"FUID"
+#define	ZFS_SHARES_DIR		"SHARES"
 
 #define	ZFS_MAX_BLOCKSIZE	(SPA_MAXBLOCKSIZE)
 
@@ -186,7 +187,6 @@ typedef struct znode {
 	vnode_t		*z_vnode;
 	uint64_t	z_id;		/* object ID for this znode */
 	kmutex_t	z_lock;		/* znode modification lock */
-	krwlock_t	z_map_lock;	/* page map lock */
 	krwlock_t	z_parent_lock;	/* parent lock for directories */
 	krwlock_t	z_name_lock;	/* "master" lock for dirent locks */
 	zfs_dirlock_t	*z_dirlocks;	/* directory entry lock list */
@@ -338,7 +338,6 @@ extern void	zfs_remove_op_tables();
 extern int	zfs_create_op_tables();
 extern dev_t	zfs_cmpldev(uint64_t);
 extern int	zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value);
-extern int	zfs_set_version(const char *name, uint64_t newvers);
 extern int	zfs_get_stats(objset_t *os, nvlist_t *nv);
 extern void	zfs_znode_dmu_fini(znode_t *);
 
@@ -367,6 +366,7 @@ extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
 #endif
 extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap);
 extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
+extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
 
 extern zil_get_data_t zfs_get_data;
 extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
index 5212aaf..e992f6a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
@@ -139,7 +139,8 @@ typedef enum zil_create {
 #define	TX_MKDIR_ACL		17	/* mkdir with ACL */
 #define	TX_MKDIR_ATTR		18	/* mkdir with attr */
 #define	TX_MKDIR_ACL_ATTR	19	/* mkdir with ACL + attrs */
-#define	TX_MAX_TYPE		20	/* Max transaction type */
+#define	TX_WRITE2		20	/* dmu_sync EALREADY write */
+#define	TX_MAX_TYPE		21	/* Max transaction type */
 
 /*
  * The transactions for mkdir, symlink, remove, rmdir, link, and rename
@@ -341,7 +342,6 @@ typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
 typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
     uint64_t txg);
 typedef int zil_replay_func_t();
-typedef void zil_replay_cleaner_t();
 typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
 
 extern uint64_t	zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
@@ -356,9 +356,8 @@ extern void	zil_free(zilog_t *zilog);
 extern zilog_t	*zil_open(objset_t *os, zil_get_data_t *get_data);
 extern void	zil_close(zilog_t *zilog);
 
-extern void	zil_replay(objset_t *os, void *arg, uint64_t *txgp,
-    zil_replay_func_t *replay_func[TX_MAX_TYPE],
-    zil_replay_cleaner_t *replay_cleaner);
+extern void	zil_replay(objset_t *os, void *arg,
+    zil_replay_func_t *replay_func[TX_MAX_TYPE]);
 extern void	zil_destroy(zilog_t *zilog, boolean_t keep_first);
 extern void	zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
 
@@ -378,6 +377,7 @@ extern int	zil_suspend(zilog_t *zilog);
 extern void	zil_resume(zilog_t *zilog);
 
 extern void	zil_add_block(zilog_t *zilog, blkptr_t *bp);
+extern void	zil_get_replay_data(zilog_t *zilog, lr_write_t *lr);
 
 extern int zil_disable;
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
index 0fc800b..3f25829 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_ZIL_IMPL_H
 #define	_SYS_ZIL_IMPL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/zil.h>
 #include <sys/dmu_objset.h>
 
@@ -74,13 +72,14 @@ struct zilog {
 	uint64_t	zl_commit_seq;	/* committed upto this number */
 	uint64_t	zl_lr_seq;	/* log record sequence number */
 	uint64_t	zl_destroy_txg;	/* txg of last zil_destroy() */
-	uint64_t	zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */
+	uint64_t	zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
+	uint64_t	zl_replaying_seq; /* current replay seq number */
 	uint32_t	zl_suspend;	/* log suspend count */
 	kcondvar_t	zl_cv_writer;	/* log writer thread completion */
 	kcondvar_t	zl_cv_suspend;	/* log suspend completion */
 	uint8_t		zl_suspending;	/* log is currently suspending */
 	uint8_t		zl_keep_first;	/* keep first log block in destroy */
-	uint8_t		zl_stop_replay;	/* don't replay any further */
+	uint8_t		zl_replay;	/* replaying records while set */
 	uint8_t		zl_stop_sync;	/* for debugging */
 	uint8_t		zl_writer;	/* boolean: write setup in progress */
 	uint8_t		zl_log_error;	/* boolean: log write error */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
index 8c8efcd..d7c0feb 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -132,12 +132,15 @@ enum zio_compress {
 #define	ZIO_FLAG_IO_RETRY		0x00400
 #define	ZIO_FLAG_IO_REWRITE		0x00800
 
-#define	ZIO_FLAG_PROBE			0x01000
+#define	ZIO_FLAG_SELF_HEAL		0x01000
 #define	ZIO_FLAG_RESILVER		0x02000
 #define	ZIO_FLAG_SCRUB			0x04000
 #define	ZIO_FLAG_SCRUB_THREAD		0x08000
 
-#define	ZIO_FLAG_GANG_CHILD		0x10000
+#define	ZIO_FLAG_PROBE			0x10000
+#define	ZIO_FLAG_GANG_CHILD		0x20000
+#define	ZIO_FLAG_RAW			0x40000
+#define	ZIO_FLAG_GODFATHER		0x80000
 
 #define	ZIO_FLAG_GANG_INHERIT		\
 	(ZIO_FLAG_CANFAIL |		\
@@ -146,6 +149,7 @@ enum zio_compress {
 	ZIO_FLAG_DONT_RETRY |		\
 	ZIO_FLAG_DONT_CACHE |		\
 	ZIO_FLAG_DONT_AGGREGATE |	\
+	ZIO_FLAG_SELF_HEAL |		\
 	ZIO_FLAG_RESILVER |		\
 	ZIO_FLAG_SCRUB |		\
 	ZIO_FLAG_SCRUB_THREAD)
@@ -156,6 +160,14 @@ enum zio_compress {
 	ZIO_FLAG_IO_RETRY |		\
 	ZIO_FLAG_PROBE)
 
+#define	ZIO_FLAG_AGG_INHERIT		\
+	(ZIO_FLAG_DONT_AGGREGATE |	\
+	ZIO_FLAG_IO_REPAIR |		\
+	ZIO_FLAG_SELF_HEAL |		\
+	ZIO_FLAG_RESILVER |		\
+	ZIO_FLAG_SCRUB |		\
+	ZIO_FLAG_SCRUB_THREAD)
+
 #define	ZIO_PIPELINE_CONTINUE		0x100
 #define	ZIO_PIPELINE_STOP		0x101
 
@@ -254,6 +266,13 @@ typedef int zio_pipe_stage_t(zio_t *zio);
 #define	ZIO_REEXECUTE_NOW	0x01
 #define	ZIO_REEXECUTE_SUSPEND	0x02
 
+typedef struct zio_link {
+	zio_t		*zl_parent;
+	zio_t		*zl_child;
+	list_node_t	zl_parent_node;
+	list_node_t	zl_child_node;
+} zio_link_t;
+
 struct zio {
 	/* Core information about this I/O */
 	zbookmark_t	io_bookmark;
@@ -263,15 +282,14 @@ struct zio {
 	int		io_cmd;
 	uint8_t		io_priority;
 	uint8_t		io_reexecute;
-	uint8_t		io_async_root;
+	uint8_t		io_state[ZIO_WAIT_TYPES];
 	uint64_t	io_txg;
 	spa_t		*io_spa;
 	blkptr_t	*io_bp;
 	blkptr_t	io_bp_copy;
-	zio_t		*io_parent;
-	zio_t		*io_child;
-	zio_t		*io_sibling_prev;
-	zio_t		*io_sibling_next;
+	list_t		io_parent_list;
+	list_t		io_child_list;
+	zio_link_t	*io_walk_link;
 	zio_t		*io_logical;
 	zio_transform_t *io_transform_stack;
 
@@ -294,8 +312,6 @@ struct zio {
 	avl_node_t	io_offset_node;
 	avl_node_t	io_deadline_node;
 	avl_tree_t	*io_vdev_tree;
-	zio_t		*io_delegate_list;
-	zio_t		*io_delegate_next;
 
 	/* Internal pipeline state */
 	int		io_flags;
@@ -308,6 +324,7 @@ struct zio {
 	int		io_child_error[ZIO_CHILD_TYPES];
 	uint64_t	io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
 	uint64_t	*io_stall;
+	zio_t		*io_gang_leader;
 	zio_gang_node_t	*io_gang_tree;
 	void		*io_executor;
 	void		*io_waiter;
@@ -323,7 +340,7 @@ struct zio {
 #endif
 };
 
-extern zio_t *zio_null(zio_t *pio, spa_t *spa,
+extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
     zio_done_func_t *done, void *private, int flags);
 
 extern zio_t *zio_root(spa_t *spa,
@@ -371,6 +388,11 @@ extern void zio_nowait(zio_t *zio);
 extern void zio_execute(zio_t *zio);
 extern void zio_interrupt(zio_t *zio);
 
+extern zio_t *zio_walk_parents(zio_t *cio);
+extern zio_t *zio_walk_children(zio_t *pio);
+extern zio_t *zio_unique_parent(zio_t *cio);
+extern void zio_add_child(zio_t *pio, zio_t *cio);
+
 extern void *zio_buf_alloc(size_t size);
 extern void zio_buf_free(void *buf, size_t size);
 extern void *zio_data_buf_alloc(size_t size);
@@ -397,7 +419,7 @@ extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
 extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
 
 extern void zio_suspend(spa_t *spa, zio_t *zio);
-extern void zio_resume(spa_t *spa);
+extern int zio_resume(spa_t *spa);
 extern void zio_resume_wait(spa_t *spa);
 
 /*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
index daab409..befc8b3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -327,8 +327,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
-	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
-	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+	for (int t = 0; t < DTL_TYPES; t++) {
+		space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
+		    &vd->vdev_dtl_lock);
+	}
 	txg_list_create(&vd->vdev_ms_list,
 	    offsetof(struct metaslab, ms_txg_node));
 	txg_list_create(&vd->vdev_dtl_list,
@@ -444,6 +446,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
 	    &vd->vdev_physpath) == 0)
 		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
+		vd->vdev_fru = spa_strdup(vd->vdev_fru);
 
 	/*
 	 * Set the whole_disk property.  If it's not specified, leave the value
@@ -457,9 +461,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 	 * Look for the 'not present' flag.  This will only be set if the device
 	 * was not present at the time of import.
 	 */
-	if (!spa->spa_import_faulted)
-		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
-		    &vd->vdev_not_present);
+	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+	    &vd->vdev_not_present);
 
 	/*
 	 * Get the alignment requirement.
@@ -485,7 +488,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) {
 		if (alloctype == VDEV_ALLOC_LOAD) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
-			    &vd->vdev_dtl.smo_object);
+			    &vd->vdev_dtl_smo.smo_object);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 			    &vd->vdev_unspare);
 		}
@@ -569,6 +572,8 @@ vdev_free(vdev_t *vd)
 		spa_strfree(vd->vdev_devid);
 	if (vd->vdev_physpath)
 		spa_strfree(vd->vdev_physpath);
+	if (vd->vdev_fru)
+		spa_strfree(vd->vdev_fru);
 
 	if (vd->vdev_isspare)
 		spa_spare_remove(vd);
@@ -577,12 +582,14 @@ vdev_free(vdev_t *vd)
 
 	txg_list_destroy(&vd->vdev_ms_list);
 	txg_list_destroy(&vd->vdev_dtl_list);
+
 	mutex_enter(&vd->vdev_dtl_lock);
-	space_map_unload(&vd->vdev_dtl_map);
-	space_map_destroy(&vd->vdev_dtl_map);
-	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
-	space_map_destroy(&vd->vdev_dtl_scrub);
+	for (int t = 0; t < DTL_TYPES; t++) {
+		space_map_unload(&vd->vdev_dtl[t]);
+		space_map_destroy(&vd->vdev_dtl[t]);
+	}
 	mutex_exit(&vd->vdev_dtl_lock);
+
 	mutex_destroy(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_stat_lock);
 	mutex_destroy(&vd->vdev_probe_lock);
@@ -720,14 +727,18 @@ vdev_remove_parent(vdev_t *cvd)
 
 	vdev_remove_child(mvd, cvd);
 	vdev_remove_child(pvd, mvd);
+
 	/*
 	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
 	 * Otherwise, we could have detached an offline device, and when we
 	 * go to import the pool we'll think we have two top-level vdevs,
 	 * instead of a different version of the same top-level vdev.
 	 */
-	if (mvd->vdev_top == mvd)
-		cvd->vdev_guid = cvd->vdev_guid_sum = mvd->vdev_guid;
+	if (mvd->vdev_top == mvd) {
+		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
+		cvd->vdev_guid += guid_delta;
+		cvd->vdev_guid_sum += guid_delta;
+	}
 	cvd->vdev_id = mvd->vdev_id;
 	vdev_add_child(pvd, cvd);
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
@@ -779,7 +790,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 		if (txg == 0) {
 			uint64_t object = 0;
 			error = dmu_read(mos, vd->vdev_ms_array,
-			    m * sizeof (uint64_t), sizeof (uint64_t), &object);
+			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
+			    DMU_READ_PREFETCH);
 			if (error)
 				return (error);
 			if (object != 0) {
@@ -819,22 +831,22 @@ typedef struct vdev_probe_stats {
 	boolean_t	vps_readable;
 	boolean_t	vps_writeable;
 	int		vps_flags;
-	zio_t		*vps_root;
-	vdev_t		*vps_vd;
 } vdev_probe_stats_t;
 
 static void
 vdev_probe_done(zio_t *zio)
 {
+	spa_t *spa = zio->io_spa;
+	vdev_t *vd = zio->io_vd;
 	vdev_probe_stats_t *vps = zio->io_private;
-	vdev_t *vd = vps->vps_vd;
+
+	ASSERT(vd->vdev_probe_zio != NULL);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
-		ASSERT(zio->io_vd == vd);
 		if (zio->io_error == 0)
 			vps->vps_readable = 1;
-		if (zio->io_error == 0 && (spa_mode & FWRITE)) {
-			zio_nowait(zio_write_phys(vps->vps_root, vd,
+		if (zio->io_error == 0 && spa_writeable(spa)) {
+			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
 			    zio->io_offset, zio->io_size, zio->io_data,
 			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
@@ -842,26 +854,34 @@ vdev_probe_done(zio_t *zio)
 			zio_buf_free(zio->io_data, zio->io_size);
 		}
 	} else if (zio->io_type == ZIO_TYPE_WRITE) {
-		ASSERT(zio->io_vd == vd);
 		if (zio->io_error == 0)
 			vps->vps_writeable = 1;
 		zio_buf_free(zio->io_data, zio->io_size);
 	} else if (zio->io_type == ZIO_TYPE_NULL) {
-		ASSERT(zio->io_vd == NULL);
-		ASSERT(zio == vps->vps_root);
+		zio_t *pio;
 
 		vd->vdev_cant_read |= !vps->vps_readable;
 		vd->vdev_cant_write |= !vps->vps_writeable;
 
 		if (vdev_readable(vd) &&
-		    (vdev_writeable(vd) || !(spa_mode & FWRITE))) {
+		    (vdev_writeable(vd) || !spa_writeable(spa))) {
 			zio->io_error = 0;
 		} else {
 			ASSERT(zio->io_error != 0);
 			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
-			    zio->io_spa, vd, NULL, 0, 0);
+			    spa, vd, NULL, 0, 0);
 			zio->io_error = ENXIO;
 		}
+
+		mutex_enter(&vd->vdev_probe_lock);
+		ASSERT(vd->vdev_probe_zio == zio);
+		vd->vdev_probe_zio = NULL;
+		mutex_exit(&vd->vdev_probe_lock);
+
+		while ((pio = zio_walk_parents(zio)) != NULL)
+			if (!vdev_accessible(vd, pio))
+				pio->io_error = ENXIO;
+
 		kmem_free(vps, sizeof (*vps));
 	}
 }
@@ -872,53 +892,90 @@ vdev_probe_done(zio_t *zio)
  * but the first (which we leave alone in case it contains a VTOC).
  */
 zio_t *
-vdev_probe(vdev_t *vd, zio_t *pio)
+vdev_probe(vdev_t *vd, zio_t *zio)
 {
 	spa_t *spa = vd->vdev_spa;
-	vdev_probe_stats_t *vps;
-	zio_t *zio;
+	vdev_probe_stats_t *vps = NULL;
+	zio_t *pio;
 
-	vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
-	vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
-	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_DONT_RETRY;
+	/*
+	 * Don't probe the probe.
+	 */
+	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
+		return (NULL);
 
-	if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
-		/*
-		 * vdev_cant_read and vdev_cant_write can only transition
-		 * from TRUE to FALSE when we have the SCL_ZIO lock as writer;
-		 * otherwise they can only transition from FALSE to TRUE.
-		 * This ensures that any zio looking at these values can
-		 * assume that failures persist for the life of the I/O.
-		 * That's important because when a device has intermittent
-		 * connectivity problems, we want to ensure that they're
-		 * ascribed to the device (ENXIO) and not the zio (EIO).
-		 *
-		 * Since we hold SCL_ZIO as writer here, clear both values
-		 * so the probe can reevaluate from first principles.
-		 */
-		vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
-		vd->vdev_cant_read = B_FALSE;
-		vd->vdev_cant_write = B_FALSE;
+	/*
+	 * To prevent 'probe storms' when a device fails, we create
+	 * just one probe i/o at a time.  All zios that want to probe
+	 * this vdev will become parents of the probe io.
+	 */
+	mutex_enter(&vd->vdev_probe_lock);
+
+	if ((pio = vd->vdev_probe_zio) == NULL) {
+		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
+
+		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
+		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
+		    ZIO_FLAG_DONT_RETRY;
+
+		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
+			/*
+			 * vdev_cant_read and vdev_cant_write can only
+			 * transition from TRUE to FALSE when we have the
+			 * SCL_ZIO lock as writer; otherwise they can only
+			 * transition from FALSE to TRUE.  This ensures that
+			 * any zio looking at these values can assume that
+			 * failures persist for the life of the I/O.  That's
+			 * important because when a device has intermittent
+			 * connectivity problems, we want to ensure that
+			 * they're ascribed to the device (ENXIO) and not
+			 * the zio (EIO).
+			 *
+			 * Since we hold SCL_ZIO as writer here, clear both
+			 * values so the probe can reevaluate from first
+			 * principles.
+			 */
+			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
+			vd->vdev_cant_read = B_FALSE;
+			vd->vdev_cant_write = B_FALSE;
+		}
+
+		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
+		    vdev_probe_done, vps,
+		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
+
+		if (zio != NULL) {
+			vd->vdev_probe_wanted = B_TRUE;
+			spa_async_request(spa, SPA_ASYNC_PROBE);
+		}
 	}
 
-	ASSERT(vd->vdev_ops->vdev_op_leaf);
+	if (zio != NULL)
+		zio_add_child(zio, pio);
 
-	zio = zio_null(pio, spa, vdev_probe_done, vps, vps->vps_flags);
+	mutex_exit(&vd->vdev_probe_lock);
 
-	vps->vps_root = zio;
-	vps->vps_vd = vd;
+	if (vps == NULL) {
+		ASSERT(zio != NULL);
+		return (NULL);
+	}
 
 	for (int l = 1; l < VDEV_LABELS; l++) {
-		zio_nowait(zio_read_phys(zio, vd,
+		zio_nowait(zio_read_phys(pio, vd,
 		    vdev_label_offset(vd->vdev_psize, l,
-		    offsetof(vdev_label_t, vl_pad)),
-		    VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE),
+		    offsetof(vdev_label_t, vl_pad2)),
+		    VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
 		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
 	}
 
-	return (zio);
+	if (zio == NULL)
+		return (pio);
+
+	zio_nowait(pio);
+	return (NULL);
 }
 
 /*
@@ -927,12 +984,15 @@ vdev_probe(vdev_t *vd, zio_t *pio)
 int
 vdev_open(vdev_t *vd)
 {
+	spa_t *spa = vd->vdev_spa;
 	int error;
 	int c;
 	uint64_t osize = 0;
 	uint64_t asize, psize;
 	uint64_t ashift = 0;
 
+	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
 	    vd->vdev_state == VDEV_STATE_OFFLINE);
@@ -1066,16 +1126,12 @@ vdev_open(vdev_t *vd)
 
 	/*
 	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
-	 * resilver.  But don't do this if we are doing a reopen for a
-	 * scrub, since this would just restart the scrub we are already
-	 * doing.
+	 * resilver.  But don't do this if we are doing a reopen for a scrub,
+	 * since this would just restart the scrub we are already doing.
 	 */
-	if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) {
-		mutex_enter(&vd->vdev_dtl_lock);
-		if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd))
-			spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER);
-		mutex_exit(&vd->vdev_dtl_lock);
-	}
+	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
+	    vdev_resilver_needed(vd, NULL, NULL))
+		spa_async_request(spa, SPA_ASYNC_RESILVER);
 
 	return (0);
 }
@@ -1154,7 +1210,12 @@ vdev_validate(vdev_t *vd)
 
 		nvlist_free(label);
 
-		if (spa->spa_load_state == SPA_LOAD_OPEN &&
+		/*
+		 * If spa->spa_load_verbatim is true, no need to check the
+		 * state of the pool.
+		 */
+		if (!spa->spa_load_verbatim &&
+		    spa->spa_load_state == SPA_LOAD_OPEN &&
 		    state != POOL_STATE_ACTIVE)
 			return (EBADF);
 
@@ -1176,6 +1237,10 @@ vdev_validate(vdev_t *vd)
 void
 vdev_close(vdev_t *vd)
 {
+	spa_t *spa = vd->vdev_spa;
+
+	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
 	vd->vdev_ops->vdev_op_close(vd);
 
 	vdev_cache_purge(vd);
@@ -1212,6 +1277,7 @@ vdev_reopen(vdev_t *vd)
 	if (vd->vdev_aux) {
 		(void) vdev_validate_aux(vd);
 		if (vdev_readable(vd) && vdev_writeable(vd) &&
+		    vd->vdev_aux == &spa->spa_l2cache &&
 		    !l2arc_vdev_present(vd)) {
 			uint64_t size = vdev_get_rsize(vd);
 			l2arc_add_vdev(spa, vd,
@@ -1294,34 +1360,88 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
 	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
 }
 
+/*
+ * DTLs.
+ *
+ * A vdev's DTL (dirty time log) is the set of transaction groups for which
+ * the vdev has less than perfect replication.  There are three kinds of DTL:
+ *
+ * DTL_MISSING: txgs for which the vdev has no valid copies of the data
+ *
+ * DTL_PARTIAL: txgs for which data is available, but not fully replicated
+ *
+ * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
+ *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
+ *	txgs that was scrubbed.
+ *
+ * DTL_OUTAGE: txgs which cannot currently be read, whether due to
+ *	persistent errors or just some device being offline.
+ *	Unlike the other three, the DTL_OUTAGE map is not generally
+ *	maintained; it's only computed when needed, typically to
+ *	determine whether a device can be detached.
+ *
+ * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
+ * either has the data or it doesn't.
+ *
+ * For interior vdevs such as mirror and RAID-Z the picture is more complex.
+ * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
+ * if any child is less than fully replicated, then so is its parent.
+ * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
+ * comprising only those txgs which appear in 'maxfaults' or more children;
+ * those are the txgs we don't have enough replication to read.  For example,
+ * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
+ * thus, its DTL_MISSING consists of the set of txgs that appear in more than
+ * two child DTL_MISSING maps.
+ *
+ * It should be clear from the above that to compute the DTLs and outage maps
+ * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
+ * Therefore, that is all we keep on disk.  When loading the pool, or after
+ * a configuration change, we generate all other DTLs from first principles.
+ */
 void
-vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
+vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
+	space_map_t *sm = &vd->vdev_dtl[t];
+
+	ASSERT(t < DTL_TYPES);
+	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+
 	mutex_enter(sm->sm_lock);
 	if (!space_map_contains(sm, txg, size))
 		space_map_add(sm, txg, size);
 	mutex_exit(sm->sm_lock);
 }
 
-int
-vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
+boolean_t
+vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
-	int dirty;
+	space_map_t *sm = &vd->vdev_dtl[t];
+	boolean_t dirty = B_FALSE;
 
-	/*
-	 * Quick test without the lock -- covers the common case that
-	 * there are no dirty time segments.
-	 */
-	if (sm->sm_space == 0)
-		return (0);
+	ASSERT(t < DTL_TYPES);
+	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
 	mutex_enter(sm->sm_lock);
-	dirty = space_map_contains(sm, txg, size);
+	if (sm->sm_space != 0)
+		dirty = space_map_contains(sm, txg, size);
 	mutex_exit(sm->sm_lock);
 
 	return (dirty);
 }
 
+boolean_t
+vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
+{
+	space_map_t *sm = &vd->vdev_dtl[t];
+	boolean_t empty;
+
+	mutex_enter(sm->sm_lock);
+	empty = (sm->sm_space == 0);
+	mutex_exit(sm->sm_lock);
+
+	return (empty);
+}
+
 /*
  * Reassess DTLs after a config change or scrub completion.
  */
@@ -1329,11 +1449,19 @@ void
 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
 {
 	spa_t *spa = vd->vdev_spa;
-	int c;
+	avl_tree_t reftree;
+	int minref;
 
-	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
-	if (vd->vdev_children == 0) {
+	for (int c = 0; c < vd->vdev_children; c++)
+		vdev_dtl_reassess(vd->vdev_child[c], txg,
+		    scrub_txg, scrub_done);
+
+	if (vd == spa->spa_root_vdev)
+		return;
+
+	if (vd->vdev_ops->vdev_op_leaf) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		if (scrub_txg != 0 &&
 		    (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
@@ -1344,12 +1472,38 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
 			 * will be valid, so excise the old region and
 			 * fold in the scrub dtl.  Otherwise, leave the
 			 * dtl as-is if there was an error.
+			 *
+			 * There's little trick here: to excise the beginning
+			 * of the DTL_MISSING map, we put it into a reference
+			 * tree and then add a segment with refcnt -1 that
+			 * covers the range [0, scrub_txg).  This means
+			 * that each txg in that range has refcnt -1 or 0.
+			 * We then add DTL_SCRUB with a refcnt of 2, so that
+			 * entries in the range [0, scrub_txg) will have a
+			 * positive refcnt -- either 1 or 2.  We then convert
+			 * the reference tree into the new DTL_MISSING map.
 			 */
-			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
-			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
+			space_map_ref_create(&reftree);
+			space_map_ref_add_map(&reftree,
+			    &vd->vdev_dtl[DTL_MISSING], 1);
+			space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
+			space_map_ref_add_map(&reftree,
+			    &vd->vdev_dtl[DTL_SCRUB], 2);
+			space_map_ref_generate_map(&reftree,
+			    &vd->vdev_dtl[DTL_MISSING], 1);
+			space_map_ref_destroy(&reftree);
 		}
+		space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
+		space_map_walk(&vd->vdev_dtl[DTL_MISSING],
+		    space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
 		if (scrub_done)
-			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+			space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
+		space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
+		if (!vdev_readable(vd))
+			space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
+		else
+			space_map_walk(&vd->vdev_dtl[DTL_MISSING],
+			    space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		if (txg != 0)
@@ -1357,35 +1511,36 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
 		return;
 	}
 
-	/*
-	 * Make sure the DTLs are always correct under the scrub lock.
-	 */
-	if (vd == spa->spa_root_vdev)
-		mutex_enter(&spa->spa_scrub_lock);
-
 	mutex_enter(&vd->vdev_dtl_lock);
-	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
-	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
-	mutex_exit(&vd->vdev_dtl_lock);
-
-	for (c = 0; c < vd->vdev_children; c++) {
-		vdev_t *cvd = vd->vdev_child[c];
-		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
-		mutex_enter(&vd->vdev_dtl_lock);
-		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
-		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
-		mutex_exit(&vd->vdev_dtl_lock);
+	for (int t = 0; t < DTL_TYPES; t++) {
+		/* account for child's outage in parent's missing map */
+		int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
+		if (t == DTL_SCRUB)
+			continue;			/* leaf vdevs only */
+		if (t == DTL_PARTIAL)
+			minref = 1;			/* i.e. non-zero */
+		else if (vd->vdev_nparity != 0)
+			minref = vd->vdev_nparity + 1;	/* RAID-Z */
+		else
+			minref = vd->vdev_children;	/* any kind of mirror */
+		space_map_ref_create(&reftree);
+		for (int c = 0; c < vd->vdev_children; c++) {
+			vdev_t *cvd = vd->vdev_child[c];
+			mutex_enter(&cvd->vdev_dtl_lock);
+			space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
+			mutex_exit(&cvd->vdev_dtl_lock);
+		}
+		space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
+		space_map_ref_destroy(&reftree);
 	}
-
-	if (vd == spa->spa_root_vdev)
-		mutex_exit(&spa->spa_scrub_lock);
+	mutex_exit(&vd->vdev_dtl_lock);
 }
 
 static int
 vdev_dtl_load(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
-	space_map_obj_t *smo = &vd->vdev_dtl;
+	space_map_obj_t *smo = &vd->vdev_dtl_smo;
 	objset_t *mos = spa->spa_meta_objset;
 	dmu_buf_t *db;
 	int error;
@@ -1403,7 +1558,8 @@ vdev_dtl_load(vdev_t *vd)
 	dmu_buf_rele(db, FTAG);
 
 	mutex_enter(&vd->vdev_dtl_lock);
-	error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos);
+	error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
+	    NULL, SM_ALLOC, smo, mos);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (error);
@@ -1413,8 +1569,8 @@ void
 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
-	space_map_obj_t *smo = &vd->vdev_dtl;
-	space_map_t *sm = &vd->vdev_dtl_map;
+	space_map_obj_t *smo = &vd->vdev_dtl_smo;
+	space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
 	objset_t *mos = spa->spa_meta_objset;
 	space_map_t smsync;
 	kmutex_t smlock;
@@ -1472,6 +1628,37 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 }
 
 /*
+ * Determine whether the specified vdev can be offlined/detached/removed
+ * without losing data.
+ */
+boolean_t
+vdev_dtl_required(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+	vdev_t *tvd = vd->vdev_top;
+	uint8_t cant_read = vd->vdev_cant_read;
+	boolean_t required;
+
+	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+	if (vd == spa->spa_root_vdev || vd == tvd)
+		return (B_TRUE);
+
+	/*
+	 * Temporarily mark the device as unreadable, and then determine
+	 * whether this results in any DTL outages in the top-level vdev.
+	 * If not, we can safely offline/detach/remove the device.
+	 */
+	vd->vdev_cant_read = B_TRUE;
+	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+	required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
+	vd->vdev_cant_read = cant_read;
+	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+
+	return (required);
+}
+
+/*
  * Determine if resilver is needed, and if so the txg range.
  */
 boolean_t
@@ -1483,19 +1670,19 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
 
 	if (vd->vdev_children == 0) {
 		mutex_enter(&vd->vdev_dtl_lock);
-		if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) {
+		if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
+		    vdev_writeable(vd)) {
 			space_seg_t *ss;
 
-			ss = avl_first(&vd->vdev_dtl_map.sm_root);
+			ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
 			thismin = ss->ss_start - 1;
-			ss = avl_last(&vd->vdev_dtl_map.sm_root);
+			ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
 			thismax = ss->ss_end;
 			needed = B_TRUE;
 		}
 		mutex_exit(&vd->vdev_dtl_lock);
 	} else {
-		int c;
-		for (c = 0; c < vd->vdev_children; c++) {
+		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			uint64_t cmin, cmax;
 
@@ -1517,12 +1704,10 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
 void
 vdev_load(vdev_t *vd)
 {
-	int c;
-
 	/*
 	 * Recursively load all children.
 	 */
-	for (c = 0; c < vd->vdev_children; c++)
+	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_load(vd->vdev_child[c]);
 
 	/*
@@ -1742,11 +1927,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 
-	(void) spa_vdev_state_exit(spa, vd, 0);
-
-	VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
-
-	return (0);
+	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 int
@@ -1767,13 +1948,10 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
 	 */
 	if (!vd->vdev_offline) {
 		/*
-		 * If this device's top-level vdev has a non-empty DTL,
-		 * don't allow the device to be offlined.
-		 *
-		 * XXX -- make this more precise by allowing the offline
-		 * as long as the remaining devices don't have any DTL holes.
+		 * If this device has the only valid copy of some data,
+		 * don't allow it to be offlined.
 		 */
-		if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
+		if (vd->vdev_aux == NULL && vdev_dtl_required(vd))
 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
 
 		/*
@@ -1783,7 +1961,7 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
 		 */
 		vd->vdev_offline = B_TRUE;
 		vdev_reopen(vd->vdev_top);
-		if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
+		if (vd->vdev_aux == NULL && vdev_is_dead(vd->vdev_top)) {
 			vd->vdev_offline = B_FALSE;
 			vdev_reopen(vd->vdev_top);
 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
@@ -1863,13 +2041,17 @@ vdev_writeable(vdev_t *vd)
 boolean_t
 vdev_allocatable(vdev_t *vd)
 {
+	uint64_t state = vd->vdev_state;
+
 	/*
-	 * We currently allow allocations from vdevs which maybe in the
+	 * We currently allow allocations from vdevs which may be in the
 	 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
 	 * fails to reopen then we'll catch it later when we're holding
-	 * the proper locks.
+	 * the proper locks.  Note that we have to get the vdev state
+	 * in a local variable because although it changes atomically,
+	 * we're asking two separate questions about it.
 	 */
-	return (!(vdev_is_dead(vd) && vd->vdev_state != VDEV_STATE_CLOSED) &&
+	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
 	    !vd->vdev_cant_write);
 }
 
@@ -1939,7 +2121,8 @@ vdev_clear_stats(vdev_t *vd)
 void
 vdev_stat_update(zio_t *zio, uint64_t psize)
 {
-	vdev_t *rvd = zio->io_spa->spa_root_vdev;
+	spa_t *spa = zio->io_spa;
+	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
 	vdev_t *pvd;
 	uint64_t txg = zio->io_txg;
@@ -1972,21 +2155,23 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 			return;
 
 		ASSERT(vd == zio->io_vd);
-		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
-			mutex_enter(&vd->vdev_stat_lock);
-			vs->vs_ops[type]++;
-			vs->vs_bytes[type] += psize;
-			mutex_exit(&vd->vdev_stat_lock);
-		}
+
+		if (flags & ZIO_FLAG_IO_BYPASS)
+			return;
+
+		mutex_enter(&vd->vdev_stat_lock);
+
 		if (flags & ZIO_FLAG_IO_REPAIR) {
-			ASSERT(zio->io_delegate_list == NULL);
-			mutex_enter(&vd->vdev_stat_lock);
 			if (flags & ZIO_FLAG_SCRUB_THREAD)
 				vs->vs_scrub_repaired += psize;
-			else
+			if (flags & ZIO_FLAG_SELF_HEAL)
 				vs->vs_self_healed += psize;
-			mutex_exit(&vd->vdev_stat_lock);
 		}
+
+		vs->vs_ops[type]++;
+		vs->vs_bytes[type] += psize;
+
+		mutex_exit(&vd->vdev_stat_lock);
 		return;
 	}
 
@@ -1994,29 +2179,49 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 		return;
 
 	mutex_enter(&vd->vdev_stat_lock);
-	if (type == ZIO_TYPE_READ) {
+	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
 		if (zio->io_error == ECKSUM)
 			vs->vs_checksum_errors++;
 		else
 			vs->vs_read_errors++;
 	}
-	if (type == ZIO_TYPE_WRITE)
+	if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
 		vs->vs_write_errors++;
 	mutex_exit(&vd->vdev_stat_lock);
 
-	if (type == ZIO_TYPE_WRITE && txg != 0 && vd->vdev_children == 0) {
-		if (flags & ZIO_FLAG_SCRUB_THREAD) {
-			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
-			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
-				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
-		}
-		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
-			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
+	if (type == ZIO_TYPE_WRITE && txg != 0 &&
+	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
+	    (flags & ZIO_FLAG_SCRUB_THREAD))) {
+		/*
+		 * This is either a normal write (not a repair), or it's a
+		 * repair induced by the scrub thread.  In the normal case,
+		 * we commit the DTL change in the same txg as the block
+		 * was born.  In the scrub-induced repair case, we know that
+		 * scrubs run in first-pass syncing context, so we commit
+		 * the DTL change in spa->spa_syncing_txg.
+		 *
+		 * We currently do not make DTL entries for failed spontaneous
+		 * self-healing writes triggered by normal (non-scrubbing)
+		 * reads, because we have no transactional context in which to
+		 * do so -- and it's not clear that it'd be desirable anyway.
+		 */
+		if (vd->vdev_ops->vdev_op_leaf) {
+			uint64_t commit_txg = txg;
+			if (flags & ZIO_FLAG_SCRUB_THREAD) {
+				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+				ASSERT(spa_sync_pass(spa) == 1);
+				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
+				commit_txg = spa->spa_syncing_txg;
+			}
+			ASSERT(commit_txg >= spa->spa_syncing_txg);
+			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
 				return;
-			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
-			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
-				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
+			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
+			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
 		}
+		if (vd != rvd)
+			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
 	}
 }
 
@@ -2111,8 +2316,8 @@ vdev_config_dirty(vdev_t *vd)
 	int c;
 
 	/*
-	 * If this is an aux vdev (as with l2cache devices), then we update the
-	 * vdev config manually and set the sync flag.
+	 * If this is an aux vdev (as with l2cache and spare devices), then we
+	 * update the vdev config manually and set the sync flag.
 	 */
 	if (vd->vdev_aux != NULL) {
 		spa_aux_vdev_t *sav = vd->vdev_aux;
@@ -2134,8 +2339,11 @@ vdev_config_dirty(vdev_t *vd)
 
 		sav->sav_sync = B_TRUE;
 
-		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
-		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0);
+		if (nvlist_lookup_nvlist_array(sav->sav_config,
+		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
+			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
+			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
+		}
 
 		ASSERT(c < naux);
 
@@ -2229,7 +2437,8 @@ vdev_state_clean(vdev_t *vd)
 void
 vdev_propagate_state(vdev_t *vd)
 {
-	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+	spa_t *spa = vd->vdev_spa;
+	vdev_t *rvd = spa->spa_root_vdev;
 	int degraded = 0, faulted = 0;
 	int corrupted = 0;
 	int c;
@@ -2240,7 +2449,7 @@ vdev_propagate_state(vdev_t *vd)
 			child = vd->vdev_child[c];
 
 			if (!vdev_readable(child) ||
-			    (!vdev_writeable(child) && (spa_mode & FWRITE))) {
+			    (!vdev_writeable(child) && spa_writeable(spa))) {
 				/*
 				 * Root special: if there is a top-level log
 				 * device, treat the root vdev as if it were
@@ -2340,7 +2549,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
 		 * an error.
 		 */
 		if (spa->spa_load_state == SPA_LOAD_IMPORT &&
-		    !spa->spa_import_faulted &&
 		    vd->vdev_ops->vdev_op_leaf)
 			vd->vdev_not_present = 1;
 
@@ -2399,8 +2607,8 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
 		vd->vdev_removed = B_FALSE;
 	}
 
-	if (!isopen)
-		vdev_propagate_state(vd);
+	if (!isopen && vd->vdev_parent)
+		vdev_propagate_state(vd->vdev_parent);
 }
 
 /*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
index 88c15b7..8fc3738 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -215,23 +215,23 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
  * Fill a previously allocated cache entry with data.
  */
 static void
-vdev_cache_fill(zio_t *zio)
+vdev_cache_fill(zio_t *fio)
 {
-	vdev_t *vd = zio->io_vd;
+	vdev_t *vd = fio->io_vd;
 	vdev_cache_t *vc = &vd->vdev_cache;
-	vdev_cache_entry_t *ve = zio->io_private;
-	zio_t *dio;
+	vdev_cache_entry_t *ve = fio->io_private;
+	zio_t *pio;
 
-	ASSERT(zio->io_size == VCBS);
+	ASSERT(fio->io_size == VCBS);
 
 	/*
 	 * Add data to the cache.
 	 */
 	mutex_enter(&vc->vc_lock);
 
-	ASSERT(ve->ve_fill_io == zio);
-	ASSERT(ve->ve_offset == zio->io_offset);
-	ASSERT(ve->ve_data == zio->io_data);
+	ASSERT(ve->ve_fill_io == fio);
+	ASSERT(ve->ve_offset == fio->io_offset);
+	ASSERT(ve->ve_data == fio->io_data);
 
 	ve->ve_fill_io = NULL;
 
@@ -240,20 +240,13 @@ vdev_cache_fill(zio_t *zio)
 	 * any reads that were queued up before the missed update are still
 	 * valid, so we can satisfy them from this line before we evict it.
 	 */
-	for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next)
-		vdev_cache_hit(vc, ve, dio);
+	while ((pio = zio_walk_parents(fio)) != NULL)
+		vdev_cache_hit(vc, ve, pio);
 
-	if (zio->io_error || ve->ve_missed_update)
+	if (fio->io_error || ve->ve_missed_update)
 		vdev_cache_evict(vc, ve);
 
 	mutex_exit(&vc->vc_lock);
-
-	while ((dio = zio->io_delegate_list) != NULL) {
-		zio->io_delegate_list = dio->io_delegate_next;
-		dio->io_delegate_next = NULL;
-		dio->io_error = zio->io_error;
-		zio_execute(dio);
-	}
 }
 
 /*
@@ -296,9 +289,8 @@ vdev_cache_read(zio_t *zio)
 		}
 
 		if ((fio = ve->ve_fill_io) != NULL) {
-			zio->io_delegate_next = fio->io_delegate_list;
-			fio->io_delegate_list = zio;
 			zio_vdev_io_bypass(zio);
+			zio_add_child(zio, fio);
 			mutex_exit(&vc->vc_lock);
 			VDCSTAT_BUMP(vdc_stat_delegations);
 			return (0);
@@ -308,7 +300,6 @@ vdev_cache_read(zio_t *zio)
 		zio_vdev_io_bypass(zio);
 
 		mutex_exit(&vc->vc_lock);
-		zio_execute(zio);
 		VDCSTAT_BUMP(vdc_stat_hits);
 		return (0);
 	}
@@ -325,8 +316,8 @@ vdev_cache_read(zio_t *zio)
 	    ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
 
 	ve->ve_fill_io = fio;
-	fio->io_delegate_list = zio;
 	zio_vdev_io_bypass(zio);
+	zio_add_child(zio, fio);
 
 	mutex_exit(&vc->vc_lock);
 	zio_nowait(fio);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
index 35d4e2a..e6d5743 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -47,6 +47,7 @@ typedef struct vdev_disk_buf {
 static int
 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 {
+	spa_t *spa = vd->vdev_spa;
 	vdev_disk_t *dvd;
 	struct dk_minfo dkm;
 	int error;
@@ -95,7 +96,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 
 	error = EINVAL;		/* presume failure */
 
-	if (vd->vdev_path != NULL && !spa_is_root(vd->vdev_spa)) {
+	if (vd->vdev_path != NULL && !spa_is_root(spa)) {
 		ddi_devid_t devid;
 
 		if (vd->vdev_wholedisk == -1ULL) {
@@ -105,18 +106,18 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 
 			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
 
-			if (ldi_open_by_name(buf, spa_mode, kcred,
+			if (ldi_open_by_name(buf, spa_mode(spa), kcred,
 			    &lh, zfs_li) == 0) {
 				spa_strfree(vd->vdev_path);
 				vd->vdev_path = buf;
 				vd->vdev_wholedisk = 1ULL;
-				(void) ldi_close(lh, spa_mode, kcred);
+				(void) ldi_close(lh, spa_mode(spa), kcred);
 			} else {
 				kmem_free(buf, len);
 			}
 		}
 
-		error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred,
+		error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred,
 		    &dvd->vd_lh, zfs_li);
 
 		/*
@@ -126,7 +127,8 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
 			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
 				error = EINVAL;
-				(void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
+				    kcred);
 				dvd->vd_lh = NULL;
 			}
 			ddi_devid_free(devid);
@@ -146,7 +148,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 	 */
 	if (error != 0 && vd->vdev_devid != NULL)
 		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
-		    spa_mode, kcred, &dvd->vd_lh, zfs_li);
+		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
 
 	/*
 	 * If all else fails, then try opening by physical path (if available)
@@ -156,8 +158,8 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 	 */
 	if (error) {
 		if (vd->vdev_physpath != NULL &&
-		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != ENODEV)
-			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode,
+		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
+			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
 			    kcred, &dvd->vd_lh, zfs_li);
 
 		/*
@@ -165,10 +167,9 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 		 * as above.  This hasn't been used in a very long time and we
 		 * don't need to propagate its oddities to this edge condition.
 		 */
-		if (error && vd->vdev_path != NULL &&
-		    !spa_is_root(vd->vdev_spa))
-			error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred,
-			    &dvd->vd_lh, zfs_li);
+		if (error && vd->vdev_path != NULL && !spa_is_root(spa))
+			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
+			    kcred, &dvd->vd_lh, zfs_li);
 	}
 
 	if (error) {
@@ -253,7 +254,7 @@ vdev_disk_close(vdev_t *vd)
 		ddi_devid_free(dvd->vd_devid);
 
 	if (dvd->vd_lh != NULL)
-		(void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
 
 	kmem_free(dvd, sizeof (vdev_disk_t));
 	vd->vdev_tsd = NULL;
@@ -469,7 +470,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
 	    &minor_name) == 0) {
 		error = ldi_open_by_devid(tmpdevid, minor_name,
-		    spa_mode, kcred, &vd_lh, zfs_li);
+		    FREAD, kcred, &vd_lh, zfs_li);
 		ddi_devid_free(tmpdevid);
 		ddi_devid_str_free(minor_name);
 	}
@@ -492,8 +493,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 		/* read vdev label */
 		offset = vdev_label_offset(size, l, 0);
 		if (vdev_disk_physio(vd_lh, (caddr_t)label,
-		    VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE +
-		    VDEV_PHYS_SIZE, offset, B_READ) != 0)
+		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
 			continue;
 
 		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
index 30b3f35..67bd110 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
@@ -61,7 +61,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 	 */
 	ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
 	error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
-	    spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
+	    spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
 
 	if (error) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
@@ -75,7 +75,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 	 * Make sure it's a regular file.
 	 */
 	if (vp->v_type != VREG) {
-		(void) VOP_CLOSE(vp, spa_mode, 1, 0, kcred, NULL);
+		(void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		return (ENODEV);
 	}
@@ -90,7 +90,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 	VOP_UNLOCK(vp, 0);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (error) {
-		(void) VOP_CLOSE(vp, spa_mode, 1, 0, kcred, NULL);
+		(void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		return (error);
 	}
@@ -110,7 +110,8 @@ vdev_file_close(vdev_t *vd)
 		return;
 
 	if (vf->vf_vnode != NULL)
-		(void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL);
+		(void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
+		    kcred, NULL);
 	kmem_free(vf, sizeof (vdev_file_t));
 	vd->vdev_tsd = NULL;
 }
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
index 9c6ec4c..00817bf 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
@@ -457,7 +457,7 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 	if (cp == NULL) {
 		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
 		error = ENOENT;
-	} else if (cp->acw == 0 && (spa_mode & FWRITE) != 0) {
+	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
 		int i;
 
 		g_topology_lock();
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
index bf93046..f1f3bb0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -233,6 +233,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
 		    vd->vdev_physpath) == 0);
 
+	if (vd->vdev_fru != NULL)
+		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU,
+		    vd->vdev_fru) == 0);
+
 	if (vd->vdev_nparity != 0) {
 		ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
 		    VDEV_TYPE_RAIDZ) == 0);
@@ -277,9 +281,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 		    vd->vdev_islog) == 0);
 	}
 
-	if (vd->vdev_dtl.smo_object != 0)
+	if (vd->vdev_dtl_smo.smo_object != 0)
 		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
-		    vd->vdev_dtl.smo_object) == 0);
+		    vd->vdev_dtl_smo.smo_object) == 0);
 
 	if (getstats) {
 		vdev_stat_t vs;
@@ -488,7 +492,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
 	spa_t *spa = vd->vdev_spa;
 	nvlist_t *label;
 	vdev_phys_t *vp;
-	vdev_boot_header_t *vb;
+	char *pad2;
 	uberblock_t *ub;
 	zio_t *zio;
 	char *buf;
@@ -520,9 +524,6 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
 	    vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
 		return (EBUSY);
 
-	ASSERT(reason != VDEV_LABEL_REMOVE ||
-	    vdev_inuse(vd, crtxg, reason, NULL, NULL));
-
 	/*
 	 * If this is a request to add or replace a spare or l2cache device
 	 * that is in use elsewhere on the system, then we must update the
@@ -633,16 +634,6 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
 	}
 
 	/*
-	 * Initialize boot block header.
-	 */
-	vb = zio_buf_alloc(sizeof (vdev_boot_header_t));
-	bzero(vb, sizeof (vdev_boot_header_t));
-	vb->vb_magic = VDEV_BOOT_MAGIC;
-	vb->vb_version = VDEV_BOOT_VERSION;
-	vb->vb_offset = VDEV_BOOT_OFFSET;
-	vb->vb_size = VDEV_BOOT_SIZE;
-
-	/*
 	 * Initialize uberblock template.
 	 */
 	ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
@@ -650,6 +641,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
 	*ub = spa->spa_uberblock;
 	ub->ub_txg = 0;
 
+	/* Initialize the 2nd padding area. */
+	pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
+	bzero(pad2, VDEV_PAD_SIZE);
+
 	/*
 	 * Write everything in parallel.
 	 */
@@ -661,9 +656,14 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
 		    offsetof(vdev_label_t, vl_vdev_phys),
 		    sizeof (vdev_phys_t), NULL, NULL, flags);
 
-		vdev_label_write(zio, vd, l, vb,
-		    offsetof(vdev_label_t, vl_boot_header),
-		    sizeof (vdev_boot_header_t), NULL, NULL, flags);
+		/*
+		 * Skip the 1st padding area.
+		 * Zero out the 2nd padding area where it might have
+		 * left over data from previous filesystem format.
+		 */
+		vdev_label_write(zio, vd, l, pad2,
+		    offsetof(vdev_label_t, vl_pad2),
+		    VDEV_PAD_SIZE, NULL, NULL, flags);
 
 		for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
 			vdev_label_write(zio, vd, l, ub,
@@ -675,8 +675,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
 	error = zio_wait(zio);
 
 	nvlist_free(label);
+	zio_buf_free(pad2, VDEV_PAD_SIZE);
 	zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
-	zio_buf_free(vb, sizeof (vdev_boot_header_t));
 	zio_buf_free(vp, sizeof (vdev_phys_t));
 
 	/*
@@ -705,6 +705,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
  */
 
 /*
+ * For use by zdb and debugging purposes only
+ */
+uint64_t ub_max_txg = UINT64_MAX;
+
+/*
  * Consider the following situation: txg is safely synced to disk.  We've
  * written the first uberblock for txg + 1, and then we lose power.  When we
  * come back up, we fail to see the uberblock for txg + 1 because, say,
@@ -741,7 +746,8 @@ vdev_uberblock_load_done(zio_t *zio)
 
 	if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
 		mutex_enter(&rio->io_lock);
-		if (vdev_uberblock_compare(ub, ubbest) > 0)
+		if (ub->ub_txg <= ub_max_txg &&
+		    vdev_uberblock_compare(ub, ubbest) > 0)
 			*ubbest = *ub;
 		mutex_exit(&rio->io_lock);
 	}
@@ -958,7 +964,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
 	for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
 		uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
 		    KM_SLEEP);
-		zio_t *vio = zio_null(zio, spa,
+		zio_t *vio = zio_null(zio, spa, NULL,
 		    (vd->vdev_islog || vd->vdev_aux != NULL) ?
 		    vdev_label_sync_ignore_done : vdev_label_sync_top_done,
 		    good_writes, flags);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
index c4629ff..fff7e08 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -180,11 +180,16 @@ vdev_mirror_scrub_done(zio_t *zio)
 	mirror_child_t *mc = zio->io_private;
 
 	if (zio->io_error == 0) {
-		zio_t *pio = zio->io_parent;
-		mutex_enter(&pio->io_lock);
-		ASSERT3U(zio->io_size, >=, pio->io_size);
-		bcopy(zio->io_data, pio->io_data, pio->io_size);
-		mutex_exit(&pio->io_lock);
+		zio_t *pio;
+
+		mutex_enter(&zio->io_lock);
+		while ((pio = zio_walk_parents(zio)) != NULL) {
+			mutex_enter(&pio->io_lock);
+			ASSERT3U(zio->io_size, >=, pio->io_size);
+			bcopy(zio->io_data, pio->io_data, pio->io_size);
+			mutex_exit(&pio->io_lock);
+		}
+		mutex_exit(&zio->io_lock);
 	}
 
 	zio_buf_free(zio->io_data, zio->io_size);
@@ -225,7 +230,7 @@ vdev_mirror_child_select(zio_t *zio)
 			mc->mc_skipped = 1;
 			continue;
 		}
-		if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1))
+		if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
 			return (c);
 		mc->mc_error = ESTALE;
 		mc->mc_skipped = 1;
@@ -282,20 +287,10 @@ vdev_mirror_io_start(zio_t *zio)
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 
 		/*
-		 * If this is a resilvering I/O to a replacing vdev,
-		 * only the last child should be written -- unless the
-		 * first child happens to have a DTL entry here as well.
-		 * All other writes go to all children.
+		 * Writes go to all children.
 		 */
-		if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing &&
-		    !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map,
-		    zio->io_txg, 1)) {
-			c = mm->mm_children - 1;
-			children = 1;
-		} else {
-			c = 0;
-			children = mm->mm_children;
-		}
+		c = 0;
+		children = mm->mm_children;
 	}
 
 	while (children--) {
@@ -398,7 +393,7 @@ vdev_mirror_io_done(zio_t *zio)
 		ASSERT(zio->io_error != 0);
 	}
 
-	if (good_copies && (spa_mode & FWRITE) &&
+	if (good_copies && spa_writeable(zio->io_spa) &&
 	    (unexpected_errors ||
 	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
 	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
@@ -419,7 +414,7 @@ vdev_mirror_io_done(zio_t *zio)
 				if (mc->mc_tried)
 					continue;
 				if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
-				    !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
+				    !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
 				    zio->io_txg, 1))
 					continue;
 				mc->mc_error = ESTALE;
@@ -429,7 +424,8 @@ vdev_mirror_io_done(zio_t *zio)
 			    mc->mc_vd, mc->mc_offset,
 			    zio->io_data, zio->io_size,
 			    ZIO_TYPE_WRITE, zio->io_priority,
-			    ZIO_FLAG_IO_REPAIR, NULL, NULL));
+			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
+			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}
 	}
 }
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
index cd4d5ae..45cc829 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -48,10 +48,11 @@ int zfs_vdev_time_shift = 6;
 int zfs_vdev_ramp_rate = 2;
 
 /*
- * i/os will be aggregated into a single large i/o up to
- * zfs_vdev_aggregation_limit bytes long.
+ * To reduce IOPs, we aggregate small adjacent i/os into one large i/o.
+ * For read i/os, we also aggregate across small adjacency gaps.
  */
 int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+int zfs_vdev_read_gap_limit = 32 << 10;
 
 SYSCTL_DECL(_vfs_zfs_vdev);
 TUNABLE_INT("vfs.zfs.vdev.max_pending", &zfs_vdev_max_pending);
@@ -168,33 +169,33 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 static void
 vdev_queue_agg_io_done(zio_t *aio)
 {
-	zio_t *dio;
-	uint64_t offset = 0;
+	zio_t *pio;
 
-	while ((dio = aio->io_delegate_list) != NULL) {
+	while ((pio = zio_walk_parents(aio)) != NULL)
 		if (aio->io_type == ZIO_TYPE_READ)
-			bcopy((char *)aio->io_data + offset, dio->io_data,
-			    dio->io_size);
-		offset += dio->io_size;
-		aio->io_delegate_list = dio->io_delegate_next;
-		dio->io_delegate_next = NULL;
-		dio->io_error = aio->io_error;
-		zio_execute(dio);
-	}
-	ASSERT3U(offset, ==, aio->io_size);
+			bcopy((char *)aio->io_data + (pio->io_offset -
+			    aio->io_offset), pio->io_data, pio->io_size);
 
 	zio_buf_free(aio->io_data, aio->io_size);
 }
 
-#define	IS_ADJACENT(io, nio) \
-	((io)->io_offset + (io)->io_size == (nio)->io_offset)
+/*
+ * Compute the range spanned by two i/os, which is the endpoint of the last
+ * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
+ * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
+ * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
+ */
+#define	IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
+#define	IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
 
 static zio_t *
 vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
 {
-	zio_t *fio, *lio, *aio, *dio;
-	avl_tree_t *tree;
-	uint64_t size;
+	zio_t *fio, *lio, *aio, *dio, *nio;
+	avl_tree_t *t;
+	int flags;
+	uint64_t maxspan = zfs_vdev_aggregation_limit;
+	uint64_t maxgap;
 
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 
@@ -204,56 +205,62 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
 
 	fio = lio = avl_first(&vq->vq_deadline_tree);
 
-	tree = fio->io_vdev_tree;
-	size = fio->io_size;
-
-	while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
-	    !((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) &&
-	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
-		dio->io_delegate_next = fio;
-		fio = dio;
-		size += dio->io_size;
-	}
-
-	while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
-	    !((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) &&
-	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
-		lio->io_delegate_next = dio;
-		lio = dio;
-		size += dio->io_size;
+	t = fio->io_vdev_tree;
+	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
+	maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;
+
+	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
+		/*
+		 * We can aggregate I/Os that are adjacent and of the
+		 * same flavor, as expressed by the AGG_INHERIT flags.
+		 * The latter is necessary so that certain attributes
+		 * of the I/O, such as whether it's a normal I/O or a
+		 * scrub/resilver, can be preserved in the aggregate.
+		 */
+		while ((dio = AVL_PREV(t, fio)) != NULL &&
+		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+		    IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap)
+			fio = dio;
+
+		while ((dio = AVL_NEXT(t, lio)) != NULL &&
+		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+		    IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap)
+			lio = dio;
 	}
 
 	if (fio != lio) {
-		char *buf = zio_buf_alloc(size);
-		uint64_t offset = 0;
-
+		uint64_t size = IO_SPAN(fio, lio);
 		ASSERT(size <= zfs_vdev_aggregation_limit);
 
 		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
-		    buf, size, fio->io_type, ZIO_PRIORITY_NOW,
-		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
+		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW,
+		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
 		    vdev_queue_agg_io_done, NULL);
 
-		aio->io_delegate_list = fio;
-
-		for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
+		nio = fio;
+		do {
+			dio = nio;
+			nio = AVL_NEXT(t, dio);
 			ASSERT(dio->io_type == aio->io_type);
-			ASSERT(dio->io_vdev_tree == tree);
+			ASSERT(dio->io_vdev_tree == t);
+
 			if (dio->io_type == ZIO_TYPE_WRITE)
-				bcopy(dio->io_data, buf + offset, dio->io_size);
-			offset += dio->io_size;
+				bcopy(dio->io_data, (char *)aio->io_data +
+				    (dio->io_offset - aio->io_offset),
+				    dio->io_size);
+
+			zio_add_child(dio, aio);
 			vdev_queue_io_remove(vq, dio);
 			zio_vdev_io_bypass(dio);
-		}
-
-		ASSERT(offset == size);
+			zio_execute(dio);
+		} while (dio != lio);
 
 		avl_add(&vq->vq_pending_tree, aio);
 
 		return (aio);
 	}
 
-	ASSERT(fio->io_vdev_tree == tree);
+	ASSERT(fio->io_vdev_tree == t);
 	vdev_queue_io_remove(vq, fio);
 
 	avl_add(&vq->vq_pending_tree, fio);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
index 0a06190..92753d8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -687,7 +687,7 @@ vdev_raidz_io_start(zio_t *zio)
 			rc->rc_skipped = 1;
 			continue;
 		}
-		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
+		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
 			if (c >= rm->rm_firstdatacol)
 				rm->rm_missingdata++;
 			else
@@ -1165,7 +1165,7 @@ vdev_raidz_io_done(zio_t *zio)
 done:
 	zio_checksum_verified(zio);
 
-	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
+	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
 		/*
 		 * Use the good data we have in hand to repair damaged children.
@@ -1180,7 +1180,8 @@ done:
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_data, rc->rc_size,
 			    ZIO_TYPE_WRITE, zio->io_priority,
-			    ZIO_FLAG_IO_REPAIR, NULL, NULL));
+			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
+			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}
 	}
 }
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
index 90fe3d0..7abe63a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
@@ -1135,3 +1135,58 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
 		}
 	}
 }
+
+int
+fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
+    uint64_t *tooverwrite)
+{
+	zap_t *zap = zn->zn_zap;
+	zap_leaf_t *l;
+	int err;
+
+	/*
+	 * Account for the header block of the fatzap.
+	 */
+	if (!add && dmu_buf_freeable(zap->zap_dbuf)) {
+		*tooverwrite += zap->zap_dbuf->db_size;
+	} else {
+		*towrite += zap->zap_dbuf->db_size;
+	}
+
+	/*
+	 * Account for the pointer table blocks.
+	 * If we are adding we need to account for the following cases :
+	 * - If the pointer table is embedded, this operation could force an
+	 *   external pointer table.
+	 * - If this already has an external pointer table this operation
+	 *   could extend the table.
+	 */
+	if (add) {
+		if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0)
+			*towrite += zap->zap_dbuf->db_size;
+		else
+			*towrite += (zap->zap_dbuf->db_size * 3);
+	}
+
+	/*
+	 * Now, check if the block containing leaf is freeable
+	 * and account accordingly.
+	 */
+	err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l);
+	if (err != 0) {
+		return (err);
+	}
+
+	if (!add && dmu_buf_freeable(l->l_dbuf)) {
+		*tooverwrite += l->l_dbuf->db_size;
+	} else {
+		/*
+		 * If this an add operation, the leaf block could split.
+		 * Hence, we need to account for an additional leaf block.
+		 */
+		*towrite += (add ? 2 : 1) * l->l_dbuf->db_size;
+	}
+
+	zap_put_leaf(l);
+	return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
index 10d7386..9453fd2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
@@ -1079,3 +1079,79 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
 	zap_unlockdir(zap);
 	return (0);
 }
+
+int
+zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
+    uint64_t *towrite, uint64_t *tooverwrite)
+{
+	zap_t *zap;
+	int err = 0;
+
+
+	/*
+	 * Since, we don't have a name, we cannot figure out which blocks will
+	 * be affected in this operation. So, account for the worst case :
+	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
+	 * - 4 new blocks written if adding:
+	 * 	- 2 blocks for possibly split leaves,
+	 * 	- 2 grown ptrtbl blocks
+	 *
+	 * This also accomodates the case where an add operation to a fairly
+	 * large microzap results in a promotion to fatzap.
+	 */
+	if (name == NULL) {
+		*towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+		return (err);
+	}
+
+	/*
+	 * We lock the zap with adding ==  FALSE. Because, if we pass
+	 * the actual value of add, it could trigger a mzap_upgrade().
+	 * At present we are just evaluating the possibility of this operation
+	 * and hence we donot want to trigger an upgrade.
+	 */
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	if (err)
+		return (err);
+
+	if (!zap->zap_ismicro) {
+		zap_name_t *zn = zap_name_alloc(zap, name, MT_EXACT);
+		if (zn) {
+			err = fzap_count_write(zn, add, towrite,
+			    tooverwrite);
+			zap_name_free(zn);
+		} else {
+			/*
+			 * We treat this case as similar to (name == NULL)
+			 */
+			*towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+		}
+	} else {
+		/*
+		 * We are here if (name != NULL) and this is a micro-zap.
+		 * We account for the header block depending on whether it
+		 * is freeable.
+		 *
+		 * Incase of an add-operation it is hard to find out
+		 * if this add will promote this microzap to fatzap.
+		 * Hence, we consider the worst case and account for the
+		 * blocks assuming this microzap would be promoted to a
+		 * fatzap.
+		 *
+		 * 1 block overwritten  : header block
+		 * 4 new blocks written : 2 new split leaf, 2 grown
+		 *			ptrtbl blocks
+		 */
+		if (dmu_buf_freeable(zap->zap_dbuf))
+			*tooverwrite += SPA_MAXBLOCKSIZE;
+		else
+			*towrite += SPA_MAXBLOCKSIZE;
+
+		if (add) {
+			*towrite += 4 * SPA_MAXBLOCKSIZE;
+		}
+	}
+
+	zap_unlockdir(zap);
+	return (err);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
index 01007d7..c42f094 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -62,13 +62,15 @@
     ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
 #define	OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
     ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
-#define	WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
 
 #define	ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
     ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
     ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
     ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)
 
+#define	WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
+#define	WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
+    ACE_DELETE|ACE_DELETE_CHILD)
 #define	WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\
     ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD)
 
@@ -535,8 +537,9 @@ zfs_acl_curr_node(zfs_acl_t *aclp)
  * ACE FUIDs will be created later.
  */
 int
-zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap,
-    zfs_ace_t *z_acl, int aclcnt, size_t *size)
+zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp,
+    void *datap, zfs_ace_t *z_acl, int aclcnt, size_t *size,
+    zfs_fuid_info_t **fuidp, cred_t *cr)
 {
 	int i;
 	uint16_t entry_type;
@@ -552,9 +555,9 @@ zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap,
 		entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
 		if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
 		    entry_type != ACE_EVERYONE) {
-			if (!aclp->z_has_fuids)
-				aclp->z_has_fuids = IS_EPHEMERAL(acep->a_who);
-			aceptr->z_fuid = (uint64_t)acep->a_who;
+			aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who,
+			    cr, (entry_type == 0) ?
+			    ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp);
 		}
 
 		/*
@@ -679,7 +682,7 @@ zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep,
  * convert old ACL format to new
  */
 void
-zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp)
+zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr)
 {
 	zfs_oldace_t *oldaclp;
 	int i;
@@ -711,9 +714,9 @@ zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp)
 	newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
 	    sizeof (zfs_object_ace_t));
 	aclp->z_ops = zfs_acl_fuid_ops;
-	VERIFY(zfs_copy_ace_2_fuid(ZTOV(zp)->v_type, aclp, oldaclp,
-	    newaclnode->z_acldata, aclp->z_acl_count,
-	    &newaclnode->z_size) == 0);
+	VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp,
+	    oldaclp, newaclnode->z_acldata, aclp->z_acl_count,
+	    &newaclnode->z_size, NULL, cr) == 0);
 	newaclnode->z_ace_count = aclp->z_acl_count;
 	aclp->z_version = ZFS_ACL_VERSION;
 	kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
@@ -767,8 +770,7 @@ zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
  * Also, create FUIDs for any User/Group ACEs
  */
 static uint64_t
-zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
-    zfs_fuid_info_t **fuidp, dmu_tx_t *tx)
+zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
 {
 	int		entry_type;
 	mode_t		mode;
@@ -902,15 +904,6 @@ zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
 				}
 			}
 		}
-		/*
-		 * Now handle FUID create for user/group ACEs
-		 */
-		if (entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP) {
-			aclp->z_ops.ace_who_set(acep,
-			    zfs_fuid_create(zp->z_zfsvfs, who, cr,
-			    (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP,
-			    tx, fuidp));
-		}
 	}
 	return (mode);
 }
@@ -986,7 +979,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
 	aclnode = zfs_acl_node_alloc(aclsize);
 	list_insert_head(&aclp->z_acl, aclnode);
 	error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
-	    aclsize, aclnode->z_acldata);
+	    aclsize, aclnode->z_acldata, DMU_READ_PREFETCH);
 	aclnode->z_ace_count = acl_count;
 	aclp->z_acl_count = acl_count;
 	aclp->z_acl_bytes = aclsize;
@@ -1011,8 +1004,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
  * already checked the acl and knows whether to inherit.
  */
 int
-zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
-    zfs_fuid_info_t **fuidp, dmu_tx_t *tx)
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
 {
 	int		error;
 	znode_phys_t	*zphys = zp->z_phys;
@@ -1023,12 +1015,9 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
 	dmu_object_type_t otype;
 	zfs_acl_node_t	*aclnode;
 
-	ASSERT(MUTEX_HELD(&zp->z_lock));
-	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
-
 	dmu_buf_will_dirty(zp->z_dbuf, tx);
 
-	zphys->zp_mode = zfs_mode_fuid_compute(zp, aclp, cr, fuidp, tx);
+	zphys->zp_mode = zfs_mode_compute(zp, aclp);
 
 	/*
 	 * Decide which opbject type to use.  If we are forced to
@@ -1040,7 +1029,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
 	} else {
 		if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
 		    (zfsvfs->z_version >= ZPL_VERSION_FUID))
-			zfs_acl_xform(zp, aclp);
+			zfs_acl_xform(zp, aclp, cr);
 		ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
 		otype = DMU_OT_ACL;
 	}
@@ -1122,7 +1111,6 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
 	if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
 		zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL;
 
-	zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
 	return (0);
 }
 
@@ -1333,7 +1321,7 @@ zfs_acl_ace_insert(zfs_acl_t *aclp, void  *acep)
  * Prepend deny ACE
  */
 static void *
-zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep,
+zfs_acl_prepend_deny(uint64_t uid, zfs_acl_t *aclp, void *acep,
     mode_t mode)
 {
 	zfs_acl_node_t *aclnode;
@@ -1346,7 +1334,7 @@ zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep,
 	fuid = aclp->z_ops.ace_who_get(acep);
 	flags = aclp->z_ops.ace_flags_get(acep);
 	zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS));
-	zfs_acl_prepend_fixup(aclp, newacep, acep, mode, zp->z_phys->zp_uid);
+	zfs_acl_prepend_fixup(aclp, newacep, acep, mode, uid);
 
 	return (newacep);
 }
@@ -1470,9 +1458,9 @@ zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep,
  * in PSARC/2002/240
  */
 static void
-zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp)
+zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid,
+    uint64_t mode, zfs_acl_t *aclp)
 {
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	void		*acep = NULL, *prevacep = NULL;
 	uint64_t	who;
 	int 		i;
@@ -1482,11 +1470,6 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp)
 	uint16_t	iflags, type;
 	uint32_t	access_mask;
 
-	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
-	ASSERT(MUTEX_HELD(&zp->z_lock));
-
-	aclp->z_hints = (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
-
 	/*
 	 * If discard then just discard all ACL nodes which
 	 * represent the ACEs.
@@ -1551,17 +1534,15 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp)
 
 					if (!reuse_deny) {
 						prevacep =
-						    zfs_acl_prepend_deny(zp,
+						    zfs_acl_prepend_deny(uid,
 						    aclp, acep, mode);
 					} else {
 						zfs_acl_prepend_fixup(
 						    aclp, prevacep,
-						    acep, mode,
-						    zp->z_phys->zp_uid);
+						    acep, mode, uid);
 					}
 					zfs_fixup_group_entries(aclp, acep,
 					    prevacep, mode);
-
 				}
 			}
 		}
@@ -1620,8 +1601,10 @@ zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
 	mutex_enter(&zp->z_acl_lock);
 	*aclp = NULL;
 	error = zfs_acl_node_read(zp, aclp, B_TRUE);
-	if (error == 0)
-		zfs_acl_chmod(zp, mode, *aclp);
+	if (error == 0) {
+		(*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS;
+		zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp);
+	}
 	mutex_exit(&zp->z_acl_lock);
 	mutex_exit(&zp->z_lock);
 	return (error);
@@ -1646,9 +1629,8 @@ zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep)
  * Should ACE be inherited?
  */
 static int
-zfs_ace_can_use(znode_t *zp, uint16_t acep_flags)
+zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags)
 {
-	int vtype = ZTOV(zp)->v_type;
 	int	iflags = (acep_flags & 0xf);
 
 	if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
@@ -1663,10 +1645,9 @@ zfs_ace_can_use(znode_t *zp, uint16_t acep_flags)
  * inherit inheritable ACEs from parent
  */
 static zfs_acl_t *
-zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode,
-    boolean_t *need_chmod)
+zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
+    uint64_t mode, boolean_t *need_chmod)
 {
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	void		*pacep;
 	void		*acep, *acep2;
 	zfs_acl_node_t  *aclnode, *aclnode2;
@@ -1677,8 +1658,8 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode,
 	size_t		ace_size;
 	void		*data1, *data2;
 	size_t		data1sz, data2sz;
-	boolean_t	vdir = ZTOV(zp)->v_type == VDIR;
-	boolean_t	vreg = ZTOV(zp)->v_type == VREG;
+	boolean_t	vdir = vtype == VDIR;
+	boolean_t	vreg = vtype == VREG;
 	boolean_t	passthrough, passthrough_x, noallow;
 
 	passthrough_x =
@@ -1707,7 +1688,7 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode,
 
 		ace_size = aclp->z_ops.ace_size(pacep);
 
-		if (!zfs_ace_can_use(zp, iflags))
+		if (!zfs_ace_can_use(vtype, iflags))
 			continue;
 
 		/*
@@ -1803,57 +1784,60 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode,
  * Create file system object initial permissions
  * including inheritable ACEs.
  */
-void
-zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
-    vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
-    zfs_acl_t *setaclp, zfs_fuid_info_t **fuidp)
+int
+zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
+    vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
 {
-	uint64_t	mode, fuid, fgid;
 	int		error;
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	zfs_acl_t	*aclp = NULL;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zfs_acl_t	*paclp;
-	xvattr_t	*xvap = (xvattr_t *)vap;
 	gid_t		gid;
 	boolean_t	need_chmod = B_TRUE;
 
-	if (setaclp)
-		aclp = setaclp;
+	bzero(acl_ids, sizeof (zfs_acl_ids_t));
+	acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
 
-	mode = MAKEIMODE(vap->va_type, vap->va_mode);
+	if (vsecp)
+		if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr,
+		    &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
+			return (error);
 
 	/*
 	 * Determine uid and gid.
 	 */
-	if ((flag & (IS_ROOT_NODE | IS_REPLAY)) ||
+	if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay ||
 	    ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
-		fuid = zfs_fuid_create(zfsvfs, vap->va_uid, cr,
-		    ZFS_OWNER, tx, fuidp);
-		fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr,
-		    ZFS_GROUP, tx, fuidp);
+		acl_ids->z_fuid = zfs_fuid_create(zfsvfs,
+		    (uint64_t)vap->va_uid, cr,
+		    ZFS_OWNER, &acl_ids->z_fuidp);
+		acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+		    (uint64_t)vap->va_gid, cr,
+		    ZFS_GROUP, &acl_ids->z_fuidp);
 		gid = vap->va_gid;
 	} else {
-		fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, tx, cr, fuidp);
-		fgid = 0;
+		acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
+		    cr, &acl_ids->z_fuidp);
+		acl_ids->z_fgid = 0;
 		if (vap->va_mask & AT_GID)  {
-			fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr,
-			    ZFS_GROUP, tx, fuidp);
+			acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+			    (uint64_t)vap->va_gid,
+			    cr, ZFS_GROUP, &acl_ids->z_fuidp);
 			gid = vap->va_gid;
-			if (fgid != parent->z_phys->zp_gid &&
+			if (acl_ids->z_fgid != dzp->z_phys->zp_gid &&
 			    !groupmember(vap->va_gid, cr) &&
 			    secpolicy_vnode_create_gid(cr) != 0)
-				fgid = 0;
+				acl_ids->z_fgid = 0;
 		}
-		if (fgid == 0) {
-			if (parent->z_phys->zp_mode & S_ISGID) {
-				fgid = parent->z_phys->zp_gid;
-				gid = zfs_fuid_map_id(zfsvfs, fgid,
+		if (acl_ids->z_fgid == 0) {
+			if (dzp->z_phys->zp_mode & S_ISGID) {
+				acl_ids->z_fgid = dzp->z_phys->zp_gid;
+				gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
 				    cr, ZFS_GROUP);
 			} else {
-				fgid = zfs_fuid_create_cred(zfsvfs,
-				    ZFS_GROUP, tx, cr, fuidp);
+				acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
+				    ZFS_GROUP, cr, &acl_ids->z_fuidp);
 #ifdef __FreeBSD__
-				gid = fgid = parent->z_phys->zp_gid;
+				gid = acl_ids->z_fgid = dzp->z_phys->zp_gid;
 #else
 				gid = crgetgid(cr);
 #endif
@@ -1868,57 +1852,61 @@ zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
 	 * file's new group, clear the file's set-GID bit.
 	 */
 
-	if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) {
-		mode |= S_ISGID;
+	if (!(flag & IS_ROOT_NODE) && (dzp->z_phys->zp_mode & S_ISGID) &&
+	    (vap->va_type == VDIR)) {
+		acl_ids->z_mode |= S_ISGID;
 	} else {
-		if ((mode & S_ISGID) &&
-		    secpolicy_vnode_setids_setgids(ZTOV(zp), cr, gid) != 0)
-			mode &= ~S_ISGID;
-	}
-
-	zp->z_phys->zp_uid = fuid;
-	zp->z_phys->zp_gid = fgid;
-	zp->z_phys->zp_mode = mode;
-
-	if (aclp == NULL) {
-		mutex_enter(&parent->z_lock);
-		if ((ZTOV(parent)->v_type == VDIR &&
-		    (parent->z_phys->zp_flags & ZFS_INHERIT_ACE)) &&
-		    !(zp->z_phys->zp_flags & ZFS_XATTR)) {
-			mutex_enter(&parent->z_acl_lock);
-			VERIFY(0 == zfs_acl_node_read(parent, &paclp, B_FALSE));
-			mutex_exit(&parent->z_acl_lock);
-			aclp = zfs_acl_inherit(zp, paclp, mode, &need_chmod);
+		if ((acl_ids->z_mode & S_ISGID) &&
+		    secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0)
+			acl_ids->z_mode &= ~S_ISGID;
+	}
+
+	if (acl_ids->z_aclp == NULL) {
+		mutex_enter(&dzp->z_lock);
+		if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR &&
+		    (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)) &&
+		    !(dzp->z_phys->zp_flags & ZFS_XATTR)) {
+			mutex_enter(&dzp->z_acl_lock);
+			VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
+			mutex_exit(&dzp->z_acl_lock);
+			acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
+			    vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
 			zfs_acl_free(paclp);
 		} else {
-			aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+			acl_ids->z_aclp =
+			    zfs_acl_alloc(zfs_acl_version_zp(dzp));
+		}
+		mutex_exit(&dzp->z_lock);
+		if (need_chmod) {
+			acl_ids->z_aclp->z_hints = (vap->va_type == VDIR) ?
+			    ZFS_ACL_AUTO_INHERIT : 0;
+			zfs_acl_chmod(zfsvfs, acl_ids->z_fuid,
+			    acl_ids->z_mode, acl_ids->z_aclp);
 		}
-		mutex_exit(&parent->z_lock);
-		mutex_enter(&zp->z_lock);
-		mutex_enter(&zp->z_acl_lock);
-		if (need_chmod)
-			zfs_acl_chmod(zp, mode, aclp);
-	} else {
-		mutex_enter(&zp->z_lock);
-		mutex_enter(&zp->z_acl_lock);
 	}
 
-	/* Force auto_inherit on all new directory objects */
-	if (vap->va_type == VDIR)
-		aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
-
-	error = zfs_aclset_common(zp, aclp, cr, fuidp, tx);
-
-	/* Set optional attributes if any */
-	if (vap->va_mask & AT_XVATTR)
-		zfs_xvattr_set(zp, xvap);
+	return (0);
+}
 
-	mutex_exit(&zp->z_lock);
-	mutex_exit(&zp->z_acl_lock);
-	ASSERT3U(error, ==, 0);
+/*
+ * Free ACL and fuid_infop, but not the acl_ids structure
+ */
+void
+zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
+{
+	if (acl_ids->z_aclp)
+		zfs_acl_free(acl_ids->z_aclp);
+	if (acl_ids->z_fuidp)
+		zfs_fuid_info_free(acl_ids->z_fuidp);
+	acl_ids->z_aclp = NULL;
+	acl_ids->z_fuidp = NULL;
+}
 
-	if (aclp != setaclp)
-		zfs_acl_free(aclp);
+boolean_t
+zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids)
+{
+	return (zfs_usergroup_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) ||
+	    zfs_usergroup_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid));
 }
 
 /*
@@ -1984,6 +1972,8 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
 	if (mask & VSA_ACE) {
 		size_t aclsz;
 
+		zfs_acl_node_t *aclnode = list_head(&aclp->z_acl);
+
 		aclsz = count * sizeof (ace_t) +
 		    sizeof (ace_object_t) * largeace;
 
@@ -1994,17 +1984,8 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
 			zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr,
 			    vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
 		else {
-			zfs_acl_node_t *aclnode;
-			void *start = vsecp->vsa_aclentp;
-
-			for (aclnode = list_head(&aclp->z_acl); aclnode;
-			    aclnode = list_next(&aclp->z_acl, aclnode)) {
-				bcopy(aclnode->z_acldata, start,
-				    aclnode->z_size);
-				start = (caddr_t)start + aclnode->z_size;
-			}
-			ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp ==
-			    aclp->z_acl_bytes);
+			bcopy(aclnode->z_acldata, vsecp->vsa_aclentp,
+			    count * sizeof (ace_t));
 		}
 	}
 	if (mask & VSA_ACE_ACLFLAGS) {
@@ -2026,7 +2007,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
 
 int
 zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type,
-    vsecattr_t *vsecp, zfs_acl_t **zaclp)
+    vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp)
 {
 	zfs_acl_t *aclp;
 	zfs_acl_node_t *aclnode;
@@ -2049,9 +2030,9 @@ zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type,
 			return (error);
 		}
 	} else {
-		if ((error = zfs_copy_ace_2_fuid(obj_type, aclp,
+		if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp,
 		    vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
-		    &aclnode->z_size)) != 0) {
+		    &aclnode->z_size, fuidp, cr)) != 0) {
 			zfs_acl_free(aclp);
 			zfs_acl_node_free(aclnode);
 			return (error);
@@ -2092,6 +2073,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
 	int		error;
 	zfs_acl_t	*aclp;
 	zfs_fuid_info_t	*fuidp = NULL;
+	boolean_t	fuid_dirtied;
 
 	if (mask == 0)
 		return (ENOSYS);
@@ -2102,7 +2084,8 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
 	if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))
 		return (error);
 
-	error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, &aclp);
+	error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp,
+	    &aclp);
 	if (error)
 		return (error);
 
@@ -2143,25 +2126,16 @@ top:
 	} else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
 	}
-	if (aclp->z_has_fuids) {
-		if (zfsvfs->z_fuid_obj == 0) {
-			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
-		} else {
-			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-		}
-	}
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
 
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		mutex_exit(&zp->z_acl_lock);
 		mutex_exit(&zp->z_lock);
 
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -2171,9 +2145,13 @@ top:
 		return (error);
 	}
 
-	error = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
+	error = zfs_aclset_common(zp, aclp, cr, tx);
 	ASSERT(error == 0);
 
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
 	zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
 
 	if (fuidp)
@@ -2216,7 +2194,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
 
 	*check_privs = B_TRUE;
 
-	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
+	if (zfsvfs->z_replay) {
 		*working_mode = 0;
 		return (0);
 	}
@@ -2225,7 +2203,8 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
 
 	if ((v4_mode & WRITE_MASK) &&
 	    (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
-	    (!IS_DEVVP(ZTOV(zp)))) {
+	    (!IS_DEVVP(ZTOV(zp)) ||
+	    (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) {
 		*check_privs = B_FALSE;
 		return (EROFS);
 	}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
index 7820293..361b17d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * ZFS control directory (a.k.a. ".zfs")
  *
@@ -116,16 +114,21 @@ snapentry_compare(const void *a, const void *b)
 static struct vop_vector zfsctl_ops_root;
 static struct vop_vector zfsctl_ops_snapdir;
 static struct vop_vector zfsctl_ops_snapshot;
+static struct vop_vector zfsctl_ops_shares;
+static struct vop_vector zfsctl_ops_shares_dir;
 
 static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
+static vnode_t *zfsctl_mknode_shares(vnode_t *);
 static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
 static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);
 
 /*
- * Root directory elements.  We have only a single static entry, 'snapshot'.
+ * Root directory elements.  We only have two entries
+ * snapshot and shares.
  */
 static gfs_dirent_t zfsctl_root_entries[] = {
 	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
+	{ "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE },
 	{ NULL }
 };
 
@@ -150,14 +153,21 @@ zfsctl_fini(void)
 }
 
 /*
- * Return the inode number associated with the 'snapshot' directory.
+ * Return the inode number associated with the 'snapshot' or
+ * 'shares' directory.
  */
 /* ARGSUSED */
 static ino64_t
 zfsctl_root_inode_cb(vnode_t *vp, int index)
 {
-	ASSERT(index == 0);
-	return (ZFSCTL_INO_SNAPDIR);
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+
+	ASSERT(index <= 2);
+
+	if (index == 0)
+		return (ZFSCTL_INO_SNAPDIR);
+
+	return (zfsvfs->z_shares_dir);
 }
 
 /*
@@ -260,8 +270,17 @@ zfsctl_common_access(ap)
 {
 	int mode = ap->a_accmode;
 
+#ifdef TODO
+	if (flags & V_ACE_MASK) {
+		if (accmode & ACE_ALL_WRITE_PERMS)
+			return (EACCES);
+	} else {
+#endif
 	if (mode & VWRITE)
 		return (EACCES);
+#ifdef TODO
+	}
+#endif
 
 	return (0);
 }
@@ -334,6 +353,36 @@ zfsctl_common_fid(ap)
 	return (0);
 }
 
+/*ARGSUSED*/
+static int
+zfsctl_shares_fid(ap)
+	struct vop_fid_args /* {
+		struct vnode *a_vp;
+		struct fid *a_fid;
+	} */ *ap;
+{
+	vnode_t		*vp = ap->a_vp;
+	fid_t		*fidp = (void *)ap->a_fid;
+	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
+	znode_t		*dzp;
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (zfsvfs->z_shares_dir == 0) {
+		ZFS_EXIT(zfsvfs);
+		return (ENOTSUP);
+	}
+
+	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
+		error = VOP_FID(ZTOV(dzp), fidp);
+		VN_RELE(ZTOV(dzp));
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
 static int
 zfsctl_common_reclaim(ap)
 	struct vop_reclaim_args /* {
@@ -394,6 +443,41 @@ zfsctl_root_getattr(ap)
 	return (0);
 }
 
+#ifdef sun
+static int
+zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
+    caller_context_t *ct)
+{
+	/*
+	 * We only care about ACL_ENABLED so that libsec can
+	 * display ACL correctly and not default to POSIX draft.
+	 */
+	if (cmd == _PC_ACL_ENABLED) {
+		*valp = _ACL_ACE_ENABLED;
+		return (0);
+	}
+
+	return (fs_pathconf(vp, cmd, valp, cr, ct));
+}
+#endif	/* sun */
+
+#ifdef sun
+static const fs_operation_def_t zfsctl_tops_root[] = {
+	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
+	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
+	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
+	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_root_getattr }	},
+	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
+	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir } 	},
+	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_root_lookup }	},
+	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
+	{ VOPNAME_INACTIVE,	{ .vop_inactive = gfs_vop_inactive }	},
+	{ VOPNAME_PATHCONF,	{ .vop_pathconf = zfsctl_pathconf }	},
+	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid	}	},
+	{ NULL }
+};
+#endif	/* sun */
+
 /*
  * Special case the handling of "..".
  */
@@ -712,7 +796,7 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t  **vpp,
 		return (err);
 
 	if (err == 0) {
-		err = dmu_objset_snapshot(name, dirname, B_FALSE);
+		err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE);
 		if (err)
 			return (err);
 		err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
@@ -777,9 +861,6 @@ zfsctl_snapdir_lookup(ap)
 
 	ASSERT(dvp->v_type == VDIR);
 
-	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
-		return (0);
-
 	*vpp = NULL;
 
 	/*
@@ -793,6 +874,11 @@ zfsctl_snapdir_lookup(ap)
 
 	ZFS_ENTER(zfsvfs);
 
+	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
 	if (flags & FIGNORECASE) {
 		boolean_t conflict = B_FALSE;
 
@@ -904,6 +990,46 @@ domount:
 }
 
 /* ARGSUSED */
+int
+zfsctl_shares_lookup(ap)
+	struct vop_lookup_args /* {
+		struct vnode *a_dvp;
+		struct vnode **a_vpp;
+		struct componentname *a_cnp;
+	} */ *ap;
+{
+	vnode_t *dvp = ap->a_dvp;
+	vnode_t **vpp = ap->a_vpp;
+	struct componentname *cnp = ap->a_cnp;
+	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+	char nm[NAME_MAX + 1];
+	znode_t *dzp;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	ASSERT(cnp->cn_namelen < sizeof(nm));
+	strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
+
+	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	if (zfsvfs->z_shares_dir == 0) {
+		ZFS_EXIT(zfsvfs);
+		return (ENOTSUP);
+	}
+	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0)
+		error = VOP_LOOKUP(ZTOV(dzp), vpp, cnp);
+
+	VN_RELE(ZTOV(dzp));
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+
+/* ARGSUSED */
 static int
 zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
     offset_t *offp, offset_t *nextp, void *data, int flags)
@@ -947,6 +1073,44 @@ zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
 	return (0);
 }
 
+/* ARGSUSED */
+static int
+zfsctl_shares_readdir(ap)
+	struct vop_readdir_args /* {
+		struct vnode *a_vp;
+		struct uio *a_uio;
+		struct ucred *a_cred;
+		int *a_eofflag;
+		int *a_ncookies;
+		u_long **a_cookies;
+	} */ *ap;
+{
+	vnode_t *vp = ap->a_vp;
+	uio_t *uiop = ap->a_uio;
+	cred_t *cr = ap->a_cred;
+	int *eofp = ap->a_eofflag;
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	znode_t *dzp;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (zfsvfs->z_shares_dir == 0) {
+		ZFS_EXIT(zfsvfs);
+		return (ENOTSUP);
+	}
+	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
+		error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ap->a_ncookies, ap->a_cookies);
+		VN_RELE(ZTOV(dzp));
+	} else {
+		*eofp = 1;
+		error = ENOENT;
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
 /*
  * pvp is the '.zfs' directory (zfsctl_node_t).
  * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
@@ -973,6 +1137,51 @@ zfsctl_mknode_snapdir(vnode_t *pvp)
 	return (vp);
 }
 
+vnode_t *
+zfsctl_mknode_shares(vnode_t *pvp)
+{
+	vnode_t *vp;
+	zfsctl_node_t *sdp;
+
+	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
+	    &zfsctl_ops_shares, NULL, NULL, MAXNAMELEN,
+	    NULL, NULL);
+	sdp = vp->v_data;
+	sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
+	return (vp);
+
+}
+
+/* ARGSUSED */
+static int
+zfsctl_shares_getattr(ap)
+	struct vop_getattr_args /* {
+		struct vnode *a_vp;
+		struct vattr *a_vap;
+		struct ucred *a_cred;
+		struct thread *a_td;
+	} */ *ap;
+{
+	vnode_t *vp = ap->a_vp;
+	vattr_t *vap = ap->a_vap;
+	cred_t *cr = ap->a_cred;
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	znode_t *dzp;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	if (zfsvfs->z_shares_dir == 0) {
+		ZFS_EXIT(zfsvfs);
+		return (ENOTSUP);
+	}
+	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
+		error = VOP_GETATTR(ZTOV(dzp), vap, cr);
+		VN_RELE(ZTOV(dzp));
+	}
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
 /* ARGSUSED */
 static int
 zfsctl_snapdir_getattr(ap)
@@ -1061,7 +1270,6 @@ zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
 	VN_HOLD(vp);
 	zcp = vp->v_data;
 	zcp->zc_id = objset;
-	VFS_HOLD(vp->v_vfsp);
 	VOP_UNLOCK(vp, 0);
 
 	return (vp);
@@ -1112,7 +1320,6 @@ zfsctl_snapshot_inactive(ap)
 		mutex_exit(&sdp->sd_lock);
 	VN_RELE(dvp);
 end:
-	VFS_RELE(vp->v_vfsp);
 
 	/*
 	 * Dispose of the vnode for the snapshot mount point.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
index 34b17e4..3ac4741 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -581,24 +581,6 @@ zfs_rmnode(znode_t *zp)
 	ASSERT(zp->z_phys->zp_links == 0);
 
 	/*
-	 * If this is a ZIL replay then leave the object in the unlinked set.
-	 * Otherwise we can get a deadlock, because the delete can be
-	 * quite large and span multiple tx's and txgs, but each replay
-	 * creates a tx to atomically run the replay function and mark the
-	 * replay record as complete. We deadlock trying to start a tx in
-	 * a new txg to further the deletion but can't because the replay
-	 * tx hasn't finished.
-	 *
-	 * We actually delete the object if we get a failure to create an
-	 * object in zil_replay_log_record(), or after calling zil_replay().
-	 */
-	if (zfsvfs->z_assign >= TXG_INITIAL) {
-		zfs_znode_dmu_fini(zp);
-		zfs_znode_free(zp);
-		return;
-	}
-
-	/*
 	 * If this is an attribute directory, purge its contents.
 	 */
 	if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
@@ -842,7 +824,8 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
 	znode_t *xzp;
 	dmu_tx_t *tx;
 	int error;
-	zfs_fuid_info_t *fuidp = NULL;
+	zfs_acl_ids_t acl_ids;
+	boolean_t fuid_dirtied;
 
 	*xvpp = NULL;
 
@@ -855,37 +838,41 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
 		return (error);
 #endif
 
+	if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
+	    &acl_ids)) != 0)
+		return (error);
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+		zfs_acl_ids_free(&acl_ids);
+		return (EDQUOT);
+	}
+
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, zp->z_id);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
-	if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
-		if (zfsvfs->z_fuid_obj == 0) {
-			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
-		} else {
-			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-		}
-	}
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
+		zfs_acl_ids_free(&acl_ids);
+		if (error == ERESTART)
 			dmu_tx_wait(tx);
 		dmu_tx_abort(tx);
 		return (error);
 	}
-	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp);
+	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, &acl_ids);
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
 	ASSERT(xzp->z_phys->zp_parent == zp->z_id);
 	dmu_buf_will_dirty(zp->z_dbuf, tx);
 	zp->z_phys->zp_xattr = xzp->z_id;
 
 	(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
-	    xzp, "", NULL, fuidp, vap);
-	if (fuidp)
-		zfs_fuid_info_free(fuidp);
+	    xzp, "", NULL, acl_ids.z_fuidp, vap);
+
+	zfs_acl_ids_free(&acl_ids);
 	dmu_tx_commit(tx);
 
 	*xvpp = ZTOV(xzp);
@@ -959,7 +946,7 @@ top:
 	error = zfs_make_xattrdir(zp, &va, xvpp, cr);
 	zfs_dirent_unlock(dl);
 
-	if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+	if (error == ERESTART) {
 		/* NB: we already did dmu_tx_wait() if necessary */
 		goto top;
 	}
@@ -990,7 +977,7 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
 	uid_t		fowner;
 	zfsvfs_t	*zfsvfs = zdp->z_zfsvfs;
 
-	if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL)	/* ZIL replay */
+	if (zdp->z_zfsvfs->z_replay)
 		return (0);
 
 	if ((zdp->z_phys->zp_mode & S_ISVTX) == 0)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
index 17e4b0a..63ae13a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -101,7 +101,6 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
 	char buf[1024];
 	struct sbuf sb;
 	struct timespec ts;
-	int state;
 
 	/*
 	 * If we are doing a spa_tryimport(), ignore errors.
@@ -134,16 +133,31 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
 		if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
 			return;
 
-		/*
-		 * If the vdev has already been marked as failing due to a
-		 * failed probe, then ignore any subsequent I/O errors, as the
-		 * DE will automatically fault the vdev on the first such
-		 * failure.
-		 */
-		if (vd != NULL &&
-		    (!vdev_readable(vd) || !vdev_writeable(vd)) &&
-		    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0)
-			return;
+		if (vd != NULL) {
+			/*
+			 * If the vdev has already been marked as failing due
+			 * to a failed probe, then ignore any subsequent I/O
+			 * errors, as the DE will automatically fault the vdev
+			 * on the first such failure.  This also catches cases
+			 * where vdev_remove_wanted is set and the device has
+			 * not yet been asynchronously placed into the REMOVED
+			 * state.
+			 */
+			if (zio->io_vd == vd &&
+			    !vdev_accessible(vd, zio) &&
+			    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0)
+				return;
+
+			/*
+			 * Ignore checksum errors for reads from DTL regions of
+			 * leaf vdevs.
+			 */
+			if (zio->io_type == ZIO_TYPE_READ &&
+			    zio->io_error == ECKSUM &&
+			    vd->vdev_ops->vdev_op_leaf &&
+			    vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
+				return;
+		}
 	}
 	nanotime(&ts);
 
@@ -197,20 +211,13 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
 	 */
 
 	/*
-	 * If we are importing a faulted pool, then we treat it like an open,
-	 * not an import.  Otherwise, the DE will ignore all faults during
-	 * import, since the default behavior is to mark the devices as
-	 * persistently unavailable, not leave them in the faulted state.
-	 */
-	state = spa->spa_import_faulted ? SPA_LOAD_OPEN : spa->spa_load_state;
-
-	/*
 	 * Generic payload members common to all ereports.
 	 */
 	sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa));
 	sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
 	    spa_guid(spa));
-	sbuf_printf(&sb, " %s=%d", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, state);
+	sbuf_printf(&sb, " %s=%d", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT,
+	    spa->spa_load_state);
 
 	if (spa != NULL) {
 		sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
@@ -227,12 +234,15 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
 		    vd->vdev_guid);
 		sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
 		    vd->vdev_ops->vdev_op_type);
-		if (vd->vdev_path)
+		if (vd->vdev_path != NULL)
 			sbuf_printf(&sb, " %s=%s",
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path);
-		if (vd->vdev_devid)
+		if (vd->vdev_devid != NULL)
 			sbuf_printf(&sb, " %s=%s",
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid);
+		if (vd->vdev_fru != NULL)
+			sbuf_printf(&sb, " %s=%s",
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru);
 
 		if (pvd != NULL) {
 			sbuf_printf(&sb, " %s=%ju",
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
index dfec3ed..4d5b194 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -47,8 +47,10 @@
  * During file system initialization the nvlist(s) are read and
  * two AVL trees are created.  One tree is keyed by the index number
  * and the other by the domain string.  Nodes are never removed from
- * trees, but new entries may be added.  If a new entry is added then the
- * on-disk packed nvlist will also be updated.
+ * trees, but new entries may be added.  If a new entry is added then
+ * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then
+ * be responsible for calling zfs_fuid_sync() to sync the changes to disk.
+ *
  */
 
 #define	FUID_IDX	"fuid_idx"
@@ -97,6 +99,15 @@ domain_compare(const void *arg1, const void *arg2)
 	return (val > 0 ? 1 : -1);
 }
 
+void
+zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
+{
+	avl_create(idx_tree, idx_compare,
+	    sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode));
+	avl_create(domain_tree, domain_compare,
+	    sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode));
+}
+
 /*
  * load initial fuid domain and idx trees.  This function is used by
  * both the kernel and zdb.
@@ -108,12 +119,9 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
 	dmu_buf_t *db;
 	uint64_t fuid_size;
 
-	avl_create(idx_tree, idx_compare,
-	    sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode));
-	avl_create(domain_tree, domain_compare,
-	    sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode));
-
-	VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db));
+	ASSERT(fuid_obj != 0);
+	VERIFY(0 == dmu_bonus_hold(os, fuid_obj,
+	    FTAG, &db));
 	fuid_size = *(uint64_t *)db->db_data;
 	dmu_buf_rele(db, FTAG);
 
@@ -125,7 +133,8 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
 		int i;
 
 		packed = kmem_alloc(fuid_size, KM_SLEEP);
-		VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed) == 0);
+		VERIFY(dmu_read(os, fuid_obj, 0,
+		    fuid_size, packed, DMU_READ_PREFETCH) == 0);
 		VERIFY(nvlist_unpack(packed, fuid_size,
 		    &nvp, 0) == 0);
 		VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY,
@@ -189,10 +198,8 @@ zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx)
  * Load the fuid table(s) into memory.
  */
 static void
-zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+zfs_fuid_init(zfsvfs_t *zfsvfs)
 {
-	int error = 0;
-
 	rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
 
 	if (zfsvfs->z_fuid_loaded) {
@@ -200,41 +207,101 @@ zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
 		return;
 	}
 
-	if (zfsvfs->z_fuid_obj == 0) {
-
-		/* first make sure we need to allocate object */
-
-		error = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
-		    ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj);
-		if (error == ENOENT && tx != NULL) {
-			zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os,
-			    DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE,
-			    sizeof (uint64_t), tx);
-			VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
-			    ZFS_FUID_TABLES, sizeof (uint64_t), 1,
-			    &zfsvfs->z_fuid_obj, tx) == 0);
-		}
-	}
+	zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);
 
+	(void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
+	    ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj);
 	if (zfsvfs->z_fuid_obj != 0) {
 		zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os,
 		    zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx,
 		    &zfsvfs->z_fuid_domain);
-		zfsvfs->z_fuid_loaded = B_TRUE;
 	}
 
+	zfsvfs->z_fuid_loaded = B_TRUE;
+	rw_exit(&zfsvfs->z_fuid_lock);
+}
+
+/*
+ * sync out AVL trees to persistent storage.
+ */
+void
+zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+	nvlist_t *nvp;
+	nvlist_t **fuids;
+	size_t nvsize = 0;
+	char *packed;
+	dmu_buf_t *db;
+	fuid_domain_t *domnode;
+	int numnodes;
+	int i;
+
+	if (!zfsvfs->z_fuid_dirty) {
+		return;
+	}
+
+	rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
+
+	/*
+	 * First see if table needs to be created?
+	 */
+	if (zfsvfs->z_fuid_obj == 0) {
+		zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os,
+		    DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE,
+		    sizeof (uint64_t), tx);
+		VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+		    ZFS_FUID_TABLES, sizeof (uint64_t), 1,
+		    &zfsvfs->z_fuid_obj, tx) == 0);
+	}
+
+	VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+	numnodes = avl_numnodes(&zfsvfs->z_fuid_idx);
+	fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP);
+	for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++,
+	    domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) {
+		VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0);
+		VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
+		    domnode->f_idx) == 0);
+		VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0);
+		VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN,
+		    domnode->f_ksid->kd_name) == 0);
+	}
+	VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
+	    fuids, numnodes) == 0);
+	for (i = 0; i != numnodes; i++)
+		nvlist_free(fuids[i]);
+	kmem_free(fuids, numnodes * sizeof (void *));
+	VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
+	packed = kmem_alloc(nvsize, KM_SLEEP);
+	VERIFY(nvlist_pack(nvp, &packed, &nvsize,
+	    NV_ENCODE_XDR, KM_SLEEP) == 0);
+	nvlist_free(nvp);
+	zfsvfs->z_fuid_size = nvsize;
+	dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
+	    zfsvfs->z_fuid_size, packed, tx);
+	kmem_free(packed, zfsvfs->z_fuid_size);
+	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
+	    FTAG, &db));
+	dmu_buf_will_dirty(db, tx);
+	*(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
+	dmu_buf_rele(db, FTAG);
+
+	zfsvfs->z_fuid_dirty = B_FALSE;
 	rw_exit(&zfsvfs->z_fuid_lock);
 }
 
 /*
  * Query domain table for a given domain.
  *
- * If domain isn't found it is added to AVL trees and
- * the results are pushed out to disk.
+ * If domain isn't found and addok is set, it is added to AVL trees and
+ * the zfsvfs->z_fuid_dirty flag will be set to TRUE.  It will then be
+ * necessary for the caller or another thread to detect the dirty table
+ * and sync out the changes.
  */
 int
-zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain,
-    dmu_tx_t *tx)
+zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain,
+    char **retdomain, boolean_t addok)
 {
 	fuid_domain_t searchnode, *findnode;
 	avl_index_t loc;
@@ -246,16 +313,16 @@ zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain,
 	 * for the user nobody.
 	 */
 	if (domain[0] == '\0') {
-		*retdomain = nulldomain;
+		if (retdomain)
+			*retdomain = nulldomain;
 		return (0);
 	}
 
 	searchnode.f_ksid = ksid_lookupdomain(domain);
-	if (retdomain) {
+	if (retdomain)
 		*retdomain = searchnode.f_ksid->kd_name;
-	}
 	if (!zfsvfs->z_fuid_loaded)
-		zfs_fuid_init(zfsvfs, tx);
+		zfs_fuid_init(zfsvfs);
 
 retry:
 	rw_enter(&zfsvfs->z_fuid_lock, rw);
@@ -265,15 +332,9 @@ retry:
 		rw_exit(&zfsvfs->z_fuid_lock);
 		ksiddomain_rele(searchnode.f_ksid);
 		return (findnode->f_idx);
-	} else {
+	} else if (addok) {
 		fuid_domain_t *domnode;
-		nvlist_t *nvp;
-		nvlist_t **fuids;
 		uint64_t retidx;
-		size_t nvsize = 0;
-		char *packed;
-		dmu_buf_t *db;
-		int i = 0;
 
 		if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) {
 			rw_exit(&zfsvfs->z_fuid_lock);
@@ -288,46 +349,12 @@ retry:
 
 		avl_add(&zfsvfs->z_fuid_domain, domnode);
 		avl_add(&zfsvfs->z_fuid_idx, domnode);
-		/*
-		 * Now resync the on-disk nvlist.
-		 */
-		VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
-		domnode = avl_first(&zfsvfs->z_fuid_domain);
-		fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP);
-		while (domnode) {
-			VERIFY(nvlist_alloc(&fuids[i],
-			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
-			VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
-			    domnode->f_idx) == 0);
-			VERIFY(nvlist_add_uint64(fuids[i],
-			    FUID_OFFSET, 0) == 0);
-			VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN,
-			    domnode->f_ksid->kd_name) == 0);
-			domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode);
-		}
-		VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
-		    fuids, retidx) == 0);
-		for (i = 0; i != retidx; i++)
-			nvlist_free(fuids[i]);
-		kmem_free(fuids, retidx * sizeof (void *));
-		VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
-		packed = kmem_alloc(nvsize, KM_SLEEP);
-		VERIFY(nvlist_pack(nvp, &packed, &nvsize,
-		    NV_ENCODE_XDR, KM_SLEEP) == 0);
-		nvlist_free(nvp);
-		zfsvfs->z_fuid_size = nvsize;
-		dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
-		    zfsvfs->z_fuid_size, packed, tx);
-		kmem_free(packed, zfsvfs->z_fuid_size);
-		VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
-		    FTAG, &db));
-		dmu_buf_will_dirty(db, tx);
-		*(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
-		dmu_buf_rele(db, FTAG);
-
+		zfsvfs->z_fuid_dirty = B_TRUE;
 		rw_exit(&zfsvfs->z_fuid_lock);
 		return (retidx);
+	} else {
+		rw_exit(&zfsvfs->z_fuid_lock);
+		return (-1);
 	}
 }
 
@@ -337,7 +364,7 @@ retry:
  * Returns a pointer from an avl node of the domain string.
  *
  */
-static char *
+const char *
 zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
 {
 	char *domain;
@@ -346,7 +373,7 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
 		return (NULL);
 
 	if (!zfsvfs->z_fuid_loaded)
-		zfs_fuid_init(zfsvfs, NULL);
+		zfs_fuid_init(zfsvfs);
 
 	rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
 
@@ -374,7 +401,7 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
     cred_t *cr, zfs_fuid_type_t type)
 {
 	uint32_t index = FUID_INDEX(fuid);
-	char *domain;
+	const char *domain;
 	uid_t id;
 
 	if (index == 0)
@@ -443,6 +470,7 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
 	}
 
 	if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) {
+
 		/*
 		 * Now allocate fuid entry and add it on the end of the list
 		 */
@@ -467,7 +495,7 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
  */
 uint64_t
 zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
-    dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp)
+    cred_t *cr, zfs_fuid_info_t **fuidp)
 {
 	uint64_t	idx;
 	ksid_t		*ksid;
@@ -493,7 +521,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
 	rid = ksid_getrid(ksid);
 	domain = ksid_getdomain(ksid);
 
-	idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
+	idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
 
 	zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type);
 
@@ -517,7 +545,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
  */
 uint64_t
 zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
-    zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp)
+    zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp)
 {
 	const char *domain;
 	char *kdomain;
@@ -525,7 +553,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
 	uint32_t rid;
 	idmap_stat status;
 	uint64_t idx;
-	boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL);
 	zfs_fuid_t *zfuid = NULL;
 	zfs_fuid_info_t *fuidp;
 
@@ -540,7 +567,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
 	if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0)
 		return (id);
 
-	if (is_replay) {
+	if (zfsvfs->z_replay) {
 		fuidp = zfsvfs->z_fuid_replay;
 
 		/*
@@ -592,10 +619,11 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
 #endif
 	}
 
-	idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
+	idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
 
-	if (!is_replay)
-		zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type);
+	if (!zfsvfs->z_replay)
+		zfs_fuid_node_add(fuidpp, kdomain,
+		    rid, idx, id, type);
 	else if (zfuid != NULL) {
 		list_remove(&fuidp->z_fuids, zfuid);
 		kmem_free(zfuid, sizeof (zfs_fuid_t));
@@ -668,11 +696,14 @@ zfs_fuid_info_free(zfs_fuid_info_t *fuidp)
 boolean_t
 zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
 {
+#ifdef sun
 	ksid_t		*ksid = crgetsid(cr, KSID_GROUP);
+	ksidlist_t	*ksidlist = crgetsidlist(cr);
+#endif	/* sun */
 	uid_t		gid;
 
-#ifdef TODO
-	if (ksid) {
+#ifdef sun
+	if (ksid && ksidlist) {
 		int 		i;
 		ksid_t		*ksid_groups;
 		ksidlist_t	*ksidlist = crgetsidlist(cr);
@@ -689,7 +720,7 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
 					return (B_TRUE);
 				}
 			} else {
-				char *domain;
+				const char *domain;
 
 				domain = zfs_fuid_find_by_idx(zfsvfs, idx);
 				ASSERT(domain != NULL);
@@ -705,7 +736,7 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
 			}
 		}
 	}
-#endif
+#endif	/* sun */
 
 	/*
 	 * Not found in ksidlist, check posix groups
@@ -713,4 +744,19 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
 	gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP);
 	return (groupmember(gid, cr));
 }
+
+void
+zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+	if (zfsvfs->z_fuid_obj == 0) {
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+		    FUID_SIZE_ESTIMATE(zfsvfs));
+		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
+	} else {
+		dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
+		dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
+		    FUID_SIZE_ESTIMATE(zfsvfs));
+	}
+}
 #endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
index 75f1ad0..6b6fc75 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -41,6 +41,7 @@
 #include <sys/cmn_err.h>
 #include <sys/stat.h>
 #include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
 #include <sys/zfs_znode.h>
 #include <sys/zap.h>
 #include <sys/spa.h>
@@ -81,17 +82,29 @@ extern void zfs_fini(void);
 typedef int zfs_ioc_func_t(zfs_cmd_t *);
 typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *);
 
+typedef enum {
+	NO_NAME,
+	POOL_NAME,
+	DATASET_NAME
+} zfs_ioc_namecheck_t;
+
 typedef struct zfs_ioc_vec {
 	zfs_ioc_func_t		*zvec_func;
 	zfs_secpolicy_func_t	*zvec_secpolicy;
-	enum {
-		NO_NAME,
-		POOL_NAME,
-		DATASET_NAME
-	} zvec_namecheck;
+	zfs_ioc_namecheck_t	zvec_namecheck;
 	boolean_t		zvec_his_log;
+	boolean_t		zvec_pool_check;
 } zfs_ioc_vec_t;
 
+/* This array is indexed by zfs_userquota_prop_t */
+static const char *userquota_perms[] = {
+	ZFS_DELEG_PERM_USERUSED,
+	ZFS_DELEG_PERM_USERQUOTA,
+	ZFS_DELEG_PERM_GROUPUSED,
+	ZFS_DELEG_PERM_GROUPQUOTA,
+};
+
+static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc);
 static void clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops);
 static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
     boolean_t *);
@@ -391,6 +404,30 @@ zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr)
 	    ZFS_DELEG_PERM_SEND, cr));
 }
 
+static int
+zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr)
+{
+	vnode_t *vp;
+	int error;
+
+	if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
+	    NO_FOLLOW, NULL, &vp)) != 0)
+		return (error);
+
+	/* Now make sure mntpnt and dataset are ZFS */
+
+	if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
+	    (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
+	    zc->zc_name) != 0)) {
+		VN_RELE(vp);
+		return (EPERM);
+	}
+
+	VN_RELE(vp);
+	return (dsl_deleg_access(zc->zc_name,
+	    ZFS_DELEG_PERM_SHARE, cr));
+}
+
 int
 zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr)
 {
@@ -400,25 +437,20 @@ zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr)
 	if (secpolicy_nfs(cr) == 0) {
 		return (0);
 	} else {
-		vnode_t *vp;
-		int error;
-
-		if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
-		    NO_FOLLOW, NULL, &vp)) != 0)
-			return (error);
-
-		/* Now make sure mntpnt and dataset are ZFS */
+		return (zfs_secpolicy_deleg_share(zc, cr));
+	}
+}
 
-		if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
-		    (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
-		    zc->zc_name) != 0)) {
-			VN_RELE(vp);
-			return (EPERM);
-		}
+int
+zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr)
+{
+	if (!INGLOBALZONE(curthread))
+		return (EPERM);
 
-		VN_RELE(vp);
-		return (dsl_deleg_access(zc->zc_name,
-		    ZFS_DELEG_PERM_SHARE, cr));
+	if (secpolicy_smb(cr) == 0) {
+		return (0);
+	} else {
+		return (zfs_secpolicy_deleg_share(zc, cr));
 	}
 }
 
@@ -699,6 +731,55 @@ zfs_secpolicy_operator(const char *dataset, cred_t *cr)
 	return (0);
 }
 
+static int
+zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr)
+{
+	int err = zfs_secpolicy_read(zc, cr);
+	if (err)
+		return (err);
+
+	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+		return (EINVAL);
+
+	if (zc->zc_value[0] == 0) {
+		/*
+		 * They are asking about a posix uid/gid.  If it's
+		 * themself, allow it.
+		 */
+		if (zc->zc_objset_type == ZFS_PROP_USERUSED ||
+		    zc->zc_objset_type == ZFS_PROP_USERQUOTA) {
+			if (zc->zc_guid == crgetuid(cr))
+				return (0);
+		} else {
+			if (groupmember(zc->zc_guid, cr))
+				return (0);
+		}
+	}
+
+	return (zfs_secpolicy_write_perms(zc->zc_name,
+	    userquota_perms[zc->zc_objset_type], cr));
+}
+
+static int
+zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr)
+{
+	int err = zfs_secpolicy_read(zc, cr);
+	if (err)
+		return (err);
+
+	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+		return (EINVAL);
+
+	return (zfs_secpolicy_write_perms(zc->zc_name,
+	    userquota_perms[zc->zc_objset_type], cr));
+}
+
+static int
+zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr)
+{
+	return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, cr));
+}
+
 /*
  * Returns the nvlist as specified by the user in the zfs_cmd_t.
  */
@@ -766,6 +847,69 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
 }
 
 static int
+getzfsvfs(const char *dsname, zfsvfs_t **zvp)
+{
+	objset_t *os;
+	int error;
+
+	error = dmu_objset_open(dsname, DMU_OST_ZFS,
+	    DS_MODE_USER | DS_MODE_READONLY, &os);
+	if (error)
+		return (error);
+
+	mutex_enter(&os->os->os_user_ptr_lock);
+	*zvp = dmu_objset_get_user(os);
+	if (*zvp) {
+		VFS_HOLD((*zvp)->z_vfs);
+	} else {
+		error = ESRCH;
+	}
+	mutex_exit(&os->os->os_user_ptr_lock);
+	dmu_objset_close(os);
+	return (error);
+}
+
+/*
+ * Find a zfsvfs_t for a mounted filesystem, or create our own, in which
+ * case its z_vfs will be NULL, and it will be opened as the owner.
+ */
+static int
+zfsvfs_hold(const char *name, boolean_t readonly, void *tag, zfsvfs_t **zvp)
+{
+	int error = 0;
+	int mode = DS_MODE_OWNER | (readonly ? DS_MODE_READONLY : 0);
+
+	if (getzfsvfs(name, zvp) != 0)
+		error = zfsvfs_create(name, mode, zvp);
+	if (error == 0) {
+		rrw_enter(&(*zvp)->z_teardown_lock, RW_READER, tag);
+		if ((*zvp)->z_unmounted) {
+			/*
+			 * XXX we could probably try again, since the unmounting
+			 * thread should be just about to disassociate the
+			 * objset from the zfsvfs.
+			 */
+			rrw_exit(&(*zvp)->z_teardown_lock, tag);
+			return (EBUSY);
+		}
+	}
+	return (error);
+}
+
+static void
+zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
+{
+	rrw_exit(&zfsvfs->z_teardown_lock, tag);
+
+	if (zfsvfs->z_vfs) {
+		VFS_RELE(zfsvfs->z_vfs);
+	} else {
+		dmu_objset_close(zfsvfs->z_os);
+		zfsvfs_free(zfsvfs);
+	}
+}
+
+static int
 zfs_ioc_pool_create(zfs_cmd_t *zc)
 {
 	int error;
@@ -864,7 +1008,7 @@ zfs_ioc_pool_import(zfs_cmd_t *zc)
 	    guid != zc->zc_guid)
 		error = EINVAL;
 	else if (zc->zc_cookie)
-		error = spa_import_faulted(zc->zc_name, config,
+		error = spa_import_verbatim(zc->zc_name, config,
 		    props);
 	else
 		error = spa_import(zc->zc_name, config, props);
@@ -1189,7 +1333,7 @@ zfs_ioc_vdev_detach(zfs_cmd_t *zc)
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
-	error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE);
+	error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE);
 
 	spa_close(spa, FTAG);
 	return (error);
@@ -1212,6 +1356,23 @@ zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
 	return (error);
 }
 
+static int
+zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
+{
+	spa_t *spa;
+	char *fru = zc->zc_value;
+	uint64_t guid = zc->zc_guid;
+	int error;
+
+	error = spa_open(zc->zc_name, &spa, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = spa_vdev_setfru(spa, guid, fru);
+	spa_close(spa, FTAG);
+	return (error);
+}
+
 /*
  * inputs:
  * zc_name		name of filesystem
@@ -1319,6 +1480,23 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
 	return (err);
 }
 
+static boolean_t
+dataset_name_hidden(const char *name)
+{
+	/*
+	 * Skip over datasets that are not visible in this zone,
+	 * internal datasets (which have a $ in their name), and
+	 * temporary datasets (which have a % in their name).
+	 */
+	if (strchr(name, '$') != NULL)
+		return (B_TRUE);
+	if (strchr(name, '%') != NULL)
+		return (B_TRUE);
+	if (!INGLOBALZONE(curthread) && !zone_dataset_visible(name, NULL))
+		return (B_TRUE);
+	return (B_FALSE);
+}
+
 /*
  * inputs:
  * zc_name		name of filesystem
@@ -1327,6 +1505,7 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
  *
  * outputs:
  * zc_name		name of next filesystem
+ * zc_cookie		zap cursor
  * zc_objset_stats	stats
  * zc_nvlist_dst	property nvlist
  * zc_nvlist_dst_size	size of property nvlist
@@ -1350,12 +1529,16 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
 		(void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
 	p = zc->zc_name + strlen(zc->zc_name);
 
+	/*
+	 * Pre-fetch the datasets.  dmu_objset_prefetch() always returns 0
+	 * but is not declared void because its called by dmu_objset_find().
+	 */
 	if (zc->zc_cookie == 0) {
 		uint64_t cookie = 0;
 		int len = sizeof (zc->zc_name) - (p - zc->zc_name);
 
 		while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0)
-			dmu_objset_prefetch(p, NULL);
+			(void) dmu_objset_prefetch(p, NULL);
 	}
 
 	do {
@@ -1364,15 +1547,10 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
 		    NULL, &zc->zc_cookie);
 		if (error == ENOENT)
 			error = ESRCH;
-	} while (error == 0 && !INGLOBALZONE(curthread) &&
-	    !zone_dataset_visible(zc->zc_name, NULL));
+	} while (error == 0 && dataset_name_hidden(zc->zc_name));
 	dmu_objset_close(os);
 
-	/*
-	 * If it's a hidden dataset (ie. with a '$' in its name), don't
-	 * try to get stats for it.  Userland will skip over it.
-	 */
-	if (error == 0 && strchr(zc->zc_name, '$') == NULL)
+	if (error == 0)
 		error = zfs_ioc_objset_stats(zc); /* fill in the stats */
 
 	return (error);
@@ -1396,14 +1574,15 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
 	objset_t *os;
 	int error;
 
-	if (zc->zc_cookie == 0)
-		dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
-		    NULL, DS_FIND_SNAPSHOTS);
 	error = dmu_objset_open(zc->zc_name,
 	    DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os);
 	if (error)
 		return (error == ENOENT ? ESRCH : error);
 
+	if (zc->zc_cookie == 0) {
+		(void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
+		    NULL, DS_FIND_SNAPSHOTS);
+	}
 	/*
 	 * A dataset name of maximum length cannot have any snapshots,
 	 * so exit immediately.
@@ -1432,13 +1611,16 @@ int
 zfs_set_prop_nvlist(const char *name, nvlist_t *nvl)
 {
 	nvpair_t *elem;
-	int error;
+	int error = 0;
 	uint64_t intval;
 	char *strval;
+	nvlist_t *genericnvl;
+	boolean_t issnap = (strchr(name, '@') != NULL);
 
 	/*
 	 * First validate permission to set all of the properties
 	 */
+	VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
 		const char *propname = nvpair_name(elem);
@@ -1449,16 +1631,35 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl)
 			 * If this is a user-defined property, it must be a
 			 * string, and there is no further validation to do.
 			 */
-			if (!zfs_prop_user(propname) ||
-			    nvpair_type(elem) != DATA_TYPE_STRING)
-				return (EINVAL);
+			if (zfs_prop_user(propname) &&
+			    nvpair_type(elem) == DATA_TYPE_STRING) {
+				if (error = zfs_secpolicy_write_perms(name,
+				    ZFS_DELEG_PERM_USERPROP, CRED()))
+					return (error);
+				continue;
+			}
 
-			if (error = zfs_secpolicy_write_perms(name,
-			    ZFS_DELEG_PERM_USERPROP, CRED()))
-				return (error);
-			continue;
+			if (!issnap && zfs_prop_userquota(propname) &&
+			    nvpair_type(elem) == DATA_TYPE_UINT64_ARRAY) {
+				const char *perm;
+				const char *up = zfs_userquota_prop_prefixes
+				    [ZFS_PROP_USERQUOTA];
+				if (strncmp(propname, up, strlen(up)) == 0)
+					perm = ZFS_DELEG_PERM_USERQUOTA;
+				else
+					perm = ZFS_DELEG_PERM_GROUPQUOTA;
+				if (error = zfs_secpolicy_write_perms(name,
+				    perm, CRED()))
+					return (error);
+				continue;
+			}
+
+			return (EINVAL);
 		}
 
+		if (issnap)
+			return (EINVAL);
+
 		if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0)
 			return (error);
 
@@ -1494,8 +1695,7 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl)
 			break;
 
 		case ZFS_PROP_COPIES:
-			if (zfs_earlier_version(name,
-			    SPA_VERSION_DITTO_BLOCKS))
+			if (zfs_earlier_version(name, SPA_VERSION_DITTO_BLOCKS))
 				return (ENOTSUP);
 			break;
 
@@ -1520,71 +1720,115 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl)
 		zfs_prop_t prop = zfs_name_to_prop(propname);
 
 		if (prop == ZPROP_INVAL) {
-			VERIFY(nvpair_value_string(elem, &strval) == 0);
-			error = dsl_prop_set(name, propname, 1,
-			    strlen(strval) + 1, strval);
-			if (error == 0)
-				continue;
-			else
-				return (error);
+			if (zfs_prop_userquota(propname)) {
+				uint64_t *valary;
+				unsigned int vallen;
+				const char *domain;
+				zfs_userquota_prop_t type;
+				uint64_t rid;
+				uint64_t quota;
+				zfsvfs_t *zfsvfs;
+
+				VERIFY(nvpair_value_uint64_array(elem,
+				    &valary, &vallen) == 0);
+				VERIFY(vallen == 3);
+				type = valary[0];
+				rid = valary[1];
+				quota = valary[2];
+				domain = propname +
+				    strlen(zfs_userquota_prop_prefixes[type]);
+
+				error = zfsvfs_hold(name, B_FALSE, FTAG,
+				    &zfsvfs);
+				if (error == 0) {
+					error = zfs_set_userquota(zfsvfs,
+					    type, domain, rid, quota);
+					zfsvfs_rele(zfsvfs, FTAG);
+				}
+				if (error == 0)
+					continue;
+				else
+					goto out;
+			} else if (zfs_prop_user(propname)) {
+				VERIFY(nvpair_value_string(elem, &strval) == 0);
+				error = dsl_prop_set(name, propname, 1,
+				    strlen(strval) + 1, strval);
+				if (error == 0)
+					continue;
+				else
+					goto out;
+			}
 		}
 
 		switch (prop) {
 		case ZFS_PROP_QUOTA:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = dsl_dir_set_quota(name, intval)) != 0)
-				return (error);
+				goto out;
 			break;
 
 		case ZFS_PROP_REFQUOTA:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = dsl_dataset_set_quota(name, intval)) != 0)
-				return (error);
+				goto out;
 			break;
 
 		case ZFS_PROP_RESERVATION:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = dsl_dir_set_reservation(name,
 			    intval)) != 0)
-				return (error);
+				goto out;
 			break;
 
 		case ZFS_PROP_REFRESERVATION:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = dsl_dataset_set_reservation(name,
 			    intval)) != 0)
-				return (error);
+				goto out;
 			break;
 
 		case ZFS_PROP_VOLSIZE:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = zvol_set_volsize(name,
 			    ddi_driver_major(zfs_dip), intval)) != 0)
-				return (error);
+				goto out;
 			break;
 
 		case ZFS_PROP_VOLBLOCKSIZE:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = zvol_set_volblocksize(name, intval)) != 0)
-				return (error);
+				goto out;
 			break;
 
 		case ZFS_PROP_VERSION:
-			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
-			    (error = zfs_set_version(name, intval)) != 0)
-				return (error);
+		{
+			zfsvfs_t *zfsvfs;
+
+			if ((error = nvpair_value_uint64(elem, &intval)) != 0)
+				goto out;
+			if ((error = zfsvfs_hold(name, B_FALSE, FTAG,
+			    &zfsvfs)) != 0)
+			        goto out;
+			error = zfs_set_version(zfsvfs, intval);
+			zfsvfs_rele(zfsvfs, FTAG);
+
+			if (error == 0 && intval >= ZPL_VERSION_USERSPACE) {
+				zfs_cmd_t zc = { 0 };
+				(void) strcpy(zc.zc_name, name);
+				(void) zfs_ioc_userspace_upgrade(&zc);
+			}
+			if (error)
+				goto out;
 			break;
+		}
 
 		default:
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				if (zfs_prop_get_type(prop) !=
-				    PROP_TYPE_STRING)
-					return (EINVAL);
-				VERIFY(nvpair_value_string(elem, &strval) == 0);
-				if ((error = dsl_prop_set(name,
-				    nvpair_name(elem), 1, strlen(strval) + 1,
-				    strval)) != 0)
-					return (error);
+				    PROP_TYPE_STRING) {
+					error = EINVAL;
+					goto out;
+				}
 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
 				const char *unused;
 
@@ -1594,35 +1838,72 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl)
 				case PROP_TYPE_NUMBER:
 					break;
 				case PROP_TYPE_STRING:
-					return (EINVAL);
+					error = EINVAL;
+					goto out;
 				case PROP_TYPE_INDEX:
 					if (zfs_prop_index_to_string(prop,
-					    intval, &unused) != 0)
-						return (EINVAL);
+					    intval, &unused) != 0) {
+						error = EINVAL;
+						goto out;
+					}
 					break;
 				default:
 					cmn_err(CE_PANIC,
 					    "unknown property type");
 					break;
 				}
-
-				if ((error = dsl_prop_set(name, propname,
-				    8, 1, &intval)) != 0)
-					return (error);
 			} else {
-				return (EINVAL);
+				error = EINVAL;
+				goto out;
 			}
-			break;
+			if ((error = nvlist_add_nvpair(genericnvl, elem)) != 0)
+				goto out;
 		}
 	}
 
+	if (nvlist_next_nvpair(genericnvl, NULL) != NULL) {
+		error = dsl_props_set(name, genericnvl);
+	}
+out:
+	nvlist_free(genericnvl);
+	return (error);
+}
+
+/*
+ * Check that all the properties are valid user properties.
+ */
+static int
+zfs_check_userprops(char *fsname, nvlist_t *nvl)
+{
+	nvpair_t *elem = NULL;
+	int error = 0;
+
+	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+		const char *propname = nvpair_name(elem);
+		char *valstr;
+
+		if (!zfs_prop_user(propname) ||
+		    nvpair_type(elem) != DATA_TYPE_STRING)
+			return (EINVAL);
+
+		if (error = zfs_secpolicy_write_perms(fsname,
+		    ZFS_DELEG_PERM_USERPROP, CRED()))
+			return (error);
+
+		if (strlen(propname) >= ZAP_MAXNAMELEN)
+			return (ENAMETOOLONG);
+
+		VERIFY(nvpair_value_string(elem, &valstr) == 0);
+		if (strlen(valstr) >= ZAP_MAXVALUELEN)
+			return (E2BIG);
+	}
 	return (0);
 }
 
 /*
  * inputs:
  * zc_name		name of filesystem
- * zc_value		name of property to inherit
+ * zc_value		name of property to set
  * zc_nvlist_src{_size}	nvlist of properties to apply
  * zc_cookie		clear existing local props?
  *
@@ -1679,11 +1960,30 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc)
 	nvlist_t *props;
 	spa_t *spa;
 	int error;
+	nvpair_t *elem;
 
 	if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
 	    &props)))
 		return (error);
 
+	/*
+	 * If the only property is the configfile, then just do a spa_lookup()
+	 * to handle the faulted case.
+	 */
+	elem = nvlist_next_nvpair(props, NULL);
+	if (elem != NULL && strcmp(nvpair_name(elem),
+	    zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
+	    nvlist_next_nvpair(props, elem) == NULL) {
+		mutex_enter(&spa_namespace_lock);
+		if ((spa = spa_lookup(zc->zc_name)) != NULL) {
+			spa_configfile_set(spa, props, B_FALSE);
+			spa_config_sync(spa, B_FALSE, B_TRUE);
+		}
+		mutex_exit(&spa_namespace_lock);
+		if (spa != NULL)
+			return (0);
+	}
+
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
 		nvlist_free(props);
 		return (error);
@@ -1704,27 +2004,34 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc)
 	int error;
 	nvlist_t *nvp = NULL;
 
-	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
-		return (error);
-
-	error = spa_prop_get(spa, &nvp);
+	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
+		/*
+		 * If the pool is faulted, there may be properties we can still
+		 * get (such as altroot and cachefile), so attempt to get them
+		 * anyway.
+		 */
+		mutex_enter(&spa_namespace_lock);
+		if ((spa = spa_lookup(zc->zc_name)) != NULL)
+			error = spa_prop_get(spa, &nvp);
+		mutex_exit(&spa_namespace_lock);
+	} else {
+		error = spa_prop_get(spa, &nvp);
+		spa_close(spa, FTAG);
+	}
 
 	if (error == 0 && zc->zc_nvlist_dst != 0)
 		error = put_nvlist(zc, nvp);
 	else
 		error = EFAULT;
 
-	spa_close(spa, FTAG);
-
-	if (nvp)
-		nvlist_free(nvp);
+	nvlist_free(nvp);
 	return (error);
 }
 
 static int
 zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc)
 {
-#ifdef TODO
+#ifdef sun
 	nvlist_t *nvp;
 	int error;
 	uint32_t uid;
@@ -1767,9 +2074,9 @@ zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc)
 	    zfs_prop_to_name(ZFS_PROP_SHAREISCSI), usercred);
 	crfree(usercred);
 	return (error);
-#else
+#else	/* sun */
 	return (EPERM);
-#endif
+#endif	/* sun */
 }
 
 /*
@@ -1920,11 +2227,10 @@ zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
  * processing.
  */
 static int
-zfs_fill_zplprops_impl(objset_t *os, uint64_t default_zplver,
+zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
     boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops,
     boolean_t *is_ci)
 {
-	uint64_t zplver = default_zplver;
 	uint64_t sense = ZFS_PROP_UNDEFINED;
 	uint64_t norm = ZFS_PROP_UNDEFINED;
 	uint64_t u8 = ZFS_PROP_UNDEFINED;
@@ -2012,6 +2318,8 @@ zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
 	ASSERT(cp != NULL);
 	cp[0] = '\0';
 
+	if (zfs_earlier_version(dataset, SPA_VERSION_USERSPACE))
+		zplver = ZPL_VERSION_USERSPACE - 1;
 	if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) {
 		zplver = ZPL_VERSION_FUID - 1;
 		fuids_ok = B_FALSE;
@@ -2190,32 +2498,12 @@ zfs_ioc_create(zfs_cmd_t *zc)
 	return (error);
 }
 
-struct snap_prop_arg {
-	nvlist_t *nvprops;
-	const char *snapname;
-};
-
-static int
-set_snap_props(char *name, void *arg)
-{
-	struct snap_prop_arg *snpa = arg;
-	int len = strlen(name) + strlen(snpa->snapname) + 2;
-	char *buf = kmem_alloc(len, KM_SLEEP);
-	int err;
-
-	(void) snprintf(buf, len, "%s@%s", name, snpa->snapname);
-	err = zfs_set_prop_nvlist(buf, snpa->nvprops);
-	if (err)
-		(void) dmu_objset_destroy(buf);
-	kmem_free(buf, len);
-	return (err);
-}
-
 /*
  * inputs:
  * zc_name	name of filesystem
  * zc_value	short name of snapshot
  * zc_cookie	recursive flag
+ * zc_nvlist_src[_size] property list
  *
  * outputs:	none
  */
@@ -2234,26 +2522,20 @@ zfs_ioc_snapshot(zfs_cmd_t *zc)
 	    &nvprops)) != 0)
 		return (error);
 
-	error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, recursive);
+	error = zfs_check_userprops(zc->zc_name, nvprops);
+	if (error)
+		goto out;
 
-	/*
-	 * It would be nice to do this atomically.
-	 */
-	if (error == 0) {
-		struct snap_prop_arg snpa;
-		snpa.nvprops = nvprops;
-		snpa.snapname = zc->zc_value;
-		if (recursive) {
-			error = dmu_objset_find(zc->zc_name,
-			    set_snap_props, &snpa, DS_FIND_CHILDREN);
-			if (error) {
-				(void) dmu_snapshots_destroy(zc->zc_name,
-				    zc->zc_value);
-			}
-		} else {
-			error = set_snap_props(zc->zc_name, &snpa);
-		}
+	if (nvprops != NULL && nvlist_next_nvpair(nvprops, NULL) != NULL &&
+	    zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) {
+		error = ENOTSUP;
+		goto out;
 	}
+
+	error = dmu_objset_snapshot(zc->zc_name, zc->zc_value,
+	    nvprops, recursive);
+
+out:
 	nvlist_free(nvprops);
 	return (error);
 }
@@ -2358,31 +2640,19 @@ zfs_ioc_rollback(zfs_cmd_t *zc)
 	if (error)
 		return (error);
 
-	if (dmu_objset_type(os) == DMU_OST_ZFS) {
-		mutex_enter(&os->os->os_user_ptr_lock);
-		zfsvfs = dmu_objset_get_user(os);
-		if (zfsvfs != NULL)
-			VFS_HOLD(zfsvfs->z_vfs);
-		mutex_exit(&os->os->os_user_ptr_lock);
-	}
-
-	if (zfsvfs != NULL) {
-		char *osname;
+	if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
 		int mode;
 
-		osname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
-		error = zfs_suspend_fs(zfsvfs, osname, &mode);
+		error = zfs_suspend_fs(zfsvfs, NULL, &mode);
 		if (error == 0) {
 			int resume_err;
 
-			ASSERT(strcmp(osname, zc->zc_name) == 0);
 			error = dmu_objset_rollback(os);
-			resume_err = zfs_resume_fs(zfsvfs, osname, mode);
+			resume_err = zfs_resume_fs(zfsvfs, zc->zc_name, mode);
 			error = error ? error : resume_err;
 		} else {
 			dmu_objset_close(os);
 		}
-		kmem_free(osname, MAXNAMELEN);
 		VFS_RELE(zfsvfs->z_vfs);
 	} else {
 		error = dmu_objset_rollback(os);
@@ -2497,32 +2767,26 @@ zfs_ioc_recv(zfs_cmd_t *zc)
 		return (EBADF);
 	}
 
-	if (dmu_objset_open(tofs, DMU_OST_ANY,
-	    DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
-		/*
-		 * Try to get the zfsvfs for the receiving objset.
-		 * There won't be one if we're operating on a zvol,
-		 * if the objset doesn't exist yet, or is not mounted.
-		 */
-		mutex_enter(&os->os->os_user_ptr_lock);
-		if (zfsvfs = dmu_objset_get_user(os)) {
-			if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) {
-				mutex_exit(&os->os->os_user_ptr_lock);
-				dmu_objset_close(os);
-				zfsvfs = NULL;
-				error = EBUSY;
-				goto out;
-			}
-			VFS_HOLD(zfsvfs->z_vfs);
+	if (getzfsvfs(tofs, &zfsvfs) == 0) {
+		if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) {
+			VFS_RELE(zfsvfs->z_vfs);
+			zfsvfs = NULL;
+			error = EBUSY;
+			goto out;
 		}
-		mutex_exit(&os->os->os_user_ptr_lock);
-
 		/*
 		 * If new properties are supplied, they are to completely
 		 * replace the existing ones, so stash away the existing ones.
 		 */
 		if (props)
-			(void) dsl_prop_get_all(os, &origprops, TRUE);
+			(void) dsl_prop_get_all(zfsvfs->z_os, &origprops, TRUE);
+	} else if (props && dmu_objset_open(tofs, DMU_OST_ANY,
+	    DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
+		/*
+		 * Get the props even if there was no zfsvfs (zvol or
+		 * unmounted zpl).
+		 */
+		(void) dsl_prop_get_all(os, &origprops, TRUE);
 
 		dmu_objset_close(os);
 	}
@@ -2762,11 +3026,12 @@ zfs_ioc_clear(zfs_cmd_t *zc)
 	/*
 	 * Resume any suspended I/Os.
 	 */
-	zio_resume(spa);
+	if (zio_resume(spa) != 0)
+		error = EIO;
 
 	spa_close(spa, FTAG);
 
-	return (0);
+	return (error);
 }
 
 /*
@@ -2793,7 +3058,121 @@ zfs_ioc_promote(zfs_cmd_t *zc)
 	return (dsl_dataset_promote(zc->zc_name));
 }
 
-#ifdef TODO
+/*
+ * Retrieve a single {user|group}{used|quota}@... property.
+ *
+ * inputs:
+ * zc_name	name of filesystem
+ * zc_objset_type zfs_userquota_prop_t
+ * zc_value	domain name (eg. "S-1-234-567-89")
+ * zc_guid	RID/UID/GID
+ *
+ * outputs:
+ * zc_cookie	property value
+ */
+static int
+zfs_ioc_userspace_one(zfs_cmd_t *zc)
+{
+	zfsvfs_t *zfsvfs;
+	int error;
+
+	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+		return (EINVAL);
+
+	error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs);
+	if (error)
+		return (error);
+
+	error = zfs_userspace_one(zfsvfs,
+	    zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie);
+	zfsvfs_rele(zfsvfs, FTAG);
+
+	return (error);
+}
+
+/*
+ * inputs:
+ * zc_name		name of filesystem
+ * zc_cookie		zap cursor
+ * zc_objset_type	zfs_userquota_prop_t
+ * zc_nvlist_dst[_size] buffer to fill (not really an nvlist)
+ *
+ * outputs:
+ * zc_nvlist_dst[_size]	data buffer (array of zfs_useracct_t)
+ * zc_cookie	zap cursor
+ */
+static int
+zfs_ioc_userspace_many(zfs_cmd_t *zc)
+{
+	zfsvfs_t *zfsvfs;
+	int error;
+
+	error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs);
+	if (error)
+		return (error);
+
+	int bufsize = zc->zc_nvlist_dst_size;
+	void *buf = kmem_alloc(bufsize, KM_SLEEP);
+
+	error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
+	    buf, &zc->zc_nvlist_dst_size);
+
+	if (error == 0) {
+		error = xcopyout(buf,
+		    (void *)(uintptr_t)zc->zc_nvlist_dst,
+		    zc->zc_nvlist_dst_size);
+	}
+	kmem_free(buf, bufsize);
+	zfsvfs_rele(zfsvfs, FTAG);
+
+	return (error);
+}
+
+/*
+ * inputs:
+ * zc_name		name of filesystem
+ *
+ * outputs:
+ * none
+ */
+static int
+zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
+{
+	objset_t *os;
+	int error;
+	zfsvfs_t *zfsvfs;
+
+	if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
+		if (!dmu_objset_userused_enabled(zfsvfs->z_os->os)) {
+			/*
+			 * If userused is not enabled, it may be because the
+			 * objset needs to be closed & reopened (to grow the
+			 * objset_phys_t).  Suspend/resume the fs will do that.
+			 */
+			int mode;
+			error = zfs_suspend_fs(zfsvfs, NULL, &mode);
+			if (error == 0) {
+				error = zfs_resume_fs(zfsvfs,
+				    zc->zc_name, mode);
+			}
+		}
+		if (error == 0)
+			error = dmu_objset_userspace_upgrade(zfsvfs->z_os);
+		VFS_RELE(zfsvfs->z_vfs);
+	} else {
+		error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+		    DS_MODE_USER, &os);
+		if (error)
+			return (error);
+
+		error = dmu_objset_userspace_upgrade(os);
+		dmu_objset_close(os);
+	}
+
+	return (error);
+}
+
+#ifdef sun
 /*
  * We don't want to have a hard dependency
  * against some special symbols in sharefs
@@ -2811,10 +3190,10 @@ int zfs_smbshare_inited;
 ddi_modhandle_t nfs_mod;
 ddi_modhandle_t sharefs_mod;
 ddi_modhandle_t smbsrv_mod;
-#endif
+#endif	/* sun */
 kmutex_t zfs_share_lock;
 
-#ifdef TODO
+#ifdef sun
 static int
 zfs_init_sharefs()
 {
@@ -2834,12 +3213,12 @@ zfs_init_sharefs()
 	}
 	return (0);
 }
-#endif
+#endif /* sun */
 
 static int
 zfs_ioc_share(zfs_cmd_t *zc)
 {
-#ifdef TODO
+#ifdef sun
 	int error;
 	int opcode;
 
@@ -2911,7 +3290,7 @@ zfs_ioc_share(zfs_cmd_t *zc)
 		if (error = zsmbexport_fs((void *)
 		    (uintptr_t)zc->zc_share.z_exportdata,
 		    zc->zc_share.z_sharetype == ZFS_SHARE_SMB ?
-		    B_TRUE : B_FALSE)) {
+		    B_TRUE: B_FALSE)) {
 			return (error);
 		}
 		break;
@@ -2929,9 +3308,168 @@ zfs_ioc_share(zfs_cmd_t *zc)
 	    zc->zc_share.z_sharemax);
 
 	return (error);
-#else
+#else	/* sun */
 	return (ENOSYS);
-#endif
+#endif	/* sun */
+}
+
+ace_t full_access[] = {
+	{(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0}
+};
+
+#ifdef sun
+/*
+ * Remove all ACL files in shares dir
+ */
+static int
+zfs_smb_acl_purge(znode_t *dzp)
+{
+	zap_cursor_t	zc;
+	zap_attribute_t	zap;
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	int error;
+
+	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+	    (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+	    zap_cursor_advance(&zc)) {
+		if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred,
+		    NULL, 0)) != 0)
+			break;
+	}
+	zap_cursor_fini(&zc);
+	return (error);
+}
+#endif	/* sun */
+
+static int
+zfs_ioc_smb_acl(zfs_cmd_t *zc)
+{
+#ifdef sun
+	vnode_t *vp;
+	znode_t *dzp;
+	vnode_t *resourcevp = NULL;
+	znode_t *sharedir;
+	zfsvfs_t *zfsvfs;
+	nvlist_t *nvlist;
+	char *src, *target;
+	vattr_t vattr;
+	vsecattr_t vsec;
+	int error = 0;
+
+	if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
+	    NO_FOLLOW, NULL, &vp)) != 0)
+		return (error);
+
+	/* Now make sure mntpnt and dataset are ZFS */
+
+	if (vp->v_vfsp->vfs_fstype != zfsfstype ||
+	    (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
+	    zc->zc_name) != 0)) {
+		VN_RELE(vp);
+		return (EINVAL);
+	}
+
+	dzp = VTOZ(vp);
+	zfsvfs = dzp->z_zfsvfs;
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * Create share dir if its missing.
+	 */
+	mutex_enter(&zfsvfs->z_lock);
+	if (zfsvfs->z_shares_dir == 0) {
+		dmu_tx_t *tx;
+
+		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE,
+		    ZFS_SHARES_DIR);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+		} else {
+			error = zfs_create_share_dir(zfsvfs, tx);
+			dmu_tx_commit(tx);
+		}
+		if (error) {
+			mutex_exit(&zfsvfs->z_lock);
+			VN_RELE(vp);
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+	mutex_exit(&zfsvfs->z_lock);
+
+	ASSERT(zfsvfs->z_shares_dir);
+	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) {
+		VN_RELE(vp);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	switch (zc->zc_cookie) {
+	case ZFS_SMB_ACL_ADD:
+		vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
+		vattr.va_type = VREG;
+		vattr.va_mode = S_IFREG|0777;
+		vattr.va_uid = 0;
+		vattr.va_gid = 0;
+
+		vsec.vsa_mask = VSA_ACE;
+		vsec.vsa_aclentp = &full_access;
+		vsec.vsa_aclentsz = sizeof (full_access);
+		vsec.vsa_aclcnt = 1;
+
+		error = VOP_CREATE(ZTOV(sharedir), zc->zc_string,
+		    &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec);
+		if (resourcevp)
+			VN_RELE(resourcevp);
+		break;
+
+	case ZFS_SMB_ACL_REMOVE:
+		error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred,
+		    NULL, 0);
+		break;
+
+	case ZFS_SMB_ACL_RENAME:
+		if ((error = get_nvlist(zc->zc_nvlist_src,
+		    zc->zc_nvlist_src_size, &nvlist)) != 0) {
+			VN_RELE(vp);
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+		if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) ||
+		    nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET,
+		    &target)) {
+			VN_RELE(vp);
+			VN_RELE(ZTOV(sharedir));
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+		error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target,
+		    kcred, NULL, 0);
+		nvlist_free(nvlist);
+		break;
+
+	case ZFS_SMB_ACL_PURGE:
+		error = zfs_smb_acl_purge(sharedir);
+		break;
+
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	VN_RELE(vp);
+	VN_RELE(ZTOV(sharedir));
+
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+#else	/* !sun */
+	return (EOPNOTSUPP);
+#endif	/* !sun */
 }
 
 /*
@@ -2956,60 +3494,125 @@ zfs_ioc_unjail(zfs_cmd_t *zc)
 }
 
 static zfs_ioc_vec_t zfs_ioc_vec[] = {
-	{ zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE },
-	{ zfs_ioc_pool_destroy,	zfs_secpolicy_config, POOL_NAME, B_FALSE },
-	{ zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE },
-	{ zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE },
-	{ zfs_ioc_pool_configs,	zfs_secpolicy_none, NO_NAME, B_FALSE },
-	{ zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE },
-	{ zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE },
-	{ zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE },
-	{ zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE },
-	{ zfs_ioc_pool_upgrade,	zfs_secpolicy_config, POOL_NAME, B_TRUE },
-	{ zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE },
-	{ zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE },
-	{ zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE },
-	{ zfs_ioc_vdev_set_state, zfs_secpolicy_config,	POOL_NAME, B_TRUE },
-	{ zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE },
-	{ zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE },
-	{ zfs_ioc_vdev_setpath,	zfs_secpolicy_config, POOL_NAME, B_FALSE },
-	{ zfs_ioc_objset_stats,	zfs_secpolicy_read, DATASET_NAME, B_FALSE },
-	{ zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE },
-	{ zfs_ioc_dataset_list_next, zfs_secpolicy_read,
-	    DATASET_NAME, B_FALSE },
-	{ zfs_ioc_snapshot_list_next, zfs_secpolicy_read,
-	    DATASET_NAME, B_FALSE },
-	{ zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE },
-	{ zfs_ioc_create_minor,	zfs_secpolicy_minor, DATASET_NAME, B_FALSE },
-	{ zfs_ioc_remove_minor,	zfs_secpolicy_minor, DATASET_NAME, B_FALSE },
-	{ zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE },
-	{ zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE },
-	{ zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE },
-	{ zfs_ioc_rename, zfs_secpolicy_rename,	DATASET_NAME, B_TRUE },
-	{ zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE },
-	{ zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE },
-	{ zfs_ioc_inject_fault,	zfs_secpolicy_inject, NO_NAME, B_FALSE },
-	{ zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE },
-	{ zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE },
-	{ zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE },
-	{ zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE },
-	{ zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE },
-	{ zfs_ioc_destroy_snaps, zfs_secpolicy_destroy,	DATASET_NAME, B_TRUE },
-	{ zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE },
-	{ zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE },
-	{ zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE },
-	{ zfs_ioc_pool_set_props, zfs_secpolicy_config,	POOL_NAME, B_TRUE },
-	{ zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE },
-	{ zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE },
-	{ zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE },
-	{ zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi,
-	    DATASET_NAME, B_FALSE },
-	{ zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE },
-	{ zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE },
-	{ zfs_ioc_jail, zfs_secpolicy_config, DATASET_NAME, B_TRUE },
-	{ zfs_ioc_unjail, zfs_secpolicy_config, DATASET_NAME, B_TRUE }
+	{ zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_pool_destroy,	zfs_secpolicy_config, POOL_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+	    B_FALSE },
+	{ zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_pool_configs,	zfs_secpolicy_none, NO_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+	    B_TRUE },
+	{ zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_pool_upgrade,	zfs_secpolicy_config, POOL_NAME, B_TRUE,
+	    B_TRUE },
+	{ zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+	    B_TRUE },
+	{ zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+	    B_TRUE },
+	{ zfs_ioc_vdev_set_state, zfs_secpolicy_config,	POOL_NAME, B_TRUE,
+	    B_FALSE },
+	{ zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+	    B_TRUE },
+	{ zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE,
+	    B_TRUE },
+	{ zfs_ioc_vdev_setpath,	zfs_secpolicy_config, POOL_NAME, B_FALSE,
+	    B_TRUE },
+	{ zfs_ioc_objset_stats,	zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+	    B_TRUE },
+	{ zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+	    B_TRUE },
+	{ zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+	    B_TRUE },
+	{ zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE },
+	{ zfs_ioc_create_minor,	zfs_secpolicy_minor, DATASET_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_remove_minor,	zfs_secpolicy_minor, DATASET_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE },
+	{ zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE,
+	    B_TRUE},
+	{ zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE,
+	    B_TRUE },
+	{ zfs_ioc_rename, zfs_secpolicy_rename,	DATASET_NAME, B_TRUE, B_TRUE },
+	{ zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, B_TRUE },
+	{ zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE, B_FALSE },
+	{ zfs_ioc_inject_fault,	zfs_secpolicy_inject, NO_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_FALSE },
+	{ zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE,
+	    B_TRUE },
+	{ zfs_ioc_destroy_snaps, zfs_secpolicy_destroy,	DATASET_NAME, B_TRUE,
+	    B_TRUE },
+	{ zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE,
+	    B_TRUE },
+	{ zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_pool_set_props, zfs_secpolicy_config,	POOL_NAME, B_TRUE,
+	    B_TRUE },
+	{ zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE,
+	    B_TRUE },
+	{ zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, DATASET_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, B_FALSE },
+	{ zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE,
+	    B_TRUE },
+	{ zfs_ioc_jail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE },
+	{ zfs_ioc_unjail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE },
+	{ zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE,
+	    B_FALSE },
+	{ zfs_ioc_userspace_one, zfs_secpolicy_userspace_one,
+	    DATASET_NAME, B_FALSE, B_FALSE },
+	{ zfs_ioc_userspace_many, zfs_secpolicy_userspace_many,
+	    DATASET_NAME, B_FALSE, B_FALSE },
+	{ zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
+	    DATASET_NAME, B_FALSE, B_TRUE },
+	{ zfs_ioc_vdev_setfru,	zfs_secpolicy_config, POOL_NAME, B_FALSE,
+	    B_TRUE }
 };
 
+int
+pool_status_check(const char *name, zfs_ioc_namecheck_t type)
+{
+	spa_t *spa;
+	char pool[ZFS_MAXNAMELEN];
+	int error;
+
+	ASSERT(type == POOL_NAME || type == DATASET_NAME);
+
+	error = spa_open(name, &spa, FTAG);
+	if (error == 0) {
+		if (spa_suspended(spa))
+			error = EAGAIN;
+		spa_close(spa, FTAG);
+	}
+	return (error);
+}
+
 static int
 zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
     struct thread *td)
@@ -3035,11 +3638,17 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
 		case POOL_NAME:
 			if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
 				error = EINVAL;
+			if (zfs_ioc_vec[vec].zvec_pool_check)
+				error = pool_status_check(zc->zc_name,
+				    zfs_ioc_vec[vec].zvec_namecheck);
 			break;
 
 		case DATASET_NAME:
 			if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
 				error = EINVAL;
+			if (zfs_ioc_vec[vec].zvec_pool_check)
+				error = pool_status_check(zc->zc_name,
+				    zfs_ioc_vec[vec].zvec_namecheck);
 			break;
 
 		case NO_NAME:
@@ -3051,7 +3660,7 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
 		error = zfs_ioc_vec[vec].zvec_func(zc);
 
 	if (error == 0) {
-		if (zfs_ioc_vec[vec].zvec_his_log == B_TRUE)
+		if (zfs_ioc_vec[vec].zvec_his_log)
 			zfs_log_history(zc);
 	}
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
index 180196b..3105088 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
@@ -42,13 +42,33 @@
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/zfs_fuid.h>
+#include <sys/dsl_dataset.h>
+
+#define	ZFS_HANDLE_REPLAY(zilog, tx) \
+	if (zilog->zl_replay) { \
+		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); \
+		zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = \
+		    zilog->zl_replaying_seq; \
+		return; \
+	}
 
 /*
- * All the functions in this file are used to construct the log entries
- * to record transactions. They allocate * an intent log transaction
- * structure (itx_t) and save within it all the information necessary to
- * possibly replay the transaction. The itx is then assigned a sequence
- * number and inserted in the in-memory list anchored in the zilog.
+ * These zfs_log_* functions must be called within a dmu tx, in one
+ * of 2 contexts depending on zilog->z_replay:
+ *
+ * Non replay mode
+ * ---------------
+ * We need to record the transaction so that if it is committed to
+ * the Intent Log then it can be replayed.  An intent log transaction
+ * structure (itx_t) is allocated and all the information necessary to
+ * possibly replay the transaction is saved in it. The itx is then assigned
+ * a sequence number and inserted in the in-memory list anchored in the zilog.
+ *
+ * Replay mode
+ * -----------
+ * We need to mark the intent log record as replayed in the log header.
+ * This is done in the same transaction as the replay so that they
+ * commit atomically.
  */
 
 int
@@ -236,6 +256,8 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 	if (zilog == NULL)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	/*
 	 * If we have FUIDs present then add in space for
 	 * domains and ACE fuid's if any.
@@ -339,6 +361,8 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 	if (zilog == NULL)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
 	lr = (lr_remove_t *)&itx->itx_lr;
 	lr->lr_doid = dzp->z_id;
@@ -363,6 +387,8 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 	if (zilog == NULL)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
 	lr = (lr_link_t *)&itx->itx_lr;
 	lr->lr_doid = dzp->z_id;
@@ -390,6 +416,8 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 	if (zilog == NULL)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
 	lr = (lr_create_t *)&itx->itx_lr;
 	lr->lr_doid = dzp->z_id;
@@ -424,6 +452,8 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 	if (zilog == NULL)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
 	lr = (lr_rename_t *)&itx->itx_lr;
 	lr->lr_sdoid = sdzp->z_id;
@@ -456,6 +486,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 	if (zilog == NULL || zp->z_unlinked)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	/*
 	 * Writes are handled in three different ways:
 	 *
@@ -508,7 +540,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 		    (write_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
 		if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
-		    zp->z_id, off, len, lr + 1) != 0) {
+		    zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
 			kmem_free(itx, offsetof(itx_t, itx_lr) +
 			    itx->itx_lr.lrc_reclen);
 			itx = zil_itx_create(txtype, sizeof (*lr));
@@ -554,6 +586,8 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 	if (zilog == NULL || zp->z_unlinked)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	itx = zil_itx_create(txtype, sizeof (*lr));
 	lr = (lr_truncate_t *)&itx->itx_lr;
 	lr->lr_foid = zp->z_id;
@@ -583,6 +617,8 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 	if (zilog == NULL || zp->z_unlinked)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	/*
 	 * If XVATTR set, then log record size needs to allow
 	 * for lr_attr_t + xvattr mask, mapsize and create time
@@ -649,6 +685,8 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
 	if (zilog == NULL || zp->z_unlinked)
 		return;
 
+	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
+
 	txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ?
 	    TX_ACL_V0 : TX_ACL;
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
index 658e539..c965247 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
@@ -278,9 +278,9 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs,
 	uint64_t txtype;
 	int error;
 
+	txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
 	if (byteswap) {
 		byteswap_uint64_array(lracl, sizeof (*lracl));
-		txtype = (int)lr->lr_common.lrc_txtype;
 		if (txtype == TX_CREATE_ACL_ATTR ||
 		    txtype == TX_MKDIR_ACL_ATTR) {
 			lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
@@ -321,7 +321,7 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs,
 
 	if (lr->lr_common.lrc_txtype & TX_CI)
 		vflg |= FIGNORECASE;
-	switch ((int)lr->lr_common.lrc_txtype) {
+	switch (txtype) {
 	case TX_CREATE_ACL:
 		aclstart = (caddr_t)(lracl + 1);
 		fuidstart = (caddr_t)aclstart +
@@ -402,7 +402,8 @@ bail:
 
 	VN_RELE(ZTOV(dzp));
 
-	zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+	if (zfsvfs->z_fuid_replay)
+		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
 	zfsvfs->z_fuid_replay = NULL;
 
 	return (error);
@@ -425,9 +426,9 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
 	struct componentname cn;
 	int error;
 
+	txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
 	if (byteswap) {
 		byteswap_uint64_array(lr, sizeof (*lr));
-		txtype = (int)lr->lr_common.lrc_txtype;
 		if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR)
 			zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
 	}
@@ -477,7 +478,7 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
 	cn.cn_flags = SAVENAME;
 
 	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
-	switch ((int)lr->lr_common.lrc_txtype) {
+	switch (txtype) {
 	case TX_CREATE_ATTR:
 		lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
 		xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
@@ -709,6 +710,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
 	znode_t	*zp;
 	int error;
 	ssize_t resid;
+	uint64_t orig_eof, eod;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
@@ -723,10 +725,65 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
 			error = 0;
 		return (error);
 	}
+	orig_eof = zp->z_phys->zp_size;
+	eod = lr->lr_offset + lr->lr_length; /* end of data for this write */
+
+	/* If it's a dmu_sync() block get the data and write the whole block */
+	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t))
+		zil_get_replay_data(zfsvfs->z_log, lr);
 
 	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
 	    lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
 
+	/*
+	 * This may be a write from a dmu_sync() for a whole block,
+	 * and may extend beyond the current end of the file.
+	 * We can't just replay what was written for this TX_WRITE as
+	 * a future TX_WRITE2 may extend the eof and the data for that
+	 * write needs to be there. So we write the whole block and
+	 * reduce the eof.
+	 */
+	if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */
+		zp->z_phys->zp_size = eod;
+
+	VN_RELE(ZTOV(zp));
+
+	return (error);
+}
+
+/*
+ * TX_WRITE2 are only generated when dmu_sync() returns EALREADY
+ * meaning the pool block is already being synced. So now that we always write
+ * out full blocks, all we have to do is expand the eof if
+ * the file is grown.
+ */
+static int
+zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
+{
+	znode_t	*zp;
+	int error;
+	uint64_t end;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+		/*
+		 * As we can log writes out of order, it's possible the
+		 * file has been removed. In this case just drop the write
+		 * and return success.
+		 */
+		if (error == ENOENT)
+			error = 0;
+		return (error);
+	}
+
+	end = lr->lr_offset + lr->lr_length;
+	if (end > zp->z_phys->zp_size) {
+		ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz);
+		zp->z_phys->zp_size = end;
+	}
+
 	VN_RELE(ZTOV(zp));
 
 	return (error);
@@ -944,4 +1001,5 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
 	zfs_replay_create_acl,	/* TX_MKDIR_ACL */
 	zfs_replay_create,	/* TX_MKDIR_ATTR */
 	zfs_replay_create_acl,	/* TX_MKDIR_ACL_ATTR */
+	zfs_replay_write2,	/* TX_WRITE2 */
 };
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
index f0a75b5..4de8d8a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * This file contains the code to implement file range locking in
  * ZFS, although there isn't much specific to ZFS (all that comes to mind
@@ -431,6 +429,8 @@ zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
 	new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
 	new->r_zp = zp;
 	new->r_off = off;
+	if (len + off < off)	/* overflow */
+		len = UINT64_MAX - off;
 	new->r_len = len;
 	new->r_cnt = 1; /* assume it's going to be in the tree */
 	new->r_type = type;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
index ce2c1e3..beb6401 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -55,7 +55,6 @@
 #include <sys/dnlc.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa_boot.h>
-#include <sys/vdev_impl.h>	/* VDEV_BOOT_VERSION */
 
 struct mtx zfs_debug_mtx;
 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
@@ -84,9 +83,6 @@ SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_stream, CTLFLAG_RD,
 static int zfs_version_spa = SPA_VERSION;
 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
     "SPA_VERSION");
-static int zfs_version_vdev_boot = VDEV_BOOT_VERSION;
-SYSCTL_INT(_vfs_zfs_version, OID_AUTO, vdev_boot, CTLFLAG_RD,
-    &zfs_version_vdev_boot, 0, "VDEV_BOOT_VERSION");
 static int zfs_version_zpl = ZPL_VERSION;
 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
     "ZPL_VERSION");
@@ -140,6 +136,7 @@ zfs_sync(vfs_t *vfsp, int waitfor)
 		 * Sync a specific filesystem.
 		 */
 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
+		dsl_pool_t *dp;
 		int error;
 
 		error = vfs_stdsync(vfsp, waitfor);
@@ -147,10 +144,21 @@ zfs_sync(vfs_t *vfsp, int waitfor)
 			return (error);
 
 		ZFS_ENTER(zfsvfs);
+		dp = dmu_objset_pool(zfsvfs->z_os);
+
+		/*
+		 * If the system is shutting down, then skip any
+		 * filesystems which may exist on a suspended pool.
+		 */
+		if (sys_shutdown && spa_suspended(dp->dp_spa)) {
+			ZFS_EXIT(zfsvfs);
+			return (0);
+		}
+
 		if (zfsvfs->z_log != NULL)
 			zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
 		else
-			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+			txg_wait_synced(dp, 0);
 		ZFS_EXIT(zfsvfs);
 	} else {
 		/*
@@ -483,6 +491,392 @@ unregister:
 
 }
 
+static void
+uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid,
+    int64_t delta, dmu_tx_t *tx)
+{
+	uint64_t used = 0;
+	char buf[32];
+	int err;
+	uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
+
+	if (delta == 0)
+		return;
+
+	(void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid);
+	err = zap_lookup(os, obj, buf, 8, 1, &used);
+	ASSERT(err == 0 || err == ENOENT);
+	/* no underflow/overflow */
+	ASSERT(delta > 0 || used >= -delta);
+	ASSERT(delta < 0 || used + delta > used);
+	used += delta;
+	if (used == 0)
+		err = zap_remove(os, obj, buf, tx);
+	else
+		err = zap_update(os, obj, buf, 8, 1, &used, tx);
+	ASSERT(err == 0);
+}
+
+static void
+zfs_space_delta_cb(objset_t *os, dmu_object_type_t bonustype,
+    void *oldbonus, void *newbonus,
+    uint64_t oldused, uint64_t newused, dmu_tx_t *tx)
+{
+	znode_phys_t *oldznp = oldbonus;
+	znode_phys_t *newznp = newbonus;
+
+	if (bonustype != DMU_OT_ZNODE)
+		return;
+
+	/* We charge 512 for the dnode (if it's allocated). */
+	if (oldznp->zp_gen != 0)
+		oldused += DNODE_SIZE;
+	if (newznp->zp_gen != 0)
+		newused += DNODE_SIZE;
+
+	if (oldznp->zp_uid == newznp->zp_uid) {
+		uidacct(os, B_FALSE, oldznp->zp_uid, newused-oldused, tx);
+	} else {
+		uidacct(os, B_FALSE, oldznp->zp_uid, -oldused, tx);
+		uidacct(os, B_FALSE, newznp->zp_uid, newused, tx);
+	}
+
+	if (oldznp->zp_gid == newznp->zp_gid) {
+		uidacct(os, B_TRUE, oldznp->zp_gid, newused-oldused, tx);
+	} else {
+		uidacct(os, B_TRUE, oldznp->zp_gid, -oldused, tx);
+		uidacct(os, B_TRUE, newznp->zp_gid, newused, tx);
+	}
+}
+
+static void
+fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
+    char *domainbuf, int buflen, uid_t *ridp)
+{
+	uint64_t fuid;
+	const char *domain;
+
+	fuid = strtonum(fuidstr, NULL);
+
+	domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
+	if (domain)
+		(void) strlcpy(domainbuf, domain, buflen);
+	else
+		domainbuf[0] = '\0';
+	*ridp = FUID_RID(fuid);
+}
+
+static uint64_t
+zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
+{
+	switch (type) {
+	case ZFS_PROP_USERUSED:
+		return (DMU_USERUSED_OBJECT);
+	case ZFS_PROP_GROUPUSED:
+		return (DMU_GROUPUSED_OBJECT);
+	case ZFS_PROP_USERQUOTA:
+		return (zfsvfs->z_userquota_obj);
+	case ZFS_PROP_GROUPQUOTA:
+		return (zfsvfs->z_groupquota_obj);
+	}
+	return (0);
+}
+
+int
+zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+    uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
+{
+	int error;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	zfs_useracct_t *buf = vbuf;
+	uint64_t obj;
+
+	if (!dmu_objset_userspace_present(zfsvfs->z_os))
+		return (ENOTSUP);
+
+	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
+	if (obj == 0) {
+		*bufsizep = 0;
+		return (0);
+	}
+
+	for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
+	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
+	    zap_cursor_advance(&zc)) {
+		if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
+		    *bufsizep)
+			break;
+
+		fuidstr_to_sid(zfsvfs, za.za_name,
+		    buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
+
+		buf->zu_space = za.za_first_integer;
+		buf++;
+	}
+	if (error == ENOENT)
+		error = 0;
+
+	ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
+	*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
+	*cookiep = zap_cursor_serialize(&zc);
+	zap_cursor_fini(&zc);
+	return (error);
+}
+
+/*
+ * buf must be big enough (eg, 32 bytes)
+ */
+static int
+id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
+    char *buf, boolean_t addok)
+{
+	uint64_t fuid;
+	int domainid = 0;
+
+	if (domain && domain[0]) {
+		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
+		if (domainid == -1)
+			return (ENOENT);
+	}
+	fuid = FUID_ENCODE(domainid, rid);
+	(void) sprintf(buf, "%llx", (longlong_t)fuid);
+	return (0);
+}
+
+int
+zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+    const char *domain, uint64_t rid, uint64_t *valp)
+{
+	char buf[32];
+	int err;
+	uint64_t obj;
+
+	*valp = 0;
+
+	if (!dmu_objset_userspace_present(zfsvfs->z_os))
+		return (ENOTSUP);
+
+	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
+	if (obj == 0)
+		return (0);
+
+	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
+	if (err)
+		return (err);
+
+	err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
+	if (err == ENOENT)
+		err = 0;
+	return (err);
+}
+
+int
+zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+    const char *domain, uint64_t rid, uint64_t quota)
+{
+	char buf[32];
+	int err;
+	dmu_tx_t *tx;
+	uint64_t *objp;
+	boolean_t fuid_dirtied;
+
+	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
+		return (EINVAL);
+
+	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
+		return (ENOTSUP);
+
+	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
+	    &zfsvfs->z_groupquota_obj;
+
+	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
+	if (err)
+		return (err);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
+	if (*objp == 0) {
+		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+		    zfs_userquota_prop_prefixes[type]);
+	}
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+
+	mutex_enter(&zfsvfs->z_lock);
+	if (*objp == 0) {
+		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
+		    DMU_OT_NONE, 0, tx);
+		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
+	}
+	mutex_exit(&zfsvfs->z_lock);
+
+	if (quota == 0) {
+		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
+		if (err == ENOENT)
+			err = 0;
+	} else {
+		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
+	}
+	ASSERT(err == 0);
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+	dmu_tx_commit(tx);
+	return (err);
+}
+
+boolean_t
+zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
+{
+	char buf[32];
+	uint64_t used, quota, usedobj, quotaobj;
+	int err;
+
+	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
+	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+	if (quotaobj == 0 || zfsvfs->z_replay)
+		return (B_FALSE);
+
+	(void) sprintf(buf, "%llx", (longlong_t)fuid);
+	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
+	if (err != 0)
+		return (B_FALSE);
+
+	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
+	if (err != 0)
+		return (B_FALSE);
+	return (used >= quota);
+}
+
+int
+zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
+{
+	objset_t *os;
+	zfsvfs_t *zfsvfs;
+	uint64_t zval;
+	int i, error;
+
+	if (error = dsl_prop_get_integer(osname, "readonly", &zval, NULL))
+		return (error);
+	if (zval)
+		mode |= DS_MODE_READONLY;
+
+	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
+	if (error == EROFS) {
+		mode |= DS_MODE_READONLY;
+		error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
+	}
+	if (error)
+		return (error);
+
+	/*
+	 * Initialize the zfs-specific filesystem structure.
+	 * Should probably make this a kmem cache, shuffle fields,
+	 * and just bzero up to z_hold_mtx[].
+	 */
+	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+	zfsvfs->z_vfs = NULL;
+	zfsvfs->z_parent = zfsvfs;
+	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
+	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
+	zfsvfs->z_os = os;
+
+	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
+	if (error) {
+		goto out;
+	} else if (zfsvfs->z_version > ZPL_VERSION) {
+		(void) printf("Mismatched versions:  File system "
+		    "is version %llu on-disk format, which is "
+		    "incompatible with this software version %lld!",
+		    (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
+		error = ENOTSUP;
+		goto out;
+	}
+
+	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
+		goto out;
+	zfsvfs->z_norm = (int)zval;
+
+	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
+		goto out;
+	zfsvfs->z_utf8 = (zval != 0);
+
+	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
+		goto out;
+	zfsvfs->z_case = (uint_t)zval;
+
+	/*
+	 * Fold case on file systems that are always or sometimes case
+	 * insensitive.
+	 */
+	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+	    zfsvfs->z_case == ZFS_CASE_MIXED)
+		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
+	    &zfsvfs->z_root);
+	if (error)
+		goto out;
+	ASSERT(zfsvfs->z_root != 0);
+
+	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+	    &zfsvfs->z_unlinkedobj);
+	if (error)
+		goto out;
+
+	error = zap_lookup(os, MASTER_NODE_OBJ,
+	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
+	    8, 1, &zfsvfs->z_userquota_obj);
+	if (error && error != ENOENT)
+		goto out;
+
+	error = zap_lookup(os, MASTER_NODE_OBJ,
+	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
+	    8, 1, &zfsvfs->z_groupquota_obj);
+	if (error && error != ENOENT)
+		goto out;
+
+	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
+	    &zfsvfs->z_fuid_obj);
+	if (error && error != ENOENT)
+		goto out;
+
+	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
+	    &zfsvfs->z_shares_dir);
+	if (error && error != ENOENT)
+		goto out;
+
+	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+	    offsetof(znode_t, z_link_node));
+	rrw_init(&zfsvfs->z_teardown_lock);
+	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
+	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
+	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+	*zvp = zfsvfs;
+	return (0);
+
+out:
+	dmu_objset_close(os);
+	*zvp = NULL;
+	kmem_free(zfsvfs, sizeof (zfsvfs_t));
+	return (error);
+}
+
 static int
 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 {
@@ -551,8 +945,9 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 			 * allocated and in the unlinked set, and there is an
 			 * intent log record saying to allocate it.
 			 */
-			zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
-			    zfs_replay_vector, zfs_unlinked_drain);
+			zfsvfs->z_replay = B_TRUE;
+			zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
+			zfsvfs->z_replay = B_FALSE;
 		}
 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
 	}
@@ -560,49 +955,52 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 	return (0);
 }
 
-static void
-zfs_freezfsvfs(zfsvfs_t *zfsvfs)
+void
+zfsvfs_free(zfsvfs_t *zfsvfs)
 {
+	int i;
+
+	zfs_fuid_destroy(zfsvfs);
+
 	mutex_destroy(&zfsvfs->z_znodes_lock);
 	mutex_destroy(&zfsvfs->z_online_recv_lock);
+	mutex_destroy(&zfsvfs->z_lock);
 	list_destroy(&zfsvfs->z_all_znodes);
 	rrw_destroy(&zfsvfs->z_teardown_lock);
 	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
 	rw_destroy(&zfsvfs->z_fuid_lock);
+	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 }
 
+static void
+zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
+{
+	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+	if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) {
+		vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+		vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+	}
+}
+
 static int
 zfs_domount(vfs_t *vfsp, char *osname)
 {
-	uint64_t recordsize, readonly;
+	uint64_t recordsize, fsid_guid;
 	int error = 0;
-	int mode;
 	zfsvfs_t *zfsvfs;
-	znode_t *zp = NULL;
+	vnode_t *vp;
 
 	ASSERT(vfsp);
 	ASSERT(osname);
 
-	/*
-	 * Initialize the zfs-specific filesystem structure.
-	 * Should probably make this a kmem cache, shuffle fields,
-	 * and just bzero up to z_hold_mtx[].
-	 */
-	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+	error = zfsvfs_create(osname, DS_MODE_OWNER, &zfsvfs);
+	if (error)
+		return (error);
 	zfsvfs->z_vfs = vfsp;
-	zfsvfs->z_parent = zfsvfs;
-	zfsvfs->z_assign = TXG_NOWAIT;
-	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
-	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
-
-	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
-	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
-	    offsetof(znode_t, z_link_node));
-	rrw_init(&zfsvfs->z_teardown_lock);
-	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
-	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 
 	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
 	    NULL))
@@ -616,36 +1014,25 @@ zfs_domount(vfs_t *vfsp, char *osname)
 	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
 	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
 
-	if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
-		goto out;
-
-	mode = DS_MODE_OWNER;
-	if (readonly)
-		mode |= DS_MODE_READONLY;
 
-	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
-	if (error == EROFS) {
-		mode = DS_MODE_OWNER | DS_MODE_READONLY;
-		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
-		    &zfsvfs->z_os);
-	}
-
-	if (error)
-		goto out;
-
-	if (error = zfs_init_fs(zfsvfs, &zp))
-		goto out;
+	/*
+	 * The fsid is 64 bits, composed of an 8-bit fs type, which
+	 * separates our fsid from any other filesystem types, and a
+	 * 56-bit objset unique ID.  The objset unique ID is unique to
+	 * all objsets open on this system, provided by unique_create().
+	 * The 8-bit fs type must be put in the low bits of fsid[1]
+	 * because that's where other Solaris filesystems put it.
+	 */
+	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
+	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
+	vfsp->vfs_fsid.val[0] = fsid_guid;
+	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
+	    vfsp->mnt_vfc->vfc_typenum & 0xFF;
 
 	/*
 	 * Set features for file system.
 	 */
-	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
-	if (zfsvfs->z_use_fuids) {
-		vfs_set_feature(vfsp, VFSFT_XVATTR);
-		vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
-		vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
-		vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
-	}
+	zfs_set_fuid_feature(zfsvfs);
 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
@@ -658,26 +1045,31 @@ zfs_domount(vfs_t *vfsp, char *osname)
 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
 		uint64_t pval;
 
-		ASSERT(mode & DS_MODE_READONLY);
 		atime_changed_cb(zfsvfs, B_FALSE);
 		readonly_changed_cb(zfsvfs, B_TRUE);
 		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
 			goto out;
 		xattr_changed_cb(zfsvfs, pval);
 		zfsvfs->z_issnap = B_TRUE;
+
+		mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
+		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+		mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
 	} else {
 		error = zfsvfs_setup(zfsvfs, B_TRUE);
 	}
 
 	vfs_mountedfrom(vfsp, osname);
+	/* Grab extra reference. */
+	VERIFY(VFS_ROOT(vfsp, LK_EXCLUSIVE, &vp) == 0);
+	VOP_UNLOCK(vp, 0);
 
 	if (!zfsvfs->z_issnap)
 		zfsctl_create(zfsvfs);
 out:
 	if (error) {
-		if (zfsvfs->z_os)
-			dmu_objset_close(zfsvfs->z_os);
-		zfs_freezfsvfs(zfsvfs);
+		dmu_objset_close(zfsvfs->z_os);
+		zfsvfs_free(zfsvfs);
 	} else {
 		atomic_add_32(&zfs_active_fs_count, 1);
 	}
@@ -779,24 +1171,12 @@ zfs_mount(vfs_t *vfsp)
 				goto out;
 			}
 
-#if 0 /* CHECK THIS! Is probably needed for zfs_suser. */
 			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
 			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
-				error = EPERM;
-				goto out;
-			}
-#else
-			if (error = secpolicy_vnode_owner(mvp, cr, vattr.va_uid)) {
-				VOP_UNLOCK(mvp, 0);
-				goto out;
-			}
-
-			if (error = VOP_ACCESS(mvp, VWRITE, cr, td)) {
 				VOP_UNLOCK(mvp, 0);
 				goto out;
 			}
 			VOP_UNLOCK(mvp, 0);
-#endif
 		}
 
 		secpolicy_fs_mount_clearopts(cr, vfsp);
@@ -826,6 +1206,21 @@ zfs_mount(vfs_t *vfsp)
 	DROP_GIANT();
 	error = zfs_domount(vfsp, osname);
 	PICKUP_GIANT();
+
+	/*
+	 * Add an extra VFS_HOLD on our parent vfs so that it can't
+	 * disappear due to a forced unmount.
+	 */
+	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
+		VFS_HOLD(mvp->v_vfsp);
+
+	/*
+	 * Add an extra VFS_HOLD on our parent vfs so that it can't
+	 * disappear due to a forced unmount.
+	 */
+	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
+		VFS_HOLD(mvp->v_vfsp);
+
 out:
 	return (error);
 }
@@ -1025,9 +1420,10 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
 static int
 zfs_umount(vfs_t *vfsp, int fflag)
 {
+	kthread_t *td = curthread;
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	objset_t *os;
-	cred_t *cr = curthread->td_ucred;
+	cred_t *cr = td->td_ucred;
 	int ret;
 
 	ret = secpolicy_fs_unmount(cr, vfsp);
@@ -1052,7 +1448,7 @@ zfs_umount(vfs_t *vfsp, int fflag)
 	if (zfsvfs->z_ctldir != NULL) {
 		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
 			return (ret);
-		ret = vflush(vfsp, 0, 0, curthread);
+		ret = vflush(vfsp, 0, 0, td);
 		ASSERT(ret == EBUSY);
 		if (!(fflag & MS_FORCE)) {
 			if (zfsvfs->z_ctldir->v_count > 1)
@@ -1077,7 +1473,7 @@ zfs_umount(vfs_t *vfsp, int fflag)
 	/*
 	 * Flush all the files.
 	 */
-	ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, curthread);
+	ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
 	if (ret != 0) {
 		if (!zfsvfs->z_issnap) {
 			zfsctl_create(zfsvfs);
@@ -1304,15 +1700,16 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
  * 'z_teardown_inactive_lock' write held.
  */
 int
-zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode)
+zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep)
 {
 	int error;
 
 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
 		return (error);
 
-	*mode = zfsvfs->z_os->os_mode;
-	dmu_objset_name(zfsvfs->z_os, name);
+	*modep = zfsvfs->z_os->os_mode;
+	if (name)
+		dmu_objset_name(zfsvfs->z_os, name);
 	dmu_objset_close(zfsvfs->z_os);
 
 	return (0);
@@ -1371,13 +1768,15 @@ static void
 zfs_freevfs(vfs_t *vfsp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
-	int i;
 
-	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
-		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+	/*
+	 * If this is a snapshot, we have an extra VFS_HOLD on our parent
+	 * from zfs_mount().  Release it here.
+	 */
+	if (zfsvfs->z_issnap)
+		VFS_RELE(zfsvfs->z_parent->z_vfs);
 
-	zfs_fuid_destroy(zfsvfs);
-	zfs_freezfsvfs(zfsvfs);
+	zfsvfs_free(zfsvfs);
 
 	atomic_add_32(&zfs_active_fs_count, -1);
 }
@@ -1438,6 +1837,8 @@ zfs_init(void)
 	 * ZFS/i386.
 	 */
 	zfs_vnodes_adjust();
+
+	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
 }
 
 void
@@ -1455,54 +1856,46 @@ zfs_busy(void)
 }
 
 int
-zfs_set_version(const char *name, uint64_t newvers)
+zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 {
 	int error;
-	objset_t *os;
+	objset_t *os = zfsvfs->z_os;
 	dmu_tx_t *tx;
-	uint64_t curvers;
-
-	/*
-	 * XXX for now, require that the filesystem be unmounted.  Would
-	 * be nice to find the zfsvfs_t and just update that if
-	 * possible.
-	 */
 
 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
 		return (EINVAL);
 
-	error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os);
-	if (error)
-		return (error);
-
-	error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
-	    8, 1, &curvers);
-	if (error)
-		goto out;
-	if (newvers < curvers) {
-		error = EINVAL;
-		goto out;
-	}
+	if (newvers < zfsvfs->z_version)
+		return (EINVAL);
 
 	tx = dmu_tx_create(os);
-	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR);
+	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
-		goto out;
+		return (error);
+	}
+	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+	    8, 1, &newvers, tx);
+
+	if (error) {
+		dmu_tx_commit(tx);
+		return (error);
 	}
-	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
-	    &newvers, tx);
 
 	spa_history_internal_log(LOG_DS_UPGRADE,
 	    dmu_objset_spa(os), tx, CRED(),
-	    "oldver=%llu newver=%llu dataset = %llu", curvers, newvers,
-	    dmu_objset_id(os));
+	    "oldver=%llu newver=%llu dataset = %llu",
+	    zfsvfs->z_version, newvers, dmu_objset_id(os));
+
 	dmu_tx_commit(tx);
 
-out:
-	dmu_objset_close(os);
-	return (error);
+	zfsvfs->z_version = newvers;
+
+	if (zfsvfs->z_version >= ZPL_VERSION_FUID)
+		zfs_set_fuid_feature(zfsvfs);
+
+	return (0);
 }
 /*
  * Read a property stored within the master node.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index adeabfb..9292880 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -56,6 +56,7 @@
 #include <sys/policy.h>
 #include <sys/sunddi.h>
 #include <sys/filio.h>
+#include <sys/sid.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/dnlc.h>
@@ -98,9 +99,7 @@
  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
  *
- *  (4)	Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
- *	In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
- *	it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
+ *  (4)	Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
  *	This is critical because we don't want to block while holding locks.
  *	Note, in particular, that if a lock is sometimes acquired before
  *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
@@ -117,6 +116,8 @@
  *  (5)	If the operation succeeded, generate the intent log entry for it
  *	before dropping locks.  This ensures that the ordering of events
  *	in the intent log matches the order in which they actually occurred.
+ *      During ZIL replay the zfs_log_* functions will update the sequence
+ *	number to indicate the zil transaction has replayed.
  *
  *  (6)	At the end of each vnode op, the DMU tx must always commit,
  *	regardless of whether there were any errors.
@@ -132,12 +133,12 @@
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
- *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
+ *	error = dmu_tx_assign(tx, TXG_NOWAIT);	// try to assign
  *	if (error) {
  *		rw_exit(...);		// drop locks
  *		zfs_dirent_unlock(dl);	// unlock directory entry
  *		VN_RELE(...);		// release held vnodes
- *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ *		if (error == ERESTART) {
  *			dmu_tx_wait(tx);
  *			dmu_tx_abort(tx);
  *			goto top;
@@ -163,23 +164,32 @@ static int
 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t	*zp = VTOZ(*vpp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
 
 	if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
 	    ((flag & FAPPEND) == 0)) {
+		ZFS_EXIT(zfsvfs);
 		return (EPERM);
 	}
 
 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 	    ZTOV(zp)->v_type == VREG &&
 	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
-	    zp->z_phys->zp_size > 0)
-		if (fs_vscan(*vpp, cr, 0) != 0)
+	    zp->z_phys->zp_size > 0) {
+		if (fs_vscan(*vpp, cr, 0) != 0) {
+			ZFS_EXIT(zfsvfs);
 			return (EACCES);
+		}
+	}
 
 	/* Keep a count of the synchronous opens in the znode */
 	if (flag & (FSYNC | FDSYNC))
 		atomic_inc_32(&zp->z_sync_cnt);
 
+	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
@@ -189,6 +199,10 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
 
 	/* Decrement the synchronous opens in the znode */
 	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
@@ -206,6 +220,7 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
 	    zp->z_phys->zp_size > 0)
 		VERIFY(fs_vscan(vp, cr, 1) == 0);
 
+	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
@@ -296,98 +311,108 @@ zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
 	return (ENOTTY);
 }
 
+static vm_page_t
+page_lookup(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
+{
+	vm_object_t obj;
+	vm_page_t pp;
+
+	obj = vp->v_object;
+	VM_OBJECT_LOCK_ASSERT(obj, MA_OWNED);
+
+	for (;;) {
+		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+		    vm_page_is_valid(pp, (vm_offset_t)off, nbytes)) {
+			if (vm_page_sleep_if_busy(pp, FALSE, "zfsmwb"))
+				continue;
+			vm_page_busy(pp);
+			vm_page_lock_queues();
+			vm_page_undirty(pp);
+			vm_page_unlock_queues();
+		} else {
+			if (__predict_false(obj->cache != NULL)) {
+				vm_page_cache_free(obj, OFF_TO_IDX(start),
+				    OFF_TO_IDX(start) + 1);
+			}
+			pp = NULL;
+		}
+		break;
+	}
+	return (pp);
+}
+
+static void
+page_unlock(vm_page_t pp)
+{
+
+	vm_page_wakeup(pp);
+}
+
+static caddr_t
+zfs_map_page(vm_page_t pp, struct sf_buf **sfp)
+{
+
+	sched_pin();
+	*sfp = sf_buf_alloc(pp, SFB_CPUPRIVATE);
+	return ((caddr_t)sf_buf_kva(*sfp));
+}
+
+static void
+zfs_unmap_page(struct sf_buf *sf)
+{
+
+	sf_buf_free(sf);
+	sched_unpin();
+}
+
+
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
  *
  * On Write:	If we find a memory mapped page, we write to *both*
  *		the page and the dmu buffer.
- *
- * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
- *	the file is memory mapped.
  */
-static int
-mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
+
+static void
+update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
+    int segflg, dmu_tx_t *tx)
 {
-	znode_t *zp = VTOZ(vp);
-	objset_t *os = zp->z_zfsvfs->z_os;
 	vm_object_t obj;
-	vm_page_t m;
 	struct sf_buf *sf;
-	int64_t start, off;
-	int len = nbytes;
-	int error = 0;
-	uint64_t dirbytes;
+	int64_t off;
 
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 
-	start = uio->uio_loffset;
 	off = start & PAGEOFFSET;
-	dirbytes = 0;
 	VM_OBJECT_LOCK(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
-		uint64_t bytes = MIN(PAGESIZE - off, len);
-		uint64_t fsize;
+		vm_page_t pp;
+		uint64_t nbytes = MIN(PAGESIZE - off, len);
 
-again:
-		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
-		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
-			uint64_t woff;
+		if ((pp = page_lookup(vp, start, off, nbytes)) != NULL) {
 			caddr_t va;
 
-			if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
-				goto again;
-			fsize = obj->un_pager.vnp.vnp_size;
-			vm_page_busy(m);
-			vm_page_lock_queues();
-			vm_page_undirty(m);
-			vm_page_unlock_queues();
 			VM_OBJECT_UNLOCK(obj);
-			if (dirbytes > 0) {
-				error = dmu_write_uio(os, zp->z_id, uio,
-				    dirbytes, tx);
-				dirbytes = 0;
-			}
-			if (error == 0) {
-				sched_pin();
-				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
-				va = (caddr_t)sf_buf_kva(sf);
-				woff = uio->uio_loffset - off;
-				error = uiomove(va + off, bytes, UIO_WRITE, uio);
-				/*
-				 * The uiomove() above could have been partially
-				 * successful, that's why we call dmu_write()
-				 * below unconditionally. The page was marked
-				 * non-dirty above and we would lose the changes
-				 * without doing so. If the uiomove() failed
-				 * entirely, well, we just write what we got
-				 * before one more time.
-				 */
-				dmu_write(os, zp->z_id, woff,
-				    MIN(PAGESIZE, fsize - woff), va, tx);
-				sf_buf_free(sf);
-				sched_unpin();
+			va = zfs_map_page(pp, &sf);
+			if (segflg == UIO_NOCOPY) {
+				(void) dmu_write(os, oid, start+off, nbytes,
+				    va+off, tx);
+			} else {
+				(void) dmu_read(os, oid, start+off, nbytes,
+				    va+off, DMU_READ_PREFETCH);;
 			}
+			zfs_unmap_page(sf);
 			VM_OBJECT_LOCK(obj);
-			vm_page_wakeup(m);
-		} else {
-			if (__predict_false(obj->cache != NULL)) {
-				vm_page_cache_free(obj, OFF_TO_IDX(start),
-				    OFF_TO_IDX(start) + 1);
-			}
-			dirbytes += bytes;
+			page_unlock(pp);
+
 		}
-		len -= bytes;
+		len -= nbytes;
 		off = 0;
-		if (error)
-			break;
 	}
 	VM_OBJECT_UNLOCK(obj);
-	if (error == 0 && dirbytes > 0)
-		error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
-	return (error);
 }
 
 /*
@@ -469,7 +494,8 @@ again:
 				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 				va = (caddr_t)sf_buf_kva(sf);
 				error = dmu_read(os, zp->z_id, start + off,
-				    bytes, (void *)(va + off));
+				    bytes, (void *)(va + off),
+				    DMU_READ_PREFETCH);
 				sf_buf_free(sf);
 				sched_unpin();
 			}
@@ -690,6 +716,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 	int		max_blksz = zfsvfs->z_max_blksz;
 	uint64_t	pflags;
 	int		error;
+	arc_buf_t	*abuf;
 
 	/*
 	 * Fasttrack empty write
@@ -786,22 +813,59 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 	 * and allows us to do more fine-grained space accounting.
 	 */
 	while (n > 0) {
+		abuf = NULL;
+		woff = uio->uio_loffset;
+
+again:
+		if (zfs_usergroup_overquota(zfsvfs,
+		    B_FALSE, zp->z_phys->zp_uid) ||
+		    zfs_usergroup_overquota(zfsvfs,
+		    B_TRUE, zp->z_phys->zp_gid)) {
+			if (abuf != NULL)
+				dmu_return_arcbuf(abuf);
+			error = EDQUOT;
+			break;
+		}
+
+		/*
+		 * If dmu_assign_arcbuf() is expected to execute with minimum
+		 * overhead loan an arc buffer and copy user data to it before
+		 * we enter a txg.  This avoids holding a txg forever while we
+		 * pagefault on a hanging NFS server mapping.
+		 */
+		if (abuf == NULL && n >= max_blksz &&
+		    woff >= zp->z_phys->zp_size &&
+		    P2PHASE(woff, max_blksz) == 0 &&
+		    zp->z_blksz == max_blksz) {
+			size_t cbytes;
+
+			abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
+			ASSERT(abuf != NULL);
+			ASSERT(arc_buf_size(abuf) == max_blksz);
+			if (error = uiocopy(abuf->b_data, max_blksz,
+			    UIO_WRITE, uio, &cbytes)) {
+				dmu_return_arcbuf(abuf);
+				break;
+			}
+			ASSERT(cbytes == max_blksz);
+		}
+
 		/*
 		 * Start a transaction.
 		 */
-		woff = uio->uio_loffset;
 		tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_bonus(tx, zp->z_id);
 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
-		error = dmu_tx_assign(tx, zfsvfs->z_assign);
+		error = dmu_tx_assign(tx, TXG_NOWAIT);
 		if (error) {
-			if (error == ERESTART &&
-			    zfsvfs->z_assign == TXG_NOWAIT) {
+			if (error == ERESTART) {
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
-				continue;
+				goto again;
 			}
 			dmu_tx_abort(tx);
+			if (abuf != NULL)
+				dmu_return_arcbuf(abuf);
 			break;
 		}
 
@@ -833,18 +897,33 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 		if (woff + nbytes > zp->z_phys->zp_size)
 			vnode_pager_setsize(vp, woff + nbytes);
 
-		rw_enter(&zp->z_map_lock, RW_READER);
-
-		tx_bytes = uio->uio_resid;
-		if (vn_has_cached_data(vp)) {
-			rw_exit(&zp->z_map_lock);
-			error = mappedwrite(vp, nbytes, uio, tx);
+		if (abuf == NULL) {
+			tx_bytes = uio->uio_resid;
+			error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio,
+			    nbytes, tx);
+			tx_bytes -= uio->uio_resid;
 		} else {
-			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
-			    uio, nbytes, tx);
-			rw_exit(&zp->z_map_lock);
+			tx_bytes = nbytes;
+			ASSERT(tx_bytes == max_blksz);
+			dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
+			ASSERT(tx_bytes <= uio->uio_resid);
+			uioskip(uio, tx_bytes);
+		}
+
+		/*
+		 * XXXPJD: There are some cases (triggered by fsx) where
+		 *         vn_has_cached_data(vp) returns false when it should
+		 *         return true. This should be investigated.
+		 */
+#if 0
+		if (tx_bytes && vn_has_cached_data(vp))
+#else
+		if (tx_bytes && vp->v_object != NULL)
+#endif
+		{
+			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
+			    zp->z_id, uio->uio_segflg, tx);
 		}
-		tx_bytes -= uio->uio_resid;
 
 		/*
 		 * If we made no progress, we're done.  If we made even
@@ -906,7 +985,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 	 * If we're in replay mode, or we made no progress, return error.
 	 * Otherwise, it's at least a partial write, so it's successful.
 	 */
-	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
+	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
@@ -988,7 +1067,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 			error = ENOENT;
 			goto out;
 		}
-		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
+		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf,
+		    DMU_READ_NO_PREFETCH));
 	} else { /* indirect write */
 		uint64_t boff; /* block starting offset */
 
@@ -1027,16 +1107,28 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
 		ASSERT((error && error != EINPROGRESS) ||
 		    lr->lr_length <= zp->z_blksz);
-		if (error == 0)
+		if (error == 0) {
+			/*
+			 * dmu_sync() can compress a block of zeros to a null
+			 * blkptr but the block size still needs to be passed
+			 * through to replay.
+			 */
+			BP_SET_LSIZE(&lr->lr_blkptr, db->db_size);
 			zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
+		}
+
 		/*
 		 * If we get EINPROGRESS, then we need to wait for a
 		 * write IO initiated by dmu_sync() to complete before
 		 * we can release this dbuf.  We will finish everything
 		 * up in the zfs_get_done() callback.
 		 */
-		if (error == EINPROGRESS)
+		if (error == EINPROGRESS) {
 			return (0);
+		} else if (error == EALREADY) {
+			lr->lr_common.lrc_txtype = TX_WRITE2;
+			error = 0;
+		}
 		dmu_buf_rele(db, zgd);
 		kmem_free(zgd, sizeof (zgd_t));
 	}
@@ -1279,8 +1371,11 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
-	zfs_acl_t	*aclp = NULL;
-	zfs_fuid_info_t *fuidp = NULL;
+	ksid_t		*ksid;
+	uid_t		uid;
+	gid_t		gid = crgetgid(cr);
+	zfs_acl_ids_t	acl_ids;
+	boolean_t	fuid_dirtied;
 	void		*vsecp = NULL;
 	int		flag = 0;
 
@@ -1289,6 +1384,11 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
 	 * make sure file system is at proper version
 	 */
 
+	ksid = crgetsid(cr, KSID_OWNER);
+	if (ksid)
+		uid = ksid_getid(ksid);
+	else
+		uid = crgetuid(cr);
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || (vap->va_mask & AT_XVATTR) ||
 	    IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
@@ -1339,21 +1439,9 @@ top:
 			if (strcmp(name, "..") == 0)
 				error = EISDIR;
 			ZFS_EXIT(zfsvfs);
-			if (aclp)
-				zfs_acl_free(aclp);
-			return (error);
-		}
-	}
-	if (vsecp && aclp == NULL) {
-		error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
-		if (error) {
-			ZFS_EXIT(zfsvfs);
-			if (dl)
-				zfs_dirent_unlock(dl);
 			return (error);
 		}
 	}
-
 	if (zp == NULL) {
 		uint64_t txtype;
 
@@ -1375,52 +1463,52 @@ top:
 			goto out;
 		}
 
+
+		if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
+		    &acl_ids)) != 0)
+			goto out;
+		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+			error = EDQUOT;
+			goto out;
+		}
+
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-		if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) ||
-		    IS_EPHEMERAL(crgetgid(cr))) {
-			if (zfsvfs->z_fuid_obj == 0) {
-				dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-				dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-				    FUID_SIZE_ESTIMATE(zfsvfs));
-				dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
-				    FALSE, NULL);
-			} else {
-				dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-				dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-				    FUID_SIZE_ESTIMATE(zfsvfs));
-			}
-		}
+		fuid_dirtied = zfsvfs->z_fuid_dirty;
+		if (fuid_dirtied)
+			zfs_fuid_txhold(zfsvfs, tx);
 		dmu_tx_hold_bonus(tx, dzp->z_id);
 		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
-		if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) {
+		if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, SPA_MAXBLOCKSIZE);
 		}
-		error = dmu_tx_assign(tx, zfsvfs->z_assign);
+		error = dmu_tx_assign(tx, TXG_NOWAIT);
 		if (error) {
+			zfs_acl_ids_free(&acl_ids);
 			zfs_dirent_unlock(dl);
-			if (error == ERESTART &&
-			    zfsvfs->z_assign == TXG_NOWAIT) {
+			if (error == ERESTART) {
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				goto top;
 			}
 			dmu_tx_abort(tx);
 			ZFS_EXIT(zfsvfs);
-			if (aclp)
-				zfs_acl_free(aclp);
 			return (error);
 		}
-		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
+		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+
+		if (fuid_dirtied)
+			zfs_fuid_sync(zfsvfs, tx);
+
 		(void) zfs_link_create(dl, zp, tx, ZNEW);
+
 		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
 		if (flag & FIGNORECASE)
 			txtype |= TX_CI;
 		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
-		    vsecp, fuidp, vap);
-		if (fuidp)
-			zfs_fuid_info_free(fuidp);
+		    vsecp, acl_ids.z_fuidp, vap);
+		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_commit(tx);
 	} else {
 		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
@@ -1490,8 +1578,6 @@ out:
 			*vpp = svp;
 		}
 	}
-	if (aclp)
-		zfs_acl_free(aclp);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
@@ -1610,11 +1696,11 @@ top:
 	/* charge as an update -- would be nice not to charge at all */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		VN_RELE(vp);
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -1724,9 +1810,12 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
 	uint64_t	txtype;
 	dmu_tx_t	*tx;
 	int		error;
-	zfs_acl_t	*aclp = NULL;
-	zfs_fuid_info_t	*fuidp = NULL;
 	int		zf = ZNEW;
+	ksid_t		*ksid;
+	uid_t		uid;
+	gid_t		gid = crgetgid(cr);
+	zfs_acl_ids_t	acl_ids;
+	boolean_t	fuid_dirtied;
 
 	ASSERT(vap->va_type == VDIR);
 
@@ -1735,6 +1824,11 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
 	 * make sure file system is at proper version
 	 */
 
+	ksid = crgetsid(cr, KSID_OWNER);
+	if (ksid)
+		uid = ksid_getid(ksid);
+	else
+		uid = crgetuid(cr);
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))||
 	    IS_EPHEMERAL(crgetgid(cr))))
@@ -1782,59 +1876,51 @@ top:
 		return (error);
 	}
 
-	if (vsecp && aclp == NULL) {
-		error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
-		if (error) {
-			zfs_dirent_unlock(dl);
-			ZFS_EXIT(zfsvfs);
-			return (error);
-		}
+	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
+	    &acl_ids)) != 0) {
+		zfs_dirent_unlock(dl);
+		ZFS_EXIT(zfsvfs);
+		return (error);
 	}
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+		zfs_dirent_unlock(dl);
+		ZFS_EXIT(zfsvfs);
+		return (EDQUOT);
+	}
+
 	/*
 	 * Add a new entry to the directory.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
-	if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) ||
-	    IS_EPHEMERAL(crgetgid(cr))) {
-		if (zfsvfs->z_fuid_obj == 0) {
-			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
-		} else {
-			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-		}
-	}
-	if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp)
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 		    0, SPA_MAXBLOCKSIZE);
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
+		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
-		if (aclp)
-			zfs_acl_free(aclp);
 		return (error);
 	}
 
 	/*
 	 * Create new node.
 	 */
-	zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
-
-	if (aclp)
-		zfs_acl_free(aclp);
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
 
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
 	/*
 	 * Now put new name in parent dir.
 	 */
@@ -1845,10 +1931,10 @@ top:
 	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
 	if (flags & FIGNORECASE)
 		txtype |= TX_CI;
-	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap);
+	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
+	    acl_ids.z_fuidp, vap);
 
-	if (fuidp)
-		zfs_fuid_info_free(fuidp);
+	zfs_acl_ids_free(&acl_ids);
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
@@ -1942,13 +2028,13 @@ top:
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_bonus(tx, zp->z_id);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		rw_exit(&zp->z_parent_lock);
 		rw_exit(&zp->z_name_lock);
 		zfs_dirent_unlock(dl);
 		VN_RELE(vp);
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -2534,11 +2620,13 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	vattr_t		oldva;
+	xvattr_t	tmpxvattr;
 	uint_t		mask = vap->va_mask;
 	uint_t		saved_mask;
 	uint64_t	saved_mode;
 	int		trim_mask = 0;
 	uint64_t	new_mode;
+	uint64_t	new_uid, new_gid;
 	znode_t		*attrzp;
 	int		need_policy = FALSE;
 	int		err;
@@ -2547,6 +2635,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 	xoptattr_t	*xoap;
 	zfs_acl_t	*aclp = NULL;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+	boolean_t fuid_dirtied = B_FALSE;
 
 	if (mask == 0)
 		return (0);
@@ -2589,6 +2678,8 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 	 */
 	xoap = xva_getxoptattr(xvap);
 
+	xva_init(&tmpxvattr);
+
 	/*
 	 * Immutable files can only alter immutable bit and atime
 	 */
@@ -2711,28 +2802,78 @@ top:
 	oldva.va_mode = pzp->zp_mode;
 	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
 	if (mask & AT_XVATTR) {
-		if ((need_policy == FALSE) &&
-		    (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) &&
-		    xoap->xoa_appendonly !=
-		    ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) ||
-		    (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) &&
-		    xoap->xoa_nounlink !=
-		    ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) ||
-		    (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) &&
-		    xoap->xoa_immutable !=
-		    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) ||
-		    (XVA_ISSET_REQ(xvap, XAT_NODUMP) &&
-		    xoap->xoa_nodump !=
-		    ((pzp->zp_flags & ZFS_NODUMP) != 0)) ||
-		    (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) &&
-		    xoap->xoa_av_modified !=
-		    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) ||
-		    ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) &&
-		    ((vp->v_type != VREG && xoap->xoa_av_quarantined) ||
-		    xoap->xoa_av_quarantined !=
-		    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) ||
-		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
-		    (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
+		/*
+		 * Update xvattr mask to include only those attributes
+		 * that are actually changing.
+		 *
+		 * the bits will be restored prior to actually setting
+		 * the attributes so the caller thinks they were set.
+		 */
+		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+			if (xoap->xoa_appendonly !=
+			    ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
+				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+			if (xoap->xoa_nounlink !=
+			    ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
+				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+			if (xoap->xoa_immutable !=
+			    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
+				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+			if (xoap->xoa_nodump !=
+			    ((pzp->zp_flags & ZFS_NODUMP) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_NODUMP);
+				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+			if (xoap->xoa_av_modified !=
+			    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
+				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
+			}
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+			if ((vp->v_type != VREG &&
+			    xoap->xoa_av_quarantined) ||
+			    xoap->xoa_av_quarantined !=
+			    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) {
+				need_policy = TRUE;
+			} else {
+				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
+				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
+			}
+		}
+
+		if (need_policy == FALSE &&
+		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
+		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
 			need_policy = TRUE;
 		}
 	}
@@ -2800,30 +2941,14 @@ top:
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, zp->z_id);
-	if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
-	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) {
-		if (zfsvfs->z_fuid_obj == 0) {
-			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
-		} else {
-			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-		}
-	}
 
 	if (mask & AT_MODE) {
 		uint64_t pmode = pzp->zp_mode;
 
 		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
 
-		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) {
-			dmu_tx_abort(tx);
-			ZFS_EXIT(zfsvfs);
-			return (err);
-		}
+		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
+			goto out;
 		if (pzp->zp_acl.z_acl_extern_obj) {
 			/* Are we upgrading ACL from old V0 format to new V1 */
 			if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
@@ -2845,36 +2970,53 @@ top:
 		}
 	}
 
-	if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) {
-		err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
-		if (err) {
-			dmu_tx_abort(tx);
-			ZFS_EXIT(zfsvfs);
-			if (aclp)
-				zfs_acl_free(aclp);
-			return (err);
+	if (mask & (AT_UID | AT_GID)) {
+		if (pzp->zp_xattr) {
+			err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
+			if (err)
+				goto out;
+			dmu_tx_hold_bonus(tx, attrzp->z_id);
+		}
+		if (mask & AT_UID) {
+			new_uid = zfs_fuid_create(zfsvfs,
+			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
+			if (new_uid != pzp->zp_uid &&
+			    zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) {
+				err = EDQUOT;
+				goto out;
+			}
 		}
-		dmu_tx_hold_bonus(tx, attrzp->z_id);
-	}
-
-	err = dmu_tx_assign(tx, zfsvfs->z_assign);
-	if (err) {
-		if (attrzp)
-			VN_RELE(ZTOV(attrzp));
 
-		if (aclp) {
-			zfs_acl_free(aclp);
-			aclp = NULL;
+		if (mask & AT_GID) {
+			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
+			    cr, ZFS_GROUP, &fuidp);
+			if (new_gid != pzp->zp_gid &&
+			    zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) {
+				err = EDQUOT;
+				goto out;
+			}
 		}
+		fuid_dirtied = zfsvfs->z_fuid_dirty;
+		if (fuid_dirtied) {
+			if (zfsvfs->z_fuid_obj == 0) {
+				dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+				dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+				    FUID_SIZE_ESTIMATE(zfsvfs));
+				dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
+				    FALSE, NULL);
+			} else {
+				dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
+				dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
+				    FUID_SIZE_ESTIMATE(zfsvfs));
+			}
+		}
+	}
 
-		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+	err = dmu_tx_assign(tx, TXG_NOWAIT);
+	if (err) {
+		if (err == ERESTART)
 			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
-		dmu_tx_abort(tx);
-		ZFS_EXIT(zfsvfs);
-		return (err);
+		goto out;
 	}
 
 	dmu_buf_will_dirty(zp->z_dbuf, tx);
@@ -2892,7 +3034,7 @@ top:
 	if (mask & AT_MODE) {
 		mutex_enter(&zp->z_acl_lock);
 		zp->z_phys->zp_mode = new_mode;
-		err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
+		err = zfs_aclset_common(zp, aclp, cr, tx);
 		ASSERT3U(err, ==, 0);
 		mutex_exit(&zp->z_acl_lock);
 	}
@@ -2901,25 +3043,17 @@ top:
 		mutex_enter(&attrzp->z_lock);
 
 	if (mask & AT_UID) {
-		pzp->zp_uid = zfs_fuid_create(zfsvfs,
-		    vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
-		if (attrzp) {
-			attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs,
-			    vap->va_uid,  cr, ZFS_OWNER, tx, &fuidp);
-		}
+		pzp->zp_uid = new_uid;
+		if (attrzp)
+			attrzp->z_phys->zp_uid = new_uid;
 	}
 
 	if (mask & AT_GID) {
-		pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid,
-		    cr, ZFS_GROUP, tx, &fuidp);
+		pzp->zp_gid = new_gid;
 		if (attrzp)
-			attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs,
-			    vap->va_gid, cr, ZFS_GROUP, tx, &fuidp);
+			attrzp->z_phys->zp_gid = new_gid;
 	}
 
-	if (aclp)
-		zfs_acl_free(aclp);
-
 	if (attrzp)
 		mutex_exit(&attrzp->z_lock);
 
@@ -2940,6 +3074,31 @@ top:
 	 */
 
 	if (xoap && (mask & AT_XVATTR)) {
+
+		/*
+		 * restore trimmed off masks
+		 * so that return masks can be set for caller.
+		 */
+
+		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
+			XVA_SET_REQ(xvap, XAT_APPENDONLY);
+		}
+		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
+			XVA_SET_REQ(xvap, XAT_NOUNLINK);
+		}
+		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
+			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
+		}
+		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
+			XVA_SET_REQ(xvap, XAT_NODUMP);
+		}
+		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
+			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
+		}
+		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
+			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
+		}
+
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
 			size_t len;
 			dmu_object_info_t doi;
@@ -2956,17 +3115,35 @@ top:
 		zfs_xvattr_set(zp, xvap);
 	}
 
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
 	if (mask != 0)
 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
 
-	if (fuidp)
-		zfs_fuid_info_free(fuidp);
 	mutex_exit(&zp->z_lock);
 
+out:
 	if (attrzp)
 		VN_RELE(ZTOV(attrzp));
 
-	dmu_tx_commit(tx);
+	if (aclp) {
+		zfs_acl_free(aclp);
+		aclp = NULL;
+	}
+
+	if (fuidp) {
+		zfs_fuid_info_free(fuidp);
+		fuidp = NULL;
+	}
+
+	if (err)
+		dmu_tx_abort(tx);
+	else
+		dmu_tx_commit(tx);
+
+	if (err == ERESTART)
+		goto top;
 
 	ZFS_EXIT(zfsvfs);
 	return (err);
@@ -3329,7 +3506,7 @@ top:
 	if (tzp)
 		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		if (zl != NULL)
 			zfs_rename_unlock(&zl);
@@ -3342,7 +3519,7 @@ top:
 		VN_RELE(ZTOV(szp));
 		if (tzp)
 			VN_RELE(ZTOV(tzp));
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -3428,7 +3605,8 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
 	int		len = strlen(link);
 	int		error;
 	int		zflg = ZNEW;
-	zfs_fuid_info_t *fuidp = NULL;
+	zfs_acl_ids_t	acl_ids;
+	boolean_t	fuid_dirtied;
 	int		flags = 0;
 
 	ASSERT(vap->va_type == VLNK);
@@ -3464,28 +3642,27 @@ top:
 		return (error);
 	}
 
+	VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids));
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+		zfs_acl_ids_free(&acl_ids);
+		zfs_dirent_unlock(dl);
+		ZFS_EXIT(zfsvfs);
+		return (EDQUOT);
+	}
 	tx = dmu_tx_create(zfsvfs->z_os);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
 	dmu_tx_hold_bonus(tx, dzp->z_id);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
-	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+	if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
-	if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
-		if (zfsvfs->z_fuid_obj == 0) {
-			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
-		} else {
-			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-		}
-	}
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
+		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -3503,13 +3680,16 @@ top:
 	 * otherwise, store it just like any other file data.
 	 */
 	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
-		zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp);
+		zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids);
 		if (len != 0)
 			bcopy(link, zp->z_phys + 1, len);
 	} else {
 		dmu_buf_t *dbp;
 
-		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp);
+		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+
+		if (fuid_dirtied)
+			zfs_fuid_sync(zfsvfs, tx);
 		/*
 		 * Nothing can access the znode yet so no locking needed
 		 * for growing the znode's blocksize.
@@ -3530,7 +3710,6 @@ top:
 	 * Insert the new object into the directory.
 	 */
 	(void) zfs_link_create(dl, zp, tx, ZNEW);
-out:
 	if (error == 0) {
 		uint64_t txtype = TX_SYMLINK;
 		if (flags & FIGNORECASE)
@@ -3538,8 +3717,8 @@ out:
 		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
 		*vpp = ZTOV(zp);
 	}
-	if (fuidp)
-		zfs_fuid_info_free(fuidp);
+
+	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
@@ -3701,10 +3880,10 @@ top:
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, szp->z_id);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -4994,6 +5173,7 @@ zfs_freebsd_aclcheck(ap)
 
 struct vop_vector zfs_vnodeops;
 struct vop_vector zfs_fifoops;
+struct vop_vector zfs_shareops;
 
 struct vop_vector zfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
@@ -5052,3 +5232,15 @@ struct vop_vector zfs_fifoops = {
 	.vop_setacl =		zfs_freebsd_setacl,
 	.vop_aclcheck =		zfs_freebsd_aclcheck,
 };
+
+/*
+ * special share hidden files vnode operations template
+ */
+struct vop_vector zfs_shareops = {
+	.vop_default =		&default_vnodeops,
+	.vop_access =		zfs_freebsd_access,
+	.vop_inactive =		zfs_freebsd_inactive,
+	.vop_reclaim =		zfs_freebsd_reclaim,
+	.vop_fid =		zfs_freebsd_fid,
+	.vop_pathconf =		zfs_freebsd_pathconf,
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
index 947f9dd..740302a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -126,6 +126,7 @@ znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
 
 extern struct vop_vector zfs_vnodeops;
 extern struct vop_vector zfs_fifoops;
+extern struct vop_vector zfs_shareops;
 
 /*
  * XXX: We cannot use this function as a cache constructor, because
@@ -160,7 +161,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 	list_link_init(&zp->z_link_node);
 
 	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
-	rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -185,7 +185,6 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 	vn_free(ZTOV(zp));
 	ASSERT(!list_link_active(&zp->z_link_node));
 	mutex_destroy(&zp->z_lock);
-	rw_destroy(&zp->z_map_lock);
 	rw_destroy(&zp->z_parent_lock);
 	rw_destroy(&zp->z_name_lock);
 	mutex_destroy(&zp->z_acl_lock);
@@ -252,17 +251,6 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
 	POINTER_INVALIDATE(&ozp->z_zfsvfs);
 }
 
-/*
- * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise
- * returns a non-zero error code.
- */
-static int
-zfs_enter(zfsvfs_t *zfsvfs)
-{
-	ZFS_ENTER(zfsvfs);
-	return (0);
-}
-
 /*ARGSUSED*/
 static kmem_cbrc_t
 zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
@@ -287,8 +275,11 @@ zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
 
 	/*
 	 * Ensure that the filesystem is not unmounted during the move.
+	 * This is the equivalent to ZFS_ENTER().
 	 */
-	if (zfs_enter(zfsvfs) != 0) {		/* ZFS_ENTER */
+	rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
+	if (zfsvfs->z_unmounted) {
+		ZFS_EXIT(zfsvfs);
 		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
@@ -378,97 +369,55 @@ zfs_znode_fini(void)
 	znode_cache = NULL;
 }
 
-/*
- * zfs_init_fs - Initialize the zfsvfs struct and the file system
- *	incore "master" object.  Verify version compatibility.
- */
 int
-zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp)
+zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
 {
-	objset_t	*os = zfsvfs->z_os;
-	int		i, error;
-	uint64_t fsid_guid;
-	uint64_t zval;
-
-	*zpp = NULL;
-
-	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
-	if (error) {
-		return (error);
-	} else if (zfsvfs->z_version > ZPL_VERSION) {
-		(void) printf("Mismatched versions:  File system "
-		    "is version %llu on-disk format, which is "
-		    "incompatible with this software version %lld!",
-		    (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
-		return (ENOTSUP);
-	}
-
-	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
-		return (error);
-	zfsvfs->z_norm = (int)zval;
-	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
-		return (error);
-	zfsvfs->z_utf8 = (zval != 0);
-	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
-		return (error);
-	zfsvfs->z_case = (uint_t)zval;
-	/*
-	 * Fold case on file systems that are always or sometimes case
-	 * insensitive.
-	 */
-	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
-	    zfsvfs->z_case == ZFS_CASE_MIXED)
-		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
-
-	/*
-	 * The fsid is 64 bits, composed of an 8-bit fs type, which
-	 * separates our fsid from any other filesystem types, and a
-	 * 56-bit objset unique ID.  The objset unique ID is unique to
-	 * all objsets open on this system, provided by unique_create().
-	 * The 8-bit fs type must be put in the low bits of fsid[1]
-	 * because that's where other Solaris filesystems put it.
-	 */
-	fsid_guid = dmu_objset_fsid_guid(os);
-	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
-	zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
-	zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
-	    zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF;
-
-	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
-	    &zfsvfs->z_root);
-	if (error)
-		return (error);
-	ASSERT(zfsvfs->z_root != 0);
-
-	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
-	    &zfsvfs->z_unlinkedobj);
-	if (error)
-		return (error);
-
-	/*
-	 * Initialize zget mutex's
-	 */
-	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
-		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+	zfs_acl_ids_t acl_ids;
+	vattr_t vattr;
+	znode_t *sharezp;
+	vnode_t *vp, vnode;
+	znode_t *zp;
+	int error;
 
-	error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
-	if (error) {
-		/*
-		 * On error, we destroy the mutexes here since it's not
-		 * possible for the caller to determine if the mutexes were
-		 * initialized properly.
-		 */
-		for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
-			mutex_destroy(&zfsvfs->z_hold_mtx[i]);
-		return (error);
-	}
-	ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
-	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
-	    &zfsvfs->z_fuid_obj);
-	if (error == ENOENT)
-		error = 0;
+	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
+	vattr.va_type = VDIR;
+	vattr.va_mode = S_IFDIR|0555;
+	vattr.va_uid = crgetuid(kcred);
+	vattr.va_gid = crgetgid(kcred);
+
+	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+	zfs_znode_cache_constructor(sharezp, zfsvfs->z_parent->z_vfs, 0);
+	sharezp->z_unlinked = 0;
+	sharezp->z_atime_dirty = 0;
+	sharezp->z_zfsvfs = zfsvfs;
+
+	sharezp->z_vnode = &vnode;
+	vnode.v_data = sharezp;
+
+	vp = ZTOV(sharezp);
+	vp->v_type = VDIR;
+
+	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
+	    kcred, NULL, &acl_ids));
+	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
+	    &zp, 0, &acl_ids);
+	ASSERT3P(zp, ==, sharezp);
+	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
+	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
+	zfsvfs->z_shares_dir = sharezp->z_id;
+
+	zfs_acl_ids_free(&acl_ids);
+	ZTOV(sharezp)->v_data = NULL;
+	ZTOV(sharezp)->v_count = 0;
+	ZTOV(sharezp)->v_holdcnt = 0;
+	zp->z_vnode = NULL;
+	sharezp->z_vnode = NULL;
+	dmu_buf_rele(sharezp->z_dbuf, NULL);
+	sharezp->z_dbuf = NULL;
+	kmem_cache_free(znode_cache, sharezp);
 
-	return (0);
+	return (error);
 }
 
 /*
@@ -611,6 +560,11 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
 	case VFIFO:
 		vp->v_op = &zfs_fifoops;
 		break;
+        case VREG:
+		if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir) {
+			vp->v_op = &zfs_shareops;
+		}
+		break;
 	}
 	if (vp->v_type != VFIFO)
 		VN_LOCK_ASHARE(vp);
@@ -639,7 +593,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
  *		flag	- flags:
  *			  IS_ROOT_NODE	- new object will be root
  *			  IS_XATTR	- new object is an attribute
- *			  IS_REPLAY	- intent log replay
  *		bonuslen - length of bonus buffer
  *		setaclp  - File/Dir initial ACL
  *		fuidp	 - Tracks fuid allocation.
@@ -649,8 +602,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
  */
 void
 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
-    uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp,
-    zfs_fuid_info_t **fuidp)
+    uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
 {
 	dmu_buf_t	*db;
 	znode_phys_t	*pzp;
@@ -661,9 +613,8 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 
 	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
 
-	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
+	if (zfsvfs->z_replay) {
 		obj = vap->va_nodeid;
-		flag |= IS_REPLAY;
 		now = vap->va_ctime;		/* see zfs_replay_create() */
 		gen = vap->va_nblocks;		/* ditto */
 	} else {
@@ -682,7 +633,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 	 * assertions below.
 	 */
 	if (vap->va_type == VDIR) {
-		if (flag & IS_REPLAY) {
+		if (zfsvfs->z_replay) {
 			err = zap_create_claim_norm(zfsvfs->z_os, obj,
 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
@@ -693,7 +644,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
 		}
 	} else {
-		if (flag & IS_REPLAY) {
+		if (zfsvfs->z_replay) {
 			err = dmu_object_claim(zfsvfs->z_os, obj,
 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
 			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
@@ -775,7 +726,12 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 		 */
 		*zpp = dzp;
 	}
-	zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp);
+	pzp->zp_uid = acl_ids->z_fuid;
+	pzp->zp_gid = acl_ids->z_fgid;
+	pzp->zp_mode = acl_ids->z_mode;
+	VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+	if (vap->va_mask & AT_XVATTR)
+		zfs_xvattr_set(*zpp, (xvattr_t *)vap);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
 	if (!(flag & IS_ROOT_NODE)) {
 		vnode_t *vp;
@@ -1225,9 +1181,9 @@ top:
 		newblksz = 0;
 	}
 
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -1247,11 +1203,7 @@ top:
 
 	dmu_tx_commit(tx);
 
-	rw_enter(&zp->z_map_lock, RW_WRITER);
-	error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0);
-	ASSERT(error == 0);
 	vnode_pager_setsize(ZTOV(zp), end);
-	rw_exit(&zp->z_map_lock);
 
 	return (0);
 }
@@ -1296,11 +1248,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
 		 * In FreeBSD we cannot free block in the middle of a file,
 		 * but only at the end of a file.
 		 */
-		rw_enter(&zp->z_map_lock, RW_WRITER);
-		error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0);
-		ASSERT(error == 0);
 		vnode_pager_setsize(ZTOV(zp), off);
-		rw_exit(&zp->z_map_lock);
 	}
 
 	zfs_range_unlock(rl);
@@ -1347,9 +1295,9 @@ zfs_trunc(znode_t *zp, uint64_t end)
 top:
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, zp->z_id);
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
@@ -1364,23 +1312,15 @@ top:
 
 	dmu_tx_commit(tx);
 
-	zfs_range_unlock(rl);
-
 	/*
 	 * Clear any mapped pages in the truncated region.  This has to
 	 * happen outside of the transaction to avoid the possibility of
 	 * a deadlock with someone trying to push a page that we are
 	 * about to invalidate.
 	 */
-	rw_enter(&zp->z_map_lock, RW_WRITER);
-#if 0
-	error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE);
-#else
-	error = vinvalbuf(vp, V_SAVE, 0, 0);
-	ASSERT(error == 0);
 	vnode_pager_setsize(vp, end);
-#endif
-	rw_exit(&zp->z_map_lock);
+
+	zfs_range_unlock(rl);
 
 	return (0);
 }
@@ -1426,9 +1366,9 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
 log:
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_bonus(tx, zp->z_id);
-	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
-		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		if (error == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto log;
@@ -1448,7 +1388,7 @@ void
 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 {
 	zfsvfs_t	zfsvfs;
-	uint64_t	moid, doid, version;
+	uint64_t	moid, obj, version;
 	uint64_t	sense = ZFS_CASE_SENSITIVE;
 	uint64_t	norm = 0;
 	nvpair_t	*elem;
@@ -1458,6 +1398,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 	vnode_t		vnode;
 	vattr_t		vattr;
 	znode_t		*zp;
+	zfs_acl_ids_t	acl_ids;
 
 	/*
 	 * First attempt to create master node.
@@ -1474,12 +1415,12 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 	/*
 	 * Set starting attributes.
 	 */
-	if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
+	if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE)
 		version = ZPL_VERSION;
+	else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
+		version = ZPL_VERSION_USERSPACE - 1;
 	else
 		version = ZPL_VERSION_FUID - 1;
-	error = zap_update(os, moid, ZPL_VERSION_STR,
-	    8, 1, &version, tx);
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
 		/* For the moment we expect all zpl props to be uint64_ts */
@@ -1490,9 +1431,8 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 		VERIFY(nvpair_value_uint64(elem, &val) == 0);
 		name = nvpair_name(elem);
 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
-			version = val;
-			error = zap_update(os, moid, ZPL_VERSION_STR,
-			    8, 1, &version, tx);
+			if (val < version)
+				version = val;
 		} else {
 			error = zap_update(os, moid, name, 8, 1, &val, tx);
 		}
@@ -1503,13 +1443,14 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 			sense = val;
 	}
 	ASSERT(version != 0);
+	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
 
 	/*
 	 * Create a delete queue.
 	 */
-	doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
+	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
 
-	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
+	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
 	ASSERT(error == 0);
 
 	/*
@@ -1535,7 +1476,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 	bzero(&zfsvfs, sizeof (zfsvfs_t));
 
 	zfsvfs.z_os = os;
-	zfsvfs.z_assign = TXG_NOWAIT;
 	zfsvfs.z_parent = &zfsvfs;
 	zfsvfs.z_version = version;
 	zfsvfs.z_use_fuids = USE_FUIDS(version, os);
@@ -1556,19 +1496,30 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 
 	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
 	rootzp->z_zfsvfs = &zfsvfs;
-	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL);
+	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
+	    cr, NULL, &acl_ids));
+	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids);
 	ASSERT3P(zp, ==, rootzp);
 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
 	ASSERT(error == 0);
+	zfs_acl_ids_free(&acl_ids);
 	POINTER_INVALIDATE(&rootzp->z_zfsvfs);
 
 	dmu_buf_rele(rootzp->z_dbuf, NULL);
 	rootzp->z_dbuf = NULL;
-	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
-		mutex_destroy(&zfsvfs.z_hold_mtx[i]);
-	mutex_destroy(&zfsvfs.z_znodes_lock);
 	rootzp->z_vnode = NULL;
 	kmem_cache_free(znode_cache, rootzp);
+
+	/*
+	 * Create shares directory
+	 */
+
+	error = zfs_create_share_dir(&zfsvfs, tx);
+
+	ASSERT(error == 0);
+
+	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+		mutex_destroy(&zfsvfs.z_hold_mtx[i]);
 }
 
 #endif /* _KERNEL */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
index 4a0e8d5..7839713 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
@@ -729,17 +729,26 @@ zil_lwb_write_done(zio_t *zio)
 	ASSERT(zio->io_bp->blk_fill == 0);
 
 	/*
-	 * Now that we've written this log block, we have a stable pointer
-	 * to the next block in the chain, so it's OK to let the txg in
-	 * which we allocated the next block sync.
+	 * Ensure the lwb buffer pointer is cleared before releasing
+	 * the txg. If we have had an allocation failure and
+	 * the txg is waiting to sync then we want want zil_sync()
+	 * to remove the lwb so that it's not picked up as the next new
+	 * one in zil_commit_writer(). zil_sync() will only remove
+	 * the lwb if lwb_buf is null.
 	 */
-	txg_rele_to_sync(&lwb->lwb_txgh);
-
 	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 	mutex_enter(&zilog->zl_lock);
 	lwb->lwb_buf = NULL;
 	if (zio->io_error)
 		zilog->zl_log_error = B_TRUE;
+
+	/*
+	 * Now that we've written this log block, we have a stable pointer
+	 * to the next block in the chain, so it's OK to let the txg in
+	 * which we allocated the next block sync. We still have the
+	 * zl_lock to ensure zil_sync doesn't kmem free the lwb.
+	 */
+	txg_rele_to_sync(&lwb->lwb_txgh);
 	mutex_exit(&zilog->zl_lock);
 }
 
@@ -1226,20 +1235,26 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 	spa_t *spa = zilog->zl_spa;
 	lwb_t *lwb;
 
+	/*
+	 * We don't zero out zl_destroy_txg, so make sure we don't try
+	 * to destroy it twice.
+	 */
+	if (spa_sync_pass(spa) != 1)
+		return;
+
 	mutex_enter(&zilog->zl_lock);
 
 	ASSERT(zilog->zl_stop_sync == 0);
 
-	zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
+	zh->zh_replay_seq = zilog->zl_replayed_seq[txg & TXG_MASK];
 
 	if (zilog->zl_destroy_txg == txg) {
 		blkptr_t blk = zh->zh_log;
 
 		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
-		ASSERT(spa_sync_pass(spa) == 1);
 
 		bzero(zh, sizeof (zil_header_t));
-		bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
+		bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
 
 		if (zilog->zl_keep_first) {
 			/*
@@ -1454,12 +1469,57 @@ zil_resume(zilog_t *zilog)
 	mutex_exit(&zilog->zl_lock);
 }
 
+/*
+ * Read in the data for the dmu_sync()ed block, and change the log
+ * record to write this whole block.
+ */
+void
+zil_get_replay_data(zilog_t *zilog, lr_write_t *lr)
+{
+	blkptr_t *wbp = &lr->lr_blkptr;
+	char *wbuf = (char *)(lr + 1); /* data follows lr_write_t */
+	uint64_t blksz;
+
+	if (BP_IS_HOLE(wbp)) {	/* compressed to a hole */
+		blksz = BP_GET_LSIZE(&lr->lr_blkptr);
+		/*
+		 * If the blksz is zero then we must be replaying a log
+		 * from an version prior to setting the blksize of null blocks.
+		 * So we just zero the actual write size reqeusted.
+		 */
+		if (blksz == 0) {
+			bzero(wbuf, lr->lr_length);
+			return;
+		}
+		bzero(wbuf, blksz);
+	} else {
+		/*
+		 * A subsequent write may have overwritten this block, in which
+		 * case wbp may have been been freed and reallocated, and our
+		 * read of wbp may fail with a checksum error.  We can safely
+		 * ignore this because the later write will provide the
+		 * correct data.
+		 */
+		zbookmark_t zb;
+
+		zb.zb_objset = dmu_objset_id(zilog->zl_os);
+		zb.zb_object = lr->lr_foid;
+		zb.zb_level = 0;
+		zb.zb_blkid = -1; /* unknown */
+
+		blksz = BP_GET_LSIZE(&lr->lr_blkptr);
+		(void) zio_wait(zio_read(NULL, zilog->zl_spa, wbp, wbuf, blksz,
+		    NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
+	}
+	lr->lr_offset -= lr->lr_offset % blksz;
+	lr->lr_length = blksz;
+}
+
 typedef struct zil_replay_arg {
 	objset_t	*zr_os;
 	zil_replay_func_t **zr_replay;
-	zil_replay_cleaner_t *zr_replay_cleaner;
 	void		*zr_arg;
-	uint64_t	*zr_txgp;
 	boolean_t	zr_byteswap;
 	char		*zr_lrbuf;
 } zil_replay_arg_t;
@@ -1472,9 +1532,9 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
 	uint64_t reclen = lr->lrc_reclen;
 	uint64_t txtype = lr->lrc_txtype;
 	char *name;
-	int pass, error, sunk;
+	int pass, error;
 
-	if (zilog->zl_stop_replay)
+	if (!zilog->zl_replay)			/* giving up */
 		return;
 
 	if (lr->lrc_txg < claim_txg)		/* already committed */
@@ -1486,6 +1546,11 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
 	/* Strip case-insensitive bit, still present in log record */
 	txtype &= ~TX_CI;
 
+	if (txtype == 0 || txtype >= TX_MAX_TYPE) {
+		error = EINVAL;
+		goto bad;
+	}
+
 	/*
 	 * Make a copy of the data so we can revise and extend it.
 	 */
@@ -1502,103 +1567,16 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
 		byteswap_uint64_array(zr->zr_lrbuf, reclen);
 
 	/*
-	 * If this is a TX_WRITE with a blkptr, suck in the data.
-	 */
-	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
-		lr_write_t *lrw = (lr_write_t *)lr;
-		blkptr_t *wbp = &lrw->lr_blkptr;
-		uint64_t wlen = lrw->lr_length;
-		char *wbuf = zr->zr_lrbuf + reclen;
-
-		if (BP_IS_HOLE(wbp)) {	/* compressed to a hole */
-			bzero(wbuf, wlen);
-		} else {
-			/*
-			 * A subsequent write may have overwritten this block,
-			 * in which case wbp may have been been freed and
-			 * reallocated, and our read of wbp may fail with a
-			 * checksum error.  We can safely ignore this because
-			 * the later write will provide the correct data.
-			 */
-			zbookmark_t zb;
-
-			zb.zb_objset = dmu_objset_id(zilog->zl_os);
-			zb.zb_object = lrw->lr_foid;
-			zb.zb_level = -1;
-			zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp);
-
-			(void) zio_wait(zio_read(NULL, zilog->zl_spa,
-			    wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
-			    ZIO_PRIORITY_SYNC_READ,
-			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
-			(void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
-		}
-	}
-
-	/*
-	 * Replay of large truncates can end up needing additional txs
-	 * and a different txg. If they are nested within the replay tx
-	 * as below then a hang is possible. So we do the truncate here
-	 * and redo the truncate later (a no-op) and update the sequence
-	 * number whilst in the replay tx. Fortunately, it's safe to repeat
-	 * a truncate if we crash and the truncate commits. A create over
-	 * an existing file will also come in as a TX_TRUNCATE record.
-	 *
-	 * Note, remove of large files and renames over large files is
-	 * handled by putting the deleted object on a stable list
-	 * and if necessary force deleting the object outside of the replay
-	 * transaction using the zr_replay_cleaner.
-	 */
-	if (txtype == TX_TRUNCATE) {
-		*zr->zr_txgp = TXG_NOWAIT;
-		error = zr->zr_replay[TX_TRUNCATE](zr->zr_arg, zr->zr_lrbuf,
-		    zr->zr_byteswap);
-		if (error)
-			goto bad;
-		zr->zr_byteswap = 0; /* only byteswap once */
-	}
-
-	/*
 	 * We must now do two things atomically: replay this log record,
-	 * and update the log header to reflect the fact that we did so.
-	 * We use the DMU's ability to assign into a specific txg to do this.
+	 * and update the log header sequence number to reflect the fact that
+	 * we did so. At the end of each replay function the sequence number
+	 * is updated if we are in replay mode.
 	 */
-	for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) {
-		uint64_t replay_txg;
-		dmu_tx_t *replay_tx;
-
-		replay_tx = dmu_tx_create(zr->zr_os);
-		error = dmu_tx_assign(replay_tx, TXG_WAIT);
-		if (error) {
-			dmu_tx_abort(replay_tx);
-			break;
-		}
-
-		replay_txg = dmu_tx_get_txg(replay_tx);
-
-		if (txtype == 0 || txtype >= TX_MAX_TYPE) {
-			error = EINVAL;
-		} else {
-			/*
-			 * On the first pass, arrange for the replay vector
-			 * to fail its dmu_tx_assign().  That's the only way
-			 * to ensure that those code paths remain well tested.
-			 *
-			 * Only byteswap (if needed) on the 1st pass.
-			 */
-			*zr->zr_txgp = replay_txg - (pass == 1);
-			error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
-			    zr->zr_byteswap && pass == 1);
-			*zr->zr_txgp = TXG_NOWAIT;
-		}
-
-		if (error == 0) {
-			dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
-			zilog->zl_replay_seq[replay_txg & TXG_MASK] =
-			    lr->lrc_seq;
-		}
-
-		dmu_tx_commit(replay_tx);
+	for (pass = 1; pass <= 2; pass++) {
+		zilog->zl_replaying_seq = lr->lrc_seq;
+		/* Only byteswap (if needed) on the 1st pass.  */
+		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
+		    zr->zr_byteswap && pass == 1);
 
 		if (!error)
 			return;
@@ -1606,37 +1584,22 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
 		/*
 		 * The DMU's dnode layer doesn't see removes until the txg
 		 * commits, so a subsequent claim can spuriously fail with
-		 * EEXIST. So if we receive any error other than ERESTART
-		 * we try syncing out any removes then retrying the
-		 * transaction.
+		 * EEXIST. So if we receive any error we try syncing out
+		 * any removes then retry the transaction.
 		 */
-		if (error != ERESTART && !sunk) {
-			if (zr->zr_replay_cleaner)
-				zr->zr_replay_cleaner(zr->zr_arg);
+		if (pass == 1)
 			txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
-			sunk = B_TRUE;
-			continue; /* retry */
-		}
-
-		if (error != ERESTART)
-			break;
-
-		if (pass != 1)
-			txg_wait_open(spa_get_dsl(zilog->zl_spa),
-			    replay_txg + 1);
-
-		dprintf("pass %d, retrying\n", pass);
 	}
 
 bad:
-	ASSERT(error && error != ERESTART);
+	ASSERT(error);
 	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 	dmu_objset_name(zr->zr_os, name);
 	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
 	    "dataset %s, seq 0x%llx, txtype %llu %s\n",
 	    error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
 	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
-	zilog->zl_stop_replay = 1;
+	zilog->zl_replay = B_FALSE;
 	kmem_free(name, MAXNAMELEN);
 }
 
@@ -1651,9 +1614,7 @@ zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
  * If this dataset has a non-empty intent log, replay it and destroy it.
  */
 void
-zil_replay(objset_t *os, void *arg, uint64_t *txgp,
-	zil_replay_func_t *replay_func[TX_MAX_TYPE],
-	zil_replay_cleaner_t *replay_cleaner)
+zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 	const zil_header_t *zh = zilog->zl_header;
@@ -1667,9 +1628,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
 
 	zr.zr_os = os;
 	zr.zr_replay = replay_func;
-	zr.zr_replay_cleaner = replay_cleaner;
 	zr.zr_arg = arg;
-	zr.zr_txgp = txgp;
 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
 	zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
 
@@ -1678,7 +1637,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
-	zilog->zl_stop_replay = 0;
+	zilog->zl_replay = B_TRUE;
 	zilog->zl_replay_time = LBOLT;
 	ASSERT(zilog->zl_replay_blks == 0);
 	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
@@ -1687,6 +1646,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp,
 
 	zil_destroy(zilog, B_FALSE);
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+	zilog->zl_replay = B_FALSE;
 	//printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name);
 }
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index 8879742..75b7617 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -76,6 +76,7 @@ char *zio_type_name[ZIO_TYPES] = {
  * ==========================================================================
  */
 kmem_cache_t *zio_cache;
+kmem_cache_t *zio_link_cache;
 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 
@@ -94,8 +95,10 @@ void
 zio_init(void)
 {
 	size_t c;
-	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
-	    NULL, NULL, NULL, NULL, NULL, 0);
+	zio_cache = kmem_cache_create("zio_cache",
+	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	zio_link_cache = kmem_cache_create("zio_link_cache",
+	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	/*
 	 * For small buffers, we want a cache for each multiple of
@@ -165,6 +168,7 @@ zio_fini(void)
 		zio_data_buf_cache[c] = NULL;
 	}
 
+	kmem_cache_destroy(zio_link_cache);
 	kmem_cache_destroy(zio_cache);
 
 	zio_inject_fini();
@@ -311,41 +315,102 @@ zio_decompress(zio_t *zio, void *data, uint64_t size)
  * I/O parent/child relationships and pipeline interlocks
  * ==========================================================================
  */
+/*
+ * NOTE - Callers to zio_walk_parents() and zio_walk_children must
+ *        continue calling these functions until they return NULL.
+ *        Otherwise, the next caller will pick up the list walk in
+ *        some indeterminate state.  (Otherwise every caller would
+ *        have to pass in a cookie to keep the state represented by
+ *        io_walk_link, which gets annoying.)
+ */
+zio_t *
+zio_walk_parents(zio_t *cio)
+{
+	zio_link_t *zl = cio->io_walk_link;
+	list_t *pl = &cio->io_parent_list;
 
-static void
-zio_add_child(zio_t *pio, zio_t *zio)
+	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
+	cio->io_walk_link = zl;
+
+	if (zl == NULL)
+		return (NULL);
+
+	ASSERT(zl->zl_child == cio);
+	return (zl->zl_parent);
+}
+
+zio_t *
+zio_walk_children(zio_t *pio)
+{
+	zio_link_t *zl = pio->io_walk_link;
+	list_t *cl = &pio->io_child_list;
+
+	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
+	pio->io_walk_link = zl;
+
+	if (zl == NULL)
+		return (NULL);
+
+	ASSERT(zl->zl_parent == pio);
+	return (zl->zl_child);
+}
+
+zio_t *
+zio_unique_parent(zio_t *cio)
 {
+	zio_t *pio = zio_walk_parents(cio);
+
+	VERIFY(zio_walk_parents(cio) == NULL);
+	return (pio);
+}
+
+void
+zio_add_child(zio_t *pio, zio_t *cio)
+{
+	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
+
+	/*
+	 * Logical I/Os can have logical, gang, or vdev children.
+	 * Gang I/Os can have gang or vdev children.
+	 * Vdev I/Os can only have vdev children.
+	 * The following ASSERT captures all of these constraints.
+	 */
+	ASSERT(cio->io_child_type <= pio->io_child_type);
+
+	zl->zl_parent = pio;
+	zl->zl_child = cio;
+
+	mutex_enter(&cio->io_lock);
 	mutex_enter(&pio->io_lock);
-	if (zio->io_stage < ZIO_STAGE_READY)
-		pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++;
-	if (zio->io_stage < ZIO_STAGE_DONE)
-		pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++;
-	zio->io_sibling_prev = NULL;
-	zio->io_sibling_next = pio->io_child;
-	if (pio->io_child != NULL)
-		pio->io_child->io_sibling_prev = zio;
-	pio->io_child = zio;
-	zio->io_parent = pio;
+
+	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
+
+	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
+
+	list_insert_head(&pio->io_child_list, zl);
+	list_insert_head(&cio->io_parent_list, zl);
+
 	mutex_exit(&pio->io_lock);
+	mutex_exit(&cio->io_lock);
 }
 
 static void
-zio_remove_child(zio_t *pio, zio_t *zio)
+zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 {
-	zio_t *next, *prev;
-
-	ASSERT(zio->io_parent == pio);
+	ASSERT(zl->zl_parent == pio);
+	ASSERT(zl->zl_child == cio);
 
+	mutex_enter(&cio->io_lock);
 	mutex_enter(&pio->io_lock);
-	next = zio->io_sibling_next;
-	prev = zio->io_sibling_prev;
-	if (next != NULL)
-		next->io_sibling_prev = prev;
-	if (prev != NULL)
-		prev->io_sibling_next = next;
-	if (pio->io_child == zio)
-		pio->io_child = next;
+
+	list_remove(&pio->io_child_list, zl);
+	list_remove(&cio->io_parent_list, zl);
+
 	mutex_exit(&pio->io_lock);
+	mutex_exit(&cio->io_lock);
+
+	kmem_cache_free(zio_link_cache, zl);
 }
 
 static boolean_t
@@ -420,6 +485,11 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 
+	list_create(&zio->io_parent_list, sizeof (zio_link_t),
+	    offsetof(zio_link_t, zl_parent_node));
+	list_create(&zio->io_child_list, sizeof (zio_link_t),
+	    offsetof(zio_link_t, zl_child_node));
+
 	if (vd != NULL)
 		zio->io_child_type = ZIO_CHILD_VDEV;
 	else if (flags & ZIO_FLAG_GANG_CHILD)
@@ -433,11 +503,10 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 		zio->io_bp_orig = *bp;
 		if (type != ZIO_TYPE_WRITE)
 			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
-		if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
-			if (BP_IS_GANG(bp))
-				pipeline |= ZIO_GANG_STAGES;
+		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 			zio->io_logical = zio;
-		}
+		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
+			pipeline |= ZIO_GANG_STAGES;
 	}
 
 	zio->io_spa = spa;
@@ -454,19 +523,17 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 	zio->io_orig_stage = zio->io_stage = stage;
 	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 
+	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
+	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
+
 	if (zb != NULL)
 		zio->io_bookmark = *zb;
 
 	if (pio != NULL) {
-		/*
-		 * Logical I/Os can have logical, gang, or vdev children.
-		 * Gang I/Os can have gang or vdev children.
-		 * Vdev I/Os can only have vdev children.
-		 * The following ASSERT captures all of these constraints.
-		 */
-		ASSERT(zio->io_child_type <= pio->io_child_type);
 		if (zio->io_logical == NULL)
 			zio->io_logical = pio->io_logical;
+		if (zio->io_child_type == ZIO_CHILD_GANG)
+			zio->io_gang_leader = pio->io_gang_leader;
 		zio_add_child(pio, zio);
 	}
 
@@ -476,29 +543,21 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 static void
 zio_destroy(zio_t *zio)
 {
-	spa_t *spa = zio->io_spa;
-	uint8_t async_root = zio->io_async_root;
-
+	list_destroy(&zio->io_parent_list);
+	list_destroy(&zio->io_child_list);
 	mutex_destroy(&zio->io_lock);
 	cv_destroy(&zio->io_cv);
 	kmem_cache_free(zio_cache, zio);
-
-	if (async_root) {
-		mutex_enter(&spa->spa_async_root_lock);
-		if (--spa->spa_async_root_count == 0)
-			cv_broadcast(&spa->spa_async_root_cv);
-		mutex_exit(&spa->spa_async_root_lock);
-	}
 }
 
 zio_t *
-zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
-	int flags)
+zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
+    void *private, int flags)
 {
 	zio_t *zio;
 
 	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
-	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL,
+	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 
 	return (zio);
@@ -507,7 +566,7 @@ zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
 zio_t *
 zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
 {
-	return (zio_null(NULL, spa, done, private, flags));
+	return (zio_null(NULL, spa, NULL, done, private, flags));
 }
 
 zio_t *
@@ -576,12 +635,12 @@ zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
 	ASSERT(!BP_IS_HOLE(bp));
 
 	if (bp->blk_fill == BLK_FILL_ALREADY_FREED)
-		return (zio_null(pio, spa, NULL, NULL, flags));
+		return (zio_null(pio, spa, NULL, NULL, NULL, flags));
 
 	if (txg == spa->spa_syncing_txg &&
 	    spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) {
 		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
-		return (zio_null(pio, spa, NULL, NULL, flags));
+		return (zio_null(pio, spa, NULL, NULL, NULL, flags));
 	}
 
 	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
@@ -632,7 +691,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 
 		zio->io_cmd = cmd;
 	} else {
-		zio = zio_null(pio, spa, NULL, NULL, flags);
+		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 
 		for (c = 0; c < vd->vdev_children; c++)
 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
@@ -770,7 +829,9 @@ zio_read_bp_init(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
-	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) {
+	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
+	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
+	    !(zio->io_flags & ZIO_FLAG_RAW)) {
 		uint64_t csize = BP_GET_PSIZE(bp);
 		void *cbuf = zio_buf_alloc(csize);
 
@@ -819,16 +880,10 @@ zio_write_bp_init(zio_t *zio)
 		 * few passes, stop compressing to ensure convergence.
 		 */
 		pass = spa_sync_pass(zio->io_spa);
-		ASSERT(pass > 1);
 
 		if (pass > SYNC_PASS_DONT_COMPRESS)
 			compress = ZIO_COMPRESS_OFF;
 
-		/*
-		 * Only MOS (objset 0) data should need to be rewritten.
-		 */
-		ASSERT(zio->io_logical->io_bookmark.zb_objset == 0);
-
 		/* Make sure someone doesn't change their mind on overwrites */
 		ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp),
 		    spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp));
@@ -1022,17 +1077,16 @@ zio_nowait(zio_t *zio)
 {
 	ASSERT(zio->io_executor == NULL);
 
-	if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) {
+	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
+	    zio_unique_parent(zio) == NULL) {
 		/*
 		 * This is a logical async I/O with no parent to wait for it.
-		 * Attach it to the pool's global async root zio so that
-		 * spa_unload() has a way of waiting for async I/O to finish.
+		 * We add it to the spa_async_root_zio "Godfather" I/O which
+		 * will ensure they complete prior to unloading the pool.
 		 */
 		spa_t *spa = zio->io_spa;
-		zio->io_async_root = B_TRUE;
-		mutex_enter(&spa->spa_async_root_lock);
-		spa->spa_async_root_count++;
-		mutex_exit(&spa->spa_async_root_lock);
+
+		zio_add_child(spa->spa_async_zio_root, zio);
 	}
 
 	zio_execute(zio);
@@ -1047,13 +1101,20 @@ zio_nowait(zio_t *zio)
 static void
 zio_reexecute(zio_t *pio)
 {
-	zio_t *zio, *zio_next;
+	zio_t *cio, *cio_next;
+
+	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
+	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
+	ASSERT(pio->io_gang_leader == NULL);
+	ASSERT(pio->io_gang_tree == NULL);
 
 	pio->io_flags = pio->io_orig_flags;
 	pio->io_stage = pio->io_orig_stage;
 	pio->io_pipeline = pio->io_orig_pipeline;
 	pio->io_reexecute = 0;
 	pio->io_error = 0;
+	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+		pio->io_state[w] = 0;
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		pio->io_child_error[c] = 0;
 
@@ -1073,24 +1134,27 @@ zio_reexecute(zio_t *pio)
 
 	/*
 	 * As we reexecute pio's children, new children could be created.
-	 * New children go to the head of the io_child list, however,
+	 * New children go to the head of pio's io_child_list, however,
 	 * so we will (correctly) not reexecute them.  The key is that
-	 * the remainder of the io_child list, from 'zio_next' onward,
-	 * cannot be affected by any side effects of reexecuting 'zio'.
+	 * the remainder of pio's io_child_list, from 'cio_next' onward,
+	 * cannot be affected by any side effects of reexecuting 'cio'.
 	 */
-	for (zio = pio->io_child; zio != NULL; zio = zio_next) {
-		zio_next = zio->io_sibling_next;
+	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
+		cio_next = zio_walk_children(pio);
 		mutex_enter(&pio->io_lock);
-		pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++;
-		pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++;
+		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+			pio->io_children[cio->io_child_type][w]++;
 		mutex_exit(&pio->io_lock);
-		zio_reexecute(zio);
+		zio_reexecute(cio);
 	}
 
 	/*
 	 * Now that all children have been reexecuted, execute the parent.
+	 * We don't reexecute "The Godfather" I/O here as it's the
+	 * responsibility of the caller to wait on him.
 	 */
-	zio_execute(pio);
+	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
+		zio_execute(pio);
 }
 
 void
@@ -1106,14 +1170,17 @@ zio_suspend(spa_t *spa, zio_t *zio)
 	mutex_enter(&spa->spa_suspend_lock);
 
 	if (spa->spa_suspend_zio_root == NULL)
-		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 0);
+		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+		    ZIO_FLAG_GODFATHER);
 
 	spa->spa_suspended = B_TRUE;
 
 	if (zio != NULL) {
+		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
 		ASSERT(zio != spa->spa_suspend_zio_root);
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
-		ASSERT(zio->io_parent == NULL);
+		ASSERT(zio_unique_parent(zio) == NULL);
 		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
 		zio_add_child(spa->spa_suspend_zio_root, zio);
 	}
@@ -1121,10 +1188,10 @@ zio_suspend(spa_t *spa, zio_t *zio)
 	mutex_exit(&spa->spa_suspend_lock);
 }
 
-void
+int
 zio_resume(spa_t *spa)
 {
-	zio_t *pio, *zio;
+	zio_t *pio;
 
 	/*
 	 * Reexecute all previously suspended i/o.
@@ -1137,17 +1204,10 @@ zio_resume(spa_t *spa)
 	mutex_exit(&spa->spa_suspend_lock);
 
 	if (pio == NULL)
-		return;
+		return (0);
 
-	while ((zio = pio->io_child) != NULL) {
-		zio_remove_child(pio, zio);
-		zio->io_parent = NULL;
-		zio_reexecute(zio);
-	}
-
-	ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0);
-
-	(void) zio_wait(pio);
+	zio_reexecute(pio);
+	return (zio_wait(pio));
 }
 
 void
@@ -1254,7 +1314,7 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
 		 * (Presently, nothing actually uses interior data checksums;
 		 * this is just good hygiene.)
 		 */
-		if (gn != pio->io_logical->io_gang_tree) {
+		if (gn != pio->io_gang_leader->io_gang_tree) {
 			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
 			    data, BP_GET_PSIZE(bp));
 		}
@@ -1336,27 +1396,27 @@ zio_gang_tree_free(zio_gang_node_t **gnpp)
 }
 
 static void
-zio_gang_tree_assemble(zio_t *lio, blkptr_t *bp, zio_gang_node_t **gnpp)
+zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
 
-	ASSERT(lio->io_logical == lio);
+	ASSERT(gio->io_gang_leader == gio);
 	ASSERT(BP_IS_GANG(bp));
 
-	zio_nowait(zio_read(lio, lio->io_spa, bp, gn->gn_gbh,
+	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
 	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
-	    lio->io_priority, ZIO_GANG_CHILD_FLAGS(lio), &lio->io_bookmark));
+	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
 }
 
 static void
 zio_gang_tree_assemble_done(zio_t *zio)
 {
-	zio_t *lio = zio->io_logical;
+	zio_t *gio = zio->io_gang_leader;
 	zio_gang_node_t *gn = zio->io_private;
 	blkptr_t *bp = zio->io_bp;
 
-	ASSERT(zio->io_parent == lio);
-	ASSERT(zio->io_child == NULL);
+	ASSERT(gio == zio_unique_parent(zio));
+	ASSERT(zio_walk_children(zio) == NULL);
 
 	if (zio->io_error)
 		return;
@@ -1372,25 +1432,25 @@ zio_gang_tree_assemble_done(zio_t *zio)
 		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
 		if (!BP_IS_GANG(gbp))
 			continue;
-		zio_gang_tree_assemble(lio, gbp, &gn->gn_child[g]);
+		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
 	}
 }
 
 static void
 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
 {
-	zio_t *lio = pio->io_logical;
+	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
 
 	ASSERT(BP_IS_GANG(bp) == !!gn);
-	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(lio->io_bp));
-	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == lio->io_gang_tree);
+	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
+	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
 
 	/*
 	 * If you're a gang header, your data is in gn->gn_gbh.
 	 * If you're a gang member, your data is in 'data' and gn == NULL.
 	 */
-	zio = zio_gang_issue_func[lio->io_type](pio, bp, gn, data);
+	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
 
 	if (gn != NULL) {
 		ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
@@ -1404,8 +1464,8 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
 		}
 	}
 
-	if (gn == lio->io_gang_tree)
-		ASSERT3P((char *)lio->io_data + lio->io_size, ==, data);
+	if (gn == gio->io_gang_tree)
+		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
 
 	if (zio != pio)
 		zio_nowait(zio);
@@ -1416,7 +1476,10 @@ zio_gang_assemble(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
-	ASSERT(BP_IS_GANG(bp) && zio == zio->io_logical);
+	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
+	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+	zio->io_gang_leader = zio;
 
 	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
 
@@ -1426,18 +1489,18 @@ zio_gang_assemble(zio_t *zio)
 static int
 zio_gang_issue(zio_t *zio)
 {
-	zio_t *lio = zio->io_logical;
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
-	ASSERT(BP_IS_GANG(bp) && zio == lio);
+	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
+	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 
 	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
-		zio_gang_tree_issue(lio, lio->io_gang_tree, bp, lio->io_data);
+		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
 	else
-		zio_gang_tree_free(&lio->io_gang_tree);
+		zio_gang_tree_free(&zio->io_gang_tree);
 
 	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
@@ -1447,8 +1510,8 @@ zio_gang_issue(zio_t *zio)
 static void
 zio_write_gang_member_ready(zio_t *zio)
 {
-	zio_t *pio = zio->io_parent;
-	zio_t *lio = zio->io_logical;
+	zio_t *pio = zio_unique_parent(zio);
+	zio_t *gio = zio->io_gang_leader;
 	dva_t *cdva = zio->io_bp->blk_dva;
 	dva_t *pdva = pio->io_bp->blk_dva;
 	uint64_t asize;
@@ -1459,7 +1522,7 @@ zio_write_gang_member_ready(zio_t *zio)
 	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
 
 	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
-	ASSERT3U(zio->io_prop.zp_ndvas, ==, lio->io_prop.zp_ndvas);
+	ASSERT3U(zio->io_prop.zp_ndvas, ==, gio->io_prop.zp_ndvas);
 	ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
 	ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
 	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
@@ -1479,28 +1542,28 @@ zio_write_gang_block(zio_t *pio)
 {
 	spa_t *spa = pio->io_spa;
 	blkptr_t *bp = pio->io_bp;
-	zio_t *lio = pio->io_logical;
+	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
 	zio_gang_node_t *gn, **gnpp;
 	zio_gbh_phys_t *gbh;
 	uint64_t txg = pio->io_txg;
 	uint64_t resid = pio->io_size;
 	uint64_t lsize;
-	int ndvas = lio->io_prop.zp_ndvas;
+	int ndvas = gio->io_prop.zp_ndvas;
 	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
 	zio_prop_t zp;
 	int error;
 
 	error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
-	    bp, gbh_ndvas, txg, pio == lio ? NULL : lio->io_bp,
+	    bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp,
 	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
 	if (error) {
 		pio->io_error = error;
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
-	if (pio == lio) {
-		gnpp = &lio->io_gang_tree;
+	if (pio == gio) {
+		gnpp = &gio->io_gang_tree;
 	} else {
 		gnpp = pio->io_private;
 		ASSERT(pio->io_ready == zio_write_gang_member_ready);
@@ -1524,11 +1587,11 @@ zio_write_gang_block(zio_t *pio)
 		    SPA_MINBLOCKSIZE);
 		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
 
-		zp.zp_checksum = lio->io_prop.zp_checksum;
+		zp.zp_checksum = gio->io_prop.zp_checksum;
 		zp.zp_compress = ZIO_COMPRESS_OFF;
 		zp.zp_type = DMU_OT_NONE;
 		zp.zp_level = 0;
-		zp.zp_ndvas = lio->io_prop.zp_ndvas;
+		zp.zp_ndvas = gio->io_prop.zp_ndvas;
 
 		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
 		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
@@ -1561,6 +1624,11 @@ zio_dva_allocate(zio_t *zio)
 	blkptr_t *bp = zio->io_bp;
 	int error;
 
+	if (zio->io_gang_leader == NULL) {
+		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+		zio->io_gang_leader = zio;
+	}
+
 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
 	ASSERT3U(zio->io_prop.zp_ndvas, >, 0);
@@ -1692,72 +1760,6 @@ zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
  * Read and write to physical devices
  * ==========================================================================
  */
-
-static void
-zio_vdev_io_probe_done(zio_t *zio)
-{
-	zio_t *dio;
-	vdev_t *vd = zio->io_private;
-
-	mutex_enter(&vd->vdev_probe_lock);
-	ASSERT(vd->vdev_probe_zio == zio);
-	vd->vdev_probe_zio = NULL;
-	mutex_exit(&vd->vdev_probe_lock);
-
-	while ((dio = zio->io_delegate_list) != NULL) {
-		zio->io_delegate_list = dio->io_delegate_next;
-		dio->io_delegate_next = NULL;
-		if (!vdev_accessible(vd, dio))
-			dio->io_error = ENXIO;
-		zio_execute(dio);
-	}
-}
-
-/*
- * Probe the device to determine whether I/O failure is specific to this
- * zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged).
- */
-static int
-zio_vdev_io_probe(zio_t *zio)
-{
-	vdev_t *vd = zio->io_vd;
-	zio_t *pio = NULL;
-	boolean_t created_pio = B_FALSE;
-
-	/*
-	 * Don't probe the probe.
-	 */
-	if (zio->io_flags & ZIO_FLAG_PROBE)
-		return (ZIO_PIPELINE_CONTINUE);
-
-	/*
-	 * To prevent 'probe storms' when a device fails, we create
-	 * just one probe i/o at a time.  All zios that want to probe
-	 * this vdev will join the probe zio's io_delegate_list.
-	 */
-	mutex_enter(&vd->vdev_probe_lock);
-
-	if ((pio = vd->vdev_probe_zio) == NULL) {
-		vd->vdev_probe_zio = pio = zio_root(zio->io_spa,
-		    zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL);
-		created_pio = B_TRUE;
-		vd->vdev_probe_wanted = B_TRUE;
-		spa_async_request(zio->io_spa, SPA_ASYNC_PROBE);
-	}
-
-	zio->io_delegate_next = pio->io_delegate_list;
-	pio->io_delegate_list = zio;
-
-	mutex_exit(&vd->vdev_probe_lock);
-
-	if (created_pio) {
-		zio_nowait(vdev_probe(vd, pio));
-		zio_nowait(pio);
-	}
-
-	return (ZIO_PIPELINE_STOP);
-}
-
 static int
 zio_vdev_io_start(zio_t *zio)
 {
@@ -1793,13 +1795,35 @@ zio_vdev_io_start(zio_t *zio)
 
 	ASSERT(P2PHASE(zio->io_offset, align) == 0);
 	ASSERT(P2PHASE(zio->io_size, align) == 0);
-	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
+	ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
+
+	/*
+	 * If this is a repair I/O, and there's no self-healing involved --
+	 * that is, we're just resilvering what we expect to resilver --
+	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
+	 * This prevents spurious resilvering with nested replication.
+	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
+	 * A is out of date, we'll read from C+D, then use the data to
+	 * resilver A+B -- but we don't actually want to resilver B, just A.
+	 * The top-level mirror has no way to know this, so instead we just
+	 * discard unnecessary repairs as we work our way down the vdev tree.
+	 * The same logic applies to any form of nested replication:
+	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
+	 */
+	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
+	    zio->io_txg != 0 &&	/* not a delegated i/o */
+	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
+		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+		zio_vdev_io_bypass(zio);
+		return (ZIO_PIPELINE_CONTINUE);
+	}
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
 
 		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
-			return (ZIO_PIPELINE_STOP);
+			return (ZIO_PIPELINE_CONTINUE);
 
 		if ((zio = vdev_queue_io(zio)) == NULL)
 			return (ZIO_PIPELINE_STOP);
@@ -1809,7 +1833,6 @@ zio_vdev_io_start(zio_t *zio)
 			zio_interrupt(zio);
 			return (ZIO_PIPELINE_STOP);
 		}
-
 	}
 
 	return (vd->vdev_ops->vdev_op_io_start(zio));
@@ -1852,7 +1875,7 @@ zio_vdev_io_done(zio_t *zio)
 	ops->vdev_op_io_done(zio);
 
 	if (unexpected_error)
-		return (zio_vdev_io_probe(zio));
+		VERIFY(vdev_probe(vd, zio) == NULL);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
@@ -2048,13 +2071,12 @@ static int
 zio_ready(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
-	zio_t *pio = zio->io_parent;
+	zio_t *pio, *pio_next;
 
-	if (zio->io_ready) {
-		if (BP_IS_GANG(bp) &&
-		    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
-			return (ZIO_PIPELINE_STOP);
+	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
+		return (ZIO_PIPELINE_STOP);
 
+	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
 		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
 		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
@@ -2068,8 +2090,22 @@ zio_ready(zio_t *zio)
 	if (zio->io_error)
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
-	if (pio != NULL)
+	mutex_enter(&zio->io_lock);
+	zio->io_state[ZIO_WAIT_READY] = 1;
+	pio = zio_walk_parents(zio);
+	mutex_exit(&zio->io_lock);
+
+	/*
+	 * As we notify zio's parents, new parents could be added.
+	 * New parents go to the head of zio's io_parent_list, however,
+	 * so we will (correctly) not notify them.  The remainder of zio's
+	 * io_parent_list, from 'pio_next' onward, cannot change because
+	 * all parents must wait for us to be done before they can be done.
+	 */
+	for (; pio != NULL; pio = pio_next) {
+		pio_next = zio_walk_parents(zio);
 		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
+	}
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
@@ -2078,14 +2114,14 @@ static int
 zio_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
-	zio_t *pio = zio->io_parent;
 	zio_t *lio = zio->io_logical;
 	blkptr_t *bp = zio->io_bp;
 	vdev_t *vd = zio->io_vd;
 	uint64_t psize = zio->io_size;
+	zio_t *pio, *pio_next;
 
 	/*
-	 * If our of children haven't all completed,
+	 * If our children haven't all completed,
 	 * wait for them and then repeat this pipeline stage.
 	 */
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
@@ -2102,7 +2138,7 @@ zio_done(zio_t *zio)
 		ASSERT(bp->blk_pad[1] == 0);
 		ASSERT(bp->blk_pad[2] == 0);
 		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
-		    (pio != NULL && bp == pio->io_bp));
+		    (bp == zio_unique_parent(zio)->io_bp));
 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
 			ASSERT(!BP_SHOULD_BYTESWAP(bp));
@@ -2160,6 +2196,7 @@ zio_done(zio_t *zio)
 		if ((zio->io_type == ZIO_TYPE_READ ||
 		    zio->io_type == ZIO_TYPE_FREE) &&
 		    zio->io_error == ENXIO &&
+		    spa->spa_load_state == SPA_LOAD_NONE &&
 		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
@@ -2175,6 +2212,21 @@ zio_done(zio_t *zio)
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
 
+	if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) &&
+	    zio->io_child_type == ZIO_CHILD_LOGICAL) {
+		ASSERT(zio->io_child_type != ZIO_CHILD_GANG);
+		zio_dva_unallocate(zio, zio->io_gang_tree, bp);
+	}
+
+	zio_gang_tree_free(&zio->io_gang_tree);
+
+	/*
+	 * Godfather I/Os should never suspend.
+	 */
+	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
+	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
+		zio->io_reexecute = 0;
+
 	if (zio->io_reexecute) {
 		/*
 		 * This is a logical I/O that wants to reexecute.
@@ -2191,17 +2243,37 @@ zio_done(zio_t *zio)
 		 */
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
-		if (IO_IS_ALLOCATING(zio))
-			zio_dva_unallocate(zio, zio->io_gang_tree, bp);
+		zio->io_gang_leader = NULL;
 
-		zio_gang_tree_free(&zio->io_gang_tree);
+		mutex_enter(&zio->io_lock);
+		zio->io_state[ZIO_WAIT_DONE] = 1;
+		mutex_exit(&zio->io_lock);
+
+		/*
+		 * "The Godfather" I/O monitors its children but is
+		 * not a true parent to them. It will track them through
+		 * the pipeline but severs its ties whenever they get into
+		 * trouble (e.g. suspended). This allows "The Godfather"
+		 * I/O to return status without blocking.
+		 */
+		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
+			zio_link_t *zl = zio->io_walk_link;
+			pio_next = zio_walk_parents(zio);
+
+			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
+			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
+				zio_remove_child(pio, zio, zl);
+				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
+			}
+		}
 
-		if (pio != NULL) {
+		if ((pio = zio_unique_parent(zio)) != NULL) {
 			/*
 			 * We're not a root i/o, so there's nothing to do
 			 * but notify our parent.  Don't propagate errors
 			 * upward since we haven't permanently failed yet.
 			 */
+			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
 			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
 			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
 		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
@@ -2222,20 +2294,26 @@ zio_done(zio_t *zio)
 		return (ZIO_PIPELINE_STOP);
 	}
 
-	ASSERT(zio->io_child == NULL);
+	ASSERT(zio_walk_children(zio) == NULL);
 	ASSERT(zio->io_reexecute == 0);
 	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
 
+	/*
+	 * It is the responsibility of the done callback to ensure that this
+	 * particular zio is no longer discoverable for adoption, and as
+	 * such, cannot acquire any new parents.
+	 */
 	if (zio->io_done)
 		zio->io_done(zio);
 
-	zio_gang_tree_free(&zio->io_gang_tree);
-
-	ASSERT(zio->io_delegate_list == NULL);
-	ASSERT(zio->io_delegate_next == NULL);
+	mutex_enter(&zio->io_lock);
+	zio->io_state[ZIO_WAIT_DONE] = 1;
+	mutex_exit(&zio->io_lock);
 
-	if (pio != NULL) {
-		zio_remove_child(pio, zio);
+	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
+		zio_link_t *zl = zio->io_walk_link;
+		pio_next = zio_walk_parents(zio);
+		zio_remove_child(pio, zio, zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
 	}
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
index 79a9966..8f769e6 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -72,6 +72,7 @@
 #include <sys/zfs_rlock.h>
 #include <sys/vdev_impl.h>
 #include <sys/zvol.h>
+#include <sys/zil_impl.h>
 #include <geom/geom.h>
 
 #include "zfs_namecheck.h"
@@ -115,7 +116,6 @@ typedef struct zvol_state {
 	uint32_t	zv_total_opens;	/* total open count */
 	zilog_t		*zv_zilog;	/* ZIL handle */
 	list_t		zv_extents;	/* List of extents for dump */
-	uint64_t	zv_txg_assign;	/* txg to assign during ZIL replay */
 	znode_t		zv_znode;	/* for range locking */
 	int		zv_state;
 	struct bio_queue_head zv_queue;
@@ -287,8 +287,16 @@ static void
 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
 {
 	uint32_t blocksize = zv->zv_volblocksize;
+	zilog_t *zilog = zv->zv_zilog;
 	lr_write_t *lr;
 
+	if (zilog->zl_replay) {
+		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+		zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
+		    zilog->zl_replaying_seq;
+		return;
+	}
+
 	while (len) {
 		ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
 		itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
@@ -303,7 +311,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
 		lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
 		BP_ZERO(&lr->lr_blkptr);
 
-		(void) zil_itx_assign(zv->zv_zilog, itx, tx);
+		(void) zil_itx_assign(zilog, itx, tx);
 		len -= nbytes;
 		off += nbytes;
 	}
@@ -373,7 +381,8 @@ zvol_serve_one(zvol_state_t *zv, struct bio *bp)
 			size = volsize - off;
 
 		if (doread) {
-			error = dmu_read(os, ZVOL_OBJ, off, size, addr);
+			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
+			    DMU_READ_PREFETCH);
 		} else {
 			dmu_tx_t *tx = dmu_tx_create(os);
 			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
@@ -576,9 +585,13 @@ zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
 
+	/* If it's a dmu_sync() block get the data and write the whole block */
+	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t))
+		zil_get_replay_data(dmu_objset_zil(os), lr);
+
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
-	error = dmu_tx_assign(tx, zv->zv_txg_assign);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 	} else {
@@ -614,6 +627,13 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
 	zvol_replay_err,	/* TX_TRUNCATE */
 	zvol_replay_err,	/* TX_SETATTR */
 	zvol_replay_err,	/* TX_ACL */
+	zvol_replay_err,	/* TX_CREATE_ACL */
+	zvol_replay_err,	/* TX_CREATE_ATTR */
+	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
+	zvol_replay_err,	/* TX_MKDIR_ACL */
+	zvol_replay_err,	/* TX_MKDIR_ATTR */
+	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
+	zvol_replay_err,	/* TX_WRITE2 */
 };
 
 /*
@@ -678,7 +698,7 @@ zvol_create_minor(const char *name, major_t maj)
 	ASSERT(error == 0);
 	zv->zv_volblocksize = doi.doi_data_block_size;
 
-	zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL);
+	zil_replay(os, zv, zvol_replay_vector);
 
 	/* XXX this should handle the possible i/o error */
 	VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
@@ -983,7 +1003,8 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) /* immediate write */
-		return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf));
+		return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf,
+		    DMU_READ_NO_PREFETCH));
 
 	zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
 	zgd->zgd_zilog = zv->zv_zilog;
@@ -1000,10 +1021,19 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 	zgd->zgd_rl = rl;
 
 	VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
+
 	error = dmu_sync(zio, db, &lr->lr_blkptr,
 	    lr->lr_common.lrc_txg, zvol_get_done, zgd);
-	if (error == 0)
+	if (error == 0) {
+		/*
+		 * dmu_sync() can compress a block of zeros to a null blkptr
+		 * but the block size still needs to be passed through to
+		 * replay.
+		 */
+		BP_SET_LSIZE(&lr->lr_blkptr, db->db_size);
 		zil_add_block(zv->zv_zilog, &lr->lr_blkptr);
+	}
+
 	/*
 	 * If we get EINPROGRESS, then we need to wait for a
 	 * write IO initiated by dmu_sync() to complete before
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
index f10fec6..b0ec063 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_ACL_H
 #define	_SYS_ACL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/acl_impl.h>
 
@@ -168,6 +166,10 @@ typedef struct ace_object {
     ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \
     ACE_WRITE_OWNER|ACE_SYNCHRONIZE)
 
+#define	ACE_ALL_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA| \
+    ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS|ACE_WRITE_ACL| \
+    ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD)
+
 #define	ACE_READ_PERMS	(ACE_READ_DATA|ACE_READ_ACL|ACE_READ_ATTRIBUTES| \
     ACE_READ_NAMED_ATTRS)
 
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h b/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h
index 432e6be..5fabb14 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -31,8 +30,6 @@
 #ifndef _SYS_DEBUG_H
 #define	_SYS_DEBUG_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 
 #ifdef	__cplusplus
@@ -50,7 +47,7 @@ extern "C" {
 extern int assfail(const char *, const char *, int);
 #define	VERIFY(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__)))
 #ifdef DEBUG
-#define	ASSERT(EX) VERIFY(EX)
+#define	ASSERT(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__)))
 #else
 #define	ASSERT(x)  ((void)0)
 #endif
@@ -58,7 +55,7 @@ extern int assfail(const char *, const char *, int);
 extern int assfail();
 #define	VERIFY(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__)))
 #ifdef DEBUG
-#define	ASSERT(EX) VERIFY(EX)
+#define	ASSERT(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__)))
 #else
 #define	ASSERT(x)  ((void)0)
 #endif
@@ -98,9 +95,9 @@ _NOTE(CONSTCOND) } while (0)
 #define	VERIFY3U(x, y, z)	VERIFY3_IMPL(x, y, z, uint64_t)
 #define	VERIFY3P(x, y, z)	VERIFY3_IMPL(x, y, z, uintptr_t)
 #ifdef DEBUG
-#define	ASSERT3S(x, y, z)	VERIFY3S(x, y, z)
-#define	ASSERT3U(x, y, z)	VERIFY3U(x, y, z)
-#define	ASSERT3P(x, y, z)	VERIFY3P(x, y, z)
+#define	ASSERT3S(x, y, z)	VERIFY3_IMPL(x, y, z, int64_t)
+#define	ASSERT3U(x, y, z)	VERIFY3_IMPL(x, y, z, uint64_t)
+#define	ASSERT3P(x, y, z)	VERIFY3_IMPL(x, y, z, uintptr_t)
 #else
 #define	ASSERT3S(x, y, z)	((void)0)
 #define	ASSERT3U(x, y, z)	((void)0)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
index 66ca9c5..21b7dbe 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_FM_FS_ZFS_H
 #define	_SYS_FM_FS_ZFS_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -57,6 +55,7 @@ extern "C" {
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE	"vdev_type"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH	"vdev_path"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID	"vdev_devid"
+#define	FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU		"vdev_fru"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID	"parent_guid"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE	"parent_type"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH	"parent_path"
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
index 2f7e747..8400dc1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -109,9 +109,20 @@ typedef enum {
 	ZFS_PROP_USEDDS,
 	ZFS_PROP_USEDCHILD,
 	ZFS_PROP_USEDREFRESERV,
+	ZFS_PROP_USERACCOUNTING,	/* not exposed to the user */
 	ZFS_NUM_PROPS
 } zfs_prop_t;
 
+typedef enum {
+	ZFS_PROP_USERUSED,
+	ZFS_PROP_USERQUOTA,
+	ZFS_PROP_GROUPUSED,
+	ZFS_PROP_GROUPQUOTA,
+	ZFS_NUM_USERQUOTA_PROPS
+} zfs_userquota_prop_t;
+
+extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS];
+
 /*
  * Pool properties are identified by these constants and must be added to the
  * end of this list to ensure that external consumers are not affected
@@ -173,6 +184,7 @@ boolean_t zfs_prop_setonce(zfs_prop_t);
 const char *zfs_prop_to_name(zfs_prop_t);
 zfs_prop_t zfs_name_to_prop(const char *);
 boolean_t zfs_prop_user(const char *);
+boolean_t zfs_prop_userquota(const char *name);
 int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
 int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
 boolean_t zfs_prop_valid_for_type(int, zfs_type_t);
@@ -217,6 +229,9 @@ typedef enum {
 #define	ZFS_DELEG_PERM_GID	"gid"
 #define	ZFS_DELEG_PERM_GROUPS	"groups"
 
+#define	ZFS_SMB_ACL_SRC		"src"
+#define	ZFS_SMB_ACL_TARGET	"target"
+
 typedef enum {
 	ZFS_CANMOUNT_OFF = 0,
 	ZFS_CANMOUNT_ON = 1,
@@ -230,6 +245,13 @@ typedef enum zfs_share_op {
 	ZFS_UNSHARE_SMB = 3
 } zfs_share_op_t;
 
+typedef enum zfs_smb_acl_op {
+	ZFS_SMB_ACL_ADD,
+	ZFS_SMB_ACL_REMOVE,
+	ZFS_SMB_ACL_RENAME,
+	ZFS_SMB_ACL_PURGE
+} zfs_smb_acl_op_t;
+
 typedef enum zfs_cache_type {
 	ZFS_CACHE_NONE = 0,
 	ZFS_CACHE_METADATA = 1,
@@ -254,13 +276,15 @@ typedef enum zfs_cache_type {
 #define	SPA_VERSION_12			12ULL
 #define	SPA_VERSION_13			13ULL
 #define	SPA_VERSION_14			14ULL
+#define	SPA_VERSION_15			15ULL
 /*
  * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
  * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*},
- * and do the appropriate changes.
+ * and do the appropriate changes.  Also bump the version number in
+ * usr/src/grub/capability.
  */
-#define	SPA_VERSION			SPA_VERSION_14
-#define	SPA_VERSION_STRING		"14"
+#define	SPA_VERSION			SPA_VERSION_15
+#define	SPA_VERSION_STRING		"15"
 
 /*
  * Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -296,6 +320,7 @@ typedef enum zfs_cache_type {
 #define	SPA_VERSION_SNAP_PROPS		SPA_VERSION_12
 #define	SPA_VERSION_USED_BREAKDOWN	SPA_VERSION_13
 #define	SPA_VERSION_PASSTHROUGH_X	SPA_VERSION_14
+#define	SPA_VERSION_USERSPACE		SPA_VERSION_15
 
 /*
  * ZPL version - rev'd whenever an incompatible on-disk format change
@@ -308,14 +333,16 @@ typedef enum zfs_cache_type {
 #define	ZPL_VERSION_1			1ULL
 #define	ZPL_VERSION_2			2ULL
 #define	ZPL_VERSION_3			3ULL
-#define	ZPL_VERSION			ZPL_VERSION_3
-#define	ZPL_VERSION_STRING		"3"
+#define	ZPL_VERSION_4			4ULL
+#define	ZPL_VERSION			ZPL_VERSION_4
+#define	ZPL_VERSION_STRING		"4"
 
 #define	ZPL_VERSION_INITIAL		ZPL_VERSION_1
 #define	ZPL_VERSION_DIRENT_TYPE		ZPL_VERSION_2
 #define	ZPL_VERSION_FUID		ZPL_VERSION_3
 #define	ZPL_VERSION_NORMALIZATION	ZPL_VERSION_3
 #define	ZPL_VERSION_SYSATTR		ZPL_VERSION_3
+#define	ZPL_VERSION_USERSPACE		ZPL_VERSION_4
 
 /*
  * The following are configuration names used in the nvlist describing a pool's
@@ -365,6 +392,7 @@ typedef enum zfs_cache_type {
 #define	ZPOOL_CONFIG_FAULTED		"faulted"
 #define	ZPOOL_CONFIG_DEGRADED		"degraded"
 #define	ZPOOL_CONFIG_REMOVED		"removed"
+#define	ZPOOL_CONFIG_FRU		"fru"
 
 #define	VDEV_TYPE_ROOT			"root"
 #define	VDEV_TYPE_MIRROR		"mirror"
@@ -568,6 +596,11 @@ typedef	unsigned long	zfs_ioc_t;
 #define	ZFS_IOC_INHERIT_PROP		_IOWR('Z', 46, struct zfs_cmd)
 #define	ZFS_IOC_JAIL			_IOWR('Z', 47, struct zfs_cmd)
 #define	ZFS_IOC_UNJAIL			_IOWR('Z', 48, struct zfs_cmd)
+#define	ZFS_IOC_SMB_ACL			_IOWR('Z', 49, struct zfs_cmd)
+#define	ZFS_IOC_USERSPACE_ONE		_IOWR('Z', 50, struct zfs_cmd)
+#define	ZFS_IOC_USERSPACE_MANY		_IOWR('Z', 51, struct zfs_cmd)
+#define	ZFS_IOC_USERSPACE_UPGRADE	_IOWR('Z', 52, struct zfs_cmd)
+#define	ZFS_IOC_SETFRU			_IOWR('Z', 53, struct zfs_cmd)
 
 /*
  * Internal SPA load state.  Used by FMA diagnosis engine.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h
index ac21686..c46223f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h
@@ -234,6 +234,9 @@ extern "C" {
 #define	ESC_ZFS_VDEV_CLEAR	"ESC_ZFS_vdev_clear"
 #define	ESC_ZFS_VDEV_CHECK	"ESC_ZFS_vdev_check"
 #define	ESC_ZFS_CONFIG_SYNC	"ESC_ZFS_config_sync"
+#define	ESC_ZFS_SCRUB_START	"ESC_ZFS_scrub_start"
+#define	ESC_ZFS_SCRUB_FINISH	"ESC_ZFS_scrub_finish"
+#define	ESC_ZFS_VDEV_SPARE	"ESC_ZFS_vdev_spare"
 
 /*
  * datalink subclass definitions.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h b/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
index 5f1f4b4..5a7c9e6 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
@@ -39,8 +39,6 @@
 #ifndef _SYS_VNODE_H
 #define	_SYS_VNODE_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include_next <sys/vnode.h>
 
 #ifdef	__cplusplus
@@ -266,6 +264,14 @@ typedef struct xvattr {
 	ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR);		\
 	ASSERT((xvap)->xva_magic == XVA_MAGIC);			\
 	(xvap)->xva_reqattrmap[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr)
+/*
+ * XVA_CLR_REQ() clears an attribute bit in the proper element in the bitmap
+ * of requested attributes (xva_reqattrmap[]).
+ */
+#define	XVA_CLR_REQ(xvap, attr)					\
+	ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR);		\
+	ASSERT((xvap)->xva_magic == XVA_MAGIC);			\
+	(xvap)->xva_reqattrmap[XVA_INDEX(attr)] &= ~XVA_ATTRBIT(attr)
 
 /*
  * XVA_SET_RTN() sets an attribute bit in the proper element in the bitmap