Diffstat (limited to 'sys/contrib/opensolaris/uts/common/fs/zfs')
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/arc.c  2859
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c  312
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c  2247
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c  1029
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c  160
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c  1037
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c  1009
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c  888
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c  992
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c  655
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c  1369
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c  623
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c  2035
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c  1215
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c  256
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c  501
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c  196
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c  145
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c  69
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c  129
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c  1023
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c  194
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c  131
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/spa.c  3301
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c  375
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c  440
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c  354
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c  1130
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c  501
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h  109
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h  89
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h  334
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h  587
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h  237
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h  125
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h  120
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h  134
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h  75
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h  267
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h  185
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h  143
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h  82
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h  77
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h  77
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h  69
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h  81
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h  103
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h  491
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h  168
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h  162
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h  120
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h  77
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h  50
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h  63
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h  56
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h  132
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h  52
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h  46
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h  298
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h  359
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h  204
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h  234
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h  115
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h  120
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h  71
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h  75
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h  71
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h  163
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h  89
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h  100
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h  298
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h  276
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h  111
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h  366
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h  75
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h  82
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h  205
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h  68
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/txg.c  611
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c  63
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/unique.c  107
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c  1915
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c  394
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c  363
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c  225
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c  583
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c  1011
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c  495
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c  89
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c  323
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c  1237
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c  118
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zap.c  1071
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c  741
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c  857
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf  28
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c  1608
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c  99
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c  1119
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c  797
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c  335
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c  1826
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c  349
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c  430
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c  594
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c  1021
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c  3623
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c  1072
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zil.c  1607
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zio.c  1861
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c  172
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c  148
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c  315
-rw-r--r--  sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c  801
114 files changed, 0 insertions, 60874 deletions
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c
deleted file mode 100644
index 420f802..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ /dev/null
@@ -1,2859 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * DVA-based Adjustable Replacement Cache
- *
- * While much of the theory of operation used here is
- * based on the self-tuning, low overhead replacement cache
- * presented by Megiddo and Modha at FAST 2003, there are some
- * significant differences:
- *
- * 1. The Megiddo and Modha model assumes any page is evictable.
- * Pages in its cache cannot be "locked" into memory. This makes
- * the eviction algorithm simple: evict the last page in the list.
- * This also makes the performance characteristics easy to reason
- * about. Our cache is not so simple. At any given moment, some
- * subset of the blocks in the cache are un-evictable because we
- * have handed out a reference to them. Blocks are only evictable
- * when there are no external references active. This makes
- * eviction far more problematic: we choose to evict the evictable
- * blocks that are the "lowest" in the list.
- *
- * There are times when it is not possible to evict the requested
- * space. In these circumstances we are unable to adjust the cache
- * size. To prevent the cache growing unbounded at these times we
- * implement a "cache throttle" that slows the flow of new data
- * into the cache until we can make space available.
- *
- * 2. The Megiddo and Modha model assumes a fixed cache size.
- * Pages are evicted when the cache is full and there is a cache
- * miss. Our model has a variable sized cache. It grows with
- * high use, but also tries to react to memory pressure from the
- * operating system: decreasing its size when system memory is
- * tight.
- *
- * 3. The Megiddo and Modha model assumes a fixed page size. All
- * elements of the cache are therefore exactly the same size. So
- * when adjusting the cache size following a cache miss, it's simply
- * a matter of choosing a single page to evict. In our model, we
- * have variable sized cache blocks (ranging from 512 bytes to
- * 128K bytes). We therefore choose a set of blocks to evict to make
- * space for a cache miss that approximates as closely as possible
- * the space used by the new block.
- *
- * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
- * by N. Megiddo & D. Modha, FAST 2003
- */
-
-/*
- * The locking model:
- *
- * A new reference to a cache buffer can be obtained in two
- * ways: 1) via a hash table lookup using the DVA as a key,
- * or 2) via one of the ARC lists. The arc_read() interface
- * uses method 1, while the internal arc algorithms for
- * adjusting the cache use method 2. We therefore provide two
- * types of locks: 1) the hash table lock array, and 2) the
- * arc list locks.
- *
- * Buffers do not have their own mutexes, rather they rely on the
- * hash table mutexes for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexes).
- *
- * buf_hash_find() returns the appropriate mutex (held) when it
- * locates the requested buffer in the hash table. It returns
- * NULL for the mutex if the buffer was not in the table.
- *
- * buf_hash_remove() expects the appropriate hash mutex to be
- * already held before it is invoked.
- *
- * Each arc state also has a mutex which is used to protect the
- * buffer list associated with the state. When attempting to
- * obtain a hash table lock while holding an arc list lock you
- * must use: mutex_tryenter() to avoid deadlock. Also note that
- * the active state mutex must be held before the ghost state mutex.
- *
- * Arc buffers may have an associated eviction callback function.
- * This function will be invoked prior to removing the buffer (e.g.
- * in arc_do_user_evicts()). Note however that the data associated
- * with the buffer may be evicted prior to the callback. The callback
- * must be made with *no locks held* (to prevent deadlock). Additionally,
- * the users of callbacks must ensure that their private data is
- * protected from simultaneous callbacks from arc_buf_evict()
- * and arc_do_user_evicts().
- *
- * Note that the majority of the performance stats are manipulated
- * with atomic operations.
- */
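-
-/*
- * A minimal sketch of the lock-ordering pattern described above (the
- * real loop is in arc_evict() below): while an arc list lock is held,
- * a hash table lock may only be taken with mutex_tryenter():
- *
- *	mutex_enter(&state->arcs_mtx);
- *	hash_lock = HDR_LOCK(ab);
- *	if (mutex_tryenter(hash_lock)) {
- *		(... evict ab ...)
- *		mutex_exit(hash_lock);
- *	} else {
- *		missed++;	(skip it; blocking here could deadlock)
- *	}
- *	mutex_exit(&state->arcs_mtx);
- */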
-
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/zfs_context.h>
-#include <sys/arc.h>
-#include <sys/refcount.h>
-#ifdef _KERNEL
-#include <sys/dnlc.h>
-#endif
-#include <sys/callb.h>
-#include <sys/kstat.h>
-#include <sys/sdt.h>
-
-static kmutex_t arc_reclaim_thr_lock;
-static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
-static uint8_t arc_thread_exit;
-
-#define ARC_REDUCE_DNLC_PERCENT 3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
-
-typedef enum arc_reclaim_strategy {
- ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
- ARC_RECLAIM_CONS /* Conservative reclaim strategy */
-} arc_reclaim_strategy_t;
-
-/* number of seconds before growing cache again */
-static int arc_grow_retry = 60;
-
-/*
- * minimum lifespan of a prefetch block in clock ticks
- * (initialized in arc_init())
- */
-static int arc_min_prefetch_lifespan;
-
-static int arc_dead;
-
-/*
- * These tunables are for performance analysis.
- */
-u_long zfs_arc_max;
-u_long zfs_arc_min;
-TUNABLE_ULONG("vfs.zfs.arc_max", &zfs_arc_max);
-TUNABLE_ULONG("vfs.zfs.arc_min", &zfs_arc_min);
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
- "Maximum ARC size");
-SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
- "Minimum ARC size");
-
-/*
- * Note that buffers can be in one of 5 states:
- * ARC_anon - anonymous (discussed below)
- * ARC_mru - recently used, currently cached
- * ARC_mru_ghost - recently used, no longer in cache
- * ARC_mfu - frequently used, currently cached
- * ARC_mfu_ghost - frequently used, no longer in cache
- * When there are no active references to a buffer, it is
- * linked onto one of the lists in the arc. These are the
- * only buffers that can be evicted or deleted.
- *
- * Anonymous buffers are buffers that are not associated with
- * a DVA. These are buffers that hold dirty block copies
- * before they are written to stable storage. By definition,
- * they are "ref'd" and are considered part of arc_mru
- * that cannot be freed. Generally, they will acquire a DVA
- * as they are written and migrate onto the arc_mru list.
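- *
- * One possible lifetime, as implemented by arc_access() and
- * arc_evict() below:
- *	anon -> mru -> mru_ghost (on eviction) -> mfu (on a
- *	subsequent hit) -> mfu_ghost (on eviction)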
- */
-
-typedef struct arc_state {
- list_t arcs_list; /* linked list of evictable buffers in state */
- uint64_t arcs_lsize; /* total size of buffers in the linked list */
- uint64_t arcs_size; /* total size of all buffers in this state */
- kmutex_t arcs_mtx;
-} arc_state_t;
-
-/* The 5 states: */
-static arc_state_t ARC_anon;
-static arc_state_t ARC_mru;
-static arc_state_t ARC_mru_ghost;
-static arc_state_t ARC_mfu;
-static arc_state_t ARC_mfu_ghost;
-
-typedef struct arc_stats {
- kstat_named_t arcstat_hits;
- kstat_named_t arcstat_misses;
- kstat_named_t arcstat_demand_data_hits;
- kstat_named_t arcstat_demand_data_misses;
- kstat_named_t arcstat_demand_metadata_hits;
- kstat_named_t arcstat_demand_metadata_misses;
- kstat_named_t arcstat_prefetch_data_hits;
- kstat_named_t arcstat_prefetch_data_misses;
- kstat_named_t arcstat_prefetch_metadata_hits;
- kstat_named_t arcstat_prefetch_metadata_misses;
- kstat_named_t arcstat_mru_hits;
- kstat_named_t arcstat_mru_ghost_hits;
- kstat_named_t arcstat_mfu_hits;
- kstat_named_t arcstat_mfu_ghost_hits;
- kstat_named_t arcstat_deleted;
- kstat_named_t arcstat_recycle_miss;
- kstat_named_t arcstat_mutex_miss;
- kstat_named_t arcstat_evict_skip;
- kstat_named_t arcstat_hash_elements;
- kstat_named_t arcstat_hash_elements_max;
- kstat_named_t arcstat_hash_collisions;
- kstat_named_t arcstat_hash_chains;
- kstat_named_t arcstat_hash_chain_max;
- kstat_named_t arcstat_p;
- kstat_named_t arcstat_c;
- kstat_named_t arcstat_c_min;
- kstat_named_t arcstat_c_max;
- kstat_named_t arcstat_size;
-} arc_stats_t;
-
-static arc_stats_t arc_stats = {
- { "hits", KSTAT_DATA_UINT64 },
- { "misses", KSTAT_DATA_UINT64 },
- { "demand_data_hits", KSTAT_DATA_UINT64 },
- { "demand_data_misses", KSTAT_DATA_UINT64 },
- { "demand_metadata_hits", KSTAT_DATA_UINT64 },
- { "demand_metadata_misses", KSTAT_DATA_UINT64 },
- { "prefetch_data_hits", KSTAT_DATA_UINT64 },
- { "prefetch_data_misses", KSTAT_DATA_UINT64 },
- { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
- { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
- { "mru_hits", KSTAT_DATA_UINT64 },
- { "mru_ghost_hits", KSTAT_DATA_UINT64 },
- { "mfu_hits", KSTAT_DATA_UINT64 },
- { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
- { "deleted", KSTAT_DATA_UINT64 },
- { "recycle_miss", KSTAT_DATA_UINT64 },
- { "mutex_miss", KSTAT_DATA_UINT64 },
- { "evict_skip", KSTAT_DATA_UINT64 },
- { "hash_elements", KSTAT_DATA_UINT64 },
- { "hash_elements_max", KSTAT_DATA_UINT64 },
- { "hash_collisions", KSTAT_DATA_UINT64 },
- { "hash_chains", KSTAT_DATA_UINT64 },
- { "hash_chain_max", KSTAT_DATA_UINT64 },
- { "p", KSTAT_DATA_UINT64 },
- { "c", KSTAT_DATA_UINT64 },
- { "c_min", KSTAT_DATA_UINT64 },
- { "c_max", KSTAT_DATA_UINT64 },
- { "size", KSTAT_DATA_UINT64 }
-};
-
-#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
-
-#define ARCSTAT_INCR(stat, val) \
- atomic_add_64(&arc_stats.stat.value.ui64, (val));
-
-#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
-#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
-
-#define ARCSTAT_MAX(stat, val) { \
- uint64_t m; \
- while ((val) > (m = arc_stats.stat.value.ui64) && \
- (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
- continue; \
-}
-
-#define ARCSTAT_MAXSTAT(stat) \
- ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
-
-/*
- * We define a macro to allow ARC hits/misses to be easily broken down by
- * two separate conditions, giving a total of four different subtypes for
- * each of hits and misses (so eight statistics total).
- */
-#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
- if (cond1) { \
- if (cond2) { \
- ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
- } else { \
- ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
- } \
- } else { \
- if (cond2) { \
- ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
- } else { \
- ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
- } \
- }
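-
-/*
- * For example, the call in arc_buf_add_ref() below:
- *
- *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
- *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
- *	    data, metadata, hits);
- *
- * bumps exactly one of arcstat_demand_data_hits,
- * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
- * arcstat_prefetch_metadata_hits.
- */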
-
-kstat_t *arc_ksp;
-static arc_state_t *arc_anon;
-static arc_state_t *arc_mru;
-static arc_state_t *arc_mru_ghost;
-static arc_state_t *arc_mfu;
-static arc_state_t *arc_mfu_ghost;
-
-/*
- * There are several ARC variables that are critical to export as kstats --
- * but we don't want to have to grovel around in the kstat whenever we wish to
- * manipulate them. For these variables, we therefore define them to be in
- * terms of the statistic variable. This assures that we are not introducing
- * the possibility of inconsistency by having shadow copies of the variables,
- * while still allowing the code to be readable.
- */
-#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
-#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
-#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
-#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
-#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
-
-static int arc_no_grow; /* Don't try to grow cache size */
-static uint64_t arc_tempreserve;
-
-typedef struct arc_callback arc_callback_t;
-
-struct arc_callback {
- void *acb_private;
- arc_done_func_t *acb_done;
- arc_byteswap_func_t *acb_byteswap;
- arc_buf_t *acb_buf;
- zio_t *acb_zio_dummy;
- arc_callback_t *acb_next;
-};
-
-typedef struct arc_write_callback arc_write_callback_t;
-
-struct arc_write_callback {
- void *awcb_private;
- arc_done_func_t *awcb_ready;
- arc_done_func_t *awcb_done;
- arc_buf_t *awcb_buf;
-};
-
-struct arc_buf_hdr {
- /* protected by hash lock */
- dva_t b_dva;
- uint64_t b_birth;
- uint64_t b_cksum0;
-
- kmutex_t b_freeze_lock;
- zio_cksum_t *b_freeze_cksum;
-
- arc_buf_hdr_t *b_hash_next;
- arc_buf_t *b_buf;
- uint32_t b_flags;
- uint32_t b_datacnt;
-
- arc_callback_t *b_acb;
- kcondvar_t b_cv;
-
- /* immutable */
- arc_buf_contents_t b_type;
- uint64_t b_size;
- spa_t *b_spa;
-
- /* protected by arc state mutex */
- arc_state_t *b_state;
- list_node_t b_arc_node;
-
- /* updated atomically */
- clock_t b_arc_access;
-
- /* self protecting */
- refcount_t b_refcnt;
-};
-
-static arc_buf_t *arc_eviction_list;
-static kmutex_t arc_eviction_mtx;
-static arc_buf_hdr_t arc_eviction_hdr;
-static void arc_get_data_buf(arc_buf_t *buf);
-static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
-
-#define GHOST_STATE(state) \
- ((state) == arc_mru_ghost || (state) == arc_mfu_ghost)
-
-/*
- * Private ARC flags. These flags are private ARC only flags that will show up
- * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
- * be passed in as arc_flags in things like arc_read. However, these flags
- * should never be passed in and should only be set by ARC code. When adding new
- * public flags, make sure not to smash the private ones.
- */
-
-#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
-#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
-#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
-#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
-#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
-#define ARC_INDIRECT (1 << 14) /* this is an indirect block */
-
-#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
-#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
-#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
-#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
-#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
-
-/*
- * Hash table routines
- */
-
-#define HT_LOCK_PAD 128
-
-struct ht_lock {
- kmutex_t ht_lock;
-#ifdef _KERNEL
- unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
-#endif
-};
-
-#define BUF_LOCKS 256
-typedef struct buf_hash_table {
- uint64_t ht_mask;
- arc_buf_hdr_t **ht_table;
- struct ht_lock ht_locks[BUF_LOCKS];
-} buf_hash_table_t;
-
-static buf_hash_table_t buf_hash_table;
-
-#define BUF_HASH_INDEX(spa, dva, birth) \
- (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
-#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
-#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
-#define HDR_LOCK(buf) \
- (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
-
-uint64_t zfs_crc64_table[256];
-
-static uint64_t
-buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
-{
- uintptr_t spav = (uintptr_t)spa;
- uint8_t *vdva = (uint8_t *)dva;
- uint64_t crc = -1ULL;
- int i;
-
- ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
-
- for (i = 0; i < sizeof (dva_t); i++)
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
-
- crc ^= (spav>>8) ^ birth;
-
- return (crc);
-}
-
-#define BUF_EMPTY(buf) \
- ((buf)->b_dva.dva_word[0] == 0 && \
- (buf)->b_dva.dva_word[1] == 0 && \
- (buf)->b_birth == 0)
-
-#define BUF_EQUAL(spa, dva, birth, buf) \
- ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
- ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
- ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
-
-static arc_buf_hdr_t *
-buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
-{
- uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
- kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
- arc_buf_hdr_t *buf;
-
- mutex_enter(hash_lock);
- for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
- buf = buf->b_hash_next) {
- if (BUF_EQUAL(spa, dva, birth, buf)) {
- *lockp = hash_lock;
- return (buf);
- }
- }
- mutex_exit(hash_lock);
- *lockp = NULL;
- return (NULL);
-}
-
-/*
- * Insert an entry into the hash table. If there is already an element
- * equal to elem in the hash table, then the already existing element
- * will be returned and the new element will not be inserted.
- * Otherwise returns NULL.
- */
-static arc_buf_hdr_t *
-buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
-{
- uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
- kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
- arc_buf_hdr_t *fbuf;
- uint32_t i;
-
- ASSERT(!HDR_IN_HASH_TABLE(buf));
- *lockp = hash_lock;
- mutex_enter(hash_lock);
- for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
- fbuf = fbuf->b_hash_next, i++) {
- if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
- return (fbuf);
- }
-
- buf->b_hash_next = buf_hash_table.ht_table[idx];
- buf_hash_table.ht_table[idx] = buf;
- buf->b_flags |= ARC_IN_HASH_TABLE;
-
- /* collect some hash table performance data */
- if (i > 0) {
- ARCSTAT_BUMP(arcstat_hash_collisions);
- if (i == 1)
- ARCSTAT_BUMP(arcstat_hash_chains);
-
- ARCSTAT_MAX(arcstat_hash_chain_max, i);
- }
-
- ARCSTAT_BUMP(arcstat_hash_elements);
- ARCSTAT_MAXSTAT(arcstat_hash_elements);
-
- return (NULL);
-}
-
-static void
-buf_hash_remove(arc_buf_hdr_t *buf)
-{
- arc_buf_hdr_t *fbuf, **bufp;
- uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
-
- ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
- ASSERT(HDR_IN_HASH_TABLE(buf));
-
- bufp = &buf_hash_table.ht_table[idx];
- while ((fbuf = *bufp) != buf) {
- ASSERT(fbuf != NULL);
- bufp = &fbuf->b_hash_next;
- }
- *bufp = buf->b_hash_next;
- buf->b_hash_next = NULL;
- buf->b_flags &= ~ARC_IN_HASH_TABLE;
-
- /* collect some hash table performance data */
- ARCSTAT_BUMPDOWN(arcstat_hash_elements);
-
- if (buf_hash_table.ht_table[idx] &&
- buf_hash_table.ht_table[idx]->b_hash_next == NULL)
- ARCSTAT_BUMPDOWN(arcstat_hash_chains);
-}
-
-/*
- * Global data structures and functions for the buf kmem cache.
- */
-static kmem_cache_t *hdr_cache;
-static kmem_cache_t *buf_cache;
-
-static void
-buf_fini(void)
-{
- int i;
-
- kmem_free(buf_hash_table.ht_table,
- (buf_hash_table.ht_mask + 1) * sizeof (void *));
- for (i = 0; i < BUF_LOCKS; i++)
- mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
- kmem_cache_destroy(hdr_cache);
- kmem_cache_destroy(buf_cache);
-}
-
-/*
- * Constructor callback - called when the cache is empty
- * and a new buf is requested.
- */
-/* ARGSUSED */
-static int
-hdr_cons(void *vbuf, void *unused, int kmflag)
-{
- arc_buf_hdr_t *buf = vbuf;
-
- bzero(buf, sizeof (arc_buf_hdr_t));
- refcount_create(&buf->b_refcnt);
- cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
- return (0);
-}
-
-/*
- * Destructor callback - called when a cached buf is
- * no longer required.
- */
-/* ARGSUSED */
-static void
-hdr_dest(void *vbuf, void *unused)
-{
- arc_buf_hdr_t *buf = vbuf;
-
- refcount_destroy(&buf->b_refcnt);
- cv_destroy(&buf->b_cv);
-}
-
-/*
- * Reclaim callback -- invoked when memory is low.
- */
-/* ARGSUSED */
-static void
-hdr_recl(void *unused)
-{
- dprintf("hdr_recl called\n");
- /*
- * umem calls the reclaim func when we destroy the buf cache,
- * which is after we do arc_fini().
- */
- if (!arc_dead)
- cv_signal(&arc_reclaim_thr_cv);
-}
-
-static void
-buf_init(void)
-{
- uint64_t *ct;
- uint64_t hsize = 1ULL << 12;
- int i, j;
-
- /*
- * The hash table is big enough to fill all of physical memory
- * with an average 64K block size. The table will take up
- * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
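- * (For example, 4GB of physmem gives a 64K-entry table: 4GB/64K
- * blocks, at 8 bytes per pointer, is 512KB.)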
- */
- while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
- hsize <<= 1;
-retry:
- buf_hash_table.ht_mask = hsize - 1;
- buf_hash_table.ht_table =
- kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
- if (buf_hash_table.ht_table == NULL) {
- ASSERT(hsize > (1ULL << 8));
- hsize >>= 1;
- goto retry;
- }
-
- hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
- 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
- buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
- 0, NULL, NULL, NULL, NULL, NULL, 0);
-
- for (i = 0; i < 256; i++)
- for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
- *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
-
- for (i = 0; i < BUF_LOCKS; i++) {
- mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
- NULL, MUTEX_DEFAULT, NULL);
- }
-}
-
-#define ARC_MINTIME (hz>>4) /* 62 ms */
-
-static void
-arc_cksum_verify(arc_buf_t *buf)
-{
- zio_cksum_t zc;
-
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
- return;
-
- mutex_enter(&buf->b_hdr->b_freeze_lock);
- if (buf->b_hdr->b_freeze_cksum == NULL ||
- (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
- mutex_exit(&buf->b_hdr->b_freeze_lock);
- return;
- }
- fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
- if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
- panic("buffer modified while frozen!");
- mutex_exit(&buf->b_hdr->b_freeze_lock);
-}
-
-static void
-arc_cksum_compute(arc_buf_t *buf)
-{
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
- return;
-
- mutex_enter(&buf->b_hdr->b_freeze_lock);
- if (buf->b_hdr->b_freeze_cksum != NULL) {
- mutex_exit(&buf->b_hdr->b_freeze_lock);
- return;
- }
- buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
- fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
- buf->b_hdr->b_freeze_cksum);
- mutex_exit(&buf->b_hdr->b_freeze_lock);
-}
-
-void
-arc_buf_thaw(arc_buf_t *buf)
-{
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
- return;
-
- if (buf->b_hdr->b_state != arc_anon)
- panic("modifying non-anon buffer!");
- if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
- panic("modifying buffer while i/o in progress!");
- arc_cksum_verify(buf);
- mutex_enter(&buf->b_hdr->b_freeze_lock);
- if (buf->b_hdr->b_freeze_cksum != NULL) {
- kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
- buf->b_hdr->b_freeze_cksum = NULL;
- }
- mutex_exit(&buf->b_hdr->b_freeze_lock);
-}
-
-void
-arc_buf_freeze(arc_buf_t *buf)
-{
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
- return;
-
- ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
- buf->b_hdr->b_state == arc_anon);
- arc_cksum_compute(buf);
-}
-
-static void
-add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
-{
- ASSERT(MUTEX_HELD(hash_lock));
-
- if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
- (ab->b_state != arc_anon)) {
- uint64_t delta = ab->b_size * ab->b_datacnt;
-
- ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
- mutex_enter(&ab->b_state->arcs_mtx);
- ASSERT(list_link_active(&ab->b_arc_node));
- list_remove(&ab->b_state->arcs_list, ab);
- if (GHOST_STATE(ab->b_state)) {
- ASSERT3U(ab->b_datacnt, ==, 0);
- ASSERT3P(ab->b_buf, ==, NULL);
- delta = ab->b_size;
- }
- ASSERT(delta > 0);
- ASSERT3U(ab->b_state->arcs_lsize, >=, delta);
- atomic_add_64(&ab->b_state->arcs_lsize, -delta);
- mutex_exit(&ab->b_state->arcs_mtx);
- /* remove the prefetch flag if we get a reference */
- if (ab->b_flags & ARC_PREFETCH)
- ab->b_flags &= ~ARC_PREFETCH;
- }
-}
-
-static int
-remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
-{
- int cnt;
- arc_state_t *state = ab->b_state;
-
- ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
- ASSERT(!GHOST_STATE(state));
-
- if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
- (state != arc_anon)) {
- ASSERT(!MUTEX_HELD(&state->arcs_mtx));
- mutex_enter(&state->arcs_mtx);
- ASSERT(!list_link_active(&ab->b_arc_node));
- list_insert_head(&state->arcs_list, ab);
- ASSERT(ab->b_datacnt > 0);
- atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt);
- ASSERT3U(state->arcs_size, >=, state->arcs_lsize);
- mutex_exit(&state->arcs_mtx);
- }
- return (cnt);
-}
-
-/*
- * Move the supplied buffer to the indicated state. The mutex
- * for the buffer must be held by the caller.
- */
-static void
-arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
-{
- arc_state_t *old_state = ab->b_state;
- int64_t refcnt = refcount_count(&ab->b_refcnt);
- uint64_t from_delta, to_delta;
-
- ASSERT(MUTEX_HELD(hash_lock));
- ASSERT(new_state != old_state);
- ASSERT(refcnt == 0 || ab->b_datacnt > 0);
- ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
-
- from_delta = to_delta = ab->b_datacnt * ab->b_size;
-
- /*
- * If this buffer is evictable, transfer it from the
- * old state list to the new state list.
- */
- if (refcnt == 0) {
- if (old_state != arc_anon) {
- int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
-
- if (use_mutex)
- mutex_enter(&old_state->arcs_mtx);
-
- ASSERT(list_link_active(&ab->b_arc_node));
- list_remove(&old_state->arcs_list, ab);
-
- /*
- * If prefetching out of the ghost cache,
- * we will have a non-zero datacnt.
- */
- if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
- /* ghost elements have a ghost size */
- ASSERT(ab->b_buf == NULL);
- from_delta = ab->b_size;
- }
- ASSERT3U(old_state->arcs_lsize, >=, from_delta);
- atomic_add_64(&old_state->arcs_lsize, -from_delta);
-
- if (use_mutex)
- mutex_exit(&old_state->arcs_mtx);
- }
- if (new_state != arc_anon) {
- int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
-
- if (use_mutex)
- mutex_enter(&new_state->arcs_mtx);
-
- list_insert_head(&new_state->arcs_list, ab);
-
- /* ghost elements have a ghost size */
- if (GHOST_STATE(new_state)) {
- ASSERT(ab->b_datacnt == 0);
- ASSERT(ab->b_buf == NULL);
- to_delta = ab->b_size;
- }
- atomic_add_64(&new_state->arcs_lsize, to_delta);
- ASSERT3U(new_state->arcs_size + to_delta, >=,
- new_state->arcs_lsize);
-
- if (use_mutex)
- mutex_exit(&new_state->arcs_mtx);
- }
- }
-
- ASSERT(!BUF_EMPTY(ab));
- if (new_state == arc_anon && old_state != arc_anon) {
- buf_hash_remove(ab);
- }
-
- /* adjust state sizes */
- if (to_delta)
- atomic_add_64(&new_state->arcs_size, to_delta);
- if (from_delta) {
- ASSERT3U(old_state->arcs_size, >=, from_delta);
- atomic_add_64(&old_state->arcs_size, -from_delta);
- }
- ab->b_state = new_state;
-}
-
-arc_buf_t *
-arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
-{
- arc_buf_hdr_t *hdr;
- arc_buf_t *buf;
-
- ASSERT3U(size, >, 0);
- hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
- ASSERT(BUF_EMPTY(hdr));
- hdr->b_size = size;
- hdr->b_type = type;
- hdr->b_spa = spa;
- hdr->b_state = arc_anon;
- hdr->b_arc_access = 0;
- mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
- buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
- buf->b_hdr = hdr;
- buf->b_data = NULL;
- buf->b_efunc = NULL;
- buf->b_private = NULL;
- buf->b_next = NULL;
- hdr->b_buf = buf;
- arc_get_data_buf(buf);
- hdr->b_datacnt = 1;
- hdr->b_flags = 0;
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
- (void) refcount_add(&hdr->b_refcnt, tag);
-
- return (buf);
-}
-
-static arc_buf_t *
-arc_buf_clone(arc_buf_t *from)
-{
- arc_buf_t *buf;
- arc_buf_hdr_t *hdr = from->b_hdr;
- uint64_t size = hdr->b_size;
-
- buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
- buf->b_hdr = hdr;
- buf->b_data = NULL;
- buf->b_efunc = NULL;
- buf->b_private = NULL;
- buf->b_next = hdr->b_buf;
- hdr->b_buf = buf;
- arc_get_data_buf(buf);
- bcopy(from->b_data, buf->b_data, size);
- hdr->b_datacnt += 1;
- return (buf);
-}
-
-void
-arc_buf_add_ref(arc_buf_t *buf, void* tag)
-{
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock;
-
- /*
- * Check to see if this buffer is currently being evicted via
- * arc_do_user_evicts().
- */
- mutex_enter(&arc_eviction_mtx);
- hdr = buf->b_hdr;
- if (hdr == NULL) {
- mutex_exit(&arc_eviction_mtx);
- return;
- }
- hash_lock = HDR_LOCK(hdr);
- mutex_exit(&arc_eviction_mtx);
-
- mutex_enter(hash_lock);
- if (buf->b_data == NULL) {
- /*
- * This buffer is evicted.
- */
- mutex_exit(hash_lock);
- return;
- }
-
- ASSERT(buf->b_hdr == hdr);
- ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
- add_reference(hdr, hash_lock, tag);
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
- ARCSTAT_BUMP(arcstat_hits);
- ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
- demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
- data, metadata, hits);
-}
-
-static void
-arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
-{
- arc_buf_t **bufp;
-
- /* free up data associated with the buf */
- if (buf->b_data) {
- arc_state_t *state = buf->b_hdr->b_state;
- uint64_t size = buf->b_hdr->b_size;
- arc_buf_contents_t type = buf->b_hdr->b_type;
-
- arc_cksum_verify(buf);
- if (!recycle) {
- if (type == ARC_BUFC_METADATA) {
- zio_buf_free(buf->b_data, size);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- zio_data_buf_free(buf->b_data, size);
- }
- atomic_add_64(&arc_size, -size);
- }
- if (list_link_active(&buf->b_hdr->b_arc_node)) {
- ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
- ASSERT(state != arc_anon);
- ASSERT3U(state->arcs_lsize, >=, size);
- atomic_add_64(&state->arcs_lsize, -size);
- }
- ASSERT3U(state->arcs_size, >=, size);
- atomic_add_64(&state->arcs_size, -size);
- buf->b_data = NULL;
- ASSERT(buf->b_hdr->b_datacnt > 0);
- buf->b_hdr->b_datacnt -= 1;
- }
-
- /* only remove the buf if requested */
- if (!all)
- return;
-
- /* remove the buf from the hdr list */
- for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
- continue;
- *bufp = buf->b_next;
-
- ASSERT(buf->b_efunc == NULL);
-
- /* clean up the buf */
- buf->b_hdr = NULL;
- kmem_cache_free(buf_cache, buf);
-}
-
-static void
-arc_hdr_destroy(arc_buf_hdr_t *hdr)
-{
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
- ASSERT3P(hdr->b_state, ==, arc_anon);
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-
- if (!BUF_EMPTY(hdr)) {
- ASSERT(!HDR_IN_HASH_TABLE(hdr));
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
- }
- while (hdr->b_buf) {
- arc_buf_t *buf = hdr->b_buf;
-
- if (buf->b_efunc) {
- mutex_enter(&arc_eviction_mtx);
- ASSERT(buf->b_hdr != NULL);
- arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
- hdr->b_buf = buf->b_next;
- buf->b_hdr = &arc_eviction_hdr;
- buf->b_next = arc_eviction_list;
- arc_eviction_list = buf;
- mutex_exit(&arc_eviction_mtx);
- } else {
- arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
- }
- }
- if (hdr->b_freeze_cksum != NULL) {
- kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
- hdr->b_freeze_cksum = NULL;
- }
- mutex_destroy(&hdr->b_freeze_lock);
-
- ASSERT(!list_link_active(&hdr->b_arc_node));
- ASSERT3P(hdr->b_hash_next, ==, NULL);
- ASSERT3P(hdr->b_acb, ==, NULL);
- kmem_cache_free(hdr_cache, hdr);
-}
-
-void
-arc_buf_free(arc_buf_t *buf, void *tag)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- int hashed = hdr->b_state != arc_anon;
-
- ASSERT(buf->b_efunc == NULL);
- ASSERT(buf->b_data != NULL);
-
- if (hashed) {
- kmutex_t *hash_lock = HDR_LOCK(hdr);
-
- mutex_enter(hash_lock);
- (void) remove_reference(hdr, hash_lock, tag);
- if (hdr->b_datacnt > 1)
- arc_buf_destroy(buf, FALSE, TRUE);
- else
- hdr->b_flags |= ARC_BUF_AVAILABLE;
- mutex_exit(hash_lock);
- } else if (HDR_IO_IN_PROGRESS(hdr)) {
- int destroy_hdr;
- /*
- * We are in the middle of an async write. Don't destroy
- * this buffer unless the write completes before we finish
- * decrementing the reference count.
- */
- mutex_enter(&arc_eviction_mtx);
- (void) remove_reference(hdr, NULL, tag);
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
- destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
- mutex_exit(&arc_eviction_mtx);
- if (destroy_hdr)
- arc_hdr_destroy(hdr);
- } else {
- if (remove_reference(hdr, NULL, tag) > 0) {
- ASSERT(HDR_IO_ERROR(hdr));
- arc_buf_destroy(buf, FALSE, TRUE);
- } else {
- arc_hdr_destroy(hdr);
- }
- }
-}
-
-int
-arc_buf_remove_ref(arc_buf_t *buf, void* tag)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock = HDR_LOCK(hdr);
- int no_callback = (buf->b_efunc == NULL);
-
- if (hdr->b_state == arc_anon) {
- arc_buf_free(buf, tag);
- return (no_callback);
- }
-
- mutex_enter(hash_lock);
- ASSERT(hdr->b_state != arc_anon);
- ASSERT(buf->b_data != NULL);
-
- (void) remove_reference(hdr, hash_lock, tag);
- if (hdr->b_datacnt > 1) {
- if (no_callback)
- arc_buf_destroy(buf, FALSE, TRUE);
- } else if (no_callback) {
- ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
- hdr->b_flags |= ARC_BUF_AVAILABLE;
- }
- ASSERT(no_callback || hdr->b_datacnt > 1 ||
- refcount_is_zero(&hdr->b_refcnt));
- mutex_exit(hash_lock);
- return (no_callback);
-}
-
-int
-arc_buf_size(arc_buf_t *buf)
-{
- return (buf->b_hdr->b_size);
-}
-
-/*
- * Evict buffers from list until we've removed the specified number of
- * bytes. Move the removed buffers to the appropriate evict state.
- * If the recycle flag is set, then attempt to "recycle" a buffer:
- * - look for a buffer to evict that is `bytes' long.
- * - return the data block from this buffer rather than freeing it.
- * This flag is used by callers that are trying to make space for a
- * new buffer in a full arc cache.
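- *
- * For example, arc_get_data_buf() below uses
- * arc_evict(state, size, TRUE, type) to steal a data block of
- * exactly `bytes' length for a new buffer.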
- */
-static void *
-arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
- arc_buf_contents_t type)
-{
- arc_state_t *evicted_state;
- uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
- arc_buf_hdr_t *ab, *ab_prev = NULL;
- kmutex_t *hash_lock;
- boolean_t have_lock;
- void *stolen = NULL;
-
- ASSERT(state == arc_mru || state == arc_mfu);
-
- evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
-
- mutex_enter(&state->arcs_mtx);
- mutex_enter(&evicted_state->arcs_mtx);
-
- for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
- ab_prev = list_prev(&state->arcs_list, ab);
- /* prefetch buffers have a minimum lifespan */
- if (HDR_IO_IN_PROGRESS(ab) ||
- (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
- LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) {
- skipped++;
- continue;
- }
- /* "lookahead" for better eviction candidate */
- if (recycle && ab->b_size != bytes &&
- ab_prev && ab_prev->b_size == bytes)
- continue;
- hash_lock = HDR_LOCK(ab);
- have_lock = MUTEX_HELD(hash_lock);
- if (have_lock || mutex_tryenter(hash_lock)) {
- ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
- ASSERT(ab->b_datacnt > 0);
- while (ab->b_buf) {
- arc_buf_t *buf = ab->b_buf;
- if (buf->b_data) {
- bytes_evicted += ab->b_size;
- if (recycle && ab->b_type == type &&
- ab->b_size == bytes) {
- stolen = buf->b_data;
- recycle = FALSE;
- }
- }
- if (buf->b_efunc) {
- mutex_enter(&arc_eviction_mtx);
- arc_buf_destroy(buf,
- buf->b_data == stolen, FALSE);
- ab->b_buf = buf->b_next;
- buf->b_hdr = &arc_eviction_hdr;
- buf->b_next = arc_eviction_list;
- arc_eviction_list = buf;
- mutex_exit(&arc_eviction_mtx);
- } else {
- arc_buf_destroy(buf,
- buf->b_data == stolen, TRUE);
- }
- }
- ASSERT(ab->b_datacnt == 0);
- arc_change_state(evicted_state, ab, hash_lock);
- ASSERT(HDR_IN_HASH_TABLE(ab));
- ab->b_flags = ARC_IN_HASH_TABLE;
- DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
- if (!have_lock)
- mutex_exit(hash_lock);
- if (bytes >= 0 && bytes_evicted >= bytes)
- break;
- } else {
- missed += 1;
- }
- }
-
- mutex_exit(&evicted_state->arcs_mtx);
- mutex_exit(&state->arcs_mtx);
-
- if (bytes_evicted < bytes)
- dprintf("only evicted %lld bytes from %x",
- (longlong_t)bytes_evicted, state);
-
- if (skipped)
- ARCSTAT_INCR(arcstat_evict_skip, skipped);
-
- if (missed)
- ARCSTAT_INCR(arcstat_mutex_miss, missed);
-
- return (stolen);
-}
-
-/*
- * Remove buffers from list until we've removed the specified number of
- * bytes. Destroy the buffers that are removed.
- */
-static void
-arc_evict_ghost(arc_state_t *state, int64_t bytes)
-{
- arc_buf_hdr_t *ab, *ab_prev;
- kmutex_t *hash_lock;
- uint64_t bytes_deleted = 0;
- uint64_t bufs_skipped = 0;
-
- ASSERT(GHOST_STATE(state));
-top:
- mutex_enter(&state->arcs_mtx);
- for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
- ab_prev = list_prev(&state->arcs_list, ab);
- hash_lock = HDR_LOCK(ab);
- if (mutex_tryenter(hash_lock)) {
- ASSERT(!HDR_IO_IN_PROGRESS(ab));
- ASSERT(ab->b_buf == NULL);
- arc_change_state(arc_anon, ab, hash_lock);
- mutex_exit(hash_lock);
- ARCSTAT_BUMP(arcstat_deleted);
- bytes_deleted += ab->b_size;
- arc_hdr_destroy(ab);
- DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
- if (bytes >= 0 && bytes_deleted >= bytes)
- break;
- } else {
- if (bytes < 0) {
- mutex_exit(&state->arcs_mtx);
- mutex_enter(hash_lock);
- mutex_exit(hash_lock);
- goto top;
- }
- bufs_skipped += 1;
- }
- }
- mutex_exit(&state->arcs_mtx);
-
- if (bufs_skipped) {
- ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
- ASSERT(bytes >= 0);
- }
-
- if (bytes_deleted < bytes)
- dprintf("only deleted %lld bytes from %p",
- (longlong_t)bytes_deleted, state);
-}
-
-static void
-arc_adjust(void)
-{
- int64_t top_sz, mru_over, arc_over, todelete;
-
- top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
-
- if (top_sz > arc_p && arc_mru->arcs_lsize > 0) {
- int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p);
- (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF);
- top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
- }
-
- mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
-
- if (mru_over > 0) {
- if (arc_mru_ghost->arcs_lsize > 0) {
- todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over);
- arc_evict_ghost(arc_mru_ghost, todelete);
- }
- }
-
- if ((arc_over = arc_size - arc_c) > 0) {
- int64_t tbl_over;
-
- if (arc_mfu->arcs_lsize > 0) {
- int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over);
- (void) arc_evict(arc_mfu, toevict, FALSE,
- ARC_BUFC_UNDEF);
- }
-
- tbl_over = arc_size + arc_mru_ghost->arcs_lsize +
- arc_mfu_ghost->arcs_lsize - arc_c*2;
-
- if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) {
- todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over);
- arc_evict_ghost(arc_mfu_ghost, todelete);
- }
- }
-}
-
-static void
-arc_do_user_evicts(void)
-{
- mutex_enter(&arc_eviction_mtx);
- while (arc_eviction_list != NULL) {
- arc_buf_t *buf = arc_eviction_list;
- arc_eviction_list = buf->b_next;
- buf->b_hdr = NULL;
- mutex_exit(&arc_eviction_mtx);
-
- if (buf->b_efunc != NULL)
- VERIFY(buf->b_efunc(buf) == 0);
-
- buf->b_efunc = NULL;
- buf->b_private = NULL;
- kmem_cache_free(buf_cache, buf);
- mutex_enter(&arc_eviction_mtx);
- }
- mutex_exit(&arc_eviction_mtx);
-}
-
-/*
- * Flush all *evictable* data from the cache.
- * NOTE: this will not touch "active" (i.e. referenced) data.
- */
-void
-arc_flush(void)
-{
- while (list_head(&arc_mru->arcs_list))
- (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF);
- while (list_head(&arc_mfu->arcs_list))
- (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF);
-
- arc_evict_ghost(arc_mru_ghost, -1);
- arc_evict_ghost(arc_mfu_ghost, -1);
-
- mutex_enter(&arc_reclaim_thr_lock);
- arc_do_user_evicts();
- mutex_exit(&arc_reclaim_thr_lock);
- ASSERT(arc_eviction_list == NULL);
-}
-
-int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */
-
-void
-arc_shrink(void)
-{
- if (arc_c > arc_c_min) {
- uint64_t to_free;
-
-#ifdef _KERNEL
- to_free = arc_c >> arc_shrink_shift;
-#else
- to_free = arc_c >> arc_shrink_shift;
-#endif
- if (arc_c > arc_c_min + to_free)
- atomic_add_64(&arc_c, -to_free);
- else
- arc_c = arc_c_min;
-
- atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
- if (arc_c > arc_size)
- arc_c = MAX(arc_size, arc_c_min);
- if (arc_p > arc_c)
- arc_p = (arc_c >> 1);
- ASSERT(arc_c >= arc_c_min);
- ASSERT((int64_t)arc_p >= 0);
- }
-
- if (arc_size > arc_c)
- arc_adjust();
-}
-
-static int zfs_needfree = 0;
-
-static int
-arc_reclaim_needed(void)
-{
-#if 0
- uint64_t extra;
-#endif
-
-#ifdef _KERNEL
-
- if (zfs_needfree)
- return (1);
-
-#if 0
- /*
- * check to make sure that swapfs has enough space so that anon
- * reservations can still succeed. anon_resvmem() checks that the
- * availrmem is greater than swapfs_minfree, and the number of reserved
- * swap pages. We also add a bit of extra here just to prevent
- * circumstances from getting really dire.
- */
- if (availrmem < swapfs_minfree + swapfs_reserve + extra)
- return (1);
-
- /*
- * If zio data pages are being allocated out of a separate heap segment,
- * then check that the size of available vmem for this area remains
- * above 1/4th free. This needs to be done when the size of the
- * non-default segment is smaller than physical memory, so we could
- * conceivably run out of VA in that segment before running out of
- * physical memory.
- */
- if (zio_arena != NULL) {
- size_t arc_ziosize =
- btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC));
-
- if ((physmem > arc_ziosize) &&
- (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2))
- return (1);
- }
-
-#if defined(__i386)
- /*
- * If we're on an i386 platform, it's possible that we'll exhaust the
- * kernel heap space before we ever run out of available physical
- * memory. Most checks of the size of the heap_area compare against
- * tune.t_minarmem, which is the minimum available real memory that we
- * can have in the system. However, this is generally fixed at 25 pages
- * which is so low that it's useless. In this comparison, we seek to
- * calculate the total heap-size, and reclaim if more than 3/4ths of the
- * heap is allocated. (Or, in the calculation, if less than 1/4th is
- * free)
- */
- if (btop(vmem_size(heap_arena, VMEM_FREE)) <
- (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
- return (1);
-#endif
-#else
- if (kmem_used() > (kmem_size() * 3) / 4)
- return (1);
-#endif
-
-#else
- if (spa_get_random(100) == 0)
- return (1);
-#endif
- return (0);
-}
-
-static void
-arc_kmem_reap_now(arc_reclaim_strategy_t strat)
-{
-#ifdef ZIO_USE_UMA
- size_t i;
- kmem_cache_t *prev_cache = NULL;
- kmem_cache_t *prev_data_cache = NULL;
- extern kmem_cache_t *zio_buf_cache[];
- extern kmem_cache_t *zio_data_buf_cache[];
-#endif
-
-#ifdef _KERNEL
- /*
- * First purge some DNLC entries, in case the DNLC is using
- * up too much memory.
- */
- dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
-
-#if defined(__i386)
- /*
- * Reclaim unused memory from all kmem caches.
- */
- kmem_reap();
-#endif
-#endif
-
- /*
- * An aggressive reclamation will shrink the cache size as well as
- * reap free buffers from the arc kmem caches.
- */
- if (strat == ARC_RECLAIM_AGGR)
- arc_shrink();
-
-#ifdef ZIO_USE_UMA
- for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
- if (zio_buf_cache[i] != prev_cache) {
- prev_cache = zio_buf_cache[i];
- kmem_cache_reap_now(zio_buf_cache[i]);
- }
- if (zio_data_buf_cache[i] != prev_data_cache) {
- prev_data_cache = zio_data_buf_cache[i];
- kmem_cache_reap_now(zio_data_buf_cache[i]);
- }
- }
-#endif
- kmem_cache_reap_now(buf_cache);
- kmem_cache_reap_now(hdr_cache);
-}
-
-static void
-arc_reclaim_thread(void *dummy __unused)
-{
- clock_t growtime = 0;
- arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
- callb_cpr_t cpr;
-
- CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
-
- mutex_enter(&arc_reclaim_thr_lock);
- while (arc_thread_exit == 0) {
- if (arc_reclaim_needed()) {
-
- if (arc_no_grow) {
- if (last_reclaim == ARC_RECLAIM_CONS) {
- last_reclaim = ARC_RECLAIM_AGGR;
- } else {
- last_reclaim = ARC_RECLAIM_CONS;
- }
- } else {
- arc_no_grow = TRUE;
- last_reclaim = ARC_RECLAIM_AGGR;
- membar_producer();
- }
-
- /* reset the growth delay for every reclaim */
- growtime = LBOLT + (arc_grow_retry * hz);
- ASSERT(growtime > 0);
-
- if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) {
- /*
- * If zfs_needfree is TRUE our vm_lowmem hook
- * was called and in that case we must free some
- * memory, so switch to aggressive mode.
- */
- arc_no_grow = TRUE;
- last_reclaim = ARC_RECLAIM_AGGR;
- }
- arc_kmem_reap_now(last_reclaim);
- } else if ((growtime > 0) && ((growtime - LBOLT) <= 0)) {
- arc_no_grow = FALSE;
- }
-
- if (zfs_needfree ||
- (2 * arc_c < arc_size +
- arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size))
- arc_adjust();
-
- if (arc_eviction_list != NULL)
- arc_do_user_evicts();
-
- if (arc_reclaim_needed()) {
- zfs_needfree = 0;
-#ifdef _KERNEL
- wakeup(&zfs_needfree);
-#endif
- }
-
- /* block until needed, or one second, whichever is shorter */
- CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait(&arc_reclaim_thr_cv,
- &arc_reclaim_thr_lock, hz);
- CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
- }
-
- arc_thread_exit = 0;
- cv_broadcast(&arc_reclaim_thr_cv);
- CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
- thread_exit();
-}
-
-/*
- * Adapt arc info given the number of bytes we are trying to add and
- * the state that we are coming from. This function is only called
- * when we are adding new content to the cache.
- */
-static void
-arc_adapt(int bytes, arc_state_t *state)
-{
- int mult;
-
- ASSERT(bytes > 0);
- /*
- * Adapt the target size of the MRU list:
- * - if we just hit in the MRU ghost list, then increase
- * the target size of the MRU list.
- * - if we just hit in the MFU ghost list, then increase
- * the target size of the MFU list by decreasing the
- * target size of the MRU list.
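- *
- * For example, if the MRU ghost list holds 10MB and the MFU ghost
- * list holds 30MB, a hit in the MRU ghost list grows arc_p by
- * 3 * bytes (mult = 30/10), while a hit in the MFU ghost list
- * shrinks arc_p by only 1 * bytes (mult = 1).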
- */
- if (state == arc_mru_ghost) {
- mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
- 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
-
- arc_p = MIN(arc_c, arc_p + bytes * mult);
- } else if (state == arc_mfu_ghost) {
- mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
- 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
-
- arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
- }
- ASSERT((int64_t)arc_p >= 0);
-
- if (arc_reclaim_needed()) {
- cv_signal(&arc_reclaim_thr_cv);
- return;
- }
-
- if (arc_no_grow)
- return;
-
- if (arc_c >= arc_c_max)
- return;
-
- /*
- * If we're within (2 * maxblocksize) bytes of the target
- * cache size, increment the target cache size
- */
- if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
- atomic_add_64(&arc_c, (int64_t)bytes);
- if (arc_c > arc_c_max)
- arc_c = arc_c_max;
- else if (state == arc_anon)
- atomic_add_64(&arc_p, (int64_t)bytes);
- if (arc_p > arc_c)
- arc_p = arc_c;
- }
- ASSERT((int64_t)arc_p >= 0);
-}
-
-/*
- * Check if the cache has reached its limits and eviction is required
- * prior to insert.
- */
-static int
-arc_evict_needed()
-{
- if (arc_reclaim_needed())
- return (1);
-
- return (arc_size > arc_c);
-}
-
-/*
- * The buffer, supplied as the first argument, needs a data block.
- * So, if we are at cache max, determine which cache should be victimized.
- * We have the following cases:
- *
- * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
- * In this situation if we're out of space, but the resident size of the MFU is
- * under the limit, victimize the MFU cache to satisfy this insertion request.
- *
- * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
- * Here, we've used up all of the available space for the MRU, so we need to
- * evict from our own cache instead. Evict from the set of resident MRU
- * entries.
- *
- * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
- * c minus p represents the MFU space in the cache, since p is the size of the
- * cache that is dedicated to the MRU. In this situation there's still space on
- * the MFU side, so the MRU side needs to be victimized.
- *
- * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
- * MFU's resident set is consuming more space than it has been allotted. In
- * this situation, we must victimize our own cache, the MFU, for this insertion.
- */
-static void
-arc_get_data_buf(arc_buf_t *buf)
-{
- arc_state_t *state = buf->b_hdr->b_state;
- uint64_t size = buf->b_hdr->b_size;
- arc_buf_contents_t type = buf->b_hdr->b_type;
-
- arc_adapt(size, state);
-
- /*
- * We have not yet reached cache maximum size,
- * just allocate a new buffer.
- */
- if (!arc_evict_needed()) {
- if (type == ARC_BUFC_METADATA) {
- buf->b_data = zio_buf_alloc(size);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- buf->b_data = zio_data_buf_alloc(size);
- }
- atomic_add_64(&arc_size, size);
- goto out;
- }
-
- /*
- * If we are prefetching from the mfu ghost list, this buffer
- * will end up on the mru list; so steal space from there.
- */
- if (state == arc_mfu_ghost)
- state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
- else if (state == arc_mru_ghost)
- state = arc_mru;
-
- if (state == arc_mru || state == arc_anon) {
- uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
- state = (arc_p > mru_used) ? arc_mfu : arc_mru;
- } else {
- /* MFU cases */
- uint64_t mfu_space = arc_c - arc_p;
- state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
- }
- if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) {
- if (type == ARC_BUFC_METADATA) {
- buf->b_data = zio_buf_alloc(size);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- buf->b_data = zio_data_buf_alloc(size);
- }
- atomic_add_64(&arc_size, size);
- ARCSTAT_BUMP(arcstat_recycle_miss);
- }
- ASSERT(buf->b_data != NULL);
-out:
- /*
- * Update the state size. Note that ghost states have a
- * "ghost size" and so don't need to be updated.
- */
- if (!GHOST_STATE(buf->b_hdr->b_state)) {
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- atomic_add_64(&hdr->b_state->arcs_size, size);
- if (list_link_active(&hdr->b_arc_node)) {
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
- atomic_add_64(&hdr->b_state->arcs_lsize, size);
- }
- /*
- * If we are growing the cache, and we are adding anonymous
- * data, and we have outgrown arc_p, update arc_p
- */
- if (arc_size < arc_c && hdr->b_state == arc_anon &&
- arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
- arc_p = MIN(arc_c, arc_p + size);
- }
-}
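-
-/*
- * A worked example of the victim selection above, assuming arc_c = 1000
- * and arc_p = 400: an MRU insert with arc_anon + arc_mru resident
- * totalling 450 sees arc_p <= mru_used and evicts from arc_mru itself
- * (case 2); with a resident total of 350 it would victimize arc_mfu
- * instead (case 1).
- */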
-
-/*
- * This routine is called whenever a buffer is accessed.
- * NOTE: the hash lock must be held on entry; callers drop it afterwards.
- */
-static void
-arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
-{
- ASSERT(MUTEX_HELD(hash_lock));
-
- if (buf->b_state == arc_anon) {
- /*
- * This buffer is not in the cache, and does not
- * appear in our "ghost" list. Add the new buffer
- * to the MRU state.
- */
-
- ASSERT(buf->b_arc_access == 0);
- buf->b_arc_access = LBOLT;
- DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
- arc_change_state(arc_mru, buf, hash_lock);
-
- } else if (buf->b_state == arc_mru) {
- /*
- * If this buffer is here because of a prefetch, then either:
- * - clear the flag if this is a "referencing" read
- * (any subsequent access will bump this into the MFU state).
- * or
- * - move the buffer to the head of the list if this is
- * another prefetch (to make it less likely to be evicted).
- */
- if ((buf->b_flags & ARC_PREFETCH) != 0) {
- if (refcount_count(&buf->b_refcnt) == 0) {
- ASSERT(list_link_active(&buf->b_arc_node));
- mutex_enter(&arc_mru->arcs_mtx);
- list_remove(&arc_mru->arcs_list, buf);
- list_insert_head(&arc_mru->arcs_list, buf);
- mutex_exit(&arc_mru->arcs_mtx);
- } else {
- buf->b_flags &= ~ARC_PREFETCH;
- ARCSTAT_BUMP(arcstat_mru_hits);
- }
- buf->b_arc_access = LBOLT;
- return;
- }
-
- /*
- * This buffer has been "accessed" only once so far,
- * but it is still in the cache. Move it to the MFU
- * state.
- */
- if (LBOLT > buf->b_arc_access + ARC_MINTIME) {
- /*
- * More than 125ms have passed since we
- * instantiated this buffer. Move it to the
- * most frequently used state.
- */
- buf->b_arc_access = LBOLT;
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
- arc_change_state(arc_mfu, buf, hash_lock);
- }
- ARCSTAT_BUMP(arcstat_mru_hits);
- } else if (buf->b_state == arc_mru_ghost) {
- arc_state_t *new_state;
- /*
- * This buffer has been "accessed" recently, but
- * was evicted from the cache. Move it to the
- * MFU state.
- */
-
- if (buf->b_flags & ARC_PREFETCH) {
- new_state = arc_mru;
- if (refcount_count(&buf->b_refcnt) > 0)
- buf->b_flags &= ~ARC_PREFETCH;
- DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
- } else {
- new_state = arc_mfu;
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
- }
-
- buf->b_arc_access = LBOLT;
- arc_change_state(new_state, buf, hash_lock);
-
- ARCSTAT_BUMP(arcstat_mru_ghost_hits);
- } else if (buf->b_state == arc_mfu) {
- /*
- * This buffer has been accessed more than once and is
- * still in the cache. Keep it in the MFU state.
- *
- * NOTE: an add_reference() that occurred when we did
- * the arc_read() will have kicked this off the list.
- * If it was a prefetch, we will explicitly move it to
- * the head of the list now.
- */
- if ((buf->b_flags & ARC_PREFETCH) != 0) {
- ASSERT(refcount_count(&buf->b_refcnt) == 0);
- ASSERT(list_link_active(&buf->b_arc_node));
- mutex_enter(&arc_mfu->arcs_mtx);
- list_remove(&arc_mfu->arcs_list, buf);
- list_insert_head(&arc_mfu->arcs_list, buf);
- mutex_exit(&arc_mfu->arcs_mtx);
- }
- ARCSTAT_BUMP(arcstat_mfu_hits);
- buf->b_arc_access = LBOLT;
- } else if (buf->b_state == arc_mfu_ghost) {
- arc_state_t *new_state = arc_mfu;
- /*
- * This buffer has been accessed more than once but has
- * been evicted from the cache. Move it back to the
- * MFU state.
- */
-
- if (buf->b_flags & ARC_PREFETCH) {
- /*
- * This is a prefetch access...
- * move this block back to the MRU state.
- */
- ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
- new_state = arc_mru;
- }
-
- buf->b_arc_access = LBOLT;
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
- arc_change_state(new_state, buf, hash_lock);
-
- ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
- } else {
- ASSERT(!"invalid arc state");
- }
-}
-
-/* a generic arc_done_func_t which you can use */
-/* ARGSUSED */
-void
-arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
-{
- bcopy(buf->b_data, arg, buf->b_hdr->b_size);
- VERIFY(arc_buf_remove_ref(buf, arg) == 1);
-}
-
-/* a generic arc_done_func_t which you can use */
-void
-arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
-{
- arc_buf_t **bufp = arg;
- if (zio && zio->io_error) {
- VERIFY(arc_buf_remove_ref(buf, arg) == 1);
- *bufp = NULL;
- } else {
- *bufp = buf;
- }
-}
-
-static void
-arc_read_done(zio_t *zio)
-{
- arc_buf_hdr_t *hdr, *found;
- arc_buf_t *buf;
- arc_buf_t *abuf; /* buffer we're assigning to callback */
- kmutex_t *hash_lock;
- arc_callback_t *callback_list, *acb;
- int freeable = FALSE;
-
- buf = zio->io_private;
- hdr = buf->b_hdr;
-
- /*
- * The hdr was inserted into hash-table and removed from lists
- * prior to starting I/O. We should find this header, since
- * it's in the hash table, and it should be legit since it's
- * not possible to evict it during the I/O. The only possible
- * reason for it not to be found is if we were freed during the
- * read.
- */
- found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
- &hash_lock);
-
- ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
- (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
-
- /* byteswap if necessary */
- callback_list = hdr->b_acb;
- ASSERT(callback_list != NULL);
- if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
- callback_list->acb_byteswap(buf->b_data, hdr->b_size);
-
- arc_cksum_compute(buf);
-
- /* create copies of the data buffer for the callers */
- abuf = buf;
- for (acb = callback_list; acb; acb = acb->acb_next) {
- if (acb->acb_done) {
- if (abuf == NULL)
- abuf = arc_buf_clone(buf);
- acb->acb_buf = abuf;
- abuf = NULL;
- }
- }
- hdr->b_acb = NULL;
- hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- ASSERT(!HDR_BUF_AVAILABLE(hdr));
- if (abuf == buf)
- hdr->b_flags |= ARC_BUF_AVAILABLE;
-
- ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
-
- if (zio->io_error != 0) {
- hdr->b_flags |= ARC_IO_ERROR;
- if (hdr->b_state != arc_anon)
- arc_change_state(arc_anon, hdr, hash_lock);
- if (HDR_IN_HASH_TABLE(hdr))
- buf_hash_remove(hdr);
- freeable = refcount_is_zero(&hdr->b_refcnt);
- /* convert checksum errors into IO errors */
- if (zio->io_error == ECKSUM)
- zio->io_error = EIO;
- }
-
- /*
- * Broadcast before we drop the hash_lock to avoid the possibility
- * that the hdr (and hence the cv) might be freed before we get to
- * the cv_broadcast().
- */
- cv_broadcast(&hdr->b_cv);
-
- if (hash_lock) {
- /*
- * Only call arc_access on anonymous buffers. This is because
- * if we've issued an I/O for an evicted buffer, we've already
- * called arc_access (to prevent any simultaneous readers from
- * getting confused).
- */
- if (zio->io_error == 0 && hdr->b_state == arc_anon)
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
- } else {
- /*
- * This block was freed while we waited for the read to
- * complete. It has been removed from the hash table and
- * moved to the anonymous state (so that it won't show up
- * in the cache).
- */
- ASSERT3P(hdr->b_state, ==, arc_anon);
- freeable = refcount_is_zero(&hdr->b_refcnt);
- }
-
- /* execute each callback and free its structure */
- while ((acb = callback_list) != NULL) {
- if (acb->acb_done)
- acb->acb_done(zio, acb->acb_buf, acb->acb_private);
-
- if (acb->acb_zio_dummy != NULL) {
- acb->acb_zio_dummy->io_error = zio->io_error;
- zio_nowait(acb->acb_zio_dummy);
- }
-
- callback_list = acb->acb_next;
- kmem_free(acb, sizeof (arc_callback_t));
- }
-
- if (freeable)
- arc_hdr_destroy(hdr);
-}
-
-/*
- * "Read" the block block at the specified DVA (in bp) via the
- * cache. If the block is found in the cache, invoke the provided
- * callback immediately and return. Note that the `zio' parameter
- * in the callback will be NULL in this case, since no IO was
- * required. If the block is not in the cache pass the read request
- * on to the spa with a substitute callback function, so that the
- * requested block will be added to the cache.
- *
- * If a read request arrives for a block that has a read in-progress,
- * either wait for the in-progress read to complete (and return the
- * results); or, if this is a read with a "done" func, add a record
- * to the read to invoke the "done" func when the read completes,
- * and return; or just return.
- *
- * arc_read_done() will invoke all the requested "done" functions
- * for readers of this block.
- */
-int
-arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
- arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t *arc_flags, zbookmark_t *zb)
-{
- arc_buf_hdr_t *hdr;
- arc_buf_t *buf;
- kmutex_t *hash_lock;
- zio_t *rzio;
-
-top:
- hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
- if (hdr && hdr->b_datacnt > 0) {
-
- *arc_flags |= ARC_CACHED;
-
- if (HDR_IO_IN_PROGRESS(hdr)) {
-
- if (*arc_flags & ARC_WAIT) {
- cv_wait(&hdr->b_cv, hash_lock);
- mutex_exit(hash_lock);
- goto top;
- }
- ASSERT(*arc_flags & ARC_NOWAIT);
-
- if (done) {
- arc_callback_t *acb = NULL;
-
- acb = kmem_zalloc(sizeof (arc_callback_t),
- KM_SLEEP);
- acb->acb_done = done;
- acb->acb_private = private;
- acb->acb_byteswap = swap;
- if (pio != NULL)
- acb->acb_zio_dummy = zio_null(pio,
- spa, NULL, NULL, flags);
-
- ASSERT(acb->acb_done != NULL);
- acb->acb_next = hdr->b_acb;
- hdr->b_acb = acb;
- add_reference(hdr, hash_lock, private);
- mutex_exit(hash_lock);
- return (0);
- }
- mutex_exit(hash_lock);
- return (0);
- }
-
- ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
-
- if (done) {
- add_reference(hdr, hash_lock, private);
- /*
- * If this block is already in use, create a new
- * copy of the data so that we will be guaranteed
- * that arc_release() will always succeed.
- */
- buf = hdr->b_buf;
- ASSERT(buf);
- ASSERT(buf->b_data);
- if (HDR_BUF_AVAILABLE(hdr)) {
- ASSERT(buf->b_efunc == NULL);
- hdr->b_flags &= ~ARC_BUF_AVAILABLE;
- } else {
- buf = arc_buf_clone(buf);
- }
- } else if (*arc_flags & ARC_PREFETCH &&
- refcount_count(&hdr->b_refcnt) == 0) {
- hdr->b_flags |= ARC_PREFETCH;
- }
- DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
- ARCSTAT_BUMP(arcstat_hits);
- ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
- demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
- data, metadata, hits);
-
- if (done)
- done(NULL, buf, private);
- } else {
- uint64_t size = BP_GET_LSIZE(bp);
- arc_callback_t *acb;
-
- if (hdr == NULL) {
- /* this block is not in the cache */
- arc_buf_hdr_t *exists;
- arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
- buf = arc_buf_alloc(spa, size, private, type);
- hdr = buf->b_hdr;
- hdr->b_dva = *BP_IDENTITY(bp);
- hdr->b_birth = bp->blk_birth;
- hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
- exists = buf_hash_insert(hdr, &hash_lock);
- if (exists) {
- /* somebody beat us to the hash insert */
- mutex_exit(hash_lock);
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
- (void) arc_buf_remove_ref(buf, private);
- goto top; /* restart the IO request */
- }
- /* if this is a prefetch, we don't have a reference */
- if (*arc_flags & ARC_PREFETCH) {
- (void) remove_reference(hdr, hash_lock,
- private);
- hdr->b_flags |= ARC_PREFETCH;
- }
- if (BP_GET_LEVEL(bp) > 0)
- hdr->b_flags |= ARC_INDIRECT;
- } else {
- /* this block is in the ghost cache */
- ASSERT(GHOST_STATE(hdr->b_state));
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
- ASSERT(hdr->b_buf == NULL);
-
- /* if this is a prefetch, we don't have a reference */
- if (*arc_flags & ARC_PREFETCH)
- hdr->b_flags |= ARC_PREFETCH;
- else
- add_reference(hdr, hash_lock, private);
- buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
- buf->b_hdr = hdr;
- buf->b_data = NULL;
- buf->b_efunc = NULL;
- buf->b_private = NULL;
- buf->b_next = NULL;
- hdr->b_buf = buf;
- arc_get_data_buf(buf);
- ASSERT(hdr->b_datacnt == 0);
- hdr->b_datacnt = 1;
-
- }
-
- acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
- acb->acb_done = done;
- acb->acb_private = private;
- acb->acb_byteswap = swap;
-
- ASSERT(hdr->b_acb == NULL);
- hdr->b_acb = acb;
- hdr->b_flags |= ARC_IO_IN_PROGRESS;
-
- /*
- * If the buffer has been evicted, migrate it to a present state
- * before issuing the I/O. Once we drop the hash-table lock,
- * the header will be marked as I/O in progress and have an
- * attached buffer. At this point, anybody who finds this
- * buffer ought to notice that it's legit but has a pending I/O.
- */
-
- if (GHOST_STATE(hdr->b_state))
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
-
- ASSERT3U(hdr->b_size, ==, size);
- DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
- zbookmark_t *, zb);
- ARCSTAT_BUMP(arcstat_misses);
- ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
- demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
- data, metadata, misses);
-
- rzio = zio_read(pio, spa, bp, buf->b_data, size,
- arc_read_done, buf, priority, flags, zb);
-
- if (*arc_flags & ARC_WAIT)
- return (zio_wait(rzio));
-
- ASSERT(*arc_flags & ARC_NOWAIT);
- zio_nowait(rzio);
- }
- return (0);
-}
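-
-/*
- * A minimal synchronous-read sketch using the generic arc_getbuf_func
- * callback above; error handling is abbreviated and the spa, bp and zb
- * variables are assumed to be in scope.
- */
-#if 0
- arc_buf_t *abuf = NULL;
- uint32_t aflags = ARC_WAIT;
-
- (void) arc_read(NULL, spa, bp, byteswap_uint64_array,
-     arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
-     ZIO_FLAG_CANFAIL, &aflags, &zb);
- if (abuf == NULL)
-     return (EIO); /* arc_getbuf_func saw an I/O error */
- /* ... use abuf->b_data ... */
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
-#endif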
-
-/*
- * arc_read() variant to support pool traversal. If the block is already
- * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
- * The idea is that we don't want pool traversal filling up memory, but
- * if the ARC already has the data anyway, we shouldn't pay for the I/O.
- */
-int
-arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
-{
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_mtx;
- int rc = 0;
-
- hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
-
- if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
- arc_buf_t *buf = hdr->b_buf;
-
- ASSERT(buf);
- while (buf->b_data == NULL) {
- buf = buf->b_next;
- ASSERT(buf);
- }
- bcopy(buf->b_data, data, hdr->b_size);
- } else {
- rc = ENOENT;
- }
-
- if (hash_mtx)
- mutex_exit(hash_mtx);
-
- return (rc);
-}
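-
-/*
- * A minimal sketch of the traversal pattern described above: try the
- * cache first and fall back to an explicit read only on ENOENT.
- */
-#if 0
- if (arc_tryread(spa, bp, data) == ENOENT) {
- /* miss: the traversal code issues its own read of bp into data */
- }
-#endif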
-
-void
-arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
-{
- ASSERT(buf->b_hdr != NULL);
- ASSERT(buf->b_hdr->b_state != arc_anon);
- ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
- buf->b_efunc = func;
- buf->b_private = private;
-}
-
-/*
- * This is used by the DMU to let the ARC know that a buffer is
- * being evicted, so the ARC should clean up. If this arc buf
- * is not yet in the evicted state, it will be put there.
- */
-int
-arc_buf_evict(arc_buf_t *buf)
-{
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock;
- arc_buf_t **bufp;
-
- mutex_enter(&arc_eviction_mtx);
- hdr = buf->b_hdr;
- if (hdr == NULL) {
- /*
- * We are in arc_do_user_evicts().
- */
- ASSERT(buf->b_data == NULL);
- mutex_exit(&arc_eviction_mtx);
- return (0);
- }
- hash_lock = HDR_LOCK(hdr);
- mutex_exit(&arc_eviction_mtx);
-
- mutex_enter(hash_lock);
-
- if (buf->b_data == NULL) {
- /*
- * We are on the eviction list.
- */
- mutex_exit(hash_lock);
- mutex_enter(&arc_eviction_mtx);
- if (buf->b_hdr == NULL) {
- /*
- * We are already in arc_do_user_evicts().
- */
- mutex_exit(&arc_eviction_mtx);
- return (0);
- } else {
- arc_buf_t copy = *buf; /* structure assignment */
- /*
- * Process this buffer now
- * but let arc_do_user_evicts() do the reaping.
- */
- buf->b_efunc = NULL;
- mutex_exit(&arc_eviction_mtx);
- VERIFY(copy.b_efunc(&copy) == 0);
- return (1);
- }
- }
-
- ASSERT(buf->b_hdr == hdr);
- ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
- ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
-
- /*
- * Pull this buffer off of the hdr
- */
- bufp = &hdr->b_buf;
- while (*bufp != buf)
- bufp = &(*bufp)->b_next;
- *bufp = buf->b_next;
-
- ASSERT(buf->b_data != NULL);
- arc_buf_destroy(buf, FALSE, FALSE);
-
- if (hdr->b_datacnt == 0) {
- arc_state_t *old_state = hdr->b_state;
- arc_state_t *evicted_state;
-
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
-
- evicted_state =
- (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
-
- mutex_enter(&old_state->arcs_mtx);
- mutex_enter(&evicted_state->arcs_mtx);
-
- arc_change_state(evicted_state, hdr, hash_lock);
- ASSERT(HDR_IN_HASH_TABLE(hdr));
- hdr->b_flags = ARC_IN_HASH_TABLE;
-
- mutex_exit(&evicted_state->arcs_mtx);
- mutex_exit(&old_state->arcs_mtx);
- }
- mutex_exit(hash_lock);
-
- VERIFY(buf->b_efunc(buf) == 0);
- buf->b_efunc = NULL;
- buf->b_private = NULL;
- buf->b_hdr = NULL;
- kmem_cache_free(buf_cache, buf);
- return (1);
-}
-
-/*
- * Release this buffer from the cache. This must be done
- * after a read and prior to modifying the buffer contents.
- * If the buffer has more than one reference, we must make
- * a new hdr for the buffer.
- */
-void
-arc_release(arc_buf_t *buf, void *tag)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock = HDR_LOCK(hdr);
-
- /* this buffer is not on any list */
- ASSERT(refcount_count(&hdr->b_refcnt) > 0);
-
- if (hdr->b_state == arc_anon) {
- /* this buffer is already released */
- ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
- ASSERT(BUF_EMPTY(hdr));
- ASSERT(buf->b_efunc == NULL);
- arc_buf_thaw(buf);
- return;
- }
-
- mutex_enter(hash_lock);
-
- /*
- * Do we have more than one buf?
- */
- if (hdr->b_buf != buf || buf->b_next != NULL) {
- arc_buf_hdr_t *nhdr;
- arc_buf_t **bufp;
- uint64_t blksz = hdr->b_size;
- spa_t *spa = hdr->b_spa;
- arc_buf_contents_t type = hdr->b_type;
-
- ASSERT(hdr->b_datacnt > 1);
- /*
- * Pull the data off of this buf and attach it to
- * a new anonymous buf.
- */
- (void) remove_reference(hdr, hash_lock, tag);
- bufp = &hdr->b_buf;
- while (*bufp != buf)
- bufp = &(*bufp)->b_next;
- *bufp = (*bufp)->b_next;
- buf->b_next = NULL;
-
- ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
- atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
- if (refcount_is_zero(&hdr->b_refcnt)) {
- ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size);
- atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size);
- }
- hdr->b_datacnt -= 1;
- arc_cksum_verify(buf);
-
- mutex_exit(hash_lock);
-
- nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
- nhdr->b_size = blksz;
- nhdr->b_spa = spa;
- nhdr->b_type = type;
- nhdr->b_buf = buf;
- nhdr->b_state = arc_anon;
- nhdr->b_arc_access = 0;
- nhdr->b_flags = 0;
- nhdr->b_datacnt = 1;
- nhdr->b_freeze_cksum = NULL;
- mutex_init(&nhdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
- (void) refcount_add(&nhdr->b_refcnt, tag);
- buf->b_hdr = nhdr;
- atomic_add_64(&arc_anon->arcs_size, blksz);
-
- hdr = nhdr;
- } else {
- ASSERT(refcount_count(&hdr->b_refcnt) == 1);
- ASSERT(!list_link_active(&hdr->b_arc_node));
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- arc_change_state(arc_anon, hdr, hash_lock);
- hdr->b_arc_access = 0;
- mutex_exit(hash_lock);
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
- arc_buf_thaw(buf);
- }
- buf->b_efunc = NULL;
- buf->b_private = NULL;
-}
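-
-/*
- * A minimal sketch of the read-modify-write pattern the comment above
- * describes; `tag' is the reference holder passed to arc_read(), and
- * `newdata' is an illustrative placeholder.
- */
-#if 0
- arc_release(buf, tag); /* detach buf from the on-disk block */
- /* buf is now anonymous; its contents may be modified freely */
- bcopy(newdata, buf->b_data, buf->b_hdr->b_size);
-#endif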
-
-int
-arc_released(arc_buf_t *buf)
-{
- return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
-}
-
-int
-arc_has_callback(arc_buf_t *buf)
-{
- return (buf->b_efunc != NULL);
-}
-
-#ifdef ZFS_DEBUG
-int
-arc_referenced(arc_buf_t *buf)
-{
- return (refcount_count(&buf->b_hdr->b_refcnt));
-}
-#endif
-
-static void
-arc_write_ready(zio_t *zio)
-{
- arc_write_callback_t *callback = zio->io_private;
- arc_buf_t *buf = callback->awcb_buf;
-
- if (callback->awcb_ready) {
- ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
- callback->awcb_ready(zio, buf, callback->awcb_private);
- }
- arc_cksum_compute(buf);
-}
-
-static void
-arc_write_done(zio_t *zio)
-{
- arc_write_callback_t *callback = zio->io_private;
- arc_buf_t *buf = callback->awcb_buf;
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- hdr->b_acb = NULL;
-
- /* this buffer is on no lists and is not in the hash table */
- ASSERT3P(hdr->b_state, ==, arc_anon);
-
- hdr->b_dva = *BP_IDENTITY(zio->io_bp);
- hdr->b_birth = zio->io_bp->blk_birth;
- hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
- /*
- * If the block to be written was all-zero, we may have
- * compressed it away. In this case no write was performed
- * so there will be no dva/birth-date/checksum. The buffer
- * must therefore remain anonymous (and uncached).
- */
- if (!BUF_EMPTY(hdr)) {
- arc_buf_hdr_t *exists;
- kmutex_t *hash_lock;
-
- arc_cksum_verify(buf);
-
- exists = buf_hash_insert(hdr, &hash_lock);
- if (exists) {
- /*
- * This can only happen if we overwrite for
- * sync-to-convergence, because we remove
- * buffers from the hash table when we arc_free().
- */
- ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
- BP_IDENTITY(zio->io_bp)));
- ASSERT3U(zio->io_bp_orig.blk_birth, ==,
- zio->io_bp->blk_birth);
-
- ASSERT(refcount_is_zero(&exists->b_refcnt));
- arc_change_state(arc_anon, exists, hash_lock);
- mutex_exit(hash_lock);
- arc_hdr_destroy(exists);
- exists = buf_hash_insert(hdr, &hash_lock);
- ASSERT3P(exists, ==, NULL);
- }
- hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
- } else if (callback->awcb_done == NULL) {
- int destroy_hdr;
- /*
- * This is an anonymous buffer with no user callback,
- * destroy it if there are no active references.
- */
- mutex_enter(&arc_eviction_mtx);
- destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
- hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- mutex_exit(&arc_eviction_mtx);
- if (destroy_hdr)
- arc_hdr_destroy(hdr);
- } else {
- hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- }
-
- if (callback->awcb_done) {
- ASSERT(!refcount_is_zero(&hdr->b_refcnt));
- callback->awcb_done(zio, buf, callback->awcb_private);
- }
-
- kmem_free(callback, sizeof (arc_write_callback_t));
-}
-
-zio_t *
-arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
- uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
- int flags, zbookmark_t *zb)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- arc_write_callback_t *callback;
- zio_t *zio;
-
- /* this is a private buffer - no locking required */
- ASSERT3P(hdr->b_state, ==, arc_anon);
- ASSERT(BUF_EMPTY(hdr));
- ASSERT(!HDR_IO_ERROR(hdr));
- ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
- ASSERT(hdr->b_acb == 0);
- callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
- callback->awcb_ready = ready;
- callback->awcb_done = done;
- callback->awcb_private = private;
- callback->awcb_buf = buf;
- hdr->b_flags |= ARC_IO_IN_PROGRESS;
- zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
- buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
- priority, flags, zb);
-
- return (zio);
-}
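-
-/*
- * A minimal sketch of issuing an asynchronous write of an anonymous
- * (released) buffer; the my_* names are illustrative placeholders.
- */
-#if 0
- zio_t *wzio;
-
- wzio = arc_write(pio, spa, checksum, compress, ncopies, txg, bp,
-     buf, my_write_ready, my_write_done, my_arg,
-     ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
- zio_nowait(wzio);
-#endif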
-
-int
-arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, uint32_t arc_flags)
-{
- arc_buf_hdr_t *ab;
- kmutex_t *hash_lock;
- zio_t *zio;
-
- /*
- * If this buffer is in the cache, release it, so it
- * can be re-used.
- */
- ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
- if (ab != NULL) {
- /*
- * The checksum of blocks to free is not always
- * preserved (eg. on the deadlist). However, if it is
- * nonzero, it should match what we have in the cache.
- */
- ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
- ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
- if (ab->b_state != arc_anon)
- arc_change_state(arc_anon, ab, hash_lock);
- if (HDR_IO_IN_PROGRESS(ab)) {
- /*
- * This should only happen when we prefetch.
- */
- ASSERT(ab->b_flags & ARC_PREFETCH);
- ASSERT3U(ab->b_datacnt, ==, 1);
- ab->b_flags |= ARC_FREED_IN_READ;
- if (HDR_IN_HASH_TABLE(ab))
- buf_hash_remove(ab);
- ab->b_arc_access = 0;
- bzero(&ab->b_dva, sizeof (dva_t));
- ab->b_birth = 0;
- ab->b_cksum0 = 0;
- ab->b_buf->b_efunc = NULL;
- ab->b_buf->b_private = NULL;
- mutex_exit(hash_lock);
- } else if (refcount_is_zero(&ab->b_refcnt)) {
- mutex_exit(hash_lock);
- arc_hdr_destroy(ab);
- ARCSTAT_BUMP(arcstat_deleted);
- } else {
- /*
- * We still have an active reference on this
- * buffer. This can happen, e.g., from
- * dbuf_unoverride().
- */
- ASSERT(!HDR_IN_HASH_TABLE(ab));
- ab->b_arc_access = 0;
- bzero(&ab->b_dva, sizeof (dva_t));
- ab->b_birth = 0;
- ab->b_cksum0 = 0;
- ab->b_buf->b_efunc = NULL;
- ab->b_buf->b_private = NULL;
- mutex_exit(hash_lock);
- }
- }
-
- zio = zio_free(pio, spa, txg, bp, done, private);
-
- if (arc_flags & ARC_WAIT)
- return (zio_wait(zio));
-
- ASSERT(arc_flags & ARC_NOWAIT);
- zio_nowait(zio);
-
- return (0);
-}
-
-void
-arc_tempreserve_clear(uint64_t tempreserve)
-{
- atomic_add_64(&arc_tempreserve, -tempreserve);
- ASSERT((int64_t)arc_tempreserve >= 0);
-}
-
-int
-arc_tempreserve_space(uint64_t tempreserve)
-{
-#ifdef ZFS_DEBUG
- /*
- * Once in a while, fail for no reason. Everything should cope.
- */
- if (spa_get_random(10000) == 0) {
- dprintf("forcing random failure\n");
- return (ERESTART);
- }
-#endif
- if (tempreserve > arc_c/4 && !arc_no_grow)
- arc_c = MIN(arc_c_max, tempreserve * 4);
- if (tempreserve > arc_c)
- return (ENOMEM);
-
- /*
- * Throttle writes when the amount of dirty data in the cache
- * gets too large. We try to keep the cache less than half full
- * of dirty blocks so that our sync times don't grow too large.
- * Note: if two requests come in concurrently, we might let them
- * both succeed, when one of them should fail. Not a huge deal.
- *
- * XXX The limit should be adjusted dynamically to keep the time
- * to sync a dataset fixed (around 1-5 seconds?).
- */
-
- if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
- arc_tempreserve + arc_anon->arcs_size > arc_c / 4) {
- dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
- "tempreserve=%lluK arc_c=%lluK\n",
- arc_tempreserve>>10, arc_anon->arcs_size>>10,
- tempreserve>>10, arc_c>>10);
- return (ERESTART);
- }
- atomic_add_64(&arc_tempreserve, tempreserve);
- return (0);
-}
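-
-/*
- * A worked example of the throttle above, assuming arc_c = 1000: a
- * request is pushed back with ERESTART once the proposed total of
- * tempreserve + arc_tempreserve + anonymous bytes exceeds 500 while
- * the already-committed arc_tempreserve + anonymous bytes exceed 250.
- */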
-
-static kmutex_t arc_lowmem_lock;
-#ifdef _KERNEL
-static eventhandler_tag arc_event_lowmem = NULL;
-
-static void
-arc_lowmem(void *arg __unused, int howto __unused)
-{
-
- /* Serialize access via arc_lowmem_lock. */
- mutex_enter(&arc_lowmem_lock);
- zfs_needfree = 1;
- cv_signal(&arc_reclaim_thr_cv);
- while (zfs_needfree)
- tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5);
- mutex_exit(&arc_lowmem_lock);
-}
-#endif
-
-void
-arc_init(void)
-{
- mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
-
- /* Convert seconds to clock ticks */
- arc_min_prefetch_lifespan = 1 * hz;
-
- /* Start out with 1/8 of all memory */
- arc_c = kmem_size() / 8;
-#if 0
-#ifdef _KERNEL
- /*
- * On architectures where the physical memory can be larger
- * than the addressable space (intel in 32-bit mode), we may
- * need to limit the cache to 1/8 of VM size.
- */
- arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
-#endif
-#endif
- /* set min cache to 1/32 of all memory, or 16MB, whichever is more */
- arc_c_min = MAX(arc_c / 4, 64<<18);
- /* set max to 5/8 of all memory, or all but 1GB, whichever is more */
- if (arc_c * 8 >= 1<<30)
- arc_c_max = (arc_c * 8) - (1<<30);
- else
- arc_c_max = arc_c_min;
- arc_c_max = MAX(arc_c * 5, arc_c_max);
-#ifdef _KERNEL
- /*
- * Allow the tunables to override our calculations if they are
- * reasonable (i.e. over 16MB)
- */
- if (zfs_arc_max >= 64<<18 && zfs_arc_max < kmem_size())
- arc_c_max = zfs_arc_max;
- if (zfs_arc_min >= 64<<18 && zfs_arc_min <= arc_c_max)
- arc_c_min = zfs_arc_min;
-#endif
- arc_c = arc_c_max;
- arc_p = (arc_c >> 1);
-
- /* if kmem_flags are set, lets try to use less memory */
- if (kmem_debugging())
- arc_c = arc_c / 2;
- if (arc_c < arc_c_min)
- arc_c = arc_c_min;
-
- zfs_arc_min = arc_c_min;
- zfs_arc_max = arc_c_max;
-
- arc_anon = &ARC_anon;
- arc_mru = &ARC_mru;
- arc_mru_ghost = &ARC_mru_ghost;
- arc_mfu = &ARC_mfu;
- arc_mfu_ghost = &ARC_mfu_ghost;
- arc_size = 0;
-
- mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-
- list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
-
- buf_init();
-
- arc_thread_exit = 0;
- arc_eviction_list = NULL;
- mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
- bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
-
- arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
- sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
-
- if (arc_ksp != NULL) {
- arc_ksp->ks_data = &arc_stats;
- kstat_install(arc_ksp);
- }
-
- (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
- TS_RUN, minclsyspri);
-
-#ifdef _KERNEL
- arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
- EVENTHANDLER_PRI_FIRST);
-#endif
-
- arc_dead = FALSE;
-
-#ifdef _KERNEL
- /* Warn about ZFS memory and address space requirements. */
- if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
- printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
- "expect unstable behavior.\n");
- }
- if (kmem_size() < 512 * (1 << 20)) {
- printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
- "expect unstable behavior.\n");
- printf(" Consider tuning vm.kmem_size and "
- "vm.kmem_size_max\n");
- printf(" in /boot/loader.conf.\n");
- }
-#endif
-}
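-
-/*
- * A worked sizing example for the defaults above, assuming
- * kmem_size() == 4GB, no tunables and no kmem debugging: arc_c starts
- * at 512MB, arc_c_min = MAX(128MB, 16MB) = 128MB,
- * arc_c_max = MAX(5 * 512MB, 4GB - 1GB) = 3GB, and arc_p starts at
- * half of arc_c, i.e. 1.5GB.
- */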
-
-void
-arc_fini(void)
-{
- mutex_enter(&arc_reclaim_thr_lock);
- arc_thread_exit = 1;
- cv_signal(&arc_reclaim_thr_cv);
- while (arc_thread_exit != 0)
- cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
- mutex_exit(&arc_reclaim_thr_lock);
-
- arc_flush();
-
- arc_dead = TRUE;
-
- if (arc_ksp != NULL) {
- kstat_delete(arc_ksp);
- arc_ksp = NULL;
- }
-
- mutex_destroy(&arc_eviction_mtx);
- mutex_destroy(&arc_reclaim_thr_lock);
- cv_destroy(&arc_reclaim_thr_cv);
-
- list_destroy(&arc_mru->arcs_list);
- list_destroy(&arc_mru_ghost->arcs_list);
- list_destroy(&arc_mfu->arcs_list);
- list_destroy(&arc_mfu_ghost->arcs_list);
-
- mutex_destroy(&arc_anon->arcs_mtx);
- mutex_destroy(&arc_mru->arcs_mtx);
- mutex_destroy(&arc_mru_ghost->arcs_mtx);
- mutex_destroy(&arc_mfu->arcs_mtx);
- mutex_destroy(&arc_mfu_ghost->arcs_mtx);
-
- buf_fini();
-
- mutex_destroy(&arc_lowmem_lock);
-#ifdef _KERNEL
- if (arc_event_lowmem != NULL)
- EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
-#endif
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c
deleted file mode 100644
index 4442b1f..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/bplist.h>
-#include <sys/zfs_context.h>
-
-static int
-bplist_hold(bplist_t *bpl)
-{
- ASSERT(MUTEX_HELD(&bpl->bpl_lock));
- if (bpl->bpl_dbuf == NULL) {
- int err = dmu_bonus_hold(bpl->bpl_mos,
- bpl->bpl_object, bpl, &bpl->bpl_dbuf);
- if (err)
- return (err);
- bpl->bpl_phys = bpl->bpl_dbuf->db_data;
- }
- return (0);
-}
-
-uint64_t
-bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
-{
- int size;
-
- size = spa_version(dmu_objset_spa(mos)) < ZFS_VERSION_BPLIST_ACCOUNT ?
- BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
-
- return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
- DMU_OT_BPLIST_HDR, size, tx));
-}
-
-void
-bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
-{
- VERIFY(dmu_object_free(mos, object, tx) == 0);
-}
-
-int
-bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
-{
- dmu_object_info_t doi;
- int err;
-
- err = dmu_object_info(mos, object, &doi);
- if (err)
- return (err);
-
- mutex_enter(&bpl->bpl_lock);
-
- ASSERT(bpl->bpl_dbuf == NULL);
- ASSERT(bpl->bpl_phys == NULL);
- ASSERT(bpl->bpl_cached_dbuf == NULL);
- ASSERT(bpl->bpl_queue == NULL);
- ASSERT(object != 0);
- ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
- ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);
-
- bpl->bpl_mos = mos;
- bpl->bpl_object = object;
- bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
- bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
- bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));
-
- mutex_exit(&bpl->bpl_lock);
- return (0);
-}
-
-void
-bplist_close(bplist_t *bpl)
-{
- mutex_enter(&bpl->bpl_lock);
-
- ASSERT(bpl->bpl_queue == NULL);
-
- if (bpl->bpl_cached_dbuf) {
- dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
- bpl->bpl_cached_dbuf = NULL;
- }
- if (bpl->bpl_dbuf) {
- dmu_buf_rele(bpl->bpl_dbuf, bpl);
- bpl->bpl_dbuf = NULL;
- bpl->bpl_phys = NULL;
- }
-
- mutex_exit(&bpl->bpl_lock);
-}
-
-boolean_t
-bplist_empty(bplist_t *bpl)
-{
- boolean_t rv;
-
- if (bpl->bpl_object == 0)
- return (B_TRUE);
-
- mutex_enter(&bpl->bpl_lock);
- VERIFY(0 == bplist_hold(bpl)); /* XXX */
- rv = (bpl->bpl_phys->bpl_entries == 0);
- mutex_exit(&bpl->bpl_lock);
-
- return (rv);
-}
-
-static int
-bplist_cache(bplist_t *bpl, uint64_t blkid)
-{
- int err = 0;
-
- if (bpl->bpl_cached_dbuf == NULL ||
- bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
- if (bpl->bpl_cached_dbuf != NULL)
- dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
- err = dmu_buf_hold(bpl->bpl_mos,
- bpl->bpl_object, blkid << bpl->bpl_blockshift,
- bpl, &bpl->bpl_cached_dbuf);
- ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
- 1ULL << bpl->bpl_blockshift);
- }
- return (err);
-}
-
-int
-bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
-{
- uint64_t blk, off;
- blkptr_t *bparray;
- int err;
-
- mutex_enter(&bpl->bpl_lock);
-
- err = bplist_hold(bpl);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- if (*itorp >= bpl->bpl_phys->bpl_entries) {
- mutex_exit(&bpl->bpl_lock);
- return (ENOENT);
- }
-
- blk = *itorp >> bpl->bpl_bpshift;
- off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
-
- err = bplist_cache(bpl, blk);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- bparray = bpl->bpl_cached_dbuf->db_data;
- *bp = bparray[off];
- (*itorp)++;
- mutex_exit(&bpl->bpl_lock);
- return (0);
-}
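-
-/*
- * A minimal sketch of the iteration pattern, as used by bplist_space()
- * below: the cursor starts at zero and ENOENT marks the end.
- */
-#if 0
- uint64_t itor = 0;
- blkptr_t bp;
-
- while (bplist_iterate(bpl, &itor, &bp) == 0) {
- /* ... process bp ... */
- }
-#endif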
-
-int
-bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
-{
- uint64_t blk, off;
- blkptr_t *bparray;
- int err;
-
- ASSERT(!BP_IS_HOLE(bp));
- mutex_enter(&bpl->bpl_lock);
- err = bplist_hold(bpl);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
- off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
-
- err = bplist_cache(bpl, blk);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
- bparray = bpl->bpl_cached_dbuf->db_data;
- bparray[off] = *bp;
-
- /* We never need the fill count. */
- bparray[off].blk_fill = 0;
-
- /* The bplist will compress better if we can leave off the checksum */
- bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
-
- dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
- bpl->bpl_phys->bpl_entries++;
- bpl->bpl_phys->bpl_bytes +=
- bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp);
- if (bpl->bpl_havecomp) {
- bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
- bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
- }
- mutex_exit(&bpl->bpl_lock);
-
- return (0);
-}
-
-/*
- * Deferred entry; will be written later by bplist_sync().
- */
-void
-bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp)
-{
- bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
-
- ASSERT(!BP_IS_HOLE(bp));
- mutex_enter(&bpl->bpl_lock);
- bpq->bpq_blk = *bp;
- bpq->bpq_next = bpl->bpl_queue;
- bpl->bpl_queue = bpq;
- mutex_exit(&bpl->bpl_lock);
-}
-
-void
-bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
-{
- bplist_q_t *bpq;
-
- mutex_enter(&bpl->bpl_lock);
- while ((bpq = bpl->bpl_queue) != NULL) {
- bpl->bpl_queue = bpq->bpq_next;
- mutex_exit(&bpl->bpl_lock);
- VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
- kmem_free(bpq, sizeof (*bpq));
- mutex_enter(&bpl->bpl_lock);
- }
- mutex_exit(&bpl->bpl_lock);
-}
-
-void
-bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
-{
- mutex_enter(&bpl->bpl_lock);
- ASSERT3P(bpl->bpl_queue, ==, NULL);
- VERIFY(0 == bplist_hold(bpl));
- dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
- VERIFY(0 == dmu_free_range(bpl->bpl_mos,
- bpl->bpl_object, 0, -1ULL, tx));
- bpl->bpl_phys->bpl_entries = 0;
- bpl->bpl_phys->bpl_bytes = 0;
- if (bpl->bpl_havecomp) {
- bpl->bpl_phys->bpl_comp = 0;
- bpl->bpl_phys->bpl_uncomp = 0;
- }
- mutex_exit(&bpl->bpl_lock);
-}
-
-int
-bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
- uint64_t itor = 0, comp = 0, uncomp = 0;
- int err;
- blkptr_t bp;
-
- mutex_enter(&bpl->bpl_lock);
-
- err = bplist_hold(bpl);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- *usedp = bpl->bpl_phys->bpl_bytes;
- if (bpl->bpl_havecomp) {
- *compp = bpl->bpl_phys->bpl_comp;
- *uncompp = bpl->bpl_phys->bpl_uncomp;
- }
- mutex_exit(&bpl->bpl_lock);
-
- if (!bpl->bpl_havecomp) {
- while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
- comp += BP_GET_PSIZE(&bp);
- uncomp += BP_GET_UCSIZE(&bp);
- }
- if (err == ENOENT)
- err = 0;
- *compp = comp;
- *uncompp = uncomp;
- }
-
- return (err);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
deleted file mode 100644
index 94c6308..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ /dev/null
@@ -1,2247 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dbuf.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dmu_tx.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu_zfetch.h>
-
-static void dbuf_destroy(dmu_buf_impl_t *db);
-static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
- int compress, dmu_tx_t *tx);
-static arc_done_func_t dbuf_write_ready;
-static arc_done_func_t dbuf_write_done;
-
-int zfs_mdcomp_disable = 0;
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
- &zfs_mdcomp_disable, 0, "Disable metadata compression");
-
-/*
- * Global data structures and functions for the dbuf cache.
- */
-static kmem_cache_t *dbuf_cache;
-
-/* ARGSUSED */
-static int
-dbuf_cons(void *vdb, void *unused, int kmflag)
-{
- dmu_buf_impl_t *db = vdb;
- bzero(db, sizeof (dmu_buf_impl_t));
-
- mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
- refcount_create(&db->db_holds);
- return (0);
-}
-
-/* ARGSUSED */
-static void
-dbuf_dest(void *vdb, void *unused)
-{
- dmu_buf_impl_t *db = vdb;
- mutex_destroy(&db->db_mtx);
- cv_destroy(&db->db_changed);
- refcount_destroy(&db->db_holds);
-}
-
-/*
- * dbuf hash table routines
- */
-static dbuf_hash_table_t dbuf_hash_table;
-
-static uint64_t dbuf_hash_count;
-
-static uint64_t
-dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
-{
- uintptr_t osv = (uintptr_t)os;
- uint64_t crc = -1ULL;
-
- ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
-
- crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
-
- return (crc);
-}
-
-#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
-
-#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
- ((dbuf)->db.db_object == (obj) && \
- (dbuf)->db_objset == (os) && \
- (dbuf)->db_level == (level) && \
- (dbuf)->db_blkid == (blkid))
-
-dmu_buf_impl_t *
-dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- objset_impl_t *os = dn->dn_objset;
- uint64_t obj = dn->dn_object;
- uint64_t hv = DBUF_HASH(os, obj, level, blkid);
- uint64_t idx = hv & h->hash_table_mask;
- dmu_buf_impl_t *db;
-
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
- for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
- if (DBUF_EQUAL(db, os, obj, level, blkid)) {
- mutex_enter(&db->db_mtx);
- if (db->db_state != DB_EVICTING) {
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- return (db);
- }
- mutex_exit(&db->db_mtx);
- }
- }
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- return (NULL);
-}
-
-/*
- * Insert an entry into the hash table. If there is already an element
- * equal to elem in the hash table, then the already existing element
- * will be returned and the new element will not be inserted.
- * Otherwise returns NULL.
- */
-static dmu_buf_impl_t *
-dbuf_hash_insert(dmu_buf_impl_t *db)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- objset_impl_t *os = db->db_objset;
- uint64_t obj = db->db.db_object;
- int level = db->db_level;
- uint64_t blkid = db->db_blkid;
- uint64_t hv = DBUF_HASH(os, obj, level, blkid);
- uint64_t idx = hv & h->hash_table_mask;
- dmu_buf_impl_t *dbf;
-
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
- for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
- if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
- mutex_enter(&dbf->db_mtx);
- if (dbf->db_state != DB_EVICTING) {
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- return (dbf);
- }
- mutex_exit(&dbf->db_mtx);
- }
- }
-
- mutex_enter(&db->db_mtx);
- db->db_hash_next = h->hash_table[idx];
- h->hash_table[idx] = db;
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- atomic_add_64(&dbuf_hash_count, 1);
-
- return (NULL);
-}
-
-/*
- * Remove an entry from the hash table. This operation will
- * fail if there are any existing holds on the db.
- */
-static void
-dbuf_hash_remove(dmu_buf_impl_t *db)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
- db->db_level, db->db_blkid);
- uint64_t idx = hv & h->hash_table_mask;
- dmu_buf_impl_t *dbf, **dbp;
-
- /*
- * We mustn't hold db_mtx, to maintain lock ordering:
- * DBUF_HASH_MUTEX > db_mtx.
- */
- ASSERT(refcount_is_zero(&db->db_holds));
- ASSERT(db->db_state == DB_EVICTING);
- ASSERT(!MUTEX_HELD(&db->db_mtx));
-
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
- dbp = &h->hash_table[idx];
- while ((dbf = *dbp) != db) {
- dbp = &dbf->db_hash_next;
- ASSERT(dbf != NULL);
- }
- *dbp = db->db_hash_next;
- db->db_hash_next = NULL;
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- atomic_add_64(&dbuf_hash_count, -1);
-}
-
-static arc_evict_func_t dbuf_do_evict;
-
-static void
-dbuf_evict_user(dmu_buf_impl_t *db)
-{
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- if (db->db_level != 0 || db->db_evict_func == NULL)
- return;
-
- if (db->db_user_data_ptr_ptr)
- *db->db_user_data_ptr_ptr = db->db.db_data;
- db->db_evict_func(&db->db, db->db_user_ptr);
- db->db_user_ptr = NULL;
- db->db_user_data_ptr_ptr = NULL;
- db->db_evict_func = NULL;
-}
-
-void
-dbuf_evict(dmu_buf_impl_t *db)
-{
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db_data_pending == NULL);
-
- dbuf_clear(db);
- dbuf_destroy(db);
-}
-
-void
-dbuf_init(void)
-{
- uint64_t hsize = 1ULL << 16;
- dbuf_hash_table_t *h = &dbuf_hash_table;
- int i;
-
- /*
- * The hash table is big enough to fill all of physical memory
- * with an average 4K block size. The table will take up
- * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
- */
- while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
- hsize <<= 1;
-
-retry:
- h->hash_table_mask = hsize - 1;
- h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
- if (h->hash_table == NULL) {
- /* XXX - we should really return an error instead of assert */
- ASSERT(hsize > (1ULL << 10));
- hsize >>= 1;
- goto retry;
- }
-
- dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
- sizeof (dmu_buf_impl_t),
- 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
-
- for (i = 0; i < DBUF_MUTEXES; i++)
- mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
-}
-
-void
-dbuf_fini(void)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- int i;
-
- for (i = 0; i < DBUF_MUTEXES; i++)
- mutex_destroy(&h->hash_mutexes[i]);
- kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
- kmem_cache_destroy(dbuf_cache);
-}
-
-/*
- * Other stuff.
- */
-
-#ifdef ZFS_DEBUG
-static void
-dbuf_verify(dmu_buf_impl_t *db)
-{
- dnode_t *dn = db->db_dnode;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
- return;
-
- ASSERT(db->db_objset != NULL);
- if (dn == NULL) {
- ASSERT(db->db_parent == NULL);
- ASSERT(db->db_blkptr == NULL);
- } else {
- ASSERT3U(db->db.db_object, ==, dn->dn_object);
- ASSERT3P(db->db_objset, ==, dn->dn_objset);
- ASSERT3U(db->db_level, <, dn->dn_nlevels);
- ASSERT(db->db_blkid == DB_BONUS_BLKID ||
- list_head(&dn->dn_dbufs));
- }
- if (db->db_blkid == DB_BONUS_BLKID) {
- ASSERT(dn != NULL);
- ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
- ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
- } else {
- ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
- }
-
- if (db->db_level == 0) {
- /* we can be momentarily larger in dnode_set_blksz() */
- if (db->db_blkid != DB_BONUS_BLKID && dn) {
- ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
- }
- if (db->db.db_object == DMU_META_DNODE_OBJECT) {
- dbuf_dirty_record_t *dr = db->db_data_pending;
- /*
- * it should only be modified in syncing
- * context, so make sure we only have
- * one copy of the data.
- */
- ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
- }
- }
-
- /* verify db->db_blkptr */
- if (db->db_blkptr) {
- if (db->db_parent == dn->dn_dbuf) {
- /* db is pointed to by the dnode */
- /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
- if (db->db.db_object == DMU_META_DNODE_OBJECT)
- ASSERT(db->db_parent == NULL);
- else
- ASSERT(db->db_parent != NULL);
- ASSERT3P(db->db_blkptr, ==,
- &dn->dn_phys->dn_blkptr[db->db_blkid]);
- } else {
- /* db is pointed to by an indirect block */
- int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
- ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
- ASSERT3U(db->db_parent->db.db_object, ==,
- db->db.db_object);
- /*
- * dnode_grow_indblksz() can make this fail if we don't
- * have the struct_rwlock. XXX indblksz no longer
- * grows. safe to do this now?
- */
- if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
- ASSERT3P(db->db_blkptr, ==,
- ((blkptr_t *)db->db_parent->db.db_data +
- db->db_blkid % epb));
- }
- }
- }
- if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
- db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
- db->db_state != DB_FILL && !dn->dn_free_txg) {
- /*
- * If the blkptr isn't set but they have nonzero data,
- * it had better be dirty, otherwise we'll lose that
- * data when we evict this buffer.
- */
- if (db->db_dirtycnt == 0) {
- uint64_t *buf = db->db.db_data;
- int i;
-
- for (i = 0; i < db->db.db_size >> 3; i++) {
- ASSERT(buf[i] == 0);
- }
- }
- }
-}
-#endif
-
-static void
-dbuf_update_data(dmu_buf_impl_t *db)
-{
- ASSERT(MUTEX_HELD(&db->db_mtx));
- if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
- ASSERT(!refcount_is_zero(&db->db_holds));
- *db->db_user_data_ptr_ptr = db->db.db_data;
- }
-}
-
-static void
-dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
-{
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
- db->db_buf = buf;
- if (buf != NULL) {
- ASSERT(buf->b_data != NULL);
- db->db.db_data = buf->b_data;
- if (!arc_released(buf))
- arc_set_callback(buf, dbuf_do_evict, db);
- dbuf_update_data(db);
- } else {
- dbuf_evict_user(db);
- db->db.db_data = NULL;
- db->db_state = DB_UNCACHED;
- }
-}
-
-uint64_t
-dbuf_whichblock(dnode_t *dn, uint64_t offset)
-{
- if (dn->dn_datablkshift) {
- return (offset >> dn->dn_datablkshift);
- } else {
- ASSERT3U(offset, <, dn->dn_datablksz);
- return (0);
- }
-}
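-
-/*
- * A worked example for dbuf_whichblock(), assuming a 128K data block
- * size (dn_datablkshift == 17): offset 300000 maps to block
- * 300000 >> 17 == 2.
- */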
-
-static void
-dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
-{
- dmu_buf_impl_t *db = vdb;
-
- mutex_enter(&db->db_mtx);
- ASSERT3U(db->db_state, ==, DB_READ);
- /*
- * All reads are synchronous, so we must have a hold on the dbuf
- */
- ASSERT(refcount_count(&db->db_holds) > 0);
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db.db_data == NULL);
- if (db->db_level == 0 && db->db_freed_in_flight) {
- /* we were freed in flight; disregard any error */
- arc_release(buf, db);
- bzero(buf->b_data, db->db.db_size);
- arc_buf_freeze(buf);
- db->db_freed_in_flight = FALSE;
- dbuf_set_data(db, buf);
- db->db_state = DB_CACHED;
- } else if (zio == NULL || zio->io_error == 0) {
- dbuf_set_data(db, buf);
- db->db_state = DB_CACHED;
- } else {
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
- ASSERT3P(db->db_buf, ==, NULL);
- VERIFY(arc_buf_remove_ref(buf, db) == 1);
- db->db_state = DB_UNCACHED;
- }
- cv_broadcast(&db->db_changed);
- mutex_exit(&db->db_mtx);
- dbuf_rele(db, NULL);
-}
-
-static void
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
-{
- blkptr_t *bp;
- zbookmark_t zb;
- uint32_t aflags = ARC_NOWAIT;
-
- ASSERT(!refcount_is_zero(&db->db_holds));
- /* We need the struct_rwlock to prevent db_blkptr from changing. */
- ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_state == DB_UNCACHED);
- ASSERT(db->db_buf == NULL);
-
- if (db->db_blkid == DB_BONUS_BLKID) {
- ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
- db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- if (db->db.db_size < DN_MAX_BONUSLEN)
- bzero(db->db.db_data, DN_MAX_BONUSLEN);
- bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
- db->db.db_size);
- dbuf_update_data(db);
- db->db_state = DB_CACHED;
- mutex_exit(&db->db_mtx);
- return;
- }
-
- if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
- bp = NULL;
- else
- bp = db->db_blkptr;
-
- if (bp == NULL)
- dprintf_dbuf(db, "blkptr: %s\n", "NULL");
- else
- dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
-
- if (bp == NULL || BP_IS_HOLE(bp)) {
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-
- ASSERT(bp == NULL || BP_IS_HOLE(bp));
- dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- db->db.db_size, db, type));
- bzero(db->db.db_data, db->db.db_size);
- db->db_state = DB_CACHED;
- *flags |= DB_RF_CACHED;
- mutex_exit(&db->db_mtx);
- return;
- }
-
- db->db_state = DB_READ;
- mutex_exit(&db->db_mtx);
-
- zb.zb_objset = db->db_objset->os_dsl_dataset ?
- db->db_objset->os_dsl_dataset->ds_object : 0;
- zb.zb_object = db->db.db_object;
- zb.zb_level = db->db_level;
- zb.zb_blkid = db->db_blkid;
-
- dbuf_add_ref(db, NULL);
- /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
- ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES);
- (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
- db->db_level > 0 ? byteswap_uint64_array :
- dmu_ot[db->db_dnode->dn_type].ot_byteswap,
- dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
- (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
- &aflags, &zb);
- if (aflags & ARC_CACHED)
- *flags |= DB_RF_CACHED;
-}
-
-int
-dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
-{
- int err = 0;
- int havepzio = (zio != NULL);
- int prefetch;
-
- /*
- * We don't have to hold the mutex to check db_state because it
- * can't be freed while we have a hold on the buffer.
- */
- ASSERT(!refcount_is_zero(&db->db_holds));
-
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
-
- prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;
-
- mutex_enter(&db->db_mtx);
- if (db->db_state == DB_CACHED) {
- mutex_exit(&db->db_mtx);
- if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
- db->db.db_size, TRUE);
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
- } else if (db->db_state == DB_UNCACHED) {
- if (zio == NULL) {
- zio = zio_root(db->db_dnode->dn_objset->os_spa,
- NULL, NULL, ZIO_FLAG_CANFAIL);
- }
- dbuf_read_impl(db, zio, &flags);
-
- /* dbuf_read_impl has dropped db_mtx for us */
-
- if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
- db->db.db_size, flags & DB_RF_CACHED);
-
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
-
- if (!havepzio)
- err = zio_wait(zio);
- } else {
- mutex_exit(&db->db_mtx);
- if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
- db->db.db_size, TRUE);
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
-
- mutex_enter(&db->db_mtx);
- if ((flags & DB_RF_NEVERWAIT) == 0) {
- while (db->db_state == DB_READ ||
- db->db_state == DB_FILL) {
- ASSERT(db->db_state == DB_READ ||
- (flags & DB_RF_HAVESTRUCT) == 0);
- cv_wait(&db->db_changed, &db->db_mtx);
- }
- if (db->db_state == DB_UNCACHED)
- err = EIO;
- }
- mutex_exit(&db->db_mtx);
- }
-
- ASSERT(err || havepzio || db->db_state == DB_CACHED);
- return (err);
-}
-
-static void
-dbuf_noread(dmu_buf_impl_t *db)
-{
- ASSERT(!refcount_is_zero(&db->db_holds));
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
- mutex_enter(&db->db_mtx);
- while (db->db_state == DB_READ || db->db_state == DB_FILL)
- cv_wait(&db->db_changed, &db->db_mtx);
- if (db->db_state == DB_UNCACHED) {
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db.db_data == NULL);
- dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- db->db.db_size, db, type));
- db->db_state = DB_FILL;
- } else {
- ASSERT3U(db->db_state, ==, DB_CACHED);
- }
- mutex_exit(&db->db_mtx);
-}
-
-/*
- * This is our just-in-time copy function. It makes a copy of
- * buffers that have been modified in a previous transaction
- * group, before we modify them in the current active group.
- *
- * This function is used in two places: when we are dirtying a
- * buffer for the first time in a txg, and when we are freeing
- * a range in a dnode that includes this buffer.
- *
- * Note that when we are called from dbuf_free_range() we do
- * not put a hold on the buffer; we just traverse the active
- * dbuf list for the dnode.
- */
-static void
-dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
-{
- dbuf_dirty_record_t *dr = db->db_last_dirty;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db.db_data != NULL);
- ASSERT(db->db_level == 0);
- ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
-
- if (dr == NULL ||
- (dr->dt.dl.dr_data !=
- ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
- return;
-
- /*
- * If the last dirty record for this dbuf has not yet synced
-	 * and it's referencing the dbuf data, either:
-	 * reset the reference to point to a new copy,
-	 * or (if there are no active holders)
- * just null out the current db_data pointer.
- */
- ASSERT(dr->dr_txg >= txg - 2);
- if (db->db_blkid == DB_BONUS_BLKID) {
- /* Note that the data bufs here are zio_bufs */
- dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
- } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
- int size = db->db.db_size;
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- dr->dt.dl.dr_data = arc_buf_alloc(
- db->db_dnode->dn_objset->os_spa, size, db, type);
- bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
- } else {
- dbuf_set_data(db, NULL);
- }
-}
-
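
The hold-count test in dbuf_fix_old_data() above decides between copying and disowning: if anything beyond the dirty records still holds the buffer, the unsynced record gets its own copy; otherwise the current pointer can simply be dropped. A self-contained sketch of that decision (simplified types, allocation-failure handling omitted; this is not ZFS code):

#include <stdlib.h>
#include <string.h>

struct dirty_rec { void *dr_data; };

struct buf {
	void	*data;
	size_t	size;
	int	holds;		/* active references */
	int	dirtycnt;	/* references owned by dirty records */
};

/* Give an unsynced dirty record a private copy before the caller
 * modifies buf->data on behalf of a newer txg. */
static void
fix_old_data(struct buf *b, struct dirty_rec *dr)
{
	if (dr->dr_data != b->data)
		return;			/* record already has its own copy */
	if (b->holds > b->dirtycnt) {
		dr->dr_data = malloc(b->size);
		memcpy(dr->dr_data, b->data, b->size);
	} else {
		b->data = NULL;		/* no other users; just disown it */
	}
}
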
-void
-dbuf_unoverride(dbuf_dirty_record_t *dr)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
- uint64_t txg = dr->dr_txg;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
- ASSERT(db->db_level == 0);
-
- if (db->db_blkid == DB_BONUS_BLKID ||
- dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
- return;
-
- /* free this block */
- if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
- /* XXX can get silent EIO here */
- (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
- txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
- }
- dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
- /*
- * Release the already-written buffer, so we leave it in
- * a consistent dirty state. Note that all callers are
- * modifying the buffer, so they will immediately do
- * another (redundant) arc_release(). Therefore, leave
- * the buf thawed to save the effort of freezing &
- * immediately re-thawing it.
- */
- arc_release(dr->dt.dl.dr_data, db);
-}
-
-void
-dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db, *db_next;
- uint64_t txg = tx->tx_txg;
-
- dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
- mutex_enter(&dn->dn_dbufs_mtx);
- for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
- db_next = list_next(&dn->dn_dbufs, db);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
- if (db->db_level != 0)
- continue;
- dprintf_dbuf(db, "found buf %s\n", "");
- if (db->db_blkid < blkid ||
- db->db_blkid >= blkid+nblks)
- continue;
-
- /* found a level 0 buffer in the range */
- if (dbuf_undirty(db, tx))
- continue;
-
- mutex_enter(&db->db_mtx);
- if (db->db_state == DB_UNCACHED ||
- db->db_state == DB_EVICTING) {
- ASSERT(db->db.db_data == NULL);
- mutex_exit(&db->db_mtx);
- continue;
- }
- if (db->db_state == DB_READ || db->db_state == DB_FILL) {
- /* will be handled in dbuf_read_done or dbuf_rele */
- db->db_freed_in_flight = TRUE;
- mutex_exit(&db->db_mtx);
- continue;
- }
- if (refcount_count(&db->db_holds) == 0) {
- ASSERT(db->db_buf);
- dbuf_clear(db);
- continue;
- }
- /* The dbuf is referenced */
-
- if (db->db_last_dirty != NULL) {
- dbuf_dirty_record_t *dr = db->db_last_dirty;
-
- if (dr->dr_txg == txg) {
- /*
- * This buffer is "in-use", re-adjust the file
- * size to reflect that this buffer may
- * contain new data when we sync.
- */
- if (db->db_blkid > dn->dn_maxblkid)
- dn->dn_maxblkid = db->db_blkid;
- dbuf_unoverride(dr);
- } else {
- /*
- * This dbuf is not dirty in the open context.
-	 * Either uncache it (if it's not referenced in
- * the open context) or reset its contents to
- * empty.
- */
- dbuf_fix_old_data(db, txg);
- }
- }
-	/* clear the contents if it's cached */
- if (db->db_state == DB_CACHED) {
- ASSERT(db->db.db_data != NULL);
- arc_release(db->db_buf, db);
- bzero(db->db.db_data, db->db.db_size);
- arc_buf_freeze(db->db_buf);
- }
-
- mutex_exit(&db->db_mtx);
- }
- mutex_exit(&dn->dn_dbufs_mtx);
-}
-
-static int
-dbuf_new_block(dmu_buf_impl_t *db)
-{
- dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
- uint64_t birth_txg = 0;
-
- /* Don't count meta-objects */
- if (ds == NULL)
- return (FALSE);
-
- /*
- * We don't need any locking to protect db_blkptr:
- * If it's syncing, then db_last_dirty will be set
- * so we'll ignore db_blkptr.
- */
- ASSERT(MUTEX_HELD(&db->db_mtx));
-	/* If we have been dirtied since the last snapshot, it's not new */
- if (db->db_last_dirty)
- birth_txg = db->db_last_dirty->dr_txg;
- else if (db->db_blkptr)
- birth_txg = db->db_blkptr->blk_birth;
-
- if (birth_txg)
- return (!dsl_dataset_block_freeable(ds, birth_txg));
- else
- return (TRUE);
-}
-
-void
-dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
-{
- arc_buf_t *buf, *obuf;
- int osize = db->db.db_size;
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
-
- /* XXX does *this* func really need the lock? */
- ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
-
- /*
- * This call to dbuf_will_dirty() with the dn_struct_rwlock held
- * is OK, because there can be no other references to the db
- * when we are changing its size, so no concurrent DB_FILL can
- * be happening.
- */
- /*
- * XXX we should be doing a dbuf_read, checking the return
- * value and returning that up to our callers
- */
- dbuf_will_dirty(db, tx);
-
- /* create the data buffer for the new block */
- buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
-
- /* copy old block data to the new block */
- obuf = db->db_buf;
- bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
- /* zero the remainder */
- if (size > osize)
- bzero((uint8_t *)buf->b_data + osize, size - osize);
-
- mutex_enter(&db->db_mtx);
- dbuf_set_data(db, buf);
- VERIFY(arc_buf_remove_ref(obuf, db) == 1);
- db->db.db_size = size;
-
- if (db->db_level == 0) {
- ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
- db->db_last_dirty->dt.dl.dr_data = buf;
- }
- mutex_exit(&db->db_mtx);
-
- dnode_willuse_space(db->db_dnode, size-osize, tx);
-}
-
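
dbuf_new_size() above is copy-and-swap: allocate the new block, copy the overlapping prefix, and zero any grown tail so stale data never shows through. The core arithmetic, stripped of the ARC and locking concerns (an illustrative sketch only):

#include <stdlib.h>
#include <string.h>

/* Return a new block of nsize bytes holding the first
 * min(osize, nsize) bytes of old, with any grown tail zeroed. */
static void *
resize_block(const void *old, size_t osize, size_t nsize)
{
	unsigned char *nbuf = malloc(nsize);
	size_t keep = (osize < nsize) ? osize : nsize;

	if (nbuf == NULL)
		return (NULL);
	memcpy(nbuf, old, keep);
	if (nsize > osize)
		memset(nbuf + osize, 0, nsize - osize);
	return (nbuf);
}
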
-dbuf_dirty_record_t *
-dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
- dbuf_dirty_record_t **drp, *dr;
- int drop_struct_lock = FALSE;
- int txgoff = tx->tx_txg & TXG_MASK;
-
- ASSERT(tx->tx_txg != 0);
- ASSERT(!refcount_is_zero(&db->db_holds));
- DMU_TX_DIRTY_BUF(tx, db);
-
- /*
- * Shouldn't dirty a regular buffer in syncing context. Private
- * objects may be dirtied in syncing context, but only if they
- * were already pre-dirtied in open context.
- * XXX We may want to prohibit dirtying in syncing context even
- * if they did pre-dirty.
- */
- ASSERT(!dmu_tx_is_syncing(tx) ||
- BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
- dn->dn_object == DMU_META_DNODE_OBJECT ||
- dn->dn_objset->os_dsl_dataset == NULL ||
- dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));
-
- /*
- * We make this assert for private objects as well, but after we
- * check if we're already dirty. They are allowed to re-dirty
- * in syncing context.
- */
- ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
- dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
- (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
-
- mutex_enter(&db->db_mtx);
- /*
- * XXX make this true for indirects too? The problem is that
- * transactions created with dmu_tx_create_assigned() from
- * syncing context don't bother holding ahead.
- */
- ASSERT(db->db_level != 0 ||
- db->db_state == DB_CACHED || db->db_state == DB_FILL);
-
- mutex_enter(&dn->dn_mtx);
- /*
- * Don't set dirtyctx to SYNC if we're just modifying this as we
- * initialize the objset.
- */
- if (dn->dn_dirtyctx == DN_UNDIRTIED &&
- !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
- dn->dn_dirtyctx =
- (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
- ASSERT(dn->dn_dirtyctx_firstset == NULL);
- dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
- }
- mutex_exit(&dn->dn_mtx);
-
- /*
- * If this buffer is already dirty, we're done.
- */
- drp = &db->db_last_dirty;
- ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
- db->db.db_object == DMU_META_DNODE_OBJECT);
- while (*drp && (*drp)->dr_txg > tx->tx_txg)
- drp = &(*drp)->dr_next;
- if (*drp && (*drp)->dr_txg == tx->tx_txg) {
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
- /*
- * If this buffer has already been written out,
- * we now need to reset its state.
- */
- dbuf_unoverride(*drp);
- if (db->db.db_object != DMU_META_DNODE_OBJECT)
- arc_buf_thaw(db->db_buf);
- }
- mutex_exit(&db->db_mtx);
- return (*drp);
- }
-
- /*
- * Only valid if not already dirty.
- */
- ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
- (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
-
- ASSERT3U(dn->dn_nlevels, >, db->db_level);
- ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
- dn->dn_phys->dn_nlevels > db->db_level ||
- dn->dn_next_nlevels[txgoff] > db->db_level ||
- dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
- dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
-
- /*
- * We should only be dirtying in syncing context if it's the
- * mos, a spa os, or we're initializing the os. However, we are
- * allowed to dirty in syncing context provided we already
- * dirtied it in open context. Hence we must make this
- * assertion only if we're not already dirty.
- */
- ASSERT(!dmu_tx_is_syncing(tx) ||
- os->os_dsl_dataset == NULL ||
- !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
- !BP_IS_HOLE(os->os_rootbp));
- ASSERT(db->db.db_size != 0);
-
- dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
-
- /*
- * If this buffer is dirty in an old transaction group we need
- * to make a copy of it so that the changes we make in this
- * transaction group won't leak out when we sync the older txg.
- */
- dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
- if (db->db_level == 0) {
- void *data_old = db->db_buf;
-
- if (db->db_blkid == DB_BONUS_BLKID) {
- dbuf_fix_old_data(db, tx->tx_txg);
- data_old = db->db.db_data;
- } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
- /*
- * Release the data buffer from the cache so that we
- * can modify it without impacting possible other users
- * of this cached data block. Note that indirect
- * blocks and private objects are not released until the
- * syncing state (since they are only modified then).
- */
- arc_release(db->db_buf, db);
- dbuf_fix_old_data(db, tx->tx_txg);
- data_old = db->db_buf;
- }
- ASSERT(data_old != NULL);
- dr->dt.dl.dr_data = data_old;
- } else {
- mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
- list_create(&dr->dt.di.dr_children,
- sizeof (dbuf_dirty_record_t),
- offsetof(dbuf_dirty_record_t, dr_dirty_node));
- }
- dr->dr_dbuf = db;
- dr->dr_txg = tx->tx_txg;
- dr->dr_next = *drp;
- *drp = dr;
-
- /*
- * We could have been freed_in_flight between the dbuf_noread
- * and dbuf_dirty. We win, as though the dbuf_noread() had
- * happened after the free.
- */
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
- mutex_enter(&dn->dn_mtx);
- dnode_clear_range(dn, db->db_blkid, 1, tx);
- mutex_exit(&dn->dn_mtx);
- db->db_freed_in_flight = FALSE;
- }
-
- if (db->db_blkid != DB_BONUS_BLKID) {
- /*
- * Update the accounting.
- */
- if (!dbuf_new_block(db) && db->db_blkptr) {
- /*
- * This is only a guess -- if the dbuf is dirty
- * in a previous txg, we don't know how much
- * space it will use on disk yet. We should
- * really have the struct_rwlock to access
- * db_blkptr, but since this is just a guess,
- * it's OK if we get an odd answer.
- */
- dnode_willuse_space(dn,
- -bp_get_dasize(os->os_spa, db->db_blkptr), tx);
- }
- dnode_willuse_space(dn, db->db.db_size, tx);
- }
-
- /*
- * This buffer is now part of this txg
- */
- dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
- db->db_dirtycnt += 1;
- ASSERT3U(db->db_dirtycnt, <=, 3);
-
- mutex_exit(&db->db_mtx);
-
- if (db->db_blkid == DB_BONUS_BLKID) {
- mutex_enter(&dn->dn_mtx);
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
- mutex_exit(&dn->dn_mtx);
- dnode_setdirty(dn, tx);
- return (dr);
- }
-
- if (db->db_level == 0) {
- dnode_new_blkid(dn, db->db_blkid, tx);
- ASSERT(dn->dn_maxblkid >= db->db_blkid);
- }
-
- if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- drop_struct_lock = TRUE;
- }
-
- if (db->db_level+1 < dn->dn_nlevels) {
- dmu_buf_impl_t *parent = db->db_parent;
- dbuf_dirty_record_t *di;
- int parent_held = FALSE;
-
- if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- parent = dbuf_hold_level(dn, db->db_level+1,
- db->db_blkid >> epbs, FTAG);
- parent_held = TRUE;
- }
- if (drop_struct_lock)
- rw_exit(&dn->dn_struct_rwlock);
- ASSERT3U(db->db_level+1, ==, parent->db_level);
- di = dbuf_dirty(parent, tx);
- if (parent_held)
- dbuf_rele(parent, FTAG);
-
- mutex_enter(&db->db_mtx);
- /* possible race with dbuf_undirty() */
- if (db->db_last_dirty == dr ||
- dn->dn_object == DMU_META_DNODE_OBJECT) {
- mutex_enter(&di->dt.di.dr_mtx);
- ASSERT3U(di->dr_txg, ==, tx->tx_txg);
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- list_insert_tail(&di->dt.di.dr_children, dr);
- mutex_exit(&di->dt.di.dr_mtx);
- dr->dr_parent = di;
- }
- mutex_exit(&db->db_mtx);
- } else {
- ASSERT(db->db_level+1 == dn->dn_nlevels);
- ASSERT(db->db_blkid < dn->dn_nblkptr);
- ASSERT(db->db_parent == NULL ||
- db->db_parent == db->db_dnode->dn_dbuf);
- mutex_enter(&dn->dn_mtx);
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
- mutex_exit(&dn->dn_mtx);
- if (drop_struct_lock)
- rw_exit(&dn->dn_struct_rwlock);
- }
-
- dnode_setdirty(dn, tx);
- return (dr);
-}
-
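
The dirty-record walk in dbuf_dirty() above uses a pointer-to-pointer cursor (drp), so the newest-txg-first list needs no special case for insertion at the head. The pattern in isolation (struct and function names are made up for this sketch, and allocation failure is ignored):

#include <stdint.h>
#include <stdlib.h>

struct drec {
	uint64_t	txg;
	struct drec	*next;
};

/* Find or create the record for txg in a list kept newest-first;
 * *headp plays the role of db_last_dirty. */
static struct drec *
dirty(struct drec **headp, uint64_t txg)
{
	struct drec **drp = headp;
	struct drec *dr;

	while (*drp != NULL && (*drp)->txg > txg)
		drp = &(*drp)->next;
	if (*drp != NULL && (*drp)->txg == txg)
		return (*drp);		/* already dirty in this txg */
	dr = calloc(1, sizeof (*dr));
	dr->txg = txg;
	dr->next = *drp;
	*drp = dr;
	return (dr);
}
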
-static int
-dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- dnode_t *dn = db->db_dnode;
- uint64_t txg = tx->tx_txg;
- dbuf_dirty_record_t *dr;
-
- ASSERT(txg != 0);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
-
- mutex_enter(&db->db_mtx);
-
- /*
- * If this buffer is not dirty, we're done.
- */
- for (dr = db->db_last_dirty; dr; dr = dr->dr_next)
- if (dr->dr_txg <= txg)
- break;
- if (dr == NULL || dr->dr_txg < txg) {
- mutex_exit(&db->db_mtx);
- return (0);
- }
- ASSERT(dr->dr_txg == txg);
-
- /*
- * If this buffer is currently held, we cannot undirty
- * it, since one of the current holders may be in the
- * middle of an update. Note that users of dbuf_undirty()
- * should not place a hold on the dbuf before the call.
- */
- if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
- mutex_exit(&db->db_mtx);
- /* Make sure we don't toss this buffer at sync phase */
- mutex_enter(&dn->dn_mtx);
- dnode_clear_range(dn, db->db_blkid, 1, tx);
- mutex_exit(&dn->dn_mtx);
- return (0);
- }
-
- dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
-
- ASSERT(db->db.db_size != 0);
-
- /* XXX would be nice to fix up dn_towrite_space[] */
-
- db->db_last_dirty = dr->dr_next;
-
- if (dr->dr_parent) {
- mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
- list_remove(&dr->dr_parent->dt.di.dr_children, dr);
- mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
- } else if (db->db_level+1 == dn->dn_nlevels) {
- ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
- mutex_enter(&dn->dn_mtx);
- list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
- mutex_exit(&dn->dn_mtx);
- }
-
- if (db->db_level == 0) {
- dbuf_unoverride(dr);
-
- ASSERT(db->db_buf != NULL);
- ASSERT(dr->dt.dl.dr_data != NULL);
- if (dr->dt.dl.dr_data != db->db_buf)
- VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
- } else {
- ASSERT(db->db_buf != NULL);
- ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
- list_destroy(&dr->dt.di.dr_children);
- mutex_destroy(&dr->dt.di.dr_mtx);
- }
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
-
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
-
- if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
- arc_buf_t *buf = db->db_buf;
-
- ASSERT(arc_released(buf));
- dbuf_set_data(db, NULL);
- VERIFY(arc_buf_remove_ref(buf, db) == 1);
- dbuf_evict(db);
- return (1);
- }
-
- mutex_exit(&db->db_mtx);
- return (0);
-}
-
-#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
-void
-dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- int rf = DB_RF_MUST_SUCCEED;
-
- ASSERT(tx->tx_txg != 0);
- ASSERT(!refcount_is_zero(&db->db_holds));
-
- if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
- rf |= DB_RF_HAVESTRUCT;
- (void) dbuf_read(db, NULL, rf);
- (void) dbuf_dirty(db, tx);
-}
-
-void
-dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
- ASSERT(tx->tx_txg != 0);
- ASSERT(db->db_level == 0);
- ASSERT(!refcount_is_zero(&db->db_holds));
-
- ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
- dmu_tx_private_ok(tx));
-
- dbuf_noread(db);
- (void) dbuf_dirty(db, tx);
-}
-
-#pragma weak dmu_buf_fill_done = dbuf_fill_done
-/* ARGSUSED */
-void
-dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- mutex_enter(&db->db_mtx);
- DBUF_VERIFY(db);
-
- if (db->db_state == DB_FILL) {
- if (db->db_level == 0 && db->db_freed_in_flight) {
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
- /* we were freed while filling */
- /* XXX dbuf_undirty? */
- bzero(db->db.db_data, db->db.db_size);
- db->db_freed_in_flight = FALSE;
- }
- db->db_state = DB_CACHED;
- cv_broadcast(&db->db_changed);
- }
- mutex_exit(&db->db_mtx);
-}
-
-/*
- * "Clear" the contents of this dbuf. This will mark the dbuf
- * EVICTING and clear *most* of its references. Unfortunately,
- * when we are not holding the dn_dbufs_mtx, we can't clear the
- * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
- * in this case. For callers from the DMU we will usually see:
- * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
- * For the arc callback, we will usually see:
- * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
- * Sometimes, though, we will get a mix of these two:
- * DMU: dbuf_clear()->arc_buf_evict()
- * ARC: dbuf_do_evict()->dbuf_destroy()
- */
-void
-dbuf_clear(dmu_buf_impl_t *db)
-{
- dnode_t *dn = db->db_dnode;
- dmu_buf_impl_t *parent = db->db_parent;
- dmu_buf_impl_t *dndb = dn->dn_dbuf;
- int dbuf_gone = FALSE;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(refcount_is_zero(&db->db_holds));
-
- dbuf_evict_user(db);
-
- if (db->db_state == DB_CACHED) {
- ASSERT(db->db.db_data != NULL);
- if (db->db_blkid == DB_BONUS_BLKID)
- zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
- db->db.db_data = NULL;
- db->db_state = DB_UNCACHED;
- }
-
- ASSERT3U(db->db_state, ==, DB_UNCACHED);
- ASSERT(db->db_data_pending == NULL);
-
- db->db_state = DB_EVICTING;
- db->db_blkptr = NULL;
-
- if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
- list_remove(&dn->dn_dbufs, db);
- dnode_rele(dn, db);
- }
-
- if (db->db_buf)
- dbuf_gone = arc_buf_evict(db->db_buf);
-
- if (!dbuf_gone)
- mutex_exit(&db->db_mtx);
-
- /*
-	 * If this dbuf is referenced from an indirect dbuf,
- * decrement the ref count on the indirect dbuf.
- */
- if (parent && parent != dndb)
- dbuf_rele(parent, db);
-}
-
-static int
-dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
- dmu_buf_impl_t **parentp, blkptr_t **bpp)
-{
- int nlevels, epbs;
-
- *parentp = NULL;
- *bpp = NULL;
-
- ASSERT(blkid != DB_BONUS_BLKID);
-
- if (dn->dn_phys->dn_nlevels == 0)
- nlevels = 1;
- else
- nlevels = dn->dn_phys->dn_nlevels;
-
- epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- ASSERT3U(level * epbs, <, 64);
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
- if (level >= nlevels ||
- (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
- /* the buffer has no parent yet */
- return (ENOENT);
- } else if (level < nlevels-1) {
- /* this block is referenced from an indirect block */
- int err = dbuf_hold_impl(dn, level+1,
- blkid >> epbs, fail_sparse, NULL, parentp);
- if (err)
- return (err);
- err = dbuf_read(*parentp, NULL,
- (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
- if (err) {
- dbuf_rele(*parentp, NULL);
- *parentp = NULL;
- return (err);
- }
- *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
- (blkid & ((1ULL << epbs) - 1));
- return (0);
- } else {
- /* the block is referenced from the dnode */
- ASSERT3U(level, ==, nlevels-1);
- ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
- blkid < dn->dn_phys->dn_nblkptr);
- if (dn->dn_dbuf) {
- dbuf_add_ref(dn->dn_dbuf, NULL);
- *parentp = dn->dn_dbuf;
- }
- *bpp = &dn->dn_phys->dn_blkptr[blkid];
- return (0);
- }
-}
-
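
dbuf_findbp() above is fixed-radix tree addressing: epbs is log2 of the number of block pointers per indirect block, so the parent of (level, blkid) is (level+1, blkid >> epbs), and the child's pointer sits at slot blkid & ((1 << epbs) - 1) in that parent. A runnable illustration (the epbs value below is just an example):

#include <stdint.h>
#include <stdio.h>

/* For a block at (level, blkid), find where its pointer lives in the
 * level+1 indirect block.  epbs = log2(block pointers per indirect). */
static void
parent_slot(int epbs, uint64_t blkid, uint64_t *pblkid, int *slot)
{
	*pblkid = blkid >> epbs;
	*slot = (int)(blkid & ((1ULL << epbs) - 1));
}

int
main(void)
{
	uint64_t pblkid;
	int slot;

	/* e.g. 128K indirect blocks of 128-byte blkptrs => epbs = 10 */
	parent_slot(10, 1234567, &pblkid, &slot);
	(void) printf("parent blkid %llu, slot %d\n",
	    (unsigned long long)pblkid, slot);
	return (0);
}
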
-static dmu_buf_impl_t *
-dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
- dmu_buf_impl_t *parent, blkptr_t *blkptr)
-{
- objset_impl_t *os = dn->dn_objset;
- dmu_buf_impl_t *db, *odb;
-
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
- ASSERT(dn->dn_type != DMU_OT_NONE);
-
- db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
-
- db->db_objset = os;
- db->db.db_object = dn->dn_object;
- db->db_level = level;
- db->db_blkid = blkid;
- db->db_last_dirty = NULL;
- db->db_dirtycnt = 0;
- db->db_dnode = dn;
- db->db_parent = parent;
- db->db_blkptr = blkptr;
-
- db->db_user_ptr = NULL;
- db->db_user_data_ptr_ptr = NULL;
- db->db_evict_func = NULL;
- db->db_immediate_evict = 0;
- db->db_freed_in_flight = 0;
-
- if (blkid == DB_BONUS_BLKID) {
- ASSERT3P(parent, ==, dn->dn_dbuf);
- db->db.db_size = dn->dn_bonuslen;
- db->db.db_offset = DB_BONUS_BLKID;
- db->db_state = DB_UNCACHED;
- /* the bonus dbuf is not placed in the hash table */
- return (db);
- } else {
- int blocksize =
- db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
- db->db.db_size = blocksize;
- db->db.db_offset = db->db_blkid * blocksize;
- }
-
- /*
-	 * Hold the dn_dbufs_mtx while we insert the new dbuf
-	 * into the hash table *and* add it to the dbufs list.
-	 * This prevents a possible deadlock with someone
-	 * trying to look up this dbuf before it's added to the
-	 * dn_dbufs list.
- */
- mutex_enter(&dn->dn_dbufs_mtx);
- db->db_state = DB_EVICTING;
- if ((odb = dbuf_hash_insert(db)) != NULL) {
- /* someone else inserted it first */
- kmem_cache_free(dbuf_cache, db);
- mutex_exit(&dn->dn_dbufs_mtx);
- return (odb);
- }
- list_insert_head(&dn->dn_dbufs, db);
- db->db_state = DB_UNCACHED;
- mutex_exit(&dn->dn_dbufs_mtx);
-
- if (parent && parent != dn->dn_dbuf)
- dbuf_add_ref(parent, db);
-
- ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
- refcount_count(&dn->dn_holds) > 0);
- (void) refcount_add(&dn->dn_holds, db);
-
- dprintf_dbuf(db, "db=%p\n", db);
-
- return (db);
-}
-
-static int
-dbuf_do_evict(void *private)
-{
- arc_buf_t *buf = private;
- dmu_buf_impl_t *db = buf->b_private;
-
- if (!MUTEX_HELD(&db->db_mtx))
- mutex_enter(&db->db_mtx);
-
- ASSERT(refcount_is_zero(&db->db_holds));
-
- if (db->db_state != DB_EVICTING) {
- ASSERT(db->db_state == DB_CACHED);
- DBUF_VERIFY(db);
- db->db_buf = NULL;
- dbuf_evict(db);
- } else {
- mutex_exit(&db->db_mtx);
- dbuf_destroy(db);
- }
- return (0);
-}
-
-static void
-dbuf_destroy(dmu_buf_impl_t *db)
-{
- ASSERT(refcount_is_zero(&db->db_holds));
-
- if (db->db_blkid != DB_BONUS_BLKID) {
- dnode_t *dn = db->db_dnode;
-
- /*
- * If this dbuf is still on the dn_dbufs list,
- * remove it from that list.
- */
- if (list_link_active(&db->db_link)) {
- mutex_enter(&dn->dn_dbufs_mtx);
- list_remove(&dn->dn_dbufs, db);
- mutex_exit(&dn->dn_dbufs_mtx);
-
- dnode_rele(dn, db);
- }
- dbuf_hash_remove(db);
- }
- db->db_parent = NULL;
- db->db_dnode = NULL;
- db->db_buf = NULL;
-
- ASSERT(db->db.db_data == NULL);
- ASSERT(db->db_hash_next == NULL);
- ASSERT(db->db_blkptr == NULL);
- ASSERT(db->db_data_pending == NULL);
-
- kmem_cache_free(dbuf_cache, db);
-}
-
-void
-dbuf_prefetch(dnode_t *dn, uint64_t blkid)
-{
- dmu_buf_impl_t *db = NULL;
- blkptr_t *bp = NULL;
-
- ASSERT(blkid != DB_BONUS_BLKID);
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
-
- if (dnode_block_freed(dn, blkid))
- return;
-
- /* dbuf_find() returns with db_mtx held */
- if (db = dbuf_find(dn, 0, blkid)) {
- if (refcount_count(&db->db_holds) > 0) {
- /*
- * This dbuf is active. We assume that it is
- * already CACHED, or else about to be either
- * read or filled.
- */
- mutex_exit(&db->db_mtx);
- return;
- }
- mutex_exit(&db->db_mtx);
- db = NULL;
- }
-
- if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
- if (bp && !BP_IS_HOLE(bp)) {
- uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
- zbookmark_t zb;
- zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
- dn->dn_objset->os_dsl_dataset->ds_object : 0;
- zb.zb_object = dn->dn_object;
- zb.zb_level = 0;
- zb.zb_blkid = blkid;
-
- (void) arc_read(NULL, dn->dn_objset->os_spa, bp,
- dmu_ot[dn->dn_type].ot_byteswap,
- NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- &aflags, &zb);
- }
- if (db)
- dbuf_rele(db, NULL);
- }
-}
-
-/*
- * Returns with db_holds incremented, and db_mtx not held.
- * Note: dn_struct_rwlock must be held.
- */
-int
-dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
- void *tag, dmu_buf_impl_t **dbp)
-{
- dmu_buf_impl_t *db, *parent = NULL;
-
- ASSERT(blkid != DB_BONUS_BLKID);
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
- ASSERT3U(dn->dn_nlevels, >, level);
-
- *dbp = NULL;
-top:
- /* dbuf_find() returns with db_mtx held */
- db = dbuf_find(dn, level, blkid);
-
- if (db == NULL) {
- blkptr_t *bp = NULL;
- int err;
-
- ASSERT3P(parent, ==, NULL);
- err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
- if (fail_sparse) {
- if (err == 0 && bp && BP_IS_HOLE(bp))
- err = ENOENT;
- if (err) {
- if (parent)
- dbuf_rele(parent, NULL);
- return (err);
- }
- }
- if (err && err != ENOENT)
- return (err);
- db = dbuf_create(dn, level, blkid, parent, bp);
- }
-
- if (db->db_buf && refcount_is_zero(&db->db_holds)) {
- arc_buf_add_ref(db->db_buf, db);
- if (db->db_buf->b_data == NULL) {
- dbuf_clear(db);
- if (parent) {
- dbuf_rele(parent, NULL);
- parent = NULL;
- }
- goto top;
- }
- ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
- }
-
- ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
-
- /*
-	 * If this buffer is currently syncing out, and we are
- * still referencing it from db_data, we need to make a copy
- * of it in case we decide we want to dirty it again in this txg.
- */
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
- dn->dn_object != DMU_META_DNODE_OBJECT &&
- db->db_state == DB_CACHED && db->db_data_pending) {
- dbuf_dirty_record_t *dr = db->db_data_pending;
-
- if (dr->dt.dl.dr_data == db->db_buf) {
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-
- dbuf_set_data(db,
- arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- db->db.db_size, db, type));
- bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
- db->db.db_size);
- }
- }
-
- (void) refcount_add(&db->db_holds, tag);
- dbuf_update_data(db);
- DBUF_VERIFY(db);
- mutex_exit(&db->db_mtx);
-
- /* NOTE: we can't rele the parent until after we drop the db_mtx */
- if (parent)
- dbuf_rele(parent, NULL);
-
- ASSERT3P(db->db_dnode, ==, dn);
- ASSERT3U(db->db_blkid, ==, blkid);
- ASSERT3U(db->db_level, ==, level);
- *dbp = db;
-
- return (0);
-}
-
-dmu_buf_impl_t *
-dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
-{
- dmu_buf_impl_t *db;
- int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
- return (err ? NULL : db);
-}
-
-dmu_buf_impl_t *
-dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
-{
- dmu_buf_impl_t *db;
- int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
- return (err ? NULL : db);
-}
-
-dmu_buf_impl_t *
-dbuf_create_bonus(dnode_t *dn)
-{
- dmu_buf_impl_t *db = dn->dn_bonus;
-
- ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
-
- ASSERT(dn->dn_bonus == NULL);
- db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
- return (db);
-}
-
-#pragma weak dmu_buf_add_ref = dbuf_add_ref
-void
-dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
-{
- int64_t holds = refcount_add(&db->db_holds, tag);
- ASSERT(holds > 1);
-}
-
-#pragma weak dmu_buf_rele = dbuf_rele
-void
-dbuf_rele(dmu_buf_impl_t *db, void *tag)
-{
- int64_t holds;
-
- mutex_enter(&db->db_mtx);
- DBUF_VERIFY(db);
-
- holds = refcount_remove(&db->db_holds, tag);
- ASSERT(holds >= 0);
-
- /*
- * We can't freeze indirects if there is a possibility that they
- * may be modified in the current syncing context.
- */
- if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
- arc_buf_freeze(db->db_buf);
-
- if (holds == db->db_dirtycnt &&
- db->db_level == 0 && db->db_immediate_evict)
- dbuf_evict_user(db);
-
- if (holds == 0) {
- if (db->db_blkid == DB_BONUS_BLKID) {
- mutex_exit(&db->db_mtx);
- dnode_rele(db->db_dnode, db);
- } else if (db->db_buf == NULL) {
- /*
- * This is a special case: we never associated this
- * dbuf with any data allocated from the ARC.
- */
- ASSERT3U(db->db_state, ==, DB_UNCACHED);
- dbuf_evict(db);
- } else if (arc_released(db->db_buf)) {
- arc_buf_t *buf = db->db_buf;
- /*
- * This dbuf has anonymous data associated with it.
- */
- dbuf_set_data(db, NULL);
- VERIFY(arc_buf_remove_ref(buf, db) == 1);
- dbuf_evict(db);
- } else {
- VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
- mutex_exit(&db->db_mtx);
- }
- } else {
- mutex_exit(&db->db_mtx);
- }
-}
-
-#pragma weak dmu_buf_refcount = dbuf_refcount
-uint64_t
-dbuf_refcount(dmu_buf_impl_t *db)
-{
- return (refcount_count(&db->db_holds));
-}
-
-void *
-dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
- dmu_buf_evict_func_t *evict_func)
-{
- return (dmu_buf_update_user(db_fake, NULL, user_ptr,
- user_data_ptr_ptr, evict_func));
-}
-
-void *
-dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
- dmu_buf_evict_func_t *evict_func)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- db->db_immediate_evict = TRUE;
- return (dmu_buf_update_user(db_fake, NULL, user_ptr,
- user_data_ptr_ptr, evict_func));
-}
-
-void *
-dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
- void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- ASSERT(db->db_level == 0);
-
- ASSERT((user_ptr == NULL) == (evict_func == NULL));
-
- mutex_enter(&db->db_mtx);
-
- if (db->db_user_ptr == old_user_ptr) {
- db->db_user_ptr = user_ptr;
- db->db_user_data_ptr_ptr = user_data_ptr_ptr;
- db->db_evict_func = evict_func;
-
- dbuf_update_data(db);
- } else {
- old_user_ptr = db->db_user_ptr;
- }
-
- mutex_exit(&db->db_mtx);
- return (old_user_ptr);
-}
-
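
dmu_buf_update_user() above is a compare-and-set under the dbuf mutex: the swap happens only if the caller's expected old pointer still matches, and the previous value comes back either way so a losing caller can see who won. Sketched standalone (hypothetical names, not the DMU API):

#include <pthread.h>

struct ubuf {
	pthread_mutex_t	mtx;
	void		*user_ptr;
};

/* Swap in newp only if user_ptr still equals expected;
 * return the previous value either way. */
static void *
update_user(struct ubuf *b, void *expected, void *newp)
{
	void *old;

	pthread_mutex_lock(&b->mtx);
	old = b->user_ptr;
	if (old == expected)
		b->user_ptr = newp;
	pthread_mutex_unlock(&b->mtx);
	return (old);
}
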
-void *
-dmu_buf_get_user(dmu_buf_t *db_fake)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- ASSERT(!refcount_is_zero(&db->db_holds));
-
- return (db->db_user_ptr);
-}
-
-static void
-dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
-{
-	/* ASSERT(dmu_tx_is_syncing(tx)) */
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- if (db->db_blkptr != NULL)
- return;
-
- if (db->db_level == dn->dn_phys->dn_nlevels-1) {
- /*
-		 * This buffer was allocated at a time when there were
-		 * no available blkptrs from the dnode, or it was
-		 * inappropriate to hook it in (i.e., nlevels mismatch).
- */
- ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
- ASSERT(db->db_parent == NULL);
- db->db_parent = dn->dn_dbuf;
- db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
- DBUF_VERIFY(db);
- } else {
- dmu_buf_impl_t *parent = db->db_parent;
- int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- ASSERT(dn->dn_phys->dn_nlevels > 1);
- if (parent == NULL) {
- mutex_exit(&db->db_mtx);
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- (void) dbuf_hold_impl(dn, db->db_level+1,
- db->db_blkid >> epbs, FALSE, db, &parent);
- rw_exit(&dn->dn_struct_rwlock);
- mutex_enter(&db->db_mtx);
- db->db_parent = parent;
- }
- db->db_blkptr = (blkptr_t *)parent->db.db_data +
- (db->db_blkid & ((1ULL << epbs) - 1));
- DBUF_VERIFY(db);
- }
-}
-
-static void
-dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
- zio_t *zio;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
-
- mutex_enter(&db->db_mtx);
-
- ASSERT(db->db_level > 0);
- DBUF_VERIFY(db);
-
- if (db->db_buf == NULL) {
- mutex_exit(&db->db_mtx);
- (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
- mutex_enter(&db->db_mtx);
- }
- ASSERT3U(db->db_state, ==, DB_CACHED);
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
- ASSERT(db->db_buf != NULL);
-
- dbuf_check_blkptr(dn, db);
-
- db->db_data_pending = dr;
-
- arc_release(db->db_buf, db);
- mutex_exit(&db->db_mtx);
-
- /*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
- */
- dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4,
- zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx);
-
- zio = dr->dr_zio;
- mutex_enter(&dr->dt.di.dr_mtx);
- dbuf_sync_list(&dr->dt.di.dr_children, tx);
- ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
- mutex_exit(&dr->dt.di.dr_mtx);
- zio_nowait(zio);
-}
-
-static void
-dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
-{
- arc_buf_t **datap = &dr->dt.dl.dr_data;
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
- uint64_t txg = tx->tx_txg;
- int checksum, compress;
- int blksz;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
-
- mutex_enter(&db->db_mtx);
- /*
- * To be synced, we must be dirtied. But we
- * might have been freed after the dirty.
- */
- if (db->db_state == DB_UNCACHED) {
- /* This buffer has been freed since it was dirtied */
- ASSERT(db->db.db_data == NULL);
- } else if (db->db_state == DB_FILL) {
- /* This buffer was freed and is now being re-filled */
- ASSERT(db->db.db_data != dr->dt.dl.dr_data);
- } else {
- ASSERT3U(db->db_state, ==, DB_CACHED);
- }
- DBUF_VERIFY(db);
-
- /*
- * If this is a bonus buffer, simply copy the bonus data into the
- * dnode. It will be written out when the dnode is synced (and it
- * will be synced, since it must have been dirty for dbuf_sync to
- * be called).
- */
- if (db->db_blkid == DB_BONUS_BLKID) {
- dbuf_dirty_record_t **drp;
- /*
- * Use dn_phys->dn_bonuslen since db.db_size is the length
- * of the bonus buffer in the open transaction rather than
- * the syncing transaction.
- */
- ASSERT(*datap != NULL);
- ASSERT3U(db->db_level, ==, 0);
- ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
- bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
- if (*datap != db->db.db_data)
- zio_buf_free(*datap, DN_MAX_BONUSLEN);
- db->db_data_pending = NULL;
- drp = &db->db_last_dirty;
- while (*drp != dr)
- drp = &(*drp)->dr_next;
- ASSERT((*drp)->dr_next == NULL);
- *drp = NULL;
- if (dr->dr_dbuf->db_level != 0) {
- list_destroy(&dr->dt.di.dr_children);
- mutex_destroy(&dr->dt.di.dr_mtx);
- }
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
- mutex_exit(&db->db_mtx);
- dbuf_rele(db, (void *)(uintptr_t)txg);
- return;
- }
-
- /*
-	 * If this buffer is in the middle of an immediate write,
- * wait for the synchronous IO to complete.
- */
- while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
- ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
- cv_wait(&db->db_changed, &db->db_mtx);
- ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
- }
-
- dbuf_check_blkptr(dn, db);
-
- /*
- * If this dbuf has already been written out via an immediate write,
- * just complete the write by copying over the new block pointer and
- * updating the accounting via the write-completion functions.
- */
- if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- zio_t zio_fake;
-
- zio_fake.io_private = &db;
- zio_fake.io_error = 0;
- zio_fake.io_bp = db->db_blkptr;
- zio_fake.io_bp_orig = *db->db_blkptr;
- zio_fake.io_txg = txg;
-
- *db->db_blkptr = dr->dt.dl.dr_overridden_by;
- dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
- db->db_data_pending = dr;
- dr->dr_zio = &zio_fake;
- mutex_exit(&db->db_mtx);
-
- if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
- dsl_dataset_block_kill(os->os_dsl_dataset,
- &zio_fake.io_bp_orig, dn->dn_zio, tx);
-
- dbuf_write_ready(&zio_fake, db->db_buf, db);
- dbuf_write_done(&zio_fake, db->db_buf, db);
-
- return;
- }
-
- blksz = arc_buf_size(*datap);
-
- if (dn->dn_object != DMU_META_DNODE_OBJECT) {
- /*
- * If this buffer is currently "in use" (i.e., there are
- * active holds and db_data still references it), then make
- * a copy before we start the write so that any modifications
- * from the open txg will not leak into this write.
- *
- * NOTE: this copy does not need to be made for objects only
-		 * modified in the syncing context (e.g. DMU_OT_DNODE blocks).
- */
- if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
- bcopy(db->db.db_data, (*datap)->b_data, blksz);
- }
- } else {
- /*
- * Private object buffers are released here rather
- * than in dbuf_dirty() since they are only modified
- * in the syncing context and we don't want the
- * overhead of making multiple copies of the data.
- */
- arc_release(db->db_buf, db);
- }
-
- ASSERT(*datap != NULL);
- db->db_data_pending = dr;
-
- mutex_exit(&db->db_mtx);
-
- /*
- * Allow dnode settings to override objset settings,
- * except for metadata checksums.
- */
- if (dmu_ot[dn->dn_type].ot_metadata) {
- checksum = os->os_md_checksum;
- compress = zio_compress_select(dn->dn_compress,
- os->os_md_compress);
- } else {
- checksum = zio_checksum_select(dn->dn_checksum,
- os->os_checksum);
- compress = zio_compress_select(dn->dn_compress,
- os->os_compress);
- }
-
- dbuf_write(dr, *datap, checksum, compress, tx);
-
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- if (dn->dn_object == DMU_META_DNODE_OBJECT)
- list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
- else
- zio_nowait(dr->dr_zio);
-}
-
-void
-dbuf_sync_list(list_t *list, dmu_tx_t *tx)
-{
- dbuf_dirty_record_t *dr;
-
- while (dr = list_head(list)) {
- if (dr->dr_zio != NULL) {
- /*
- * If we find an already initialized zio then we
- * are processing the meta-dnode, and we have finished.
- * The dbufs for all dnodes are put back on the list
- * during processing, so that we can zio_wait()
- * these IOs after initiating all child IOs.
- */
- ASSERT3U(dr->dr_dbuf->db.db_object, ==,
- DMU_META_DNODE_OBJECT);
- break;
- }
- list_remove(list, dr);
- if (dr->dr_dbuf->db_level > 0)
- dbuf_sync_indirect(dr, tx);
- else
- dbuf_sync_leaf(dr, tx);
- }
-}
-
-static void
-dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
- int compress, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
- dmu_buf_impl_t *parent = db->db_parent;
- uint64_t txg = tx->tx_txg;
- zbookmark_t zb;
- zio_t *zio;
- int zio_flags;
-
- if (parent != dn->dn_dbuf) {
- ASSERT(parent && parent->db_data_pending);
- ASSERT(db->db_level == parent->db_level-1);
- ASSERT(arc_released(parent->db_buf));
- zio = parent->db_data_pending->dr_zio;
- } else {
- ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
- ASSERT3P(db->db_blkptr, ==,
- &dn->dn_phys->dn_blkptr[db->db_blkid]);
- zio = dn->dn_zio;
- }
-
- ASSERT(db->db_level == 0 || data == db->db_buf);
- ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
- ASSERT(zio);
-
- zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
- zb.zb_object = db->db.db_object;
- zb.zb_level = db->db_level;
- zb.zb_blkid = db->db_blkid;
-
- zio_flags = ZIO_FLAG_MUSTSUCCEED;
- if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
- zio_flags |= ZIO_FLAG_METADATA;
- if (BP_IS_OLDER(db->db_blkptr, txg))
- dsl_dataset_block_kill(
- os->os_dsl_dataset, db->db_blkptr, zio, tx);
-
- dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
- dmu_get_replication_level(os, &zb, dn->dn_type), txg,
- db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
- ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
-}
-
-/* ARGSUSED */
-static void
-dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
-{
- dmu_buf_impl_t *db = vdb;
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
- blkptr_t *bp_orig = &zio->io_bp_orig;
- uint64_t fill = 0;
- int old_size, new_size, i;
-
- dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
-
- old_size = bp_get_dasize(os->os_spa, bp_orig);
- new_size = bp_get_dasize(os->os_spa, zio->io_bp);
-
- dnode_diduse_space(dn, new_size-old_size);
-
- if (BP_IS_HOLE(zio->io_bp)) {
- dsl_dataset_t *ds = os->os_dsl_dataset;
- dmu_tx_t *tx = os->os_synctx;
-
- if (bp_orig->blk_birth == tx->tx_txg)
- dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
- ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
- return;
- }
-
- mutex_enter(&db->db_mtx);
-
- if (db->db_level == 0) {
- mutex_enter(&dn->dn_mtx);
- if (db->db_blkid > dn->dn_phys->dn_maxblkid)
- dn->dn_phys->dn_maxblkid = db->db_blkid;
- mutex_exit(&dn->dn_mtx);
-
- if (dn->dn_type == DMU_OT_DNODE) {
- dnode_phys_t *dnp = db->db.db_data;
- for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
- i--, dnp++) {
- if (dnp->dn_type != DMU_OT_NONE)
- fill++;
- }
- } else {
- fill = 1;
- }
- } else {
- blkptr_t *bp = db->db.db_data;
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
- for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
- if (BP_IS_HOLE(bp))
- continue;
- ASSERT3U(BP_GET_LSIZE(bp), ==,
- db->db_level == 1 ? dn->dn_datablksz :
- (1<<dn->dn_phys->dn_indblkshift));
- fill += bp->blk_fill;
- }
- }
-
- db->db_blkptr->blk_fill = fill;
- BP_SET_TYPE(db->db_blkptr, dn->dn_type);
- BP_SET_LEVEL(db->db_blkptr, db->db_level);
-
- mutex_exit(&db->db_mtx);
-
- /* We must do this after we've set the bp's type and level */
- if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) {
- dsl_dataset_t *ds = os->os_dsl_dataset;
- dmu_tx_t *tx = os->os_synctx;
-
- if (bp_orig->blk_birth == tx->tx_txg)
- dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
- dsl_dataset_block_born(ds, zio->io_bp, tx);
- }
-}
-
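
The fill computation in dbuf_write_ready() above gives each block pointer a count of the allocated entries beneath it: 1 for a plain data block, the number of in-use dnodes for a dnode block, and the sum of child fills for an indirect. The indirect case, reduced to a standalone loop (fake_bp is a stand-in for blkptr_t, not a real type):

#include <stddef.h>
#include <stdint.h>

/* Stand-in for blkptr_t: just the fields the fill walk needs. */
struct fake_bp {
	uint64_t	blk_fill;
	int		is_hole;
};

/* Sum the fill counts of the non-hole children of an indirect block. */
static uint64_t
indirect_fill(const struct fake_bp *bps, size_t nbps)
{
	uint64_t fill = 0;
	size_t i;

	for (i = 0; i < nbps; i++)
		if (!bps[i].is_hole)
			fill += bps[i].blk_fill;
	return (fill);
}
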
-/* ARGSUSED */
-static void
-dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
-{
- dmu_buf_impl_t *db = vdb;
- uint64_t txg = zio->io_txg;
- dbuf_dirty_record_t **drp, *dr;
-
- ASSERT3U(zio->io_error, ==, 0);
-
- mutex_enter(&db->db_mtx);
-
- drp = &db->db_last_dirty;
- while (*drp != db->db_data_pending)
- drp = &(*drp)->dr_next;
- ASSERT(!list_link_active(&(*drp)->dr_dirty_node));
- ASSERT((*drp)->dr_txg == txg);
- ASSERT((*drp)->dr_next == NULL);
- dr = *drp;
- *drp = NULL;
-
- if (db->db_level == 0) {
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
- ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
-
- if (dr->dt.dl.dr_data != db->db_buf)
- VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
- else if (!BP_IS_HOLE(db->db_blkptr))
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
- else
- ASSERT(arc_released(db->db_buf));
- } else {
- dnode_t *dn = db->db_dnode;
-
- ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
- if (!BP_IS_HOLE(db->db_blkptr)) {
- int epbs =
- dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
- db->db.db_size);
- ASSERT3U(dn->dn_phys->dn_maxblkid
- >> (db->db_level * epbs), >=, db->db_blkid);
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
- }
- list_destroy(&dr->dt.di.dr_children);
- mutex_destroy(&dr->dt.di.dr_mtx);
- }
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
-
- cv_broadcast(&db->db_changed);
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
- db->db_data_pending = NULL;
- mutex_exit(&db->db_mtx);
-
- dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
-
- dbuf_rele(db, (void *)(uintptr_t)txg);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c
deleted file mode 100644
index d3be6b4..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ /dev/null
@@ -1,1029 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_tx.h>
-#include <sys/dbuf.h>
-#include <sys/dnode.h>
-#include <sys/zfs_context.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dsl_prop.h>
-#include <sys/dmu_zfetch.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zap.h>
-#include <sys/zio_checksum.h>
-
-const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
- { byteswap_uint8_array, TRUE, "unallocated" },
- { zap_byteswap, TRUE, "object directory" },
- { byteswap_uint64_array, TRUE, "object array" },
- { byteswap_uint8_array, TRUE, "packed nvlist" },
- { byteswap_uint64_array, TRUE, "packed nvlist size" },
- { byteswap_uint64_array, TRUE, "bplist" },
- { byteswap_uint64_array, TRUE, "bplist header" },
- { byteswap_uint64_array, TRUE, "SPA space map header" },
- { byteswap_uint64_array, TRUE, "SPA space map" },
- { byteswap_uint64_array, TRUE, "ZIL intent log" },
- { dnode_buf_byteswap, TRUE, "DMU dnode" },
- { dmu_objset_byteswap, TRUE, "DMU objset" },
- { byteswap_uint64_array, TRUE, "DSL directory" },
- { zap_byteswap, TRUE, "DSL directory child map"},
- { zap_byteswap, TRUE, "DSL dataset snap map" },
- { zap_byteswap, TRUE, "DSL props" },
- { byteswap_uint64_array, TRUE, "DSL dataset" },
- { zfs_znode_byteswap, TRUE, "ZFS znode" },
- { zfs_acl_byteswap, TRUE, "ZFS ACL" },
- { byteswap_uint8_array, FALSE, "ZFS plain file" },
- { zap_byteswap, TRUE, "ZFS directory" },
- { zap_byteswap, TRUE, "ZFS master node" },
- { zap_byteswap, TRUE, "ZFS delete queue" },
- { byteswap_uint8_array, FALSE, "zvol object" },
- { zap_byteswap, TRUE, "zvol prop" },
- { byteswap_uint8_array, FALSE, "other uint8[]" },
- { byteswap_uint64_array, FALSE, "other uint64[]" },
- { zap_byteswap, TRUE, "other ZAP" },
- { zap_byteswap, TRUE, "persistent error log" },
- { byteswap_uint8_array, TRUE, "SPA history" },
- { byteswap_uint64_array, TRUE, "SPA history offsets" },
- { zap_byteswap, TRUE, "Pool properties" },
-};
-
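
The dmu_ot table above is per-type dispatch: each object type carries the byteswap routine applied when a block crosses endianness. A cut-down sketch of the same table-driven pattern (the functions and table rows here are illustrative, and __builtin_bswap64 assumes GCC/Clang):

#include <stddef.h>
#include <stdint.h>

typedef void byteswap_func_t(void *buf, size_t size);

static void
bswap_u64_array(void *buf, size_t size)
{
	uint64_t *p = buf;
	size_t i;

	for (i = 0; i < size / sizeof (uint64_t); i++)
		p[i] = __builtin_bswap64(p[i]);
}

static void
bswap_u8_array(void *buf, size_t size)
{
	(void)buf; (void)size;		/* bytes need no swapping */
}

struct type_info {
	byteswap_func_t	*swap;
	const char	*name;
};

/* Two example entries; the real table has one row per object type. */
static const struct type_info type_table[] = {
	{ bswap_u8_array,	"plain bytes" },
	{ bswap_u64_array,	"uint64 array" },
};

/* Byteswap a buffer according to its object type. */
static void
byteswap_by_type(int type, void *buf, size_t size)
{
	type_table[type].swap(buf, size);
}
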
-int
-dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **dbp)
-{
- dnode_t *dn;
- uint64_t blkid;
- dmu_buf_impl_t *db;
- int err;
-
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
- blkid = dbuf_whichblock(dn, offset);
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- db = dbuf_hold(dn, blkid, tag);
- rw_exit(&dn->dn_struct_rwlock);
- if (db == NULL) {
- err = EIO;
- } else {
- err = dbuf_read(db, NULL, DB_RF_CANFAIL);
- if (err) {
- dbuf_rele(db, tag);
- db = NULL;
- }
- }
-
- dnode_rele(dn, FTAG);
- *dbp = &db->db;
- return (err);
-}
-
-int
-dmu_bonus_max(void)
-{
- return (DN_MAX_BONUSLEN);
-}
-
-/*
- * returns ENOENT, EIO, or 0.
- */
-int
-dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
-{
- dnode_t *dn;
- int err, count;
- dmu_buf_impl_t *db;
-
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_bonus == NULL) {
- rw_exit(&dn->dn_struct_rwlock);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (dn->dn_bonus == NULL)
- dn->dn_bonus = dbuf_create_bonus(dn);
- }
- db = dn->dn_bonus;
- rw_exit(&dn->dn_struct_rwlock);
- mutex_enter(&db->db_mtx);
- count = refcount_add(&db->db_holds, tag);
- mutex_exit(&db->db_mtx);
- if (count == 1)
- dnode_add_ref(dn, db);
- dnode_rele(dn, FTAG);
-
- VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
-
- *dbp = &db->db;
- return (0);
-}
-
-/*
- * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
- * to take a held dnode rather than <os, object> -- the lookup is wasteful,
- * and can induce severe lock contention when writing to several files
- * whose dnodes are in the same block.
- */
-static int
-dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
- uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
-{
- dmu_buf_t **dbp;
- uint64_t blkid, nblks, i;
- uint32_t flags;
- int err;
- zio_t *zio;
-
- ASSERT(length <= DMU_MAX_ACCESS);
-
- flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
- if (length > zfetch_array_rd_sz)
- flags |= DB_RF_NOPREFETCH;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_datablkshift) {
- int blkshift = dn->dn_datablkshift;
- nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
- P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
- } else {
- if (offset + length > dn->dn_datablksz) {
- zfs_panic_recover("zfs: accessing past end of object "
- "%llx/%llx (size=%u access=%llu+%llu)",
- (longlong_t)dn->dn_objset->
- os_dsl_dataset->ds_object,
- (longlong_t)dn->dn_object, dn->dn_datablksz,
- (longlong_t)offset, (longlong_t)length);
- return (EIO);
- }
- nblks = 1;
- }
- dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
-
- zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE);
- blkid = dbuf_whichblock(dn, offset);
- for (i = 0; i < nblks; i++) {
- dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
- if (db == NULL) {
- rw_exit(&dn->dn_struct_rwlock);
- dmu_buf_rele_array(dbp, nblks, tag);
- zio_nowait(zio);
- return (EIO);
- }
- /* initiate async i/o */
- if (read) {
- rw_exit(&dn->dn_struct_rwlock);
- (void) dbuf_read(db, zio, flags);
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- }
- dbp[i] = &db->db;
- }
- rw_exit(&dn->dn_struct_rwlock);
-
- /* wait for async i/o */
- err = zio_wait(zio);
- if (err) {
- dmu_buf_rele_array(dbp, nblks, tag);
- return (err);
- }
-
- /* wait for other io to complete */
- if (read) {
- for (i = 0; i < nblks; i++) {
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
- mutex_enter(&db->db_mtx);
- while (db->db_state == DB_READ ||
- db->db_state == DB_FILL)
- cv_wait(&db->db_changed, &db->db_mtx);
- if (db->db_state == DB_UNCACHED)
- err = EIO;
- mutex_exit(&db->db_mtx);
- if (err) {
- dmu_buf_rele_array(dbp, nblks, tag);
- return (err);
- }
- }
- }
-
- *numbufsp = nblks;
- *dbpp = dbp;
- return (0);
-}
-
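
The nblks computation in dmu_buf_hold_array_by_dnode() above rounds the byte range out to block boundaries before counting: P2ALIGN snaps the start down and P2ROUNDUP snaps the end up. Expanded into plain C (a sketch, assuming power-of-two block sizes):

#include <stdint.h>

/* Number of blocks of size 1<<shift touched by [offset, offset+length). */
static uint64_t
block_span(uint64_t offset, uint64_t length, int shift)
{
	uint64_t bsz = 1ULL << shift;
	uint64_t first = offset & ~(bsz - 1);		/* P2ALIGN */
	uint64_t last = (offset + length + bsz - 1) & ~(bsz - 1); /* P2ROUNDUP */

	return ((last - first) >> shift);
}
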
-static int
-dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
-{
- dnode_t *dn;
- int err;
-
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
-
- err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
- numbufsp, dbpp);
-
- dnode_rele(dn, FTAG);
-
- return (err);
-}
-
-int
-dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
- uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
-{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
- int err;
-
- err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
- numbufsp, dbpp);
-
- return (err);
-}
-
-void
-dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
-{
- int i;
- dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
-
- if (numbufs == 0)
- return;
-
- for (i = 0; i < numbufs; i++) {
- if (dbp[i])
- dbuf_rele(dbp[i], tag);
- }
-
- kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
-}
-
-void
-dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
-{
- dnode_t *dn;
- uint64_t blkid;
- int nblks, i, err;
-
- if (zfs_prefetch_disable)
- return;
-
- if (len == 0) { /* they're interested in the bonus buffer */
- dn = os->os->os_meta_dnode;
-
- if (object == 0 || object >= DN_MAX_OBJECT)
- return;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
- dbuf_prefetch(dn, blkid);
- rw_exit(&dn->dn_struct_rwlock);
- return;
- }
-
- /*
- * XXX - Note, if the dnode for the requested object is not
- * already cached, we will do a *synchronous* read in the
- * dnode_hold() call. The same is true for any indirects.
- */
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err != 0)
- return;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_datablkshift) {
- int blkshift = dn->dn_datablkshift;
- nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
- P2ALIGN(offset, 1<<blkshift)) >> blkshift;
- } else {
- nblks = (offset < dn->dn_datablksz);
- }
-
- if (nblks != 0) {
- blkid = dbuf_whichblock(dn, offset);
- for (i = 0; i < nblks; i++)
- dbuf_prefetch(dn, blkid+i);
- }
-
- rw_exit(&dn->dn_struct_rwlock);
-
- dnode_rele(dn, FTAG);
-}
-
-int
-dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t size, dmu_tx_t *tx)
-{
- dnode_t *dn;
- int err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
- ASSERT(offset < UINT64_MAX);
- ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
- dnode_free_range(dn, offset, size, tx);
- dnode_rele(dn, FTAG);
- return (0);
-}
-
-int
-dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- void *buf)
-{
- dnode_t *dn;
- dmu_buf_t **dbp;
- int numbufs, i, err;
-
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
-
- /*
- * Deal with odd block sizes, where there can't be data past the first
- * block. If we ever do the tail block optimization, we will need to
- * handle that here as well.
- */
- if (dn->dn_datablkshift == 0) {
- int newsz = offset > dn->dn_datablksz ? 0 :
- MIN(size, dn->dn_datablksz - offset);
- bzero((char *)buf + newsz, size - newsz);
- size = newsz;
- }
-
- while (size > 0) {
- uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
- int err;
-
- /*
- * NB: we could do this block-at-a-time, but it's nice
- * to be reading in parallel.
- */
- err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
- TRUE, FTAG, &numbufs, &dbp);
- if (err)
- return (err);
-
- for (i = 0; i < numbufs; i++) {
- int tocpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
-
- ASSERT(size > 0);
-
- bufoff = offset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- bcopy((char *)db->db_data + bufoff, buf, tocpy);
-
- offset += tocpy;
- size -= tocpy;
- buf = (char *)buf + tocpy;
- }
- dmu_buf_rele_array(dbp, numbufs, FTAG);
- }
- dnode_rele(dn, FTAG);
- return (0);
-}
-
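
The copy loop in dmu_read() above advances through block-sized pieces: bufoff is the offset within the current buffer, and tocpy is capped by both the buffer tail and the bytes remaining. The same walk over a flat array of equal-sized blocks (all names hypothetical):

#include <stdint.h>
#include <string.h>

/* Copy size bytes starting at byte offset out of an array of
 * equal-sized blocks into dst. */
static void
chunked_read(char *const *blocks, uint64_t blksz,
    uint64_t offset, uint64_t size, char *dst)
{
	while (size > 0) {
		uint64_t bufoff = offset % blksz;
		uint64_t tocpy = blksz - bufoff;

		if (tocpy > size)
			tocpy = size;
		memcpy(dst, blocks[offset / blksz] + bufoff, tocpy);
		offset += tocpy;
		size -= tocpy;
		dst += tocpy;
	}
}
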
-void
-dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- const void *buf, dmu_tx_t *tx)
-{
- dmu_buf_t **dbp;
- int numbufs, i;
-
- if (size == 0)
- return;
-
- VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
- FALSE, FTAG, &numbufs, &dbp));
-
- for (i = 0; i < numbufs; i++) {
- int tocpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
-
- ASSERT(size > 0);
-
- bufoff = offset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
-
- if (tocpy == db->db_size)
- dmu_buf_will_fill(db, tx);
- else
- dmu_buf_will_dirty(db, tx);
-
- bcopy(buf, (char *)db->db_data + bufoff, tocpy);
-
- if (tocpy == db->db_size)
- dmu_buf_fill_done(db, tx);
-
- offset += tocpy;
- size -= tocpy;
- buf = (char *)buf + tocpy;
- }
- dmu_buf_rele_array(dbp, numbufs, FTAG);
-}
-
-#ifdef _KERNEL
-int
-dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
-{
- dmu_buf_t **dbp;
- int numbufs, i, err;
-
- /*
- * NB: we could do this block-at-a-time, but it's nice
- * to be reading in parallel.
- */
- err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
- &numbufs, &dbp);
- if (err)
- return (err);
-
- for (i = 0; i < numbufs; i++) {
- int tocpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
-
- ASSERT(size > 0);
-
- bufoff = uio->uio_loffset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- err = uiomove((char *)db->db_data + bufoff, tocpy,
- UIO_READ, uio);
- if (err)
- break;
-
- size -= tocpy;
- }
- dmu_buf_rele_array(dbp, numbufs, FTAG);
-
- return (err);
-}
-
-int
-dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
- dmu_tx_t *tx)
-{
- dmu_buf_t **dbp;
- int numbufs, i;
- int err = 0;
-
- if (size == 0)
- return (0);
-
- err = dmu_buf_hold_array(os, object, uio->uio_loffset, size,
- FALSE, FTAG, &numbufs, &dbp);
- if (err)
- return (err);
-
- for (i = 0; i < numbufs; i++) {
- int tocpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
-
- ASSERT(size > 0);
-
- bufoff = uio->uio_loffset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
-
- if (tocpy == db->db_size)
- dmu_buf_will_fill(db, tx);
- else
- dmu_buf_will_dirty(db, tx);
-
- /*
-		 * XXX uiomove could block forever (e.g. nfs-backed
- * pages). There needs to be a uiolockdown() function
- * to lock the pages in memory, so that uiomove won't
- * block.
- */
- err = uiomove((char *)db->db_data + bufoff, tocpy,
- UIO_WRITE, uio);
-
- if (tocpy == db->db_size)
- dmu_buf_fill_done(db, tx);
-
- if (err)
- break;
-
- size -= tocpy;
- }
- dmu_buf_rele_array(dbp, numbufs, FTAG);
- return (err);
-}
-
-#ifndef __FreeBSD__
-int
-dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- page_t *pp, dmu_tx_t *tx)
-{
- dmu_buf_t **dbp;
- int numbufs, i;
- int err;
-
- if (size == 0)
- return (0);
-
- err = dmu_buf_hold_array(os, object, offset, size,
- FALSE, FTAG, &numbufs, &dbp);
- if (err)
- return (err);
-
- for (i = 0; i < numbufs; i++) {
- int tocpy, copied, thiscpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
- caddr_t va;
-
- ASSERT(size > 0);
- ASSERT3U(db->db_size, >=, PAGESIZE);
-
- bufoff = offset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
-
- if (tocpy == db->db_size)
- dmu_buf_will_fill(db, tx);
- else
- dmu_buf_will_dirty(db, tx);
-
- for (copied = 0; copied < tocpy; copied += PAGESIZE) {
- ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
- thiscpy = MIN(PAGESIZE, tocpy - copied);
- va = ppmapin(pp, PROT_READ, (caddr_t)-1);
- bcopy(va, (char *)db->db_data + bufoff, thiscpy);
- ppmapout(va);
- pp = pp->p_next;
- bufoff += PAGESIZE;
- }
-
- if (tocpy == db->db_size)
- dmu_buf_fill_done(db, tx);
-
- if (err)
- break;
-
- offset += tocpy;
- size -= tocpy;
- }
- dmu_buf_rele_array(dbp, numbufs, FTAG);
- return (err);
-}
-#endif /* !__FreeBSD__ */
-#endif /* _KERNEL */
-
-typedef struct {
- dbuf_dirty_record_t *dr;
- dmu_sync_cb_t *done;
- void *arg;
-} dmu_sync_arg_t;
-
-/* ARGSUSED */
-static void
-dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
-{
- dmu_sync_arg_t *in = varg;
- dbuf_dirty_record_t *dr = in->dr;
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dmu_sync_cb_t *done = in->done;
-
- if (!BP_IS_HOLE(zio->io_bp)) {
- zio->io_bp->blk_fill = 1;
- BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type);
- BP_SET_LEVEL(zio->io_bp, 0);
- }
-
- mutex_enter(&db->db_mtx);
- ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
- dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
- dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
- cv_broadcast(&db->db_changed);
- mutex_exit(&db->db_mtx);
-
- if (done)
- done(&(db->db), in->arg);
-
- kmem_free(in, sizeof (dmu_sync_arg_t));
-}
-
-/*
- * Intent log support: sync the block associated with db to disk.
- * N.B. and XXX: the caller is responsible for making sure that the
- * data isn't changing while dmu_sync() is writing it.
- *
- * Return values:
- *
- * EEXIST: this txg has already been synced, so there's nothing to do.
- * The caller should not log the write.
- *
- * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
- * The caller should not log the write.
- *
- * EALREADY: this block is already in the process of being synced.
- * The caller should track its progress (somehow).
- *
- * EINPROGRESS: the IO has been initiated.
- * The caller should log this blkptr in the callback.
- *
- * 0: completed. Sets *bp to the blkptr just written.
- * The caller should log this blkptr immediately.
- */
-int
-dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
- blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- objset_impl_t *os = db->db_objset;
- dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
- tx_state_t *tx = &dp->dp_tx;
- dbuf_dirty_record_t *dr;
- dmu_sync_arg_t *in;
- zbookmark_t zb;
- zio_t *zio;
- int zio_flags;
- int err;
-
- ASSERT(BP_IS_HOLE(bp));
- ASSERT(txg != 0);
-
- dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
- txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
-
- /*
- * XXX - would be nice if we could do this without suspending...
- */
- txg_suspend(dp);
-
- /*
- * If this txg already synced, there's nothing to do.
- */
- if (txg <= tx->tx_synced_txg) {
- txg_resume(dp);
- /*
- * If we're running ziltest, we need the blkptr regardless.
- */
- if (txg > spa_freeze_txg(dp->dp_spa)) {
- /* if db_blkptr == NULL, this was an empty write */
- if (db->db_blkptr)
- *bp = *db->db_blkptr; /* structure assignment */
- return (0);
- }
- return (EEXIST);
- }
-
- mutex_enter(&db->db_mtx);
-
- if (txg == tx->tx_syncing_txg) {
- while (db->db_data_pending) {
- /*
- * IO is in-progress. Wait for it to finish.
- * XXX - would be nice to be able to somehow "attach"
- * this zio to the parent zio passed in.
- */
- cv_wait(&db->db_changed, &db->db_mtx);
- if (!db->db_data_pending &&
- db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) {
- /*
- * IO was compressed away
- */
- *bp = *db->db_blkptr; /* structure assignment */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
- }
- ASSERT(db->db_data_pending ||
- (db->db_blkptr && db->db_blkptr->blk_birth == txg));
- }
-
- if (db->db_blkptr && db->db_blkptr->blk_birth == txg) {
- /*
- * IO is already completed.
- */
- *bp = *db->db_blkptr; /* structure assignment */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
- }
- }
-
- dr = db->db_last_dirty;
- while (dr && dr->dr_txg > txg)
- dr = dr->dr_next;
- if (dr == NULL || dr->dr_txg < txg) {
- /*
- * This dbuf isn't dirty, must have been free_range'd.
- * There's no need to log writes to freed blocks, so we're done.
- */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (ENOENT);
- }
-
- ASSERT(dr->dr_txg == txg);
- if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
- /*
- * We have already issued a sync write for this buffer.
- */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (EALREADY);
- } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- /*
- * This buffer has already been synced. It could not
- * have been dirtied since, or we would have cleared the state.
- */
- *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
- }
-
- dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
- in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
- in->dr = dr;
- in->done = done;
- in->arg = arg;
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
-
- zb.zb_objset = os->os_dsl_dataset->ds_object;
- zb.zb_object = db->db.db_object;
- zb.zb_level = db->db_level;
- zb.zb_blkid = db->db_blkid;
- zio_flags = ZIO_FLAG_MUSTSUCCEED;
- if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0)
- zio_flags |= ZIO_FLAG_METADATA;
- zio = arc_write(pio, os->os_spa,
- zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
- zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
- dmu_get_replication_level(os, &zb, db->db_dnode->dn_type),
- txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in,
- ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb);
-
- if (pio) {
- zio_nowait(zio);
- err = EINPROGRESS;
- } else {
- err = zio_wait(zio);
- ASSERT(err == 0);
- }
- return (err);
-}
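A minimal sketch of how an intent-log caller might dispatch on the return contract documented above; example_log_write is a hypothetical helper:

	static int
	example_log_write(zio_t *pio, dmu_buf_t *db, blkptr_t *bp, uint64_t txg,
	    dmu_sync_cb_t *done, void *arg)
	{
		int err = dmu_sync(pio, db, bp, txg, done, arg);

		switch (err) {
		case 0:			/* *bp is valid; log it immediately */
		case EINPROGRESS:	/* IO issued; log *bp from 'done' */
			break;
		case EEXIST:		/* txg already synced; don't log */
		case ENOENT:		/* block was freed; don't log */
		case EALREADY:		/* sync already in flight; track it */
			break;
		}
		return (err);
	}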
-
-int
-dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
- dmu_tx_t *tx)
-{
- dnode_t *dn;
- int err;
-
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
- err = dnode_set_blksz(dn, size, ibs, tx);
- dnode_rele(dn, FTAG);
- return (err);
-}
-
-void
-dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
- dmu_tx_t *tx)
-{
- dnode_t *dn;
-
- /* XXX assumes dnode_hold will not get an i/o error */
- (void) dnode_hold(os->os, object, FTAG, &dn);
- ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
- dn->dn_checksum = checksum;
- dnode_setdirty(dn, tx);
- dnode_rele(dn, FTAG);
-}
-
-void
-dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
- dmu_tx_t *tx)
-{
- dnode_t *dn;
-
- /* XXX assumes dnode_hold will not get an i/o error */
- (void) dnode_hold(os->os, object, FTAG, &dn);
- ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
- dn->dn_compress = compress;
- dnode_setdirty(dn, tx);
- dnode_rele(dn, FTAG);
-}
-
-int
-dmu_get_replication_level(objset_impl_t *os,
- zbookmark_t *zb, dmu_object_type_t ot)
-{
- int ncopies = os->os_copies;
-
- /* If it's the mos, it should have max copies set. */
- ASSERT(zb->zb_objset != 0 ||
- ncopies == spa_max_replication(os->os_spa));
-
- if (dmu_ot[ot].ot_metadata || zb->zb_level != 0)
- ncopies++;
- return (MIN(ncopies, spa_max_replication(os->os_spa)));
-}
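For example, with the default copies=1 property on a pool where spa_max_replication() is 3, a level-0 data block keeps one copy while any metadata block (or any block with zb_level > 0) is bumped to two; the MIN() clamp only matters once the copies property already sits at the pool maximum.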
-
-int
-dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
-{
- dnode_t *dn;
- int i, err;
-
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
- /*
- * Sync any current changes before
- * we go trundling through the block pointers.
- */
- for (i = 0; i < TXG_SIZE; i++) {
- if (list_link_active(&dn->dn_dirty_link[i]))
- break;
- }
- if (i != TXG_SIZE) {
- dnode_rele(dn, FTAG);
- txg_wait_synced(dmu_objset_pool(os), 0);
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
- }
-
- err = dnode_next_offset(dn, hole, off, 1, 1, 0);
- dnode_rele(dn, FTAG);
-
- return (err);
-}
-
-void
-dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
-{
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- mutex_enter(&dn->dn_mtx);
-
- doi->doi_data_block_size = dn->dn_datablksz;
- doi->doi_metadata_block_size = dn->dn_indblkshift ?
- 1ULL << dn->dn_indblkshift : 0;
- doi->doi_indirection = dn->dn_nlevels;
- doi->doi_checksum = dn->dn_checksum;
- doi->doi_compress = dn->dn_compress;
- doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
- SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
- doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
- doi->doi_type = dn->dn_type;
- doi->doi_bonus_size = dn->dn_bonuslen;
- doi->doi_bonus_type = dn->dn_bonustype;
-
- mutex_exit(&dn->dn_mtx);
- rw_exit(&dn->dn_struct_rwlock);
-}
-
-/*
- * Get information on a DMU object.
- * If doi is NULL, just indicates whether the object exists.
- */
-int
-dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
-{
- dnode_t *dn;
- int err = dnode_hold(os->os, object, FTAG, &dn);
-
- if (err)
- return (err);
-
- if (doi != NULL)
- dmu_object_info_from_dnode(dn, doi);
-
- dnode_rele(dn, FTAG);
- return (0);
-}
-
-/*
- * As above, but faster; can be used when you have a held dbuf in hand.
- */
-void
-dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
-{
- dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
-}
-
-/*
- * Faster still when you only care about the size.
- * This is specifically optimized for zfs_getattr().
- */
-void
-dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
-{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
-
- *blksize = dn->dn_datablksz;
- /* add 1 for dnode space */
- *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
- SPA_MINBLOCKSHIFT) + 1;
-}
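As a worked example of the rounding above: a dnode with DN_USED_BYTES of 10,000 reports nblk512 = ((10000 + 256) >> 9) + 1 = 21, i.e. the used bytes rounded to the nearest 512-byte sector, plus one sector charged for the dnode itself.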
-
-void
-byteswap_uint64_array(void *vbuf, size_t size)
-{
- uint64_t *buf = vbuf;
- size_t count = size >> 3;
- int i;
-
- ASSERT((size & 7) == 0);
-
- for (i = 0; i < count; i++)
- buf[i] = BSWAP_64(buf[i]);
-}
-
-void
-byteswap_uint32_array(void *vbuf, size_t size)
-{
- uint32_t *buf = vbuf;
- size_t count = size >> 2;
- int i;
-
- ASSERT((size & 3) == 0);
-
- for (i = 0; i < count; i++)
- buf[i] = BSWAP_32(buf[i]);
-}
-
-void
-byteswap_uint16_array(void *vbuf, size_t size)
-{
- uint16_t *buf = vbuf;
- size_t count = size >> 1;
- int i;
-
- ASSERT((size & 1) == 0);
-
- for (i = 0; i < count; i++)
- buf[i] = BSWAP_16(buf[i]);
-}
-
-/* ARGSUSED */
-void
-byteswap_uint8_array(void *vbuf, size_t size)
-{
-}
-
-void
-dmu_init(void)
-{
- dbuf_init();
- dnode_init();
- arc_init();
-}
-
-void
-dmu_fini(void)
-{
- arc_fini();
- dnode_fini();
- dbuf_fini();
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
deleted file mode 100644
index 93168cc..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_tx.h>
-#include <sys/dnode.h>
-
-uint64_t
-dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- objset_impl_t *osi = os->os;
- uint64_t object;
- uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
- (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
- dnode_t *dn = NULL;
- int restarted = B_FALSE;
-
- mutex_enter(&osi->os_obj_lock);
- for (;;) {
- object = osi->os_obj_next;
- /*
- * Each time we polish off an L2 bp worth of dnodes
- * (2^13 objects), move to another L2 bp that's still
- * reasonably sparse (at most 1/4 full). Look from the
- * beginning once, but after that keep looking from here.
- * If we can't find one, just keep going from here.
- */
- if (P2PHASE(object, L2_dnode_count) == 0) {
- uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
- int error = dnode_next_offset(osi->os_meta_dnode,
- B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2, 0);
- restarted = B_TRUE;
- if (error == 0)
- object = offset >> DNODE_SHIFT;
- }
- osi->os_obj_next = ++object;
-
- /*
-		 * XXX We should check for an i/o error here and return it
-		 * up to our caller.  Actually we should pre-read it in
- * dmu_tx_assign(), but there is currently no mechanism
- * to do so.
- */
- (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE,
- FTAG, &dn);
- if (dn)
- break;
-
- if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
- osi->os_obj_next = object - 1;
- }
-
- dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
- dnode_rele(dn, FTAG);
-
- mutex_exit(&osi->os_obj_lock);
-
- dmu_tx_add_new_object(tx, os, object);
- return (object);
-}
-
-int
-dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- dnode_t *dn;
- int err;
-
- if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
- return (EBADF);
-
- err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
- if (err)
- return (err);
- dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
- dnode_rele(dn, FTAG);
-
- dmu_tx_add_new_object(tx, os, object);
- return (0);
-}
-
-int
-dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- dnode_t *dn;
- int err;
-
- if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
- return (EBADF);
-
- err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
- FTAG, &dn);
- if (err)
- return (err);
- dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
- dnode_rele(dn, FTAG);
-
- return (0);
-}
-
-int
-dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
-{
- dnode_t *dn;
- int err;
-
- ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
-
- err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
- FTAG, &dn);
- if (err)
- return (err);
-
- ASSERT(dn->dn_type != DMU_OT_NONE);
- dnode_free(dn, tx);
- dnode_rele(dn, FTAG);
-
- return (0);
-}
-
-int
-dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
-{
- uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
- int error;
-
- error = dnode_next_offset(os->os->os_meta_dnode,
- hole, &offset, 0, DNODES_PER_BLOCK, txg);
-
- *objectp = offset >> DNODE_SHIFT;
-
- return (error);
-}
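A minimal sketch of iterating all allocated objects with this cursor, assuming (as with dnode_next_offset()'s other callers) that ESRCH signals exhaustion; example_walk_objects is a hypothetical helper:

	static int
	example_walk_objects(objset_t *os)
	{
		uint64_t obj = 0;	/* start scanning above object 0 */
		int err;

		while ((err = dmu_object_next(os, &obj, B_FALSE, 0)) == 0) {
			/* obj is now the next allocated object number */
		}
		return (err == ESRCH ? 0 : err);
	}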
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
deleted file mode 100644
index 378fe8c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
+++ /dev/null
@@ -1,1037 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dnode.h>
-#include <sys/dbuf.h>
-#include <sys/zvol.h>
-#include <sys/dmu_tx.h>
-#include <sys/zio_checksum.h>
-#include <sys/zap.h>
-#include <sys/zil.h>
-#include <sys/dmu_impl.h>
-
-
-spa_t *
-dmu_objset_spa(objset_t *os)
-{
- return (os->os->os_spa);
-}
-
-zilog_t *
-dmu_objset_zil(objset_t *os)
-{
- return (os->os->os_zil);
-}
-
-dsl_pool_t *
-dmu_objset_pool(objset_t *os)
-{
- dsl_dataset_t *ds;
-
- if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir)
- return (ds->ds_dir->dd_pool);
- else
- return (spa_get_dsl(os->os->os_spa));
-}
-
-dsl_dataset_t *
-dmu_objset_ds(objset_t *os)
-{
- return (os->os->os_dsl_dataset);
-}
-
-dmu_objset_type_t
-dmu_objset_type(objset_t *os)
-{
- return (os->os->os_phys->os_type);
-}
-
-void
-dmu_objset_name(objset_t *os, char *buf)
-{
- dsl_dataset_name(os->os->os_dsl_dataset, buf);
-}
-
-uint64_t
-dmu_objset_id(objset_t *os)
-{
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
-
- return (ds ? ds->ds_object : 0);
-}
-
-static void
-checksum_changed_cb(void *arg, uint64_t newval)
-{
- objset_impl_t *osi = arg;
-
- /*
- * Inheritance should have been done by now.
- */
- ASSERT(newval != ZIO_CHECKSUM_INHERIT);
-
- osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
-}
-
-static void
-compression_changed_cb(void *arg, uint64_t newval)
-{
- objset_impl_t *osi = arg;
-
- /*
- * Inheritance and range checking should have been done by now.
- */
- ASSERT(newval != ZIO_COMPRESS_INHERIT);
-
- osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
-}
-
-static void
-copies_changed_cb(void *arg, uint64_t newval)
-{
- objset_impl_t *osi = arg;
-
- /*
- * Inheritance and range checking should have been done by now.
- */
- ASSERT(newval > 0);
- ASSERT(newval <= spa_max_replication(osi->os_spa));
-
- osi->os_copies = newval;
-}
-
-void
-dmu_objset_byteswap(void *buf, size_t size)
-{
- objset_phys_t *osp = buf;
-
- ASSERT(size == sizeof (objset_phys_t));
- dnode_byteswap(&osp->os_meta_dnode);
- byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
- osp->os_type = BSWAP_64(osp->os_type);
-}
-
-int
-dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
- objset_impl_t **osip)
-{
- objset_impl_t *winner, *osi;
- int i, err, checksum;
-
- osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP);
- osi->os.os = osi;
- osi->os_dsl_dataset = ds;
- osi->os_spa = spa;
- osi->os_rootbp = bp;
- if (!BP_IS_HOLE(osi->os_rootbp)) {
- uint32_t aflags = ARC_WAIT;
- zbookmark_t zb;
- zb.zb_objset = ds ? ds->ds_object : 0;
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = 0;
-
- dprintf_bp(osi->os_rootbp, "reading %s", "");
- err = arc_read(NULL, spa, osi->os_rootbp,
- dmu_ot[DMU_OT_OBJSET].ot_byteswap,
- arc_getbuf_func, &osi->os_phys_buf,
- ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
- if (err) {
- kmem_free(osi, sizeof (objset_impl_t));
- return (err);
- }
- osi->os_phys = osi->os_phys_buf->b_data;
- arc_release(osi->os_phys_buf, &osi->os_phys_buf);
- } else {
- osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t),
- &osi->os_phys_buf, ARC_BUFC_METADATA);
- osi->os_phys = osi->os_phys_buf->b_data;
- bzero(osi->os_phys, sizeof (objset_phys_t));
- }
-
- /*
- * Note: the changed_cb will be called once before the register
- * func returns, thus changing the checksum/compression from the
- * default (fletcher2/off). Snapshots don't need to know, and
- * registering would complicate clone promotion.
- */
- if (ds && ds->ds_phys->ds_num_children == 0) {
- err = dsl_prop_register(ds, "checksum",
- checksum_changed_cb, osi);
- if (err == 0)
- err = dsl_prop_register(ds, "compression",
- compression_changed_cb, osi);
- if (err == 0)
- err = dsl_prop_register(ds, "copies",
- copies_changed_cb, osi);
- if (err) {
- VERIFY(arc_buf_remove_ref(osi->os_phys_buf,
- &osi->os_phys_buf) == 1);
- kmem_free(osi, sizeof (objset_impl_t));
- return (err);
- }
- } else if (ds == NULL) {
- /* It's the meta-objset. */
- osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
- osi->os_compress = ZIO_COMPRESS_LZJB;
- osi->os_copies = spa_max_replication(spa);
- }
-
- osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
-
- /*
- * Metadata always gets compressed and checksummed.
- * If the data checksum is multi-bit correctable, and it's not
- * a ZBT-style checksum, then it's suitable for metadata as well.
- * Otherwise, the metadata checksum defaults to fletcher4.
- */
- checksum = osi->os_checksum;
-
- if (zio_checksum_table[checksum].ci_correctable &&
- !zio_checksum_table[checksum].ci_zbt)
- osi->os_md_checksum = checksum;
- else
- osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4;
- osi->os_md_compress = ZIO_COMPRESS_LZJB;
-
- for (i = 0; i < TXG_SIZE; i++) {
- list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
- offsetof(dnode_t, dn_dirty_link[i]));
- list_create(&osi->os_free_dnodes[i], sizeof (dnode_t),
- offsetof(dnode_t, dn_dirty_link[i]));
- }
- list_create(&osi->os_dnodes, sizeof (dnode_t),
- offsetof(dnode_t, dn_link));
- list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
- offsetof(dmu_buf_impl_t, db_link));
-
- mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
-
- osi->os_meta_dnode = dnode_special_open(osi,
- &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
-
- if (ds != NULL) {
- winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict);
- if (winner) {
- dmu_objset_evict(ds, osi);
- osi = winner;
- }
- }
-
- *osip = osi;
- return (0);
-}
-
-/* called from zpl */
-int
-dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
- objset_t **osp)
-{
- dsl_dataset_t *ds;
- int err;
- objset_t *os;
- objset_impl_t *osi;
-
- os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
- err = dsl_dataset_open(name, mode, os, &ds);
- if (err) {
- kmem_free(os, sizeof (objset_t));
- return (err);
- }
-
- osi = dsl_dataset_get_user_ptr(ds);
- if (osi == NULL) {
- err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
- ds, &ds->ds_phys->ds_bp, &osi);
- if (err) {
- dsl_dataset_close(ds, mode, os);
- kmem_free(os, sizeof (objset_t));
- return (err);
- }
- }
-
- os->os = osi;
- os->os_mode = mode;
-
- if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) {
- dmu_objset_close(os);
- return (EINVAL);
- }
- *osp = os;
- return (0);
-}
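The open/close contract in miniature, assuming DS_MODE_STANDARD (the mode used by other callers in this file); example_with_objset is a hypothetical helper:

	static int
	example_with_objset(const char *name)
	{
		objset_t *os;
		int err;

		err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_STANDARD, &os);
		if (err)
			return (err);
		/* ... use os; every successful open needs a matching close ... */
		dmu_objset_close(os);
		return (0);
	}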
-
-void
-dmu_objset_close(objset_t *os)
-{
- dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os);
- kmem_free(os, sizeof (objset_t));
-}
-
-int
-dmu_objset_evict_dbufs(objset_t *os, int try)
-{
- objset_impl_t *osi = os->os;
- dnode_t *dn;
-
- mutex_enter(&osi->os_lock);
-
- /* process the mdn last, since the other dnodes have holds on it */
- list_remove(&osi->os_dnodes, osi->os_meta_dnode);
- list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode);
-
- /*
- * Find the first dnode with holds. We have to do this dance
- * because dnode_add_ref() only works if you already have a
-	 * hold. If a dnode has no holds, it has no dbufs, so it is
-	 * safe to skip.
- */
- for (dn = list_head(&osi->os_dnodes);
- dn && refcount_is_zero(&dn->dn_holds);
- dn = list_next(&osi->os_dnodes, dn))
- continue;
- if (dn)
- dnode_add_ref(dn, FTAG);
-
- while (dn) {
- dnode_t *next_dn = dn;
-
- do {
- next_dn = list_next(&osi->os_dnodes, next_dn);
- } while (next_dn && refcount_is_zero(&next_dn->dn_holds));
- if (next_dn)
- dnode_add_ref(next_dn, FTAG);
-
- mutex_exit(&osi->os_lock);
- if (dnode_evict_dbufs(dn, try)) {
- dnode_rele(dn, FTAG);
- if (next_dn)
- dnode_rele(next_dn, FTAG);
- return (1);
- }
- dnode_rele(dn, FTAG);
- mutex_enter(&osi->os_lock);
- dn = next_dn;
- }
- mutex_exit(&osi->os_lock);
- return (0);
-}
-
-void
-dmu_objset_evict(dsl_dataset_t *ds, void *arg)
-{
- objset_impl_t *osi = arg;
- objset_t os;
- int i;
-
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL);
- ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
- }
-
- if (ds && ds->ds_phys->ds_num_children == 0) {
- VERIFY(0 == dsl_prop_unregister(ds, "checksum",
- checksum_changed_cb, osi));
- VERIFY(0 == dsl_prop_unregister(ds, "compression",
- compression_changed_cb, osi));
- VERIFY(0 == dsl_prop_unregister(ds, "copies",
- copies_changed_cb, osi));
- }
-
- /*
- * We should need only a single pass over the dnode list, since
- * nothing can be added to the list at this point.
- */
- os.os = osi;
- (void) dmu_objset_evict_dbufs(&os, 0);
-
- ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
- ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
- ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL);
-
- dnode_special_close(osi->os_meta_dnode);
- zil_free(osi->os_zil);
-
- VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
- mutex_destroy(&osi->os_lock);
- mutex_destroy(&osi->os_obj_lock);
- kmem_free(osi, sizeof (objset_impl_t));
-}
-
-/* called from dsl for meta-objset */
-objset_impl_t *
-dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
- dmu_objset_type_t type, dmu_tx_t *tx)
-{
- objset_impl_t *osi;
- dnode_t *mdn;
-
- ASSERT(dmu_tx_is_syncing(tx));
- VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi));
- mdn = osi->os_meta_dnode;
-
- dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
- DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
-
- /*
- * We don't want to have to increase the meta-dnode's nlevels
-	 * later, because then we could do it in quiescing context while
- * we are also accessing it in open context.
- *
- * This precaution is not necessary for the MOS (ds == NULL),
- * because the MOS is only updated in syncing context.
- * This is most fortunate: the MOS is the only objset that
- * needs to be synced multiple times as spa_sync() iterates
- * to convergence, so minimizing its dn_nlevels matters.
- */
- if (ds != NULL) {
- int levels = 1;
-
- /*
- * Determine the number of levels necessary for the meta-dnode
- * to contain DN_MAX_OBJECT dnodes.
- */
- while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
- (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
- DN_MAX_OBJECT * sizeof (dnode_phys_t))
- levels++;
-
- mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
- mdn->dn_nlevels = levels;
- }
-
- ASSERT(type != DMU_OST_NONE);
- ASSERT(type != DMU_OST_ANY);
- ASSERT(type < DMU_OST_NUMTYPES);
- osi->os_phys->os_type = type;
-
- dsl_dataset_dirty(ds, tx);
-
- return (osi);
-}
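To make the level computation concrete: assuming the contemporaneous constants (16K meta-dnode data blocks, 16K indirect blocks so each extra level multiplies capacity by 2^(14 - 7) = 128, three block pointers in the meta-dnode, and 2^48 maximum objects of 512 bytes each, i.e. 2^57 bytes to cover), the loop terminates at levels = 7.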
-
-struct oscarg {
- void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
- void *userarg;
- dsl_dataset_t *clone_parent;
- const char *lastname;
- dmu_objset_type_t type;
-};
-
-/* ARGSUSED */
-static int
-dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- struct oscarg *oa = arg2;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- int err;
- uint64_t ddobj;
-
- err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
- oa->lastname, sizeof (uint64_t), 1, &ddobj);
- if (err != ENOENT)
- return (err ? err : EEXIST);
-
- if (oa->clone_parent != NULL) {
- /*
- * You can't clone across pools.
- */
- if (oa->clone_parent->ds_dir->dd_pool != dd->dd_pool)
- return (EXDEV);
-
- /*
- * You can only clone snapshots, not the head datasets.
- */
- if (oa->clone_parent->ds_phys->ds_num_children == 0)
- return (EINVAL);
- }
- return (0);
-}
-
-static void
-dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- struct oscarg *oa = arg2;
- dsl_dataset_t *ds;
- blkptr_t *bp;
- uint64_t dsobj;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- dsobj = dsl_dataset_create_sync(dd, oa->lastname,
- oa->clone_parent, tx);
-
- VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
- DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds));
- bp = dsl_dataset_get_blkptr(ds);
- if (BP_IS_HOLE(bp)) {
- objset_impl_t *osi;
-
- /* This is an empty dmu_objset; not a clone. */
- osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
- ds, bp, oa->type, tx);
-
- if (oa->userfunc)
- oa->userfunc(&osi->os, oa->userarg, tx);
- }
- dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
-}
-
-int
-dmu_objset_create(const char *name, dmu_objset_type_t type,
- objset_t *clone_parent,
- void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg)
-{
- dsl_dir_t *pdd;
- const char *tail;
- int err = 0;
- struct oscarg oa = { 0 };
-
- ASSERT(strchr(name, '@') == NULL);
- err = dsl_dir_open(name, FTAG, &pdd, &tail);
- if (err)
- return (err);
- if (tail == NULL) {
- dsl_dir_close(pdd, FTAG);
- return (EEXIST);
- }
-
- dprintf("name=%s\n", name);
-
- oa.userfunc = func;
- oa.userarg = arg;
- oa.lastname = tail;
- oa.type = type;
- if (clone_parent != NULL) {
- /*
- * You can't clone to a different type.
- */
- if (clone_parent->os->os_phys->os_type != type) {
- dsl_dir_close(pdd, FTAG);
- return (EINVAL);
- }
- oa.clone_parent = clone_parent->os->os_dsl_dataset;
- }
- err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
- dmu_objset_create_sync, pdd, &oa, 5);
- dsl_dir_close(pdd, FTAG);
- return (err);
-}
-
-int
-dmu_objset_destroy(const char *name)
-{
- objset_t *os;
- int error;
-
- /*
- * If it looks like we'll be able to destroy it, and there's
- * an unplayed replay log sitting around, destroy the log.
- * It would be nicer to do this in dsl_dataset_destroy_sync(),
- * but the replay log objset is modified in open context.
- */
- error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
- if (error == 0) {
- zil_destroy(dmu_objset_zil(os), B_FALSE);
- dmu_objset_close(os);
- }
-
- return (dsl_dataset_destroy(name));
-}
-
-int
-dmu_objset_rollback(const char *name)
-{
- int err;
- objset_t *os;
-
- err = dmu_objset_open(name, DMU_OST_ANY,
- DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
- if (err == 0) {
- err = zil_suspend(dmu_objset_zil(os));
- if (err == 0)
- zil_resume(dmu_objset_zil(os));
- if (err == 0) {
- /* XXX uncache everything? */
- err = dsl_dataset_rollback(os->os->os_dsl_dataset);
- }
- dmu_objset_close(os);
- }
- return (err);
-}
-
-struct snaparg {
- dsl_sync_task_group_t *dstg;
- char *snapname;
- char failed[MAXPATHLEN];
-};
-
-static int
-dmu_objset_snapshot_one(char *name, void *arg)
-{
- struct snaparg *sn = arg;
- objset_t *os;
- dmu_objset_stats_t stat;
- int err;
-
- (void) strcpy(sn->failed, name);
-
- err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_STANDARD, &os);
- if (err != 0)
- return (err);
-
- /*
- * If the objset is in an inconsistent state, return busy.
- */
- dmu_objset_fast_stat(os, &stat);
- if (stat.dds_inconsistent) {
- dmu_objset_close(os);
- return (EBUSY);
- }
-
- /*
- * NB: we need to wait for all in-flight changes to get to disk,
- * so that we snapshot those changes. zil_suspend does this as
- * a side effect.
- */
- err = zil_suspend(dmu_objset_zil(os));
- if (err == 0) {
- dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check,
- dsl_dataset_snapshot_sync, os, sn->snapname, 3);
- } else {
- dmu_objset_close(os);
- }
-
- return (err);
-}
-
-int
-dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
-{
- dsl_sync_task_t *dst;
- struct snaparg sn = { 0 };
- char *cp;
- spa_t *spa;
- int err;
-
- (void) strcpy(sn.failed, fsname);
-
- cp = strchr(fsname, '/');
- if (cp) {
- *cp = '\0';
- err = spa_open(fsname, &spa, FTAG);
- *cp = '/';
- } else {
- err = spa_open(fsname, &spa, FTAG);
- }
- if (err)
- return (err);
-
- sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
- sn.snapname = snapname;
-
- if (recursive) {
- err = dmu_objset_find(fsname,
- dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN);
- } else {
- err = dmu_objset_snapshot_one(fsname, &sn);
- }
-
- if (err)
- goto out;
-
- err = dsl_sync_task_group_wait(sn.dstg);
-
- for (dst = list_head(&sn.dstg->dstg_tasks); dst;
- dst = list_next(&sn.dstg->dstg_tasks, dst)) {
- objset_t *os = dst->dst_arg1;
- if (dst->dst_err)
- dmu_objset_name(os, sn.failed);
- zil_resume(dmu_objset_zil(os));
- dmu_objset_close(os);
- }
-out:
- if (err)
- (void) strcpy(fsname, sn.failed);
- dsl_sync_task_group_destroy(sn.dstg);
- spa_close(spa, FTAG);
- return (err);
-}
-
-static void
-dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx)
-{
- dnode_t *dn;
-
- while (dn = list_head(list)) {
- ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
- ASSERT(dn->dn_dbuf->db_data_pending);
- /*
- * Initialize dn_zio outside dnode_sync()
-		 * to accommodate the meta-dnode
- */
- dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
- ASSERT(dn->dn_zio);
-
- ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
- list_remove(list, dn);
- dnode_sync(dn, tx);
- }
-}
-
-/* ARGSUSED */
-static void
-ready(zio_t *zio, arc_buf_t *abuf, void *arg)
-{
- objset_impl_t *os = arg;
- blkptr_t *bp = os->os_rootbp;
- dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
- int i;
-
- /*
- * Update rootbp fill count.
- */
- bp->blk_fill = 1; /* count the meta-dnode */
- for (i = 0; i < dnp->dn_nblkptr; i++)
- bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
-}
-
-/* ARGSUSED */
-static void
-killer(zio_t *zio, arc_buf_t *abuf, void *arg)
-{
- objset_impl_t *os = arg;
-
- ASSERT3U(zio->io_error, ==, 0);
-
- BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET);
- BP_SET_LEVEL(zio->io_bp, 0);
-
- if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
- BP_IDENTITY(&zio->io_bp_orig))) {
- if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
- dsl_dataset_block_kill(os->os_dsl_dataset,
- &zio->io_bp_orig, NULL, os->os_synctx);
- dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp,
- os->os_synctx);
- }
- arc_release(os->os_phys_buf, &os->os_phys_buf);
-}
-
-/* called from dsl */
-void
-dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
-{
- int txgoff;
- zbookmark_t zb;
- zio_t *zio;
- list_t *list;
- dbuf_dirty_record_t *dr;
- int zio_flags;
-
- dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
-
- ASSERT(dmu_tx_is_syncing(tx));
- /* XXX the write_done callback should really give us the tx... */
- os->os_synctx = tx;
-
- if (os->os_dsl_dataset == NULL) {
- /*
- * This is the MOS. If we have upgraded,
- * spa_max_replication() could change, so reset
- * os_copies here.
- */
- os->os_copies = spa_max_replication(os->os_spa);
- }
-
- /*
- * Create the root block IO
- */
- zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = 0;
- zio_flags = ZIO_FLAG_MUSTSUCCEED;
- if (dmu_ot[DMU_OT_OBJSET].ot_metadata || zb.zb_level != 0)
- zio_flags |= ZIO_FLAG_METADATA;
- if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg))
- dsl_dataset_block_kill(os->os_dsl_dataset,
- os->os_rootbp, pio, tx);
- zio = arc_write(pio, os->os_spa, os->os_md_checksum,
- os->os_md_compress,
- dmu_get_replication_level(os, &zb, DMU_OT_OBJSET),
- tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os,
- ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
-
- /*
- * Sync meta-dnode - the parent IO for the sync is the root block
- */
- os->os_meta_dnode->dn_zio = zio;
- dnode_sync(os->os_meta_dnode, tx);
-
- txgoff = tx->tx_txg & TXG_MASK;
-
- dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx);
- dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx);
-
- list = &os->os_meta_dnode->dn_dirty_records[txgoff];
- while (dr = list_head(list)) {
- ASSERT(dr->dr_dbuf->db_level == 0);
- list_remove(list, dr);
- if (dr->dr_zio)
- zio_nowait(dr->dr_zio);
- }
- /*
- * Free intent log blocks up to this tx.
- */
- zil_sync(os->os_zil, tx);
- zio_nowait(zio);
-}
-
-void
-dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
- uint64_t *usedobjsp, uint64_t *availobjsp)
-{
- dsl_dataset_space(os->os->os_dsl_dataset, refdbytesp, availbytesp,
- usedobjsp, availobjsp);
-}
-
-uint64_t
-dmu_objset_fsid_guid(objset_t *os)
-{
- return (dsl_dataset_fsid_guid(os->os->os_dsl_dataset));
-}
-
-void
-dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
-{
- stat->dds_type = os->os->os_phys->os_type;
- if (os->os->os_dsl_dataset)
- dsl_dataset_fast_stat(os->os->os_dsl_dataset, stat);
-}
-
-void
-dmu_objset_stats(objset_t *os, nvlist_t *nv)
-{
- ASSERT(os->os->os_dsl_dataset ||
- os->os->os_phys->os_type == DMU_OST_META);
-
- if (os->os->os_dsl_dataset != NULL)
- dsl_dataset_stats(os->os->os_dsl_dataset, nv);
-
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
- os->os->os_phys->os_type);
-}
-
-int
-dmu_objset_is_snapshot(objset_t *os)
-{
- if (os->os->os_dsl_dataset != NULL)
- return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset));
- else
- return (B_FALSE);
-}
-
-int
-dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
- uint64_t *idp, uint64_t *offp)
-{
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
- zap_cursor_t cursor;
- zap_attribute_t attr;
-
- if (ds->ds_phys->ds_snapnames_zapobj == 0)
- return (ENOENT);
-
- zap_cursor_init_serialized(&cursor,
- ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_snapnames_zapobj, *offp);
-
- if (zap_cursor_retrieve(&cursor, &attr) != 0) {
- zap_cursor_fini(&cursor);
- return (ENOENT);
- }
-
- if (strlen(attr.za_name) + 1 > namelen) {
- zap_cursor_fini(&cursor);
- return (ENAMETOOLONG);
- }
-
- (void) strcpy(name, attr.za_name);
- if (idp)
- *idp = attr.za_first_integer;
- zap_cursor_advance(&cursor);
- *offp = zap_cursor_serialize(&cursor);
- zap_cursor_fini(&cursor);
-
- return (0);
-}
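A minimal sketch of the serialized-cursor pattern this enables, assuming a MAXNAMELEN buffer is large enough for snapshot names; example_list_snapshots is a hypothetical helper:

	static void
	example_list_snapshots(objset_t *os)
	{
		char name[MAXNAMELEN];
		uint64_t id, off = 0;

		while (dmu_snapshot_list_next(os, sizeof (name), name,
		    &id, &off) == 0) {
			/* one snapshot per call; off resumes the walk */
		}
	}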
-
-int
-dmu_dir_list_next(objset_t *os, int namelen, char *name,
- uint64_t *idp, uint64_t *offp)
-{
- dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir;
- zap_cursor_t cursor;
- zap_attribute_t attr;
-
- /* there is no next dir on a snapshot! */
- if (os->os->os_dsl_dataset->ds_object !=
- dd->dd_phys->dd_head_dataset_obj)
- return (ENOENT);
-
- zap_cursor_init_serialized(&cursor,
- dd->dd_pool->dp_meta_objset,
- dd->dd_phys->dd_child_dir_zapobj, *offp);
-
- if (zap_cursor_retrieve(&cursor, &attr) != 0) {
- zap_cursor_fini(&cursor);
- return (ENOENT);
- }
-
- if (strlen(attr.za_name) + 1 > namelen) {
- zap_cursor_fini(&cursor);
- return (ENAMETOOLONG);
- }
-
- (void) strcpy(name, attr.za_name);
- if (idp)
- *idp = attr.za_first_integer;
- zap_cursor_advance(&cursor);
- *offp = zap_cursor_serialize(&cursor);
- zap_cursor_fini(&cursor);
-
- return (0);
-}
-
-/*
- * Find all objsets under name, and for each, call 'func(child_name, arg)'.
- */
-int
-dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
-{
- dsl_dir_t *dd;
- objset_t *os;
- uint64_t snapobj;
- zap_cursor_t zc;
- zap_attribute_t *attr;
- char *child;
- int do_self, err;
-
- err = dsl_dir_open(name, FTAG, &dd, NULL);
- if (err)
- return (err);
-
- /* NB: the $MOS dir doesn't have a head dataset */
- do_self = (dd->dd_phys->dd_head_dataset_obj != 0);
- attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
-
- /*
- * Iterate over all children.
- */
- if (flags & DS_FIND_CHILDREN) {
- for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset,
- dd->dd_phys->dd_child_dir_zapobj);
- zap_cursor_retrieve(&zc, attr) == 0;
- (void) zap_cursor_advance(&zc)) {
- ASSERT(attr->za_integer_length == sizeof (uint64_t));
- ASSERT(attr->za_num_integers == 1);
-
- /*
-			 * Build the child's full name: the parent's name,
-			 * a separating '/', then the entry name.
- */
- child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- /* XXX could probably just use name here */
- dsl_dir_name(dd, child);
- (void) strcat(child, "/");
- (void) strcat(child, attr->za_name);
- err = dmu_objset_find(child, func, arg, flags);
- kmem_free(child, MAXPATHLEN);
- if (err)
- break;
- }
- zap_cursor_fini(&zc);
-
- if (err) {
- dsl_dir_close(dd, FTAG);
- kmem_free(attr, sizeof (zap_attribute_t));
- return (err);
- }
- }
-
- /*
- * Iterate over all snapshots.
- */
- if ((flags & DS_FIND_SNAPSHOTS) &&
- dmu_objset_open(name, DMU_OST_ANY,
- DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) {
-
- snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj;
- dmu_objset_close(os);
-
- for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj);
- zap_cursor_retrieve(&zc, attr) == 0;
- (void) zap_cursor_advance(&zc)) {
- ASSERT(attr->za_integer_length == sizeof (uint64_t));
- ASSERT(attr->za_num_integers == 1);
-
- child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- /* XXX could probably just use name here */
- dsl_dir_name(dd, child);
- (void) strcat(child, "@");
- (void) strcat(child, attr->za_name);
- err = func(child, arg);
- kmem_free(child, MAXPATHLEN);
- if (err)
- break;
- }
- zap_cursor_fini(&zc);
- }
-
- dsl_dir_close(dd, FTAG);
- kmem_free(attr, sizeof (zap_attribute_t));
-
- if (err)
- return (err);
-
- /*
- * Apply to self if appropriate.
- */
- if (do_self)
- err = func(name, arg);
- return (err);
-}
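A minimal sketch of the callback contract: func receives each dataset's full name plus the pass-through arg, and any non-zero return aborts the walk. example_visit_cb is a hypothetical callback:

	static int
	example_visit_cb(char *name, void *arg)
	{
		dprintf("visited %s\n", name);
		return (0);	/* non-zero would stop dmu_objset_find() */
	}

	/*
	 * usage: dmu_objset_find(poolname, example_visit_cb, NULL,
	 *     DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
	 */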
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
deleted file mode 100644
index 3e55dc3..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ /dev/null
@@ -1,1009 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_tx.h>
-#include <sys/dbuf.h>
-#include <sys/dnode.h>
-#include <sys/zfs_context.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_synctask.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zap.h>
-#include <sys/zio_checksum.h>
-
-struct backuparg {
- dmu_replay_record_t *drr;
- kthread_t *td;
- struct file *fp;
- objset_t *os;
- zio_cksum_t zc;
- int err;
-};
-
-static int
-dump_bytes(struct backuparg *ba, void *buf, int len)
-{
- struct uio auio;
- struct iovec aiov;
-
- ASSERT3U(len % 8, ==, 0);
-
- fletcher_4_incremental_native(buf, len, &ba->zc);
-
- aiov.iov_base = buf;
- aiov.iov_len = len;
- auio.uio_iov = &aiov;
- auio.uio_iovcnt = 1;
- auio.uio_resid = len;
- auio.uio_segflg = UIO_SYSSPACE;
- auio.uio_rw = UIO_WRITE;
- auio.uio_offset = (off_t)-1;
- auio.uio_td = ba->td;
-#ifdef _KERNEL
- if (ba->fp->f_type == DTYPE_VNODE)
- bwillwrite();
- ba->err = fo_write(ba->fp, &auio, ba->td->td_ucred, 0, ba->td);
-#else
- fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
- ba->err = EOPNOTSUPP;
-#endif
-
- return (ba->err);
-}
-
-static int
-dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
- uint64_t length)
-{
- /* write a FREE record */
- bzero(ba->drr, sizeof (dmu_replay_record_t));
- ba->drr->drr_type = DRR_FREE;
- ba->drr->drr_u.drr_free.drr_object = object;
- ba->drr->drr_u.drr_free.drr_offset = offset;
- ba->drr->drr_u.drr_free.drr_length = length;
-
- if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
- return (EINTR);
- return (0);
-}
-
-static int
-dump_data(struct backuparg *ba, dmu_object_type_t type,
- uint64_t object, uint64_t offset, int blksz, void *data)
-{
- /* write a DATA record */
- bzero(ba->drr, sizeof (dmu_replay_record_t));
- ba->drr->drr_type = DRR_WRITE;
- ba->drr->drr_u.drr_write.drr_object = object;
- ba->drr->drr_u.drr_write.drr_type = type;
- ba->drr->drr_u.drr_write.drr_offset = offset;
- ba->drr->drr_u.drr_write.drr_length = blksz;
-
- if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
- return (EINTR);
- if (dump_bytes(ba, data, blksz))
- return (EINTR);
- return (0);
-}
-
-static int
-dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
-{
- /* write a FREEOBJECTS record */
- bzero(ba->drr, sizeof (dmu_replay_record_t));
- ba->drr->drr_type = DRR_FREEOBJECTS;
- ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
- ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;
-
- if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
- return (EINTR);
- return (0);
-}
-
-static int
-dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
-{
- if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
- return (dump_freeobjects(ba, object, 1));
-
- /* write an OBJECT record */
- bzero(ba->drr, sizeof (dmu_replay_record_t));
- ba->drr->drr_type = DRR_OBJECT;
- ba->drr->drr_u.drr_object.drr_object = object;
- ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
- ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
- ba->drr->drr_u.drr_object.drr_blksz =
- dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
- ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
- ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
- ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;
-
- if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
- return (EINTR);
-
- if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
- return (EINTR);
-
- /* free anything past the end of the file */
- if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
- (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
- return (EINTR);
- if (ba->err)
- return (EINTR);
- return (0);
-}
-
-#define BP_SPAN(dnp, level) \
- (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
- (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
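For instance, with 128K data blocks (dn_datablkszsec = 256) and 16K indirect blocks (dn_indblkshift = 14), BP_SPAN(dnp, 0) is 128K, and each additional level multiplies the span by 2^(14 - 7) = 128, so a single level-1 block pointer covers 16M of object offset.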
-
-static int
-backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
-{
- struct backuparg *ba = arg;
- uint64_t object = bc->bc_bookmark.zb_object;
- int level = bc->bc_bookmark.zb_level;
- uint64_t blkid = bc->bc_bookmark.zb_blkid;
- blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
- dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
- void *data = bc->bc_data;
- int err = 0;
-
- if (SIGPENDING(curthread))
- return (EINTR);
-
- ASSERT(data || bp == NULL);
-
- if (bp == NULL && object == 0) {
- uint64_t span = BP_SPAN(bc->bc_dnode, level);
- uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
- err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
- } else if (bp == NULL) {
- uint64_t span = BP_SPAN(bc->bc_dnode, level);
- err = dump_free(ba, object, blkid * span, span);
- } else if (data && level == 0 && type == DMU_OT_DNODE) {
- dnode_phys_t *blk = data;
- int i;
- int blksz = BP_GET_LSIZE(bp);
-
- for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
- uint64_t dnobj =
- (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
- err = dump_dnode(ba, dnobj, blk+i);
- if (err)
- break;
- }
- } else if (level == 0 &&
- type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
- int blksz = BP_GET_LSIZE(bp);
- if (data == NULL) {
- uint32_t aflags = ARC_WAIT;
- arc_buf_t *abuf;
- zbookmark_t zb;
-
- zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object;
- zb.zb_object = object;
- zb.zb_level = level;
- zb.zb_blkid = blkid;
- (void) arc_read(NULL, spa, bp,
- dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
- &aflags, &zb);
-
- if (abuf) {
- err = dump_data(ba, type, object, blkid * blksz,
- blksz, abuf->b_data);
- (void) arc_buf_remove_ref(abuf, &abuf);
- }
- } else {
- err = dump_data(ba, type, object, blkid * blksz,
- blksz, data);
- }
- }
-
- ASSERT(err == 0 || err == EINTR);
- return (err);
-}
-
-int
-dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp)
-{
- dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
- dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
- dmu_replay_record_t *drr;
- struct backuparg ba;
- int err;
-
- /* tosnap must be a snapshot */
- if (ds->ds_phys->ds_next_snap_obj == 0)
- return (EINVAL);
-
- /* fromsnap must be an earlier snapshot from the same fs as tosnap */
- if (fromds && (ds->ds_dir != fromds->ds_dir ||
- fromds->ds_phys->ds_creation_txg >=
- ds->ds_phys->ds_creation_txg))
- return (EXDEV);
-
- drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
- drr->drr_type = DRR_BEGIN;
- drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
- drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION;
- drr->drr_u.drr_begin.drr_creation_time =
- ds->ds_phys->ds_creation_time;
- drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
- drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
- if (fromds)
- drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
- dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
-
- ba.drr = drr;
- ba.td = curthread;
- ba.fp = fp;
- ba.os = tosnap;
- ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
-
- if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
- kmem_free(drr, sizeof (dmu_replay_record_t));
- return (ba.err);
- }
-
- err = traverse_dsl_dataset(ds,
- fromds ? fromds->ds_phys->ds_creation_txg : 0,
- ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
- backup_cb, &ba);
-
- if (err) {
- if (err == EINTR && ba.err)
- err = ba.err;
- kmem_free(drr, sizeof (dmu_replay_record_t));
- return (err);
- }
-
- bzero(drr, sizeof (dmu_replay_record_t));
- drr->drr_type = DRR_END;
- drr->drr_u.drr_end.drr_checksum = ba.zc;
-
- if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
- kmem_free(drr, sizeof (dmu_replay_record_t));
- return (ba.err);
- }
-
- kmem_free(drr, sizeof (dmu_replay_record_t));
-
- return (0);
-}
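The resulting stream framing is therefore: one DRR_BEGIN record, then the OBJECT / FREEOBJECTS / WRITE / FREE records emitted by backup_cb as the traversal proceeds, and finally a DRR_END record carrying the running fletcher-4 checksum of everything that preceded it.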
-
-struct restorearg {
- int err;
- int byteswap;
- kthread_t *td;
- struct file *fp;
- char *buf;
- uint64_t voff;
- int buflen; /* number of valid bytes in buf */
- int bufoff; /* next offset to read */
- int bufsize; /* amount of memory allocated for buf */
- zio_cksum_t zc;
-};
-
-/* ARGSUSED */
-static int
-replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- struct drr_begin *drrb = arg2;
- const char *snapname;
- int err;
- uint64_t val;
-
- /* must already be a snapshot of this fs */
- if (ds->ds_phys->ds_prev_snap_obj == 0)
- return (ENODEV);
-
- /* most recent snapshot must match fromguid */
- if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid)
- return (ENODEV);
- /* must not have any changes since most recent snapshot */
- if (ds->ds_phys->ds_bp.blk_birth >
- ds->ds_prev->ds_phys->ds_creation_txg)
- return (ETXTBSY);
-
- /* new snapshot name must not exist */
- snapname = strrchr(drrb->drr_toname, '@');
- if (snapname == NULL)
- return (EEXIST);
-
- snapname++;
- err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val);
- if (err == 0)
- return (EEXIST);
- if (err != ENOENT)
- return (err);
-
- return (0);
-}
-
-/* ARGSUSED */
-static void
-replay_incremental_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
-}
-
-/* ARGSUSED */
-static int
-replay_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- struct drr_begin *drrb = arg2;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- char *cp;
- uint64_t val;
- int err;
-
- cp = strchr(drrb->drr_toname, '@');
- *cp = '\0';
- err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
- strrchr(drrb->drr_toname, '/') + 1,
- sizeof (uint64_t), 1, &val);
- *cp = '@';
-
- if (err != ENOENT)
- return (err ? err : EEXIST);
-
- return (0);
-}
-
-static void
-replay_full_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- struct drr_begin *drrb = arg2;
- char *cp;
- dsl_dataset_t *ds;
- uint64_t dsobj;
-
- cp = strchr(drrb->drr_toname, '@');
- *cp = '\0';
- dsobj = dsl_dataset_create_sync(dd, strrchr(drrb->drr_toname, '/') + 1,
- NULL, tx);
- *cp = '@';
-
- VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
- DS_MODE_EXCLUSIVE, FTAG, &ds));
-
- (void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
- ds, &ds->ds_phys->ds_bp, drrb->drr_type, tx);
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
-
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
-}
-
-static int
-replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- objset_t *os = arg1;
- struct drr_begin *drrb = arg2;
- char *snapname;
-
- /* XXX verify that drr_toname is in dd */
-
- snapname = strchr(drrb->drr_toname, '@');
- if (snapname == NULL)
- return (EINVAL);
- snapname++;
-
- return (dsl_dataset_snapshot_check(os, snapname, tx));
-}
-
-static void
-replay_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- objset_t *os = arg1;
- struct drr_begin *drrb = arg2;
- char *snapname;
- dsl_dataset_t *ds, *hds;
-
- snapname = strchr(drrb->drr_toname, '@') + 1;
-
- dsl_dataset_snapshot_sync(os, snapname, tx);
-
- /* set snapshot's creation time and guid */
- hds = os->os->os_dsl_dataset;
- VERIFY(0 == dsl_dataset_open_obj(hds->ds_dir->dd_pool,
- hds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
- FTAG, &ds));
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_creation_time = drrb->drr_creation_time;
- ds->ds_phys->ds_guid = drrb->drr_toguid;
- ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
-
- dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG);
-
- dmu_buf_will_dirty(hds->ds_dbuf, tx);
- hds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
-}
-
-static int
-restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, int *resid)
-{
- struct uio auio;
- struct iovec aiov;
- int error;
-
- aiov.iov_base = buf;
- aiov.iov_len = len;
- auio.uio_iov = &aiov;
- auio.uio_iovcnt = 1;
- auio.uio_resid = len;
- auio.uio_segflg = UIO_SYSSPACE;
- auio.uio_rw = UIO_READ;
- auio.uio_offset = off;
- auio.uio_td = ra->td;
-#ifdef _KERNEL
- error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td);
-#else
- fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
- error = EOPNOTSUPP;
-#endif
- *resid = auio.uio_resid;
- return (error);
-}
-
-static void *
-restore_read(struct restorearg *ra, int len)
-{
- void *rv;
-
-	/* some records require 8-byte alignment, so all lengths must be multiples of 8 */
- ASSERT3U(len % 8, ==, 0);
-
- while (ra->buflen - ra->bufoff < len) {
- int resid;
- int leftover = ra->buflen - ra->bufoff;
-
- (void) memmove(ra->buf, ra->buf + ra->bufoff, leftover);
-
- ra->err = restore_bytes(ra, (caddr_t)ra->buf + leftover,
- ra->bufsize - leftover, ra->voff, &resid);
-
- ra->voff += ra->bufsize - leftover - resid;
- ra->buflen = ra->bufsize - resid;
- ra->bufoff = 0;
- if (resid == ra->bufsize - leftover)
- ra->err = EINVAL;
- if (ra->err)
- return (NULL);
- /* Could compute checksum here? */
- }
-
- ASSERT3U(ra->bufoff % 8, ==, 0);
- ASSERT3U(ra->buflen - ra->bufoff, >=, len);
- rv = ra->buf + ra->bufoff;
- ra->bufoff += len;
- if (ra->byteswap)
- fletcher_4_incremental_byteswap(rv, len, &ra->zc);
- else
- fletcher_4_incremental_native(rv, len, &ra->zc);
- return (rv);
-}
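
restore_read() folds every byte it hands out into the running fletcher-4 checksum in ra->zc, and that accumulated value is what a DRR_END record is later verified against. A minimal sketch of the accumulation (stream_checksum is a hypothetical helper; it assumes the fletcher-4 routines from fletcher.c in this same directory, and the record pointers and lengths are placeholders):

	static void
	stream_checksum(void *rec1, uint64_t len1, void *rec2, uint64_t len2,
	    zio_cksum_t *zcp)
	{
		bzero(zcp, sizeof (*zcp));
		/* Fold in each record in stream order, as restore_read() does. */
		fletcher_4_incremental_native(rec1, len1, zcp);
		fletcher_4_incremental_native(rec2, len2, zcp);
		/* *zcp now covers everything preceding a DRR_END record. */
	}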
-
-static void
-backup_byteswap(dmu_replay_record_t *drr)
-{
-#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
-#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
- drr->drr_type = BSWAP_32(drr->drr_type);
- switch (drr->drr_type) {
- case DRR_BEGIN:
- DO64(drr_begin.drr_magic);
- DO64(drr_begin.drr_version);
- DO64(drr_begin.drr_creation_time);
- DO32(drr_begin.drr_type);
- DO64(drr_begin.drr_toguid);
- DO64(drr_begin.drr_fromguid);
- break;
- case DRR_OBJECT:
- DO64(drr_object.drr_object);
- /* DO64(drr_object.drr_allocation_txg); */
- DO32(drr_object.drr_type);
- DO32(drr_object.drr_bonustype);
- DO32(drr_object.drr_blksz);
- DO32(drr_object.drr_bonuslen);
- break;
- case DRR_FREEOBJECTS:
- DO64(drr_freeobjects.drr_firstobj);
- DO64(drr_freeobjects.drr_numobjs);
- break;
- case DRR_WRITE:
- DO64(drr_write.drr_object);
- DO32(drr_write.drr_type);
- DO64(drr_write.drr_offset);
- DO64(drr_write.drr_length);
- break;
- case DRR_FREE:
- DO64(drr_free.drr_object);
- DO64(drr_free.drr_offset);
- DO64(drr_free.drr_length);
- break;
- case DRR_END:
- DO64(drr_end.drr_checksum.zc_word[0]);
- DO64(drr_end.drr_checksum.zc_word[1]);
- DO64(drr_end.drr_checksum.zc_word[2]);
- DO64(drr_end.drr_checksum.zc_word[3]);
- break;
- }
-#undef DO64
-#undef DO32
-}
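
Whether backup_byteswap() is needed at all is decided once per stream, from the magic number in the BEGIN record; dmu_recvbackup() below does exactly this when it sets ra.byteswap. A sketch of that decision as a hypothetical helper:

	static boolean_t
	stream_needs_byteswap(uint64_t drr_magic)
	{
		if (drr_magic == DMU_BACKUP_MAGIC)
			return (B_FALSE);	/* sender had our byte order */
		ASSERT3U(drr_magic, ==, BSWAP_64(DMU_BACKUP_MAGIC));
		return (B_TRUE);		/* every record must be swapped */
	}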
-
-static int
-restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
-{
- int err;
- dmu_tx_t *tx;
-
- err = dmu_object_info(os, drro->drr_object, NULL);
-
- if (err != 0 && err != ENOENT)
- return (EINVAL);
-
- if (drro->drr_type == DMU_OT_NONE ||
- drro->drr_type >= DMU_OT_NUMTYPES ||
- drro->drr_bonustype >= DMU_OT_NUMTYPES ||
- drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
- drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
- P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
- drro->drr_blksz < SPA_MINBLOCKSIZE ||
- drro->drr_blksz > SPA_MAXBLOCKSIZE ||
- drro->drr_bonuslen > DN_MAX_BONUSLEN) {
- return (EINVAL);
- }
-
- tx = dmu_tx_create(os);
-
- if (err == ENOENT) {
- /* currently free, want to be allocated */
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
- err = dmu_object_claim(os, drro->drr_object,
- drro->drr_type, drro->drr_blksz,
- drro->drr_bonustype, drro->drr_bonuslen, tx);
- } else {
- /* currently allocated, want to be allocated */
- dmu_tx_hold_bonus(tx, drro->drr_object);
- /*
- * We may change blocksize, so need to
- * hold_write
- */
- dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
-
- err = dmu_object_reclaim(os, drro->drr_object,
- drro->drr_type, drro->drr_blksz,
- drro->drr_bonustype, drro->drr_bonuslen, tx);
- }
- if (err) {
- dmu_tx_commit(tx);
- return (EINVAL);
- }
-
- dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
- dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
-
- if (drro->drr_bonuslen) {
- dmu_buf_t *db;
- void *data;
- VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
- dmu_buf_will_dirty(db, tx);
-
- ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
- data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
- if (data == NULL) {
- dmu_tx_commit(tx);
- return (ra->err);
- }
- bcopy(data, db->db_data, db->db_size);
- if (ra->byteswap) {
- dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
- drro->drr_bonuslen);
- }
- dmu_buf_rele(db, FTAG);
- }
- dmu_tx_commit(tx);
- return (0);
-}
-
-/* ARGSUSED */
-static int
-restore_freeobjects(struct restorearg *ra, objset_t *os,
- struct drr_freeobjects *drrfo)
-{
- uint64_t obj;
-
- if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
- return (EINVAL);
-
- for (obj = drrfo->drr_firstobj;
- obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
- (void) dmu_object_next(os, &obj, FALSE, 0)) {
- dmu_tx_t *tx;
- int err;
-
- if (dmu_object_info(os, obj, NULL) != 0)
- continue;
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, obj);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
- err = dmu_object_free(os, obj, tx);
- dmu_tx_commit(tx);
- if (err && err != ENOENT)
- return (EINVAL);
- }
- return (0);
-}
-
-static int
-restore_write(struct restorearg *ra, objset_t *os,
- struct drr_write *drrw)
-{
- dmu_tx_t *tx;
- void *data;
- int err;
-
- if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
- drrw->drr_type >= DMU_OT_NUMTYPES)
- return (EINVAL);
-
- data = restore_read(ra, drrw->drr_length);
- if (data == NULL)
- return (ra->err);
-
- if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
- return (EINVAL);
-
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_write(tx, drrw->drr_object,
- drrw->drr_offset, drrw->drr_length);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
- if (ra->byteswap)
- dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
- dmu_write(os, drrw->drr_object,
- drrw->drr_offset, drrw->drr_length, data, tx);
- dmu_tx_commit(tx);
- return (0);
-}
-
-/* ARGSUSED */
-static int
-restore_free(struct restorearg *ra, objset_t *os,
- struct drr_free *drrf)
-{
- dmu_tx_t *tx;
- int err;
-
- if (drrf->drr_length != -1ULL &&
- drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
- return (EINVAL);
-
- if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
- return (EINVAL);
-
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_free(tx, drrf->drr_object,
- drrf->drr_offset, drrf->drr_length);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
- err = dmu_free_range(os, drrf->drr_object,
- drrf->drr_offset, drrf->drr_length, tx);
- dmu_tx_commit(tx);
- return (err);
-}
-
-int
-dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
- boolean_t force, struct file *fp, uint64_t voffset)
-{
- kthread_t *td = curthread;
- struct restorearg ra;
- dmu_replay_record_t *drr;
- char *cp;
- objset_t *os = NULL;
- zio_cksum_t pzc;
-
- bzero(&ra, sizeof (ra));
- ra.td = td;
- ra.fp = fp;
- ra.voff = voffset;
- ra.bufsize = 1<<20;
- ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
-
- if (drrb->drr_magic == DMU_BACKUP_MAGIC) {
- ra.byteswap = FALSE;
- } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
- ra.byteswap = TRUE;
- } else {
- ra.err = EINVAL;
- goto out;
- }
-
- /*
- * NB: this assumes that struct drr_begin will be the largest in
- * dmu_replay_record_t's drr_u, and thus we don't need to pad it
- * with zeros to make it the same length as we wrote out.
- */
- ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN;
- ((dmu_replay_record_t *)ra.buf)->drr_pad = 0;
- ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb;
- if (ra.byteswap) {
- fletcher_4_incremental_byteswap(ra.buf,
- sizeof (dmu_replay_record_t), &ra.zc);
- } else {
- fletcher_4_incremental_native(ra.buf,
- sizeof (dmu_replay_record_t), &ra.zc);
- }
- (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */
-
- if (ra.byteswap) {
- drrb->drr_magic = BSWAP_64(drrb->drr_magic);
- drrb->drr_version = BSWAP_64(drrb->drr_version);
- drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
- drrb->drr_type = BSWAP_32(drrb->drr_type);
- drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
- drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
- }
-
- ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
-
- if (drrb->drr_version != DMU_BACKUP_VERSION ||
- drrb->drr_type >= DMU_OST_NUMTYPES ||
- strchr(drrb->drr_toname, '@') == NULL) {
- ra.err = EINVAL;
- goto out;
- }
-
- /*
- * Process the begin in syncing context.
- */
- if (drrb->drr_fromguid) {
- /* incremental backup */
- dsl_dataset_t *ds = NULL;
-
- cp = strchr(tosnap, '@');
- *cp = '\0';
- ra.err = dsl_dataset_open(tosnap, DS_MODE_EXCLUSIVE, FTAG, &ds);
- *cp = '@';
- if (ra.err)
- goto out;
-
- /*
- * Only do the rollback if the most recent snapshot
- * matches the incremental source
- */
- if (force) {
- if (ds->ds_prev == NULL ||
- ds->ds_prev->ds_phys->ds_guid !=
- drrb->drr_fromguid) {
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- kmem_free(ra.buf, ra.bufsize);
- return (ENODEV);
- }
- (void) dsl_dataset_rollback(ds);
- }
- ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- replay_incremental_check, replay_incremental_sync,
- ds, drrb, 1);
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- } else {
- /* full backup */
- dsl_dir_t *dd = NULL;
- const char *tail;
-
- /* can't restore full backup into topmost fs, for now */
- if (strrchr(drrb->drr_toname, '/') == NULL) {
- ra.err = EINVAL;
- goto out;
- }
-
- cp = strchr(tosnap, '@');
- *cp = '\0';
- ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail);
- *cp = '@';
- if (ra.err)
- goto out;
- if (tail == NULL) {
- ra.err = EEXIST;
- goto out;
- }
-
- ra.err = dsl_sync_task_do(dd->dd_pool, replay_full_check,
- replay_full_sync, dd, drrb, 5);
- dsl_dir_close(dd, FTAG);
- }
- if (ra.err)
- goto out;
-
- /*
- * Open the objset we are modifying.
- */
-
- cp = strchr(tosnap, '@');
- *cp = '\0';
- ra.err = dmu_objset_open(tosnap, DMU_OST_ANY,
- DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os);
- *cp = '@';
- ASSERT3U(ra.err, ==, 0);
-
- /*
- * Read records and process them.
- */
- pzc = ra.zc;
- while (ra.err == 0 &&
- NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
- if (SIGPENDING(td)) {
- ra.err = EINTR;
- goto out;
- }
-
- if (ra.byteswap)
- backup_byteswap(drr);
-
- switch (drr->drr_type) {
- case DRR_OBJECT:
- {
- /*
- * We need to make a copy of the record header,
- * because restore_{object,write} may need to
- * restore_read(), which will invalidate drr.
- */
- struct drr_object drro = drr->drr_u.drr_object;
- ra.err = restore_object(&ra, os, &drro);
- break;
- }
- case DRR_FREEOBJECTS:
- {
- struct drr_freeobjects drrfo =
- drr->drr_u.drr_freeobjects;
- ra.err = restore_freeobjects(&ra, os, &drrfo);
- break;
- }
- case DRR_WRITE:
- {
- struct drr_write drrw = drr->drr_u.drr_write;
- ra.err = restore_write(&ra, os, &drrw);
- break;
- }
- case DRR_FREE:
- {
- struct drr_free drrf = drr->drr_u.drr_free;
- ra.err = restore_free(&ra, os, &drrf);
- break;
- }
- case DRR_END:
- {
- struct drr_end drre = drr->drr_u.drr_end;
- /*
- * We compare against the *previous* checksum
- * value, because the stored checksum is of
- * everything before the DRR_END record.
- */
- if (drre.drr_checksum.zc_word[0] != 0 &&
- !ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pzc)) {
- ra.err = ECKSUM;
- goto out;
- }
-
- ra.err = dsl_sync_task_do(dmu_objset_ds(os)->
- ds_dir->dd_pool, replay_end_check, replay_end_sync,
- os, drrb, 3);
- goto out;
- }
- default:
- ra.err = EINVAL;
- goto out;
- }
- pzc = ra.zc;
- }
-
-out:
- if (os)
- dmu_objset_close(os);
-
- /*
- * Make sure we don't rollback/destroy unless we actually
- * processed the begin properly. 'os' will only be set if this
- * is the case.
- */
- if (ra.err && os && tosnap && strchr(tosnap, '@')) {
- /*
- * rollback or destroy what we created, so we don't
- * leave it in the restoring state.
- */
- dsl_dataset_t *ds;
- int err;
-
- cp = strchr(tosnap, '@');
- *cp = '\0';
- err = dsl_dataset_open(tosnap,
- DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT,
- FTAG, &ds);
- if (err == 0) {
- txg_wait_synced(ds->ds_dir->dd_pool, 0);
- if (drrb->drr_fromguid) {
- /* incremental: rollback to most recent snap */
- (void) dsl_dataset_rollback(ds);
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- } else {
- /* full: destroy whole fs */
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- (void) dsl_dataset_destroy(tosnap);
- }
- }
- *cp = '@';
- }
-
- kmem_free(ra.buf, ra.bufsize);
- if (sizep)
- *sizep = ra.voff;
- return (ra.err);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
deleted file mode 100644
index 3d2bc3e..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
+++ /dev/null
@@ -1,888 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_pool.h>
-#include <sys/dnode.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu_impl.h>
-
-#define BP_SPAN_SHIFT(level, width) ((level) * (width))
-
-#define BP_EQUAL(b1, b2) \
- (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \
- (b1)->blk_birth == (b2)->blk_birth)
-
-/*
- * Compare two bookmarks.
- *
- * For ADVANCE_PRE, the visitation order is:
- *
- * objset 0, 1, 2, ..., ZB_MAXOBJSET.
- * object 0, 1, 2, ..., ZB_MAXOBJECT.
- * blkoff 0, 1, 2, ...
- * level ZB_MAXLEVEL, ..., 2, 1, 0.
- *
- * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
- * ordering vector is:
- *
- * < objset, object, blkoff, -level >
- *
- * For ADVANCE_POST, the starting offsets aren't sequential but ending
- * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
- * The visitation order is:
- *
- * objset 1, 2, ..., ZB_MAXOBJSET, 0.
- * object 1, 2, ..., ZB_MAXOBJECT, 0.
- * blkoff 1, 2, ...
- * level 0, 1, 2, ..., ZB_MAXLEVEL.
- *
- * and thus a valid ordering vector is:
- *
- * < objset - 1, object - 1, blkoff, level >
- *
- * Both orderings can be expressed as:
- *
- * < objset + bias, object + bias, blkoff, level ^ bias >
- *
- * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
- * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
- *
- * Special case: an objset's osphys is represented as level -1 of object 0.
- * It is always either the very first or very last block we visit in an objset.
- * Therefore, if either bookmark's level is -1, level alone determines order.
- */
-static int
-compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
- int advance)
-{
- int bias = (advance & ADVANCE_PRE) ? 0 : -1;
- uint64_t sblkoff, eblkoff;
- int slevel, elevel, wshift;
-
- if (szb->zb_objset + bias < ezb->zb_objset + bias)
- return (-1);
-
- if (szb->zb_objset + bias > ezb->zb_objset + bias)
- return (1);
-
- slevel = szb->zb_level;
- elevel = ezb->zb_level;
-
- if ((slevel | elevel) < 0)
- return ((slevel ^ bias) - (elevel ^ bias));
-
- if (szb->zb_object + bias < ezb->zb_object + bias)
- return (-1);
-
- if (szb->zb_object + bias > ezb->zb_object + bias)
- return (1);
-
- if (dnp == NULL)
- return (0);
-
- wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
- eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
-
- if (sblkoff < eblkoff)
- return (-1);
-
- if (sblkoff > eblkoff)
- return (1);
-
- return ((elevel ^ bias) - (slevel ^ bias));
-}
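
A concrete instance of the ordering (a standalone userland sketch; wshift = 7, i.e. 128 block pointers per indirect block, is chosen purely for illustration): under ADVANCE_PRE a level-2 bookmark at blkid 1 covers blkoff 1 << 14 = 16384, the same blkoff as a level-0 bookmark at blkid 16384, and the tie is broken by visiting the higher level first.

	#include <stdio.h>
	#include <stdint.h>

	#define	SPAN_SHIFT(level, width)	((level) * (width))

	int
	main(void)
	{
		int wshift = 7;
		uint64_t l2 = (uint64_t)1 << SPAN_SHIFT(2, wshift);

		/* Equal blkoff, so -level decides: level 2 precedes level 0. */
		printf("level-2 blkid 1 -> blkoff %llu; level-0 blkid 16384 "
		    "-> blkoff %llu\n", (unsigned long long)l2,
		    (unsigned long long)16384ULL);
		return (0);
	}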
-
-#define SET_BOOKMARK(zb, objset, object, level, blkid) \
-{ \
- (zb)->zb_objset = objset; \
- (zb)->zb_object = object; \
- (zb)->zb_level = level; \
- (zb)->zb_blkid = blkid; \
-}
-
-#define SET_BOOKMARK_LB(zb, level, blkid) \
-{ \
- (zb)->zb_level = level; \
- (zb)->zb_blkid = blkid; \
-}
-
-static int
-advance_objset(zseg_t *zseg, uint64_t objset, int advance)
-{
- zbookmark_t *zb = &zseg->seg_start;
-
- if (advance & ADVANCE_PRE) {
- if (objset >= ZB_MAXOBJSET)
- return (ERANGE);
- SET_BOOKMARK(zb, objset, 0, -1, 0);
- } else {
- if (objset >= ZB_MAXOBJSET)
- objset = 0;
- SET_BOOKMARK(zb, objset, 1, 0, 0);
- }
-
- if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
- return (ERANGE);
-
- return (EAGAIN);
-}
-
-static int
-advance_object(zseg_t *zseg, uint64_t object, int advance)
-{
- zbookmark_t *zb = &zseg->seg_start;
-
- if (advance & ADVANCE_PRE) {
- if (object >= ZB_MAXOBJECT) {
- SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
- } else {
- SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
- }
- } else {
- if (zb->zb_object == 0) {
- SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
- } else {
- if (object >= ZB_MAXOBJECT)
- object = 0;
- SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
- }
- }
-
- if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
- return (ERANGE);
-
- return (EAGAIN);
-}
-
-static int
-advance_from_osphys(zseg_t *zseg, int advance)
-{
- zbookmark_t *zb = &zseg->seg_start;
-
- ASSERT(zb->zb_object == 0);
- ASSERT(zb->zb_level == -1);
- ASSERT(zb->zb_blkid == 0);
-
- if (advance & ADVANCE_PRE) {
- SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
- } else {
- if (zb->zb_objset == 0)
- return (ERANGE);
- SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
- }
-
- if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
- return (ERANGE);
-
- return (EAGAIN);
-}
-
-static int
-advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
-{
- zbookmark_t *zb = &zseg->seg_start;
- int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
- int maxlevel = dnp->dn_nlevels - 1;
- int level = zb->zb_level;
- uint64_t blkid = zb->zb_blkid;
-
- if (advance & ADVANCE_PRE) {
- if (level > 0 && rc == 0) {
- level--;
- blkid <<= wshift;
- } else {
- blkid++;
-
- if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
- dnp->dn_maxblkid)
- return (ERANGE);
-
- while (level < maxlevel) {
- if (P2PHASE(blkid, 1ULL << wshift))
- break;
- blkid >>= wshift;
- level++;
- }
- }
- } else {
- if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
- blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
- level = 0;
- } else {
- blkid >>= wshift;
- level++;
- }
-
- while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
- dnp->dn_maxblkid) {
- if (level == maxlevel)
- return (ERANGE);
- blkid >>= wshift;
- level++;
- }
- }
- SET_BOOKMARK_LB(zb, level, blkid);
-
- if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
- return (ERANGE);
-
- return (EAGAIN);
-}
-
-static int
-traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
-{
- /*
- * Before we issue the callback, prune against maxtxg.
- *
- * We prune against mintxg before we get here because it's a big win.
- * If a given block was born in txg 37, then we know that the entire
- * subtree below that block must have been born in txg 37 or earlier.
- * We can therefore lop off huge branches of the tree as we go.
- *
- * There's no corresponding optimization for maxtxg because knowing
- * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
- * children. In fact, the copy-on-write design of ZFS ensures that
- * top-level blocks will pretty much always be new.
- *
- * Therefore, in the name of simplicity we don't prune against
- * maxtxg until the last possible moment -- that being right now.
- */
- if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
- return (0);
-
- /*
- * Debugging: verify that the order we visit things agrees with the
- * order defined by compare_bookmark(). We don't check this for
- * log blocks because there's no defined ordering for them; they're
- * always visited (or not) as part of visiting the objset_phys_t.
- */
- if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
- zbookmark_t *zb = &bc->bc_bookmark;
- zbookmark_t *szb = &zseg->seg_start;
- zbookmark_t *ezb = &zseg->seg_end;
- zbookmark_t *lzb = &th->th_lastcb;
- dnode_phys_t *dnp = bc->bc_dnode;
-
- ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
- ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
- ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
- lzb->zb_level == ZB_NO_LEVEL);
- *lzb = *zb;
- }
-
- th->th_callbacks++;
- return (th->th_func(bc, th->th_spa, th->th_arg));
-}
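
The asymmetry the comment describes reduces to two tests (want_block is a hypothetical helper, assuming the blkptr_t from the surrounding headers):

	static int
	want_block(const blkptr_t *bp, uint64_t mintxg, uint64_t maxtxg)
	{
		/*
		 * Copy-on-write guarantees nothing below bp is newer than
		 * bp itself, so a birth at or before mintxg prunes the
		 * whole subtree (find_block() applies this on the way down).
		 */
		if (bp->blk_birth <= mintxg)
			return (0);
		/*
		 * No such guarantee for maxtxg: old blocks can sit below
		 * new ones, so this test skips only the block itself.
		 */
		if (bp->blk_birth >= maxtxg)
			return (0);
		return (1);
	}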
-
-static int
-traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
- dnode_phys_t *dnp)
-{
- zbookmark_t *zb = &bc->bc_bookmark;
- int error;
-
- th->th_hits++;
-
- bc->bc_dnode = dnp;
- bc->bc_errno = 0;
-
- if (BP_EQUAL(&bc->bc_blkptr, bp))
- return (0);
-
- bc->bc_blkptr = *bp;
-
- if (bc->bc_data == NULL)
- return (0);
-
- if (BP_IS_HOLE(bp)) {
- ASSERT(th->th_advance & ADVANCE_HOLES);
- return (0);
- }
-
- if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
- error = EIO;
- } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
- error = 0;
- th->th_arc_hits++;
- } else {
- error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
- BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
- th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));
-
- if (BP_SHOULD_BYTESWAP(bp) && error == 0)
- (zb->zb_level > 0 ? byteswap_uint64_array :
- dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
- BP_GET_LSIZE(bp));
- th->th_reads++;
- }
-
- if (error) {
- bc->bc_errno = error;
- error = traverse_callback(th, NULL, bc);
- ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
- bc->bc_blkptr.blk_birth = -1ULL;
- }
-
- dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
- bc - &th->th_cache[0][0], error,
- zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
-
- return (error);
-}
-
-static int
-find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
-{
- zbookmark_t *zb = &zseg->seg_start;
- traverse_blk_cache_t *bc;
- blkptr_t *bp = dnp->dn_blkptr;
- int i, first, level;
- int nbp = dnp->dn_nblkptr;
- int minlevel = zb->zb_level;
- int maxlevel = dnp->dn_nlevels - 1;
- int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
- int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
- uint64_t blkid = zb->zb_blkid >> bp_shift;
- int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
- int rc;
-
- if (minlevel > maxlevel || blkid >= nbp)
- return (ERANGE);
-
- for (level = maxlevel; level >= minlevel; level--) {
- first = P2PHASE(blkid, 1ULL << wshift);
-
- for (i = first; i < nbp; i++)
- if (bp[i].blk_birth > zseg->seg_mintxg ||
- (BP_IS_HOLE(&bp[i]) && do_holes))
- break;
-
- if (i != first) {
- i--;
- SET_BOOKMARK_LB(zb, level, blkid + (i - first));
- return (ENOTBLK);
- }
-
- bc = &th->th_cache[depth][level];
-
- SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
- level, blkid);
-
- if (rc = traverse_read(th, bc, bp + i, dnp)) {
- if (rc != EAGAIN) {
- SET_BOOKMARK_LB(zb, level, blkid);
- }
- return (rc);
- }
-
- if (BP_IS_HOLE(&bp[i])) {
- SET_BOOKMARK_LB(zb, level, blkid);
- th->th_lastcb.zb_level = ZB_NO_LEVEL;
- return (0);
- }
-
- nbp = 1 << wshift;
- bp = bc->bc_data;
- bp_shift -= wshift;
- blkid = zb->zb_blkid >> bp_shift;
- }
-
- return (0);
-}
-
-static int
-get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
- uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
-{
- zseg_t zseg;
- zbookmark_t *zb = &zseg.seg_start;
- uint64_t object = *objectp;
- int i, rc;
-
- SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
- SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
-
- zseg.seg_mintxg = txg;
- zseg.seg_maxtxg = -1ULL;
-
- for (;;) {
- rc = find_block(th, &zseg, mdn, depth);
-
- if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
- break;
-
- if (rc == 0 && zb->zb_level == 0) {
- dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
- for (i = 0; i < DNODES_PER_BLOCK; i++) {
- object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
- if (object >= *objectp &&
- dnp[i].dn_type != DMU_OT_NONE &&
- (type == -1 || dnp[i].dn_type == type)) {
- *objectp = object;
- *dnpp = &dnp[i];
- return (0);
- }
- }
- }
-
- rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
-
- if (rc == ERANGE)
- break;
- }
-
- if (rc == ERANGE)
- *objectp = ZB_MAXOBJECT;
-
- return (rc);
-}
-
-/* ARGSUSED */
-static void
-traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
-{
- traverse_handle_t *th = arg;
- traverse_blk_cache_t *bc = &th->th_zil_cache;
- zbookmark_t *zb = &bc->bc_bookmark;
- zseg_t *zseg = list_head(&th->th_seglist);
-
- if (bp->blk_birth <= zseg->seg_mintxg)
- return;
-
- if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
- zb->zb_object = 0;
- zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
- bc->bc_blkptr = *bp;
- (void) traverse_callback(th, zseg, bc);
- }
-}
-
-/* ARGSUSED */
-static void
-traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
-{
- traverse_handle_t *th = arg;
- traverse_blk_cache_t *bc = &th->th_zil_cache;
- zbookmark_t *zb = &bc->bc_bookmark;
- zseg_t *zseg = list_head(&th->th_seglist);
-
- if (lrc->lrc_txtype == TX_WRITE) {
- lr_write_t *lr = (lr_write_t *)lrc;
- blkptr_t *bp = &lr->lr_blkptr;
-
- if (bp->blk_birth <= zseg->seg_mintxg)
- return;
-
- if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
- zb->zb_object = lr->lr_foid;
- zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
- bc->bc_blkptr = *bp;
- (void) traverse_callback(th, zseg, bc);
- }
- }
-}
-
-static void
-traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
-{
- spa_t *spa = th->th_spa;
- dsl_pool_t *dp = spa_get_dsl(spa);
- objset_phys_t *osphys = bc->bc_data;
- zil_header_t *zh = &osphys->os_zil_header;
- uint64_t claim_txg = zh->zh_claim_txg;
- zilog_t *zilog;
-
- ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
- ASSERT(bc->bc_bookmark.zb_level == -1);
-
- /*
- * We only want to visit blocks that have been claimed but not yet
- * replayed (or, in read-only mode, blocks that *would* be claimed).
- */
- if (claim_txg == 0 && (spa_mode & FWRITE))
- return;
-
- th->th_zil_cache.bc_bookmark = bc->bc_bookmark;
-
- zilog = zil_alloc(dp->dp_meta_objset, zh);
-
- (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
- claim_txg);
-
- zil_free(zilog);
-}
-
-static int
-traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
-{
- zbookmark_t *zb = &zseg->seg_start;
- traverse_blk_cache_t *bc;
- dnode_phys_t *dn, *dn_tmp;
- int worklimit = 100;
- int rc;
-
- dprintf("<%llu, %llu, %d, %llx>\n",
- zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
-
- bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
- dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
-
- SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
-
- rc = traverse_read(th, bc, mosbp, dn);
-
- if (rc) /* If we get ERESTART, we've got nowhere left to go */
- return (rc == ERESTART ? EINTR : rc);
-
- ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
-
- if (zb->zb_objset != 0) {
- uint64_t objset = zb->zb_objset;
- dsl_dataset_phys_t *dsp;
-
- rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
- DMU_OT_DSL_DATASET, ZB_MOS_CACHE);
-
- if (objset != zb->zb_objset)
- rc = advance_objset(zseg, objset, th->th_advance);
-
- if (rc != 0)
- return (rc);
-
- dsp = DN_BONUS(dn_tmp);
-
- bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
- dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
-
- SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
-
- /*
- * If we're traversing an open snapshot, we know that it
- * can't be deleted (because it's open) and it can't change
- * (because it's a snapshot). Therefore, once we've gotten
- * from the uberblock down to the snapshot's objset_phys_t,
- * we no longer need to synchronize with spa_sync(); we're
- * traversing a completely static block tree from here on.
- */
- if (th->th_advance & ADVANCE_NOLOCK) {
- ASSERT(th->th_locked);
- rw_exit(spa_traverse_rwlock(th->th_spa));
- th->th_locked = 0;
- }
-
- rc = traverse_read(th, bc, &dsp->ds_bp, dn);
-
- if (rc != 0) {
- if (rc == ERESTART)
- rc = advance_objset(zseg, zb->zb_objset + 1,
- th->th_advance);
- return (rc);
- }
-
- if (th->th_advance & ADVANCE_PRUNE)
- zseg->seg_mintxg =
- MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
- }
-
- if (zb->zb_level == -1) {
- ASSERT(zb->zb_object == 0);
- ASSERT(zb->zb_blkid == 0);
- ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);
-
- if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
- rc = traverse_callback(th, zseg, bc);
- if (rc) {
- ASSERT(rc == EINTR);
- return (rc);
- }
- if ((th->th_advance & ADVANCE_ZIL) &&
- zb->zb_objset != 0)
- traverse_zil(th, bc);
- }
-
- return (advance_from_osphys(zseg, th->th_advance));
- }
-
- if (zb->zb_object != 0) {
- uint64_t object = zb->zb_object;
-
- rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
- zseg->seg_mintxg, -1, ZB_MDN_CACHE);
-
- if (object != zb->zb_object)
- rc = advance_object(zseg, object, th->th_advance);
-
- if (rc != 0)
- return (rc);
-
- dn = dn_tmp;
- }
-
- if (zb->zb_level == ZB_MAXLEVEL)
- zb->zb_level = dn->dn_nlevels - 1;
-
- for (;;) {
- rc = find_block(th, zseg, dn, ZB_DN_CACHE);
-
- if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
- break;
-
- if (rc == 0) {
- bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
- ASSERT(bc->bc_dnode == dn);
- ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
- rc = traverse_callback(th, zseg, bc);
- if (rc) {
- ASSERT(rc == EINTR);
- return (rc);
- }
- if (BP_IS_HOLE(&bc->bc_blkptr)) {
- ASSERT(th->th_advance & ADVANCE_HOLES);
- rc = ENOTBLK;
- }
- }
-
- rc = advance_block(zseg, dn, rc, th->th_advance);
-
- if (rc == ERANGE)
- break;
-
- /*
- * Give spa_sync() a chance to run.
- */
- if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
- th->th_syncs++;
- return (EAGAIN);
- }
-
- if (--worklimit == 0)
- return (EAGAIN);
- }
-
- if (rc == ERANGE)
- rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
-
- return (rc);
-}
-
-/*
- * It is the caller's responsibility to ensure that the dsl_dataset_t
- * doesn't go away during traversal.
- */
-int
-traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
- blkptr_cb_t func, void *arg)
-{
- spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
- traverse_handle_t *th;
- int err;
-
- th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);
-
- traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
-
- while ((err = traverse_more(th)) == EAGAIN)
- continue;
-
- traverse_fini(th);
- return (err);
-}
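
traverse_dsl_dataset() is itself the canonical use of this API. A whole-pool walk follows the same pattern with traverse_add_pool(); count_cb and walk_pool below are hypothetical, and the callback signature matches the th_func invocation in traverse_callback() above:

	/* ARGSUSED */
	static int
	count_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
	{
		if (bc->bc_errno == 0)
			(*(uint64_t *)arg)++;	/* one more readable block */
		return (0);
	}

	static int
	walk_pool(spa_t *spa, uint64_t *nblocks)
	{
		traverse_handle_t *th;
		int err;

		*nblocks = 0;
		th = traverse_init(spa, count_cb, nblocks, ADVANCE_PRE,
		    ZIO_FLAG_CANFAIL);
		traverse_add_pool(th, 0, -1ULL);	/* all txgs */

		while ((err = traverse_more(th)) == EAGAIN)
			continue;

		traverse_fini(th);
		return (err);
	}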
-
-int
-traverse_more(traverse_handle_t *th)
-{
- zseg_t *zseg = list_head(&th->th_seglist);
- uint64_t save_txg; /* XXX won't be necessary with real itinerary */
- krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
- blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
- int rc;
-
- if (zseg == NULL)
- return (0);
-
- th->th_restarts++;
-
- save_txg = zseg->seg_mintxg;
-
- rw_enter(rw, RW_READER);
- th->th_locked = 1;
-
- rc = traverse_segment(th, zseg, mosbp);
- ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
-
- if (th->th_locked)
- rw_exit(rw);
- th->th_locked = 0;
-
- zseg->seg_mintxg = save_txg;
-
- if (rc == ERANGE) {
- list_remove(&th->th_seglist, zseg);
- kmem_free(zseg, sizeof (*zseg));
- return (EAGAIN);
- }
-
- return (rc);
-}
-
-/*
- * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
- * are not included. The blocks covered by this segment will all have
- * mintxg < birth < maxtxg.
- */
-static void
-traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
- uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
- uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
-{
- zseg_t *zseg;
-
- zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
-
- zseg->seg_mintxg = mintxg;
- zseg->seg_maxtxg = maxtxg;
-
- zseg->seg_start.zb_objset = sobjset;
- zseg->seg_start.zb_object = sobject;
- zseg->seg_start.zb_level = slevel;
- zseg->seg_start.zb_blkid = sblkid;
-
- zseg->seg_end.zb_objset = eobjset;
- zseg->seg_end.zb_object = eobject;
- zseg->seg_end.zb_level = elevel;
- zseg->seg_end.zb_blkid = eblkid;
-
- list_insert_tail(&th->th_seglist, zseg);
-}
-
-void
-traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
- uint64_t objset, uint64_t object)
-{
- if (th->th_advance & ADVANCE_PRE)
- traverse_add_segment(th, mintxg, maxtxg,
- objset, object, ZB_MAXLEVEL, 0,
- objset, object, 0, ZB_MAXBLKID);
- else
- traverse_add_segment(th, mintxg, maxtxg,
- objset, object, 0, 0,
- objset, object, 0, ZB_MAXBLKID);
-}
-
-void
-traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
- uint64_t objset)
-{
- if (th->th_advance & ADVANCE_PRE)
- traverse_add_segment(th, mintxg, maxtxg,
- objset, 0, -1, 0,
- objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
- else
- traverse_add_segment(th, mintxg, maxtxg,
- objset, 1, 0, 0,
- objset, 0, -1, 0);
-}
-
-void
-traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
-{
- if (th->th_advance & ADVANCE_PRE)
- traverse_add_segment(th, mintxg, maxtxg,
- 0, 0, -1, 0,
- ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
- else
- traverse_add_segment(th, mintxg, maxtxg,
- 1, 1, 0, 0,
- 0, 0, -1, 0);
-}
-
-traverse_handle_t *
-traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
- int zio_flags)
-{
- traverse_handle_t *th;
- int d, l;
-
- th = kmem_zalloc(sizeof (*th), KM_SLEEP);
-
- th->th_spa = spa;
- th->th_func = func;
- th->th_arg = arg;
- th->th_advance = advance;
- th->th_lastcb.zb_level = ZB_NO_LEVEL;
- th->th_noread.zb_level = ZB_NO_LEVEL;
- th->th_zio_flags = zio_flags;
-
- list_create(&th->th_seglist, sizeof (zseg_t),
- offsetof(zseg_t, seg_node));
-
- for (d = 0; d < ZB_DEPTH; d++) {
- for (l = 0; l < ZB_MAXLEVEL; l++) {
- if ((advance & ADVANCE_DATA) ||
- l != 0 || d != ZB_DN_CACHE)
- th->th_cache[d][l].bc_data =
- zio_buf_alloc(SPA_MAXBLOCKSIZE);
- }
- }
-
- return (th);
-}
-
-void
-traverse_fini(traverse_handle_t *th)
-{
- int d, l;
- zseg_t *zseg;
-
- for (d = 0; d < ZB_DEPTH; d++)
- for (l = 0; l < ZB_MAXLEVEL; l++)
- if (th->th_cache[d][l].bc_data != NULL)
- zio_buf_free(th->th_cache[d][l].bc_data,
- SPA_MAXBLOCKSIZE);
-
- while ((zseg = list_head(&th->th_seglist)) != NULL) {
- list_remove(&th->th_seglist, zseg);
- kmem_free(zseg, sizeof (*zseg));
- }
-
- list_destroy(&th->th_seglist);
-
- dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
- th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
- th->th_syncs, th->th_restarts);
-
- kmem_free(th, sizeof (*th));
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
deleted file mode 100644
index 13fd8d4..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
+++ /dev/null
@@ -1,992 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dbuf.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
-#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
-#include <sys/dsl_pool.h>
-#include <sys/zap_impl.h> /* for fzap_default_block_shift */
-#include <sys/spa.h>
-#include <sys/zfs_context.h>
-
-typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
- uint64_t arg1, uint64_t arg2);
-
-
-dmu_tx_t *
-dmu_tx_create_dd(dsl_dir_t *dd)
-{
- dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
- tx->tx_dir = dd;
- if (dd)
- tx->tx_pool = dd->dd_pool;
- list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
- offsetof(dmu_tx_hold_t, txh_node));
-#ifdef ZFS_DEBUG
- refcount_create(&tx->tx_space_written);
- refcount_create(&tx->tx_space_freed);
-#endif
- return (tx);
-}
-
-dmu_tx_t *
-dmu_tx_create(objset_t *os)
-{
- dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir);
- tx->tx_objset = os;
- tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
- return (tx);
-}
-
-dmu_tx_t *
-dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
-{
- dmu_tx_t *tx = dmu_tx_create_dd(NULL);
-
- ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
- tx->tx_pool = dp;
- tx->tx_txg = txg;
- tx->tx_anyobj = TRUE;
-
- return (tx);
-}
-
-int
-dmu_tx_is_syncing(dmu_tx_t *tx)
-{
- return (tx->tx_anyobj);
-}
-
-int
-dmu_tx_private_ok(dmu_tx_t *tx)
-{
- return (tx->tx_anyobj);
-}
-
-static dmu_tx_hold_t *
-dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
- enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
-{
- dmu_tx_hold_t *txh;
- dnode_t *dn = NULL;
- int err;
-
- if (object != DMU_NEW_OBJECT) {
- err = dnode_hold(os->os, object, tx, &dn);
- if (err) {
- tx->tx_err = err;
- return (NULL);
- }
-
- if (err == 0 && tx->tx_txg != 0) {
- mutex_enter(&dn->dn_mtx);
- /*
- * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
- * problem, but there's no way for it to happen (for
- * now, at least).
- */
- ASSERT(dn->dn_assigned_txg == 0);
- dn->dn_assigned_txg = tx->tx_txg;
- (void) refcount_add(&dn->dn_tx_holds, tx);
- mutex_exit(&dn->dn_mtx);
- }
- }
-
- txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
- txh->txh_tx = tx;
- txh->txh_dnode = dn;
-#ifdef ZFS_DEBUG
- txh->txh_type = type;
- txh->txh_arg1 = arg1;
- txh->txh_arg2 = arg2;
-#endif
- list_insert_tail(&tx->tx_holds, txh);
-
- return (txh);
-}
-
-void
-dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
-{
- /*
- * If we're syncing, they can manipulate any object anyhow, and
- * the hold on the dnode_t can cause problems.
- */
- if (!dmu_tx_is_syncing(tx)) {
- (void) dmu_tx_hold_object_impl(tx, os,
- object, THT_NEWOBJECT, 0, 0);
- }
-}
-
-static int
-dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
-{
- int err;
- dmu_buf_impl_t *db;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- db = dbuf_hold_level(dn, level, blkid, FTAG);
- rw_exit(&dn->dn_struct_rwlock);
- if (db == NULL)
- return (EIO);
- err = dbuf_read(db, zio, DB_RF_CANFAIL);
- dbuf_rele(db, FTAG);
- return (err);
-}
-
-/* ARGSUSED */
-static void
-dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
-{
- dnode_t *dn = txh->txh_dnode;
- uint64_t start, end, i;
- int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
- int err = 0;
-
- if (len == 0)
- return;
-
- min_bs = SPA_MINBLOCKSHIFT;
- max_bs = SPA_MAXBLOCKSHIFT;
- min_ibs = DN_MIN_INDBLKSHIFT;
- max_ibs = DN_MAX_INDBLKSHIFT;
-
-
- /*
- * For i/o error checking, read the first and last level-0
- * blocks (if they are not aligned), and all the level-1 blocks.
- */
-
- if (dn) {
- if (dn->dn_maxblkid == 0) {
- err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
- if (err)
- goto out;
- } else {
- zio_t *zio = zio_root(dn->dn_objset->os_spa,
- NULL, NULL, ZIO_FLAG_CANFAIL);
-
- /* first level-0 block */
- start = off >> dn->dn_datablkshift;
- if (P2PHASE(off, dn->dn_datablksz) ||
- len < dn->dn_datablksz) {
- err = dmu_tx_check_ioerr(zio, dn, 0, start);
- if (err)
- goto out;
- }
-
- /* last level-0 block */
- end = (off+len-1) >> dn->dn_datablkshift;
- if (end != start &&
- P2PHASE(off+len, dn->dn_datablksz)) {
- err = dmu_tx_check_ioerr(zio, dn, 0, end);
- if (err)
- goto out;
- }
-
- /* level-1 blocks */
- if (dn->dn_nlevels > 1) {
- start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- for (i = start+1; i < end; i++) {
- err = dmu_tx_check_ioerr(zio, dn, 1, i);
- if (err)
- goto out;
- }
- }
-
- err = zio_wait(zio);
- if (err)
- goto out;
- }
- }
-
- /*
- * If there's more than one block, the blocksize can't change,
- * so we can make a more precise estimate. Alternatively,
- * if the dnode's ibs is larger than max_ibs, always use that.
- * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
- * the code will still work correctly on existing pools.
- */
- if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
- min_ibs = max_ibs = dn->dn_indblkshift;
- if (dn->dn_datablkshift != 0)
- min_bs = max_bs = dn->dn_datablkshift;
- }
-
- /*
- * 'end' is the last thing we will access, not one past.
- * This way we won't overflow when accessing the last byte.
- */
- start = P2ALIGN(off, 1ULL << max_bs);
- end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
- txh->txh_space_towrite += end - start + 1;
-
- start >>= min_bs;
- end >>= min_bs;
-
- epbs = min_ibs - SPA_BLKPTRSHIFT;
-
- /*
- * The object contains at most 2^(64 - min_bs) blocks,
- * and each indirect level maps 2^epbs.
- */
- for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
- start >>= epbs;
- end >>= epbs;
- /*
- * If we increase the number of levels of indirection,
- * we'll need new blkid=0 indirect blocks. If start == 0,
- * we're already accounting for those blocks; and if end == 0,
- * we can't increase the number of levels beyond that.
- */
- if (start != 0 && end != 0)
- txh->txh_space_towrite += 1ULL << max_ibs;
- txh->txh_space_towrite += (end - start + 1) << max_ibs;
- }
-
- ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS);
-
-out:
- if (err)
- txh->txh_tx->tx_err = err;
-}
-
-static void
-dmu_tx_count_dnode(dmu_tx_hold_t *txh)
-{
- dnode_t *dn = txh->txh_dnode;
- dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode;
- uint64_t space = mdn->dn_datablksz +
- ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
-
- if (dn && dn->dn_dbuf->db_blkptr &&
- dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
- dn->dn_dbuf->db_blkptr->blk_birth)) {
- txh->txh_space_tooverwrite += space;
- } else {
- txh->txh_space_towrite += space;
- }
-}
-
-void
-dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
-{
- dmu_tx_hold_t *txh;
-
- ASSERT(tx->tx_txg == 0);
- ASSERT(len < DMU_MAX_ACCESS);
- ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- object, THT_WRITE, off, len);
- if (txh == NULL)
- return;
-
- dmu_tx_count_write(txh, off, len);
- dmu_tx_count_dnode(txh);
-}
-
-static void
-dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
-{
- uint64_t blkid, nblks;
- uint64_t space = 0;
- dnode_t *dn = txh->txh_dnode;
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
- spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
- int dirty;
-
- /*
- * We don't need to use any locking to check for dirtiness
- * because it's OK if we get stale data -- the dnode may become
- * dirty immediately after our check anyway. This is just a
- * means to avoid the expensive count when we aren't sure we
- * need it. We need to be able to deal with a dirty dnode.
- */
- dirty = list_link_active(&dn->dn_dirty_link[0]) |
- list_link_active(&dn->dn_dirty_link[1]) |
- list_link_active(&dn->dn_dirty_link[2]) |
- list_link_active(&dn->dn_dirty_link[3]);
- if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0)
- return;
-
- /*
- * the struct_rwlock protects us against dn_phys->dn_nlevels
- * changing, in case (against all odds) we manage to dirty &
- * sync out the changes after we check for being dirty.
- * also, dbuf_hold_impl() wants us to have the struct_rwlock.
- *
- * It's fine to use dn_datablkshift rather than the dn_phys
- * equivalent because if it is changing, maxblkid==0 and we will
- * bail.
- */
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_phys->dn_maxblkid == 0) {
- if (off == 0 && len >= dn->dn_datablksz) {
- blkid = 0;
- nblks = 1;
- } else {
- rw_exit(&dn->dn_struct_rwlock);
- return;
- }
- } else {
- blkid = off >> dn->dn_datablkshift;
- nblks = (off + len) >> dn->dn_datablkshift;
-
- if (blkid >= dn->dn_phys->dn_maxblkid) {
- rw_exit(&dn->dn_struct_rwlock);
- return;
- }
- if (blkid + nblks > dn->dn_phys->dn_maxblkid)
- nblks = dn->dn_phys->dn_maxblkid - blkid;
-
- /* don't bother after 128,000 blocks */
- nblks = MIN(nblks, 128*1024);
- }
-
- if (dn->dn_phys->dn_nlevels == 1) {
- int i;
- for (i = 0; i < nblks; i++) {
- blkptr_t *bp = dn->dn_phys->dn_blkptr;
- ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
- bp += blkid + i;
- if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
- dprintf_bp(bp, "can free old%s", "");
- space += bp_get_dasize(spa, bp);
- }
- }
- nblks = 0;
- }
-
- while (nblks) {
- dmu_buf_impl_t *dbuf;
- int err, epbs, blkoff, tochk;
-
- epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- blkoff = P2PHASE(blkid, 1<<epbs);
- tochk = MIN((1<<epbs) - blkoff, nblks);
-
- err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf);
- if (err == 0) {
- int i;
- blkptr_t *bp;
-
- err = dbuf_read(dbuf, NULL,
- DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
- if (err != 0) {
- txh->txh_tx->tx_err = err;
- dbuf_rele(dbuf, FTAG);
- break;
- }
-
- bp = dbuf->db.db_data;
- bp += blkoff;
-
- for (i = 0; i < tochk; i++) {
- if (dsl_dataset_block_freeable(ds,
- bp[i].blk_birth)) {
- dprintf_bp(&bp[i],
- "can free old%s", "");
- space += bp_get_dasize(spa, &bp[i]);
- }
- }
- dbuf_rele(dbuf, FTAG);
- }
- if (err && err != ENOENT) {
- txh->txh_tx->tx_err = err;
- break;
- }
-
- blkid += tochk;
- nblks -= tochk;
- }
- rw_exit(&dn->dn_struct_rwlock);
-
- txh->txh_space_tofree += space;
-}
-
-void
-dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
-{
- dmu_tx_hold_t *txh;
- dnode_t *dn;
- uint64_t start, end, i;
- int err, shift;
- zio_t *zio;
-
- ASSERT(tx->tx_txg == 0);
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- object, THT_FREE, off, len);
- if (txh == NULL)
- return;
- dn = txh->txh_dnode;
-
- /* first block */
- if (off != 0)
- dmu_tx_count_write(txh, off, 1);
- /* last block */
- if (len != DMU_OBJECT_END)
- dmu_tx_count_write(txh, off+len, 1);
-
- if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
- return;
- if (len == DMU_OBJECT_END)
- len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
-
- /*
- * For i/o error checking, read the first and last level-0
- * blocks, and all the level-1 blocks. The dmu_tx_count_write()
- * calls above take care of the level-0 blocks.
- */
- if (dn->dn_nlevels > 1) {
- shift = dn->dn_datablkshift + dn->dn_indblkshift -
- SPA_BLKPTRSHIFT;
- start = off >> shift;
- end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
-
- zio = zio_root(tx->tx_pool->dp_spa,
- NULL, NULL, ZIO_FLAG_CANFAIL);
- for (i = start; i <= end; i++) {
- uint64_t ibyte = i << shift;
- err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0);
- i = ibyte >> shift;
- if (err == ESRCH)
- break;
- if (err) {
- tx->tx_err = err;
- return;
- }
-
- err = dmu_tx_check_ioerr(zio, dn, 1, i);
- if (err) {
- tx->tx_err = err;
- return;
- }
- }
- err = zio_wait(zio);
- if (err) {
- tx->tx_err = err;
- return;
- }
- }
-
- dmu_tx_count_dnode(txh);
- dmu_tx_count_free(txh, off, len);
-}
-
-void
-dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
-{
- dmu_tx_hold_t *txh;
- dnode_t *dn;
- uint64_t nblocks;
- int epbs, err;
-
- ASSERT(tx->tx_txg == 0);
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- object, THT_ZAP, add, (uintptr_t)name);
- if (txh == NULL)
- return;
- dn = txh->txh_dnode;
-
- dmu_tx_count_dnode(txh);
-
- if (dn == NULL) {
- /*
- * We will be able to fit a new object's entries into one leaf
- * block. So there will be at most 2 blocks total,
- * including the header block.
- */
- dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
- return;
- }
-
- ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
-
- if (dn->dn_maxblkid == 0 && !add) {
- /*
- * If there is only one block (i.e. this is a micro-zap)
- * and we are not adding anything, the accounting is simple.
- */
- err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
- if (err) {
- tx->tx_err = err;
- return;
- }
-
- /*
- * Use max block size here, since we don't know how much
- * the size will change between now and the dbuf dirty call.
- */
- if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
- dn->dn_phys->dn_blkptr[0].blk_birth))
- txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
- else
- txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
- return;
- }
-
- if (dn->dn_maxblkid > 0 && name) {
- /*
- * access the name in this fat-zap so that we'll check
- * for i/o errors to the leaf blocks, etc.
- */
- err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
- 8, 0, NULL);
- if (err == EIO) {
- tx->tx_err = err;
- return;
- }
- }
-
- /*
- * 3 blocks overwritten: target leaf, ptrtbl block, header block
- * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks
- */
- dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz,
- (3 + (add ? 3 : 0)) << dn->dn_datablkshift);
-
- /*
- * If the modified blocks are scattered to the four winds,
- * we'll have to modify an indirect twig for each.
- */
- epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
- txh->txh_space_towrite += 3 << dn->dn_indblkshift;
-}
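
A sketch of declaring a fat-zap update before assignment (add_entry, zapobj and the entry name are placeholders; passing add = TRUE reserves room for the possible leaf split and ptrtbl growth counted above):

	static int
	add_entry(objset_t *os, uint64_t zapobj, uint64_t value)
	{
		dmu_tx_t *tx;
		int err;

		tx = dmu_tx_create(os);
		dmu_tx_hold_zap(tx, zapobj, TRUE, "example-entry");
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = zap_add(os, zapobj, "example-entry", 8, 1, &value, tx);
		dmu_tx_commit(tx);
		return (err);
	}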
-
-void
-dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
-{
- dmu_tx_hold_t *txh;
-
- ASSERT(tx->tx_txg == 0);
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- object, THT_BONUS, 0, 0);
- if (txh)
- dmu_tx_count_dnode(txh);
-}
-
-void
-dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
-{
- dmu_tx_hold_t *txh;
- ASSERT(tx->tx_txg == 0);
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- DMU_NEW_OBJECT, THT_SPACE, space, 0);
-
- txh->txh_space_towrite += space;
-}
-
-int
-dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
-{
- dmu_tx_hold_t *txh;
- int holds = 0;
-
- /*
- * By asserting that the tx is assigned, we're counting the
- * number of dn_tx_holds, which is the same as the number of
- * dn_holds. Otherwise, we'd be counting dn_holds, but
- * dn_tx_holds could be 0.
- */
- ASSERT(tx->tx_txg != 0);
-
- /* if (tx->tx_anyobj == TRUE) */
- /* return (0); */
-
- for (txh = list_head(&tx->tx_holds); txh;
- txh = list_next(&tx->tx_holds, txh)) {
- if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
- holds++;
- }
-
- return (holds);
-}
-
-#ifdef ZFS_DEBUG
-void
-dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
-{
- dmu_tx_hold_t *txh;
- int match_object = FALSE, match_offset = FALSE;
- dnode_t *dn = db->db_dnode;
-
- ASSERT(tx->tx_txg != 0);
- ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
- ASSERT3U(dn->dn_object, ==, db->db.db_object);
-
- if (tx->tx_anyobj)
- return;
-
- /* XXX No checking on the meta dnode for now */
- if (db->db.db_object == DMU_META_DNODE_OBJECT)
- return;
-
- for (txh = list_head(&tx->tx_holds); txh;
- txh = list_next(&tx->tx_holds, txh)) {
- ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
- if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
- match_object = TRUE;
- if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
- int datablkshift = dn->dn_datablkshift ?
- dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- int shift = datablkshift + epbs * db->db_level;
- uint64_t beginblk = shift >= 64 ? 0 :
- (txh->txh_arg1 >> shift);
- uint64_t endblk = shift >= 64 ? 0 :
- ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
- uint64_t blkid = db->db_blkid;
-
- /* XXX txh_arg2 better not be zero... */
-
- dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
- txh->txh_type, beginblk, endblk);
-
- switch (txh->txh_type) {
- case THT_WRITE:
- if (blkid >= beginblk && blkid <= endblk)
- match_offset = TRUE;
- /*
- * We will let this hold work for the bonus
- * buffer so that we don't need to hold it
- * when creating a new object.
- */
- if (blkid == DB_BONUS_BLKID)
- match_offset = TRUE;
- /*
- * They might have to increase nlevels,
- * thus dirtying the new TLIBs. Or they
- * might have to change the block size,
- * thus dirtying the new lvl=0 blk=0.
- */
- if (blkid == 0)
- match_offset = TRUE;
- break;
- case THT_FREE:
- if (blkid == beginblk &&
- (txh->txh_arg1 != 0 ||
- dn->dn_maxblkid == 0))
- match_offset = TRUE;
- if (blkid == endblk &&
- txh->txh_arg2 != DMU_OBJECT_END)
- match_offset = TRUE;
- break;
- case THT_BONUS:
- if (blkid == DB_BONUS_BLKID)
- match_offset = TRUE;
- break;
- case THT_ZAP:
- match_offset = TRUE;
- break;
- case THT_NEWOBJECT:
- match_object = TRUE;
- break;
- default:
- ASSERT(!"bad txh_type");
- }
- }
- if (match_object && match_offset)
- return;
- }
- panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
- (u_longlong_t)db->db.db_object, db->db_level,
- (u_longlong_t)db->db_blkid);
-}
-#endif
-
-static int
-dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
-{
- dmu_tx_hold_t *txh;
- uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite;
-
- ASSERT3U(tx->tx_txg, ==, 0);
- if (tx->tx_err)
- return (tx->tx_err);
-
- tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
- tx->tx_needassign_txh = NULL;
-
- /*
- * NB: No error returns are allowed after txg_hold_open, but
- * before processing the dnode holds, due to the
- * dmu_tx_unassign() logic.
- */
-
- towrite = tofree = tooverwrite = 0;
- for (txh = list_head(&tx->tx_holds); txh;
- txh = list_next(&tx->tx_holds, txh)) {
- dnode_t *dn = txh->txh_dnode;
- if (dn != NULL) {
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_assigned_txg == tx->tx_txg - 1) {
- mutex_exit(&dn->dn_mtx);
- tx->tx_needassign_txh = txh;
- return (ERESTART);
- }
- if (dn->dn_assigned_txg == 0)
- dn->dn_assigned_txg = tx->tx_txg;
- ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
- (void) refcount_add(&dn->dn_tx_holds, tx);
- mutex_exit(&dn->dn_mtx);
- }
- towrite += txh->txh_space_towrite;
- tofree += txh->txh_space_tofree;
- tooverwrite += txh->txh_space_tooverwrite;
- }
-
- /*
- * NB: This check must be after we've held the dnodes, so that
- * the dmu_tx_unassign() logic will work properly
- */
- if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
- return (ERESTART);
-
- /*
- * If a snapshot has been taken since we made our estimates,
- * assume that we won't be able to free or overwrite anything.
- */
- if (tx->tx_objset &&
- dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
- tx->tx_lastsnap_txg) {
- towrite += tooverwrite;
- tooverwrite = tofree = 0;
- }
-
- /*
- * Convert logical size to worst-case allocated size.
- */
- fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
- lsize = towrite + tooverwrite;
- asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
-
-#ifdef ZFS_DEBUG
- tx->tx_space_towrite = asize;
- tx->tx_space_tofree = tofree;
- tx->tx_space_tooverwrite = tooverwrite;
-#endif
-
- if (tx->tx_dir && asize != 0) {
- int err = dsl_dir_tempreserve_space(tx->tx_dir,
- lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx);
- if (err)
- return (err);
- }
-
- return (0);
-}
-
-static void
-dmu_tx_unassign(dmu_tx_t *tx)
-{
- dmu_tx_hold_t *txh;
-
- if (tx->tx_txg == 0)
- return;
-
- txg_rele_to_quiesce(&tx->tx_txgh);
-
- for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
- txh = list_next(&tx->tx_holds, txh)) {
- dnode_t *dn = txh->txh_dnode;
-
- if (dn == NULL)
- continue;
- mutex_enter(&dn->dn_mtx);
- ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
-
- if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
- dn->dn_assigned_txg = 0;
- cv_broadcast(&dn->dn_notxholds);
- }
- mutex_exit(&dn->dn_mtx);
- }
-
- txg_rele_to_sync(&tx->tx_txgh);
-
- tx->tx_lasttried_txg = tx->tx_txg;
- tx->tx_txg = 0;
-}
-
-/*
- * Assign tx to a transaction group. txg_how can be one of:
- *
- * (1) TXG_WAIT. If the current open txg is full, waits until there's
- * a new one. This should be used when you're not holding locks.
- * It will only fail if we're truly out of space (or over quota).
- *
- * (2) TXG_NOWAIT. If we can't assign into the current open txg without
- * blocking, returns immediately with ERESTART. This should be used
- * whenever you're holding locks. On an ERESTART error, the caller
- * should drop locks, do a dmu_tx_wait(tx), and try again.
- *
- * (3) A specific txg. Use this if you need to ensure that multiple
- * transactions all sync in the same txg. Like TXG_NOWAIT, it
- * returns ERESTART if it can't assign you into the requested txg.
- */
-int
-dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
-{
- int err;
-
- ASSERT(tx->tx_txg == 0);
- ASSERT(txg_how != 0);
- ASSERT(!dsl_pool_sync_context(tx->tx_pool));
-
- while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
- dmu_tx_unassign(tx);
-
- if (err != ERESTART || txg_how != TXG_WAIT)
- return (err);
-
- dmu_tx_wait(tx);
- }
-
- txg_rele_to_quiesce(&tx->tx_txgh);
-
- return (0);
-}
-
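-/*
- * A minimal consumer sketch of the protocol above (example_write_tx is a
- * hypothetical helper; the object/offset/length values are placeholders).
- * Under TXG_NOWAIT, ERESTART means: drop your locks, wait, retry.
- */
-static int
-example_write_tx(objset_t *os, uint64_t object, uint64_t off, int len)
-{
-	dmu_tx_t *tx;
-	int err;
-
-top:
-	tx = dmu_tx_create(os);
-	dmu_tx_hold_write(tx, object, off, len);
-	err = dmu_tx_assign(tx, TXG_NOWAIT);
-	if (err == ERESTART) {
-		/* drop any caller-held locks here */
-		dmu_tx_wait(tx);
-		dmu_tx_abort(tx);
-		goto top;
-	}
-	if (err != 0) {
-		dmu_tx_abort(tx);
-		return (err);
-	}
-	/* ... dirty buffers under this tx, e.g. via dmu_write() ... */
-	dmu_tx_commit(tx);
-	return (0);
-}
-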
-void
-dmu_tx_wait(dmu_tx_t *tx)
-{
- ASSERT(tx->tx_txg == 0);
- ASSERT(tx->tx_lasttried_txg != 0);
-
- if (tx->tx_needassign_txh) {
- dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
-
- mutex_enter(&dn->dn_mtx);
- while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
- cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
- mutex_exit(&dn->dn_mtx);
- tx->tx_needassign_txh = NULL;
- } else {
- txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
- }
-}
-
-void
-dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
-{
-#ifdef ZFS_DEBUG
- if (tx->tx_dir == NULL || delta == 0)
- return;
-
- if (delta > 0) {
- ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
- tx->tx_space_towrite);
- (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
- } else {
- (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
- }
-#endif
-}
-
-void
-dmu_tx_commit(dmu_tx_t *tx)
-{
- dmu_tx_hold_t *txh;
-
- ASSERT(tx->tx_txg != 0);
-
- while (txh = list_head(&tx->tx_holds)) {
- dnode_t *dn = txh->txh_dnode;
-
- list_remove(&tx->tx_holds, txh);
- kmem_free(txh, sizeof (dmu_tx_hold_t));
- if (dn == NULL)
- continue;
- mutex_enter(&dn->dn_mtx);
- ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
-
- if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
- dn->dn_assigned_txg = 0;
- cv_broadcast(&dn->dn_notxholds);
- }
- mutex_exit(&dn->dn_mtx);
- dnode_rele(dn, tx);
- }
-
- if (tx->tx_tempreserve_cookie)
- dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
-
- if (tx->tx_anyobj == FALSE)
- txg_rele_to_sync(&tx->tx_txgh);
-#ifdef ZFS_DEBUG
- dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
- tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
- tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
- refcount_destroy_many(&tx->tx_space_written,
- refcount_count(&tx->tx_space_written));
- refcount_destroy_many(&tx->tx_space_freed,
- refcount_count(&tx->tx_space_freed));
-#endif
- kmem_free(tx, sizeof (dmu_tx_t));
-}
-
-void
-dmu_tx_abort(dmu_tx_t *tx)
-{
- dmu_tx_hold_t *txh;
-
- ASSERT(tx->tx_txg == 0);
-
- while (txh = list_head(&tx->tx_holds)) {
- dnode_t *dn = txh->txh_dnode;
-
- list_remove(&tx->tx_holds, txh);
- kmem_free(txh, sizeof (dmu_tx_hold_t));
- if (dn != NULL)
- dnode_rele(dn, tx);
- }
-#ifdef ZFS_DEBUG
- refcount_destroy_many(&tx->tx_space_written,
- refcount_count(&tx->tx_space_written));
- refcount_destroy_many(&tx->tx_space_freed,
- refcount_count(&tx->tx_space_freed));
-#endif
- kmem_free(tx, sizeof (dmu_tx_t));
-}
-
-uint64_t
-dmu_tx_get_txg(dmu_tx_t *tx)
-{
- ASSERT(tx->tx_txg != 0);
- return (tx->tx_txg);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
deleted file mode 100644
index 78d625c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
+++ /dev/null
@@ -1,655 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/dnode.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_zfetch.h>
-#include <sys/dmu.h>
-#include <sys/dbuf.h>
-
-/*
- * I'm against tune-ables, but these should probably exist as tweakable globals
- * until we can get this working the way we want it to.
- */
-
-int zfs_prefetch_disable = 0;
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN,
- &zfs_prefetch_disable, 0, "Disable prefetch");
-
-/* max # of streams per zfetch */
-uint32_t zfetch_max_streams = 8;
-/* min time before stream reclaim */
-uint32_t zfetch_min_sec_reap = 2;
-/* max number of blocks to fetch at a time */
-uint32_t zfetch_block_cap = 256;
-/* number of bytes in an array_read at which we stop prefetching (1MB) */
-uint64_t zfetch_array_rd_sz = 1024 * 1024;
-
-/* forward decls for static routines */
-static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
-static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
-static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
-static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
-static int dmu_zfetch_find(zfetch_t *, zstream_t *, int);
-static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
-static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
-static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
-static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
-
-/*
- * Given a zfetch structure and a zstream structure, determine whether the
- * blocks to be read are part of a co-linear pair of existing prefetch
- * streams. If a set is found, coalesce the streams, removing one, and
- * configure the prefetch so it looks for a strided access pattern.
- *
- * In other words: if we find two sequential access streams that are
- * the same length and distance N apart, and this read is N from the
- * last stream, then we are probably in a strided access pattern. So
- * combine the two sequential streams into a single strided stream.
- *
- * If no co-linear streams are found, return 0.
- */
-static int
-dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
-{
- zstream_t *z_walk;
- zstream_t *z_comp;
-
- if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
- return (0);
-
- if (zh == NULL) {
- rw_exit(&zf->zf_rwlock);
- return (0);
- }
-
- for (z_walk = list_head(&zf->zf_stream); z_walk;
- z_walk = list_next(&zf->zf_stream, z_walk)) {
- for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
- z_comp = list_next(&zf->zf_stream, z_comp)) {
- int64_t diff;
-
- if (z_walk->zst_len != z_walk->zst_stride ||
- z_comp->zst_len != z_comp->zst_stride) {
- continue;
- }
-
- diff = z_comp->zst_offset - z_walk->zst_offset;
- if (z_comp->zst_offset + diff == zh->zst_offset) {
- z_walk->zst_offset = zh->zst_offset;
- z_walk->zst_direction = diff < 0 ? -1 : 1;
- z_walk->zst_stride =
- diff * z_walk->zst_direction;
- z_walk->zst_ph_offset =
- zh->zst_offset + z_walk->zst_stride;
- dmu_zfetch_stream_remove(zf, z_comp);
- mutex_destroy(&z_comp->zst_lock);
- kmem_free(z_comp, sizeof (zstream_t));
-
- dmu_zfetch_dofetch(zf, z_walk);
-
- rw_exit(&zf->zf_rwlock);
- return (1);
- }
-
- diff = z_walk->zst_offset - z_comp->zst_offset;
- if (z_walk->zst_offset + diff == zh->zst_offset) {
- z_walk->zst_offset = zh->zst_offset;
- z_walk->zst_direction = diff < 0 ? -1 : 1;
- z_walk->zst_stride =
- diff * z_walk->zst_direction;
- z_walk->zst_ph_offset =
- zh->zst_offset + z_walk->zst_stride;
- dmu_zfetch_stream_remove(zf, z_comp);
- mutex_destroy(&z_comp->zst_lock);
- kmem_free(z_comp, sizeof (zstream_t));
-
- dmu_zfetch_dofetch(zf, z_walk);
-
- rw_exit(&zf->zf_rwlock);
- return (1);
- }
- }
- }
-
- rw_exit(&zf->zf_rwlock);
- return (0);
-}
-
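-/*
- * Concretely (hypothetical numbers): if one stream covers blocks 0-3 and
- * another covers blocks 100-103, then diff = 100, and a new access at
- * block 200 satisfies z_comp->zst_offset + diff == zh->zst_offset; the
- * two streams collapse into a single strided stream with zst_stride = 100
- * that prefetches ahead at 300, 400, and so on.
- */
-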
-/*
- * Given a zstream_t, determine the bounds of the prefetch. Then call the
- * routine that actually prefetches the individual blocks.
- */
-static void
-dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
-{
- uint64_t prefetch_tail;
- uint64_t prefetch_limit;
- uint64_t prefetch_ofst;
- uint64_t prefetch_len;
- uint64_t blocks_fetched;
-
- zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
- zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
-
- prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
- (int64_t)(zs->zst_offset + zs->zst_stride));
- /*
- * XXX: use a faster division method?
- */
- prefetch_limit = zs->zst_offset + zs->zst_len +
- (zs->zst_cap * zs->zst_stride) / zs->zst_len;
-
- while (prefetch_tail < prefetch_limit) {
- prefetch_ofst = zs->zst_offset + zs->zst_direction *
- (prefetch_tail - zs->zst_offset);
-
- prefetch_len = zs->zst_len;
-
- /*
-		 * Don't prefetch past the beginning of the file when
-		 * working backwards.
- */
- if ((zs->zst_direction == ZFETCH_BACKWARD) &&
- (prefetch_ofst > prefetch_tail)) {
- prefetch_len += prefetch_ofst;
- prefetch_ofst = 0;
- }
-
- /* don't prefetch more than we're supposed to */
- if (prefetch_len > zs->zst_len)
- break;
-
- blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
- prefetch_ofst, zs->zst_len);
-
- prefetch_tail += zs->zst_stride;
- /* stop if we've run out of stuff to prefetch */
- if (blocks_fetched < zs->zst_len)
- break;
- }
- zs->zst_ph_offset = prefetch_tail;
- zs->zst_last = lbolt;
-}
-
-/*
- * This takes a pointer to a zfetch structure and a dnode. It performs the
- * necessary setup for the zfetch structure, grokking data from the
- * associated dnode.
- */
-void
-dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
-{
- if (zf == NULL) {
- return;
- }
-
- zf->zf_dnode = dno;
- zf->zf_stream_cnt = 0;
- zf->zf_alloc_fail = 0;
-
- list_create(&zf->zf_stream, sizeof (zstream_t),
- offsetof(zstream_t, zst_node));
-
- rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
-}
-
-/*
- * This function computes the actual size, in blocks, that can be prefetched,
- * and fetches it.
- */
-static uint64_t
-dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
-{
- uint64_t fetchsz;
- uint64_t i;
-
- fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
-
- for (i = 0; i < fetchsz; i++) {
- dbuf_prefetch(dn, blkid + i);
- }
-
- return (fetchsz);
-}
-
-/*
- * This function returns the number of blocks that would be prefetched, based
- * upon the supplied dnode, blockid, and nblks. This is used so that we can
- * update streams in place, and then prefetch with their old value after the
- * fact. This way, we can delay the prefetch, but subsequent accesses to the
- * stream won't result in the same data being prefetched multiple times.
- */
-static uint64_t
-dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
-{
- uint64_t fetchsz;
-
- if (blkid > dn->dn_maxblkid) {
- return (0);
- }
-
- /* compute fetch size */
- if (blkid + nblks + 1 > dn->dn_maxblkid) {
- fetchsz = (dn->dn_maxblkid - blkid) + 1;
- ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
- } else {
- fetchsz = nblks;
- }
-
- return (fetchsz);
-}
-
-/*
- * Given a zfetch and a zstream structure, see if there is an associated
- * zstream for this block read.  If so, start a prefetch for the stream it
- * located and return true; otherwise return false.
- */
-static int
-dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
-{
- zstream_t *zs;
- int64_t diff;
- int reset = !prefetched;
- int rc = 0;
-
- if (zh == NULL)
- return (0);
-
- /*
- * XXX: This locking strategy is a bit coarse; however, its impact has
- * yet to be tested. If this turns out to be an issue, it can be
- * modified in a number of different ways.
- */
-
- rw_enter(&zf->zf_rwlock, RW_READER);
-top:
-
- for (zs = list_head(&zf->zf_stream); zs;
- zs = list_next(&zf->zf_stream, zs)) {
-
- /*
- * XXX - should this be an assert?
- */
- if (zs->zst_len == 0) {
- /* bogus stream */
- continue;
- }
-
- /*
- * We hit this case when we are in a strided prefetch stream:
- * we will read "len" blocks before "striding".
- */
- if (zh->zst_offset >= zs->zst_offset &&
- zh->zst_offset < zs->zst_offset + zs->zst_len) {
- /* already fetched */
- rc = 1;
- goto out;
- }
-
- /*
- * This is the forward sequential read case: we increment
- * len by one each time we hit here, so we will enter this
- * case on every read.
- */
- if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
-
- reset = !prefetched && zs->zst_len > 1;
-
- mutex_enter(&zs->zst_lock);
-
- if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
- mutex_exit(&zs->zst_lock);
- goto top;
- }
- zs->zst_len += zh->zst_len;
- diff = zs->zst_len - zfetch_block_cap;
- if (diff > 0) {
- zs->zst_offset += diff;
- zs->zst_len = zs->zst_len > diff ?
- zs->zst_len - diff : 0;
- }
- zs->zst_direction = ZFETCH_FORWARD;
-
- break;
-
- /*
- * Same as above, but reading backwards through the file.
- */
- } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
- /* backwards sequential access */
-
- reset = !prefetched && zs->zst_len > 1;
-
- mutex_enter(&zs->zst_lock);
-
- if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
- mutex_exit(&zs->zst_lock);
- goto top;
- }
-
- zs->zst_offset = zs->zst_offset > zh->zst_len ?
- zs->zst_offset - zh->zst_len : 0;
- zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
- zs->zst_ph_offset - zh->zst_len : 0;
- zs->zst_len += zh->zst_len;
-
- diff = zs->zst_len - zfetch_block_cap;
- if (diff > 0) {
- zs->zst_ph_offset = zs->zst_ph_offset > diff ?
- zs->zst_ph_offset - diff : 0;
- zs->zst_len = zs->zst_len > diff ?
- zs->zst_len - diff : zs->zst_len;
- }
- zs->zst_direction = ZFETCH_BACKWARD;
-
- break;
-
- } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
- zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
- /* strided forward access */
-
- mutex_enter(&zs->zst_lock);
-
- if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
- zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
- mutex_exit(&zs->zst_lock);
- goto top;
- }
-
- zs->zst_offset += zs->zst_stride;
- zs->zst_direction = ZFETCH_FORWARD;
-
- break;
-
- } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
- zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
- /* strided reverse access */
-
- mutex_enter(&zs->zst_lock);
-
- if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
- zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
- mutex_exit(&zs->zst_lock);
- goto top;
- }
-
- zs->zst_offset = zs->zst_offset > zs->zst_stride ?
- zs->zst_offset - zs->zst_stride : 0;
- zs->zst_ph_offset = (zs->zst_ph_offset >
- (2 * zs->zst_stride)) ?
- (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
- zs->zst_direction = ZFETCH_BACKWARD;
-
- break;
- }
- }
-
- if (zs) {
- if (reset) {
- zstream_t *remove = zs;
-
- rc = 0;
- mutex_exit(&zs->zst_lock);
- rw_exit(&zf->zf_rwlock);
- rw_enter(&zf->zf_rwlock, RW_WRITER);
- /*
-			 * Re-locate the stream, in case someone removed
-			 * it while we were acquiring the WRITER lock.
- */
- for (zs = list_head(&zf->zf_stream); zs;
- zs = list_next(&zf->zf_stream, zs)) {
- if (zs == remove) {
- dmu_zfetch_stream_remove(zf, zs);
- mutex_destroy(&zs->zst_lock);
- kmem_free(zs, sizeof (zstream_t));
- break;
- }
- }
- } else {
- rc = 1;
- dmu_zfetch_dofetch(zf, zs);
- mutex_exit(&zs->zst_lock);
- }
- }
-out:
- rw_exit(&zf->zf_rwlock);
- return (rc);
-}
-
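-/*
- * For example (hypothetical values): a stream covering blocks 0-3
- * (zst_offset = 0, zst_len = 4) that sees a read of block 4 takes the
- * forward-sequential branch above: zst_len grows to 5 and a prefetch is
- * issued.  Once zst_len would exceed zfetch_block_cap, the window slides
- * forward by the excess rather than growing without bound.
- */
-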
-/*
- * Clean up state associated with a zfetch structure.  This frees allocated
- * structure members, empties the zf_stream list, and generally tidies up.
- * It doesn't free the zfetch_t itself; that's left to the caller.
- */
-void
-dmu_zfetch_rele(zfetch_t *zf)
-{
- zstream_t *zs;
- zstream_t *zs_next;
-
- ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
-
- for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
- zs_next = list_next(&zf->zf_stream, zs);
-
- list_remove(&zf->zf_stream, zs);
- mutex_destroy(&zs->zst_lock);
- kmem_free(zs, sizeof (zstream_t));
- }
- list_destroy(&zf->zf_stream);
- rw_destroy(&zf->zf_rwlock);
-
- zf->zf_dnode = NULL;
-}
-
-/*
- * Given a zfetch and zstream structure, insert the zstream structure into
- * the stream list contained within the zfetch structure.  Perform the
- * appropriate book-keeping.  It is possible that another thread has inserted
- * a stream which matches one that we are about to insert, so we must be sure
- * to check for this case.  If one is found, return failure, and let the
- * caller clean up the duplicates.
- */
-static int
-dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
-{
- zstream_t *zs_walk;
- zstream_t *zs_next;
-
- ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
-
- for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
- zs_next = list_next(&zf->zf_stream, zs_walk);
-
- if (dmu_zfetch_streams_equal(zs_walk, zs)) {
- return (0);
- }
- }
-
- list_insert_head(&zf->zf_stream, zs);
- zf->zf_stream_cnt++;
-
- return (1);
-}
-
-
-/*
- * Walk the list of zstreams in the given zfetch, find an old one (by time), and
- * reclaim it for use by the caller.
- */
-static zstream_t *
-dmu_zfetch_stream_reclaim(zfetch_t *zf)
-{
- zstream_t *zs;
-
- if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
- return (0);
-
- for (zs = list_head(&zf->zf_stream); zs;
- zs = list_next(&zf->zf_stream, zs)) {
-
- if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap)
- break;
- }
-
- if (zs) {
- dmu_zfetch_stream_remove(zf, zs);
- mutex_destroy(&zs->zst_lock);
- bzero(zs, sizeof (zstream_t));
- } else {
- zf->zf_alloc_fail++;
- }
- rw_exit(&zf->zf_rwlock);
-
- return (zs);
-}
-
-/*
- * Given a zfetch and zstream structure, remove the zstream structure from its
- * container in the zfetch structure. Perform the appropriate book-keeping.
- */
-static void
-dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
-{
- ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
-
- list_remove(&zf->zf_stream, zs);
- zf->zf_stream_cnt--;
-}
-
-static int
-dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
-{
- if (zs1->zst_offset != zs2->zst_offset)
- return (0);
-
- if (zs1->zst_len != zs2->zst_len)
- return (0);
-
- if (zs1->zst_stride != zs2->zst_stride)
- return (0);
-
- if (zs1->zst_ph_offset != zs2->zst_ph_offset)
- return (0);
-
- if (zs1->zst_cap != zs2->zst_cap)
- return (0);
-
- if (zs1->zst_direction != zs2->zst_direction)
- return (0);
-
- return (1);
-}
-
-/*
- * This is the prefetch entry point. It calls all of the other dmu_zfetch
- * routines to create, delete, find, or operate upon prefetch streams.
- */
-void
-dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
-{
- zstream_t zst;
- zstream_t *newstream;
- int fetched;
- int inserted;
- unsigned int blkshft;
- uint64_t blksz;
-
- if (zfs_prefetch_disable)
- return;
-
-	/* files that aren't power-of-2 blocksize are only one block -- nothing to do */
- if (!zf->zf_dnode->dn_datablkshift)
- return;
-
- /* convert offset and size, into blockid and nblocks */
- blkshft = zf->zf_dnode->dn_datablkshift;
- blksz = (1 << blkshft);
-
- bzero(&zst, sizeof (zstream_t));
- zst.zst_offset = offset >> blkshft;
- zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
- P2ALIGN(offset, blksz)) >> blkshft;
-
- fetched = dmu_zfetch_find(zf, &zst, prefetched);
- if (!fetched) {
- fetched = dmu_zfetch_colinear(zf, &zst);
- }
-
- if (!fetched) {
- newstream = dmu_zfetch_stream_reclaim(zf);
-
- /*
-		 * We still couldn't find or reclaim a stream, so allocate a
-		 * new one if we're under the stream limit.  Otherwise, give
-		 * up and go home.
- */
- if (newstream == NULL) {
- uint64_t maxblocks;
- uint32_t max_streams;
- uint32_t cur_streams;
-
- cur_streams = zf->zf_stream_cnt;
- maxblocks = zf->zf_dnode->dn_maxblkid;
-
- max_streams = MIN(zfetch_max_streams,
- (maxblocks / zfetch_block_cap));
- if (max_streams == 0) {
- max_streams++;
- }
-
- if (cur_streams >= max_streams) {
- return;
- }
-
- newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
- }
-
- newstream->zst_offset = zst.zst_offset;
- newstream->zst_len = zst.zst_len;
- newstream->zst_stride = zst.zst_len;
- newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
- newstream->zst_cap = zst.zst_len;
- newstream->zst_direction = ZFETCH_FORWARD;
- newstream->zst_last = lbolt;
-
- mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
-
- rw_enter(&zf->zf_rwlock, RW_WRITER);
- inserted = dmu_zfetch_stream_insert(zf, newstream);
- rw_exit(&zf->zf_rwlock);
-
- if (!inserted) {
- mutex_destroy(&newstream->zst_lock);
- kmem_free(newstream, sizeof (zstream_t));
- }
- }
-}
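-
-/*
- * For example (hypothetical values): with 128K data blocks
- * (dn_datablkshift = 17, so blksz = 0x20000), a read of size 0x40000 at
- * offset 0x25000 touches blocks 1 through 3, and the conversion above
- * yields zst_offset = 0x25000 >> 17 = 1 and zst_len =
- * (P2ROUNDUP(0x65000, 0x20000) - P2ALIGN(0x25000, 0x20000)) >> 17 =
- * (0x80000 - 0x20000) >> 17 = 3.
- */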
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c
deleted file mode 100644
index ca50285..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c
+++ /dev/null
@@ -1,1369 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/dbuf.h>
-#include <sys/dnode.h>
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_dataset.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu_zfetch.h>
-
-static int free_range_compar(const void *node1, const void *node2);
-
-static kmem_cache_t *dnode_cache;
-
-static dnode_phys_t dnode_phys_zero;
-
-int zfs_default_bs = SPA_MINBLOCKSHIFT;
-int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
-
-/* ARGSUSED */
-static int
-dnode_cons(void *arg, void *unused, int kmflag)
-{
- int i;
- dnode_t *dn = arg;
- bzero(dn, sizeof (dnode_t));
-
- cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
- rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
- mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
- refcount_create(&dn->dn_holds);
- refcount_create(&dn->dn_tx_holds);
-
- for (i = 0; i < TXG_SIZE; i++) {
- avl_create(&dn->dn_ranges[i], free_range_compar,
- sizeof (free_range_t),
- offsetof(struct free_range, fr_node));
- list_create(&dn->dn_dirty_records[i],
- sizeof (dbuf_dirty_record_t),
- offsetof(dbuf_dirty_record_t, dr_dirty_node));
- }
-
- list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
- offsetof(dmu_buf_impl_t, db_link));
-
- return (0);
-}
-
-/* ARGSUSED */
-static void
-dnode_dest(void *arg, void *unused)
-{
- int i;
- dnode_t *dn = arg;
-
- cv_destroy(&dn->dn_notxholds);
- rw_destroy(&dn->dn_struct_rwlock);
- mutex_destroy(&dn->dn_mtx);
- mutex_destroy(&dn->dn_dbufs_mtx);
- refcount_destroy(&dn->dn_holds);
- refcount_destroy(&dn->dn_tx_holds);
-
- for (i = 0; i < TXG_SIZE; i++) {
- avl_destroy(&dn->dn_ranges[i]);
- list_destroy(&dn->dn_dirty_records[i]);
- }
-
- list_destroy(&dn->dn_dbufs);
-}
-
-void
-dnode_init(void)
-{
- dnode_cache = kmem_cache_create("dnode_t",
- sizeof (dnode_t),
- 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
-}
-
-void
-dnode_fini(void)
-{
- kmem_cache_destroy(dnode_cache);
-}
-
-
-#ifdef ZFS_DEBUG
-void
-dnode_verify(dnode_t *dn)
-{
- int drop_struct_lock = FALSE;
-
- ASSERT(dn->dn_phys);
- ASSERT(dn->dn_objset);
-
- ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
-
- if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
- return;
-
- if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- drop_struct_lock = TRUE;
- }
- if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
- int i;
- ASSERT3U(dn->dn_indblkshift, >=, 0);
- ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
- if (dn->dn_datablkshift) {
- ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
- ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
- ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
- }
- ASSERT3U(dn->dn_nlevels, <=, 30);
- ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES);
- ASSERT3U(dn->dn_nblkptr, >=, 1);
- ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
- ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
- ASSERT3U(dn->dn_datablksz, ==,
- dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
- ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
- ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
- dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
- }
- }
- if (dn->dn_phys->dn_type != DMU_OT_NONE)
- ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
- ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL);
- if (dn->dn_dbuf != NULL) {
- ASSERT3P(dn->dn_phys, ==,
- (dnode_phys_t *)dn->dn_dbuf->db.db_data +
- (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
- }
- if (drop_struct_lock)
- rw_exit(&dn->dn_struct_rwlock);
-}
-#endif
-
-void
-dnode_byteswap(dnode_phys_t *dnp)
-{
- uint64_t *buf64 = (void*)&dnp->dn_blkptr;
- int i;
-
- if (dnp->dn_type == DMU_OT_NONE) {
- bzero(dnp, sizeof (dnode_phys_t));
- return;
- }
-
- dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
- dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
- dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
- dnp->dn_used = BSWAP_64(dnp->dn_used);
-
- /*
- * dn_nblkptr is only one byte, so it's OK to read it in either
- * byte order. We can't read dn_bonuslen.
- */
- ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
- ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
- for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
- buf64[i] = BSWAP_64(buf64[i]);
-
- /*
- * OK to check dn_bonuslen for zero, because it won't matter if
- * we have the wrong byte order. This is necessary because the
- * dnode dnode is smaller than a regular dnode.
- */
- if (dnp->dn_bonuslen != 0) {
- /*
- * Note that the bonus length calculated here may be
- * longer than the actual bonus buffer. This is because
- * we always put the bonus buffer after the last block
- * pointer (instead of packing it against the end of the
- * dnode buffer).
- */
- int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
- size_t len = DN_MAX_BONUSLEN - off;
- ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES);
- dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
- }
-}
-
-void
-dnode_buf_byteswap(void *vbuf, size_t size)
-{
- dnode_phys_t *buf = vbuf;
- int i;
-
- ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
- ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
-
- size >>= DNODE_SHIFT;
- for (i = 0; i < size; i++) {
- dnode_byteswap(buf);
- buf++;
- }
-}
-
-static int
-free_range_compar(const void *node1, const void *node2)
-{
- const free_range_t *rp1 = node1;
- const free_range_t *rp2 = node2;
-
- if (rp1->fr_blkid < rp2->fr_blkid)
- return (-1);
- else if (rp1->fr_blkid > rp2->fr_blkid)
- return (1);
- else return (0);
-}
-
-static void
-dnode_setdblksz(dnode_t *dn, int size)
-{
- ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0);
- ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
- ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
- ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
- 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
- dn->dn_datablksz = size;
- dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
- dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
-}
-
-static dnode_t *
-dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
- uint64_t object)
-{
- dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
-
- dn->dn_objset = os;
- dn->dn_object = object;
- dn->dn_dbuf = db;
- dn->dn_phys = dnp;
-
- if (dnp->dn_datablkszsec)
- dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
- dn->dn_indblkshift = dnp->dn_indblkshift;
- dn->dn_nlevels = dnp->dn_nlevels;
- dn->dn_type = dnp->dn_type;
- dn->dn_nblkptr = dnp->dn_nblkptr;
- dn->dn_checksum = dnp->dn_checksum;
- dn->dn_compress = dnp->dn_compress;
- dn->dn_bonustype = dnp->dn_bonustype;
- dn->dn_bonuslen = dnp->dn_bonuslen;
- dn->dn_maxblkid = dnp->dn_maxblkid;
-
- dmu_zfetch_init(&dn->dn_zfetch, dn);
-
- ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
- mutex_enter(&os->os_lock);
- list_insert_head(&os->os_dnodes, dn);
- mutex_exit(&os->os_lock);
-
- return (dn);
-}
-
-static void
-dnode_destroy(dnode_t *dn)
-{
- objset_impl_t *os = dn->dn_objset;
-
-#ifdef ZFS_DEBUG
- int i;
-
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
- ASSERT(NULL == list_head(&dn->dn_dirty_records[i]));
- ASSERT(0 == avl_numnodes(&dn->dn_ranges[i]));
- }
- ASSERT(NULL == list_head(&dn->dn_dbufs));
-#endif
-
- mutex_enter(&os->os_lock);
- list_remove(&os->os_dnodes, dn);
- mutex_exit(&os->os_lock);
-
- if (dn->dn_dirtyctx_firstset) {
- kmem_free(dn->dn_dirtyctx_firstset, 1);
- dn->dn_dirtyctx_firstset = NULL;
- }
- dmu_zfetch_rele(&dn->dn_zfetch);
- if (dn->dn_bonus) {
- mutex_enter(&dn->dn_bonus->db_mtx);
- dbuf_evict(dn->dn_bonus);
- dn->dn_bonus = NULL;
- }
- kmem_cache_free(dnode_cache, dn);
-}
-
-void
-dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- int i;
-
- if (blocksize == 0)
- blocksize = 1 << zfs_default_bs;
- else if (blocksize > SPA_MAXBLOCKSIZE)
- blocksize = SPA_MAXBLOCKSIZE;
- else
- blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
-
- if (ibs == 0)
- ibs = zfs_default_ibs;
-
- ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
-
- dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
- dn->dn_object, tx->tx_txg, blocksize, ibs);
-
- ASSERT(dn->dn_type == DMU_OT_NONE);
- ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
- ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
- ASSERT(ot != DMU_OT_NONE);
- ASSERT3U(ot, <, DMU_OT_NUMTYPES);
- ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
- (bonustype != DMU_OT_NONE && bonuslen != 0));
- ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
- ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
- ASSERT(dn->dn_type == DMU_OT_NONE);
- ASSERT3U(dn->dn_maxblkid, ==, 0);
- ASSERT3U(dn->dn_allocated_txg, ==, 0);
- ASSERT3U(dn->dn_assigned_txg, ==, 0);
- ASSERT(refcount_is_zero(&dn->dn_tx_holds));
- ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
- ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
-
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
- ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
- ASSERT3U(dn->dn_next_blksz[i], ==, 0);
- ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
- ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
- ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
- }
-
- dn->dn_type = ot;
- dnode_setdblksz(dn, blocksize);
- dn->dn_indblkshift = ibs;
- dn->dn_nlevels = 1;
- dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
- dn->dn_bonustype = bonustype;
- dn->dn_bonuslen = bonuslen;
- dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
- dn->dn_compress = ZIO_COMPRESS_INHERIT;
- dn->dn_dirtyctx = 0;
-
- dn->dn_free_txg = 0;
- if (dn->dn_dirtyctx_firstset) {
- kmem_free(dn->dn_dirtyctx_firstset, 1);
- dn->dn_dirtyctx_firstset = NULL;
- }
-
- dn->dn_allocated_txg = tx->tx_txg;
-
- dnode_setdirty(dn, tx);
- dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
- dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
-}
-
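-/*
- * For example: with DN_MAX_BONUSLEN = 320 and 128-byte block pointers
- * (SPA_BLKPTRSHIFT = 7), the dn_nblkptr formula above yields 3 block
- * pointers for bonuslen 0-64, 2 for bonuslen 65-192, and the minimum
- * of 1 when the bonus buffer occupies 193-320 bytes.
- */
-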
-void
-dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- int i;
- dmu_buf_impl_t *db = NULL;
-
- ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
- ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
- ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
- ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
- ASSERT(tx->tx_txg != 0);
- ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
- (bonustype != DMU_OT_NONE && bonuslen != 0));
- ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
- ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
-
- for (i = 0; i < TXG_SIZE; i++)
- ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
-
- /* clean up any unreferenced dbufs */
- (void) dnode_evict_dbufs(dn, 0);
- ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
-
- /*
- * XXX I should really have a generation number to tell if we
- * need to do this...
- */
- if (blocksize != dn->dn_datablksz ||
- dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) {
- /* free all old data */
- dnode_free_range(dn, 0, -1ULL, tx);
- }
-
- /* change blocksize */
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (blocksize != dn->dn_datablksz &&
- (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
- list_head(&dn->dn_dbufs) != NULL)) {
- db = dbuf_hold(dn, 0, FTAG);
- dbuf_new_size(db, blocksize, tx);
- }
- dnode_setdblksz(dn, blocksize);
- dnode_setdirty(dn, tx);
- dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
- rw_exit(&dn->dn_struct_rwlock);
- if (db) {
- dbuf_rele(db, FTAG);
- db = NULL;
- }
-
- /* change type */
- dn->dn_type = ot;
-
- if (dn->dn_bonuslen != bonuslen) {
- /* change bonus size */
- if (bonuslen == 0)
- bonuslen = 1; /* XXX */
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (dn->dn_bonus == NULL)
- dn->dn_bonus = dbuf_create_bonus(dn);
- db = dn->dn_bonus;
- rw_exit(&dn->dn_struct_rwlock);
- if (refcount_add(&db->db_holds, FTAG) == 1)
- dnode_add_ref(dn, db);
- VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
- mutex_enter(&db->db_mtx);
- ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
- ASSERT(db->db.db_data != NULL);
- db->db.db_size = bonuslen;
- mutex_exit(&db->db_mtx);
- (void) dbuf_dirty(db, tx);
- }
-
- /* change bonus size and type */
- mutex_enter(&dn->dn_mtx);
- dn->dn_bonustype = bonustype;
- dn->dn_bonuslen = bonuslen;
- dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
- dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
- dn->dn_compress = ZIO_COMPRESS_INHERIT;
- ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
-
- /*
- * NB: we have to do the dbuf_rele after we've changed the
- * dn_bonuslen, for the sake of dbuf_verify().
- */
- if (db)
- dbuf_rele(db, FTAG);
-
- dn->dn_allocated_txg = tx->tx_txg;
- mutex_exit(&dn->dn_mtx);
-}
-
-void
-dnode_special_close(dnode_t *dn)
-{
- /*
- * Wait for final references to the dnode to clear. This can
- * only happen if the arc is asynchronously evicting state that
- * has a hold on this dnode while we are trying to evict this
- * dnode.
- */
- while (refcount_count(&dn->dn_holds) > 0)
- delay(1);
- dnode_destroy(dn);
-}
-
-dnode_t *
-dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object)
-{
- dnode_t *dn = dnode_create(os, dnp, NULL, object);
- DNODE_VERIFY(dn);
- return (dn);
-}
-
-static void
-dnode_buf_pageout(dmu_buf_t *db, void *arg)
-{
- dnode_t **children_dnodes = arg;
- int i;
- int epb = db->db_size >> DNODE_SHIFT;
-
- for (i = 0; i < epb; i++) {
- dnode_t *dn = children_dnodes[i];
- int n;
-
- if (dn == NULL)
- continue;
-#ifdef ZFS_DEBUG
- /*
- * If there are holds on this dnode, then there should
- * be holds on the dnode's containing dbuf as well; thus
-	 * it wouldn't be eligible for eviction and this function
- * would not have been called.
- */
- ASSERT(refcount_is_zero(&dn->dn_holds));
- ASSERT(list_head(&dn->dn_dbufs) == NULL);
- ASSERT(refcount_is_zero(&dn->dn_tx_holds));
-
- for (n = 0; n < TXG_SIZE; n++)
- ASSERT(!list_link_active(&dn->dn_dirty_link[n]));
-#endif
- children_dnodes[i] = NULL;
- dnode_destroy(dn);
- }
- kmem_free(children_dnodes, epb * sizeof (dnode_t *));
-}
-
-/*
- * errors:
- * EINVAL - invalid object number.
- * EIO - i/o error.
- * Succeeds even for free dnodes.
- */
-int
-dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
- void *tag, dnode_t **dnp)
-{
- int epb, idx, err;
- int drop_struct_lock = FALSE;
- int type;
- uint64_t blk;
- dnode_t *mdn, *dn;
- dmu_buf_impl_t *db;
- dnode_t **children_dnodes;
-
- if (object == 0 || object >= DN_MAX_OBJECT)
- return (EINVAL);
-
- mdn = os->os_meta_dnode;
-
- DNODE_VERIFY(mdn);
-
- if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
- rw_enter(&mdn->dn_struct_rwlock, RW_READER);
- drop_struct_lock = TRUE;
- }
-
- blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
-
- db = dbuf_hold(mdn, blk, FTAG);
- if (drop_struct_lock)
- rw_exit(&mdn->dn_struct_rwlock);
- if (db == NULL)
- return (EIO);
- err = dbuf_read(db, NULL, DB_RF_CANFAIL);
- if (err) {
- dbuf_rele(db, FTAG);
- return (err);
- }
-
- ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
- epb = db->db.db_size >> DNODE_SHIFT;
-
- idx = object & (epb-1);
-
- children_dnodes = dmu_buf_get_user(&db->db);
- if (children_dnodes == NULL) {
- dnode_t **winner;
- children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *),
- KM_SLEEP);
- if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
- dnode_buf_pageout)) {
- kmem_free(children_dnodes, epb * sizeof (dnode_t *));
- children_dnodes = winner;
- }
- }
-
- if ((dn = children_dnodes[idx]) == NULL) {
- dnode_t *winner;
- dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx,
- db, object);
- winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
- if (winner != NULL) {
- dnode_destroy(dn);
- dn = winner;
- }
- }
-
- mutex_enter(&dn->dn_mtx);
- type = dn->dn_type;
- if (dn->dn_free_txg ||
- ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
- ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) {
- mutex_exit(&dn->dn_mtx);
- dbuf_rele(db, FTAG);
- return (type == DMU_OT_NONE ? ENOENT : EEXIST);
- }
- mutex_exit(&dn->dn_mtx);
-
- if (refcount_add(&dn->dn_holds, tag) == 1)
- dbuf_add_ref(db, dn);
-
- DNODE_VERIFY(dn);
- ASSERT3P(dn->dn_dbuf, ==, db);
- ASSERT3U(dn->dn_object, ==, object);
- dbuf_rele(db, FTAG);
-
- *dnp = dn;
- return (0);
-}
-
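-/*
- * To make the indexing above concrete (hypothetical sizes): with 16K
- * meta-dnode blocks, epb = 16384 >> DNODE_SHIFT = 32 dnodes per block,
- * so object 100 lives in meta-dnode block 100 * 512 / 16384 = 3, at
- * index 100 & 31 = 4 within that block's children_dnodes array.
- */
-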
-/*
- * Return a held dnode in *dnp if the object is allocated; ENOENT if not.
- */
-int
-dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp)
-{
- return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
-}
-
-void
-dnode_add_ref(dnode_t *dn, void *tag)
-{
- ASSERT(refcount_count(&dn->dn_holds) > 0);
- (void) refcount_add(&dn->dn_holds, tag);
-}
-
-void
-dnode_rele(dnode_t *dn, void *tag)
-{
- uint64_t refs;
-
- refs = refcount_remove(&dn->dn_holds, tag);
- /* NOTE: the DNODE_DNODE does not have a dn_dbuf */
- if (refs == 0 && dn->dn_dbuf)
- dbuf_rele(dn->dn_dbuf, dn);
-}
-
-void
-dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
-{
- objset_impl_t *os = dn->dn_objset;
- uint64_t txg = tx->tx_txg;
-
- if (dn->dn_object == DMU_META_DNODE_OBJECT)
- return;
-
- DNODE_VERIFY(dn);
-
-#ifdef ZFS_DEBUG
- mutex_enter(&dn->dn_mtx);
- ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
- /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */
- mutex_exit(&dn->dn_mtx);
-#endif
-
- mutex_enter(&os->os_lock);
-
- /*
- * If we are already marked dirty, we're done.
- */
- if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
- mutex_exit(&os->os_lock);
- return;
- }
-
- ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
- ASSERT(dn->dn_datablksz != 0);
- ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
-
- dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
- dn->dn_object, txg);
-
- if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
- list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
- } else {
- list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
- }
-
- mutex_exit(&os->os_lock);
-
- /*
- * The dnode maintains a hold on its containing dbuf as
- * long as there are holds on it. Each instantiated child
- * dbuf maintains a hold on the dnode. When the last child
- * drops its hold, the dnode will drop its hold on the
- * containing dbuf. We add a "dirty hold" here so that the
- * dnode will hang around after we finish processing its
- * children.
- */
- dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg);
-
- (void) dbuf_dirty(dn->dn_dbuf, tx);
-
- dsl_dataset_dirty(os->os_dsl_dataset, tx);
-}
-
-void
-dnode_free(dnode_t *dn, dmu_tx_t *tx)
-{
- int txgoff = tx->tx_txg & TXG_MASK;
-
- dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
-
- /* we should be the only holder... hopefully */
- /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
-
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
- mutex_exit(&dn->dn_mtx);
- return;
- }
- dn->dn_free_txg = tx->tx_txg;
- mutex_exit(&dn->dn_mtx);
-
- /*
- * If the dnode is already dirty, it needs to be moved from
- * the dirty list to the free list.
- */
- mutex_enter(&dn->dn_objset->os_lock);
- if (list_link_active(&dn->dn_dirty_link[txgoff])) {
- list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn);
- list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn);
- mutex_exit(&dn->dn_objset->os_lock);
- } else {
- mutex_exit(&dn->dn_objset->os_lock);
- dnode_setdirty(dn, tx);
- }
-}
-
-/*
- * Try to change the block size for the indicated dnode. This can only
- * succeed if there are no blocks allocated or dirty beyond the first block.
- */
-int
-dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db, *db_next;
- int have_db0 = FALSE;
-
- if (size == 0)
- size = SPA_MINBLOCKSIZE;
- if (size > SPA_MAXBLOCKSIZE)
- size = SPA_MAXBLOCKSIZE;
- else
- size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
-
- if (ibs == dn->dn_indblkshift)
- ibs = 0;
-
- if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
- return (0);
-
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-
- /* Check for any allocated blocks beyond the first */
- if (dn->dn_phys->dn_maxblkid != 0)
- goto fail;
-
- mutex_enter(&dn->dn_dbufs_mtx);
- for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
- db_next = list_next(&dn->dn_dbufs, db);
-
- if (db->db_blkid == 0) {
- have_db0 = TRUE;
- } else if (db->db_blkid != DB_BONUS_BLKID) {
- mutex_exit(&dn->dn_dbufs_mtx);
- goto fail;
- }
- }
- mutex_exit(&dn->dn_dbufs_mtx);
-
- if (ibs && dn->dn_nlevels != 1)
- goto fail;
-
- db = NULL;
- if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) {
- /* obtain the old block */
- db = dbuf_hold(dn, 0, FTAG);
- dbuf_new_size(db, size, tx);
- }
-
- dnode_setdblksz(dn, size);
- dnode_setdirty(dn, tx);
- dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
- if (ibs) {
- dn->dn_indblkshift = ibs;
- dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
- }
-
- if (db)
- dbuf_rele(db, FTAG);
-
- rw_exit(&dn->dn_struct_rwlock);
- return (0);
-
-fail:
- rw_exit(&dn->dn_struct_rwlock);
- return (ENOTSUP);
-}
-
-void
-dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
-{
- uint64_t txgoff = tx->tx_txg & TXG_MASK;
- int drop_struct_lock = FALSE;
- int epbs, new_nlevels;
- uint64_t sz;
-
- ASSERT(blkid != DB_BONUS_BLKID);
-
- if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- drop_struct_lock = TRUE;
- }
-
- if (blkid <= dn->dn_maxblkid)
- goto out;
-
- dn->dn_maxblkid = blkid;
-
- /*
- * Compute the number of levels necessary to support the new maxblkid.
- */
- new_nlevels = 1;
- epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- for (sz = dn->dn_nblkptr;
- sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
- new_nlevels++;
-
- if (new_nlevels > dn->dn_nlevels) {
- int old_nlevels = dn->dn_nlevels;
- dmu_buf_impl_t *db;
- list_t *list;
- dbuf_dirty_record_t *new, *dr, *dr_next;
-
- dn->dn_nlevels = new_nlevels;
-
- ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
- dn->dn_next_nlevels[txgoff] = new_nlevels;
-
- /* dirty the left indirects */
- db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
- new = dbuf_dirty(db, tx);
- dbuf_rele(db, FTAG);
-
- /* transfer the dirty records to the new indirect */
- mutex_enter(&dn->dn_mtx);
- mutex_enter(&new->dt.di.dr_mtx);
- list = &dn->dn_dirty_records[txgoff];
- for (dr = list_head(list); dr; dr = dr_next) {
- dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
- if (dr->dr_dbuf->db_level != new_nlevels-1 &&
- dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) {
- ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
- list_remove(&dn->dn_dirty_records[txgoff], dr);
- list_insert_tail(&new->dt.di.dr_children, dr);
- dr->dr_parent = new;
- }
- }
- mutex_exit(&new->dt.di.dr_mtx);
- mutex_exit(&dn->dn_mtx);
- }
-
-out:
- if (drop_struct_lock)
- rw_exit(&dn->dn_struct_rwlock);
-}
-
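-/*
- * Worked example of the level computation above (hypothetical geometry):
- * with dn_nblkptr = 3 and 16K indirect blocks (epbs = 14 - 7 = 7, i.e.
- * 128 block pointers per indirect block), a new maxblkid of 1000 needs
- * new_nlevels = 3: sz starts at 3 (<= 1000), grows to 3 << 7 = 384
- * (still <= 1000), then to 384 << 7 = 49152 (> 1000).
- */
-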
-void
-dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
-{
- avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
- avl_index_t where;
- free_range_t *rp;
- free_range_t rp_tofind;
- uint64_t endblk = blkid + nblks;
-
- ASSERT(MUTEX_HELD(&dn->dn_mtx));
- ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
-
- dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
- blkid, nblks, tx->tx_txg);
- rp_tofind.fr_blkid = blkid;
- rp = avl_find(tree, &rp_tofind, &where);
- if (rp == NULL)
- rp = avl_nearest(tree, where, AVL_BEFORE);
- if (rp == NULL)
- rp = avl_nearest(tree, where, AVL_AFTER);
-
- while (rp && (rp->fr_blkid <= blkid + nblks)) {
- uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
- free_range_t *nrp = AVL_NEXT(tree, rp);
-
- if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
- /* clear this entire range */
- avl_remove(tree, rp);
- kmem_free(rp, sizeof (free_range_t));
- } else if (blkid <= rp->fr_blkid &&
- endblk > rp->fr_blkid && endblk < fr_endblk) {
- /* clear the beginning of this range */
- rp->fr_blkid = endblk;
- rp->fr_nblks = fr_endblk - endblk;
- } else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
- endblk >= fr_endblk) {
- /* clear the end of this range */
- rp->fr_nblks = blkid - rp->fr_blkid;
- } else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
- /* clear a chunk out of this range */
- free_range_t *new_rp =
- kmem_alloc(sizeof (free_range_t), KM_SLEEP);
-
- new_rp->fr_blkid = endblk;
- new_rp->fr_nblks = fr_endblk - endblk;
- avl_insert_here(tree, new_rp, rp, AVL_AFTER);
- rp->fr_nblks = blkid - rp->fr_blkid;
- }
- /* there may be no overlap */
- rp = nrp;
- }
-}
-
-void
-dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db;
- uint64_t blkoff, blkid, nblks;
- int blksz, head;
- int trunc = FALSE;
-
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- blksz = dn->dn_datablksz;
-
- /* If the range is past the end of the file, this is a no-op */
- if (off >= blksz * (dn->dn_maxblkid+1))
- goto out;
- if (len == -1ULL) {
- len = UINT64_MAX - off;
- trunc = TRUE;
- }
-
- /*
- * First, block align the region to free:
- */
- if (ISP2(blksz)) {
- head = P2NPHASE(off, blksz);
- blkoff = P2PHASE(off, blksz);
- } else {
- ASSERT(dn->dn_maxblkid == 0);
- if (off == 0 && len >= blksz) {
- /* Freeing the whole block; don't do any head. */
- head = 0;
- } else {
- /* Freeing part of the block. */
- head = blksz - off;
- ASSERT3U(head, >, 0);
- }
- blkoff = off;
- }
- /* zero out any partial block data at the start of the range */
- if (head) {
- ASSERT3U(blkoff + head, ==, blksz);
- if (len < head)
- head = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
- FTAG, &db) == 0) {
- caddr_t data;
-
- /* don't dirty if it isn't on disk and isn't dirty */
- if (db->db_last_dirty ||
- (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
- rw_exit(&dn->dn_struct_rwlock);
- dbuf_will_dirty(db, tx);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- data = db->db.db_data;
- bzero(data + blkoff, head);
- }
- dbuf_rele(db, FTAG);
- }
- off += head;
- len -= head;
- }
-
- /* If the range was less than one block, we're done */
- if (len == 0 || off >= blksz * (dn->dn_maxblkid+1))
- goto out;
-
- if (!ISP2(blksz)) {
- /*
- * They are freeing the whole block of a
- * non-power-of-two blocksize file. Skip all the messy
- * math.
- */
- ASSERT3U(off, ==, 0);
- ASSERT3U(len, >=, blksz);
- blkid = 0;
- nblks = 1;
- } else {
- int tail;
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- int blkshift = dn->dn_datablkshift;
-
- /* If the remaining range is past end of file, we're done */
- if (off > dn->dn_maxblkid << blkshift)
- goto out;
-
- if (off + len == UINT64_MAX)
- tail = 0;
- else
- tail = P2PHASE(len, blksz);
-
- ASSERT3U(P2PHASE(off, blksz), ==, 0);
- /* zero out any partial block data at the end of the range */
- if (tail) {
- if (len < tail)
- tail = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
- TRUE, FTAG, &db) == 0) {
- /* don't dirty if not on disk and not dirty */
- if (db->db_last_dirty ||
- (db->db_blkptr &&
- !BP_IS_HOLE(db->db_blkptr))) {
- rw_exit(&dn->dn_struct_rwlock);
- dbuf_will_dirty(db, tx);
- rw_enter(&dn->dn_struct_rwlock,
- RW_WRITER);
- bzero(db->db.db_data, tail);
- }
- dbuf_rele(db, FTAG);
- }
- len -= tail;
- }
- /* If the range did not include a full block, we are done */
- if (len == 0)
- goto out;
-
- /* dirty the left indirects */
- if (dn->dn_nlevels > 1 && off != 0) {
- db = dbuf_hold_level(dn, 1,
- (off - head) >> (blkshift + epbs), FTAG);
- dbuf_will_dirty(db, tx);
- dbuf_rele(db, FTAG);
- }
-
- /* dirty the right indirects */
- if (dn->dn_nlevels > 1 && !trunc) {
- db = dbuf_hold_level(dn, 1,
- (off + len + tail - 1) >> (blkshift + epbs), FTAG);
- dbuf_will_dirty(db, tx);
- dbuf_rele(db, FTAG);
- }
-
- /*
-		 * Finally, add this range to the dnode range list; we
-		 * will finish up this free operation in the syncing phase.
- */
- ASSERT(IS_P2ALIGNED(off, 1<<blkshift));
- ASSERT(off + len == UINT64_MAX ||
- IS_P2ALIGNED(len, 1<<blkshift));
- blkid = off >> blkshift;
- nblks = len >> blkshift;
-
- if (trunc)
- dn->dn_maxblkid = (blkid ? blkid - 1 : 0);
- }
-
- mutex_enter(&dn->dn_mtx);
- dnode_clear_range(dn, blkid, nblks, tx);
- {
- free_range_t *rp, *found;
- avl_index_t where;
- avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
-
- /* Add new range to dn_ranges */
- rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
- rp->fr_blkid = blkid;
- rp->fr_nblks = nblks;
- found = avl_find(tree, rp, &where);
- ASSERT(found == NULL);
- avl_insert(tree, rp, where);
- dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
- blkid, nblks, tx->tx_txg);
- }
- mutex_exit(&dn->dn_mtx);
-
- dbuf_free_range(dn, blkid, nblks, tx);
- dnode_setdirty(dn, tx);
-out:
- rw_exit(&dn->dn_struct_rwlock);
-}
-
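-/*
- * A worked example of the alignment above (hypothetical values): with
- * 128K blocks, freeing off = 100K, len = 300K gives head = 28K (zeroed
- * in place in block 0), leaving off = 128K, len = 272K; then tail =
- * 272K % 128K = 16K (zeroed in block 3), leaving blkid = 1, nblks = 2,
- * so blocks 1 and 2 are freed whole in syncing context via dn_ranges.
- */
-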
-/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
-uint64_t
-dnode_block_freed(dnode_t *dn, uint64_t blkid)
-{
- free_range_t range_tofind;
- void *dp = spa_get_dsl(dn->dn_objset->os_spa);
- int i;
-
- if (blkid == DB_BONUS_BLKID)
- return (FALSE);
-
- /*
- * If we're in the process of opening the pool, dp will not be
- * set yet, but there shouldn't be anything dirty.
- */
- if (dp == NULL)
- return (FALSE);
-
- if (dn->dn_free_txg)
- return (TRUE);
-
- /*
- * If dn_datablkshift is not set, then there's only a single
- * block, in which case there will never be a free range so it
- * won't matter.
- */
- range_tofind.fr_blkid = blkid;
- mutex_enter(&dn->dn_mtx);
- for (i = 0; i < TXG_SIZE; i++) {
- free_range_t *range_found;
- avl_index_t idx;
-
- range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
- if (range_found) {
- ASSERT(range_found->fr_nblks > 0);
- break;
- }
- range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
- if (range_found &&
- range_found->fr_blkid + range_found->fr_nblks > blkid)
- break;
- }
- mutex_exit(&dn->dn_mtx);
- return (i < TXG_SIZE);
-}
-
-/* call from syncing context when we actually write/free space for this dnode */
-void
-dnode_diduse_space(dnode_t *dn, int64_t delta)
-{
- uint64_t space;
- dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
- dn, dn->dn_phys,
- (u_longlong_t)dn->dn_phys->dn_used,
- (longlong_t)delta);
-
- mutex_enter(&dn->dn_mtx);
- space = DN_USED_BYTES(dn->dn_phys);
- if (delta > 0) {
- ASSERT3U(space + delta, >=, space); /* no overflow */
- } else {
- ASSERT3U(space, >=, -delta); /* no underflow */
- }
- space += delta;
- if (spa_version(dn->dn_objset->os_spa) < ZFS_VERSION_DNODE_BYTES) {
- ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
- ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0);
- dn->dn_phys->dn_used = space >> DEV_BSHIFT;
- } else {
- dn->dn_phys->dn_used = space;
- dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
- }
- mutex_exit(&dn->dn_mtx);
-}
-
-/*
- * Call when we think we're going to write/free space in open context.
- * Be conservative (i.e. OK to write less than this or free more than
- * this, but don't write more or free less).
- */
-void
-dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
-{
- objset_impl_t *os = dn->dn_objset;
- dsl_dataset_t *ds = os->os_dsl_dataset;
-
- if (space > 0)
- space = spa_get_asize(os->os_spa, space);
-
- if (ds)
- dsl_dir_willuse_space(ds->ds_dir, space, tx);
-
- dmu_tx_willuse_space(tx, space);
-}
-
-static int
-dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
- int lvl, uint64_t blkfill, uint64_t txg)
-{
- dmu_buf_impl_t *db = NULL;
- void *data = NULL;
- uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- uint64_t epb = 1ULL << epbs;
- uint64_t minfill, maxfill;
- int i, error, span;
-
- dprintf("probing object %llu offset %llx level %d of %u\n",
- dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
-
- if (lvl == dn->dn_phys->dn_nlevels) {
- error = 0;
- epb = dn->dn_phys->dn_nblkptr;
- data = dn->dn_phys->dn_blkptr;
- } else {
- uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
- error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
- if (error) {
- if (error == ENOENT)
- return (hole ? 0 : ESRCH);
- return (error);
- }
- error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
- if (error) {
- dbuf_rele(db, FTAG);
- return (error);
- }
- data = db->db.db_data;
- }
-
- if (db && txg &&
- (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
- error = ESRCH;
- } else if (lvl == 0) {
- dnode_phys_t *dnp = data;
- span = DNODE_SHIFT;
- ASSERT(dn->dn_type == DMU_OT_DNODE);
-
- for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) {
- boolean_t newcontents = B_TRUE;
- if (txg) {
- int j;
- newcontents = B_FALSE;
- for (j = 0; j < dnp[i].dn_nblkptr; j++) {
- if (dnp[i].dn_blkptr[j].blk_birth > txg)
- newcontents = B_TRUE;
- }
- }
-			/*
-			 * Note: !dnp[i].dn_type is 1 exactly when the dnode
-			 * is free (DMU_OT_NONE is 0), so this matches free
-			 * dnodes when hole is TRUE and allocated dnodes
-			 * when it is FALSE.
-			 */
-			if (!dnp[i].dn_type == hole && newcontents)
- break;
- *offset += 1ULL << span;
- }
- if (i == blkfill)
- error = ESRCH;
- } else {
- blkptr_t *bp = data;
- span = (lvl - 1) * epbs + dn->dn_datablkshift;
- minfill = 0;
- maxfill = blkfill << ((lvl - 1) * epbs);
-
- if (hole)
- maxfill--;
- else
- minfill++;
-
- for (i = (*offset >> span) & ((1ULL << epbs) - 1);
- i < epb; i++) {
- if (bp[i].blk_fill >= minfill &&
- bp[i].blk_fill <= maxfill &&
- bp[i].blk_birth > txg)
- break;
- *offset += 1ULL << span;
- }
- if (i >= epb)
- error = ESRCH;
- }
-
- if (db)
- dbuf_rele(db, FTAG);
-
- return (error);
-}
-
-/*
- * Find the next hole, data, or sparse region at or after *offset.
- * The value 'blkfill' tells us how many items we expect to find
- * in an L0 data block; this value is 1 for normal objects,
- * DNODES_PER_BLOCK for the meta dnode, and some fraction of
- * DNODES_PER_BLOCK when searching for sparse regions thereof.
- *
- * Examples:
- *
- * dnode_next_offset(dn, hole, offset, 1, 1, 0);
- * Finds the next hole/data in a file.
- * Used in dmu_offset_next().
- *
- * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK, txg);
- *	Finds the next free/allocated dnode in an objset's meta-dnode.
- *	Only finds objects that have new contents since txg (i.e.
- * bonus buffer changes and content removal are ignored).
- * Used in dmu_object_next().
- *
- * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
- * Finds the next L2 meta-dnode bp that's at most 1/4 full.
- * Used in dmu_object_alloc().
- */
-int
-dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset,
- int minlvl, uint64_t blkfill, uint64_t txg)
-{
- int lvl, maxlvl;
- int error = 0;
- uint64_t initial_offset = *offset;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
-
- if (dn->dn_phys->dn_nlevels == 0) {
- rw_exit(&dn->dn_struct_rwlock);
- return (ESRCH);
- }
-
- if (dn->dn_datablkshift == 0) {
- if (*offset < dn->dn_datablksz) {
- if (hole)
- *offset = dn->dn_datablksz;
- } else {
- error = ESRCH;
- }
- rw_exit(&dn->dn_struct_rwlock);
- return (error);
- }
-
- maxlvl = dn->dn_phys->dn_nlevels;
-
- for (lvl = minlvl; lvl <= maxlvl; lvl++) {
- error = dnode_next_offset_level(dn,
- hole, offset, lvl, blkfill, txg);
- if (error != ESRCH)
- break;
- }
-
- while (--lvl >= minlvl && error == 0) {
- error = dnode_next_offset_level(dn,
- hole, offset, lvl, blkfill, txg);
- }
-
- rw_exit(&dn->dn_struct_rwlock);
-
- if (error == 0 && initial_offset > *offset)
- error = ESRCH;
-
- return (error);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
deleted file mode 100644
index 9e8c7ad..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
+++ /dev/null
@@ -1,623 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/dbuf.h>
-#include <sys/dnode.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dataset.h>
-#include <sys/spa.h>
-
-static void
-dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db;
- int txgoff = tx->tx_txg & TXG_MASK;
- int nblkptr = dn->dn_phys->dn_nblkptr;
- int old_toplvl = dn->dn_phys->dn_nlevels - 1;
- int new_level = dn->dn_next_nlevels[txgoff];
- int i;
-
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-
- /* this dnode can't be paged out because it's dirty */
- ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
- ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
- ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
-
- db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
- ASSERT(db != NULL);
-
- dn->dn_phys->dn_nlevels = new_level;
- dprintf("os=%p obj=%llu, increase to %d\n",
- dn->dn_objset, dn->dn_object,
- dn->dn_phys->dn_nlevels);
-
- /* check for existing blkptrs in the dnode */
- for (i = 0; i < nblkptr; i++)
- if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
- break;
- if (i != nblkptr) {
- /* transfer dnode's block pointers to new indirect block */
- (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
- ASSERT(db->db.db_data);
- ASSERT(arc_released(db->db_buf));
- ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
- bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
- sizeof (blkptr_t) * nblkptr);
- arc_buf_freeze(db->db_buf);
- }
-
- /* set dbuf's parent pointers to new indirect buf */
- for (i = 0; i < nblkptr; i++) {
- dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);
-
- if (child == NULL)
- continue;
- ASSERT3P(child->db_dnode, ==, dn);
- if (child->db_parent && child->db_parent != dn->dn_dbuf) {
- ASSERT(child->db_parent->db_level == db->db_level);
- ASSERT(child->db_blkptr !=
- &dn->dn_phys->dn_blkptr[child->db_blkid]);
- mutex_exit(&child->db_mtx);
- continue;
- }
- ASSERT(child->db_parent == NULL ||
- child->db_parent == dn->dn_dbuf);
-
- child->db_parent = db;
- dbuf_add_ref(db, child);
- if (db->db.db_data)
- child->db_blkptr = (blkptr_t *)db->db.db_data + i;
- else
- child->db_blkptr = NULL;
- dprintf_dbuf_bp(child, child->db_blkptr,
- "changed db_blkptr to new indirect %s", "");
-
- mutex_exit(&child->db_mtx);
- }
-
- bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
-
- dbuf_rele(db, FTAG);
-
- rw_exit(&dn->dn_struct_rwlock);
-}
-
-static void
-free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
-{
- objset_impl_t *os = dn->dn_objset;
- uint64_t bytesfreed = 0;
- int i;
-
- dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num);
-
- for (i = 0; i < num; i++, bp++) {
- if (BP_IS_HOLE(bp))
- continue;
-
- bytesfreed += bp_get_dasize(os->os_spa, bp);
- ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
- dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx);
- bzero(bp, sizeof (blkptr_t));
- }
- dnode_diduse_space(dn, -bytesfreed);
-}
-
-#ifdef ZFS_DEBUG
-static void
-free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
-{
- int off, num;
- int i, err, epbs;
- uint64_t txg = tx->tx_txg;
-
- epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- off = start - (db->db_blkid << epbs);
- num = end - start + 1;
-
- ASSERT3U(off, >=, 0);
- ASSERT3U(num, >=, 0);
- ASSERT3U(db->db_level, >, 0);
- ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift);
- ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
- ASSERT(db->db_blkptr != NULL);
-
- for (i = off; i < off+num; i++) {
- uint64_t *buf;
- dmu_buf_impl_t *child;
- dbuf_dirty_record_t *dr;
- int j;
-
- ASSERT(db->db_level == 1);
-
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
- (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
- rw_exit(&db->db_dnode->dn_struct_rwlock);
- if (err == ENOENT)
- continue;
- ASSERT(err == 0);
- ASSERT(child->db_level == 0);
- dr = child->db_last_dirty;
- while (dr && dr->dr_txg > txg)
- dr = dr->dr_next;
- ASSERT(dr == NULL || dr->dr_txg == txg);
-
- /* data_old better be zeroed */
- if (dr) {
- buf = dr->dt.dl.dr_data->b_data;
- for (j = 0; j < child->db.db_size >> 3; j++) {
- if (buf[j] != 0) {
- panic("freed data not zero: "
- "child=%p i=%d off=%d num=%d\n",
- child, i, off, num);
- }
- }
- }
-
- /*
- * db_data better be zeroed unless it's dirty in a
- * future txg.
- */
- mutex_enter(&child->db_mtx);
- buf = child->db.db_data;
- if (buf != NULL && child->db_state != DB_FILL &&
- child->db_last_dirty == NULL) {
- for (j = 0; j < child->db.db_size >> 3; j++) {
- if (buf[j] != 0) {
- panic("freed data not zero: "
- "child=%p i=%d off=%d num=%d\n",
- child, i, off, num);
- }
- }
- }
- mutex_exit(&child->db_mtx);
-
- dbuf_rele(child, FTAG);
- }
-}
-#endif
-
-static int
-free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
- dmu_tx_t *tx)
-{
- dnode_t *dn = db->db_dnode;
- blkptr_t *bp;
- dmu_buf_impl_t *subdb;
- uint64_t start, end, dbstart, dbend, i;
- int epbs, shift, err;
- int all = TRUE;
-
- (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
- arc_release(db->db_buf, db);
- bp = (blkptr_t *)db->db.db_data;
-
- epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- shift = (db->db_level - 1) * epbs;
- dbstart = db->db_blkid << epbs;
- start = blkid >> shift;
- if (dbstart < start) {
- bp += start - dbstart;
- all = FALSE;
- } else {
- start = dbstart;
- }
- dbend = ((db->db_blkid + 1) << epbs) - 1;
- end = (blkid + nblks - 1) >> shift;
- if (dbend <= end)
- end = dbend;
- else if (all)
- all = trunc;
- ASSERT3U(start, <=, end);
-
- if (db->db_level == 1) {
- FREE_VERIFY(db, start, end, tx);
- free_blocks(dn, bp, end-start+1, tx);
- arc_buf_freeze(db->db_buf);
- ASSERT(all || db->db_last_dirty);
- return (all);
- }
-
- for (i = start; i <= end; i++, bp++) {
- if (BP_IS_HOLE(bp))
- continue;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb);
- ASSERT3U(err, ==, 0);
- rw_exit(&dn->dn_struct_rwlock);
-
- if (free_children(subdb, blkid, nblks, trunc, tx)) {
- ASSERT3P(subdb->db_blkptr, ==, bp);
- free_blocks(dn, bp, 1, tx);
- } else {
- all = FALSE;
- }
- dbuf_rele(subdb, FTAG);
- }
- arc_buf_freeze(db->db_buf);
-#ifdef ZFS_DEBUG
- bp -= (end-start)+1;
- for (i = start; i <= end; i++, bp++) {
- if (i == start && blkid != 0)
- continue;
- else if (i == end && !trunc)
- continue;
- ASSERT3U(bp->blk_birth, ==, 0);
- }
-#endif
- ASSERT(all || db->db_last_dirty);
- return (all);
-}
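
Whether free_children() can report "all freed" turns on the range clipping at
its top: an indirect block at db_blkid covers child ids [db_blkid << epbs,
((db_blkid + 1) << epbs) - 1], and the requested free range is clipped to
that window.  A stand-alone model of only that clipping; the block id and
epbs values below are invented for illustration:

    #include <assert.h>
    #include <stdint.h>

    /*
     * Clip child-id range [start, end] to the children covered by one
     * indirect block; returns nonzero if the whole block is freed.
     */
    static int
    clip(uint64_t db_blkid, int epbs, uint64_t start, uint64_t end,
        int trunc, uint64_t *s, uint64_t *e)
    {
        uint64_t dbstart = db_blkid << epbs;
        uint64_t dbend = ((db_blkid + 1) << epbs) - 1;
        int all = 1;

        if (dbstart < start)
            all = 0;            /* range begins inside this block */
        else
            start = dbstart;
        if (dbend <= end)
            end = dbend;
        else if (all)
            all = trunc;        /* partial tail only counts for truncate */
        *s = start;
        *e = end;
        return (all);
    }

    int
    main(void)
    {
        uint64_t s, e;

        /* block 1 with epbs=7 covers child ids 128..255 */
        assert(clip(1, 7, 0, 300, 0, &s, &e) == 1 && s == 128 && e == 255);
        assert(clip(1, 7, 200, 300, 0, &s, &e) == 0 && s == 200 && e == 255);
        return (0);
    }
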
-
-/*
- * free_range: Traverse the indicated range of the provided file
- * and "free" all the blocks contained there.
- */
-static void
-dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
-{
- blkptr_t *bp = dn->dn_phys->dn_blkptr;
- dmu_buf_impl_t *db;
- int trunc, start, end, shift, i, err;
- int dnlevel = dn->dn_phys->dn_nlevels;
-
- if (blkid > dn->dn_phys->dn_maxblkid)
- return;
-
- ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
- trunc = blkid + nblks > dn->dn_phys->dn_maxblkid;
- if (trunc)
- nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
-
- /* There are no indirect blocks in the object */
- if (dnlevel == 1) {
- if (blkid >= dn->dn_phys->dn_nblkptr) {
- /* this range was never made persistent */
- return;
- }
- ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
- free_blocks(dn, bp + blkid, nblks, tx);
- if (trunc) {
- uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
- (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
- dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
- ASSERT(off < dn->dn_phys->dn_maxblkid ||
- dn->dn_phys->dn_maxblkid == 0 ||
- dnode_next_offset(dn, FALSE, &off,
- 1, 1, 0) != 0);
- }
- return;
- }
-
- shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
- start = blkid >> shift;
- ASSERT(start < dn->dn_phys->dn_nblkptr);
- end = (blkid + nblks - 1) >> shift;
- bp += start;
- for (i = start; i <= end; i++, bp++) {
- if (BP_IS_HOLE(bp))
- continue;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db);
- ASSERT3U(err, ==, 0);
- rw_exit(&dn->dn_struct_rwlock);
-
- if (free_children(db, blkid, nblks, trunc, tx)) {
- ASSERT3P(db->db_blkptr, ==, bp);
- free_blocks(dn, bp, 1, tx);
- }
- dbuf_rele(db, FTAG);
- }
- if (trunc) {
- uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
- (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
- dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
- ASSERT(off < dn->dn_phys->dn_maxblkid ||
- dn->dn_phys->dn_maxblkid == 0 ||
- dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0);
- }
-}
-
-/*
- * Try to kick all the dnode's dbufs out of the cache...
- */
-int
-dnode_evict_dbufs(dnode_t *dn, int try)
-{
- int progress;
- int pass = 0;
-
- do {
- dmu_buf_impl_t *db, marker;
- int evicting = FALSE;
-
- progress = FALSE;
- mutex_enter(&dn->dn_dbufs_mtx);
- list_insert_tail(&dn->dn_dbufs, &marker);
- db = list_head(&dn->dn_dbufs);
- for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
- list_remove(&dn->dn_dbufs, db);
- list_insert_tail(&dn->dn_dbufs, db);
-
- mutex_enter(&db->db_mtx);
- if (db->db_state == DB_EVICTING) {
- progress = TRUE;
- evicting = TRUE;
- mutex_exit(&db->db_mtx);
- } else if (refcount_is_zero(&db->db_holds)) {
- progress = TRUE;
- ASSERT(!arc_released(db->db_buf));
- dbuf_clear(db); /* exits db_mtx for us */
- } else {
- mutex_exit(&db->db_mtx);
- }
-
- }
- list_remove(&dn->dn_dbufs, &marker);
- /*
- * NB: we need to drop dn_dbufs_mtx between passes so
- * that any DB_EVICTING dbufs can make progress.
- * Ideally, we would have some cv we could wait on, but
- * since we don't, just wait a bit to give the other
- * thread a chance to run.
- */
- mutex_exit(&dn->dn_dbufs_mtx);
- if (evicting)
- delay(1);
- pass++;
- ASSERT(pass < 100); /* sanity check */
- } while (progress);
-
- /*
- * This function works fine even if it can't evict everything.
- * If we were only asked to try to evict everything, then
- * return an error if we can't.  Otherwise, panic, as the caller
- * expects total eviction.
- */
- if (list_head(&dn->dn_dbufs) != NULL) {
- if (try) {
- return (1);
- } else {
- panic("dangling dbufs (dn=%p, dbuf=%p)\n",
- dn, list_head(&dn->dn_dbufs));
- }
- }
-
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
- mutex_enter(&dn->dn_bonus->db_mtx);
- dbuf_evict(dn->dn_bonus);
- dn->dn_bonus = NULL;
- }
- rw_exit(&dn->dn_struct_rwlock);
- return (0);
-}
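
The marker trick in dnode_evict_dbufs() above -- insert a sentinel at the
tail, then rotate each head element to the tail until the sentinel comes
around -- is what lets the function drop and retake locks between elements
without losing its place.  A user-space sketch of the same pattern, with
<sys/queue.h>'s TAILQ standing in for the kernel's list_t and an odd/even
test standing in for the hold-count check:

    #include <sys/queue.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node {
        int val;                        /* -1 marks the sentinel */
        TAILQ_ENTRY(node) link;
    };
    TAILQ_HEAD(nlist, node);

    int
    main(void)
    {
        struct nlist list = TAILQ_HEAD_INITIALIZER(list);
        struct node marker = { .val = -1 }, *n;

        for (int i = 1; i <= 5; i++) {
            n = malloc(sizeof (*n));
            n->val = i;
            TAILQ_INSERT_TAIL(&list, n, link);
        }

        TAILQ_INSERT_TAIL(&list, &marker, link);
        while ((n = TAILQ_FIRST(&list)) != &marker) {
            TAILQ_REMOVE(&list, n, link);
            if (n->val % 2) {           /* "busy": revisit on a later pass */
                TAILQ_INSERT_TAIL(&list, n, link);
                printf("skipped %d\n", n->val);
            } else {
                printf("evicted %d\n", n->val);
                free(n);
            }
        }
        TAILQ_REMOVE(&list, &marker, link);

        while ((n = TAILQ_FIRST(&list)) != NULL) {
            TAILQ_REMOVE(&list, n, link);   /* leftovers for a later pass */
            free(n);
        }
        return (0);
    }
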
-
-static void
-dnode_undirty_dbufs(list_t *list)
-{
- dbuf_dirty_record_t *dr;
-
- while ((dr = list_head(list)) != NULL) {
- dmu_buf_impl_t *db = dr->dr_dbuf;
- uint64_t txg = dr->dr_txg;
-
- mutex_enter(&db->db_mtx);
- /* XXX - use dbuf_undirty()? */
- list_remove(list, dr);
- ASSERT(db->db_last_dirty == dr);
- db->db_last_dirty = NULL;
- db->db_dirtycnt -= 1;
- if (db->db_level == 0) {
- ASSERT(db->db_blkid == DB_BONUS_BLKID ||
- dr->dt.dl.dr_data == db->db_buf);
- dbuf_unoverride(dr);
- mutex_exit(&db->db_mtx);
- } else {
- mutex_exit(&db->db_mtx);
- dnode_undirty_dbufs(&dr->dt.di.dr_children);
- list_destroy(&dr->dt.di.dr_children);
- mutex_destroy(&dr->dt.di.dr_mtx);
- }
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
- dbuf_rele(db, (void *)(uintptr_t)txg);
- }
-}
-
-static void
-dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
-{
- int txgoff = tx->tx_txg & TXG_MASK;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
- (void) dnode_evict_dbufs(dn, 0);
- ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
-
- /*
- * XXX - It would be nice to assert this, but we may still
- * have residual holds from async evictions from the arc...
- *
- * zfs_obj_to_path() also depends on this being
- * commented out.
- *
- * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
- */
-
- /* Undirty next bits */
- dn->dn_next_nlevels[txgoff] = 0;
- dn->dn_next_indblkshift[txgoff] = 0;
- dn->dn_next_blksz[txgoff] = 0;
-
- /* free up all the blocks in the file. */
- dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
- ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0);
-
- /* ASSERT(blkptrs are zero); */
- ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
- ASSERT(dn->dn_type != DMU_OT_NONE);
-
- ASSERT(dn->dn_free_txg > 0);
- if (dn->dn_allocated_txg != dn->dn_free_txg)
- dbuf_will_dirty(dn->dn_dbuf, tx);
- bzero(dn->dn_phys, sizeof (dnode_phys_t));
-
- mutex_enter(&dn->dn_mtx);
- dn->dn_type = DMU_OT_NONE;
- dn->dn_maxblkid = 0;
- dn->dn_allocated_txg = 0;
- mutex_exit(&dn->dn_mtx);
-
- ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
-
- dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
- /*
- * Now that we've released our hold, the dnode may
- * be evicted, so we mustn't access it.
- */
-}
-
-/*
- * Write out the dnode's dirty buffers.
- *
- * NOTE: The dnode is kept in memory by being dirty. Once the
- * dirty bit is cleared, it may be evicted. Beware of this!
- */
-void
-dnode_sync(dnode_t *dn, dmu_tx_t *tx)
-{
- free_range_t *rp;
- dnode_phys_t *dnp = dn->dn_phys;
- int txgoff = tx->tx_txg & TXG_MASK;
- list_t *list = &dn->dn_dirty_records[txgoff];
-
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
- DNODE_VERIFY(dn);
-
- ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
-
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_allocated_txg == tx->tx_txg) {
- /* The dnode is newly allocated or reallocated */
- if (dnp->dn_type == DMU_OT_NONE) {
- /* this is a first alloc, not a realloc */
- /* XXX shouldn't the phys already be zeroed? */
- bzero(dnp, DNODE_CORE_SIZE);
- dnp->dn_nlevels = 1;
- }
-
- if (dn->dn_nblkptr > dnp->dn_nblkptr) {
- /* zero the new blkptrs we are gaining */
- bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
- sizeof (blkptr_t) *
- (dn->dn_nblkptr - dnp->dn_nblkptr));
- }
- dnp->dn_type = dn->dn_type;
- dnp->dn_bonustype = dn->dn_bonustype;
- dnp->dn_bonuslen = dn->dn_bonuslen;
- dnp->dn_nblkptr = dn->dn_nblkptr;
- }
-
- ASSERT(dnp->dn_nlevels > 1 ||
- BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
- BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
- dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
-
- if (dn->dn_next_blksz[txgoff]) {
- ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
- SPA_MINBLOCKSIZE) == 0);
- ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
- list_head(list) != NULL ||
- dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
- dnp->dn_datablkszsec);
- dnp->dn_datablkszsec =
- dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
- dn->dn_next_blksz[txgoff] = 0;
- }
-
- if (dn->dn_next_indblkshift[txgoff]) {
- ASSERT(dnp->dn_nlevels == 1);
- dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
- dn->dn_next_indblkshift[txgoff] = 0;
- }
-
- /*
- * Just take the live (open-context) values for checksum and compress.
- * Strictly speaking it's a future leak, but nothing bad happens if we
- * start using the new checksum or compress algorithm a little early.
- */
- dnp->dn_checksum = dn->dn_checksum;
- dnp->dn_compress = dn->dn_compress;
-
- mutex_exit(&dn->dn_mtx);
-
- /* process all the "freed" ranges in the file */
- if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) {
- for (rp = avl_last(&dn->dn_ranges[txgoff]); rp != NULL;
- rp = AVL_PREV(&dn->dn_ranges[txgoff], rp))
- dnode_sync_free_range(dn,
- rp->fr_blkid, rp->fr_nblks, tx);
- }
- mutex_enter(&dn->dn_mtx);
- for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) {
- free_range_t *last = rp;
- rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp);
- avl_remove(&dn->dn_ranges[txgoff], last);
- kmem_free(last, sizeof (free_range_t));
- }
- mutex_exit(&dn->dn_mtx);
-
- if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
- dnode_sync_free(dn, tx);
- return;
- }
-
- if (dn->dn_next_nlevels[txgoff]) {
- dnode_increase_indirection(dn, tx);
- dn->dn_next_nlevels[txgoff] = 0;
- }
-
- dbuf_sync_list(list, tx);
-
- if (dn->dn_object != DMU_META_DNODE_OBJECT) {
- ASSERT3P(list_head(list), ==, NULL);
- dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
- }
-
- /*
- * Although we have dropped our reference to the dnode, it
- * can't be evicted until it's written, and we haven't yet
- * initiated the I/O for the dnode's dbuf.
- */
-}
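
Every piece of pending state in dnode_sync() is indexed with
"txg & TXG_MASK".  A small sketch of that ring, assuming this tree's
TXG_SIZE of 4: the open context records into a slot and the matching sync
context later consumes it, with no collisions because at most TXG_SIZE txgs
are in flight at once:

    #include <stdio.h>
    #include <stdint.h>

    #define TXG_SIZE    4
    #define TXG_MASK    (TXG_SIZE - 1)

    int
    main(void)
    {
        uint64_t next_blksz[TXG_SIZE] = { 0 };

        /* open context: txg 42 records a pending block-size change */
        uint64_t open_txg = 42;
        next_blksz[open_txg & TXG_MASK] = 131072;

        /* sync context: the same txg finds and consumes its slot */
        uint64_t sync_txg = 42;
        int txgoff = sync_txg & TXG_MASK;
        printf("txg %llu -> slot %d, blksz %llu\n",
            (unsigned long long)sync_txg, txgoff,
            (unsigned long long)next_blksz[txgoff]);
        next_blksz[txgoff] = 0;     /* clear the slot for txg 46 */
        return (0);
    }
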
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
deleted file mode 100644
index 7d4689f..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
+++ /dev/null
@@ -1,2035 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dmu_tx.h>
-#include <sys/arc.h>
-#include <sys/zio.h>
-#include <sys/zap.h>
-#include <sys/unique.h>
-#include <sys/zfs_context.h>
-#include <sys/zfs_ioctl.h>
-
-static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
-static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
-static dsl_checkfunc_t dsl_dataset_rollback_check;
-static dsl_syncfunc_t dsl_dataset_rollback_sync;
-static dsl_checkfunc_t dsl_dataset_destroy_check;
-static dsl_syncfunc_t dsl_dataset_destroy_sync;
-
-#define DS_REF_MAX (1ULL << 62)
-
-#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
-
-/*
- * We use weighted reference counts to express the various forms of exclusion
- * between different open modes. A STANDARD open is 1 point, an EXCLUSIVE open
- * is DS_REF_MAX, and a PRIMARY open is a little more than half of an EXCLUSIVE.
- * This makes the exclusion logic simple: the total refcnt for all opens cannot
- * exceed DS_REF_MAX. For example, EXCLUSIVE opens are exclusive because their
- * weight (DS_REF_MAX) consumes the entire refcnt space. PRIMARY opens consume
- * just over half of the refcnt space, so there can't be more than one, but it
- * can peacefully coexist with any number of STANDARD opens.
- */
-static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
- 0, /* DS_MODE_NONE - invalid */
- 1, /* DS_MODE_STANDARD - unlimited number */
- (DS_REF_MAX >> 1) + 1, /* DS_MODE_PRIMARY - only one of these */
- DS_REF_MAX /* DS_MODE_EXCLUSIVE - no other opens */
-};
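
The table above only works together with the inequality checked at open time
(ds_open_refcount + weight > DS_REF_MAX yields EBUSY).  A user-space check of
that arithmetic; ds_open() here is a hypothetical stand-in for the relevant
fragment of dsl_dataset_open_obj(), not the real function:

    #include <assert.h>
    #include <stdint.h>

    #define DS_REF_MAX  (1ULL << 62)

    static uint64_t refcnt;

    static int
    ds_open(uint64_t weight)
    {
        if (refcnt + weight > DS_REF_MAX)
            return (-1);            /* EBUSY in the real code */
        refcnt += weight;
        return (0);
    }

    int
    main(void)
    {
        uint64_t std = 1;
        uint64_t primary = (DS_REF_MAX >> 1) + 1;
        uint64_t excl = DS_REF_MAX;

        assert(ds_open(std) == 0);      /* many STANDARD opens are fine */
        assert(ds_open(std) == 0);
        assert(ds_open(primary) == 0);  /* one PRIMARY coexists with them */
        assert(ds_open(primary) == -1); /* a second PRIMARY overflows */
        assert(ds_open(excl) == -1);    /* EXCLUSIVE needs the whole space */
        return (0);
    }
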
-
-
-void
-dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
-{
- int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
- int compressed = BP_GET_PSIZE(bp);
- int uncompressed = BP_GET_UCSIZE(bp);
-
- dprintf_bp(bp, "born, ds=%p\n", ds);
-
- ASSERT(dmu_tx_is_syncing(tx));
- /* It could have been compressed away to nothing */
- if (BP_IS_HOLE(bp))
- return;
- ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
- ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
- if (ds == NULL) {
- /*
- * Account for the meta-objset space in its placeholder
- * dsl_dir.
- */
- ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
- dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
- used, compressed, uncompressed, tx);
- dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
- return;
- }
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- mutex_enter(&ds->ds_lock);
- ds->ds_phys->ds_used_bytes += used;
- ds->ds_phys->ds_compressed_bytes += compressed;
- ds->ds_phys->ds_uncompressed_bytes += uncompressed;
- ds->ds_phys->ds_unique_bytes += used;
- mutex_exit(&ds->ds_lock);
- dsl_dir_diduse_space(ds->ds_dir,
- used, compressed, uncompressed, tx);
-}
-
-void
-dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
- dmu_tx_t *tx)
-{
- int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
- int compressed = BP_GET_PSIZE(bp);
- int uncompressed = BP_GET_UCSIZE(bp);
-
- ASSERT(dmu_tx_is_syncing(tx));
- /* No block pointer => nothing to free */
- if (BP_IS_HOLE(bp))
- return;
-
- ASSERT(used > 0);
- if (ds == NULL) {
- int err;
- /*
- * Account for the meta-objset space in its placeholder
- * dataset.
- */
- err = arc_free(pio, tx->tx_pool->dp_spa,
- tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
- ASSERT(err == 0);
-
- dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
- -used, -compressed, -uncompressed, tx);
- dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
- return;
- }
- ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
-
- if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
- int err;
-
- dprintf_bp(bp, "freeing: %s", "");
- err = arc_free(pio, tx->tx_pool->dp_spa,
- tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
- ASSERT(err == 0);
-
- mutex_enter(&ds->ds_lock);
- /* XXX unique_bytes is not accurate for head datasets */
- /* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */
- ds->ds_phys->ds_unique_bytes -= used;
- mutex_exit(&ds->ds_lock);
- dsl_dir_diduse_space(ds->ds_dir,
- -used, -compressed, -uncompressed, tx);
- } else {
- dprintf_bp(bp, "putting on dead list: %s", "");
- VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
- /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
- if (ds->ds_phys->ds_prev_snap_obj != 0) {
- ASSERT3U(ds->ds_prev->ds_object, ==,
- ds->ds_phys->ds_prev_snap_obj);
- ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
- if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
- ds->ds_object && bp->blk_birth >
- ds->ds_prev->ds_phys->ds_prev_snap_txg) {
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- mutex_enter(&ds->ds_prev->ds_lock);
- ds->ds_prev->ds_phys->ds_unique_bytes +=
- used;
- mutex_exit(&ds->ds_prev->ds_lock);
- }
- }
- }
- mutex_enter(&ds->ds_lock);
- ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
- ds->ds_phys->ds_used_bytes -= used;
- ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
- ds->ds_phys->ds_compressed_bytes -= compressed;
- ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
- ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
- mutex_exit(&ds->ds_lock);
-}
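
The branch on bp->blk_birth above is the whole policy of
dsl_dataset_block_kill(): a block born after the most recent snapshot is
unique to the live dataset and can be freed immediately, while an older
block is still referenced by a snapshot and must go on the deadlist.  A tiny
model of that decision, with made-up txg numbers:

    #include <stdio.h>
    #include <stdint.h>

    static const char *
    kill_action(uint64_t blk_birth, uint64_t prev_snap_txg)
    {
        return (blk_birth > prev_snap_txg ? "free now" : "deadlist");
    }

    int
    main(void)
    {
        uint64_t prev_snap_txg = 100;   /* txg of the last snapshot */

        printf("born 150: %s\n", kill_action(150, prev_snap_txg));
        printf("born  80: %s\n", kill_action(80, prev_snap_txg));
        return (0);
    }
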
-
-uint64_t
-dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
-{
- uint64_t trysnap = 0;
-
- if (ds == NULL)
- return (0);
- /*
- * The snapshot creation could fail, but that would cause an
- * incorrect FALSE return, which would only result in an
- * overestimation of the amount of space that an operation would
- * consume, which is OK.
- *
- * There's also a small window where we could miss a pending
- * snapshot, because we could set the sync task in the quiescing
- * phase. So this should only be used as a guess.
- */
- if (ds->ds_trysnap_txg >
- spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
- trysnap = ds->ds_trysnap_txg;
- return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
-}
-
-int
-dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
-{
- return (blk_birth > dsl_dataset_prev_snap_txg(ds));
-}
-
-/* ARGSUSED */
-static void
-dsl_dataset_evict(dmu_buf_t *db, void *dsv)
-{
- dsl_dataset_t *ds = dsv;
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
- /* open_refcount == DS_REF_MAX when deleting */
- ASSERT(ds->ds_open_refcount == 0 ||
- ds->ds_open_refcount == DS_REF_MAX);
-
- dprintf_ds(ds, "evicting %s\n", "");
-
- unique_remove(ds->ds_phys->ds_fsid_guid);
-
- if (ds->ds_user_ptr != NULL)
- ds->ds_user_evict_func(ds, ds->ds_user_ptr);
-
- if (ds->ds_prev) {
- dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
- ds->ds_prev = NULL;
- }
-
- bplist_close(&ds->ds_deadlist);
- dsl_dir_close(ds->ds_dir, ds);
-
- if (list_link_active(&ds->ds_synced_link))
- list_remove(&dp->dp_synced_objsets, ds);
-
- mutex_destroy(&ds->ds_lock);
- mutex_destroy(&ds->ds_deadlist.bpl_lock);
-
- kmem_free(ds, sizeof (dsl_dataset_t));
-}
-
-static int
-dsl_dataset_get_snapname(dsl_dataset_t *ds)
-{
- dsl_dataset_phys_t *headphys;
- int err;
- dmu_buf_t *headdbuf;
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
-
- if (ds->ds_snapname[0])
- return (0);
- if (ds->ds_phys->ds_next_snap_obj == 0)
- return (0);
-
- err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
- FTAG, &headdbuf);
- if (err)
- return (err);
- headphys = headdbuf->db_data;
- err = zap_value_search(dp->dp_meta_objset,
- headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname);
- dmu_buf_rele(headdbuf, FTAG);
- return (err);
-}
-
-int
-dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
- int mode, void *tag, dsl_dataset_t **dsp)
-{
- uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
- objset_t *mos = dp->dp_meta_objset;
- dmu_buf_t *dbuf;
- dsl_dataset_t *ds;
- int err;
-
- ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
- dsl_pool_sync_context(dp));
-
- err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
- if (err)
- return (err);
- ds = dmu_buf_get_user(dbuf);
- if (ds == NULL) {
- dsl_dataset_t *winner;
-
- ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
- ds->ds_dbuf = dbuf;
- ds->ds_object = dsobj;
- ds->ds_phys = dbuf->db_data;
-
- mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
- NULL);
-
- err = bplist_open(&ds->ds_deadlist,
- mos, ds->ds_phys->ds_deadlist_obj);
- if (err == 0) {
- err = dsl_dir_open_obj(dp,
- ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
- }
- if (err) {
- /*
- * we don't really need to close the bplist if we
- * just opened it.
- */
- mutex_destroy(&ds->ds_lock);
- mutex_destroy(&ds->ds_deadlist.bpl_lock);
- kmem_free(ds, sizeof (dsl_dataset_t));
- dmu_buf_rele(dbuf, tag);
- return (err);
- }
-
- if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
- ds->ds_snapname[0] = '\0';
- if (ds->ds_phys->ds_prev_snap_obj) {
- err = dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_NONE, ds, &ds->ds_prev);
- }
- } else {
- if (snapname) {
-#ifdef ZFS_DEBUG
- dsl_dataset_phys_t *headphys;
- dmu_buf_t *headdbuf;
- err = dmu_bonus_hold(mos,
- ds->ds_dir->dd_phys->dd_head_dataset_obj,
- FTAG, &headdbuf);
- if (err == 0) {
- headphys = headdbuf->db_data;
- uint64_t foundobj;
- err = zap_lookup(dp->dp_meta_objset,
- headphys->ds_snapnames_zapobj,
- snapname, sizeof (foundobj), 1,
- &foundobj);
- ASSERT3U(foundobj, ==, dsobj);
- dmu_buf_rele(headdbuf, FTAG);
- }
-#endif
- (void) strcat(ds->ds_snapname, snapname);
- } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
- err = dsl_dataset_get_snapname(ds);
- }
- }
-
- if (err == 0) {
- winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
- dsl_dataset_evict);
- }
- if (err || winner) {
- bplist_close(&ds->ds_deadlist);
- if (ds->ds_prev) {
- dsl_dataset_close(ds->ds_prev,
- DS_MODE_NONE, ds);
- }
- dsl_dir_close(ds->ds_dir, ds);
- mutex_destroy(&ds->ds_lock);
- mutex_destroy(&ds->ds_deadlist.bpl_lock);
- kmem_free(ds, sizeof (dsl_dataset_t));
- if (err) {
- dmu_buf_rele(dbuf, tag);
- return (err);
- }
- ds = winner;
- } else {
- uint64_t new =
- unique_insert(ds->ds_phys->ds_fsid_guid);
- if (new != ds->ds_phys->ds_fsid_guid) {
- /* XXX it won't necessarily be synced... */
- ds->ds_phys->ds_fsid_guid = new;
- }
- }
- }
- ASSERT3P(ds->ds_dbuf, ==, dbuf);
- ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
-
- mutex_enter(&ds->ds_lock);
- if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
- (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) &&
- !DS_MODE_IS_INCONSISTENT(mode)) ||
- (ds->ds_open_refcount + weight > DS_REF_MAX)) {
- mutex_exit(&ds->ds_lock);
- dsl_dataset_close(ds, DS_MODE_NONE, tag);
- return (EBUSY);
- }
- ds->ds_open_refcount += weight;
- mutex_exit(&ds->ds_lock);
-
- *dsp = ds;
- return (0);
-}
-
-int
-dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
- void *tag, dsl_dataset_t **dsp)
-{
- dsl_dir_t *dd;
- dsl_pool_t *dp;
- const char *tail;
- uint64_t obj;
- dsl_dataset_t *ds = NULL;
- int err = 0;
-
- err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail);
- if (err)
- return (err);
-
- dp = dd->dd_pool;
- obj = dd->dd_phys->dd_head_dataset_obj;
- rw_enter(&dp->dp_config_rwlock, RW_READER);
- if (obj == 0) {
- /* A dataset with no associated objset */
- err = ENOENT;
- goto out;
- }
-
- if (tail != NULL) {
- objset_t *mos = dp->dp_meta_objset;
-
- err = dsl_dataset_open_obj(dp, obj, NULL,
- DS_MODE_NONE, tag, &ds);
- if (err)
- goto out;
- obj = ds->ds_phys->ds_snapnames_zapobj;
- dsl_dataset_close(ds, DS_MODE_NONE, tag);
- ds = NULL;
-
- if (tail[0] != '@') {
- err = ENOENT;
- goto out;
- }
- tail++;
-
- /* Look for a snapshot */
- if (!DS_MODE_IS_READONLY(mode)) {
- err = EROFS;
- goto out;
- }
- dprintf("looking for snapshot '%s'\n", tail);
- err = zap_lookup(mos, obj, tail, 8, 1, &obj);
- if (err)
- goto out;
- }
- err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds);
-
-out:
- rw_exit(&dp->dp_config_rwlock);
- dsl_dir_close(dd, FTAG);
-
- ASSERT3U((err == 0), ==, (ds != NULL));
- /* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */
-
- *dsp = ds;
- return (err);
-}
-
-int
-dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
-{
- return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp));
-}
-
-void
-dsl_dataset_name(dsl_dataset_t *ds, char *name)
-{
- if (ds == NULL) {
- (void) strcpy(name, "mos");
- } else {
- dsl_dir_name(ds->ds_dir, name);
- VERIFY(0 == dsl_dataset_get_snapname(ds));
- if (ds->ds_snapname[0]) {
- (void) strcat(name, "@");
- if (!MUTEX_HELD(&ds->ds_lock)) {
- /*
- * We use a "recursive" mutex so that we
- * can call dprintf_ds() with ds_lock held.
- */
- mutex_enter(&ds->ds_lock);
- (void) strcat(name, ds->ds_snapname);
- mutex_exit(&ds->ds_lock);
- } else {
- (void) strcat(name, ds->ds_snapname);
- }
- }
- }
-}
-
-static int
-dsl_dataset_namelen(dsl_dataset_t *ds)
-{
- int result;
-
- if (ds == NULL) {
- result = 3; /* "mos" */
- } else {
- result = dsl_dir_namelen(ds->ds_dir);
- VERIFY(0 == dsl_dataset_get_snapname(ds));
- if (ds->ds_snapname[0]) {
- ++result; /* adding one for the @-sign */
- if (!MUTEX_HELD(&ds->ds_lock)) {
- /* see dsl_dataset_name */
- mutex_enter(&ds->ds_lock);
- result += strlen(ds->ds_snapname);
- mutex_exit(&ds->ds_lock);
- } else {
- result += strlen(ds->ds_snapname);
- }
- }
- }
-
- return (result);
-}
-
-void
-dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
-{
- uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
- mutex_enter(&ds->ds_lock);
- ASSERT3U(ds->ds_open_refcount, >=, weight);
- ds->ds_open_refcount -= weight;
- dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n",
- mode, ds->ds_open_refcount);
- mutex_exit(&ds->ds_lock);
-
- dmu_buf_rele(ds->ds_dbuf, tag);
-}
-
-void
-dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
-{
- objset_t *mos = dp->dp_meta_objset;
- dmu_buf_t *dbuf;
- dsl_dataset_phys_t *dsphys;
- dsl_dataset_t *ds;
- uint64_t dsobj;
- dsl_dir_t *dd;
-
- dsl_dir_create_root(mos, ddobjp, tx);
- VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd));
-
- dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
- DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- dsphys = dbuf->db_data;
- dsphys->ds_dir_obj = dd->dd_object;
- dsphys->ds_fsid_guid = unique_create();
- unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
- (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
- sizeof (dsphys->ds_guid));
- dsphys->ds_snapnames_zapobj =
- zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
- dsphys->ds_creation_time = gethrestime_sec();
- dsphys->ds_creation_txg = tx->tx_txg;
- dsphys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- dmu_buf_rele(dbuf, FTAG);
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_head_dataset_obj = dsobj;
- dsl_dir_close(dd, FTAG);
-
- VERIFY(0 ==
- dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds));
- (void) dmu_objset_create_impl(dp->dp_spa, ds,
- &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx);
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
-}
-
-uint64_t
-dsl_dataset_create_sync(dsl_dir_t *pdd,
- const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = pdd->dd_pool;
- dmu_buf_t *dbuf;
- dsl_dataset_phys_t *dsphys;
- uint64_t dsobj, ddobj;
- objset_t *mos = dp->dp_meta_objset;
- dsl_dir_t *dd;
-
- ASSERT(clone_parent == NULL || clone_parent->ds_dir->dd_pool == dp);
- ASSERT(clone_parent == NULL ||
- clone_parent->ds_phys->ds_num_children > 0);
- ASSERT(lastname[0] != '@');
- ASSERT(dmu_tx_is_syncing(tx));
-
- ddobj = dsl_dir_create_sync(pdd, lastname, tx);
- VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
-
- dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
- DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- dsphys = dbuf->db_data;
- dsphys->ds_dir_obj = dd->dd_object;
- dsphys->ds_fsid_guid = unique_create();
- unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
- (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
- sizeof (dsphys->ds_guid));
- dsphys->ds_snapnames_zapobj =
- zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
- dsphys->ds_creation_time = gethrestime_sec();
- dsphys->ds_creation_txg = tx->tx_txg;
- dsphys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- if (clone_parent) {
- dsphys->ds_prev_snap_obj = clone_parent->ds_object;
- dsphys->ds_prev_snap_txg =
- clone_parent->ds_phys->ds_creation_txg;
- dsphys->ds_used_bytes =
- clone_parent->ds_phys->ds_used_bytes;
- dsphys->ds_compressed_bytes =
- clone_parent->ds_phys->ds_compressed_bytes;
- dsphys->ds_uncompressed_bytes =
- clone_parent->ds_phys->ds_uncompressed_bytes;
- dsphys->ds_bp = clone_parent->ds_phys->ds_bp;
-
- dmu_buf_will_dirty(clone_parent->ds_dbuf, tx);
- clone_parent->ds_phys->ds_num_children++;
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
- }
- dmu_buf_rele(dbuf, FTAG);
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_head_dataset_obj = dsobj;
- dsl_dir_close(dd, FTAG);
-
- return (dsobj);
-}
-
-struct destroyarg {
- dsl_sync_task_group_t *dstg;
- char *snapname;
- char *failed;
-};
-
-static int
-dsl_snapshot_destroy_one(char *name, void *arg)
-{
- struct destroyarg *da = arg;
- dsl_dataset_t *ds;
- char *cp;
- int err;
-
- (void) strcat(name, "@");
- (void) strcat(name, da->snapname);
- err = dsl_dataset_open(name,
- DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
- da->dstg, &ds);
- cp = strchr(name, '@');
- *cp = '\0';
- if (err == ENOENT)
- return (0);
- if (err) {
- (void) strcpy(da->failed, name);
- return (err);
- }
-
- dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
- dsl_dataset_destroy_sync, ds, da->dstg, 0);
- return (0);
-}
-
-/*
- * Destroy 'snapname' in all descendants of 'fsname'.
- */
-#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
-int
-dsl_snapshots_destroy(char *fsname, char *snapname)
-{
- int err;
- struct destroyarg da;
- dsl_sync_task_t *dst;
- spa_t *spa;
- char *cp;
-
- cp = strchr(fsname, '/');
- if (cp) {
- *cp = '\0';
- err = spa_open(fsname, &spa, FTAG);
- *cp = '/';
- } else {
- err = spa_open(fsname, &spa, FTAG);
- }
- if (err)
- return (err);
- da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
- da.snapname = snapname;
- da.failed = fsname;
-
- err = dmu_objset_find(fsname,
- dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);
-
- if (err == 0)
- err = dsl_sync_task_group_wait(da.dstg);
-
- for (dst = list_head(&da.dstg->dstg_tasks); dst;
- dst = list_next(&da.dstg->dstg_tasks, dst)) {
- dsl_dataset_t *ds = dst->dst_arg1;
- if (dst->dst_err) {
- dsl_dataset_name(ds, fsname);
- cp = strchr(fsname, '@');
- *cp = '\0';
- }
- /*
- * If it was successful, destroy_sync would have
- * closed the ds
- */
- if (err)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, da.dstg);
- }
-
- dsl_sync_task_group_destroy(da.dstg);
- spa_close(spa, FTAG);
- return (err);
-}
-
-int
-dsl_dataset_destroy(const char *name)
-{
- int err;
- dsl_sync_task_group_t *dstg;
- objset_t *os;
- dsl_dataset_t *ds;
- dsl_dir_t *dd;
- uint64_t obj;
-
- if (strchr(name, '@')) {
- /* Destroying a snapshot is simpler */
- err = dsl_dataset_open(name,
- DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
- FTAG, &ds);
- if (err)
- return (err);
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
- ds, FTAG, 0);
- if (err)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- return (err);
- }
-
- err = dmu_objset_open(name, DMU_OST_ANY,
- DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
- if (err)
- return (err);
- ds = os->os->os_dsl_dataset;
- dd = ds->ds_dir;
-
- /*
- * Check for errors and mark this ds as inconsistent, in
- * case we crash while freeing the objects.
- */
- err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
- dsl_dataset_destroy_begin_sync, ds, NULL, 0);
- if (err) {
- dmu_objset_close(os);
- return (err);
- }
-
- /*
- * remove the objects in open context, so that we won't
- * have too much to do in syncing context.
- */
- for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
- ds->ds_phys->ds_prev_snap_txg)) {
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END);
- dmu_tx_hold_bonus(tx, obj);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- /*
- * Perhaps there is not enough disk
- * space. Just deal with it from
- * dsl_dataset_destroy_sync().
- */
- dmu_tx_abort(tx);
- continue;
- }
- VERIFY(0 == dmu_object_free(os, obj, tx));
- dmu_tx_commit(tx);
- }
- /* Make sure it's not dirty before we finish destroying it. */
- txg_wait_synced(dd->dd_pool, 0);
-
- dmu_objset_close(os);
- if (err != ESRCH)
- return (err);
-
- err = dsl_dataset_open(name,
- DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
- FTAG, &ds);
- if (err)
- return (err);
-
- err = dsl_dir_open(name, FTAG, &dd, NULL);
- if (err) {
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- return (err);
- }
-
- /*
- * Blow away the dsl_dir + head dataset.
- */
- dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
- dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
- dsl_dataset_destroy_sync, ds, FTAG, 0);
- dsl_sync_task_create(dstg, dsl_dir_destroy_check,
- dsl_dir_destroy_sync, dd, FTAG, 0);
- err = dsl_sync_task_group_wait(dstg);
- dsl_sync_task_group_destroy(dstg);
- /* if it is successful, *destroy_sync will close the ds+dd */
- if (err) {
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- dsl_dir_close(dd, FTAG);
- }
- return (err);
-}
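
The open-context loop in dsl_dataset_destroy() above follows the usual DMU
shape: one transaction per object, and a failed assign just aborts that tx
and moves on, leaving the leftovers for dsl_dataset_destroy_sync().  A
control-flow sketch with the DMU calls reduced to comments and a fake assign
step (the failing object number is an arbitrary stand-in for a tx that can't
be assigned, e.g. out of space):

    #include <stdio.h>

    static int
    fake_tx_assign(int obj)
    {
        return (obj == 3 ? -1 : 0);     /* pretend obj 3 hits ENOSPC */
    }

    int
    main(void)
    {
        for (int obj = 0; obj < 6; obj++) {
            /* dmu_tx_create / dmu_tx_hold_free / dmu_tx_hold_bonus */
            if (fake_tx_assign(obj) != 0) {
                /* dmu_tx_abort: leave it for syncing context */
                printf("obj %d deferred to syncing context\n", obj);
                continue;
            }
            /* dmu_object_free + dmu_tx_commit */
            printf("obj %d freed in open context\n", obj);
        }
        return (0);
    }
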
-
-int
-dsl_dataset_rollback(dsl_dataset_t *ds)
-{
- ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
- return (dsl_sync_task_do(ds->ds_dir->dd_pool,
- dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
- ds, NULL, 0));
-}
-
-void *
-dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
- void *p, dsl_dataset_evict_func_t func)
-{
- void *old;
-
- mutex_enter(&ds->ds_lock);
- old = ds->ds_user_ptr;
- if (old == NULL) {
- ds->ds_user_ptr = p;
- ds->ds_user_evict_func = func;
- }
- mutex_exit(&ds->ds_lock);
- return (old);
-}
-
-void *
-dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
-{
- return (ds->ds_user_ptr);
-}
-
-
-blkptr_t *
-dsl_dataset_get_blkptr(dsl_dataset_t *ds)
-{
- return (&ds->ds_phys->ds_bp);
-}
-
-void
-dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
-{
- ASSERT(dmu_tx_is_syncing(tx));
- /* If it's the meta-objset, set dp_meta_rootbp */
- if (ds == NULL) {
- tx->tx_pool->dp_meta_rootbp = *bp;
- } else {
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_bp = *bp;
- }
-}
-
-spa_t *
-dsl_dataset_get_spa(dsl_dataset_t *ds)
-{
- return (ds->ds_dir->dd_pool->dp_spa);
-}
-
-void
-dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- dsl_pool_t *dp;
-
- if (ds == NULL) /* this is the meta-objset */
- return;
-
- ASSERT(ds->ds_user_ptr != NULL);
-
- if (ds->ds_phys->ds_next_snap_obj != 0)
- panic("dirtying snapshot!");
-
- dp = ds->ds_dir->dd_pool;
-
- if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
- /* up the hold count until we can be written out */
- dmu_buf_add_ref(ds->ds_dbuf, ds);
- }
-}
-
-struct killarg {
- uint64_t *usedp;
- uint64_t *compressedp;
- uint64_t *uncompressedp;
- zio_t *zio;
- dmu_tx_t *tx;
-};
-
-static int
-kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
-{
- struct killarg *ka = arg;
- blkptr_t *bp = &bc->bc_blkptr;
-
- ASSERT3U(bc->bc_errno, ==, 0);
-
- /*
- * Since this callback is not called concurrently, no lock is
- * needed on the accounting values.
- */
- *ka->usedp += bp_get_dasize(spa, bp);
- *ka->compressedp += BP_GET_PSIZE(bp);
- *ka->uncompressedp += BP_GET_UCSIZE(bp);
- /* XXX check for EIO? */
- (void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL,
- ARC_NOWAIT);
- return (0);
-}
-
-/* ARGSUSED */
-static int
-dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
-
- /*
- * There must be a previous snapshot. I suppose we could roll
- * it back to being empty (and re-initialize the upper (ZPL)
- * layer). But for now there's no way to do this via the user
- * interface.
- */
- if (ds->ds_phys->ds_prev_snap_txg == 0)
- return (EINVAL);
-
- /*
- * This must not be a snapshot.
- */
- if (ds->ds_phys->ds_next_snap_obj != 0)
- return (EINVAL);
-
- /*
- * If we made changes this txg, traverse_dsl_dataset won't find
- * them. Try again.
- */
- if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
- return (EAGAIN);
-
- return (0);
-}
-
-/* ARGSUSED */
-static void
-dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
-
- /* Zero out the deadlist. */
- bplist_close(&ds->ds_deadlist);
- bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
- ds->ds_phys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
- ds->ds_phys->ds_deadlist_obj));
-
- {
- /* Free blkptrs that we gave birth to */
- zio_t *zio;
- uint64_t used = 0, compressed = 0, uncompressed = 0;
- struct killarg ka;
-
- zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
- ZIO_FLAG_MUSTSUCCEED);
- ka.usedp = &used;
- ka.compressedp = &compressed;
- ka.uncompressedp = &uncompressed;
- ka.zio = zio;
- ka.tx = tx;
- (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
- ADVANCE_POST, kill_blkptr, &ka);
- (void) zio_wait(zio);
-
- dsl_dir_diduse_space(ds->ds_dir,
- -used, -compressed, -uncompressed, tx);
- }
-
- /* Change our contents to that of the prev snapshot */
- ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj);
- ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
- ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes;
- ds->ds_phys->ds_compressed_bytes =
- ds->ds_prev->ds_phys->ds_compressed_bytes;
- ds->ds_phys->ds_uncompressed_bytes =
- ds->ds_prev->ds_phys->ds_uncompressed_bytes;
- ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
- ds->ds_phys->ds_unique_bytes = 0;
-
- if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- ds->ds_prev->ds_phys->ds_unique_bytes = 0;
- }
-}
-
-/* ARGSUSED */
-static int
-dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
-
- /*
- * Can't delete a head dataset if there are snapshots of it.
- * (Except if the only snapshots are from the branch we cloned
- * from.)
- */
- if (ds->ds_prev != NULL &&
- ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
- return (EINVAL);
-
- return (0);
-}
-
-/* ARGSUSED */
-static void
-dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
-
- /* Mark it as inconsistent on-disk, in case we crash */
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
-}
-
-/* ARGSUSED */
-static int
-dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
-
- /* Can't delete a branch point. */
- if (ds->ds_phys->ds_num_children > 1)
- return (EEXIST);
-
- /*
- * Can't delete a head dataset if there are snapshots of it.
- * (Except if the only snapshots are from the branch we cloned
- * from.)
- */
- if (ds->ds_prev != NULL &&
- ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
- return (EINVAL);
-
- /*
- * If we made changes this txg, traverse_dsl_dataset won't find
- * them. Try again.
- */
- if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
- return (EAGAIN);
-
- /* XXX we should do some i/o error checking... */
- return (0);
-}
-
-static void
-dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- uint64_t used = 0, compressed = 0, uncompressed = 0;
- zio_t *zio;
- int err;
- int after_branch_point = FALSE;
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
- dsl_dataset_t *ds_prev = NULL;
- uint64_t obj;
-
- ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
- ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
- ASSERT(ds->ds_prev == NULL ||
- ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
- ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
-
- ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
-
- obj = ds->ds_object;
-
- if (ds->ds_phys->ds_prev_snap_obj != 0) {
- if (ds->ds_prev) {
- ds_prev = ds->ds_prev;
- } else {
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_NONE, FTAG, &ds_prev));
- }
- after_branch_point =
- (ds_prev->ds_phys->ds_next_snap_obj != obj);
-
- dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
- if (after_branch_point &&
- ds->ds_phys->ds_next_snap_obj == 0) {
- /* This clone is toast. */
- ASSERT(ds_prev->ds_phys->ds_num_children > 1);
- ds_prev->ds_phys->ds_num_children--;
- } else if (!after_branch_point) {
- ds_prev->ds_phys->ds_next_snap_obj =
- ds->ds_phys->ds_next_snap_obj;
- }
- }
-
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-
- if (ds->ds_phys->ds_next_snap_obj != 0) {
- blkptr_t bp;
- dsl_dataset_t *ds_next;
- uint64_t itor = 0;
-
- spa_scrub_restart(dp->dp_spa, tx->tx_txg);
-
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_next_snap_obj, NULL,
- DS_MODE_NONE, FTAG, &ds_next));
- ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
-
- dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
- ds_next->ds_phys->ds_prev_snap_obj =
- ds->ds_phys->ds_prev_snap_obj;
- ds_next->ds_phys->ds_prev_snap_txg =
- ds->ds_phys->ds_prev_snap_txg;
- ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
- ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
-
- /*
- * Transfer to our deadlist (which will become next's
- * new deadlist) any entries from next's current
- * deadlist which were born before prev, and free the
- * other entries.
- *
- * XXX we're doing this long task with the config lock held
- */
- while (bplist_iterate(&ds_next->ds_deadlist, &itor,
- &bp) == 0) {
- if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
- VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
- &bp, tx));
- if (ds_prev && !after_branch_point &&
- bp.blk_birth >
- ds_prev->ds_phys->ds_prev_snap_txg) {
- ds_prev->ds_phys->ds_unique_bytes +=
- bp_get_dasize(dp->dp_spa, &bp);
- }
- } else {
- used += bp_get_dasize(dp->dp_spa, &bp);
- compressed += BP_GET_PSIZE(&bp);
- uncompressed += BP_GET_UCSIZE(&bp);
- /* XXX check return value? */
- (void) arc_free(zio, dp->dp_spa, tx->tx_txg,
- &bp, NULL, NULL, ARC_NOWAIT);
- }
- }
-
- /* free next's deadlist */
- bplist_close(&ds_next->ds_deadlist);
- bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
-
- /* set next's deadlist to our deadlist */
- ds_next->ds_phys->ds_deadlist_obj =
- ds->ds_phys->ds_deadlist_obj;
- VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
- ds_next->ds_phys->ds_deadlist_obj));
- ds->ds_phys->ds_deadlist_obj = 0;
-
- if (ds_next->ds_phys->ds_next_snap_obj != 0) {
- /*
- * Update next's unique to include blocks which
- * were previously shared by only this snapshot
- * and it. Those blocks will be born after the
- * prev snap and before this snap, and will have
- * died after the next snap and before the one
- * after that (ie. be on the snap after next's
- * deadlist).
- *
- * XXX we're doing this long task with the
- * config lock held
- */
- dsl_dataset_t *ds_after_next;
-
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds_next->ds_phys->ds_next_snap_obj, NULL,
- DS_MODE_NONE, FTAG, &ds_after_next));
- itor = 0;
- while (bplist_iterate(&ds_after_next->ds_deadlist,
- &itor, &bp) == 0) {
- if (bp.blk_birth >
- ds->ds_phys->ds_prev_snap_txg &&
- bp.blk_birth <=
- ds->ds_phys->ds_creation_txg) {
- ds_next->ds_phys->ds_unique_bytes +=
- bp_get_dasize(dp->dp_spa, &bp);
- }
- }
-
- dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
- ASSERT3P(ds_next->ds_prev, ==, NULL);
- } else {
- /*
- * It would be nice to update the head dataset's
- * unique. To do so we would have to traverse
- * it for blocks born after ds_prev, which is
- * pretty expensive just to maintain something
- * for debugging purposes.
- */
- ASSERT3P(ds_next->ds_prev, ==, ds);
- dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
- ds_next);
- if (ds_prev) {
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_NONE, ds_next, &ds_next->ds_prev));
- } else {
- ds_next->ds_prev = NULL;
- }
- }
- dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
-
- /*
- * NB: unique_bytes is not accurate for head objsets
- * because we don't update it when we delete the most
- * recent snapshot -- see above comment.
- */
- ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
- } else {
- /*
- * There's no next snapshot, so this is a head dataset.
- * Destroy the deadlist. Unless it's a clone, the
- * deadlist should be empty. (If it's a clone, it's
- * safe to ignore the deadlist contents.)
- */
- struct killarg ka;
-
- ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
- bplist_close(&ds->ds_deadlist);
- bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
- ds->ds_phys->ds_deadlist_obj = 0;
-
- /*
- * Free everything that we point to (that's born after
- * the previous snapshot, if we are a clone)
- *
- * XXX we're doing this long task with the config lock held
- */
- ka.usedp = &used;
- ka.compressedp = &compressed;
- ka.uncompressedp = &uncompressed;
- ka.zio = zio;
- ka.tx = tx;
- err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
- ADVANCE_POST, kill_blkptr, &ka);
- ASSERT3U(err, ==, 0);
- }
-
- err = zio_wait(zio);
- ASSERT3U(err, ==, 0);
-
- dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx);
-
- if (ds->ds_phys->ds_snapnames_zapobj) {
- err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
- ASSERT(err == 0);
- }
-
- if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
- /* Erase the link in the dataset */
- dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
- ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
- /*
- * dsl_dir_sync_destroy() called us, they'll destroy
- * the dataset.
- */
- } else {
- /* remove from snapshot namespace */
- dsl_dataset_t *ds_head;
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_dir->dd_phys->dd_head_dataset_obj, NULL,
- DS_MODE_NONE, FTAG, &ds_head));
- VERIFY(0 == dsl_dataset_get_snapname(ds));
-#ifdef ZFS_DEBUG
- {
- uint64_t val;
- err = zap_lookup(mos,
- ds_head->ds_phys->ds_snapnames_zapobj,
- ds->ds_snapname, 8, 1, &val);
- ASSERT3U(err, ==, 0);
- ASSERT3U(val, ==, obj);
- }
-#endif
- err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj,
- ds->ds_snapname, tx);
- ASSERT(err == 0);
- dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG);
- }
-
- if (ds_prev && ds->ds_prev != ds_prev)
- dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
-
- spa_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag);
- VERIFY(0 == dmu_object_free(mos, obj, tx));
-
-}
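
The deadlist split inside dsl_dataset_destroy_sync() above, modeled on plain
arrays: when a middle snapshot dies, entries in the next snapshot's deadlist
born at or before the previous snapshot's txg are still referenced by older
snapshots and stay dead (they move to the surviving deadlist), while entries
born after it were held dead only by the doomed snapshot and are freed for
real.  The birth txgs below are made-up data:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t prev_snap_txg = 100;   /* txg of the snapshot before ours */
        uint64_t births[] = { 40, 90, 120, 150 };
        int n = sizeof (births) / sizeof (births[0]);

        for (int i = 0; i < n; i++) {
            if (births[i] <= prev_snap_txg)
                printf("birth %3llu: keep on surviving deadlist\n",
                    (unsigned long long)births[i]);
            else
                printf("birth %3llu: free for real\n",
                    (unsigned long long)births[i]);
        }
        return (0);
    }
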
-
-/* ARGSUSED */
-int
-dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- objset_t *os = arg1;
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
- const char *snapname = arg2;
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- int err;
- uint64_t value;
-
- /*
- * We don't allow multiple snapshots of the same txg. If there
- * is already one, try again.
- */
- if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
- return (EAGAIN);
-
- /*
- * Check for a conflicting snapshot name.
- */
- err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
- snapname, 8, 1, &value);
- if (err == 0)
- return (EEXIST);
- if (err != ENOENT)
- return (err);
-
- /*
- * Check that the dataset's name is not too long.  The name consists
- * of the dataset name's length + 1 for the @-sign + the snapshot
- * name's length.
- */
- if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
- return (ENAMETOOLONG);
-
- ds->ds_trysnap_txg = tx->tx_txg;
- return (0);
-}
-
-void
-dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- objset_t *os = arg1;
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
- const char *snapname = arg2;
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- dmu_buf_t *dbuf;
- dsl_dataset_phys_t *dsphys;
- uint64_t dsobj;
- objset_t *mos = dp->dp_meta_objset;
- int err;
-
- spa_scrub_restart(dp->dp_spa, tx->tx_txg);
- ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
-
- dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
- DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- dsphys = dbuf->db_data;
- dsphys->ds_dir_obj = ds->ds_dir->dd_object;
- dsphys->ds_fsid_guid = unique_create();
- unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
- (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
- sizeof (dsphys->ds_guid));
- dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
- dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
- dsphys->ds_next_snap_obj = ds->ds_object;
- dsphys->ds_num_children = 1;
- dsphys->ds_creation_time = gethrestime_sec();
- dsphys->ds_creation_txg = tx->tx_txg;
- dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
- dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
- dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
- dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
- dsphys->ds_flags = ds->ds_phys->ds_flags;
- dsphys->ds_bp = ds->ds_phys->ds_bp;
- dmu_buf_rele(dbuf, FTAG);
-
- ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
- if (ds->ds_prev) {
- ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
- ds->ds_object ||
- ds->ds_prev->ds_phys->ds_num_children > 1);
- if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
- ds->ds_prev->ds_phys->ds_creation_txg);
- ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
- }
- }
-
- bplist_close(&ds->ds_deadlist);
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg);
- ds->ds_phys->ds_prev_snap_obj = dsobj;
- ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg;
- ds->ds_phys->ds_unique_bytes = 0;
- ds->ds_phys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
- ds->ds_phys->ds_deadlist_obj));
-
- dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
- err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
- snapname, 8, 1, &dsobj, tx);
- ASSERT(err == 0);
-
- if (ds->ds_prev)
- dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, snapname,
- DS_MODE_NONE, ds, &ds->ds_prev));
-}
-
-void
-dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
-{
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(ds->ds_user_ptr != NULL);
- ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
-
- dsl_dir_dirty(ds->ds_dir, tx);
- dmu_objset_sync(ds->ds_user_ptr, zio, tx);
- /* Unneeded? bplist_close(&ds->ds_deadlist); */
-}
-
-void
-dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
-{
- dsl_dir_stats(ds->ds_dir, nv);
-
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
- ds->ds_phys->ds_creation_time);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
- ds->ds_phys->ds_creation_txg);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
- ds->ds_phys->ds_used_bytes);
-
- if (ds->ds_phys->ds_next_snap_obj) {
- /*
- * This is a snapshot; override the dd's space used with
- * our unique space and compression ratio.
- */
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
- ds->ds_phys->ds_unique_bytes);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
- ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
- (ds->ds_phys->ds_uncompressed_bytes * 100 /
- ds->ds_phys->ds_compressed_bytes));
- }
-}
-
-void
-dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
-{
- stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
- stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
- if (ds->ds_phys->ds_next_snap_obj) {
- stat->dds_is_snapshot = B_TRUE;
- stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
- }
-
- /* clone origin is really a dsl_dir thing... */
- if (ds->ds_dir->dd_phys->dd_clone_parent_obj) {
- dsl_dataset_t *ods;
-
- rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
- VERIFY(0 == dsl_dataset_open_obj(ds->ds_dir->dd_pool,
- ds->ds_dir->dd_phys->dd_clone_parent_obj,
- NULL, DS_MODE_NONE, FTAG, &ods));
- dsl_dataset_name(ods, stat->dds_clone_of);
- dsl_dataset_close(ods, DS_MODE_NONE, FTAG);
- rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
- }
-}
-
-uint64_t
-dsl_dataset_fsid_guid(dsl_dataset_t *ds)
-{
- return (ds->ds_phys->ds_fsid_guid);
-}
-
-void
-dsl_dataset_space(dsl_dataset_t *ds,
- uint64_t *refdbytesp, uint64_t *availbytesp,
- uint64_t *usedobjsp, uint64_t *availobjsp)
-{
- *refdbytesp = ds->ds_phys->ds_used_bytes;
- *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
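-	/* blk_fill of the objset's root block pointer counts its objects */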
- *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
- *availobjsp = DN_MAX_OBJECT - *usedobjsp;
-}
-
-/* ARGSUSED */
-static int
-dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- char *newsnapname = arg2;
- dsl_dir_t *dd = ds->ds_dir;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- dsl_dataset_t *hds;
- uint64_t val;
- int err;
-
- err = dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds);
- if (err)
- return (err);
-
- /* new name better not be in use */
- err = zap_lookup(mos, hds->ds_phys->ds_snapnames_zapobj,
- newsnapname, 8, 1, &val);
- dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
-
- if (err == 0)
- err = EEXIST;
- else if (err == ENOENT)
- err = 0;
-
- /* dataset name + 1 for the "@" + the new snapshot name must fit */
- if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
- err = ENAMETOOLONG;
-
- return (err);
-}
-
-static void
-dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- char *newsnapname = arg2;
- dsl_dir_t *dd = ds->ds_dir;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- dsl_dataset_t *hds;
- int err;
-
- ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
-
- VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds));
-
- VERIFY(0 == dsl_dataset_get_snapname(ds));
- err = zap_remove(mos, hds->ds_phys->ds_snapnames_zapobj,
- ds->ds_snapname, tx);
- ASSERT3U(err, ==, 0);
- mutex_enter(&ds->ds_lock);
- (void) strcpy(ds->ds_snapname, newsnapname);
- mutex_exit(&ds->ds_lock);
- err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
- ds->ds_snapname, 8, 1, &ds->ds_object, tx);
- ASSERT3U(err, ==, 0);
-
- dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
-}
-
-struct renamearg {
- dsl_sync_task_group_t *dstg;
- char failed[MAXPATHLEN];
- char *oldsnap;
- char *newsnap;
-};
-
-static int
-dsl_snapshot_rename_one(char *name, void *arg)
-{
- struct renamearg *ra = arg;
- dsl_dataset_t *ds = NULL;
- char *cp;
- int err;
-
- cp = name + strlen(name);
- *cp = '@';
- (void) strcpy(cp + 1, ra->oldsnap);
- err = dsl_dataset_open(name, DS_MODE_READONLY | DS_MODE_STANDARD,
- ra->dstg, &ds);
- if (err == ENOENT) {
- *cp = '\0';
- return (0);
- }
-	if (err) {
-		(void) strcpy(ra->failed, name);
-		*cp = '\0';
-		/* the open failed, so there is no dataset to close here */
-		return (err);
-	}
-
-#ifdef _KERNEL
-	/* each snapshot undergoing rename must first be unmounted */
- (void) zfs_unmount_snap(name, NULL);
-#endif
-
- *cp = '\0';
-
- dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
- dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
-
- return (0);
-}
-
-static int
-dsl_recursive_rename(char *oldname, const char *newname)
-{
- int err;
- struct renamearg *ra;
- dsl_sync_task_t *dst;
- spa_t *spa;
- char *cp, *fsname = spa_strdup(oldname);
- int len = strlen(oldname);
-
- /* truncate the snapshot name to get the fsname */
- cp = strchr(fsname, '@');
- *cp = '\0';
-
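-	/*
-	 * The pool name is everything up to the first '/' (if any);
-	 * temporarily truncate fsname there so we can open the spa.
-	 */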
- cp = strchr(fsname, '/');
- if (cp) {
- *cp = '\0';
- err = spa_open(fsname, &spa, FTAG);
- *cp = '/';
- } else {
- err = spa_open(fsname, &spa, FTAG);
- }
- if (err) {
- kmem_free(fsname, len + 1);
- return (err);
- }
- ra = kmem_alloc(sizeof (struct renamearg), KM_SLEEP);
- ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-
- ra->oldsnap = strchr(oldname, '@') + 1;
- ra->newsnap = strchr(newname, '@') + 1;
- *ra->failed = '\0';
-
- err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
- DS_FIND_CHILDREN);
- kmem_free(fsname, len + 1);
-
- if (err == 0) {
- err = dsl_sync_task_group_wait(ra->dstg);
- }
-
- for (dst = list_head(&ra->dstg->dstg_tasks); dst;
- dst = list_next(&ra->dstg->dstg_tasks, dst)) {
- dsl_dataset_t *ds = dst->dst_arg1;
- if (dst->dst_err) {
- dsl_dir_name(ds->ds_dir, ra->failed);
- (void) strcat(ra->failed, "@");
- (void) strcat(ra->failed, ra->newsnap);
- }
- dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg);
- }
-
- (void) strcpy(oldname, ra->failed);
-
- dsl_sync_task_group_destroy(ra->dstg);
- kmem_free(ra, sizeof (struct renamearg));
- spa_close(spa, FTAG);
- return (err);
-}
-
-#pragma weak dmu_objset_rename = dsl_dataset_rename
-int
-dsl_dataset_rename(char *oldname, const char *newname,
- boolean_t recursive)
-{
- dsl_dir_t *dd;
- dsl_dataset_t *ds;
- const char *tail;
- int err;
-
- err = dsl_dir_open(oldname, FTAG, &dd, &tail);
- if (err)
- return (err);
- if (tail == NULL) {
- err = dsl_dir_rename(dd, newname);
- dsl_dir_close(dd, FTAG);
- return (err);
- }
- if (tail[0] != '@') {
-		/* the name ended in a nonexistent component */
- dsl_dir_close(dd, FTAG);
- return (ENOENT);
- }
-
- dsl_dir_close(dd, FTAG);
-
- /* new name must be snapshot in same filesystem */
- tail = strchr(newname, '@');
- if (tail == NULL)
- return (EINVAL);
- tail++;
- if (strncmp(oldname, newname, tail - newname) != 0)
- return (EXDEV);
-
- if (recursive) {
- err = dsl_recursive_rename(oldname, newname);
- } else {
- err = dsl_dataset_open(oldname,
- DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds);
- if (err)
- return (err);
-
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- dsl_dataset_snapshot_rename_check,
- dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
-
- dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
- }
-
- return (err);
-}
-
-struct promotearg {
- uint64_t used, comp, uncomp, unique;
- uint64_t newnext_obj, snapnames_obj;
-};
-
-static int
-dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *hds = arg1;
- struct promotearg *pa = arg2;
- dsl_dir_t *dd = hds->ds_dir;
- dsl_pool_t *dp = hds->ds_dir->dd_pool;
- dsl_dir_t *pdd = NULL;
- dsl_dataset_t *ds = NULL;
- dsl_dataset_t *pivot_ds = NULL;
- dsl_dataset_t *newnext_ds = NULL;
- int err;
- char *name = NULL;
- uint64_t itor = 0;
- blkptr_t bp;
-
- bzero(pa, sizeof (*pa));
-
- /* Check that it is a clone */
- if (dd->dd_phys->dd_clone_parent_obj == 0)
- return (EINVAL);
-
-	/* this check is expensive, so only do it in syncing context */
- if (!dmu_tx_is_syncing(tx))
- return (0);
-
- if (err = dsl_dataset_open_obj(dp,
- dd->dd_phys->dd_clone_parent_obj,
- NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds))
- goto out;
- pdd = pivot_ds->ds_dir;
-
- {
- dsl_dataset_t *phds;
- if (err = dsl_dataset_open_obj(dd->dd_pool,
- pdd->dd_phys->dd_head_dataset_obj,
- NULL, DS_MODE_NONE, FTAG, &phds))
- goto out;
- pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj;
- dsl_dataset_close(phds, DS_MODE_NONE, FTAG);
- }
-
- if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
- err = EXDEV;
- goto out;
- }
-
- /* find pivot point's new next ds */
- VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object,
- NULL, DS_MODE_NONE, FTAG, &newnext_ds));
- while (newnext_ds->ds_phys->ds_prev_snap_obj != pivot_ds->ds_object) {
- dsl_dataset_t *prev;
-
- if (err = dsl_dataset_open_obj(dd->dd_pool,
- newnext_ds->ds_phys->ds_prev_snap_obj,
- NULL, DS_MODE_NONE, FTAG, &prev))
- goto out;
- dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
- newnext_ds = prev;
- }
- pa->newnext_obj = newnext_ds->ds_object;
-
-	/*
-	 * Compute the pivot point's new unique space: blocks on the new
-	 * next snapshot's deadlist that were born after the pivot's
-	 * previous snapshot will now be referenced only by the pivot.
-	 */
- while ((err = bplist_iterate(&newnext_ds->ds_deadlist,
- &itor, &bp)) == 0) {
- if (bp.blk_birth > pivot_ds->ds_phys->ds_prev_snap_txg)
- pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp);
- }
- if (err != ENOENT)
- goto out;
-
- /* Walk the snapshots that we are moving */
- name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- ds = pivot_ds;
- /* CONSTCOND */
- while (TRUE) {
- uint64_t val, dlused, dlcomp, dluncomp;
- dsl_dataset_t *prev;
-
- /* Check that the snapshot name does not conflict */
- dsl_dataset_name(ds, name);
- err = zap_lookup(dd->dd_pool->dp_meta_objset,
- hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
- 8, 1, &val);
- if (err != ENOENT) {
- if (err == 0)
- err = EEXIST;
- goto out;
- }
-
- /*
- * compute space to transfer. Each snapshot gave birth to:
- * (my used) - (prev's used) + (deadlist's used)
- */
- pa->used += ds->ds_phys->ds_used_bytes;
- pa->comp += ds->ds_phys->ds_compressed_bytes;
- pa->uncomp += ds->ds_phys->ds_uncompressed_bytes;
-
- /* If we reach the first snapshot, we're done. */
- if (ds->ds_phys->ds_prev_snap_obj == 0)
- break;
-
- if (err = bplist_space(&ds->ds_deadlist,
- &dlused, &dlcomp, &dluncomp))
- goto out;
- if (err = dsl_dataset_open_obj(dd->dd_pool,
- ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
- FTAG, &prev))
- goto out;
- pa->used += dlused - prev->ds_phys->ds_used_bytes;
- pa->comp += dlcomp - prev->ds_phys->ds_compressed_bytes;
- pa->uncomp += dluncomp - prev->ds_phys->ds_uncompressed_bytes;
-
- /*
- * We could be a clone of a clone. If we reach our
- * parent's branch point, we're done.
- */
- if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
- dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
- break;
- }
- if (ds != pivot_ds)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- ds = prev;
- }
-
- /* Check that there is enough space here */
- err = dsl_dir_transfer_possible(pdd, dd, pa->used);
-
-out:
- if (ds && ds != pivot_ds)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- if (pivot_ds)
- dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
- if (newnext_ds)
- dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
- if (name)
- kmem_free(name, MAXPATHLEN);
- return (err);
-}
-
-static void
-dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *hds = arg1;
- struct promotearg *pa = arg2;
- dsl_dir_t *dd = hds->ds_dir;
- dsl_pool_t *dp = hds->ds_dir->dd_pool;
- dsl_dir_t *pdd = NULL;
- dsl_dataset_t *ds, *pivot_ds;
- char *name;
-
- ASSERT(dd->dd_phys->dd_clone_parent_obj != 0);
- ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
-
- VERIFY(0 == dsl_dataset_open_obj(dp,
- dd->dd_phys->dd_clone_parent_obj,
- NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds));
- /*
-	 * We need to explicitly open pdd, since pivot_ds's ds_dir will be
- * changing.
- */
- VERIFY(0 == dsl_dir_open_obj(dp, pivot_ds->ds_dir->dd_object,
- NULL, FTAG, &pdd));
-
- /* move snapshots to this dir */
- name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- ds = pivot_ds;
- /* CONSTCOND */
- while (TRUE) {
- dsl_dataset_t *prev;
-
- /* move snap name entry */
- dsl_dataset_name(ds, name);
- VERIFY(0 == zap_remove(dp->dp_meta_objset,
- pa->snapnames_obj, ds->ds_snapname, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset,
- hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
- 8, 1, &ds->ds_object, tx));
-
- /* change containing dsl_dir */
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ASSERT3U(ds->ds_phys->ds_dir_obj, ==, pdd->dd_object);
- ds->ds_phys->ds_dir_obj = dd->dd_object;
- ASSERT3P(ds->ds_dir, ==, pdd);
- dsl_dir_close(ds->ds_dir, ds);
- VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
- NULL, ds, &ds->ds_dir));
-
- ASSERT3U(dsl_prop_numcb(ds), ==, 0);
-
- if (ds->ds_phys->ds_prev_snap_obj == 0)
- break;
-
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
- FTAG, &prev));
-
- if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
- dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
- break;
- }
- if (ds != pivot_ds)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- ds = prev;
- }
- if (ds != pivot_ds)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
-
- /* change pivot point's next snap */
- dmu_buf_will_dirty(pivot_ds->ds_dbuf, tx);
- pivot_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj;
-
-	/*
-	 * Swap clone parentage: dd inherits pdd's origin (if any), and
-	 * pdd becomes a clone of the pivot snapshot, which now lives
-	 * under dd.
-	 */
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- ASSERT3U(dd->dd_phys->dd_clone_parent_obj, ==, pivot_ds->ds_object);
- dd->dd_phys->dd_clone_parent_obj = pdd->dd_phys->dd_clone_parent_obj;
- dmu_buf_will_dirty(pdd->dd_dbuf, tx);
- pdd->dd_phys->dd_clone_parent_obj = pivot_ds->ds_object;
-
- /* change space accounting */
- dsl_dir_diduse_space(pdd, -pa->used, -pa->comp, -pa->uncomp, tx);
- dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx);
- pivot_ds->ds_phys->ds_unique_bytes = pa->unique;
-
- dsl_dir_close(pdd, FTAG);
- dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
- kmem_free(name, MAXPATHLEN);
-}
-
-int
-dsl_dataset_promote(const char *name)
-{
- dsl_dataset_t *ds;
- int err;
- dmu_object_info_t doi;
- struct promotearg pa;
-
- err = dsl_dataset_open(name, DS_MODE_NONE, FTAG, &ds);
- if (err)
- return (err);
-
- err = dmu_object_info(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_snapnames_zapobj, &doi);
- if (err) {
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
- return (err);
- }
-
- /*
- * Add in 128x the snapnames zapobj size, since we will be moving
- * a bunch of snapnames to the promoted ds, and dirtying their
- * bonus buffers.
- */
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- dsl_dataset_promote_check,
- dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks);
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
- return (err);
-}
-
-/*
- * Given a pool name and a dataset object number in that pool,
- * return the name of that dataset.
- */
-int
-dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
-{
- spa_t *spa;
- dsl_pool_t *dp;
- dsl_dataset_t *ds = NULL;
- int error;
-
- if ((error = spa_open(pname, &spa, FTAG)) != 0)
- return (error);
- dp = spa_get_dsl(spa);
- rw_enter(&dp->dp_config_rwlock, RW_READER);
- if ((error = dsl_dataset_open_obj(dp, obj,
- NULL, DS_MODE_NONE, FTAG, &ds)) != 0) {
- rw_exit(&dp->dp_config_rwlock);
- spa_close(spa, FTAG);
- return (error);
- }
- dsl_dataset_name(ds, buf);
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
- rw_exit(&dp->dp_config_rwlock);
- spa_close(spa, FTAG);
-
- return (0);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
deleted file mode 100644
index 5e563b6..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
+++ /dev/null
@@ -1,1215 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/spa.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-#include <sys/arc.h>
-#include "zfs_namecheck.h"
-
-static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd);
-static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx);
-
-
-/* ARGSUSED */
-static void
-dsl_dir_evict(dmu_buf_t *db, void *arg)
-{
- dsl_dir_t *dd = arg;
- dsl_pool_t *dp = dd->dd_pool;
- int t;
-
- for (t = 0; t < TXG_SIZE; t++) {
- ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
- ASSERT(dd->dd_tempreserved[t] == 0);
- ASSERT(dd->dd_space_towrite[t] == 0);
- }
-
- ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes);
-
- if (dd->dd_parent)
- dsl_dir_close(dd->dd_parent, dd);
-
- spa_close(dd->dd_pool->dp_spa, dd);
-
- /*
- * The props callback list should be empty since they hold the
- * dir open.
- */
- list_destroy(&dd->dd_prop_cbs);
- mutex_destroy(&dd->dd_lock);
- kmem_free(dd, sizeof (dsl_dir_t));
-}
-
-int
-dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
- const char *tail, void *tag, dsl_dir_t **ddp)
-{
- dmu_buf_t *dbuf;
- dsl_dir_t *dd;
- int err;
-
- ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
- dsl_pool_sync_context(dp));
-
- err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
- if (err)
- return (err);
- dd = dmu_buf_get_user(dbuf);
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(dbuf, &doi);
- ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR);
- }
-#endif
- /* XXX assert bonus buffer size is correct */
- if (dd == NULL) {
- dsl_dir_t *winner;
- int err;
-
- dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
- dd->dd_object = ddobj;
- dd->dd_dbuf = dbuf;
- dd->dd_pool = dp;
- dd->dd_phys = dbuf->db_data;
- dd->dd_used_bytes = dd->dd_phys->dd_used_bytes;
- mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
-
- list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
- offsetof(dsl_prop_cb_record_t, cbr_node));
-
- if (dd->dd_phys->dd_parent_obj) {
- err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
- NULL, dd, &dd->dd_parent);
- if (err) {
- mutex_destroy(&dd->dd_lock);
- kmem_free(dd, sizeof (dsl_dir_t));
- dmu_buf_rele(dbuf, tag);
- return (err);
- }
- if (tail) {
-#ifdef ZFS_DEBUG
- uint64_t foundobj;
-
- err = zap_lookup(dp->dp_meta_objset,
- dd->dd_parent->dd_phys->
- dd_child_dir_zapobj,
- tail, sizeof (foundobj), 1, &foundobj);
- ASSERT(err || foundobj == ddobj);
-#endif
- (void) strcpy(dd->dd_myname, tail);
- } else {
- err = zap_value_search(dp->dp_meta_objset,
- dd->dd_parent->dd_phys->
- dd_child_dir_zapobj,
- ddobj, dd->dd_myname);
- }
- if (err) {
- dsl_dir_close(dd->dd_parent, dd);
- mutex_destroy(&dd->dd_lock);
- kmem_free(dd, sizeof (dsl_dir_t));
- dmu_buf_rele(dbuf, tag);
- return (err);
- }
- } else {
- (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
- }
-
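-		/*
-		 * Publish our dsl_dir_t on the dbuf.  If another thread
-		 * raced us and published first, discard ours and use the
-		 * winner's instead.
-		 */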
- winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
- dsl_dir_evict);
- if (winner) {
- if (dd->dd_parent)
- dsl_dir_close(dd->dd_parent, dd);
- mutex_destroy(&dd->dd_lock);
- kmem_free(dd, sizeof (dsl_dir_t));
- dd = winner;
- } else {
- spa_open_ref(dp->dp_spa, dd);
- }
- }
-
- /*
- * The dsl_dir_t has both open-to-close and instantiate-to-evict
- * holds on the spa. We need the open-to-close holds because
- * otherwise the spa_refcnt wouldn't change when we open a
- * dir which the spa also has open, so we could incorrectly
- * think it was OK to unload/export/destroy the pool. We need
- * the instantiate-to-evict hold because the dsl_dir_t has a
- * pointer to the dd_pool, which has a pointer to the spa_t.
- */
- spa_open_ref(dp->dp_spa, tag);
- ASSERT3P(dd->dd_pool, ==, dp);
- ASSERT3U(dd->dd_object, ==, ddobj);
- ASSERT3P(dd->dd_dbuf, ==, dbuf);
- *ddp = dd;
- return (0);
-}
-
-void
-dsl_dir_close(dsl_dir_t *dd, void *tag)
-{
- dprintf_dd(dd, "%s\n", "");
- spa_close(dd->dd_pool->dp_spa, tag);
- dmu_buf_rele(dd->dd_dbuf, tag);
-}
-
-/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
-void
-dsl_dir_name(dsl_dir_t *dd, char *buf)
-{
- if (dd->dd_parent) {
- dsl_dir_name(dd->dd_parent, buf);
- (void) strcat(buf, "/");
- } else {
- buf[0] = '\0';
- }
- if (!MUTEX_HELD(&dd->dd_lock)) {
- /*
- * recursive mutex so that we can use
- * dprintf_dd() with dd_lock held
- */
- mutex_enter(&dd->dd_lock);
- (void) strcat(buf, dd->dd_myname);
- mutex_exit(&dd->dd_lock);
- } else {
- (void) strcat(buf, dd->dd_myname);
- }
-}
-
-/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
-int
-dsl_dir_namelen(dsl_dir_t *dd)
-{
- int result = 0;
-
- if (dd->dd_parent) {
- /* parent's name + 1 for the "/" */
- result = dsl_dir_namelen(dd->dd_parent) + 1;
- }
-
- if (!MUTEX_HELD(&dd->dd_lock)) {
- /* see dsl_dir_name */
- mutex_enter(&dd->dd_lock);
- result += strlen(dd->dd_myname);
- mutex_exit(&dd->dd_lock);
- } else {
- result += strlen(dd->dd_myname);
- }
-
- return (result);
-}
-
-int
-dsl_dir_is_private(dsl_dir_t *dd)
-{
- int rv = FALSE;
-
- if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent))
- rv = TRUE;
- if (dataset_name_hidden(dd->dd_myname))
- rv = TRUE;
- return (rv);
-}
-
-
-static int
-getcomponent(const char *path, char *component, const char **nextp)
-{
- char *p;
- if (path == NULL)
- return (ENOENT);
- /* This would be a good place to reserve some namespace... */
- p = strpbrk(path, "/@");
- if (p && (p[1] == '/' || p[1] == '@')) {
- /* two separators in a row */
- return (EINVAL);
- }
- if (p == NULL || p == path) {
- /*
- * if the first thing is an @ or /, it had better be an
- * @ and it had better not have any more ats or slashes,
- * and it had better have something after the @.
- */
- if (p != NULL &&
- (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
- return (EINVAL);
- if (strlen(path) >= MAXNAMELEN)
- return (ENAMETOOLONG);
- (void) strcpy(component, path);
- p = NULL;
- } else if (p[0] == '/') {
- if (p-path >= MAXNAMELEN)
- return (ENAMETOOLONG);
- (void) strncpy(component, path, p - path);
- component[p-path] = '\0';
- p++;
- } else if (p[0] == '@') {
- /*
- * if the next separator is an @, there better not be
- * any more slashes.
- */
- if (strchr(path, '/'))
- return (EINVAL);
- if (p-path >= MAXNAMELEN)
- return (ENAMETOOLONG);
- (void) strncpy(component, path, p - path);
- component[p-path] = '\0';
- } else {
- ASSERT(!"invalid p");
- }
- *nextp = p;
- return (0);
-}
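-
-/*
- * E.g. getcomponent() on "tank/home@snap" yields "tank" with next
- * pointing at "home@snap"; a second call yields "home" with next
- * pointing at "@snap", and the caller handles the snapshot component.
- */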
-
-/*
- * Same as dsl_dir_open, but ignore the pool (first) component of the
- * name and use the given spa instead.
- */
-int
-dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
- dsl_dir_t **ddp, const char **tailp)
-{
- char buf[MAXNAMELEN];
- const char *next, *nextnext = NULL;
- int err;
- dsl_dir_t *dd;
- dsl_pool_t *dp;
- uint64_t ddobj;
- int openedspa = FALSE;
-
- dprintf("%s\n", name);
-
- err = getcomponent(name, buf, &next);
- if (err)
- return (err);
- if (spa == NULL) {
- err = spa_open(buf, &spa, FTAG);
- if (err) {
- dprintf("spa_open(%s) failed\n", buf);
- return (err);
- }
- openedspa = TRUE;
-
- /* XXX this assertion belongs in spa_open */
- ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
- }
-
- dp = spa_get_dsl(spa);
-
- rw_enter(&dp->dp_config_rwlock, RW_READER);
- err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
- if (err) {
- rw_exit(&dp->dp_config_rwlock);
- if (openedspa)
- spa_close(spa, FTAG);
- return (err);
- }
-
- while (next != NULL) {
- dsl_dir_t *child_ds;
- err = getcomponent(next, buf, &nextnext);
- if (err)
- break;
- ASSERT(next[0] != '\0');
- if (next[0] == '@')
- break;
- dprintf("looking up %s in obj%lld\n",
- buf, dd->dd_phys->dd_child_dir_zapobj);
-
- err = zap_lookup(dp->dp_meta_objset,
- dd->dd_phys->dd_child_dir_zapobj,
- buf, sizeof (ddobj), 1, &ddobj);
- if (err) {
- if (err == ENOENT)
- err = 0;
- break;
- }
-
- err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds);
- if (err)
- break;
- dsl_dir_close(dd, tag);
- dd = child_ds;
- next = nextnext;
- }
- rw_exit(&dp->dp_config_rwlock);
-
- if (err) {
- dsl_dir_close(dd, tag);
- if (openedspa)
- spa_close(spa, FTAG);
- return (err);
- }
-
- /*
- * It's an error if there's more than one component left, or
- * tailp==NULL and there's any component left.
- */
- if (next != NULL &&
- (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
- /* bad path name */
- dsl_dir_close(dd, tag);
- dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
- err = ENOENT;
- }
- if (tailp)
- *tailp = next;
- if (openedspa)
- spa_close(spa, FTAG);
- *ddp = dd;
- return (err);
-}
-
-/*
- * Open the dsl_dir_t for a dataset name, and set *tailp to any last
- * component that couldn't be found.  Returns an error if the path is
- * bogus, or if tailp==NULL and we couldn't parse the whole name.
- * (*tailp)[0] == '@' means that the last component is a snapshot.
- */
-int
-dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
-{
- return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
-}
-
-uint64_t
-dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx)
-{
- objset_t *mos = pds->dd_pool->dp_meta_objset;
- uint64_t ddobj;
- dsl_dir_phys_t *dsphys;
- dmu_buf_t *dbuf;
-
- ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
- DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
- VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
- name, sizeof (uint64_t), 1, &ddobj, tx));
- VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- dsphys = dbuf->db_data;
-
- dsphys->dd_creation_time = gethrestime_sec();
- dsphys->dd_parent_obj = pds->dd_object;
- dsphys->dd_props_zapobj = zap_create(mos,
- DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
- dsphys->dd_child_dir_zapobj = zap_create(mos,
- DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
- dmu_buf_rele(dbuf, FTAG);
-
- return (ddobj);
-}
-
-/* ARGSUSED */
-int
-dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- dsl_pool_t *dp = dd->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
- int err;
- uint64_t count;
-
- /*
- * There should be exactly two holds, both from
- * dsl_dataset_destroy: one on the dd directory, and one on its
-	 * head ds.  Otherwise, someone is trying to look up something
- * inside this dir while we want to destroy it. The
- * config_rwlock ensures that nobody else opens it after we
- * check.
- */
- if (dmu_buf_refcount(dd->dd_dbuf) > 2)
- return (EBUSY);
-
- err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count);
- if (err)
- return (err);
- if (count != 0)
- return (EEXIST);
-
- return (0);
-}
-
-void
-dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- uint64_t val, obj;
-
- ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
- ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
-
- /* Remove our reservation. */
- val = 0;
- dsl_dir_set_reservation_sync(dd, &val, tx);
- ASSERT3U(dd->dd_used_bytes, ==, 0);
- ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
-
- VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
- VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
- VERIFY(0 == zap_remove(mos,
- dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
-
- obj = dd->dd_object;
- dsl_dir_close(dd, tag);
- VERIFY(0 == dmu_object_free(mos, obj, tx));
-}
-
-void
-dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx)
-{
- dsl_dir_phys_t *dsp;
- dmu_buf_t *dbuf;
- int error;
-
- *ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
- DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
-
- error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET,
- sizeof (uint64_t), 1, ddobjp, tx);
- ASSERT3U(error, ==, 0);
-
- VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- dsp = dbuf->db_data;
-
- dsp->dd_creation_time = gethrestime_sec();
- dsp->dd_props_zapobj = zap_create(mos,
- DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
- dsp->dd_child_dir_zapobj = zap_create(mos,
- DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
-
- dmu_buf_rele(dbuf, FTAG);
-}
-
-void
-dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
-{
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
- dsl_dir_space_available(dd, NULL, 0, TRUE));
-
- mutex_enter(&dd->dd_lock);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_used_bytes);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
- dd->dd_phys->dd_quota);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
- dd->dd_phys->dd_reserved);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
- dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
- (dd->dd_phys->dd_uncompressed_bytes * 100 /
- dd->dd_phys->dd_compressed_bytes));
- mutex_exit(&dd->dd_lock);
-
- if (dd->dd_phys->dd_clone_parent_obj) {
- dsl_dataset_t *ds;
- char buf[MAXNAMELEN];
-
- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
- VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_clone_parent_obj,
- NULL, DS_MODE_NONE, FTAG, &ds));
- dsl_dataset_name(ds, buf);
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
- rw_exit(&dd->dd_pool->dp_config_rwlock);
-
- dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
- }
-}
-
-void
-dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = dd->dd_pool;
-
- ASSERT(dd->dd_phys);
-
- if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) {
- /* up the hold count until we can be written out */
- dmu_buf_add_ref(dd->dd_dbuf, dd);
- }
-}
-
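-/*
- * How much of a change of 'delta' in dd's space usage is visible to
- * its parent?  Usage up to dd_reserved is already charged to the
- * parent via the reservation, so only the portion above the
- * reservation changes what the parent sees.
- */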
-static int64_t
-parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
-{
- uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
- uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
- return (new_accounted - old_accounted);
-}
-
-void
-dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
-{
- ASSERT(dmu_tx_is_syncing(tx));
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
-
- mutex_enter(&dd->dd_lock);
- ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0);
- dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
- dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
- dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
- dd->dd_phys->dd_used_bytes = dd->dd_used_bytes;
- mutex_exit(&dd->dd_lock);
-
- /* release the hold from dsl_dir_dirty */
- dmu_buf_rele(dd->dd_dbuf, dd);
-}
-
-static uint64_t
-dsl_dir_estimated_space(dsl_dir_t *dd)
-{
- int64_t space;
- int i;
-
- ASSERT(MUTEX_HELD(&dd->dd_lock));
-
- space = dd->dd_phys->dd_used_bytes;
- ASSERT(space >= 0);
- for (i = 0; i < TXG_SIZE; i++) {
- space += dd->dd_space_towrite[i&TXG_MASK];
- ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
- }
- return (space);
-}
-
-/*
- * How much space would dd have available if ancestor had delta applied
- * to it? If ondiskonly is set, we're only interested in what's
- * on-disk, not estimated pending changes.
- */
-uint64_t
-dsl_dir_space_available(dsl_dir_t *dd,
- dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
-{
- uint64_t parentspace, myspace, quota, used;
-
- /*
-	 * Absent any other restrictions, assume we have
- * unlimited space available.
- */
- quota = UINT64_MAX;
- parentspace = UINT64_MAX;
-
- if (dd->dd_parent != NULL) {
- parentspace = dsl_dir_space_available(dd->dd_parent,
- ancestor, delta, ondiskonly);
- }
-
- mutex_enter(&dd->dd_lock);
- if (dd->dd_phys->dd_quota != 0)
- quota = dd->dd_phys->dd_quota;
- if (ondiskonly) {
- used = dd->dd_used_bytes;
- } else {
- used = dsl_dir_estimated_space(dd);
- }
- if (dd == ancestor)
- used += delta;
-
- if (dd->dd_parent == NULL) {
- uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
- quota = MIN(quota, poolsize);
- }
-
- if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
- /*
- * We have some space reserved, in addition to what our
- * parent gave us.
- */
- parentspace += dd->dd_phys->dd_reserved - used;
- }
-
- if (used > quota) {
- /* over quota */
- myspace = 0;
-
- /*
- * While it's OK to be a little over quota, if
- * we think we are using more space than there
- * is in the pool (which is already 1.6% more than
- * dsl_pool_adjustedsize()), something is very
- * wrong.
- */
- ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa));
- } else {
- /*
- * the lesser of the space provided by our parent and
- * the space left in our quota
- */
- myspace = MIN(parentspace, quota - used);
- }
-
- mutex_exit(&dd->dd_lock);
-
- return (myspace);
-}
-
-struct tempreserve {
- list_node_t tr_node;
- dsl_dir_t *tr_ds;
- uint64_t tr_size;
-};
-
-/*
- * Internal workhorse for dsl_dir_tempreserve_space(): reserve space in
- * this dsl_dir and, recursively, in its ancestors as needed.
- */
-static int
-dsl_dir_tempreserve_impl(dsl_dir_t *dd,
- uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx)
-{
- uint64_t txg = tx->tx_txg;
- uint64_t est_used, quota, parent_rsrv;
- int edquot = EDQUOT;
- int txgidx = txg & TXG_MASK;
- int i;
- struct tempreserve *tr;
-
- ASSERT3U(txg, !=, 0);
- ASSERT3S(asize, >=, 0);
-
- mutex_enter(&dd->dd_lock);
- /*
- * Check against the dsl_dir's quota. We don't add in the delta
- * when checking for over-quota because they get one free hit.
- */
- est_used = dsl_dir_estimated_space(dd);
- for (i = 0; i < TXG_SIZE; i++)
- est_used += dd->dd_tempreserved[i];
-
- quota = UINT64_MAX;
-
- if (dd->dd_phys->dd_quota)
- quota = dd->dd_phys->dd_quota;
-
- /*
- * If this transaction will result in a net free of space, we want
- * to let it through, but we have to be careful: the space that it
- * frees won't become available until *after* this txg syncs.
- * Therefore, to ensure that it's possible to remove files from
- * a full pool without inducing transient overcommits, we throttle
- * netfree transactions against a quota that is slightly larger,
- * but still within the pool's allocation slop. In cases where
- * we're very close to full, this will allow a steady trickle of
- * removes to get through.
- */
- if (dd->dd_parent == NULL) {
- uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
- if (poolsize < quota) {
- quota = poolsize;
- edquot = ENOSPC;
- }
- } else if (netfree) {
- quota = UINT64_MAX;
- }
-
- /*
-	 * If they are requesting more space and our current estimate is
-	 * over quota, they get to try again, unless the actual on-disk
-	 * usage is over quota and there are no pending changes (which
- * may free up space for us).
- */
- if (asize > 0 && est_used > quota) {
- if (dd->dd_space_towrite[txg & TXG_MASK] != 0 ||
- dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 ||
- dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 ||
- dd->dd_used_bytes < quota)
- edquot = ERESTART;
- dprintf_dd(dd, "failing: used=%lluK est_used = %lluK "
- "quota=%lluK tr=%lluK err=%d\n",
- dd->dd_used_bytes>>10, est_used>>10,
- quota>>10, asize>>10, edquot);
- mutex_exit(&dd->dd_lock);
- return (edquot);
- }
-
- /* We need to up our estimated delta before dropping dd_lock */
- dd->dd_tempreserved[txgidx] += asize;
-
- parent_rsrv = parent_delta(dd, est_used, asize);
- mutex_exit(&dd->dd_lock);
-
- tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
- tr->tr_ds = dd;
- tr->tr_size = asize;
- list_insert_tail(tr_list, tr);
-
- /* see if it's OK with our parent */
- if (dd->dd_parent && parent_rsrv) {
- return (dsl_dir_tempreserve_impl(dd->dd_parent,
- parent_rsrv, netfree, tr_list, tx));
- } else {
- return (0);
- }
-}
-
-/*
- * Reserve space in this dsl_dir, to be used in this tx's txg.
- * After the space has been dirtied (and thus
- * dsl_dir_willuse_space() has been called), the reservation should
- * be canceled, using dsl_dir_tempreserve_clear().
- */
-int
-dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize,
- uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx)
-{
- int err = 0;
- list_t *tr_list;
-
- tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
- list_create(tr_list, sizeof (struct tempreserve),
- offsetof(struct tempreserve, tr_node));
- ASSERT3S(asize, >=, 0);
- ASSERT3S(fsize, >=, 0);
-
- err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
- tr_list, tx);
-
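-	/*
-	 * Also reserve the in-core (logical) size with the ARC; the
-	 * entry is recorded with tr_ds == NULL so that
-	 * dsl_dir_tempreserve_clear() knows to undo it via the ARC.
-	 */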
- if (err == 0) {
- struct tempreserve *tr;
-
- err = arc_tempreserve_space(lsize);
- if (err == 0) {
- tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
- tr->tr_ds = NULL;
- tr->tr_size = lsize;
- list_insert_tail(tr_list, tr);
- }
- }
-
- if (err)
- dsl_dir_tempreserve_clear(tr_list, tx);
- else
- *tr_cookiep = tr_list;
- return (err);
-}
-
-/*
- * Clear a temporary reservation that we previously made with
- * dsl_dir_tempreserve_space().
- */
-void
-dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
-{
- int txgidx = tx->tx_txg & TXG_MASK;
- list_t *tr_list = tr_cookie;
- struct tempreserve *tr;
-
- ASSERT3U(tx->tx_txg, !=, 0);
-
- while (tr = list_head(tr_list)) {
- if (tr->tr_ds == NULL) {
- arc_tempreserve_clear(tr->tr_size);
- } else {
- mutex_enter(&tr->tr_ds->dd_lock);
- ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
- tr->tr_size);
- tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
- mutex_exit(&tr->tr_ds->dd_lock);
- }
- list_remove(tr_list, tr);
- kmem_free(tr, sizeof (struct tempreserve));
- }
-
- kmem_free(tr_list, sizeof (list_t));
-}
-
-/*
- * Call in open context when we think we're going to write/free space,
- * eg. when dirtying data. Be conservative (ie. OK to write less than
- * this or free more than this, but don't write more or free less).
- */
-void
-dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
-{
- int64_t parent_space;
- uint64_t est_used;
-
- mutex_enter(&dd->dd_lock);
- if (space > 0)
- dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
-
- est_used = dsl_dir_estimated_space(dd);
- parent_space = parent_delta(dd, est_used, space);
- mutex_exit(&dd->dd_lock);
-
- /* Make sure that we clean up dd_space_to* */
- dsl_dir_dirty(dd, tx);
-
- /* XXX this is potentially expensive and unnecessary... */
- if (parent_space && dd->dd_parent)
- dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
-}
-
-/* call from syncing context when we actually write/free space for this dd */
-void
-dsl_dir_diduse_space(dsl_dir_t *dd,
- int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
-{
- int64_t accounted_delta;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- dsl_dir_dirty(dd, tx);
-
- mutex_enter(&dd->dd_lock);
- accounted_delta = parent_delta(dd, dd->dd_used_bytes, used);
- ASSERT(used >= 0 || dd->dd_used_bytes >= -used);
- ASSERT(compressed >= 0 ||
- dd->dd_phys->dd_compressed_bytes >= -compressed);
- ASSERT(uncompressed >= 0 ||
- dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
- dd->dd_used_bytes += used;
- dd->dd_phys->dd_uncompressed_bytes += uncompressed;
- dd->dd_phys->dd_compressed_bytes += compressed;
- mutex_exit(&dd->dd_lock);
-
- if (dd->dd_parent != NULL) {
- dsl_dir_diduse_space(dd->dd_parent,
- accounted_delta, compressed, uncompressed, tx);
- }
-}
-
-/* ARGSUSED */
-static int
-dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- uint64_t *quotap = arg2;
- uint64_t new_quota = *quotap;
- int err = 0;
- uint64_t towrite;
-
- if (new_quota == 0)
- return (0);
-
- mutex_enter(&dd->dd_lock);
- /*
- * If we are doing the preliminary check in open context, and
- * there are pending changes, then don't fail it, since the
-	 * pending changes could under-estimate the amount of space to be
- * freed up.
- */
- towrite = dd->dd_space_towrite[0] + dd->dd_space_towrite[1] +
- dd->dd_space_towrite[2] + dd->dd_space_towrite[3];
- if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
- (new_quota < dd->dd_phys->dd_reserved ||
- new_quota < dsl_dir_estimated_space(dd))) {
- err = ENOSPC;
- }
- mutex_exit(&dd->dd_lock);
- return (err);
-}
-
-static void
-dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- uint64_t *quotap = arg2;
- uint64_t new_quota = *quotap;
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
-
- mutex_enter(&dd->dd_lock);
- dd->dd_phys->dd_quota = new_quota;
- mutex_exit(&dd->dd_lock);
-}
-
-int
-dsl_dir_set_quota(const char *ddname, uint64_t quota)
-{
- dsl_dir_t *dd;
- int err;
-
- err = dsl_dir_open(ddname, FTAG, &dd, NULL);
- if (err)
- return (err);
- /*
- * If someone removes a file, then tries to set the quota, we
- * want to make sure the file freeing takes effect.
- */
- txg_wait_open(dd->dd_pool, 0);
-
- err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
- dsl_dir_set_quota_sync, dd, &quota, 0);
- dsl_dir_close(dd, FTAG);
- return (err);
-}
-
-/* ARGSUSED */
-static int
-dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- uint64_t *reservationp = arg2;
- uint64_t new_reservation = *reservationp;
- uint64_t used, avail;
- int64_t delta;
-
- if (new_reservation > INT64_MAX)
- return (EOVERFLOW);
-
- /*
- * If we are doing the preliminary check in open context, the
- * space estimates may be inaccurate.
- */
- if (!dmu_tx_is_syncing(tx))
- return (0);
-
- mutex_enter(&dd->dd_lock);
- used = dd->dd_used_bytes;
- delta = MAX(used, new_reservation) -
- MAX(used, dd->dd_phys->dd_reserved);
- mutex_exit(&dd->dd_lock);
-
- if (dd->dd_parent) {
- avail = dsl_dir_space_available(dd->dd_parent,
- NULL, 0, FALSE);
- } else {
- avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
- }
-
- if (delta > 0 && delta > avail)
- return (ENOSPC);
- if (delta > 0 && dd->dd_phys->dd_quota > 0 &&
- new_reservation > dd->dd_phys->dd_quota)
- return (ENOSPC);
- return (0);
-}
-
-static void
-dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- uint64_t *reservationp = arg2;
- uint64_t new_reservation = *reservationp;
- uint64_t used;
- int64_t delta;
-
- mutex_enter(&dd->dd_lock);
- used = dd->dd_used_bytes;
- delta = MAX(used, new_reservation) -
- MAX(used, dd->dd_phys->dd_reserved);
- mutex_exit(&dd->dd_lock);
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_reserved = new_reservation;
-
- if (dd->dd_parent != NULL) {
- /* Roll up this additional usage into our ancestors */
- dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx);
- }
-}
-
-int
-dsl_dir_set_reservation(const char *ddname, uint64_t reservation)
-{
- dsl_dir_t *dd;
- int err;
-
- err = dsl_dir_open(ddname, FTAG, &dd, NULL);
- if (err)
- return (err);
- err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
- dsl_dir_set_reservation_sync, dd, &reservation, 0);
- dsl_dir_close(dd, FTAG);
- return (err);
-}
-
-static dsl_dir_t *
-closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
-{
- for (; ds1; ds1 = ds1->dd_parent) {
- dsl_dir_t *dd;
- for (dd = ds2; dd; dd = dd->dd_parent) {
- if (ds1 == dd)
- return (dd);
- }
- }
- return (NULL);
-}
-
-/*
- * If delta is applied to dd, how much of that delta would be applied to
- * ancestor? Syncing context only.
- */
-static int64_t
-would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
-{
- if (dd == ancestor)
- return (delta);
-
- mutex_enter(&dd->dd_lock);
- delta = parent_delta(dd, dd->dd_used_bytes, delta);
- mutex_exit(&dd->dd_lock);
- return (would_change(dd->dd_parent, delta, ancestor));
-}
-
-struct renamearg {
- dsl_dir_t *newparent;
- const char *mynewname;
-};
-
-/* ARGSUSED */
-static int
-dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- struct renamearg *ra = arg2;
- dsl_pool_t *dp = dd->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
- int err;
- uint64_t val;
-
- /* There should be 2 references: the open and the dirty */
- if (dmu_buf_refcount(dd->dd_dbuf) > 2)
- return (EBUSY);
-
- /* check for existing name */
- err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
- ra->mynewname, 8, 1, &val);
- if (err == 0)
- return (EEXIST);
- if (err != ENOENT)
- return (err);
-
- if (ra->newparent != dd->dd_parent) {
- /* is there enough space? */
- uint64_t myspace =
- MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
-
- /* no rename into our descendant */
- if (closest_common_ancestor(dd, ra->newparent) == dd)
- return (EINVAL);
-
- if (err = dsl_dir_transfer_possible(dd->dd_parent,
- ra->newparent, myspace))
- return (err);
- }
-
- return (0);
-}
-
-static void
-dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- struct renamearg *ra = arg2;
- dsl_pool_t *dp = dd->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
- int err;
-
- ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);
-
- if (ra->newparent != dd->dd_parent) {
- uint64_t myspace =
- MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
-
- dsl_dir_diduse_space(dd->dd_parent, -myspace,
- -dd->dd_phys->dd_compressed_bytes,
- -dd->dd_phys->dd_uncompressed_bytes, tx);
- dsl_dir_diduse_space(ra->newparent, myspace,
- dd->dd_phys->dd_compressed_bytes,
- dd->dd_phys->dd_uncompressed_bytes, tx);
- }
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
-
- /* remove from old parent zapobj */
- err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
- dd->dd_myname, tx);
- ASSERT3U(err, ==, 0);
-
- (void) strcpy(dd->dd_myname, ra->mynewname);
- dsl_dir_close(dd->dd_parent, dd);
- dd->dd_phys->dd_parent_obj = ra->newparent->dd_object;
- VERIFY(0 == dsl_dir_open_obj(dd->dd_pool,
- ra->newparent->dd_object, NULL, dd, &dd->dd_parent));
-
- /* add to new parent zapobj */
- err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
- dd->dd_myname, 8, 1, &dd->dd_object, tx);
- ASSERT3U(err, ==, 0);
-}
-
-int
-dsl_dir_rename(dsl_dir_t *dd, const char *newname)
-{
- struct renamearg ra;
- int err;
-
- /* new parent should exist */
- err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname);
- if (err)
- return (err);
-
- /* can't rename to different pool */
- if (dd->dd_pool != ra.newparent->dd_pool) {
- err = ENXIO;
- goto out;
- }
-
- /* new name should not already exist */
- if (ra.mynewname == NULL) {
- err = EEXIST;
- goto out;
- }
-
- err = dsl_sync_task_do(dd->dd_pool,
- dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
-
-out:
- dsl_dir_close(ra.newparent, FTAG);
- return (err);
-}
-
-int
-dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
-{
- dsl_dir_t *ancestor;
- int64_t adelta;
- uint64_t avail;
-
- ancestor = closest_common_ancestor(sdd, tdd);
- adelta = would_change(sdd, -space, ancestor);
- avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
- if (avail < space)
- return (ENOSPC);
-
- return (0);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
deleted file mode 100644
index 00abf7e..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/arc.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
-#include <sys/fs/zfs.h>
-
-static int
-dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp)
-{
- uint64_t obj;
- int err;
-
- err = zap_lookup(dp->dp_meta_objset,
- dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
- MOS_DIR_NAME, sizeof (obj), 1, &obj);
- if (err)
- return (err);
-
- return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp));
-}
-
-static dsl_pool_t *
-dsl_pool_open_impl(spa_t *spa, uint64_t txg)
-{
- dsl_pool_t *dp;
- blkptr_t *bp = spa_get_rootblkptr(spa);
-
- dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
- dp->dp_spa = spa;
- dp->dp_meta_rootbp = *bp;
- rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
- txg_init(dp, txg);
-
- txg_list_create(&dp->dp_dirty_datasets,
- offsetof(dsl_dataset_t, ds_dirty_link));
- txg_list_create(&dp->dp_dirty_dirs,
- offsetof(dsl_dir_t, dd_dirty_link));
- txg_list_create(&dp->dp_sync_tasks,
- offsetof(dsl_sync_task_group_t, dstg_node));
- list_create(&dp->dp_synced_objsets, sizeof (dsl_dataset_t),
- offsetof(dsl_dataset_t, ds_synced_link));
-
- return (dp);
-}
-
-int
-dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
-{
- int err;
- dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
- objset_impl_t *osi;
-
- rw_enter(&dp->dp_config_rwlock, RW_READER);
- err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
- if (err)
- goto out;
- dp->dp_meta_objset = &osi->os;
-
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
- &dp->dp_root_dir_obj);
- if (err)
- goto out;
-
- err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
- NULL, dp, &dp->dp_root_dir);
- if (err)
- goto out;
-
- err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir);
- if (err)
- goto out;
-
-out:
- rw_exit(&dp->dp_config_rwlock);
- if (err)
- dsl_pool_close(dp);
- else
- *dpp = dp;
-
- return (err);
-}
-
-void
-dsl_pool_close(dsl_pool_t *dp)
-{
- /* drop our reference from dsl_pool_open() */
- if (dp->dp_mos_dir)
- dsl_dir_close(dp->dp_mos_dir, dp);
- if (dp->dp_root_dir)
- dsl_dir_close(dp->dp_root_dir, dp);
-
- /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
- if (dp->dp_meta_objset)
- dmu_objset_evict(NULL, dp->dp_meta_objset->os);
-
- txg_list_destroy(&dp->dp_dirty_datasets);
- txg_list_destroy(&dp->dp_dirty_dirs);
- txg_list_destroy(&dp->dp_sync_tasks);
- list_destroy(&dp->dp_synced_objsets);
-
- arc_flush();
- txg_fini(dp);
- rw_destroy(&dp->dp_config_rwlock);
- kmem_free(dp, sizeof (dsl_pool_t));
-}
-
-dsl_pool_t *
-dsl_pool_create(spa_t *spa, uint64_t txg)
-{
- int err;
- dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
- dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
- dp->dp_meta_objset = &dmu_objset_create_impl(spa,
- NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;
-
- /* create the pool directory */
- err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
- ASSERT3U(err, ==, 0);
-
- /* create and open the root dir */
- dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
- VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
- NULL, dp, &dp->dp_root_dir));
-
- /* create and open the meta-objset dir */
- (void) dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx);
- VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir));
-
- dmu_tx_commit(tx);
-
- return (dp);
-}
-
-void
-dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
-{
- zio_t *zio;
- dmu_tx_t *tx;
- dsl_dir_t *dd;
- dsl_dataset_t *ds;
- dsl_sync_task_group_t *dstg;
- objset_impl_t *mosi = dp->dp_meta_objset->os;
- int err;
-
- tx = dmu_tx_create_assigned(dp, txg);
-
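-	/*
-	 * Sync order matters: dirty datasets first, then any sync
-	 * tasks, then dirty dirs, and finally the MOS itself, whose
-	 * new root block pointer is handed back to the spa.
-	 */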
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
- while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
- if (!list_link_active(&ds->ds_synced_link))
- list_insert_tail(&dp->dp_synced_objsets, ds);
- else
- dmu_buf_rele(ds->ds_dbuf, ds);
- dsl_dataset_sync(ds, zio, tx);
- }
- err = zio_wait(zio);
- ASSERT(err == 0);
-
- while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
- dsl_sync_task_group_sync(dstg, tx);
- while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
- dsl_dir_sync(dd, tx);
-
- if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
- list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
- dmu_objset_sync(mosi, zio, tx);
- err = zio_wait(zio);
- ASSERT(err == 0);
- dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
- spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
- }
-
- dmu_tx_commit(tx);
-}
-
-void
-dsl_pool_zil_clean(dsl_pool_t *dp)
-{
- dsl_dataset_t *ds;
-
- while (ds = list_head(&dp->dp_synced_objsets)) {
- list_remove(&dp->dp_synced_objsets, ds);
- ASSERT(ds->ds_user_ptr != NULL);
- zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
- dmu_buf_rele(ds->ds_dbuf, ds);
- }
-}
-
-/*
- * TRUE if the current thread is the tx_sync_thread or if we
- * are being called from SPA context during pool initialization.
- */
-int
-dsl_pool_sync_context(dsl_pool_t *dp)
-{
- return (curthread == dp->dp_tx.tx_sync_thread ||
- spa_get_dsl(dp->dp_spa) == NULL);
-}
-
-uint64_t
-dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
-{
- uint64_t space, resv;
-
- /*
- * Reserve about 1.6% (1/64), or at least 32MB, for allocation
- * efficiency.
- * XXX The intent log is not accounted for, so it must fit
- * within this slop.
- *
- * If we're trying to assess whether it's OK to do a free,
- * cut the reservation in half to allow forward progress
- * (e.g. make it possible to rm(1) files from a full pool).
- */
- space = spa_get_dspace(dp->dp_spa);
- resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
- if (netfree)
- resv >>= 1;
-
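-	/*
-	 * E.g. on a 1TB pool we reserve 16GB (1/64), leaving ~1008GB
-	 * usable; a netfree caller gets half the slop back, ~1016GB.
-	 */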
- return (space - resv);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
deleted file mode 100644
index 2fff66d..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
+++ /dev/null
@@ -1,501 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/spa.h>
-#include <sys/zio_checksum.h> /* for the default checksum value */
-#include <sys/zap.h>
-#include <sys/fs/zfs.h>
-
-#include "zfs_prop.h"
-
-static int
-dodefault(const char *propname, int intsz, int numint, void *buf)
-{
- zfs_prop_t prop;
-
- if ((prop = zfs_name_to_prop(propname)) == ZFS_PROP_INVAL ||
- zfs_prop_readonly(prop))
- return (ENOENT);
-
- if (zfs_prop_get_type(prop) == prop_type_string) {
- if (intsz != 1)
- return (EOVERFLOW);
- (void) strncpy(buf, zfs_prop_default_string(prop), numint);
- } else {
- if (intsz != 8 || numint < 1)
- return (EOVERFLOW);
-
- *(uint64_t *)buf = zfs_prop_default_numeric(prop);
- }
-
- return (0);
-}
-
-static int
-dsl_prop_get_impl(dsl_dir_t *dd, const char *propname,
- int intsz, int numint, void *buf, char *setpoint)
-{
- int err = ENOENT;
- zfs_prop_t prop;
-
- if (setpoint)
- setpoint[0] = '\0';
-
- prop = zfs_name_to_prop(propname);
-
- /*
- * Note: dd may be NULL, therefore we shouldn't dereference it
- * outside this loop.
- */
- for (; dd != NULL; dd = dd->dd_parent) {
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
- err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
- propname, intsz, numint, buf);
- if (err != ENOENT) {
- if (setpoint)
- dsl_dir_name(dd, setpoint);
- break;
- }
-
- /*
- * Break out of this loop for non-inheritable properties.
- */
- if (prop != ZFS_PROP_INVAL &&
- !zfs_prop_inheritable(prop))
- break;
- }
- if (err == ENOENT)
- err = dodefault(propname, intsz, numint, buf);
-
- return (err);
-}
-
-/*
- * Register interest in the named property. We'll call the callback
- * once to notify it of the current property value, and again each time
- * the property changes, until this callback is unregistered.
- *
- * Return 0 on success, errno if the prop is not an integer value.
- */
-int
-dsl_prop_register(dsl_dataset_t *ds, const char *propname,
- dsl_prop_changed_cb_t *callback, void *cbarg)
-{
- dsl_dir_t *dd = ds->ds_dir;
- uint64_t value;
- dsl_prop_cb_record_t *cbr;
- int err;
- int need_rwlock;
-
- need_rwlock = !RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock);
- if (need_rwlock)
- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
-
- err = dsl_prop_get_impl(dd, propname, 8, 1, &value, NULL);
- if (err != 0) {
- if (need_rwlock)
- rw_exit(&dd->dd_pool->dp_config_rwlock);
- return (err);
- }
-
- cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
- cbr->cbr_ds = ds;
- cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP);
- (void) strcpy((char *)cbr->cbr_propname, propname);
- cbr->cbr_func = callback;
- cbr->cbr_arg = cbarg;
- mutex_enter(&dd->dd_lock);
- list_insert_head(&dd->dd_prop_cbs, cbr);
- mutex_exit(&dd->dd_lock);
-
- cbr->cbr_func(cbr->cbr_arg, value);
-
- VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, dd->dd_object,
- NULL, cbr, &dd));
- if (need_rwlock)
- rw_exit(&dd->dd_pool->dp_config_rwlock);
- /* Leave dataset open until this callback is unregistered */
- return (0);
-}
-
-int
-dsl_prop_get_ds(dsl_dir_t *dd, const char *propname,
- int intsz, int numints, void *buf, char *setpoint)
-{
- int err;
-
- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
- err = dsl_prop_get_impl(dd, propname, intsz, numints, buf, setpoint);
- rw_exit(&dd->dd_pool->dp_config_rwlock);
-
- return (err);
-}
-
-int
-dsl_prop_get(const char *ddname, const char *propname,
- int intsz, int numints, void *buf, char *setpoint)
-{
- dsl_dir_t *dd;
- const char *tail;
- int err;
-
- err = dsl_dir_open(ddname, FTAG, &dd, &tail);
- if (err)
- return (err);
- if (tail && tail[0] != '@') {
- dsl_dir_close(dd, FTAG);
- return (ENOENT);
- }
-
- err = dsl_prop_get_ds(dd, propname, intsz, numints, buf, setpoint);
-
- dsl_dir_close(dd, FTAG);
- return (err);
-}
-
-/*
- * Get the current property value. It may have changed by the time this
- * function returns, so it is NOT safe to follow up with
- * dsl_prop_register() and assume that the value has not changed in
- * between.
- *
- * Return 0 on success, ENOENT if ddname is invalid.
- */
-int
-dsl_prop_get_integer(const char *ddname, const char *propname,
- uint64_t *valuep, char *setpoint)
-{
- return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
-}
-
-/*
- * Unregister this callback. Return 0 on success, ENOENT if ddname is
- * invalid, ENOMSG if no matching callback registered.
- */
-int
-dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
- dsl_prop_changed_cb_t *callback, void *cbarg)
-{
- dsl_dir_t *dd = ds->ds_dir;
- dsl_prop_cb_record_t *cbr;
-
- mutex_enter(&dd->dd_lock);
- for (cbr = list_head(&dd->dd_prop_cbs);
- cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
- if (cbr->cbr_ds == ds &&
- cbr->cbr_func == callback &&
- cbr->cbr_arg == cbarg &&
- strcmp(cbr->cbr_propname, propname) == 0)
- break;
- }
-
- if (cbr == NULL) {
- mutex_exit(&dd->dd_lock);
- return (ENOMSG);
- }
-
- list_remove(&dd->dd_prop_cbs, cbr);
- mutex_exit(&dd->dd_lock);
- kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
- kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
-
- /* Clean up from dsl_prop_register */
- dsl_dir_close(dd, cbr);
- return (0);
-}
-
-/*
- * Return the number of callbacks that are registered for this dataset.
- */
-int
-dsl_prop_numcb(dsl_dataset_t *ds)
-{
- dsl_dir_t *dd = ds->ds_dir;
- dsl_prop_cb_record_t *cbr;
- int num = 0;
-
- mutex_enter(&dd->dd_lock);
- for (cbr = list_head(&dd->dd_prop_cbs);
- cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
- if (cbr->cbr_ds == ds)
- num++;
- }
- mutex_exit(&dd->dd_lock);
-
- return (num);
-}
-
-static void
-dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
- const char *propname, uint64_t value, int first)
-{
- dsl_dir_t *dd;
- dsl_prop_cb_record_t *cbr;
- objset_t *mos = dp->dp_meta_objset;
- zap_cursor_t zc;
- zap_attribute_t za;
- int err;
-
- ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
- err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
- if (err)
- return;
-
- if (!first) {
- /*
- * If the prop is set here, then this change is not
- * being inherited here or below; stop the recursion.
- */
- err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
- 8, 1, &value);
- if (err == 0) {
- dsl_dir_close(dd, FTAG);
- return;
- }
- ASSERT3U(err, ==, ENOENT);
- }
-
- mutex_enter(&dd->dd_lock);
- for (cbr = list_head(&dd->dd_prop_cbs);
- cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
- if (strcmp(cbr->cbr_propname, propname) == 0) {
- cbr->cbr_func(cbr->cbr_arg, value);
- }
- }
- mutex_exit(&dd->dd_lock);
-
- for (zap_cursor_init(&zc, mos,
- dd->dd_phys->dd_child_dir_zapobj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- /* XXX recursion could blow stack; esp. za! */
- dsl_prop_changed_notify(dp, za.za_first_integer,
- propname, value, FALSE);
- }
- zap_cursor_fini(&zc);
- dsl_dir_close(dd, FTAG);
-}
-
-struct prop_set_arg {
- const char *name;
- int intsz;
- int numints;
- const void *buf;
-};
-
-static void
-dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- struct prop_set_arg *psa = arg2;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
- uint64_t intval;
- int isint;
-
- isint = (dodefault(psa->name, 8, 1, &intval) == 0);
-
- if (psa->numints == 0) {
- int err = zap_remove(mos, zapobj, psa->name, tx);
- ASSERT(err == 0 || err == ENOENT);
- if (isint) {
- VERIFY(0 == dsl_prop_get_impl(dd->dd_parent,
- psa->name, 8, 1, &intval, NULL));
- }
- } else {
- VERIFY(0 == zap_update(mos, zapobj, psa->name,
- psa->intsz, psa->numints, psa->buf, tx));
- if (isint)
- intval = *(uint64_t *)psa->buf;
- }
-
- if (isint) {
- dsl_prop_changed_notify(dd->dd_pool,
- dd->dd_object, psa->name, intval, TRUE);
- }
-}
-
-int
-dsl_prop_set_dd(dsl_dir_t *dd, const char *propname,
- int intsz, int numints, const void *buf)
-{
- struct prop_set_arg psa;
-
- psa.name = propname;
- psa.intsz = intsz;
- psa.numints = numints;
- psa.buf = buf;
-
- return (dsl_sync_task_do(dd->dd_pool,
- NULL, dsl_prop_set_sync, dd, &psa, 2));
-}
-
-int
-dsl_prop_set(const char *ddname, const char *propname,
- int intsz, int numints, const void *buf)
-{
- dsl_dir_t *dd;
- int err;
-
- /*
- * We must do these checks before we get to the syncfunc, since
- * it can't fail.
- */
- if (strlen(propname) >= ZAP_MAXNAMELEN)
- return (ENAMETOOLONG);
- if (intsz * numints >= ZAP_MAXVALUELEN)
- return (E2BIG);
-
- err = dsl_dir_open(ddname, FTAG, &dd, NULL);
- if (err)
- return (err);
- err = dsl_prop_set_dd(dd, propname, intsz, numints, buf);
- dsl_dir_close(dd, FTAG);
- return (err);
-}
-
-/*
- * Iterate over all properties for this dataset and return them in an nvlist.
- */
-int
-dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
-{
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
- dsl_dir_t *dd = ds->ds_dir;
- int err = 0;
- dsl_pool_t *dp;
- objset_t *mos;
-
- if (dsl_dataset_is_snapshot(ds)) {
- VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- return (0);
- }
-
- VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- dp = dd->dd_pool;
- mos = dp->dp_meta_objset;
-
- rw_enter(&dp->dp_config_rwlock, RW_READER);
- for (; dd != NULL; dd = dd->dd_parent) {
- char setpoint[MAXNAMELEN];
- zap_cursor_t zc;
- zap_attribute_t za;
-
- dsl_dir_name(dd, setpoint);
-
- for (zap_cursor_init(&zc, mos, dd->dd_phys->dd_props_zapobj);
- (err = zap_cursor_retrieve(&zc, &za)) == 0;
- zap_cursor_advance(&zc)) {
- nvlist_t *propval;
- zfs_prop_t prop;
- /*
- * Skip non-inheritable properties found on ancestors.
- */
- if ((prop = zfs_name_to_prop(za.za_name)) !=
- ZFS_PROP_INVAL && !zfs_prop_inheritable(prop) &&
- dd != ds->ds_dir)
- continue;
-
- if (nvlist_lookup_nvlist(*nvp, za.za_name,
- &propval) == 0)
- continue;
-
- VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME,
- KM_SLEEP) == 0);
- if (za.za_integer_length == 1) {
- /*
- * String property
- */
- char *tmp = kmem_alloc(za.za_num_integers,
- KM_SLEEP);
- err = zap_lookup(mos,
- dd->dd_phys->dd_props_zapobj,
- za.za_name, 1, za.za_num_integers,
- tmp);
- if (err != 0) {
- kmem_free(tmp, za.za_num_integers);
- break;
- }
- VERIFY(nvlist_add_string(propval,
- ZFS_PROP_VALUE, tmp) == 0);
- kmem_free(tmp, za.za_num_integers);
- } else {
- /*
- * Integer property
- */
- ASSERT(za.za_integer_length == 8);
- (void) nvlist_add_uint64(propval,
- ZFS_PROP_VALUE, za.za_first_integer);
- }
-
- VERIFY(nvlist_add_string(propval,
- ZFS_PROP_SOURCE, setpoint) == 0);
- VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
- propval) == 0);
- nvlist_free(propval);
- }
- zap_cursor_fini(&zc);
-
- if (err != ENOENT)
- break;
- err = 0;
- }
- rw_exit(&dp->dp_config_rwlock);
-
- return (err);
-}
-
-void
-dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value)
-{
- nvlist_t *propval;
-
- VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_uint64(propval, ZFS_PROP_VALUE, value) == 0);
- VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
- nvlist_free(propval);
-}
-
-void
-dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
-{
- nvlist_t *propval;
-
- VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_string(propval, ZFS_PROP_VALUE, value) == 0);
- VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
- nvlist_free(propval);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
deleted file mode 100644
index 17deb56..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_synctask.h>
-
-#define DST_AVG_BLKSHIFT 14
-
-/* ARGSUSED */
-static int
-dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- return (0);
-}
-
-dsl_sync_task_group_t *
-dsl_sync_task_group_create(dsl_pool_t *dp)
-{
- dsl_sync_task_group_t *dstg;
-
- dstg = kmem_zalloc(sizeof (dsl_sync_task_group_t), KM_SLEEP);
- list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t),
- offsetof(dsl_sync_task_t, dst_node));
- dstg->dstg_pool = dp;
-
- return (dstg);
-}
-
-void
-dsl_sync_task_create(dsl_sync_task_group_t *dstg,
- dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
- void *arg1, void *arg2, int blocks_modified)
-{
- dsl_sync_task_t *dst;
-
- if (checkfunc == NULL)
- checkfunc = dsl_null_checkfunc;
- dst = kmem_zalloc(sizeof (dsl_sync_task_t), KM_SLEEP);
- dst->dst_checkfunc = checkfunc;
- dst->dst_syncfunc = syncfunc;
- dst->dst_arg1 = arg1;
- dst->dst_arg2 = arg2;
- list_insert_tail(&dstg->dstg_tasks, dst);
-
- dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT;
-}
-
-int
-dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg)
-{
- dmu_tx_t *tx;
- uint64_t txg;
- dsl_sync_task_t *dst;
-
-top:
- tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir);
- VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
-
- txg = dmu_tx_get_txg(tx);
-
- /* Do a preliminary error check. */
- dstg->dstg_err = 0;
- rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER);
- for (dst = list_head(&dstg->dstg_tasks); dst;
- dst = list_next(&dstg->dstg_tasks, dst)) {
-#ifdef ZFS_DEBUG
- /*
- * Only check half the time; otherwise, the sync-context
- * check would almost never fail.
- */
- if (spa_get_random(2) == 0)
- continue;
-#endif
- dst->dst_err =
- dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
- if (dst->dst_err)
- dstg->dstg_err = dst->dst_err;
- }
- rw_exit(&dstg->dstg_pool->dp_config_rwlock);
-
- if (dstg->dstg_err) {
- dmu_tx_commit(tx);
- return (dstg->dstg_err);
- }
-
- VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
-
- dmu_tx_commit(tx);
-
- txg_wait_synced(dstg->dstg_pool, txg);
-
- if (dstg->dstg_err == EAGAIN)
- goto top;
-
- return (dstg->dstg_err);
-}
-
-void
-dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg)
-{
- dsl_sync_task_t *dst;
-
- while (dst = list_head(&dstg->dstg_tasks)) {
- list_remove(&dstg->dstg_tasks, dst);
- kmem_free(dst, sizeof (dsl_sync_task_t));
- }
- kmem_free(dstg, sizeof (dsl_sync_task_group_t));
-}
-
-void
-dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
-{
- dsl_sync_task_t *dst;
- void *tr_cookie;
-
- ASSERT3U(dstg->dstg_err, ==, 0);
-
- /*
- * Check for sufficient space.
- */
- dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir,
- dstg->dstg_space, dstg->dstg_space * 3, 0, &tr_cookie, tx);
- /* don't bother trying again */
- if (dstg->dstg_err == ERESTART)
- dstg->dstg_err = EAGAIN;
- if (dstg->dstg_err)
- return;
-
- /*
- * Check for errors by calling checkfuncs.
- */
- rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_WRITER);
- for (dst = list_head(&dstg->dstg_tasks); dst;
- dst = list_next(&dstg->dstg_tasks, dst)) {
- dst->dst_err =
- dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
- if (dst->dst_err)
- dstg->dstg_err = dst->dst_err;
- }
-
- if (dstg->dstg_err == 0) {
- /*
- * Execute sync tasks.
- */
- for (dst = list_head(&dstg->dstg_tasks); dst;
- dst = list_next(&dstg->dstg_tasks, dst)) {
- dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx);
- }
- }
- rw_exit(&dstg->dstg_pool->dp_config_rwlock);
-
- dsl_dir_tempreserve_clear(tr_cookie, tx);
-}
-
-int
-dsl_sync_task_do(dsl_pool_t *dp,
- dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
- void *arg1, void *arg2, int blocks_modified)
-{
- dsl_sync_task_group_t *dstg;
- int err;
-
- dstg = dsl_sync_task_group_create(dp);
- dsl_sync_task_create(dstg, checkfunc, syncfunc,
- arg1, arg2, blocks_modified);
- err = dsl_sync_task_group_wait(dstg);
- dsl_sync_task_group_destroy(dstg);
- return (err);
-}
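For context, here is a hedged sketch of the caller side of this API; the example_* names are hypothetical, and only dsl_sync_task_do() and the dsl_checkfunc_t/dsl_syncfunc_t signatures come from the file above. The key contract: checkfunc may fail and runs both in open context (the preliminary pass in dsl_sync_task_group_wait()) and again in syncing context under dp_config_rwlock, so it must be repeatable; syncfunc must not fail and runs once the checks pass, in syncing context.

	#include <sys/dmu_tx.h>
	#include <sys/dsl_pool.h>
	#include <sys/dsl_synctask.h>

	struct example_arg {
		uint64_t ea_newvalue;
	};

	/*
	 * May fail; runs in open context and again in syncing context
	 * under dp_config_rwlock, so it must be repeatable.
	 */
	static int
	example_check(void *arg1, void *arg2, dmu_tx_t *tx)
	{
		return (0);	/* or an errno to abort the whole group */
	}

	/* Must not fail; runs in syncing context. */
	static void
	example_sync(void *arg1, void *arg2, dmu_tx_t *tx)
	{
	}

	static int
	example_do(dsl_pool_t *dp, void *obj, struct example_arg *ea)
	{
		/* the final 1 estimates blocks modified, for reservation */
		return (dsl_sync_task_do(dp, example_check, example_sync,
		    obj, ea, 1));
	}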
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c b/sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c
deleted file mode 100644
index edda3c9..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/sysmacros.h>
-#include <sys/byteorder.h>
-#include <sys/spa.h>
-
-void
-fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
- const uint64_t *ip = buf;
- const uint64_t *ipend = ip + (size / sizeof (uint64_t));
- uint64_t a0, b0, a1, b1;
-
- for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
- a0 += ip[0];
- a1 += ip[1];
- b0 += a0;
- b1 += a1;
- }
-
- ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
-}
-
-void
-fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
- const uint64_t *ip = buf;
- const uint64_t *ipend = ip + (size / sizeof (uint64_t));
- uint64_t a0, b0, a1, b1;
-
- for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
- a0 += BSWAP_64(ip[0]);
- a1 += BSWAP_64(ip[1]);
- b0 += a0;
- b1 += a1;
- }
-
- ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
-}
-
-void
-fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
- const uint32_t *ip = buf;
- const uint32_t *ipend = ip + (size / sizeof (uint32_t));
- uint64_t a, b, c, d;
-
- for (a = b = c = d = 0; ip < ipend; ip++) {
- a += ip[0];
- b += a;
- c += b;
- d += c;
- }
-
- ZIO_SET_CHECKSUM(zcp, a, b, c, d);
-}
-
-void
-fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
- const uint32_t *ip = buf;
- const uint32_t *ipend = ip + (size / sizeof (uint32_t));
- uint64_t a, b, c, d;
-
- for (a = b = c = d = 0; ip < ipend; ip++) {
- a += BSWAP_32(ip[0]);
- b += a;
- c += b;
- d += c;
- }
-
- ZIO_SET_CHECKSUM(zcp, a, b, c, d);
-}
-
-void
-fletcher_4_incremental_native(const void *buf, uint64_t size,
- zio_cksum_t *zcp)
-{
- const uint32_t *ip = buf;
- const uint32_t *ipend = ip + (size / sizeof (uint32_t));
- uint64_t a, b, c, d;
-
- a = zcp->zc_word[0];
- b = zcp->zc_word[1];
- c = zcp->zc_word[2];
- d = zcp->zc_word[3];
-
- for (; ip < ipend; ip++) {
- a += ip[0];
- b += a;
- c += b;
- d += c;
- }
-
- ZIO_SET_CHECKSUM(zcp, a, b, c, d);
-}
-
-void
-fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
- zio_cksum_t *zcp)
-{
- const uint32_t *ip = buf;
- const uint32_t *ipend = ip + (size / sizeof (uint32_t));
- uint64_t a, b, c, d;
-
- a = zcp->zc_word[0];
- b = zcp->zc_word[1];
- c = zcp->zc_word[2];
- d = zcp->zc_word[3];
-
- for (; ip < ipend; ip++) {
- a += BSWAP_32(ip[0]);
- b += a;
- c += b;
- d += c;
- }
-
- ZIO_SET_CHECKSUM(zcp, a, b, c, d);
-}
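Carrying the four running words in zc_word[] is what makes the incremental variants work: summing a buffer in one pass or in several chunked passes yields the same checksum. The userland harness below is a sketch added for this note; it re-implements the same Fletcher-4 loop rather than linking against the kernel file.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	typedef struct { uint64_t zc_word[4]; } cksum_t;

	/* Same loop as fletcher_4_native()/fletcher_4_incremental_native(). */
	static void
	fletcher4(const void *buf, uint64_t size, cksum_t *zcp, int incremental)
	{
		const uint32_t *ip = buf;
		const uint32_t *ipend = ip + (size / sizeof (uint32_t));
		uint64_t a, b, c, d;

		if (incremental) {
			a = zcp->zc_word[0];
			b = zcp->zc_word[1];
			c = zcp->zc_word[2];
			d = zcp->zc_word[3];
		} else {
			a = b = c = d = 0;
		}
		for (; ip < ipend; ip++) {
			a += ip[0];
			b += a;
			c += b;
			d += c;
		}
		zcp->zc_word[0] = a;
		zcp->zc_word[1] = b;
		zcp->zc_word[2] = c;
		zcp->zc_word[3] = d;
	}

	int
	main(void)
	{
		uint32_t data[256];
		cksum_t one, two;
		int i;

		for (i = 0; i < 256; i++)
			data[i] = (uint32_t)i * 2654435761u;

		fletcher4(data, sizeof (data), &one, 0);	   /* one pass */
		fletcher4(data, sizeof (data) / 2, &two, 0);	   /* 1st half */
		fletcher4(data + 128, sizeof (data) / 2, &two, 1); /* 2nd half */

		printf("%s\n", memcmp(&one, &two, sizeof (one)) == 0 ?
		    "match" : "mismatch");
		return (0);
	}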
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c b/sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c
deleted file mode 100644
index b257d4a..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/debug.h>
-#include <sys/types.h>
-#include <sys/zmod.h>
-
-#ifdef _KERNEL
-#include <sys/systm.h>
-#else
-#include <strings.h>
-#endif
-
-size_t
-gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
-{
- size_t dstlen = d_len;
-
- ASSERT(d_len <= s_len);
-
- if (z_compress_level(d_start, &dstlen, s_start, s_len, n) != Z_OK) {
- if (d_len != s_len)
- return (s_len);
-
- bcopy(s_start, d_start, s_len);
- return (s_len);
- }
-
- return (dstlen);
-}
-
-/*ARGSUSED*/
-int
-gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
-{
- size_t dstlen = d_len;
-
- ASSERT(d_len >= s_len);
-
- if (z_uncompress(d_start, &dstlen, s_start, s_len) != Z_OK)
- return (-1);
-
- return (0);
-}
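z_compress_level() is the kernel's zmod wrapper around zlib; in userland the same fallback contract can be sketched with zlib's public compress2(). The analog below is hypothetical (not part of the tree) but shows what callers rely on: a return value equal to s_len means the block could not be shrunk and should be stored uncompressed.

	#include <stdio.h>
	#include <string.h>
	#include <zlib.h>

	/* Userland analog of gzip_compress() above, via compress2(). */
	static size_t
	gzip_compress_user(void *s_start, void *d_start, size_t s_len,
	    size_t d_len, int n)
	{
		uLongf dstlen = d_len;

		if (compress2(d_start, &dstlen, s_start, s_len, n) != Z_OK) {
			if (d_len != s_len)
				return (s_len);
			memcpy(d_start, s_start, s_len);	/* store raw */
			return (s_len);
		}
		return ((size_t)dstlen);
	}

	int
	main(void)
	{
		char src[1024], dst[1024];
		size_t clen;

		memset(src, 'z', sizeof (src));	/* trivially compressible */
		clen = gzip_compress_user(src, dst, sizeof (src),
		    sizeof (dst), 6);
		printf("%zu -> %zu bytes%s\n", sizeof (src), clen,
		    clen == sizeof (src) ? " (stored raw)" : "");
		return (0);
	}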
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c b/sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
deleted file mode 100644
index a88b85c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * We keep our own copy of this algorithm for 2 main reasons:
- * 1. If we didn't, anyone modifying common/os/compress.c would
- * directly break our on disk format
- * 2. Our version of lzjb does not have a number of checks that the
- * common/os version needs and uses
- * In particular, we are adding the "feature" that compress() can
- * take a destination buffer size and return s_len (i.e., the input
- * size, meaning "no benefit") if the data will not compress to d_len
- * or less.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/types.h>
-
-#define MATCH_BITS 6
-#define MATCH_MIN 3
-#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
-#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
-#define LEMPEL_SIZE 256
-
-/*ARGSUSED*/
-size_t
-lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
-{
- uchar_t *src = s_start;
- uchar_t *dst = d_start;
- uchar_t *cpy, *copymap;
- int copymask = 1 << (NBBY - 1);
- int mlen, offset;
- uint16_t *hp;
- uint16_t lempel[LEMPEL_SIZE]; /* uninitialized; see above */
-
- while (src < (uchar_t *)s_start + s_len) {
- if ((copymask <<= 1) == (1 << NBBY)) {
- if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
- if (d_len != s_len)
- return (s_len);
- mlen = s_len;
- for (src = s_start, dst = d_start; mlen; mlen--)
- *dst++ = *src++;
- return (s_len);
- }
- copymask = 1;
- copymap = dst;
- *dst++ = 0;
- }
- if (src > (uchar_t *)s_start + s_len - MATCH_MAX) {
- *dst++ = *src++;
- continue;
- }
- hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) &
- (LEMPEL_SIZE - 1)];
- offset = (intptr_t)(src - *hp) & OFFSET_MASK;
- *hp = (uint16_t)(uintptr_t)src;
- cpy = src - offset;
- if (cpy >= (uchar_t *)s_start && cpy != src &&
- src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) {
- *copymap |= copymask;
- for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
- if (src[mlen] != cpy[mlen])
- break;
- *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
- (offset >> NBBY);
- *dst++ = (uchar_t)offset;
- src += mlen;
- } else {
- *dst++ = *src++;
- }
- }
- return (dst - (uchar_t *)d_start);
-}
-
-/*ARGSUSED*/
-int
-lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
-{
- uchar_t *src = s_start;
- uchar_t *dst = d_start;
- uchar_t *d_end = (uchar_t *)d_start + d_len;
- uchar_t *cpy, copymap;
- int copymask = 1 << (NBBY - 1);
-
- while (dst < d_end) {
- if ((copymask <<= 1) == (1 << NBBY)) {
- copymask = 1;
- copymap = *src++;
- }
- if (copymap & copymask) {
- int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
- int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
- src += 2;
- if ((cpy = dst - offset) < (uchar_t *)d_start)
- return (-1);
- while (--mlen >= 0 && dst < d_end)
- *dst++ = *cpy++;
- } else {
- *dst++ = *src++;
- }
- }
- return (0);
-}
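A possible round-trip harness for the two routines above, added for illustration; it assumes lzjb_compress() and lzjb_decompress() are compiled alongside it (their file needs the kernel-style headers it already includes) and exercises both the compressed path and the "no benefit" return.

	#include <stdio.h>
	#include <string.h>

	/* Prototypes of the routines above; link against lzjb.c. */
	size_t lzjb_compress(void *, void *, size_t, size_t, int);
	int lzjb_decompress(void *, void *, size_t, size_t, int);

	int
	main(void)
	{
		char src[4096], comp[4096], back[4096];
		size_t clen;

		memset(src, 'A', sizeof (src));	/* highly compressible */

		clen = lzjb_compress(src, comp, sizeof (src), sizeof (comp), 0);
		if (clen == sizeof (src)) {
			/* no benefit: a caller would store the block raw */
			printf("incompressible\n");
			return (0);
		}
		if (lzjb_decompress(comp, back, clen, sizeof (back), 0) != 0 ||
		    memcmp(src, back, sizeof (src)) != 0) {
			printf("round-trip failed\n");
			return (1);
		}
		printf("round-trip ok: %zu -> %zu bytes\n", sizeof (src), clen);
		return (0);
	}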
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
deleted file mode 100644
index 0dba134..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ /dev/null
@@ -1,1023 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/space_map.h>
-#include <sys/metaslab_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-
-uint64_t metaslab_aliquot = 512ULL << 10;
-
-/*
- * ==========================================================================
- * Metaslab classes
- * ==========================================================================
- */
-metaslab_class_t *
-metaslab_class_create(void)
-{
- metaslab_class_t *mc;
-
- mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
-
- mc->mc_rotor = NULL;
-
- return (mc);
-}
-
-void
-metaslab_class_destroy(metaslab_class_t *mc)
-{
- metaslab_group_t *mg;
-
- while ((mg = mc->mc_rotor) != NULL) {
- metaslab_class_remove(mc, mg);
- metaslab_group_destroy(mg);
- }
-
- kmem_free(mc, sizeof (metaslab_class_t));
-}
-
-void
-metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
-{
- metaslab_group_t *mgprev, *mgnext;
-
- ASSERT(mg->mg_class == NULL);
-
- if ((mgprev = mc->mc_rotor) == NULL) {
- mg->mg_prev = mg;
- mg->mg_next = mg;
- } else {
- mgnext = mgprev->mg_next;
- mg->mg_prev = mgprev;
- mg->mg_next = mgnext;
- mgprev->mg_next = mg;
- mgnext->mg_prev = mg;
- }
- mc->mc_rotor = mg;
- mg->mg_class = mc;
-}
-
-void
-metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
-{
- metaslab_group_t *mgprev, *mgnext;
-
- ASSERT(mg->mg_class == mc);
-
- mgprev = mg->mg_prev;
- mgnext = mg->mg_next;
-
- if (mg == mgnext) {
- mc->mc_rotor = NULL;
- } else {
- mc->mc_rotor = mgnext;
- mgprev->mg_next = mgnext;
- mgnext->mg_prev = mgprev;
- }
-
- mg->mg_prev = NULL;
- mg->mg_next = NULL;
- mg->mg_class = NULL;
-}
-
-/*
- * ==========================================================================
- * Metaslab groups
- * ==========================================================================
- */
-static int
-metaslab_compare(const void *x1, const void *x2)
-{
- const metaslab_t *m1 = x1;
- const metaslab_t *m2 = x2;
-
- if (m1->ms_weight < m2->ms_weight)
- return (1);
- if (m1->ms_weight > m2->ms_weight)
- return (-1);
-
- /*
- * If the weights are identical, use the offset to force uniqueness.
- */
- if (m1->ms_map.sm_start < m2->ms_map.sm_start)
- return (-1);
- if (m1->ms_map.sm_start > m2->ms_map.sm_start)
- return (1);
-
- ASSERT3P(m1, ==, m2);
-
- return (0);
-}
-
-metaslab_group_t *
-metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
-{
- metaslab_group_t *mg;
-
- mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
- mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&mg->mg_metaslab_tree, metaslab_compare,
- sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
- mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
- mg->mg_vd = vd;
- metaslab_class_add(mc, mg);
-
- return (mg);
-}
-
-void
-metaslab_group_destroy(metaslab_group_t *mg)
-{
- avl_destroy(&mg->mg_metaslab_tree);
- mutex_destroy(&mg->mg_lock);
- kmem_free(mg, sizeof (metaslab_group_t));
-}
-
-static void
-metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
-{
- mutex_enter(&mg->mg_lock);
- ASSERT(msp->ms_group == NULL);
- msp->ms_group = mg;
- msp->ms_weight = 0;
- avl_add(&mg->mg_metaslab_tree, msp);
- mutex_exit(&mg->mg_lock);
-}
-
-static void
-metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
-{
- mutex_enter(&mg->mg_lock);
- ASSERT(msp->ms_group == mg);
- avl_remove(&mg->mg_metaslab_tree, msp);
- msp->ms_group = NULL;
- mutex_exit(&mg->mg_lock);
-}
-
-static void
-metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
-{
- /*
- * Although in principle the weight can be any value, in
- * practice we do not use values in the range [1, 510].
- */
- ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- mutex_enter(&mg->mg_lock);
- ASSERT(msp->ms_group == mg);
- avl_remove(&mg->mg_metaslab_tree, msp);
- msp->ms_weight = weight;
- avl_add(&mg->mg_metaslab_tree, msp);
- mutex_exit(&mg->mg_lock);
-}
-
-/*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
- */
-static void
-metaslab_ff_load(space_map_t *sm)
-{
- ASSERT(sm->sm_ppd == NULL);
- sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
-}
-
-static void
-metaslab_ff_unload(space_map_t *sm)
-{
- kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
- sm->sm_ppd = NULL;
-}
-
-static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
-{
- avl_tree_t *t = &sm->sm_root;
- uint64_t align = size & -size;
- uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
- space_seg_t *ss, ssearch;
- avl_index_t where;
-
- ssearch.ss_start = *cursor;
- ssearch.ss_end = *cursor + size;
-
- ss = avl_find(t, &ssearch, &where);
- if (ss == NULL)
- ss = avl_nearest(t, where, AVL_AFTER);
-
- while (ss != NULL) {
- uint64_t offset = P2ROUNDUP(ss->ss_start, align);
-
- if (offset + size <= ss->ss_end) {
- *cursor = offset + size;
- return (offset);
- }
- ss = AVL_NEXT(t, ss);
- }
-
- /*
- * If we know we've searched the whole map (*cursor == 0), give up.
- * Otherwise, reset the cursor to the beginning and try again.
- */
- if (*cursor == 0)
- return (-1ULL);
-
- *cursor = 0;
- return (metaslab_ff_alloc(sm, size));
-}
-
-/* ARGSUSED */
-static void
-metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
-{
- /* No need to update cursor */
-}
-
-/* ARGSUSED */
-static void
-metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
-{
- /* No need to update cursor */
-}
-
-static space_map_ops_t metaslab_ff_ops = {
- metaslab_ff_load,
- metaslab_ff_unload,
- metaslab_ff_alloc,
- metaslab_ff_claim,
- metaslab_ff_free
-};
-
-/*
- * ==========================================================================
- * Metaslabs
- * ==========================================================================
- */
-metaslab_t *
-metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
- uint64_t start, uint64_t size, uint64_t txg)
-{
- vdev_t *vd = mg->mg_vd;
- metaslab_t *msp;
-
- msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
- mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
-
- msp->ms_smo_syncing = *smo;
-
- /*
- * We create the main space map here, but we don't create the
- * allocmaps and freemaps until metaslab_sync_done(). This serves
- * two purposes: it allows metaslab_sync_done() to detect the
- * addition of new space; and for debugging, it ensures that we would
- * take a data fault on any attempt to use this metaslab before it's ready.
- */
- space_map_create(&msp->ms_map, start, size,
- vd->vdev_ashift, &msp->ms_lock);
-
- metaslab_group_add(mg, msp);
-
- /*
- * If we're opening an existing pool (txg == 0) or creating
- * a new one (txg == TXG_INITIAL), all space is available now.
- * If we're adding space to an existing pool, the new space
- * does not become available until after this txg has synced.
- */
- if (txg <= TXG_INITIAL)
- metaslab_sync_done(msp, 0);
-
- if (txg != 0) {
- /*
- * The vdev is dirty, but the metaslab isn't -- it just needs
- * to have metaslab_sync_done() invoked from vdev_sync_done().
- * [We could just dirty the metaslab, but that would cause us
- * to allocate a space map object for it, which is wasteful
- * and would mess up the locality logic in metaslab_weight().]
- */
- ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa));
- vdev_dirty(vd, 0, NULL, txg);
- vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg));
- }
-
- return (msp);
-}
-
-void
-metaslab_fini(metaslab_t *msp)
-{
- metaslab_group_t *mg = msp->ms_group;
- int t;
-
- vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
- -msp->ms_smo.smo_alloc);
-
- metaslab_group_remove(mg, msp);
-
- mutex_enter(&msp->ms_lock);
-
- space_map_unload(&msp->ms_map);
- space_map_destroy(&msp->ms_map);
-
- for (t = 0; t < TXG_SIZE; t++) {
- space_map_destroy(&msp->ms_allocmap[t]);
- space_map_destroy(&msp->ms_freemap[t]);
- }
-
- mutex_exit(&msp->ms_lock);
- mutex_destroy(&msp->ms_lock);
-
- kmem_free(msp, sizeof (metaslab_t));
-}
-
-#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
-#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
-#define METASLAB_ACTIVE_MASK \
- (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
-#define METASLAB_SMO_BONUS_MULTIPLIER 2
-
-static uint64_t
-metaslab_weight(metaslab_t *msp)
-{
- metaslab_group_t *mg = msp->ms_group;
- space_map_t *sm = &msp->ms_map;
- space_map_obj_t *smo = &msp->ms_smo;
- vdev_t *vd = mg->mg_vd;
- uint64_t weight, space;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- /*
- * The baseline weight is the metaslab's free space.
- */
- space = sm->sm_size - smo->smo_alloc;
- weight = space;
-
- /*
- * Modern disks have uniform bit density and constant angular velocity.
- * Therefore, the outer recording zones are faster (higher bandwidth)
- * than the inner zones by the ratio of outer to inner track diameter,
- * which is typically around 2:1. We account for this by assigning
- * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
- * In effect, this means that we'll select the metaslab with the most
- * free bandwidth rather than simply the one with the most free space.
- */
- weight = 2 * weight -
- ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
- ASSERT(weight >= space && weight <= 2 * space);
-
- /*
- * For locality, assign higher weight to metaslabs we've used before.
- */
- if (smo->smo_object != 0)
- weight *= METASLAB_SMO_BONUS_MULTIPLIER;
- ASSERT(weight >= space &&
- weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
-
- /*
- * If this metaslab is one we're actively using, adjust its weight to
- * make it preferable to any inactive metaslab so we'll polish it off.
- */
- weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
-
- return (weight);
-}
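To make the 2x-to-1x multiplier concrete, the small standalone sketch below (hypothetical; it reuses only the arithmetic from metaslab_weight() above) evaluates the bandwidth bias for the first and last metaslab of a 128-metaslab vdev: the first weighs in at twice its free space, the last at just over 1x, so among equally empty metaslabs the outer-track ones win.

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t space = 1ULL << 30;	/* 1GB free in each metaslab */
		uint64_t ms_count = 128;	/* metaslabs on this vdev */
		uint64_t idx;

		for (idx = 0; idx < ms_count; idx += ms_count - 1) {
			/* same expression as in metaslab_weight() above */
			uint64_t weight = 2 * space -
			    (idx * space) / ms_count;

			printf("metaslab %3llu: weight %llu (%.2fx free)\n",
			    (unsigned long long)idx,
			    (unsigned long long)weight,
			    (double)weight / (double)space);
		}
		return (0);
	}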
-
-static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
-{
- space_map_t *sm = &msp->ms_map;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- int error = space_map_load(sm, &metaslab_ff_ops,
- SM_FREE, &msp->ms_smo,
- msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
- if (error) {
- metaslab_group_sort(msp->ms_group, msp, 0);
- return (error);
- }
- metaslab_group_sort(msp->ms_group, msp,
- msp->ms_weight | activation_weight);
- }
- ASSERT(sm->sm_loaded);
- ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
-
- return (0);
-}
-
-static void
-metaslab_passivate(metaslab_t *msp, uint64_t size)
-{
- /*
- * If size < SPA_MINBLOCKSIZE, then we will not allocate from
- * this metaslab again. In that case, it had better be empty,
- * or we would be leaving space on the table.
- */
- ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
- metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
- ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
-}
-
-/*
- * Write a metaslab to disk in the context of the specified transaction group.
- */
-void
-metaslab_sync(metaslab_t *msp, uint64_t txg)
-{
- vdev_t *vd = msp->ms_group->mg_vd;
- spa_t *spa = vd->vdev_spa;
- objset_t *mos = spa->spa_meta_objset;
- space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
- space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
- space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
- space_map_t *sm = &msp->ms_map;
- space_map_obj_t *smo = &msp->ms_smo_syncing;
- dmu_buf_t *db;
- dmu_tx_t *tx;
- int t;
-
- tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
-
- /*
- * The only state that can actually be changing concurrently with
- * metaslab_sync() is the metaslab's ms_map. No other thread can
- * be modifying this txg's allocmap, freemap, freed_map, or smo.
- * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
- * We drop it whenever we call into the DMU, because the DMU
- * can call down to us (e.g. via zio_free()) at any time.
- */
- mutex_enter(&msp->ms_lock);
-
- if (smo->smo_object == 0) {
- ASSERT(smo->smo_objsize == 0);
- ASSERT(smo->smo_alloc == 0);
- mutex_exit(&msp->ms_lock);
- smo->smo_object = dmu_object_alloc(mos,
- DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
- DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
- ASSERT(smo->smo_object != 0);
- dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
- (sm->sm_start >> vd->vdev_ms_shift),
- sizeof (uint64_t), &smo->smo_object, tx);
- mutex_enter(&msp->ms_lock);
- }
-
- space_map_walk(freemap, space_map_add, freed_map);
-
- if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
- 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
- /*
- * The in-core space map representation is twice as compact
- * as the on-disk one, so it's time to condense the latter
- * by generating a pure allocmap from first principles.
- *
- * This metaslab is 100% allocated,
- * minus the content of the in-core map (sm),
- * minus what's been freed this txg (freed_map),
- * minus allocations from txgs in the future
- * (because they haven't been committed yet).
- */
- space_map_vacate(allocmap, NULL, NULL);
- space_map_vacate(freemap, NULL, NULL);
-
- space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);
-
- space_map_walk(sm, space_map_remove, allocmap);
- space_map_walk(freed_map, space_map_remove, allocmap);
-
- for (t = 1; t < TXG_CONCURRENT_STATES; t++)
- space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
- space_map_remove, allocmap);
-
- mutex_exit(&msp->ms_lock);
- space_map_truncate(smo, mos, tx);
- mutex_enter(&msp->ms_lock);
- }
-
- space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
- space_map_sync(freemap, SM_FREE, smo, mos, tx);
-
- mutex_exit(&msp->ms_lock);
-
- VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
- dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, ==, sizeof (*smo));
- bcopy(smo, db->db_data, db->db_size);
- dmu_buf_rele(db, FTAG);
-
- dmu_tx_commit(tx);
-}
-
-/*
- * Called after a transaction group has completely synced to mark
- * all of the metaslab's free space as usable.
- */
-void
-metaslab_sync_done(metaslab_t *msp, uint64_t txg)
-{
- space_map_obj_t *smo = &msp->ms_smo;
- space_map_obj_t *smosync = &msp->ms_smo_syncing;
- space_map_t *sm = &msp->ms_map;
- space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
- metaslab_group_t *mg = msp->ms_group;
- vdev_t *vd = mg->mg_vd;
- int t;
-
- mutex_enter(&msp->ms_lock);
-
- /*
- * If this metaslab is just becoming available, initialize its
- * allocmaps and freemaps and add its capacity to the vdev.
- */
- if (freed_map->sm_size == 0) {
- for (t = 0; t < TXG_SIZE; t++) {
- space_map_create(&msp->ms_allocmap[t], sm->sm_start,
- sm->sm_size, sm->sm_shift, sm->sm_lock);
- space_map_create(&msp->ms_freemap[t], sm->sm_start,
- sm->sm_size, sm->sm_shift, sm->sm_lock);
- }
- vdev_space_update(vd, sm->sm_size, 0);
- }
-
- vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc);
-
- ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
- ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
-
- /*
- * If there's a space_map_load() in progress, wait for it to complete
- * so that we have a consistent view of the in-core space map.
- * Then, add everything we freed in this txg to the map.
- */
- space_map_load_wait(sm);
- space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm);
-
- *smo = *smosync;
-
- /*
- * If the map is loaded but no longer active, evict it as soon as all
- * future allocations have synced. (If we unloaded it now and then
- * loaded a moment later, the map wouldn't reflect those allocations.)
- */
- if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- int evictable = 1;
-
- for (t = 1; t < TXG_CONCURRENT_STATES; t++)
- if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
- evictable = 0;
-
- if (evictable)
- space_map_unload(sm);
- }
-
- metaslab_group_sort(mg, msp, metaslab_weight(msp));
-
- mutex_exit(&msp->ms_lock);
-}
-
-static uint64_t
-metaslab_distance(metaslab_t *msp, dva_t *dva)
-{
- uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
- uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
- uint64_t start = msp->ms_map.sm_start >> ms_shift;
-
- if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
- return (1ULL << 63);
-
- if (offset < start)
- return ((start - offset) << ms_shift);
- if (offset > start)
- return ((offset - start) << ms_shift);
- return (0);
-}
-
-static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
- uint64_t min_distance, dva_t *dva, int d)
-{
- metaslab_t *msp = NULL;
- uint64_t offset = -1ULL;
- avl_tree_t *t = &mg->mg_metaslab_tree;
- uint64_t activation_weight;
- uint64_t target_distance;
- int i;
-
- activation_weight = METASLAB_WEIGHT_PRIMARY;
- for (i = 0; i < d; i++)
- if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id)
- activation_weight = METASLAB_WEIGHT_SECONDARY;
-
- for (;;) {
- mutex_enter(&mg->mg_lock);
- for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
- if (msp->ms_weight < size) {
- mutex_exit(&mg->mg_lock);
- return (-1ULL);
- }
-
- if (activation_weight == METASLAB_WEIGHT_PRIMARY)
- break;
-
- target_distance = min_distance +
- (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
-
- for (i = 0; i < d; i++)
- if (metaslab_distance(msp, &dva[i]) <
- target_distance)
- break;
- if (i == d)
- break;
- }
- mutex_exit(&mg->mg_lock);
- if (msp == NULL)
- return (-1ULL);
-
- mutex_enter(&msp->ms_lock);
-
- /*
- * Ensure that the metaslab we have selected is still
- * capable of handling our request. It's possible that
- * another thread may have changed the weight while we
- * were blocked on the metaslab lock.
- */
- if (msp->ms_weight < size) {
- mutex_exit(&msp->ms_lock);
- continue;
- }
-
- if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
- activation_weight == METASLAB_WEIGHT_PRIMARY) {
- metaslab_passivate(msp,
- msp->ms_weight & ~METASLAB_ACTIVE_MASK);
- mutex_exit(&msp->ms_lock);
- continue;
- }
-
- if (metaslab_activate(msp, activation_weight) != 0) {
- mutex_exit(&msp->ms_lock);
- continue;
- }
-
- if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
- break;
-
- metaslab_passivate(msp, size - 1);
-
- mutex_exit(&msp->ms_lock);
- }
-
- if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
- vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
-
- space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
-
- mutex_exit(&msp->ms_lock);
-
- return (offset);
-}
-
-/*
- * Allocate a block for the specified i/o.
- */
-static int
-metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
- dva_t *hintdva, uint64_t txg, boolean_t hintdva_avoid)
-{
- metaslab_group_t *mg, *rotor;
- metaslab_class_t *mc;
- vdev_t *vd;
- int dshift = 3;
- int all_zero;
- uint64_t offset = -1ULL;
- uint64_t asize;
- uint64_t distance;
-
- ASSERT(!DVA_IS_VALID(&dva[d]));
-
- mc = spa_metaslab_class_select(spa);
-
- /*
- * Start at the rotor and loop through all mgs until we find something.
- * Note that there's no locking on mc_rotor or mc_allocated because
- * nothing actually breaks if we miss a few updates -- we just won't
- * allocate quite as evenly. It all balances out over time.
- *
- * If we are doing ditto or log blocks, try to spread them across
- * consecutive vdevs. If we're forced to reuse a vdev before we've
- * allocated all of our ditto blocks, then try to spread them out on
- * that vdev as much as possible. If it turns out to not be possible,
- * gradually lower our standards until anything becomes acceptable.
- * Also, allocating on consecutive vdevs (as opposed to random vdevs)
- * gives us hope of containing our fault domains to something we're
- * able to reason about. Otherwise, any two top-level vdev failures
- * will guarantee the loss of data. With consecutive allocation,
- * only two adjacent top-level vdev failures will result in data loss.
- *
- * If we are doing gang blocks (hintdva is non-NULL), try to keep
- * ourselves on the same vdev as our gang block header. That
- * way, we can hope for locality in vdev_cache, plus it makes our
- * fault domains something tractable.
- */
- if (hintdva) {
- vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
- if (hintdva_avoid)
- mg = vd->vdev_mg->mg_next;
- else
- mg = vd->vdev_mg;
- } else if (d != 0) {
- vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
- mg = vd->vdev_mg->mg_next;
- } else {
- mg = mc->mc_rotor;
- }
- rotor = mg;
-
-top:
- all_zero = B_TRUE;
- do {
- vd = mg->mg_vd;
-
- distance = vd->vdev_asize >> dshift;
- if (distance <= (1ULL << vd->vdev_ms_shift))
- distance = 0;
- else
- all_zero = B_FALSE;
-
- asize = vdev_psize_to_asize(vd, psize);
- ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
-
- offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
- if (offset != -1ULL) {
- /*
- * If we've just selected this metaslab group,
- * figure out whether the corresponding vdev is
- * over- or under-used relative to the pool,
- * and set an allocation bias to even it out.
- */
- if (mc->mc_allocated == 0) {
- vdev_stat_t *vs = &vd->vdev_stat;
- uint64_t alloc, space;
- int64_t vu, su;
-
- alloc = spa_get_alloc(spa);
- space = spa_get_space(spa);
-
- /*
- * Determine percent used in units of 0..1024.
- * (This is just to avoid floating point.)
- */
- vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
- su = (alloc << 10) / (space + 1);
-
- /*
- * Bias by at most +/- 25% of the aliquot.
- */
- mg->mg_bias = ((su - vu) *
- (int64_t)mg->mg_aliquot) / (1024 * 4);
- }
-
- if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
- mg->mg_aliquot + mg->mg_bias) {
- mc->mc_rotor = mg->mg_next;
- mc->mc_allocated = 0;
- }
-
- DVA_SET_VDEV(&dva[d], vd->vdev_id);
- DVA_SET_OFFSET(&dva[d], offset);
- DVA_SET_GANG(&dva[d], 0);
- DVA_SET_ASIZE(&dva[d], asize);
-
- return (0);
- }
- mc->mc_rotor = mg->mg_next;
- mc->mc_allocated = 0;
- } while ((mg = mg->mg_next) != rotor);
-
- if (!all_zero) {
- dshift++;
- ASSERT(dshift < 64);
- goto top;
- }
-
- bzero(&dva[d], sizeof (dva_t));
-
- return (ENOSPC);
-}
-
-/*
- * Free the block represented by DVA in the context of the specified
- * transaction group.
- */
-static void
-metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
-{
- uint64_t vdev = DVA_GET_VDEV(dva);
- uint64_t offset = DVA_GET_OFFSET(dva);
- uint64_t size = DVA_GET_ASIZE(dva);
- vdev_t *vd;
- metaslab_t *msp;
-
- ASSERT(DVA_IS_VALID(dva));
-
- if (txg > spa_freeze_txg(spa))
- return;
-
- if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
- (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
- cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
- (u_longlong_t)vdev, (u_longlong_t)offset);
- ASSERT(0);
- return;
- }
-
- msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-
- if (DVA_GET_GANG(dva))
- size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
-
- mutex_enter(&msp->ms_lock);
-
- if (now) {
- space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
- offset, size);
- space_map_free(&msp->ms_map, offset, size);
- } else {
- if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
- vdev_dirty(vd, VDD_METASLAB, msp, txg);
- space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
-
- /*
- * verify that this region is actually allocated in
- * either a ms_allocmap or the ms_map
- */
- if (msp->ms_map.sm_loaded) {
- boolean_t allocd = B_FALSE;
- int i;
-
- if (!space_map_contains(&msp->ms_map, offset, size)) {
- allocd = B_TRUE;
- } else {
- for (i = 0; i < TXG_CONCURRENT_STATES; i++) {
- space_map_t *sm = &msp->ms_allocmap
- [(txg - i) & TXG_MASK];
- if (space_map_contains(sm,
- offset, size)) {
- allocd = B_TRUE;
- break;
- }
- }
- }
-
- if (!allocd) {
- zfs_panic_recover("freeing free segment "
- "(vdev=%llu offset=%llx size=%llx)",
- (longlong_t)vdev, (longlong_t)offset,
- (longlong_t)size);
- }
- }
-
- }
-
- mutex_exit(&msp->ms_lock);
-}
-
-/*
- * Intent log support: upon opening the pool after a crash, notify the SPA
- * of blocks that the intent log has allocated for immediate write, but
- * which are still considered free by the SPA because the last transaction
- * group didn't commit yet.
- */
-static int
-metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
-{
- uint64_t vdev = DVA_GET_VDEV(dva);
- uint64_t offset = DVA_GET_OFFSET(dva);
- uint64_t size = DVA_GET_ASIZE(dva);
- vdev_t *vd;
- metaslab_t *msp;
- int error;
-
- ASSERT(DVA_IS_VALID(dva));
-
- if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
- (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
- return (ENXIO);
-
- msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-
- if (DVA_GET_GANG(dva))
- size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
-
- mutex_enter(&msp->ms_lock);
-
- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
- if (error) {
- mutex_exit(&msp->ms_lock);
- return (error);
- }
-
- if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
- vdev_dirty(vd, VDD_METASLAB, msp, txg);
-
- space_map_claim(&msp->ms_map, offset, size);
- space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
-
- mutex_exit(&msp->ms_lock);
-
- return (0);
-}
-
-int
-metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas,
- uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid)
-{
- dva_t *dva = bp->blk_dva;
- dva_t *hintdva = hintbp->blk_dva;
- int d;
- int error = 0;
-
- ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
- ASSERT(BP_GET_NDVAS(bp) == 0);
- ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
-
- for (d = 0; d < ndvas; d++) {
- error = metaslab_alloc_dva(spa, psize, dva, d, hintdva,
- txg, hintbp_avoid);
- if (error) {
- for (d--; d >= 0; d--) {
- metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
- bzero(&dva[d], sizeof (dva_t));
- }
- return (error);
- }
- }
- ASSERT(error == 0);
- ASSERT(BP_GET_NDVAS(bp) == ndvas);
-
- return (0);
-}
-
-void
-metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
-{
- const dva_t *dva = bp->blk_dva;
- int ndvas = BP_GET_NDVAS(bp);
- int d;
-
- ASSERT(!BP_IS_HOLE(bp));
-
- for (d = 0; d < ndvas; d++)
- metaslab_free_dva(spa, &dva[d], txg, now);
-}
-
-int
-metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
-{
- const dva_t *dva = bp->blk_dva;
- int ndvas = BP_GET_NDVAS(bp);
- int d, error;
- int last_error = 0;
-
- ASSERT(!BP_IS_HOLE(bp));
-
- for (d = 0; d < ndvas; d++)
- if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
- last_error = error;
-
- return (last_error);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c
deleted file mode 100644
index 411ed46..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/refcount.h>
-
-#if defined(DEBUG) || !defined(_KERNEL)
-
-#ifdef _KERNEL
-int reference_tracking_enable = FALSE; /* runs out of memory too easily */
-#else
-int reference_tracking_enable = TRUE;
-#endif
-int reference_history = 4; /* tunable */
-
-static kmem_cache_t *reference_cache;
-static kmem_cache_t *reference_history_cache;
-
-void
-refcount_init(void)
-{
- reference_cache = kmem_cache_create("reference_cache",
- sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-
- reference_history_cache = kmem_cache_create("reference_history_cache",
- sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-}
-
-void
-refcount_fini(void)
-{
- kmem_cache_destroy(reference_cache);
- kmem_cache_destroy(reference_history_cache);
-}
-
-void
-refcount_create(refcount_t *rc)
-{
- list_create(&rc->rc_list, sizeof (reference_t),
- offsetof(reference_t, ref_link));
- list_create(&rc->rc_removed, sizeof (reference_t),
- offsetof(reference_t, ref_link));
- mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
-}
-
-void
-refcount_destroy_many(refcount_t *rc, uint64_t number)
-{
- reference_t *ref;
-
- ASSERT(rc->rc_count == number);
- while (ref = list_head(&rc->rc_list)) {
- list_remove(&rc->rc_list, ref);
- kmem_cache_free(reference_cache, ref);
- }
- list_destroy(&rc->rc_list);
-
- while (ref = list_head(&rc->rc_removed)) {
- list_remove(&rc->rc_removed, ref);
- kmem_cache_free(reference_history_cache, ref->ref_removed);
- kmem_cache_free(reference_cache, ref);
- }
- list_destroy(&rc->rc_removed);
- mutex_destroy(&rc->rc_mtx);
-}
-
-void
-refcount_destroy(refcount_t *rc)
-{
- refcount_destroy_many(rc, 0);
-}
-
-int
-refcount_is_zero(refcount_t *rc)
-{
- ASSERT(rc->rc_count >= 0);
- return (rc->rc_count == 0);
-}
-
-int64_t
-refcount_count(refcount_t *rc)
-{
- ASSERT(rc->rc_count >= 0);
- return (rc->rc_count);
-}
-
-int64_t
-refcount_add_many(refcount_t *rc, uint64_t number, void *holder)
-{
- reference_t *ref;
- int64_t count;
-
- if (reference_tracking_enable) {
- ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
- ref->ref_holder = holder;
- ref->ref_number = number;
- }
- mutex_enter(&rc->rc_mtx);
- ASSERT(rc->rc_count >= 0);
- if (reference_tracking_enable)
- list_insert_head(&rc->rc_list, ref);
- rc->rc_count += number;
- count = rc->rc_count;
- mutex_exit(&rc->rc_mtx);
-
- return (count);
-}
-
-int64_t
-refcount_add(refcount_t *rc, void *holder)
-{
- return (refcount_add_many(rc, 1, holder));
-}
-
-int64_t
-refcount_remove_many(refcount_t *rc, uint64_t number, void *holder)
-{
- reference_t *ref;
- int64_t count;
-
- mutex_enter(&rc->rc_mtx);
- ASSERT(rc->rc_count >= number);
-
- if (!reference_tracking_enable) {
- rc->rc_count -= number;
- count = rc->rc_count;
- mutex_exit(&rc->rc_mtx);
- return (count);
- }
-
- for (ref = list_head(&rc->rc_list); ref;
- ref = list_next(&rc->rc_list, ref)) {
- if (ref->ref_holder == holder && ref->ref_number == number) {
- list_remove(&rc->rc_list, ref);
- if (reference_history > 0) {
- ref->ref_removed =
- kmem_cache_alloc(reference_history_cache,
- KM_SLEEP);
- list_insert_head(&rc->rc_removed, ref);
- rc->rc_removed_count++;
- if (rc->rc_removed_count >= reference_history) {
- ref = list_tail(&rc->rc_removed);
- list_remove(&rc->rc_removed, ref);
- kmem_cache_free(reference_history_cache,
- ref->ref_removed);
- kmem_cache_free(reference_cache, ref);
- rc->rc_removed_count--;
- }
- } else {
- kmem_cache_free(reference_cache, ref);
- }
- rc->rc_count -= number;
- count = rc->rc_count;
- mutex_exit(&rc->rc_mtx);
- return (count);
- }
- }
- panic("No such hold %p on refcount %llx", holder,
- (u_longlong_t)(uintptr_t)rc);
- return (-1);
-}
-
-int64_t
-refcount_remove(refcount_t *rc, void *holder)
-{
- return (refcount_remove_many(rc, 1, holder));
-}
-
-#endif
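Editor's note: for orientation, the tracking API deleted above tags every hold with an opaque holder pointer, so a mismatched release panics instead of silently corrupting the count. A minimal usage sketch under that reading (my_obj_t and all three functions are hypothetical, not part of the deleted file):

#include <sys/zfs_context.h>
#include <sys/refcount.h>

typedef struct my_obj {
	refcount_t mo_refs;		/* hypothetical refcounted object */
} my_obj_t;

static my_obj_t *
my_obj_create(void)
{
	my_obj_t *mo = kmem_zalloc(sizeof (*mo), KM_SLEEP);

	refcount_create(&mo->mo_refs);
	return (mo);
}

static void
my_obj_hold(my_obj_t *mo, void *holder)
{
	/* Record one hold, tagged with 'holder' for leak tracking. */
	(void) refcount_add(&mo->mo_refs, holder);
}

static void
my_obj_rele(my_obj_t *mo, void *holder)
{
	/* Releasing with a holder that never took a hold panics. */
	if (refcount_remove(&mo->mo_refs, holder) == 0) {
		refcount_destroy(&mo->mo_refs);
		kmem_free(mo, sizeof (*mo));
	}
}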
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c
deleted file mode 100644
index ce5c261..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-
-/*
- * SHA-256 checksum, as specified in FIPS 180-2, available at:
- * http://csrc.nist.gov/cryptval
- *
- * This is a very compact implementation of SHA-256.
- * It is designed to be simple and portable, not to be fast.
- */
-
-/*
- * The literal definitions according to FIPS180-2 would be:
- *
- * Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
- * Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
- *
- * We use logical equivalents which require one less op.
- */
-#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
-#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y))))
-#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s)))
-#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22))
-#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25))
-#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3))
-#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10))
-
-static const uint32_t SHA256_K[64] = {
- 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
- 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
- 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
- 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
- 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
- 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
- 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
- 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
- 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
- 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
- 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
- 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
- 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
- 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
- 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
- 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-static void
-SHA256Transform(uint32_t *H, const uint8_t *cp)
-{
- uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64];
-
- for (t = 0; t < 16; t++, cp += 4)
- W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3];
-
- for (t = 16; t < 64; t++)
- W[t] = sigma1(W[t - 2]) + W[t - 7] +
- sigma0(W[t - 15]) + W[t - 16];
-
- a = H[0]; b = H[1]; c = H[2]; d = H[3];
- e = H[4]; f = H[5]; g = H[6]; h = H[7];
-
- for (t = 0; t < 64; t++) {
- T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t];
- T2 = SIGMA0(a) + Maj(a, b, c);
- h = g; g = f; f = e; e = d + T1;
- d = c; c = b; b = a; a = T1 + T2;
- }
-
- H[0] += a; H[1] += b; H[2] += c; H[3] += d;
- H[4] += e; H[5] += f; H[6] += g; H[7] += h;
-}
-
-void
-zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
- uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
- 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
- uint8_t pad[128];
- int padsize = size & 63;
- int i;
-
- for (i = 0; i < size - padsize; i += 64)
- SHA256Transform(H, (uint8_t *)buf + i);
-
- for (i = 0; i < padsize; i++)
- pad[i] = ((uint8_t *)buf)[i];
-
- for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
- pad[padsize] = 0;
-
- for (i = 0; i < 8; i++)
- pad[padsize++] = (size << 3) >> (56 - 8 * i);
-
- for (i = 0; i < padsize; i += 64)
- SHA256Transform(H, pad + i);
-
- ZIO_SET_CHECKSUM(zcp,
- (uint64_t)H[0] << 32 | H[1],
- (uint64_t)H[2] << 32 | H[3],
- (uint64_t)H[4] << 32 | H[5],
- (uint64_t)H[6] << 32 | H[7]);
-}
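Editor's note: the tail of zio_checksum_SHA256() above is standard FIPS 180-2 padding: append 0x80, zero-fill until the length is 56 mod 64, then append the message length in bits as a 64-bit big-endian value, so a 3-byte message pads to exactly one 64-byte block. A hedged self-check sketch against the published "abc" test vector (the function itself is illustrative; ASSERT comes from the zfs_context environment):

#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>

static void
sha256_selftest(void)
{
	zio_cksum_t zc;

	/*
	 * FIPS 180-2 test vector: SHA-256("abc") =
	 * ba7816bf 8f01cfea 414140de 5dae2223
	 * b00361a3 96177a9c b410ff61 f20015ad.
	 * Each zc_word packs two consecutive 32-bit H values.
	 */
	zio_checksum_SHA256("abc", 3, &zc);
	ASSERT(zc.zc_word[0] == 0xba7816bf8f01cfeaULL);
	ASSERT(zc.zc_word[1] == 0x414140de5dae2223ULL);
	ASSERT(zc.zc_word[2] == 0xb00361a396177a9cULL);
	ASSERT(zc.zc_word[3] == 0xb410ff61f20015adULL);
}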
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa.c
deleted file mode 100644
index 6a7c525..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ /dev/null
@@ -1,3301 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * This file contains all the routines used when modifying on-disk SPA state.
- * This includes opening, importing, destroying, exporting a pool, and syncing a
- * pool.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/fm/fs/zfs.h>
-#include <sys/spa_impl.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/zap.h>
-#include <sys/zil.h>
-#include <sys/vdev_impl.h>
-#include <sys/metaslab.h>
-#include <sys/uberblock_impl.h>
-#include <sys/txg.h>
-#include <sys/avl.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dmu_objset.h>
-#include <sys/unique.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/fs/zfs.h>
-#include <sys/callb.h>
-#include <sys/sunddi.h>
-
-int zio_taskq_threads = 0;
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
-TUNABLE_INT("vfs.zfs.zio.taskq_threads", &zio_taskq_threads);
-SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_threads, CTLFLAG_RW,
- &zio_taskq_threads, 0, "Number of ZIO threads per ZIO type");
-
-
-/*
- * ==========================================================================
- * SPA state manipulation (open/create/destroy/import/export)
- * ==========================================================================
- */
-
-static int
-spa_error_entry_compare(const void *a, const void *b)
-{
- spa_error_entry_t *sa = (spa_error_entry_t *)a;
- spa_error_entry_t *sb = (spa_error_entry_t *)b;
- int ret;
-
- ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
- sizeof (zbookmark_t));
-
- if (ret < 0)
- return (-1);
- else if (ret > 0)
- return (1);
- else
- return (0);
-}
-
-/*
- * Utility function which retrieves copies of the current logs and
- * re-initializes them in the process.
- */
-void
-spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
-{
- ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
-
- bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
- bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
-
- avl_create(&spa->spa_errlist_scrub,
- spa_error_entry_compare, sizeof (spa_error_entry_t),
- offsetof(spa_error_entry_t, se_avl));
- avl_create(&spa->spa_errlist_last,
- spa_error_entry_compare, sizeof (spa_error_entry_t),
- offsetof(spa_error_entry_t, se_avl));
-}
-
-/*
- * Activate an uninitialized pool.
- */
-static void
-spa_activate(spa_t *spa)
-{
- int t;
- int nthreads = zio_taskq_threads;
- char name[32];
-
- ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
-
- spa->spa_state = POOL_STATE_ACTIVE;
-
- spa->spa_normal_class = metaslab_class_create();
-
- if (nthreads == 0)
- nthreads = max_ncpus;
- for (t = 0; t < ZIO_TYPES; t++) {
- snprintf(name, sizeof(name), "spa_zio_issue %d", t);
- spa->spa_zio_issue_taskq[t] = taskq_create(name, nthreads,
- maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
- snprintf(name, sizeof(name), "spa_zio_intr %d", t);
- spa->spa_zio_intr_taskq[t] = taskq_create(name, nthreads,
- maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
- }
-
- rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);
-
- mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
-
- list_create(&spa->spa_dirty_list, sizeof (vdev_t),
- offsetof(vdev_t, vdev_dirty_node));
-
- txg_list_create(&spa->spa_vdev_txg_list,
- offsetof(struct vdev, vdev_txg_node));
-
- avl_create(&spa->spa_errlist_scrub,
- spa_error_entry_compare, sizeof (spa_error_entry_t),
- offsetof(spa_error_entry_t, se_avl));
- avl_create(&spa->spa_errlist_last,
- spa_error_entry_compare, sizeof (spa_error_entry_t),
- offsetof(spa_error_entry_t, se_avl));
-}
-
-/*
- * Opposite of spa_activate().
- */
-static void
-spa_deactivate(spa_t *spa)
-{
- int t;
-
- ASSERT(spa->spa_sync_on == B_FALSE);
- ASSERT(spa->spa_dsl_pool == NULL);
- ASSERT(spa->spa_root_vdev == NULL);
-
- ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
-
- txg_list_destroy(&spa->spa_vdev_txg_list);
-
- list_destroy(&spa->spa_dirty_list);
-
- for (t = 0; t < ZIO_TYPES; t++) {
- taskq_destroy(spa->spa_zio_issue_taskq[t]);
- taskq_destroy(spa->spa_zio_intr_taskq[t]);
- spa->spa_zio_issue_taskq[t] = NULL;
- spa->spa_zio_intr_taskq[t] = NULL;
- }
-
- metaslab_class_destroy(spa->spa_normal_class);
- spa->spa_normal_class = NULL;
-
- /*
- * If this was part of an import or the open otherwise failed, we may
- * still have errors left in the queues. Empty them just in case.
- */
- spa_errlog_drain(spa);
-
- avl_destroy(&spa->spa_errlist_scrub);
- avl_destroy(&spa->spa_errlist_last);
-
- rw_destroy(&spa->spa_traverse_lock);
- mutex_destroy(&spa->spa_uberblock_lock);
- mutex_destroy(&spa->spa_errlog_lock);
- mutex_destroy(&spa->spa_errlist_lock);
- mutex_destroy(&spa->spa_config_lock.scl_lock);
- cv_destroy(&spa->spa_config_lock.scl_cv);
- mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
- mutex_destroy(&spa->spa_history_lock);
- mutex_destroy(&spa->spa_props_lock);
-
- spa->spa_state = POOL_STATE_UNINITIALIZED;
-}
-
-/*
- * Verify a pool configuration, and construct the vdev tree appropriately. This
- * will create all the necessary vdevs in the appropriate layout, with each vdev
- * in the CLOSED state. This will prep the pool before open/creation/import.
- * All vdev validation is done by the vdev_alloc() routine.
- */
-static int
-spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
- uint_t id, int atype)
-{
- nvlist_t **child;
- uint_t c, children;
- int error;
-
- if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
- return (error);
-
- if ((*vdp)->vdev_ops->vdev_op_leaf)
- return (0);
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0) {
- vdev_free(*vdp);
- *vdp = NULL;
- return (EINVAL);
- }
-
- for (c = 0; c < children; c++) {
- vdev_t *vd;
- if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
- atype)) != 0) {
- vdev_free(*vdp);
- *vdp = NULL;
- return (error);
- }
- }
-
- ASSERT(*vdp != NULL);
-
- return (0);
-}
-
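Editor's note: spa_config_parse() above recurses on the ZPOOL_CONFIG_CHILDREN array, so callers hand it a tree of nvlists. A hedged sketch of the smallest interesting shape, a root vdev holding one two-way mirror (the helper name and device paths are placeholders):

#include <sys/zfs_context.h>
#include <sys/nvpair.h>
#include <sys/fs/zfs.h>

static nvlist_t *
make_example_nvroot(void)
{
	nvlist_t *root, *mirror, *disk[2];
	int i;

	for (i = 0; i < 2; i++) {
		VERIFY(nvlist_alloc(&disk[i], NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_string(disk[i], ZPOOL_CONFIG_TYPE,
		    VDEV_TYPE_DISK) == 0);
		VERIFY(nvlist_add_string(disk[i], ZPOOL_CONFIG_PATH,
		    i == 0 ? "/dev/da0" : "/dev/da1") == 0);
	}
	VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_MIRROR) == 0);
	VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
	    disk, 2) == 0);
	/* nvlist_add_* copies its arguments; free the temporaries. */
	for (i = 0; i < 2; i++)
		nvlist_free(disk[i]);

	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
	    &mirror, 1) == 0);
	nvlist_free(mirror);
	return (root);
}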
-/*
- * Opposite of spa_load().
- */
-static void
-spa_unload(spa_t *spa)
-{
- int i;
-
- /*
- * Stop async tasks.
- */
- spa_async_suspend(spa);
-
- /*
- * Stop syncing.
- */
- if (spa->spa_sync_on) {
- txg_sync_stop(spa->spa_dsl_pool);
- spa->spa_sync_on = B_FALSE;
- }
-
- /*
- * Wait for any outstanding prefetch I/O to complete.
- */
- spa_config_enter(spa, RW_WRITER, FTAG);
- spa_config_exit(spa, FTAG);
-
- /*
- * Close the dsl pool.
- */
- if (spa->spa_dsl_pool) {
- dsl_pool_close(spa->spa_dsl_pool);
- spa->spa_dsl_pool = NULL;
- }
-
- /*
- * Close all vdevs.
- */
- if (spa->spa_root_vdev)
- vdev_free(spa->spa_root_vdev);
- ASSERT(spa->spa_root_vdev == NULL);
-
- for (i = 0; i < spa->spa_nspares; i++)
- vdev_free(spa->spa_spares[i]);
- if (spa->spa_spares) {
- kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
- spa->spa_spares = NULL;
- }
- if (spa->spa_sparelist) {
- nvlist_free(spa->spa_sparelist);
- spa->spa_sparelist = NULL;
- }
-
- spa->spa_async_suspended = 0;
-}
-
-/*
- * Load (or re-load) the current list of vdevs describing the active spares for
- * this pool. When this is called, we have some form of basic information in
- * 'spa_sparelist'. We parse this into vdevs, try to open them, and then
- * re-generate a more complete list including status information.
- */
-static void
-spa_load_spares(spa_t *spa)
-{
- nvlist_t **spares;
- uint_t nspares;
- int i;
- vdev_t *vd, *tvd;
-
- /*
- * First, close and free any existing spare vdevs.
- */
- for (i = 0; i < spa->spa_nspares; i++) {
- vd = spa->spa_spares[i];
-
-		/* Undo the call to spa_spare_activate() below */
- if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
- tvd->vdev_isspare)
- spa_spare_remove(tvd);
- vdev_close(vd);
- vdev_free(vd);
- }
-
- if (spa->spa_spares)
- kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
-
- if (spa->spa_sparelist == NULL)
- nspares = 0;
- else
- VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
-
- spa->spa_nspares = (int)nspares;
- spa->spa_spares = NULL;
-
- if (nspares == 0)
- return;
-
- /*
- * Construct the array of vdevs, opening them to get status in the
-	 * process. For each spare, there are potentially two different vdev_t
- * structures associated with it: one in the list of spares (used only
- * for basic validation purposes) and one in the active vdev
- * configuration (if it's spared in). During this phase we open and
- * validate each vdev on the spare list. If the vdev also exists in the
- * active configuration, then we also mark this vdev as an active spare.
- */
- spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
- for (i = 0; i < spa->spa_nspares; i++) {
- VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
- VDEV_ALLOC_SPARE) == 0);
- ASSERT(vd != NULL);
-
- spa->spa_spares[i] = vd;
-
- if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
- if (!tvd->vdev_isspare)
- spa_spare_add(tvd);
-
- /*
- * We only mark the spare active if we were successfully
- * able to load the vdev. Otherwise, importing a pool
- * with a bad active spare would result in strange
-			 * behavior, because multiple pools would think the spare
- * is actively in use.
- *
- * There is a vulnerability here to an equally bizarre
- * circumstance, where a dead active spare is later
- * brought back to life (onlined or otherwise). Given
- * the rarity of this scenario, and the extra complexity
- * it adds, we ignore the possibility.
- */
- if (!vdev_is_dead(tvd))
- spa_spare_activate(tvd);
- }
-
- if (vdev_open(vd) != 0)
- continue;
-
- vd->vdev_top = vd;
- (void) vdev_validate_spare(vd);
- }
-
- /*
- * Recompute the stashed list of spares, with status information
- * this time.
- */
- VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
- DATA_TYPE_NVLIST_ARRAY) == 0);
-
- spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
- for (i = 0; i < spa->spa_nspares; i++)
- spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
- B_TRUE, B_TRUE);
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
- spares, spa->spa_nspares) == 0);
- for (i = 0; i < spa->spa_nspares; i++)
- nvlist_free(spares[i]);
- kmem_free(spares, spa->spa_nspares * sizeof (void *));
-}
-
-static int
-load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
-{
- dmu_buf_t *db;
- char *packed = NULL;
- size_t nvsize = 0;
- int error;
- *value = NULL;
-
- VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
- nvsize = *(uint64_t *)db->db_data;
- dmu_buf_rele(db, FTAG);
-
- packed = kmem_alloc(nvsize, KM_SLEEP);
- error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
- if (error == 0)
- error = nvlist_unpack(packed, nvsize, value, 0);
- kmem_free(packed, nvsize);
-
- return (error);
-}
-
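Editor's note: load_nvlist() above depends on a simple layout: the packed nvlist bytes live in the object's data, and the packed size lives in its 8-byte bonus buffer. For contrast, a hedged sketch of the matching writer; the sync path does effectively this, but store_nvlist itself is illustrative:

static void
store_nvlist(objset_t *os, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	/* Pack the nvlist and write it into the object's data. */
	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);
	dmu_write(os, obj, 0, nvsize, packed, tx);
	kmem_free(packed, nvsize);

	/* Record the packed size in the bonus buffer. */
	VERIFY(0 == dmu_bonus_hold(os, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}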
-/*
- * Load an existing storage pool, using the pool's builtin spa_config as a
- * source of configuration information.
- */
-static int
-spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
-{
- int error = 0;
- nvlist_t *nvroot = NULL;
- vdev_t *rvd;
- uberblock_t *ub = &spa->spa_uberblock;
- uint64_t config_cache_txg = spa->spa_config_txg;
- uint64_t pool_guid;
- uint64_t version;
- zio_t *zio;
-
- spa->spa_load_state = state;
-
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
- error = EINVAL;
- goto out;
- }
-
- /*
- * Versioning wasn't explicitly added to the label until later, so if
-	 * it's not present, treat it as the initial version.
- */
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
- version = ZFS_VERSION_INITIAL;
-
- (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
- &spa->spa_config_txg);
-
- if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
- spa_guid_exists(pool_guid, 0)) {
- error = EEXIST;
- goto out;
- }
-
- spa->spa_load_guid = pool_guid;
-
- /*
- * Parse the configuration into a vdev tree. We explicitly set the
- * value that will be returned by spa_version() since parsing the
- * configuration requires knowing the version number.
- */
- spa_config_enter(spa, RW_WRITER, FTAG);
- spa->spa_ubsync.ub_version = version;
- error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
- spa_config_exit(spa, FTAG);
-
- if (error != 0)
- goto out;
-
- ASSERT(spa->spa_root_vdev == rvd);
- ASSERT(spa_guid(spa) == pool_guid);
-
- /*
- * Try to open all vdevs, loading each label in the process.
- */
- error = vdev_open(rvd);
- if (error != 0)
- goto out;
-
- /*
- * Validate the labels for all leaf vdevs. We need to grab the config
- * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
- * flag.
- */
- spa_config_enter(spa, RW_READER, FTAG);
- error = vdev_validate(rvd);
- spa_config_exit(spa, FTAG);
-
- if (error != 0)
- goto out;
-
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
- error = ENXIO;
- goto out;
- }
-
- /*
- * Find the best uberblock.
- */
- bzero(ub, sizeof (uberblock_t));
-
- zio = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
- vdev_uberblock_load(zio, rvd, ub);
- error = zio_wait(zio);
-
- /*
- * If we weren't able to find a single valid uberblock, return failure.
- */
- if (ub->ub_txg == 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = ENXIO;
- goto out;
- }
-
- /*
- * If the pool is newer than the code, we can't open it.
- */
- if (ub->ub_version > ZFS_VERSION) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_VERSION_NEWER);
- error = ENOTSUP;
- goto out;
- }
-
- /*
- * If the vdev guid sum doesn't match the uberblock, we have an
- * incomplete configuration.
- */
- if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_GUID_SUM);
- error = ENXIO;
- goto out;
- }
-
- /*
- * Initialize internal SPA structures.
- */
- spa->spa_state = POOL_STATE_ACTIVE;
- spa->spa_ubsync = spa->spa_uberblock;
- spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
- error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
- if (error) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- goto out;
- }
- spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
-
- if (zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
- sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- if (!mosconfig) {
- nvlist_t *newconfig;
- uint64_t hostid;
-
- if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- /*
- * hostid is set after the root file system is mounted, so
- * ignore the check until it's done.
- */
- if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
- &hostid) == 0 && root_mounted()) {
- char *hostname;
- unsigned long myhostid = 0;
-
- VERIFY(nvlist_lookup_string(newconfig,
- ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
-
- (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
- if ((unsigned long)hostid != myhostid) {
- cmn_err(CE_WARN, "pool '%s' could not be "
- "loaded as it was last accessed by "
- "another system (host: %s hostid: 0x%lx). "
- "See: http://www.sun.com/msg/ZFS-8000-EY",
- spa->spa_name, hostname,
- (unsigned long)hostid);
- error = EBADF;
- goto out;
- }
- }
-
- spa_config_set(spa, newconfig);
- spa_unload(spa);
- spa_deactivate(spa);
- spa_activate(spa);
-
- return (spa_load(spa, newconfig, state, B_TRUE));
- }
-
- if (zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
- sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- /*
- * Load the bit that tells us to use the new accounting function
- * (raid-z deflation). If we have an older pool, this will not
- * be present.
- */
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
- sizeof (uint64_t), 1, &spa->spa_deflate);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- /*
- * Load the persistent error log. If we have an older pool, this will
- * not be present.
- */
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
- sizeof (uint64_t), 1, &spa->spa_errlog_last);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
- sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- /*
- * Load the history object. If we have an older pool, this
- * will not be present.
- */
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
- sizeof (uint64_t), 1, &spa->spa_history);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- /*
- * Load any hot spares for this pool.
- */
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
- if (error == 0) {
- ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
- if (load_nvlist(spa, spa->spa_spares_object,
- &spa->spa_sparelist) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- spa_config_enter(spa, RW_WRITER, FTAG);
- spa_load_spares(spa);
- spa_config_exit(spa, FTAG);
- }
-
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
-
- if (error && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- if (error == 0) {
- (void) zap_lookup(spa->spa_meta_objset,
- spa->spa_pool_props_object,
- zpool_prop_to_name(ZFS_PROP_BOOTFS),
- sizeof (uint64_t), 1, &spa->spa_bootfs);
- }
-
- /*
- * Load the vdev state for all toplevel vdevs.
- */
- vdev_load(rvd);
-
- /*
- * Propagate the leaf DTLs we just loaded all the way up the tree.
- */
- spa_config_enter(spa, RW_WRITER, FTAG);
- vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
- spa_config_exit(spa, FTAG);
-
- /*
- * Check the state of the root vdev. If it can't be opened, it
- * indicates one or more toplevel vdevs are faulted.
- */
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
- error = ENXIO;
- goto out;
- }
-
- if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
- dmu_tx_t *tx;
- int need_update = B_FALSE;
- int c;
-
- /*
- * Claim log blocks that haven't been committed yet.
- * This must all happen in a single txg.
- */
- tx = dmu_tx_create_assigned(spa_get_dsl(spa),
- spa_first_txg(spa));
- (void) dmu_objset_find(spa->spa_name,
- zil_claim, tx, DS_FIND_CHILDREN);
- dmu_tx_commit(tx);
-
- spa->spa_sync_on = B_TRUE;
- txg_sync_start(spa->spa_dsl_pool);
-
- /*
- * Wait for all claims to sync.
- */
- txg_wait_synced(spa->spa_dsl_pool, 0);
-
- /*
- * If the config cache is stale, or we have uninitialized
- * metaslabs (see spa_vdev_add()), then update the config.
- */
- if (config_cache_txg != spa->spa_config_txg ||
- state == SPA_LOAD_IMPORT)
- need_update = B_TRUE;
-
- for (c = 0; c < rvd->vdev_children; c++)
- if (rvd->vdev_child[c]->vdev_ms_array == 0)
- need_update = B_TRUE;
-
- /*
-		 * Update the config cache asynchronously in case we're the
- * root pool, in which case the config cache isn't writable yet.
- */
- if (need_update)
- spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
- }
-
- error = 0;
-out:
- if (error && error != EBADF)
- zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
- spa->spa_load_state = SPA_LOAD_NONE;
- spa->spa_ena = 0;
-
- return (error);
-}
-
-/*
- * Pool Open/Import
- *
- * The import case is identical to an open except that the configuration is sent
- * down from userland, instead of grabbed from the configuration cache. For the
- * case of an open, the pool configuration will exist in the
- * POOL_STATE_UNINITIALIZED state.
- *
- * The stats information (gen/count/ustats) is used to gather vdev statistics at
- * the same time we open the pool, without having to keep the spa_t around in some
- * ambiguous state.
- */
-static int
-spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
-{
- spa_t *spa;
- int error;
- int loaded = B_FALSE;
- int locked = B_FALSE;
-
- *spapp = NULL;
-
- /*
- * As disgusting as this is, we need to support recursive calls to this
- * function because dsl_dir_open() is called during spa_load(), and ends
- * up calling spa_open() again. The real fix is to figure out how to
- * avoid dsl_dir_open() calling this in the first place.
- */
- if (mutex_owner(&spa_namespace_lock) != curthread) {
- mutex_enter(&spa_namespace_lock);
- locked = B_TRUE;
- }
-
- if ((spa = spa_lookup(pool)) == NULL) {
- if (locked)
- mutex_exit(&spa_namespace_lock);
- return (ENOENT);
- }
- if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
-
- spa_activate(spa);
-
- error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
-
- if (error == EBADF) {
- /*
-			 * If vdev_validate() returns failure (indicated by
-			 * EBADF), one of the vdevs reports that the pool has
-			 * been exported or destroyed. If this is the case,
-			 * the config cache is out of sync and we should
-			 * remove the pool from the namespace.
- */
- zfs_post_ok(spa, NULL);
- spa_unload(spa);
- spa_deactivate(spa);
- spa_remove(spa);
- spa_config_sync();
- if (locked)
- mutex_exit(&spa_namespace_lock);
- return (ENOENT);
- }
-
- if (error) {
- /*
- * We can't open the pool, but we still have useful
- * information: the state of each vdev after the
- * attempted vdev_open(). Return this to the user.
- */
- if (config != NULL && spa->spa_root_vdev != NULL) {
- spa_config_enter(spa, RW_READER, FTAG);
- *config = spa_config_generate(spa, NULL, -1ULL,
- B_TRUE);
- spa_config_exit(spa, FTAG);
- }
- spa_unload(spa);
- spa_deactivate(spa);
- spa->spa_last_open_failed = B_TRUE;
- if (locked)
- mutex_exit(&spa_namespace_lock);
- *spapp = NULL;
- return (error);
- } else {
- zfs_post_ok(spa, NULL);
- spa->spa_last_open_failed = B_FALSE;
- }
-
- loaded = B_TRUE;
- }
-
- spa_open_ref(spa, tag);
- if (locked)
- mutex_exit(&spa_namespace_lock);
-
- *spapp = spa;
-
- if (config != NULL) {
- spa_config_enter(spa, RW_READER, FTAG);
- *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
- spa_config_exit(spa, FTAG);
- }
-
- /*
- * If we just loaded the pool, resilver anything that's out of date.
- */
- if (loaded && (spa_mode & FWRITE))
- VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
-
- return (0);
-}
-
-int
-spa_open(const char *name, spa_t **spapp, void *tag)
-{
- return (spa_open_common(name, spapp, tag, NULL));
-}
-
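Editor's note: callers bracket pool use with spa_open()/spa_close() on the same tag; the tag exists purely for reference bookkeeping (FTAG, as used throughout this file). A minimal hypothetical caller:

static int
example_pool_user(const char *name)
{
	spa_t *spa;
	int error;

	if ((error = spa_open(name, &spa, FTAG)) != 0)
		return (error);
	/* ... use the pool while holding the FTAG reference ... */
	spa_close(spa, FTAG);
	return (0);
}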
-/*
- * Lookup the given spa_t, incrementing the inject count in the process,
- * preventing it from being exported or destroyed.
- */
-spa_t *
-spa_inject_addref(char *name)
-{
- spa_t *spa;
-
- mutex_enter(&spa_namespace_lock);
- if ((spa = spa_lookup(name)) == NULL) {
- mutex_exit(&spa_namespace_lock);
- return (NULL);
- }
- spa->spa_inject_ref++;
- mutex_exit(&spa_namespace_lock);
-
- return (spa);
-}
-
-void
-spa_inject_delref(spa_t *spa)
-{
- mutex_enter(&spa_namespace_lock);
- spa->spa_inject_ref--;
- mutex_exit(&spa_namespace_lock);
-}
-
-static void
-spa_add_spares(spa_t *spa, nvlist_t *config)
-{
- nvlist_t **spares;
- uint_t i, nspares;
- nvlist_t *nvroot;
- uint64_t guid;
- vdev_stat_t *vs;
- uint_t vsc;
- uint64_t pool;
-
- if (spa->spa_nspares == 0)
- return;
-
- VERIFY(nvlist_lookup_nvlist(config,
- ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
- if (nspares != 0) {
- VERIFY(nvlist_add_nvlist_array(nvroot,
- ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
- VERIFY(nvlist_lookup_nvlist_array(nvroot,
- ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
-
- /*
- * Go through and find any spares which have since been
-		 * repurposed as active spares. If this is the case, update
- * their status appropriately.
- */
- for (i = 0; i < nspares; i++) {
- VERIFY(nvlist_lookup_uint64(spares[i],
- ZPOOL_CONFIG_GUID, &guid) == 0);
- if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
- VERIFY(nvlist_lookup_uint64_array(
- spares[i], ZPOOL_CONFIG_STATS,
- (uint64_t **)&vs, &vsc) == 0);
- vs->vs_state = VDEV_STATE_CANT_OPEN;
- vs->vs_aux = VDEV_AUX_SPARED;
- }
- }
- }
-}
-
-int
-spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
-{
- int error;
- spa_t *spa;
-
- *config = NULL;
- error = spa_open_common(name, &spa, FTAG, config);
-
- if (spa && *config != NULL) {
- VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
- spa_get_errlog_size(spa)) == 0);
-
- spa_add_spares(spa, *config);
- }
-
- /*
- * We want to get the alternate root even for faulted pools, so we cheat
- * and call spa_lookup() directly.
- */
- if (altroot) {
- if (spa == NULL) {
- mutex_enter(&spa_namespace_lock);
- spa = spa_lookup(name);
- if (spa)
- spa_altroot(spa, altroot, buflen);
- else
- altroot[0] = '\0';
- spa = NULL;
- mutex_exit(&spa_namespace_lock);
- } else {
- spa_altroot(spa, altroot, buflen);
- }
- }
-
- if (spa != NULL)
- spa_close(spa, FTAG);
-
- return (error);
-}
-
-/*
- * Validate that the 'spares' array is well formed. We must have an array of
- * nvlists, each of which describes a valid leaf vdev. If this is an import (mode
- * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long
- * as they are well-formed.
- */
-static int
-spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
-{
- nvlist_t **spares;
- uint_t i, nspares;
- vdev_t *vd;
- int error;
-
- /*
- * It's acceptable to have no spares specified.
- */
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) != 0)
- return (0);
-
- if (nspares == 0)
- return (EINVAL);
-
- /*
- * Make sure the pool is formatted with a version that supports hot
- * spares.
- */
- if (spa_version(spa) < ZFS_VERSION_SPARES)
- return (ENOTSUP);
-
- /*
- * Set the pending spare list so we correctly handle device in-use
- * checking.
- */
- spa->spa_pending_spares = spares;
- spa->spa_pending_nspares = nspares;
-
- for (i = 0; i < nspares; i++) {
- if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
- mode)) != 0)
- goto out;
-
- if (!vd->vdev_ops->vdev_op_leaf) {
- vdev_free(vd);
- error = EINVAL;
- goto out;
- }
-
- vd->vdev_top = vd;
-
- if ((error = vdev_open(vd)) == 0 &&
- (error = vdev_label_init(vd, crtxg,
- VDEV_LABEL_SPARE)) == 0) {
- VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
- vd->vdev_guid) == 0);
- }
-
- vdev_free(vd);
-
- if (error && mode != VDEV_ALLOC_SPARE)
- goto out;
- else
- error = 0;
- }
-
-out:
- spa->spa_pending_spares = NULL;
- spa->spa_pending_nspares = 0;
- return (error);
-}
-
-/*
- * Pool Creation
- */
-int
-spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
-{
- spa_t *spa;
- vdev_t *rvd;
- dsl_pool_t *dp;
- dmu_tx_t *tx;
- int c, error = 0;
- uint64_t txg = TXG_INITIAL;
- nvlist_t **spares;
- uint_t nspares;
-
- /*
- * If this pool already exists, return failure.
- */
- mutex_enter(&spa_namespace_lock);
- if (spa_lookup(pool) != NULL) {
- mutex_exit(&spa_namespace_lock);
- return (EEXIST);
- }
-
- /*
- * Allocate a new spa_t structure.
- */
- spa = spa_add(pool, altroot);
- spa_activate(spa);
-
- spa->spa_uberblock.ub_txg = txg - 1;
- spa->spa_uberblock.ub_version = ZFS_VERSION;
- spa->spa_ubsync = spa->spa_uberblock;
-
- /*
- * Create the root vdev.
- */
- spa_config_enter(spa, RW_WRITER, FTAG);
-
- error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
-
- ASSERT(error != 0 || rvd != NULL);
- ASSERT(error != 0 || spa->spa_root_vdev == rvd);
-
- if (error == 0 && rvd->vdev_children == 0)
- error = EINVAL;
-
- if (error == 0 &&
- (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
- (error = spa_validate_spares(spa, nvroot, txg,
- VDEV_ALLOC_ADD)) == 0) {
- for (c = 0; c < rvd->vdev_children; c++)
- vdev_init(rvd->vdev_child[c], txg);
- vdev_config_dirty(rvd);
- }
-
- spa_config_exit(spa, FTAG);
-
- if (error != 0) {
- spa_unload(spa);
- spa_deactivate(spa);
- spa_remove(spa);
- mutex_exit(&spa_namespace_lock);
- return (error);
- }
-
- /*
- * Get the list of spares, if specified.
- */
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0) {
- VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
- KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
- spa_config_enter(spa, RW_WRITER, FTAG);
- spa_load_spares(spa);
- spa_config_exit(spa, FTAG);
- spa->spa_sync_spares = B_TRUE;
- }
-
- spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
- spa->spa_meta_objset = dp->dp_meta_objset;
-
- tx = dmu_tx_create_assigned(dp, txg);
-
- /*
- * Create the pool config object.
- */
- spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
- DMU_OT_PACKED_NVLIST, 1 << 14,
- DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
-
- if (zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
- sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
- cmn_err(CE_PANIC, "failed to add pool config");
- }
-
- /* Newly created pools are always deflated. */
- spa->spa_deflate = TRUE;
- if (zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
- sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
- cmn_err(CE_PANIC, "failed to add deflate");
- }
-
- /*
- * Create the deferred-free bplist object. Turn off compression
- * because sync-to-convergence takes longer if the blocksize
- * keeps changing.
- */
- spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
- 1 << 14, tx);
- dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
- ZIO_COMPRESS_OFF, tx);
-
- if (zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
- sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
- cmn_err(CE_PANIC, "failed to add bplist");
- }
-
- /*
- * Create the pool's history object.
- */
- spa_history_create_obj(spa, tx);
-
- dmu_tx_commit(tx);
-
- spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
- spa->spa_sync_on = B_TRUE;
- txg_sync_start(spa->spa_dsl_pool);
-
- /*
- * We explicitly wait for the first transaction to complete so that our
- * bean counters are appropriately updated.
- */
- txg_wait_synced(spa->spa_dsl_pool, txg);
-
- spa_config_sync();
-
- mutex_exit(&spa_namespace_lock);
-
- return (0);
-}
-
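Editor's note: combined with the example nvroot sketch shown after spa_config_parse(), pool creation reduces to one call. A hedged sketch (the pool name and both helpers are illustrative):

static int
example_create_pool(void)
{
	nvlist_t *nvroot = make_example_nvroot();	/* sketch above */
	int error;

	/* NULL altroot: mount the pool at its normal root. */
	error = spa_create("tank", nvroot, NULL);
	nvlist_free(nvroot);
	return (error);
}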
-/*
- * Import the given pool into the system. We set up the necessary spa_t and
- * then call spa_load() to do the dirty work.
- */
-int
-spa_import(const char *pool, nvlist_t *config, const char *altroot)
-{
- spa_t *spa;
- int error;
- nvlist_t *nvroot;
- nvlist_t **spares;
- uint_t nspares;
-
- if (!(spa_mode & FWRITE))
- return (EROFS);
-
- /*
- * If a pool with this name exists, return failure.
- */
- mutex_enter(&spa_namespace_lock);
- if (spa_lookup(pool) != NULL) {
- mutex_exit(&spa_namespace_lock);
- return (EEXIST);
- }
-
- /*
- * Create and initialize the spa structure.
- */
- spa = spa_add(pool, altroot);
- spa_activate(spa);
-
- /*
- * Pass off the heavy lifting to spa_load().
- * Pass TRUE for mosconfig because the user-supplied config
- * is actually the one to trust when doing an import.
- */
- error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
-
- spa_config_enter(spa, RW_WRITER, FTAG);
- /*
- * Toss any existing sparelist, as it doesn't have any validity anymore,
- * and conflicts with spa_has_spare().
- */
- if (spa->spa_sparelist) {
- nvlist_free(spa->spa_sparelist);
- spa->spa_sparelist = NULL;
- spa_load_spares(spa);
- }
-
- VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
- if (error == 0)
- error = spa_validate_spares(spa, nvroot, -1ULL,
- VDEV_ALLOC_SPARE);
- spa_config_exit(spa, FTAG);
-
- if (error != 0) {
- spa_unload(spa);
- spa_deactivate(spa);
- spa_remove(spa);
- mutex_exit(&spa_namespace_lock);
- return (error);
- }
-
- /*
- * Override any spares as specified by the user, as these may have
- * correct device names/devids, etc.
- */
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0) {
- if (spa->spa_sparelist)
- VERIFY(nvlist_remove(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
- else
- VERIFY(nvlist_alloc(&spa->spa_sparelist,
- NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
- spa_config_enter(spa, RW_WRITER, FTAG);
- spa_load_spares(spa);
- spa_config_exit(spa, FTAG);
- spa->spa_sync_spares = B_TRUE;
- }
-
- /*
- * Update the config cache to include the newly-imported pool.
- */
- spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
-
- mutex_exit(&spa_namespace_lock);
-
- /*
- * Resilver anything that's out of date.
- */
- if (spa_mode & FWRITE)
- VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
-
- return (0);
-}
-
-/*
- * This (illegal) pool name is used when temporarily importing a spa_t in order
- * to get the vdev stats associated with the imported devices.
- */
-#define TRYIMPORT_NAME "$import"
-
-nvlist_t *
-spa_tryimport(nvlist_t *tryconfig)
-{
- nvlist_t *config = NULL;
- char *poolname;
- spa_t *spa;
- uint64_t state;
-
- if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
- return (NULL);
-
- if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
- return (NULL);
-
- /*
- * Create and initialize the spa structure.
- */
- mutex_enter(&spa_namespace_lock);
- spa = spa_add(TRYIMPORT_NAME, NULL);
- spa_activate(spa);
-
- /*
- * Pass off the heavy lifting to spa_load().
- * Pass TRUE for mosconfig because the user-supplied config
- * is actually the one to trust when doing an import.
- */
- (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
-
- /*
- * If 'tryconfig' was at least parsable, return the current config.
- */
- if (spa->spa_root_vdev != NULL) {
- spa_config_enter(spa, RW_READER, FTAG);
- config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
- spa_config_exit(spa, FTAG);
- VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
- poolname) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
- state) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
- spa->spa_uberblock.ub_timestamp) == 0);
-
- /*
- * Add the list of hot spares.
- */
- spa_add_spares(spa, config);
- }
-
- spa_unload(spa);
- spa_deactivate(spa);
- spa_remove(spa);
- mutex_exit(&spa_namespace_lock);
-
- return (config);
-}
-
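Editor's note: spa_tryimport() and spa_import() are designed to chain: probe with a label-derived config, then feed the corrected result back for the real import. A hedged sketch of that flow (error handling trimmed; the function name is illustrative):

static int
example_probe_then_import(nvlist_t *labelconfig, const char *name)
{
	nvlist_t *config;
	int error;

	/* Probe: loads nothing permanently, just builds a status config. */
	if ((config = spa_tryimport(labelconfig)) == NULL)
		return (EINVAL);

	/* The real import trusts the caller-supplied config (mosconfig). */
	error = spa_import(name, config, NULL);
	nvlist_free(config);
	return (error);
}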
-/*
- * Pool export/destroy
- *
- * The act of destroying or exporting a pool is very simple. We make sure there
- * is no more pending I/O and any references to the pool are gone. Then, we
- * update the pool state and sync all the labels to disk, removing the
- * configuration from the cache afterwards.
- */
-static int
-spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
-{
- spa_t *spa;
-
- if (oldconfig)
- *oldconfig = NULL;
-
- if (!(spa_mode & FWRITE))
- return (EROFS);
-
- mutex_enter(&spa_namespace_lock);
- if ((spa = spa_lookup(pool)) == NULL) {
- mutex_exit(&spa_namespace_lock);
- return (ENOENT);
- }
-
- /*
- * Put a hold on the pool, drop the namespace lock, stop async tasks,
- * reacquire the namespace lock, and see if we can export.
- */
- spa_open_ref(spa, FTAG);
- mutex_exit(&spa_namespace_lock);
- spa_async_suspend(spa);
- mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
-
- /*
- * The pool will be in core if it's openable,
- * in which case we can modify its state.
- */
- if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
- /*
- * Objsets may be open only because they're dirty, so we
-		 * have to force a sync before checking spa_refcnt.
- */
- spa_scrub_suspend(spa);
- txg_wait_synced(spa->spa_dsl_pool, 0);
-
- /*
- * A pool cannot be exported or destroyed if there are active
- * references. If we are resetting a pool, allow references by
- * fault injection handlers.
- */
- if (!spa_refcount_zero(spa) ||
- (spa->spa_inject_ref != 0 &&
- new_state != POOL_STATE_UNINITIALIZED)) {
- spa_scrub_resume(spa);
- spa_async_resume(spa);
- mutex_exit(&spa_namespace_lock);
- return (EBUSY);
- }
-
- spa_scrub_resume(spa);
- VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
-
- /*
- * We want this to be reflected on every label,
- * so mark them all dirty. spa_unload() will do the
- * final sync that pushes these changes out.
- */
- if (new_state != POOL_STATE_UNINITIALIZED) {
- spa_config_enter(spa, RW_WRITER, FTAG);
- spa->spa_state = new_state;
- spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
- vdev_config_dirty(spa->spa_root_vdev);
- spa_config_exit(spa, FTAG);
- }
- }
-
- if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
- spa_unload(spa);
- spa_deactivate(spa);
- }
-
- if (oldconfig && spa->spa_config)
- VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
-
- if (new_state != POOL_STATE_UNINITIALIZED) {
- spa_remove(spa);
- spa_config_sync();
- }
- mutex_exit(&spa_namespace_lock);
-
- return (0);
-}
-
-/*
- * Destroy a storage pool.
- */
-int
-spa_destroy(char *pool)
-{
- return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
-}
-
-/*
- * Export a storage pool.
- */
-int
-spa_export(char *pool, nvlist_t **oldconfig)
-{
- return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
-}
-
-/*
- * Similar to spa_export(), this unloads the spa_t without actually removing it
- * from the namespace in any way.
- */
-int
-spa_reset(char *pool)
-{
- return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
-}
-
-
-/*
- * ==========================================================================
- * Device manipulation
- * ==========================================================================
- */
-
-/*
- * Add capacity to a storage pool.
- */
-int
-spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
-{
- uint64_t txg;
- int c, error;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd, *tvd;
- nvlist_t **spares;
- uint_t i, nspares;
-
- txg = spa_vdev_enter(spa);
-
- if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
- VDEV_ALLOC_ADD)) != 0)
- return (spa_vdev_exit(spa, NULL, txg, error));
-
- spa->spa_pending_vdev = vd;
-
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) != 0)
- nspares = 0;
-
- if (vd->vdev_children == 0 && nspares == 0) {
- spa->spa_pending_vdev = NULL;
- return (spa_vdev_exit(spa, vd, txg, EINVAL));
- }
-
- if (vd->vdev_children != 0) {
- if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
- spa->spa_pending_vdev = NULL;
- return (spa_vdev_exit(spa, vd, txg, error));
- }
- }
-
- /*
- * We must validate the spares after checking the children. Otherwise,
- * vdev_inuse() will blindly overwrite the spare.
- */
- if ((error = spa_validate_spares(spa, nvroot, txg,
- VDEV_ALLOC_ADD)) != 0) {
- spa->spa_pending_vdev = NULL;
- return (spa_vdev_exit(spa, vd, txg, error));
- }
-
- spa->spa_pending_vdev = NULL;
-
- /*
- * Transfer each new top-level vdev from vd to rvd.
- */
- for (c = 0; c < vd->vdev_children; c++) {
- tvd = vd->vdev_child[c];
- vdev_remove_child(vd, tvd);
- tvd->vdev_id = rvd->vdev_children;
- vdev_add_child(rvd, tvd);
- vdev_config_dirty(tvd);
- }
-
- if (nspares != 0) {
- if (spa->spa_sparelist != NULL) {
- nvlist_t **oldspares;
- uint_t oldnspares;
- nvlist_t **newspares;
-
- VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);
-
- newspares = kmem_alloc(sizeof (void *) *
- (nspares + oldnspares), KM_SLEEP);
- for (i = 0; i < oldnspares; i++)
- VERIFY(nvlist_dup(oldspares[i],
- &newspares[i], KM_SLEEP) == 0);
- for (i = 0; i < nspares; i++)
- VERIFY(nvlist_dup(spares[i],
- &newspares[i + oldnspares],
- KM_SLEEP) == 0);
-
- VERIFY(nvlist_remove(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
-
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, newspares,
- nspares + oldnspares) == 0);
- for (i = 0; i < oldnspares + nspares; i++)
- nvlist_free(newspares[i]);
- kmem_free(newspares, (oldnspares + nspares) *
- sizeof (void *));
- } else {
- VERIFY(nvlist_alloc(&spa->spa_sparelist,
- NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
- }
-
- spa_load_spares(spa);
- spa->spa_sync_spares = B_TRUE;
- }
-
- /*
- * We have to be careful when adding new vdevs to an existing pool.
- * If other threads start allocating from these vdevs before we
- * sync the config cache, and we lose power, then upon reboot we may
- * fail to open the pool because there are DVAs that the config cache
- * can't translate. Therefore, we first add the vdevs without
- * initializing metaslabs; sync the config cache (via spa_vdev_exit());
- * and then let spa_config_update() initialize the new metaslabs.
- *
- * spa_load() checks for added-but-not-initialized vdevs, so that
- * if we lose power at any point in this sequence, the remaining
- * steps will be completed the next time we load the pool.
- */
- (void) spa_vdev_exit(spa, vd, txg, 0);
-
- mutex_enter(&spa_namespace_lock);
- spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
- mutex_exit(&spa_namespace_lock);
-
- return (0);
-}
-
-/*
- * Attach a device to a mirror. The arguments are the GUID of any device
- * in the mirror, and the nvroot for the new device. If the named device
- * is not mirrored, we automatically insert the mirror vdev.
- *
- * If 'replacing' is specified, the new device is intended to replace the
- * existing device; in this case the two devices are made into their own
- * mirror using the 'replacing' vdev, which is functionally identical to
- * the mirror vdev (it actually reuses all the same ops) but has a few
- * extra rules: you can't attach to it after it's been created, and upon
- * completion of resilvering, the first disk (the one being replaced)
- * is automatically detached.
- */
-int
-spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
-{
- uint64_t txg, open_txg;
- int error;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
- vdev_ops_t *pvops;
-
- txg = spa_vdev_enter(spa);
-
- oldvd = vdev_lookup_by_guid(rvd, guid);
-
- if (oldvd == NULL)
- return (spa_vdev_exit(spa, NULL, txg, ENODEV));
-
- if (!oldvd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- pvd = oldvd->vdev_parent;
-
- if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
- VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
- return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
-
- newvd = newrootvd->vdev_child[0];
-
- if (!newvd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
-
- if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
- return (spa_vdev_exit(spa, newrootvd, txg, error));
-
- if (!replacing) {
- /*
- * For attach, the only allowable parent is a mirror or the root
- * vdev.
- */
- if (pvd->vdev_ops != &vdev_mirror_ops &&
- pvd->vdev_ops != &vdev_root_ops)
- return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
-
- pvops = &vdev_mirror_ops;
- } else {
- /*
- * Active hot spares can only be replaced by inactive hot
- * spares.
- */
- if (pvd->vdev_ops == &vdev_spare_ops &&
- pvd->vdev_child[1] == oldvd &&
- !spa_has_spare(spa, newvd->vdev_guid))
- return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
-
- /*
- * If the source is a hot spare, and the parent isn't already a
- * spare, then we want to create a new hot spare. Otherwise, we
- * want to create a replacing vdev. The user is not allowed to
- * attach to a spared vdev child unless the 'isspare' state is
- * the same (spare replaces spare, non-spare replaces
- * non-spare).
- */
- if (pvd->vdev_ops == &vdev_replacing_ops)
- return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
- else if (pvd->vdev_ops == &vdev_spare_ops &&
- newvd->vdev_isspare != oldvd->vdev_isspare)
- return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
- else if (pvd->vdev_ops != &vdev_spare_ops &&
- newvd->vdev_isspare)
- pvops = &vdev_spare_ops;
- else
- pvops = &vdev_replacing_ops;
- }
-
- /*
- * Compare the new device size with the replaceable/attachable
- * device size.
- */
- if (newvd->vdev_psize < vdev_get_rsize(oldvd))
- return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
-
- /*
- * The new device cannot have a higher alignment requirement
- * than the top-level vdev.
- */
- if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
- return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
-
- /*
- * If this is an in-place replacement, update oldvd's path and devid
- * to make it distinguishable from newvd, and unopenable from now on.
- */
- if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
- spa_strfree(oldvd->vdev_path);
- oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
- KM_SLEEP);
- (void) sprintf(oldvd->vdev_path, "%s/%s",
- newvd->vdev_path, "old");
- if (oldvd->vdev_devid != NULL) {
- spa_strfree(oldvd->vdev_devid);
- oldvd->vdev_devid = NULL;
- }
- }
-
- /*
- * If the parent is not a mirror, or if we're replacing, insert the new
- * mirror/replacing/spare vdev above oldvd.
- */
- if (pvd->vdev_ops != pvops)
- pvd = vdev_add_parent(oldvd, pvops);
-
- ASSERT(pvd->vdev_top->vdev_parent == rvd);
- ASSERT(pvd->vdev_ops == pvops);
- ASSERT(oldvd->vdev_parent == pvd);
-
- /*
- * Extract the new device from its root and add it to pvd.
- */
- vdev_remove_child(newrootvd, newvd);
- newvd->vdev_id = pvd->vdev_children;
- vdev_add_child(pvd, newvd);
-
- /*
- * If newvd is smaller than oldvd, but larger than its rsize,
- * the addition of newvd may have decreased our parent's asize.
- */
- pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
-
- tvd = newvd->vdev_top;
- ASSERT(pvd->vdev_top == tvd);
- ASSERT(tvd->vdev_parent == rvd);
-
- vdev_config_dirty(tvd);
-
- /*
- * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate
- * upward when spa_vdev_exit() calls vdev_dtl_reassess().
- */
- open_txg = txg + TXG_CONCURRENT_STATES - 1;
-
- mutex_enter(&newvd->vdev_dtl_lock);
- space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
- open_txg - TXG_INITIAL + 1);
- mutex_exit(&newvd->vdev_dtl_lock);
-
- if (newvd->vdev_isspare)
- spa_spare_activate(newvd);
-
- /*
- * Mark newvd's DTL dirty in this txg.
- */
- vdev_dirty(tvd, VDD_DTL, newvd, txg);
-
- (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
-
- /*
- * Kick off a resilver to update newvd.
- */
- VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
-
- return (0);
-}
-
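Editor's note: the DTL setup in spa_vdev_attach() works in closed txg intervals, and space_map_add() takes (start, size), hence the '+ 1'. A hedged restatement of just that step (the wrapper is illustrative):

static void
example_dirty_new_device(vdev_t *newvd, uint64_t open_txg)
{
	/*
	 * Every txg in the closed interval [TXG_INITIAL, open_txg] may
	 * be missing on the new device, and a closed interval [a, b]
	 * contains b - a + 1 txgs.
	 */
	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);
}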
-/*
- * Detach a device from a mirror or replacing vdev.
- * If 'replace_done' is specified, only detach if the parent
- * is a replacing vdev.
- */
-int
-spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
-{
- uint64_t txg;
- int c, t, error;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd, *pvd, *cvd, *tvd;
- boolean_t unspare = B_FALSE;
- uint64_t unspare_guid;
-
- txg = spa_vdev_enter(spa);
-
- vd = vdev_lookup_by_guid(rvd, guid);
-
- if (vd == NULL)
- return (spa_vdev_exit(spa, NULL, txg, ENODEV));
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- pvd = vd->vdev_parent;
-
- /*
- * If replace_done is specified, only remove this device if it's
- * the first child of a replacing vdev. For the 'spare' vdev, either
- * disk can be removed.
- */
- if (replace_done) {
- if (pvd->vdev_ops == &vdev_replacing_ops) {
- if (vd->vdev_id != 0)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- } else if (pvd->vdev_ops != &vdev_spare_ops) {
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- }
- }
-
- ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
- spa_version(spa) >= ZFS_VERSION_SPARES);
-
- /*
- * Only mirror, replacing, and spare vdevs support detach.
- */
- if (pvd->vdev_ops != &vdev_replacing_ops &&
- pvd->vdev_ops != &vdev_mirror_ops &&
- pvd->vdev_ops != &vdev_spare_ops)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- /*
- * If there's only one replica, you can't detach it.
- */
- if (pvd->vdev_children <= 1)
- return (spa_vdev_exit(spa, NULL, txg, EBUSY));
-
- /*
- * If all siblings have non-empty DTLs, this device may have the only
- * valid copy of the data, which means we cannot safely detach it.
- *
- * XXX -- as in the vdev_offline() case, we really want a more
- * precise DTL check.
- */
- for (c = 0; c < pvd->vdev_children; c++) {
- uint64_t dirty;
-
- cvd = pvd->vdev_child[c];
- if (cvd == vd)
- continue;
- if (vdev_is_dead(cvd))
- continue;
- mutex_enter(&cvd->vdev_dtl_lock);
- dirty = cvd->vdev_dtl_map.sm_space |
- cvd->vdev_dtl_scrub.sm_space;
- mutex_exit(&cvd->vdev_dtl_lock);
- if (!dirty)
- break;
- }
-
- /*
- * If we are a replacing or spare vdev, then we can always detach the
- * latter child, as that is how one cancels the operation.
- */
- if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
- c == pvd->vdev_children)
- return (spa_vdev_exit(spa, NULL, txg, EBUSY));
-
- /*
- * If we are detaching the original disk from a spare, then it implies
- * that the spare should become a real disk, and be removed from the
- * active spare list for the pool.
- */
- if (pvd->vdev_ops == &vdev_spare_ops &&
- vd->vdev_id == 0)
- unspare = B_TRUE;
-
- /*
- * Erase the disk labels so the disk can be used for other things.
- * This must be done after all other error cases are handled,
- * but before we disembowel vd (so we can still do I/O to it).
- * But if we can't do it, don't treat the error as fatal --
- * it may be that the unwritability of the disk is the reason
- * it's being detached!
- */
- error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
-
- /*
- * Remove vd from its parent and compact the parent's children.
- */
- vdev_remove_child(pvd, vd);
- vdev_compact_children(pvd);
-
- /*
- * Remember one of the remaining children so we can get tvd below.
- */
- cvd = pvd->vdev_child[0];
-
- /*
- * If we need to remove the remaining child from the list of hot spares,
- * do it now, marking the vdev as no longer a spare in the process. We
- * must do this before vdev_remove_parent(), because that can change the
- * GUID if it creates a new toplevel GUID.
- */
- if (unspare) {
- ASSERT(cvd->vdev_isspare);
- spa_spare_remove(cvd);
- unspare_guid = cvd->vdev_guid;
- }
-
- /*
- * If the parent mirror/replacing vdev only has one child,
- * the parent is no longer needed. Remove it from the tree.
- */
- if (pvd->vdev_children == 1)
- vdev_remove_parent(cvd);
-
- /*
- * We don't set tvd until now because the parent we just removed
- * may have been the previous top-level vdev.
- */
- tvd = cvd->vdev_top;
- ASSERT(tvd->vdev_parent == rvd);
-
- /*
- * Reevaluate the parent vdev state.
- */
- vdev_propagate_state(cvd->vdev_parent);
-
- /*
- * If the device we just detached was smaller than the others, it may be
- * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init()
- * can't fail because the existing metaslabs are already in core, so
- * there's nothing to read from disk.
- */
- VERIFY(vdev_metaslab_init(tvd, txg) == 0);
-
- vdev_config_dirty(tvd);
-
- /*
- * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
- * vd->vdev_detached is set and free vd's DTL object in syncing context.
- * But first make sure we're not on any *other* txg's DTL list, to
- * prevent vd from being accessed after it's freed.
- */
- for (t = 0; t < TXG_SIZE; t++)
- (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
- vd->vdev_detached = B_TRUE;
- vdev_dirty(tvd, VDD_DTL, vd, txg);
-
- error = spa_vdev_exit(spa, vd, txg, 0);
-
- /*
- * If this was the removal of the original device in a hot spare vdev,
- * then we want to go through and remove the device from the hot spare
- * list of every other pool.
- */
- if (unspare) {
- spa = NULL;
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa)) != NULL) {
- if (spa->spa_state != POOL_STATE_ACTIVE)
- continue;
-
- (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
- }
- mutex_exit(&spa_namespace_lock);
- }
-
- return (error);
-}
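
The sibling scan in the middle of spa_vdev_detach() reduces to a simple predicate: the device may only be detached if some other live sibling has a completely empty DTL, i.e. holds a full replica of the data. A stand-alone sketch of that predicate for the plain mirror case; the stub type and names are hypothetical, not ZFS types:

	#include <stdio.h>

	typedef struct child {
		int		dead;	/* as vdev_is_dead() would report */
		unsigned long	dirty;	/* DTL map space + DTL scrub space */
	} child_t;

	/* Nonzero if detaching child 'victim' of 'n' mirror children is safe. */
	static int
	can_detach(const child_t *c, int n, int victim)
	{
		int i;

		for (i = 0; i < n; i++) {
			if (i == victim || c[i].dead)
				continue;
			if (c[i].dirty == 0)
				return (1);	/* a complete replica survives */
		}
		return (0);
	}

	int
	main(void)
	{
		child_t mirror[2] = { { 0, 0 }, { 0, 42 } };

		/* Child 1 may go: child 0 is alive with an empty DTL. */
		printf("detach child 1: %s\n",
		    can_detach(mirror, 2, 1) ? "ok" : "EBUSY");
		return (0);
	}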
-
-/*
- * Remove a device from the pool. Currently, this supports removing only hot
- * spares.
- */
-int
-spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
-{
- vdev_t *vd;
- nvlist_t **spares, *nv, **newspares;
- uint_t i, j, nspares;
- int ret = 0;
-
- spa_config_enter(spa, RW_WRITER, FTAG);
-
- vd = spa_lookup_by_guid(spa, guid);
-
- nv = NULL;
- if (spa->spa_spares != NULL &&
- nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0) {
- for (i = 0; i < nspares; i++) {
- uint64_t theguid;
-
- VERIFY(nvlist_lookup_uint64(spares[i],
- ZPOOL_CONFIG_GUID, &theguid) == 0);
- if (theguid == guid) {
- nv = spares[i];
- break;
- }
- }
- }
-
- /*
- * We only support removing a hot spare, and only if it's not currently
- * in use in this pool.
- */
- if (nv == NULL && vd == NULL) {
- ret = ENOENT;
- goto out;
- }
-
- if (nv == NULL && vd != NULL) {
- ret = ENOTSUP;
- goto out;
- }
-
- if (!unspare && nv != NULL && vd != NULL) {
- ret = EBUSY;
- goto out;
- }
-
- if (nspares == 1) {
- newspares = NULL;
- } else {
- newspares = kmem_alloc((nspares - 1) * sizeof (void *),
- KM_SLEEP);
- for (i = 0, j = 0; i < nspares; i++) {
- if (spares[i] != nv)
- VERIFY(nvlist_dup(spares[i],
- &newspares[j++], KM_SLEEP) == 0);
- }
- }
-
- VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
- DATA_TYPE_NVLIST_ARRAY) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
- newspares, nspares - 1) == 0);
- for (i = 0; i < nspares - 1; i++)
- nvlist_free(newspares[i]);
- kmem_free(newspares, (nspares - 1) * sizeof (void *));
- spa_load_spares(spa);
- spa->spa_sync_spares = B_TRUE;
-
-out:
- spa_config_exit(spa, FTAG);
-
- return (ret);
-}
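
Because an nvlist array cannot be edited in place, dropping one spare means rebuilding the whole ZPOOL_CONFIG_SPARES array without it, which is what the newspares logic above does. The shape of that copy-all-but-one rebuild, sketched with plain strings instead of nvlists (function and variable names are invented for illustration):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Return a new array of n - 1 duplicates of 'arr', skipping 'victim'. */
	static char **
	array_remove(char **arr, size_t n, size_t victim)
	{
		char **out = malloc((n - 1) * sizeof (char *));
		size_t i, j;

		if (out == NULL)
			return (NULL);
		for (i = 0, j = 0; i < n; i++)
			if (i != victim)
				out[j++] = strdup(arr[i]);	/* like nvlist_dup() */
		return (out);
	}

	int
	main(void)
	{
		char *spares[] = { "c1t0d0", "c1t1d0", "c1t2d0" };
		char **left = array_remove(spares, 3, 1);

		if (left != NULL)
			printf("%s %s\n", left[0], left[1]);	/* c1t0d0 c1t2d0 */
		return (0);
	}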
-
-/*
- * Find any device that's done replacing, so we can detach it.
- */
-static vdev_t *
-spa_vdev_replace_done_hunt(vdev_t *vd)
-{
- vdev_t *newvd, *oldvd;
- int c;
-
- for (c = 0; c < vd->vdev_children; c++) {
- oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
- if (oldvd != NULL)
- return (oldvd);
- }
-
- if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
- oldvd = vd->vdev_child[0];
- newvd = vd->vdev_child[1];
-
- mutex_enter(&newvd->vdev_dtl_lock);
- if (newvd->vdev_dtl_map.sm_space == 0 &&
- newvd->vdev_dtl_scrub.sm_space == 0) {
- mutex_exit(&newvd->vdev_dtl_lock);
- return (oldvd);
- }
- mutex_exit(&newvd->vdev_dtl_lock);
- }
-
- return (NULL);
-}
-
-static void
-spa_vdev_replace_done(spa_t *spa)
-{
- vdev_t *vd;
- vdev_t *pvd;
- uint64_t guid;
- uint64_t pguid = 0;
-
- spa_config_enter(spa, RW_READER, FTAG);
-
- while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
- guid = vd->vdev_guid;
- /*
- * If we have just finished replacing a hot spared device, then
- * we need to detach the parent's first child (the original hot
- * spare) as well.
- */
- pvd = vd->vdev_parent;
- if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
- pvd->vdev_id == 0) {
- ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
- ASSERT(pvd->vdev_parent->vdev_children == 2);
- pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
- }
- spa_config_exit(spa, FTAG);
- if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
- return;
- if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
- return;
- spa_config_enter(spa, RW_READER, FTAG);
- }
-
- spa_config_exit(spa, FTAG);
-}
-
-/*
- * Update the stored path for this vdev. Dirty the vdev configuration, relying
- * on spa_vdev_enter/exit() to synchronize the labels and cache.
- */
-int
-spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
-{
- vdev_t *rvd, *vd;
- uint64_t txg;
-
- rvd = spa->spa_root_vdev;
-
- txg = spa_vdev_enter(spa);
-
- if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
- /*
- * Determine if this is a reference to a hot spare. In that
- * case, update the path as stored in the spare list.
- */
- nvlist_t **spares;
- uint_t i, nspares;
- if (spa->spa_sparelist != NULL) {
- VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
- for (i = 0; i < nspares; i++) {
- uint64_t theguid;
- VERIFY(nvlist_lookup_uint64(spares[i],
- ZPOOL_CONFIG_GUID, &theguid) == 0);
- if (theguid == guid)
- break;
- }
-
- if (i == nspares)
- return (spa_vdev_exit(spa, NULL, txg, ENOENT));
-
- VERIFY(nvlist_add_string(spares[i],
- ZPOOL_CONFIG_PATH, newpath) == 0);
- spa_load_spares(spa);
- spa->spa_sync_spares = B_TRUE;
- return (spa_vdev_exit(spa, NULL, txg, 0));
- } else {
- return (spa_vdev_exit(spa, NULL, txg, ENOENT));
- }
- }
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- spa_strfree(vd->vdev_path);
- vd->vdev_path = spa_strdup(newpath);
-
- vdev_config_dirty(vd->vdev_top);
-
- return (spa_vdev_exit(spa, NULL, txg, 0));
-}
-
-/*
- * ==========================================================================
- * SPA Scrubbing
- * ==========================================================================
- */
-
-static void
-spa_scrub_io_done(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
-
- zio_data_buf_free(zio->io_data, zio->io_size);
-
- mutex_enter(&spa->spa_scrub_lock);
- if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
- spa->spa_scrub_errors++;
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_scrub_errors++;
- mutex_exit(&vd->vdev_stat_lock);
- }
-
- if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
- cv_broadcast(&spa->spa_scrub_io_cv);
-
- ASSERT(spa->spa_scrub_inflight >= 0);
-
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-static void
-spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
- zbookmark_t *zb)
-{
- size_t size = BP_GET_LSIZE(bp);
- void *data;
-
- mutex_enter(&spa->spa_scrub_lock);
- /*
-	 * Don't queue too much scrub I/O to the vdevs at once.
- */
- while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- }
- spa->spa_scrub_inflight++;
- mutex_exit(&spa->spa_scrub_lock);
-
- data = zio_data_buf_alloc(size);
-
- if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
- flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */
-
- flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
-
- zio_nowait(zio_read(NULL, spa, bp, data, size,
- spa_scrub_io_done, NULL, priority, flags, zb));
-}
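
spa_scrub_io_start() and spa_scrub_io_done() together form a classic condition-variable throttle: the issuing side blocks while spa_scrub_inflight is at the cap, and the completion side wakes it once the count drops back below spa_scrub_maxinflight. A minimal userland rendering of the same pattern, assuming POSIX threads; the throttle_t type and all names are hypothetical:

	#include <pthread.h>

	/* Hypothetical stand-ins for spa_scrub_lock, spa_scrub_io_cv, etc. */
	typedef struct throttle {
		pthread_mutex_t	lock;
		pthread_cond_t	cv;
		int		inflight;
		int		maxinflight;
	} throttle_t;

	static throttle_t scrub_throttle = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 8
	};

	/* Issue side: block while the pipeline is full, then claim a slot. */
	static void
	throttle_enter(throttle_t *t)
	{
		(void) pthread_mutex_lock(&t->lock);
		while (t->inflight >= t->maxinflight)
			(void) pthread_cond_wait(&t->cv, &t->lock);
		t->inflight++;
		(void) pthread_mutex_unlock(&t->lock);
	}

	/* Completion side: release the slot, wake waiters below the cap. */
	static void
	throttle_exit(throttle_t *t)
	{
		(void) pthread_mutex_lock(&t->lock);
		if (--t->inflight < t->maxinflight)
			(void) pthread_cond_broadcast(&t->cv);
		(void) pthread_mutex_unlock(&t->lock);
	}

Usage mirrors the code above: throttle_enter() before issuing the read, throttle_exit() from the I/O completion callback.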
-
-/* ARGSUSED */
-static int
-spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
-{
- blkptr_t *bp = &bc->bc_blkptr;
- vdev_t *vd = spa->spa_root_vdev;
- dva_t *dva = bp->blk_dva;
- int needs_resilver = B_FALSE;
- int d;
-
- if (bc->bc_errno) {
- /*
- * We can't scrub this block, but we can continue to scrub
- * the rest of the pool. Note the error and move along.
- */
- mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_errors++;
- mutex_exit(&spa->spa_scrub_lock);
-
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_scrub_errors++;
- mutex_exit(&vd->vdev_stat_lock);
-
- return (ERESTART);
- }
-
- ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
-
- for (d = 0; d < BP_GET_NDVAS(bp); d++) {
- vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
-
- ASSERT(vd != NULL);
-
- /*
- * Keep track of how much data we've examined so that
- * zpool(1M) status can make useful progress reports.
- */
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
- mutex_exit(&vd->vdev_stat_lock);
-
- if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
- if (DVA_GET_GANG(&dva[d])) {
- /*
- * Gang members may be spread across multiple
- * vdevs, so the best we can do is look at the
- * pool-wide DTL.
- * XXX -- it would be better to change our
- * allocation policy to ensure that this can't
- * happen.
- */
- vd = spa->spa_root_vdev;
- }
- if (vdev_dtl_contains(&vd->vdev_dtl_map,
- bp->blk_birth, 1))
- needs_resilver = B_TRUE;
- }
- }
-
- if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
- spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
- ZIO_FLAG_SCRUB, &bc->bc_bookmark);
- else if (needs_resilver)
- spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
- ZIO_FLAG_RESILVER, &bc->bc_bookmark);
-
- return (0);
-}
-
-static void
-spa_scrub_thread(void *arg)
-{
- spa_t *spa = arg;
- callb_cpr_t cprinfo;
- traverse_handle_t *th = spa->spa_scrub_th;
- vdev_t *rvd = spa->spa_root_vdev;
- pool_scrub_type_t scrub_type = spa->spa_scrub_type;
- int error = 0;
- boolean_t complete;
-
- CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
-
- /*
- * If we're restarting due to a snapshot create/delete,
- * wait for that to complete.
- */
- txg_wait_synced(spa_get_dsl(spa), 0);
-
- dprintf("start %s mintxg=%llu maxtxg=%llu\n",
- scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
- spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
-
- spa_config_enter(spa, RW_WRITER, FTAG);
- vdev_reopen(rvd); /* purge all vdev caches */
- vdev_config_dirty(rvd); /* rewrite all disk labels */
- vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
- spa_config_exit(spa, FTAG);
-
- mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_errors = 0;
- spa->spa_scrub_active = 1;
- ASSERT(spa->spa_scrub_inflight == 0);
-
- while (!spa->spa_scrub_stop) {
- CALLB_CPR_SAFE_BEGIN(&cprinfo);
- while (spa->spa_scrub_suspended) {
- spa->spa_scrub_active = 0;
- cv_broadcast(&spa->spa_scrub_cv);
- cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
- spa->spa_scrub_active = 1;
- }
- CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
-
- if (spa->spa_scrub_restart_txg != 0)
- break;
-
- mutex_exit(&spa->spa_scrub_lock);
- error = traverse_more(th);
- mutex_enter(&spa->spa_scrub_lock);
- if (error != EAGAIN)
- break;
- }
-
- while (spa->spa_scrub_inflight)
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-
- spa->spa_scrub_active = 0;
- cv_broadcast(&spa->spa_scrub_cv);
-
- mutex_exit(&spa->spa_scrub_lock);
-
- spa_config_enter(spa, RW_WRITER, FTAG);
-
- mutex_enter(&spa->spa_scrub_lock);
-
- /*
- * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
- * AND the spa config lock to synchronize with any config changes
- * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
- */
- if (spa->spa_scrub_restart_txg != 0)
- error = ERESTART;
-
- if (spa->spa_scrub_stop)
- error = EINTR;
-
- /*
- * Even if there were uncorrectable errors, we consider the scrub
- * completed. The downside is that if there is a transient error during
- * a resilver, we won't resilver the data properly to the target. But
- * if the damage is permanent (more likely) we will resilver forever,
- * which isn't really acceptable. Since there is enough information for
- * the user to know what has failed and why, this seems like a more
- * tractable approach.
- */
- complete = (error == 0);
-
- dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
- scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
- spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
- error, spa->spa_scrub_errors, spa->spa_scrub_stop);
-
- mutex_exit(&spa->spa_scrub_lock);
-
- /*
- * If the scrub/resilver completed, update all DTLs to reflect this.
- * Whether it succeeded or not, vacate all temporary scrub DTLs.
- */
- vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
- complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
- vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
- spa_errlog_rotate(spa);
-
- spa_config_exit(spa, FTAG);
-
- mutex_enter(&spa->spa_scrub_lock);
-
- /*
- * We may have finished replacing a device.
- * Let the async thread assess this and handle the detach.
- */
- spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
-
- /*
- * If we were told to restart, our final act is to start a new scrub.
- */
- if (error == ERESTART)
- spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
- SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
-
- spa->spa_scrub_type = POOL_SCRUB_NONE;
- spa->spa_scrub_active = 0;
- spa->spa_scrub_thread = NULL;
- cv_broadcast(&spa->spa_scrub_cv);
- CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */
- thread_exit();
-}
-
-void
-spa_scrub_suspend(spa_t *spa)
-{
- mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_suspended++;
- while (spa->spa_scrub_active) {
- cv_broadcast(&spa->spa_scrub_cv);
- cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
- }
- while (spa->spa_scrub_inflight)
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-void
-spa_scrub_resume(spa_t *spa)
-{
- mutex_enter(&spa->spa_scrub_lock);
- ASSERT(spa->spa_scrub_suspended != 0);
- if (--spa->spa_scrub_suspended == 0)
- cv_broadcast(&spa->spa_scrub_cv);
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-void
-spa_scrub_restart(spa_t *spa, uint64_t txg)
-{
- /*
- * Something happened (e.g. snapshot create/delete) that means
- * we must restart any in-progress scrubs. The itinerary will
- * fix this properly.
- */
- mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_restart_txg = txg;
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-int
-spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
-{
- space_seg_t *ss;
- uint64_t mintxg, maxtxg;
- vdev_t *rvd = spa->spa_root_vdev;
-
- if ((uint_t)type >= POOL_SCRUB_TYPES)
- return (ENOTSUP);
-
- mutex_enter(&spa->spa_scrub_lock);
-
- /*
- * If there's a scrub or resilver already in progress, stop it.
- */
- while (spa->spa_scrub_thread != NULL) {
- /*
- * Don't stop a resilver unless forced.
- */
- if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
- mutex_exit(&spa->spa_scrub_lock);
- return (EBUSY);
- }
- spa->spa_scrub_stop = 1;
- cv_broadcast(&spa->spa_scrub_cv);
- cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
- }
-
- /*
- * Terminate the previous traverse.
- */
- if (spa->spa_scrub_th != NULL) {
- traverse_fini(spa->spa_scrub_th);
- spa->spa_scrub_th = NULL;
- }
-
- if (rvd == NULL) {
- ASSERT(spa->spa_scrub_stop == 0);
- ASSERT(spa->spa_scrub_type == type);
- ASSERT(spa->spa_scrub_restart_txg == 0);
- mutex_exit(&spa->spa_scrub_lock);
- return (0);
- }
-
- mintxg = TXG_INITIAL - 1;
- maxtxg = spa_last_synced_txg(spa) + 1;
-
- mutex_enter(&rvd->vdev_dtl_lock);
-
- if (rvd->vdev_dtl_map.sm_space == 0) {
- /*
- * The pool-wide DTL is empty.
- * If this is a resilver, there's nothing to do except
- * check whether any in-progress replacements have completed.
- */
- if (type == POOL_SCRUB_RESILVER) {
- type = POOL_SCRUB_NONE;
- spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
- }
- } else {
- /*
- * The pool-wide DTL is non-empty.
- * If this is a normal scrub, upgrade to a resilver instead.
- */
- if (type == POOL_SCRUB_EVERYTHING)
- type = POOL_SCRUB_RESILVER;
- }
-
- if (type == POOL_SCRUB_RESILVER) {
- /*
- * Determine the resilvering boundaries.
- *
- * Note: (mintxg, maxtxg) is an open interval,
- * i.e. mintxg and maxtxg themselves are not included.
- *
- * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
- * so we don't claim to resilver a txg that's still changing.
- */
- ss = avl_first(&rvd->vdev_dtl_map.sm_root);
- mintxg = ss->ss_start - 1;
- ss = avl_last(&rvd->vdev_dtl_map.sm_root);
- maxtxg = MIN(ss->ss_end, maxtxg);
- }
-
- mutex_exit(&rvd->vdev_dtl_lock);
-
- spa->spa_scrub_stop = 0;
- spa->spa_scrub_type = type;
- spa->spa_scrub_restart_txg = 0;
-
- if (type != POOL_SCRUB_NONE) {
- spa->spa_scrub_mintxg = mintxg;
- spa->spa_scrub_maxtxg = maxtxg;
- spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
- ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
- ZIO_FLAG_CANFAIL);
- traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
- spa->spa_scrub_thread = thread_create(NULL, 0,
- spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
- }
-
- mutex_exit(&spa->spa_scrub_lock);
-
- return (0);
-}
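
To make the resilver boundary arithmetic concrete: if the pool-wide DTL holds segments [100, 150) and [200, 220) and the last synced txg is 300, then mintxg = 100 - 1 = 99 and maxtxg = min(220, 301) = 220; the open interval (99, 220) therefore covers txgs 100 through 219. A sketch of that computation, with a plain sorted array standing in for the AVL-ordered space map:

	#include <stdio.h>
	#include <stdint.h>

	#define	MIN(a, b)	((a) < (b) ? (a) : (b))

	typedef struct seg { uint64_t start, end; } seg_t;	/* [start, end) */

	int
	main(void)
	{
		/* Hypothetical pool-wide DTL segments, sorted by txg. */
		seg_t dtl[] = { { 100, 150 }, { 200, 220 } };
		uint64_t last_synced = 300;

		/* Open interval (mintxg, maxtxg), as in spa_scrub() above. */
		uint64_t mintxg = dtl[0].start - 1;
		uint64_t maxtxg = MIN(dtl[1].end, last_synced + 1);

		printf("resilver txgs %llu..%llu\n",
		    (unsigned long long)(mintxg + 1),
		    (unsigned long long)(maxtxg - 1));
		return (0);
	}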
-
-/*
- * ==========================================================================
- * SPA async task processing
- * ==========================================================================
- */
-
-static void
-spa_async_reopen(spa_t *spa)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *tvd;
- int c;
-
- spa_config_enter(spa, RW_WRITER, FTAG);
-
- for (c = 0; c < rvd->vdev_children; c++) {
- tvd = rvd->vdev_child[c];
- if (tvd->vdev_reopen_wanted) {
- tvd->vdev_reopen_wanted = 0;
- vdev_reopen(tvd);
- }
- }
-
- spa_config_exit(spa, FTAG);
-}
-
-static void
-spa_async_thread(void *arg)
-{
- spa_t *spa = arg;
- int tasks;
-
- ASSERT(spa->spa_sync_on);
-
- mutex_enter(&spa->spa_async_lock);
- tasks = spa->spa_async_tasks;
- spa->spa_async_tasks = 0;
- mutex_exit(&spa->spa_async_lock);
-
- /*
- * See if the config needs to be updated.
- */
- if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
- mutex_enter(&spa_namespace_lock);
- spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
- mutex_exit(&spa_namespace_lock);
- }
-
- /*
- * See if any devices need to be reopened.
- */
- if (tasks & SPA_ASYNC_REOPEN)
- spa_async_reopen(spa);
-
- /*
- * If any devices are done replacing, detach them.
- */
- if (tasks & SPA_ASYNC_REPLACE_DONE)
- spa_vdev_replace_done(spa);
-
- /*
- * Kick off a scrub.
- */
- if (tasks & SPA_ASYNC_SCRUB)
- VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
-
- /*
- * Kick off a resilver.
- */
- if (tasks & SPA_ASYNC_RESILVER)
- VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
-
- /*
- * Let the world know that we're done.
- */
- mutex_enter(&spa->spa_async_lock);
- spa->spa_async_thread = NULL;
- cv_broadcast(&spa->spa_async_cv);
- mutex_exit(&spa->spa_async_lock);
- thread_exit();
-}
-
-void
-spa_async_suspend(spa_t *spa)
-{
- mutex_enter(&spa->spa_async_lock);
- spa->spa_async_suspended++;
- while (spa->spa_async_thread != NULL)
- cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
- mutex_exit(&spa->spa_async_lock);
-}
-
-void
-spa_async_resume(spa_t *spa)
-{
- mutex_enter(&spa->spa_async_lock);
- ASSERT(spa->spa_async_suspended != 0);
- spa->spa_async_suspended--;
- mutex_exit(&spa->spa_async_lock);
-}
-
-static void
-spa_async_dispatch(spa_t *spa)
-{
- mutex_enter(&spa->spa_async_lock);
- if (spa->spa_async_tasks && !spa->spa_async_suspended &&
- spa->spa_async_thread == NULL &&
- rootdir != NULL && !vn_is_readonly(rootdir))
- spa->spa_async_thread = thread_create(NULL, 0,
- spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
- mutex_exit(&spa->spa_async_lock);
-}
-
-void
-spa_async_request(spa_t *spa, int task)
-{
- mutex_enter(&spa->spa_async_lock);
- spa->spa_async_tasks |= task;
- mutex_exit(&spa->spa_async_lock);
-}
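
The async machinery is a bitmask of pending tasks plus an on-demand worker: spa_async_request() ORs bits in under spa_async_lock, and spa_async_dispatch() (invoked from spa_sync()) spawns a thread only when there is work, nothing is suspended, and no worker is already running. A compressed userland sketch of the same pattern, assuming POSIX threads and folding dispatch into the request path for brevity; all names are hypothetical:

	#include <pthread.h>
	#include <stdio.h>

	#define	TASK_SCRUB	0x1
	#define	TASK_RESILVER	0x2

	static pthread_mutex_t	async_lock = PTHREAD_MUTEX_INITIALIZER;
	static int		async_tasks;
	static int		async_running;

	static void *
	async_worker(void *arg)
	{
		int tasks;

		(void) arg;
		(void) pthread_mutex_lock(&async_lock);
		tasks = async_tasks;	/* snapshot and clear, as above */
		async_tasks = 0;
		(void) pthread_mutex_unlock(&async_lock);

		if (tasks & TASK_SCRUB)
			printf("scrub requested\n");
		if (tasks & TASK_RESILVER)
			printf("resilver requested\n");

		(void) pthread_mutex_lock(&async_lock);
		async_running = 0;
		(void) pthread_mutex_unlock(&async_lock);
		return (NULL);
	}

	static void
	async_request(int task)
	{
		pthread_t tid;

		(void) pthread_mutex_lock(&async_lock);
		async_tasks |= task;
		if (!async_running) {	/* one worker at a time */
			async_running = 1;
			(void) pthread_create(&tid, NULL, async_worker, NULL);
			(void) pthread_detach(tid);
		}
		(void) pthread_mutex_unlock(&async_lock);
	}

Note that this sketch can strand a request that arrives after the worker's snapshot; the real code avoids that by re-running spa_async_dispatch() from every spa_sync().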
-
-/*
- * ==========================================================================
- * SPA syncing routines
- * ==========================================================================
- */
-
-static void
-spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
-{
- bplist_t *bpl = &spa->spa_sync_bplist;
- dmu_tx_t *tx;
- blkptr_t blk;
- uint64_t itor = 0;
- zio_t *zio;
- int error;
- uint8_t c = 1;
-
- zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
-
- while (bplist_iterate(bpl, &itor, &blk) == 0)
- zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
-
- error = zio_wait(zio);
- ASSERT3U(error, ==, 0);
-
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
- bplist_vacate(bpl, tx);
-
- /*
- * Pre-dirty the first block so we sync to convergence faster.
- * (Usually only the first block is needed.)
- */
- dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
- dmu_tx_commit(tx);
-}
-
-static void
-spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
-{
- char *packed = NULL;
- size_t nvsize = 0;
- dmu_buf_t *db;
-
- VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
-
- packed = kmem_alloc(nvsize, KM_SLEEP);
-
- VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
- KM_SLEEP) == 0);
-
- dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
-
- kmem_free(packed, nvsize);
-
- VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
- dmu_buf_will_dirty(db, tx);
- *(uint64_t *)db->db_data = nvsize;
- dmu_buf_rele(db, FTAG);
-}
-
-static void
-spa_sync_spares(spa_t *spa, dmu_tx_t *tx)
-{
- nvlist_t *nvroot;
- nvlist_t **spares;
- int i;
-
- if (!spa->spa_sync_spares)
- return;
-
- /*
- * Update the MOS nvlist describing the list of available spares.
- * spa_validate_spares() will have already made sure this nvlist is
- * valid and the vdevs are labelled appropriately.
- */
- if (spa->spa_spares_object == 0) {
- spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
- DMU_OT_PACKED_NVLIST, 1 << 14,
- DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
- VERIFY(zap_update(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
- sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
- }
-
- VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- if (spa->spa_nspares == 0) {
- VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- NULL, 0) == 0);
- } else {
- spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
- KM_SLEEP);
- for (i = 0; i < spa->spa_nspares; i++)
- spares[i] = vdev_config_generate(spa,
- spa->spa_spares[i], B_FALSE, B_TRUE);
- VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- spares, spa->spa_nspares) == 0);
- for (i = 0; i < spa->spa_nspares; i++)
- nvlist_free(spares[i]);
- kmem_free(spares, spa->spa_nspares * sizeof (void *));
- }
-
- spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
- nvlist_free(nvroot);
-
- spa->spa_sync_spares = B_FALSE;
-}
-
-static void
-spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
-{
- nvlist_t *config;
-
- if (list_is_empty(&spa->spa_dirty_list))
- return;
-
- config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
-
- if (spa->spa_config_syncing)
- nvlist_free(spa->spa_config_syncing);
- spa->spa_config_syncing = config;
-
- spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
-}
-
-static void
-spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- spa_t *spa = arg1;
- nvlist_t *nvp = arg2;
- nvpair_t *nvpair;
- objset_t *mos = spa->spa_meta_objset;
- uint64_t zapobj;
-
- mutex_enter(&spa->spa_props_lock);
- if (spa->spa_pool_props_object == 0) {
- zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx);
- VERIFY(zapobj > 0);
-
- spa->spa_pool_props_object = zapobj;
-
- VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_PROPS, 8, 1,
- &spa->spa_pool_props_object, tx) == 0);
- }
- mutex_exit(&spa->spa_props_lock);
-
- nvpair = NULL;
- while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) {
- switch (zpool_name_to_prop(nvpair_name(nvpair))) {
- case ZFS_PROP_BOOTFS:
- VERIFY(nvlist_lookup_uint64(nvp,
- nvpair_name(nvpair), &spa->spa_bootfs) == 0);
- VERIFY(zap_update(mos,
- spa->spa_pool_props_object,
- zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1,
- &spa->spa_bootfs, tx) == 0);
- break;
- }
- }
-}
-
-/*
- * Sync the specified transaction group. New blocks may be dirtied as
- * part of the process, so we iterate until it converges.
- */
-void
-spa_sync(spa_t *spa, uint64_t txg)
-{
- dsl_pool_t *dp = spa->spa_dsl_pool;
- objset_t *mos = spa->spa_meta_objset;
- bplist_t *bpl = &spa->spa_sync_bplist;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd;
- dmu_tx_t *tx;
- int dirty_vdevs;
-
- /*
- * Lock out configuration changes.
- */
- spa_config_enter(spa, RW_READER, FTAG);
-
- spa->spa_syncing_txg = txg;
- spa->spa_sync_pass = 0;
-
- VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
-
- tx = dmu_tx_create_assigned(dp, txg);
-
- /*
- * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg,
- * set spa_deflate if we have no raid-z vdevs.
- */
- if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
- spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
- int i;
-
- for (i = 0; i < rvd->vdev_children; i++) {
- vd = rvd->vdev_child[i];
- if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
- break;
- }
- if (i == rvd->vdev_children) {
- spa->spa_deflate = TRUE;
- VERIFY(0 == zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
- sizeof (uint64_t), 1, &spa->spa_deflate, tx));
- }
- }
-
- /*
- * If anything has changed in this txg, push the deferred frees
- * from the previous txg. If not, leave them alone so that we
- * don't generate work on an otherwise idle system.
- */
- if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
- !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
- !txg_list_empty(&dp->dp_sync_tasks, txg))
- spa_sync_deferred_frees(spa, txg);
-
- /*
- * Iterate to convergence.
- */
- do {
- spa->spa_sync_pass++;
-
- spa_sync_config_object(spa, tx);
- spa_sync_spares(spa, tx);
- spa_errlog_sync(spa, txg);
- dsl_pool_sync(dp, txg);
-
- dirty_vdevs = 0;
- while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
- vdev_sync(vd, txg);
- dirty_vdevs++;
- }
-
- bplist_sync(bpl, tx);
- } while (dirty_vdevs);
-
- bplist_close(bpl);
-
- dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
-
- /*
- * Rewrite the vdev configuration (which includes the uberblock)
- * to commit the transaction group.
- *
- * If there are any dirty vdevs, sync the uberblock to all vdevs.
- * Otherwise, pick a random top-level vdev that's known to be
- * visible in the config cache (see spa_vdev_add() for details).
-	 * If the write fails, try the next vdev until we've tried them all.
- */
- if (!list_is_empty(&spa->spa_dirty_list)) {
- VERIFY(vdev_config_sync(rvd, txg) == 0);
- } else {
- int children = rvd->vdev_children;
- int c0 = spa_get_random(children);
- int c;
-
- for (c = 0; c < children; c++) {
- vd = rvd->vdev_child[(c0 + c) % children];
- if (vd->vdev_ms_array == 0)
- continue;
- if (vdev_config_sync(vd, txg) == 0)
- break;
- }
- if (c == children)
- VERIFY(vdev_config_sync(rvd, txg) == 0);
- }
-
- dmu_tx_commit(tx);
-
- /*
- * Clear the dirty config list.
- */
- while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
- vdev_config_clean(vd);
-
- /*
- * Now that the new config has synced transactionally,
- * let it become visible to the config cache.
- */
- if (spa->spa_config_syncing != NULL) {
- spa_config_set(spa, spa->spa_config_syncing);
- spa->spa_config_txg = txg;
- spa->spa_config_syncing = NULL;
- }
-
- /*
- * Make a stable copy of the fully synced uberblock.
- * We use this as the root for pool traversals.
- */
- spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */
-
- spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */
-
- rw_enter(&spa->spa_traverse_lock, RW_WRITER);
- spa->spa_traverse_wanted = 0;
- spa->spa_ubsync = spa->spa_uberblock;
- rw_exit(&spa->spa_traverse_lock);
-
- spa_scrub_resume(spa); /* resume scrub with new ubsync */
-
- /*
- * Clean up the ZIL records for the synced txg.
- */
- dsl_pool_zil_clean(dp);
-
- /*
- * Update usable space statistics.
- */
- while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
- vdev_sync_done(vd, txg);
-
- /*
- * It had better be the case that we didn't dirty anything
- * since vdev_config_sync().
- */
- ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
- ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
- ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
- ASSERT(bpl->bpl_queue == NULL);
-
- spa_config_exit(spa, FTAG);
-
- /*
- * If any async tasks have been requested, kick them off.
- */
- spa_async_dispatch(spa);
-}
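
The do/while in the middle of spa_sync() is a fixed-point loop: writing out dirty state can itself dirty more state (syncing a dataset dirties space maps, which dirties vdev configs, and so on), so the loop repeats until a pass completes with nothing left on the per-txg dirty list. A toy model of that iterate-to-convergence shape; the halving "fallout" is invented purely so the loop terminates visibly:

	#include <stdio.h>

	int
	main(void)
	{
		int dirty = 5;	/* items dirtied before this sync */
		int pass = 0;

		/* Each pass writes everything dirty; writing may dirty more. */
		do {
			int wrote = dirty;
			pass++;
			dirty = wrote / 2;	/* hypothetical fallout */
			printf("pass %d: wrote %d, %d newly dirty\n",
			    pass, wrote, dirty);
		} while (dirty > 0);
		return (0);
	}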
-
-/*
- * Sync all pools. We don't want to hold the namespace lock across these
- * operations, so we take a reference on the spa_t and drop the lock during the
- * sync.
- */
-void
-spa_sync_allpools(void)
-{
- spa_t *spa = NULL;
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa)) != NULL) {
- if (spa_state(spa) != POOL_STATE_ACTIVE)
- continue;
- spa_open_ref(spa, FTAG);
- mutex_exit(&spa_namespace_lock);
- txg_wait_synced(spa_get_dsl(spa), 0);
- mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
- }
- mutex_exit(&spa_namespace_lock);
-}
-
-/*
- * ==========================================================================
- * Miscellaneous routines
- * ==========================================================================
- */
-
-/*
- * Remove all pools in the system.
- */
-void
-spa_evict_all(void)
-{
- spa_t *spa;
-
- /*
- * Remove all cached state. All pools should be closed now,
- * so every spa in the AVL tree should be unreferenced.
- */
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(NULL)) != NULL) {
- /*
- * Stop async tasks. The async thread may need to detach
- * a device that's been replaced, which requires grabbing
- * spa_namespace_lock, so we must drop it here.
- */
- spa_open_ref(spa, FTAG);
- mutex_exit(&spa_namespace_lock);
- spa_async_suspend(spa);
- VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
- mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
-
- if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
- spa_unload(spa);
- spa_deactivate(spa);
- }
- spa_remove(spa);
- }
- mutex_exit(&spa_namespace_lock);
-}
-
-vdev_t *
-spa_lookup_by_guid(spa_t *spa, uint64_t guid)
-{
- return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
-}
-
-void
-spa_upgrade(spa_t *spa)
-{
- spa_config_enter(spa, RW_WRITER, FTAG);
-
- /*
- * This should only be called for a non-faulted pool, and since a
- * future version would result in an unopenable pool, this shouldn't be
- * possible.
- */
- ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);
-
- spa->spa_uberblock.ub_version = ZFS_VERSION;
- vdev_config_dirty(spa->spa_root_vdev);
-
- spa_config_exit(spa, FTAG);
-
- txg_wait_synced(spa_get_dsl(spa), 0);
-}
-
-boolean_t
-spa_has_spare(spa_t *spa, uint64_t guid)
-{
- int i;
- uint64_t spareguid;
-
- for (i = 0; i < spa->spa_nspares; i++)
- if (spa->spa_spares[i]->vdev_guid == guid)
- return (B_TRUE);
-
- for (i = 0; i < spa->spa_pending_nspares; i++) {
- if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
- ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
- spareguid == guid)
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-int
-spa_set_props(spa_t *spa, nvlist_t *nvp)
-{
- return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
- spa, nvp, 3));
-}
-
-int
-spa_get_props(spa_t *spa, nvlist_t **nvp)
-{
- zap_cursor_t zc;
- zap_attribute_t za;
- objset_t *mos = spa->spa_meta_objset;
- zfs_source_t src;
- zfs_prop_t prop;
- nvlist_t *propval;
- uint64_t value;
- int err;
-
- VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- mutex_enter(&spa->spa_props_lock);
-	/* If no props object, then just return an empty nvlist */
- if (spa->spa_pool_props_object == 0) {
- mutex_exit(&spa->spa_props_lock);
- return (0);
- }
-
- for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
- (err = zap_cursor_retrieve(&zc, &za)) == 0;
- zap_cursor_advance(&zc)) {
-
- if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL)
- continue;
-
- VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- switch (za.za_integer_length) {
- case 8:
- if (zfs_prop_default_numeric(prop) ==
- za.za_first_integer)
- src = ZFS_SRC_DEFAULT;
- else
- src = ZFS_SRC_LOCAL;
- value = za.za_first_integer;
-
- if (prop == ZFS_PROP_BOOTFS) {
- dsl_pool_t *dp;
- dsl_dataset_t *ds = NULL;
- char strval[MAXPATHLEN];
-
- dp = spa_get_dsl(spa);
- rw_enter(&dp->dp_config_rwlock, RW_READER);
- if ((err = dsl_dataset_open_obj(dp,
- za.za_first_integer, NULL, DS_MODE_NONE,
- FTAG, &ds)) != 0) {
- rw_exit(&dp->dp_config_rwlock);
- break;
- }
- dsl_dataset_name(ds, strval);
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
- rw_exit(&dp->dp_config_rwlock);
-
- VERIFY(nvlist_add_uint64(propval,
- ZFS_PROP_SOURCE, src) == 0);
- VERIFY(nvlist_add_string(propval,
- ZFS_PROP_VALUE, strval) == 0);
- } else {
- VERIFY(nvlist_add_uint64(propval,
- ZFS_PROP_SOURCE, src) == 0);
- VERIFY(nvlist_add_uint64(propval,
- ZFS_PROP_VALUE, value) == 0);
- }
- VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
- propval) == 0);
- break;
- }
- nvlist_free(propval);
- }
- zap_cursor_fini(&zc);
- mutex_exit(&spa->spa_props_lock);
- if (err && err != ENOENT) {
- nvlist_free(*nvp);
- return (err);
- }
-
- return (0);
-}
-
-/*
- * If the bootfs property value is dsobj, clear it.
- */
-void
-spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
-{
- if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
- VERIFY(zap_remove(spa->spa_meta_objset,
- spa->spa_pool_props_object,
- zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0);
- spa->spa_bootfs = 0;
- }
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
deleted file mode 100644
index 9e8bcf3..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
+++ /dev/null
@@ -1,375 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/nvpair.h>
-#include <sys/uio.h>
-#include <sys/fs/zfs.h>
-#include <sys/vdev_impl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/utsname.h>
-#include <sys/sunddi.h>
-#ifdef _KERNEL
-#include <sys/kobj.h>
-#endif
-
-/*
- * Pool configuration repository.
- *
- * The configuration for all pools, in addition to being stored on disk, is
- * stored in /etc/zfs/zpool.cache as a packed nvlist. The kernel maintains
- * this list as pools are created, destroyed, or modified.
- *
- * We have a single nvlist which holds all the configuration information. When
- * the module loads, we read this information from the cache and populate the
- * SPA namespace. This namespace is maintained independently in spa.c.
- * Whenever the namespace is modified, or the configuration of a pool is
- * changed, we call spa_config_sync(), which walks through all the active pools
- * and writes the configuration to disk.
- */
-
-static uint64_t spa_config_generation = 1;
-
-/*
- * This can be overridden in userland to preserve an alternate namespace for
- * userland pools when doing testing.
- */
-const char *spa_config_dir = ZPOOL_CACHE_DIR;
-
-/*
- * Called when the module is first loaded, this routine loads the configuration
- * file into the SPA namespace. It does not actually open or load the pools; it
- * only populates the namespace.
- */
-void
-spa_config_load(void)
-{
- void *buf = NULL;
- nvlist_t *nvlist, *child;
- nvpair_t *nvpair;
- spa_t *spa;
- char pathname[128];
- struct _buf *file;
- uint64_t fsize;
-
- /*
- * Open the configuration file.
- */
- (void) snprintf(pathname, sizeof (pathname), "%s/%s",
- spa_config_dir, ZPOOL_CACHE_FILE);
-
- file = kobj_open_file(pathname);
- if (file == (struct _buf *)-1) {
- ZFS_LOG(1, "Cannot open %s.", pathname);
- return;
- }
-
- if (kobj_get_filesize(file, &fsize) != 0) {
- ZFS_LOG(1, "Cannot get size of %s.", pathname);
- goto out;
- }
-
- buf = kmem_alloc(fsize, KM_SLEEP);
-
- /*
- * Read the nvlist from the file.
- */
- if (kobj_read_file(file, buf, fsize, 0) < 0) {
- ZFS_LOG(1, "Cannot read %s.", pathname);
- goto out;
- }
-
- /*
- * Unpack the nvlist.
- */
- if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
- goto out;
-
- ZFS_LOG(1, "File %s loaded.", pathname);
-
- /*
- * Iterate over all elements in the nvlist, creating a new spa_t for
- * each one with the specified configuration.
- */
- mutex_enter(&spa_namespace_lock);
- nvpair = NULL;
- while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
-
- if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
- continue;
-
- VERIFY(nvpair_value_nvlist(nvpair, &child) == 0);
-
- if (spa_lookup(nvpair_name(nvpair)) != NULL)
- continue;
- spa = spa_add(nvpair_name(nvpair), NULL);
-
- /*
- * We blindly duplicate the configuration here. If it's
- * invalid, we will catch it when the pool is first opened.
- */
- VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0);
- }
- mutex_exit(&spa_namespace_lock);
-
- nvlist_free(nvlist);
-
-out:
- if (buf != NULL)
- kmem_free(buf, fsize);
-
- kobj_close_file(file);
-}
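
In userland the same cache can be inspected with libnvpair, mirroring the loop above: read the packed bytes, nvlist_unpack() them, and walk the per-pool children. A sketch under those assumptions (libnvpair available, error handling trimmed; the cache path is the conventional location):

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <sys/stat.h>
	#include <libnvpair.h>

	int
	main(void)
	{
		const char *path = "/etc/zfs/zpool.cache";
		struct stat st;
		int fd = open(path, O_RDONLY);
		char *buf;
		nvlist_t *nvl;
		nvpair_t *pair = NULL;

		if (fd == -1 || fstat(fd, &st) != 0)
			return (1);
		buf = malloc(st.st_size);
		if (buf == NULL || read(fd, buf, st.st_size) != st.st_size)
			return (1);
		if (nvlist_unpack(buf, st.st_size, &nvl, 0) != 0)
			return (1);

		/* One nvlist child per pool, keyed by pool name. */
		while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL)
			if (nvpair_type(pair) == DATA_TYPE_NVLIST)
				printf("pool: %s\n", nvpair_name(pair));

		nvlist_free(nvl);
		free(buf);
		(void) close(fd);
		return (0);
	}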
-
-/*
- * Synchronize all pools to disk. This must be called with the namespace lock
- * held.
- */
-void
-spa_config_sync(void)
-{
- spa_t *spa = NULL;
- nvlist_t *config;
- size_t buflen;
- char *buf;
- vnode_t *vp;
- int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
- char pathname[128];
- char pathname2[128];
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- /*
- * Add all known pools to the configuration list, ignoring those with
- * alternate root paths.
- */
- spa = NULL;
- while ((spa = spa_next(spa)) != NULL) {
- mutex_enter(&spa->spa_config_cache_lock);
- if (spa->spa_config && spa->spa_name && spa->spa_root == NULL)
- VERIFY(nvlist_add_nvlist(config, spa->spa_name,
- spa->spa_config) == 0);
- mutex_exit(&spa->spa_config_cache_lock);
- }
-
- /*
- * Pack the configuration into a buffer.
- */
- VERIFY(nvlist_size(config, &buflen, NV_ENCODE_XDR) == 0);
-
- buf = kmem_alloc(buflen, KM_SLEEP);
-
- VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR,
- KM_SLEEP) == 0);
-
- /*
- * Write the configuration to disk. We need to do the traditional
- * 'write to temporary file, sync, move over original' to make sure we
- * always have a consistent view of the data.
- */
- (void) snprintf(pathname, sizeof (pathname), "%s/%s", spa_config_dir,
- ZPOOL_CACHE_TMP);
-
- if (vn_open(pathname, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) != 0)
- goto out;
-
- if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
- 0, RLIM64_INFINITY, kcred, NULL) == 0 &&
- VOP_FSYNC(vp, FSYNC, kcred) == 0) {
- (void) snprintf(pathname2, sizeof (pathname2), "%s/%s",
- spa_config_dir, ZPOOL_CACHE_FILE);
- (void) vn_rename(pathname, pathname2, UIO_SYSSPACE);
- }
-
- (void) VOP_CLOSE(vp, oflags, 1, 0, kcred);
- VN_RELE(vp);
-
-out:
- (void) vn_remove(pathname, UIO_SYSSPACE, RMFILE);
- spa_config_generation++;
-
- kmem_free(buf, buflen);
- nvlist_free(config);
-}
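
The write-temp/fsync/rename dance above is the standard way to replace a file atomically: rename(2) is atomic on POSIX filesystems, so a reader always sees either the old cache or the complete new one, never a torn write. A userland sketch of the same pattern (paths are caller-supplied placeholders; a fully durable version would also fsync the containing directory):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Atomically replace 'path' with 'len' bytes from 'buf'. */
	static int
	write_file_atomic(const char *path, const char *tmppath,
	    const void *buf, size_t len)
	{
		int fd = open(tmppath, O_WRONLY | O_CREAT | O_TRUNC, 0644);

		if (fd == -1)
			return (-1);
		if (write(fd, buf, len) != (ssize_t)len || fsync(fd) != 0) {
			(void) close(fd);
			(void) unlink(tmppath);
			return (-1);
		}
		(void) close(fd);
		if (rename(tmppath, path) != 0) {	/* atomic commit */
			(void) unlink(tmppath);
			return (-1);
		}
		return (0);
	}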
-
-/*
- * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache,
- * and we don't want to allow the local zone to see all the pools anyway.
- * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
- * information for all pools visible within the zone.
- */
-nvlist_t *
-spa_all_configs(uint64_t *generation)
-{
- nvlist_t *pools;
- spa_t *spa;
-
- if (*generation == spa_config_generation)
- return (NULL);
-
- VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- spa = NULL;
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa)) != NULL) {
- if (INGLOBALZONE(curproc) ||
- zone_dataset_visible(spa_name(spa), NULL)) {
- mutex_enter(&spa->spa_config_cache_lock);
- VERIFY(nvlist_add_nvlist(pools, spa_name(spa),
- spa->spa_config) == 0);
- mutex_exit(&spa->spa_config_cache_lock);
- }
- }
- mutex_exit(&spa_namespace_lock);
-
- *generation = spa_config_generation;
-
- return (pools);
-}
-
-void
-spa_config_set(spa_t *spa, nvlist_t *config)
-{
- mutex_enter(&spa->spa_config_cache_lock);
- if (spa->spa_config != NULL)
- nvlist_free(spa->spa_config);
- spa->spa_config = config;
- mutex_exit(&spa->spa_config_cache_lock);
-}
-
-/*
- * Generate the pool's configuration based on the current in-core state.
- * We infer whether to generate a complete config or just one top-level config
- * based on whether vd is the root vdev.
- */
-nvlist_t *
-spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
-{
- nvlist_t *config, *nvroot;
- vdev_t *rvd = spa->spa_root_vdev;
- unsigned long hostid = 0;
-
- ASSERT(spa_config_held(spa, RW_READER));
-
- if (vd == NULL)
- vd = rvd;
-
- /*
- * If txg is -1, report the current value of spa->spa_config_txg.
- */
- if (txg == -1ULL)
- txg = spa->spa_config_txg;
-
- VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
- spa_version(spa)) == 0);
- VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
- spa_name(spa)) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
- spa_state(spa)) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
- txg) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- spa_guid(spa)) == 0);
- (void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
- hostid) == 0);
- VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
- utsname.nodename) == 0);
-
- if (vd != rvd) {
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
- vd->vdev_top->vdev_guid) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
- vd->vdev_guid) == 0);
- if (vd->vdev_isspare)
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE,
- 1ULL) == 0);
- vd = vd->vdev_top; /* label contains top config */
- }
-
- nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE);
- VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
- nvlist_free(nvroot);
-
- return (config);
-}
-
-/*
- * Update all disk labels, generate a fresh config based on the current
- * in-core state, and sync the global config cache.
- */
-void
-spa_config_update(spa_t *spa, int what)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- uint64_t txg;
- int c;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- spa_config_enter(spa, RW_WRITER, FTAG);
- txg = spa_last_synced_txg(spa) + 1;
- if (what == SPA_CONFIG_UPDATE_POOL) {
- vdev_config_dirty(rvd);
- } else {
- /*
- * If we have top-level vdevs that were added but have
- * not yet been prepared for allocation, do that now.
- * (It's safe now because the config cache is up to date,
- * so it will be able to translate the new DVAs.)
- * See comments in spa_vdev_add() for full details.
- */
- for (c = 0; c < rvd->vdev_children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
- if (tvd->vdev_ms_array == 0) {
- vdev_init(tvd, txg);
- vdev_config_dirty(tvd);
- }
- }
- }
- spa_config_exit(spa, FTAG);
-
- /*
- * Wait for the mosconfig to be regenerated and synced.
- */
- txg_wait_synced(spa->spa_dsl_pool, txg);
-
- /*
- * Update the global config cache to reflect the new mosconfig.
- */
- spa_config_sync();
-
- if (what == SPA_CONFIG_UPDATE_POOL)
- spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
deleted file mode 100644
index c52acaf..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
+++ /dev/null
@@ -1,440 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * Routines to manage the on-disk persistent error log.
- *
- * Each pool stores a log of all logical data errors seen during normal
- * operation. This is actually the union of two distinct logs: the last log,
- * and the current log. All errors seen are logged to the current log. When a
- * scrub completes, the current log becomes the last log, the last log is thrown
- * out, and the current log is reinitialized. This way, if an error is somehow
- * corrected, a new scrub will show that it no longer exists, and it will be
- * deleted from the log when the scrub completes.
- *
- * The log is stored using a ZAP object whose key is a string form of the
- * zbookmark tuple (objset, object, level, blkid), and whose contents is an
- * optional 'objset:object' human-readable string describing the data. When an
- * error is first logged, this string will be empty, indicating that no name is
- * known. This prevents us from having to issue a potentially large amount of
- * I/O to discover the object name during an error path. Instead, we do the
- * calculation when the data is requested, storing the result so future queries
- * will be faster.
- *
- * This log is then shipped into an nvlist where the key is the dataset name and
- * the value is the object name. Userland is then responsible for uniquifying
- * this list and displaying it to the user.
- */
-
-#include <sys/dmu_tx.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-
-/*
- * This is a stripped-down version of strtoull, suitable only for converting
- * lowercase hexadecimal numbers that don't overflow.
- */
-#ifdef _KERNEL
-static uint64_t
-_strtonum(char *str, char **nptr)
-{
- uint64_t val = 0;
- char c;
- int digit;
-
- while ((c = *str) != '\0') {
- if (c >= '0' && c <= '9')
- digit = c - '0';
- else if (c >= 'a' && c <= 'f')
- digit = 10 + c - 'a';
- else
- break;
-
- val *= 16;
- val += digit;
-
- str++;
- }
-
- *nptr = str;
-
- return (val);
-}
-#endif
-
-/*
- * Convert a bookmark to a string.
- */
-static void
-bookmark_to_name(zbookmark_t *zb, char *buf, size_t len)
-{
- (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
- (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
- (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
-}
-
-/*
- * Convert a string to a bookmark
- */
-#ifdef _KERNEL
-static void
-name_to_bookmark(char *buf, zbookmark_t *zb)
-{
- zb->zb_objset = _strtonum(buf, &buf);
- ASSERT(*buf == ':');
- zb->zb_object = _strtonum(buf + 1, &buf);
- ASSERT(*buf == ':');
- zb->zb_level = (int)_strtonum(buf + 1, &buf);
- ASSERT(*buf == ':');
- zb->zb_blkid = _strtonum(buf + 1, &buf);
- ASSERT(*buf == '\0');
-}
-#endif
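
bookmark_to_name() and name_to_bookmark() are exact inverses over lowercase-hex strings. A self-contained round trip of the same encoding; the bmark_t struct is a stand-in for zbookmark_t, and sscanf() plays the role of the _strtonum() loop:

	#include <stdio.h>
	#include <inttypes.h>

	typedef struct bmark {
		uint64_t objset, object, level, blkid;
	} bmark_t;

	int
	main(void)
	{
		bmark_t in = { 0x15, 0x3a7, 0x0, 0x1c }, out;
		char buf[64];

		/* Encode: same objset:object:level:blkid layout as above. */
		(void) snprintf(buf, sizeof (buf),
		    "%" PRIx64 ":%" PRIx64 ":%" PRIx64 ":%" PRIx64,
		    in.objset, in.object, in.level, in.blkid);

		/* Decode the four colon-separated hex fields. */
		(void) sscanf(buf,
		    "%" SCNx64 ":%" SCNx64 ":%" SCNx64 ":%" SCNx64,
		    &out.objset, &out.object, &out.level, &out.blkid);

		printf("%s -> %s\n", buf,
		    (in.objset == out.objset && in.object == out.object &&
		    in.level == out.level && in.blkid == out.blkid) ?
		    "round-trip ok" : "mismatch");
		return (0);
	}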
-
-/*
- * Log an uncorrectable error to the persistent error log. We add it to the
- * spa's list of pending errors. The changes are actually synced out to disk
- * during spa_errlog_sync().
- */
-void
-spa_log_error(spa_t *spa, zio_t *zio)
-{
- zbookmark_t *zb = &zio->io_logical->io_bookmark;
- spa_error_entry_t search;
- spa_error_entry_t *new;
- avl_tree_t *tree;
- avl_index_t where;
-
- /*
- * If we are trying to import a pool, ignore any errors, as we won't be
- * writing to the pool any time soon.
- */
- if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
- return;
-
- mutex_enter(&spa->spa_errlist_lock);
-
- /*
- * If we have had a request to rotate the log, log it to the next list
- * instead of the current one.
- */
- if (spa->spa_scrub_active || spa->spa_scrub_finished)
- tree = &spa->spa_errlist_scrub;
- else
- tree = &spa->spa_errlist_last;
-
- search.se_bookmark = *zb;
- if (avl_find(tree, &search, &where) != NULL) {
- mutex_exit(&spa->spa_errlist_lock);
- return;
- }
-
- new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
- new->se_bookmark = *zb;
- avl_insert(tree, new, where);
-
- mutex_exit(&spa->spa_errlist_lock);
-}
-
-/*
- * Return the number of errors currently in the error log. This is actually the
- * sum of both the last log and the current log, since we don't know the union
- * of these logs until we reach userland.
- */
-uint64_t
-spa_get_errlog_size(spa_t *spa)
-{
- uint64_t total = 0, count;
-
- mutex_enter(&spa->spa_errlog_lock);
- if (spa->spa_errlog_scrub != 0 &&
- zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
- &count) == 0)
- total += count;
-
- if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
- zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
- &count) == 0)
- total += count;
- mutex_exit(&spa->spa_errlog_lock);
-
- mutex_enter(&spa->spa_errlist_lock);
- total += avl_numnodes(&spa->spa_errlist_last);
- total += avl_numnodes(&spa->spa_errlist_scrub);
- mutex_exit(&spa->spa_errlist_lock);
-
- return (total);
-}
-
-#ifdef _KERNEL
-static int
-process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
-{
- zap_cursor_t zc;
- zap_attribute_t za;
- zbookmark_t zb;
-
- if (obj == 0)
- return (0);
-
- for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
-
- if (*count == 0) {
- zap_cursor_fini(&zc);
- return (ENOMEM);
- }
-
- name_to_bookmark(za.za_name, &zb);
-
- if (copyout(&zb, (char *)addr +
- (*count - 1) * sizeof (zbookmark_t),
- sizeof (zbookmark_t)) != 0)
- return (EFAULT);
-
- *count -= 1;
- }
-
- zap_cursor_fini(&zc);
-
- return (0);
-}
-
-static int
-process_error_list(avl_tree_t *list, void *addr, size_t *count)
-{
- spa_error_entry_t *se;
-
- for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
-
- if (*count == 0)
- return (ENOMEM);
-
- if (copyout(&se->se_bookmark, (char *)addr +
- (*count - 1) * sizeof (zbookmark_t),
- sizeof (zbookmark_t)) != 0)
- return (EFAULT);
-
- *count -= 1;
- }
-
- return (0);
-}
-#endif
-
-/*
- * Copy all known errors to userland as an array of bookmarks. This is
- * actually a union of the on-disk last log and current log, as well as any
- * pending error requests.
- *
- * Because the act of reading the on-disk log could cause errors to be
- * generated, we have two separate locks: one for the error log and one for the
- * in-core error lists. We only need the error list lock to log an error, so
- * we grab the error log lock while we read the on-disk logs, and only pick up
- * the error list lock when we are finished.
- */
-int
-spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
-{
- int ret = 0;
-
-#ifdef _KERNEL
- mutex_enter(&spa->spa_errlog_lock);
-
- ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
-
- if (!ret && !spa->spa_scrub_finished)
- ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
- count);
-
- mutex_enter(&spa->spa_errlist_lock);
- if (!ret)
- ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
- count);
- if (!ret)
- ret = process_error_list(&spa->spa_errlist_last, uaddr,
- count);
- mutex_exit(&spa->spa_errlist_lock);
-
- mutex_exit(&spa->spa_errlog_lock);
-#endif
-
- return (ret);
-}
-
-/*
- * Called when a scrub completes. This simply sets a bit that tells us which
- * AVL tree new errors should be added to. spa_errlog_sync() is responsible
- * for actually syncing the changes to the underlying objects.
- */
-void
-spa_errlog_rotate(spa_t *spa)
-{
- mutex_enter(&spa->spa_errlist_lock);
-
- ASSERT(!spa->spa_scrub_finished);
- spa->spa_scrub_finished = B_TRUE;
-
- mutex_exit(&spa->spa_errlist_lock);
-}
-
-/*
- * Discard any pending errors from the spa_t. Called when unloading a faulted
- * pool, as the errors encountered during the open cannot be synced to disk.
- */
-void
-spa_errlog_drain(spa_t *spa)
-{
- spa_error_entry_t *se;
- void *cookie;
-
- mutex_enter(&spa->spa_errlist_lock);
-
- cookie = NULL;
- while ((se = avl_destroy_nodes(&spa->spa_errlist_last,
- &cookie)) != NULL)
- kmem_free(se, sizeof (spa_error_entry_t));
- cookie = NULL;
- while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub,
- &cookie)) != NULL)
- kmem_free(se, sizeof (spa_error_entry_t));
-
- mutex_exit(&spa->spa_errlist_lock);
-}
-
-/*
- * Process a list of errors into the current on-disk log.
- */
-static void
-sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
-{
- spa_error_entry_t *se;
- char buf[64];
- void *cookie;
-
- if (avl_numnodes(t) != 0) {
- /* create log if necessary */
- if (*obj == 0)
- *obj = zap_create(spa->spa_meta_objset,
- DMU_OT_ERROR_LOG, DMU_OT_NONE,
- 0, tx);
-
- /* add errors to the current log */
- for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
- char *name = se->se_name ? se->se_name : "";
-
- bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
-
- (void) zap_update(spa->spa_meta_objset,
- *obj, buf, 1, strlen(name) + 1, name, tx);
- }
-
- /* purge the error list */
- cookie = NULL;
- while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
- kmem_free(se, sizeof (spa_error_entry_t));
- }
-}
-
-/*
- * Sync the error log out to disk. This is a little tricky because the act of
- * writing the error log requires the spa_errlist_lock. So, we need to lock the
- * error lists, take a copy of the lists, and then reinitialize them. Then, we
- * drop the error list lock and take the error log lock, at which point we
- * do the errlog processing. Then, if we encounter an I/O error during this
- * process, we can successfully add the error to the list. Note that this will
- * result in the perpetual recycling of errors, but it is an unlikely situation
- * and not a performance critical operation.
- */
-void
-spa_errlog_sync(spa_t *spa, uint64_t txg)
-{
- dmu_tx_t *tx;
- avl_tree_t scrub, last;
- int scrub_finished;
-
- mutex_enter(&spa->spa_errlist_lock);
-
- /*
- * Bail out early under normal circumstances.
- */
- if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
- avl_numnodes(&spa->spa_errlist_last) == 0 &&
- !spa->spa_scrub_finished) {
- mutex_exit(&spa->spa_errlist_lock);
- return;
- }
-
- spa_get_errlists(spa, &last, &scrub);
- scrub_finished = spa->spa_scrub_finished;
- spa->spa_scrub_finished = B_FALSE;
-
- mutex_exit(&spa->spa_errlist_lock);
- mutex_enter(&spa->spa_errlog_lock);
-
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
-
- /*
- * Sync out the current list of errors.
- */
- sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
-
- /*
- * Rotate the log if necessary.
- */
- if (scrub_finished) {
- if (spa->spa_errlog_last != 0)
- VERIFY(dmu_object_free(spa->spa_meta_objset,
- spa->spa_errlog_last, tx) == 0);
- spa->spa_errlog_last = spa->spa_errlog_scrub;
- spa->spa_errlog_scrub = 0;
-
- sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
- }
-
- /*
- * Sync out any pending scrub errors.
- */
- sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
-
- /*
- * Update the MOS to reflect the new values.
- */
- (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
- &spa->spa_errlog_last, tx);
- (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
- &spa->spa_errlog_scrub, tx);
-
- dmu_tx_commit(tx);
-
- mutex_exit(&spa->spa_errlog_lock);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
deleted file mode 100644
index 6642801..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
+++ /dev/null
@@ -1,354 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa_impl.h>
-#include <sys/zap.h>
-#include <sys/dsl_synctask.h>
-
-/*
- * Routines to manage the on-disk history log.
- *
- * The history log is stored as a dmu object containing
- * <packed record length, record nvlist> tuples.
- *
- * Where "record nvlist" is a nvlist containing uint64_ts and strings, and
- * "packed record length" is the packed length of the "record nvlist" stored
- * as a little endian uint64_t.
- *
- * The log is implemented as a ring buffer, though the original creation
- * of the pool ('zpool create') is never overwritten.
- *
- * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer
- * of 'spa_history' stores the offsets for logging/retrieving history as
- * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of
- * where the 'zpool create' record is stored. This allows us to never
- * overwrite the original creation of the pool. 'sh_phys_max_off' is the
- * physical ending offset in bytes of the log. This tells you the length of
- * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record
- * is added, 'sh_eof' is incremented by the size of the record.
- * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes).
- * This is where the consumer should start reading from after reading in
- * the 'zpool create' portion of the log.
- *
- * 'sh_records_lost' keeps track of how many records have been overwritten
- * and permanently lost.
- */
-
-typedef enum history_log_type {
- LOG_CMD_CREATE,
- LOG_CMD_NO_CREATE
-} history_log_type_t;
-
-typedef struct history_arg {
- const char *ha_history_str;
- history_log_type_t ha_log_type;
-} history_arg_t;
-
-/* convert a logical offset to physical */
-static uint64_t
-spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
-{
- uint64_t phys_len;
-
- phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len;
- return ((log_off - shpp->sh_pool_create_len) % phys_len
- + shpp->sh_pool_create_len);
-}
-
-void
-spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
-{
- dmu_buf_t *dbp;
- spa_history_phys_t *shpp;
- objset_t *mos = spa->spa_meta_objset;
-
- ASSERT(spa->spa_history == 0);
- spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
- SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
- sizeof (spa_history_phys_t), tx);
-
- VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_HISTORY, sizeof (uint64_t), 1,
- &spa->spa_history, tx) == 0);
-
- VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
- ASSERT(dbp->db_size >= sizeof (spa_history_phys_t));
-
- shpp = dbp->db_data;
- dmu_buf_will_dirty(dbp, tx);
-
- /*
- * Figure out maximum size of history log. We set it at
- * 1% of pool size, with a max of 32MB and min of 128KB.
- */
- shpp->sh_phys_max_off = spa_get_dspace(spa) / 100;
- shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20);
- shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
-
- dmu_buf_rele(dbp, FTAG);
-}
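-
-/*
- * Editorial worked example of the sizing rule above (not part of the
- * original file): a 10GB pool yields ~102MB at 1%, clamped down to the
- * 32MB cap, while a 4MB pool yields ~40KB, raised to the 128KB floor;
- * only pools between 12.8MB and 3.2GB land inside the 1% band untouched.
- */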
-
-/*
- * Change 'sh_bof' to the beginning of the next record.
- */
-static int
-spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
-{
- objset_t *mos = spa->spa_meta_objset;
- uint64_t firstread, reclen, phys_bof;
- char buf[sizeof (reclen)];
- int err;
-
- phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp);
- firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
-
- if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
- buf)) != 0)
- return (err);
- if (firstread != sizeof (reclen)) {
- if ((err = dmu_read(mos, spa->spa_history,
- shpp->sh_pool_create_len, sizeof (reclen) - firstread,
- buf + firstread)) != 0)
- return (err);
- }
-
- reclen = LE_64(*((uint64_t *)buf));
- shpp->sh_bof += reclen + sizeof (reclen);
- shpp->sh_records_lost++;
- return (0);
-}
-
-static int
-spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp,
- dmu_tx_t *tx)
-{
- uint64_t firstwrite, phys_eof;
- objset_t *mos = spa->spa_meta_objset;
- int err;
-
- ASSERT(MUTEX_HELD(&spa->spa_history_lock));
-
- /* see if we need to reset logical BOF */
- while (shpp->sh_phys_max_off - shpp->sh_pool_create_len -
- (shpp->sh_eof - shpp->sh_bof) <= len) {
- if ((err = spa_history_advance_bof(spa, shpp)) != 0)
- return (err);
- }
-
- phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
- firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof);
- shpp->sh_eof += len;
- dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx);
-
- len -= firstwrite;
- if (len > 0) {
- /* write out the rest at the beginning of physical file */
- dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len,
- len, (char *)buf + firstwrite, tx);
- }
-
- return (0);
-}
-
-/*
- * Write out a history event.
- */
-void
-spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- spa_t *spa = arg1;
- history_arg_t *hap = arg2;
- const char *history_str = hap->ha_history_str;
- objset_t *mos = spa->spa_meta_objset;
- dmu_buf_t *dbp;
- spa_history_phys_t *shpp;
- size_t reclen;
- uint64_t le_len;
- nvlist_t *nvrecord;
- char *record_packed = NULL;
- int ret;
-
- if (history_str == NULL)
- return;
-
- /*
- * If we have an older pool that doesn't have a command
- * history object, create it now.
- */
- mutex_enter(&spa->spa_history_lock);
- if (!spa->spa_history)
- spa_history_create_obj(spa, tx);
- mutex_exit(&spa->spa_history_lock);
-
- /*
- * Get the offset of where we need to write via the bonus buffer.
- * Update the offset when the write completes.
- */
- VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
- shpp = dbp->db_data;
-
- dmu_buf_will_dirty(dbp, tx);
-
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(dbp, &doi);
- ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
- }
-#endif
-
- /* construct a nvlist of the current time and cmd string */
- VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME,
- gethrestime_sec()) == 0);
- VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, history_str) == 0);
- VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen,
- NV_ENCODE_XDR, KM_SLEEP) == 0);
-
- mutex_enter(&spa->spa_history_lock);
- if (hap->ha_log_type == LOG_CMD_CREATE)
- VERIFY(shpp->sh_eof == shpp->sh_pool_create_len);
-
- /* write out the packed length as little endian */
- le_len = LE_64((uint64_t)reclen);
- ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
- if (!ret)
- ret = spa_history_write(spa, record_packed, reclen, shpp, tx);
-
- if (!ret && hap->ha_log_type == LOG_CMD_CREATE) {
- shpp->sh_pool_create_len += sizeof (le_len) + reclen;
- shpp->sh_bof = shpp->sh_pool_create_len;
- }
-
- mutex_exit(&spa->spa_history_lock);
- nvlist_free(nvrecord);
- kmem_free(record_packed, reclen);
- dmu_buf_rele(dbp, FTAG);
-}
-
-/*
- * Log a history event, dispatching the write as a DSL sync task.
- */
-int
-spa_history_log(spa_t *spa, const char *history_str, uint64_t pool_create)
-{
- history_arg_t ha;
-
- ha.ha_history_str = history_str;
- ha.ha_log_type = pool_create ? LOG_CMD_CREATE : LOG_CMD_NO_CREATE;
- return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync,
- spa, &ha, 0));
-}
-
-/*
- * Read out the command history.
- */
-int
-spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
-{
- objset_t *mos = spa->spa_meta_objset;
- dmu_buf_t *dbp;
- uint64_t read_len, phys_read_off, phys_eof;
- uint64_t leftover = 0;
- spa_history_phys_t *shpp;
- int err;
-
- /*
- * If the command history doesn't exist (older pool),
- * that's ok, just return ENOENT.
- */
- if (!spa->spa_history)
- return (ENOENT);
-
- if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
- return (err);
- shpp = dbp->db_data;
-
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(dbp, &doi);
- ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
- }
-#endif
-
- mutex_enter(&spa->spa_history_lock);
- phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
-
- if (*offp < shpp->sh_pool_create_len) {
- /* read in just the zpool create history */
- phys_read_off = *offp;
- read_len = MIN(*len, shpp->sh_pool_create_len -
- phys_read_off);
- } else {
- /*
- * Need to reset passed in offset to BOF if the passed in
- * offset has since been overwritten.
- */
- *offp = MAX(*offp, shpp->sh_bof);
- phys_read_off = spa_history_log_to_phys(*offp, shpp);
-
- /*
- * Read up to the minimum of what the user passed down or
- * the EOF (physical or logical). If we hit physical EOF,
- * use 'leftover' to read from the physical BOF.
- */
- if (phys_read_off <= phys_eof) {
- read_len = MIN(*len, phys_eof - phys_read_off);
- } else {
- read_len = MIN(*len,
- shpp->sh_phys_max_off - phys_read_off);
- if (phys_read_off + *len > shpp->sh_phys_max_off) {
- leftover = MIN(*len - read_len,
- phys_eof - shpp->sh_pool_create_len);
- }
- }
- }
-
- /* offset for consumer to use next */
- *offp += read_len + leftover;
-
- /* tell the consumer how much you actually read */
- *len = read_len + leftover;
-
- if (read_len == 0) {
- mutex_exit(&spa->spa_history_lock);
- dmu_buf_rele(dbp, FTAG);
- return (0);
- }
-
- err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf);
- if (leftover && err == 0) {
- err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
- leftover, buf + read_len);
- }
- mutex_exit(&spa->spa_history_lock);
-
- dmu_buf_rele(dbp, FTAG);
- return (err);
-}
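-
-/*
- * Illustrative consumer sketch (editorial, not part of the original file):
- * callers resume from the offset returned by the previous call and stop
- * once a read returns zero bytes:
- *
- *	char buf[1024];
- *	uint64_t off = 0, len;
- *	int err;
- *
- *	do {
- *		len = sizeof (buf);
- *		err = spa_history_get(spa, &off, &len, buf);
- *		... consume 'len' bytes of <reclen, nvlist> tuples ...
- *	} while (err == 0 && len != 0);
- */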
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
deleted file mode 100644
index 5da1f96..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
+++ /dev/null
@@ -1,1130 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/zap.h>
-#include <sys/zil.h>
-#include <sys/vdev_impl.h>
-#include <sys/metaslab.h>
-#include <sys/uberblock_impl.h>
-#include <sys/txg.h>
-#include <sys/avl.h>
-#include <sys/unique.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/fs/zfs.h>
-
-/*
- * SPA locking
- *
- * There are four basic locks for managing spa_t structures:
- *
- * spa_namespace_lock (global mutex)
- *
- * This lock must be acquired to do any of the following:
- *
- * - Lookup a spa_t by name
- * - Add or remove a spa_t from the namespace
- * - Increase spa_refcount from non-zero
- * - Check if spa_refcount is zero
- * - Rename a spa_t
- * - add/remove/attach/detach devices
- * - Held for the duration of create/destroy/import/export
- *
- * It does not need to handle recursion. A create or destroy may
- * reference objects (files or zvols) in other pools, but by
- * definition they must have an existing reference, and will never need
- * to lookup a spa_t by name.
- *
- * spa_refcount (per-spa refcount_t protected by mutex)
- *
- * This reference count keeps track of any active users of the spa_t. The
- * spa_t cannot be destroyed or freed while this is non-zero. Internally,
- * the refcount is never really 'zero' - opening a pool implicitly keeps
- * some references in the DMU. Internally we check against SPA_MINREF, but
- * present the image of a zero/non-zero value to consumers.
- *
- * spa_config_lock (per-spa crazy rwlock)
- *
- * This special SPA lock is a recursive rwlock, capable of being acquired
- * from asynchronous threads. It protects the spa_t from config changes,
- * and must be held in the following circumstances:
- *
- * - RW_READER to perform I/O to the spa
- * - RW_WRITER to change the vdev config
- *
- * spa_config_cache_lock (per-spa mutex)
- *
- * This mutex prevents the spa_config nvlist from being updated. No
- * other locks are required to obtain this lock, although implicitly you
- * must have the namespace lock or non-zero refcount to have any kind
- * of spa_t pointer at all.
- *
- * The locking order is fairly straightforward:
- *
- * spa_namespace_lock -> spa_refcount
- *
- * The namespace lock must be acquired to increase the refcount from 0
- * or to check if it is zero.
- *
- * spa_refcount -> spa_config_lock
- *
- * There must be at least one valid reference on the spa_t to acquire
- * the config lock.
- *
- * spa_namespace_lock -> spa_config_lock
- *
- * The namespace lock must always be taken before the config lock.
- *
- *
- * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and
- * are globally visible.
- *
- * The namespace is manipulated using the following functions, all of which
- * require the spa_namespace_lock to be held.
- *
- * spa_lookup() Lookup a spa_t by name.
- *
- * spa_add() Create a new spa_t in the namespace.
- *
- * spa_remove() Remove a spa_t from the namespace. This also
- * frees up any memory associated with the spa_t.
- *
- * spa_next() Returns the next spa_t in the system, or the
- * first if NULL is passed.
- *
- * spa_evict_all() Shutdown and remove all spa_t structures in
- * the system.
- *
- * spa_guid_exists() Determine whether a pool/device guid exists.
- *
- * The spa_refcount is manipulated using the following functions:
- *
- * spa_open_ref() Adds a reference to the given spa_t. Must be
- * called with spa_namespace_lock held if the
- * refcount is currently zero.
- *
- * spa_close() Remove a reference from the spa_t. This will
- * not free the spa_t or remove it from the
- * namespace. No locking is required.
- *
- * spa_refcount_zero() Returns true if the refcount is currently
- * zero. Must be called with spa_namespace_lock
- * held.
- *
- * The spa_config_lock is manipulated using the following functions:
- *
- * spa_config_enter() Acquire the config lock as RW_READER or
- * RW_WRITER. At least one reference on the spa_t
- * must exist.
- *
- * spa_config_exit() Release the config lock.
- *
- * spa_config_held() Returns true if the config lock is currently
- * held in the given state.
- *
- * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
- *
- * spa_vdev_enter() Acquire the namespace lock and the config lock
- * for writing.
- *
- * spa_vdev_exit() Release the config lock, wait for all I/O
- * to complete, sync the updated configs to the
- * cache, and release the namespace lock.
- *
- * The spa_name() function also requires either the spa_namespace_lock
- * or the spa_config_lock, as both are needed to do a rename. spa_rename() is
- * also implemented within this file since it requires manipulation of the
- * namespace.
- */
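-
-/*
- * Illustrative sketch (editorial, not part of the original file) of the
- * canonical ordering described above; spa_open() and friends perform the
- * equivalent steps internally:
- *
- *	mutex_enter(&spa_namespace_lock);
- *	spa = spa_lookup(name);			(namespace lock held)
- *	spa_open_ref(spa, FTAG);		(refcount may leave zero)
- *	mutex_exit(&spa_namespace_lock);
- *
- *	spa_config_enter(spa, RW_READER, FTAG);	(requires a reference)
- *	... perform I/O against the pool ...
- *	spa_config_exit(spa, FTAG);
- *
- *	spa_close(spa, FTAG);			(no locking required)
- */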
-
-static avl_tree_t spa_namespace_avl;
-kmutex_t spa_namespace_lock;
-static kcondvar_t spa_namespace_cv;
-static int spa_active_count;
-int spa_max_replication_override = SPA_DVAS_PER_BP;
-
-static kmutex_t spa_spare_lock;
-static avl_tree_t spa_spare_avl;
-
-kmem_cache_t *spa_buffer_pool;
-int spa_mode;
-
-#ifdef ZFS_DEBUG
-int zfs_flags = ~0;
-#else
-int zfs_flags = 0;
-#endif
-
-/*
- * zfs_recover can be set to nonzero to attempt to recover from
- * otherwise-fatal errors, typically caused by on-disk corruption. When
- * set, calls to zfs_panic_recover() will turn into warning messages.
- */
-int zfs_recover = 0;
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.recover", &zfs_recover);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0,
- "Try to recover from otherwise-fatal errors.");
-
-#define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */
-
-/*
- * ==========================================================================
- * SPA namespace functions
- * ==========================================================================
- */
-
-/*
- * Look up the named spa_t in the AVL tree. The spa_namespace_lock must be held.
- * Returns NULL if no matching spa_t is found.
- */
-spa_t *
-spa_lookup(const char *name)
-{
- spa_t search, *spa;
- avl_index_t where;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- search.spa_name = (char *)name;
- spa = avl_find(&spa_namespace_avl, &search, &where);
-
- return (spa);
-}
-
-/*
- * Create an uninitialized spa_t with the given name. Requires
- * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
- * exist by calling spa_lookup() first.
- */
-spa_t *
-spa_add(const char *name, const char *altroot)
-{
- spa_t *spa;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
-
- spa->spa_name = spa_strdup(name);
- spa->spa_state = POOL_STATE_UNINITIALIZED;
- spa->spa_freeze_txg = UINT64_MAX;
- spa->spa_final_txg = UINT64_MAX;
-
- mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
-
- cv_init(&spa->spa_scrub_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
-
- refcount_create(&spa->spa_refcount);
- refcount_create(&spa->spa_config_lock.scl_count);
-
- avl_add(&spa_namespace_avl, spa);
-
- /*
- * Set the alternate root, if there is one.
- */
- if (altroot) {
- spa->spa_root = spa_strdup(altroot);
- spa_active_count++;
- }
-
- return (spa);
-}
-
-/*
- * Removes a spa_t from the namespace, freeing up any memory used. Requires
- * spa_namespace_lock. This is called only after the spa_t has been closed and
- * deactivated.
- */
-void
-spa_remove(spa_t *spa)
-{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
- ASSERT(spa->spa_scrub_thread == NULL);
-
- avl_remove(&spa_namespace_avl, spa);
- cv_broadcast(&spa_namespace_cv);
-
- if (spa->spa_root) {
- spa_strfree(spa->spa_root);
- spa_active_count--;
- }
-
- if (spa->spa_name)
- spa_strfree(spa->spa_name);
-
- spa_config_set(spa, NULL);
-
- refcount_destroy(&spa->spa_refcount);
- refcount_destroy(&spa->spa_config_lock.scl_count);
-
- cv_destroy(&spa->spa_async_cv);
- cv_destroy(&spa->spa_scrub_io_cv);
- cv_destroy(&spa->spa_scrub_cv);
-
- mutex_destroy(&spa->spa_scrub_lock);
- mutex_destroy(&spa->spa_async_lock);
- mutex_destroy(&spa->spa_config_cache_lock);
-
- kmem_free(spa, sizeof (spa_t));
-}
-
-/*
- * Given a pool, return the next pool in the namespace, or NULL if there is
- * none. If 'prev' is NULL, return the first pool.
- */
-spa_t *
-spa_next(spa_t *prev)
-{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- if (prev)
- return (AVL_NEXT(&spa_namespace_avl, prev));
- else
- return (avl_first(&spa_namespace_avl));
-}
-
-/*
- * ==========================================================================
- * SPA refcount functions
- * ==========================================================================
- */
-
-/*
- * Add a reference to the given spa_t. Must have at least one reference, or
- * have the namespace lock held.
- */
-void
-spa_open_ref(spa_t *spa, void *tag)
-{
- ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
- MUTEX_HELD(&spa_namespace_lock));
-
- (void) refcount_add(&spa->spa_refcount, tag);
-}
-
-/*
- * Remove a reference to the given spa_t. Must have at least one reference, or
- * have the namespace lock held.
- */
-void
-spa_close(spa_t *spa, void *tag)
-{
- ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
- MUTEX_HELD(&spa_namespace_lock));
-
- (void) refcount_remove(&spa->spa_refcount, tag);
-}
-
-/*
- * Check to see if the spa refcount is zero. Must be called with
- * spa_namespace_lock held. We really compare against SPA_MINREF, which is the
- * number of references acquired when opening a pool.
- */
-boolean_t
-spa_refcount_zero(spa_t *spa)
-{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- return (refcount_count(&spa->spa_refcount) == SPA_MINREF);
-}
-
-/*
- * ==========================================================================
- * SPA spare tracking
- * ==========================================================================
- */
-
-/*
- * Spares are tracked globally due to the following constraints:
- *
- * - A spare may be part of multiple pools.
- * - A spare may be added to a pool even if it's actively in use within
- * another pool.
- * - A spare in use in any pool can only be the source of a replacement if
- * the target is a spare in the same pool.
- *
- * We keep track of all spares on the system through the use of a reference
- * counted AVL tree. When a vdev is added as a spare, or used as a replacement
- * spare, then we bump the reference count in the AVL tree. In addition, we set
- * the 'vdev_isspare' member to indicate that the device is a spare (active or
- * inactive). When a spare is made active (used to replace a device in the
- * pool), we also keep track of which pool it has been made a part of.
- *
- * The 'spa_spare_lock' protects the AVL tree. These functions are normally
- * called under the spa_namespace_lock as part of vdev reconfiguration. The
- * separate spare lock exists for the status query path, which does not need to
- * be completely consistent with respect to other vdev configuration changes.
- */
-
-typedef struct spa_spare {
- uint64_t spare_guid;
- uint64_t spare_pool;
- avl_node_t spare_avl;
- int spare_count;
-} spa_spare_t;
-
-static int
-spa_spare_compare(const void *a, const void *b)
-{
- const spa_spare_t *sa = a;
- const spa_spare_t *sb = b;
-
- if (sa->spare_guid < sb->spare_guid)
- return (-1);
- else if (sa->spare_guid > sb->spare_guid)
- return (1);
- else
- return (0);
-}
-
-void
-spa_spare_add(vdev_t *vd)
-{
- avl_index_t where;
- spa_spare_t search;
- spa_spare_t *spare;
-
- mutex_enter(&spa_spare_lock);
- ASSERT(!vd->vdev_isspare);
-
- search.spare_guid = vd->vdev_guid;
- if ((spare = avl_find(&spa_spare_avl, &search, &where)) != NULL) {
- spare->spare_count++;
- } else {
- spare = kmem_zalloc(sizeof (spa_spare_t), KM_SLEEP);
- spare->spare_guid = vd->vdev_guid;
- spare->spare_count = 1;
- avl_insert(&spa_spare_avl, spare, where);
- }
- vd->vdev_isspare = B_TRUE;
-
- mutex_exit(&spa_spare_lock);
-}
-
-void
-spa_spare_remove(vdev_t *vd)
-{
- spa_spare_t search;
- spa_spare_t *spare;
- avl_index_t where;
-
- mutex_enter(&spa_spare_lock);
-
- search.spare_guid = vd->vdev_guid;
- spare = avl_find(&spa_spare_avl, &search, &where);
-
- ASSERT(vd->vdev_isspare);
- ASSERT(spare != NULL);
-
- if (--spare->spare_count == 0) {
- avl_remove(&spa_spare_avl, spare);
- kmem_free(spare, sizeof (spa_spare_t));
- } else if (spare->spare_pool == spa_guid(vd->vdev_spa)) {
- spare->spare_pool = 0ULL;
- }
-
- vd->vdev_isspare = B_FALSE;
- mutex_exit(&spa_spare_lock);
-}
-
-boolean_t
-spa_spare_exists(uint64_t guid, uint64_t *pool)
-{
- spa_spare_t search, *found;
- avl_index_t where;
-
- mutex_enter(&spa_spare_lock);
-
- search.spare_guid = guid;
- found = avl_find(&spa_spare_avl, &search, &where);
-
- if (pool) {
- if (found)
- *pool = found->spare_pool;
- else
- *pool = 0ULL;
- }
-
- mutex_exit(&spa_spare_lock);
-
- return (found != NULL);
-}
-
-void
-spa_spare_activate(vdev_t *vd)
-{
- spa_spare_t search, *found;
- avl_index_t where;
-
- mutex_enter(&spa_spare_lock);
- ASSERT(vd->vdev_isspare);
-
- search.spare_guid = vd->vdev_guid;
- found = avl_find(&spa_spare_avl, &search, &where);
- ASSERT(found != NULL);
- ASSERT(found->spare_pool == 0ULL);
-
- found->spare_pool = spa_guid(vd->vdev_spa);
- mutex_exit(&spa_spare_lock);
-}
-
-/*
- * ==========================================================================
- * SPA config locking
- * ==========================================================================
- */
-
-/*
- * Acquire the config lock. The config lock is a special rwlock that allows for
- * recursive enters. Because these enters come from the same thread as well as
- * asynchronous threads working on behalf of the owner, we must unilaterally
- * allow all read access as long as at least one reader holds the lock (even
- * if a write is requested). This has the side effect of write starvation,
- * but write locks
- * are extremely rare, and a solution to this problem would be significantly
- * more complex (if even possible).
- *
- * We would like to assert that the namespace lock isn't held, but this is a
- * valid use during create.
- */
-void
-spa_config_enter(spa_t *spa, krw_t rw, void *tag)
-{
- spa_config_lock_t *scl = &spa->spa_config_lock;
-
- mutex_enter(&scl->scl_lock);
-
- if (scl->scl_writer != curthread) {
- if (rw == RW_READER) {
- while (scl->scl_writer != NULL)
- cv_wait(&scl->scl_cv, &scl->scl_lock);
- } else {
- while (scl->scl_writer != NULL ||
- !refcount_is_zero(&scl->scl_count))
- cv_wait(&scl->scl_cv, &scl->scl_lock);
- scl->scl_writer = curthread;
- }
- }
-
- (void) refcount_add(&scl->scl_count, tag);
-
- mutex_exit(&scl->scl_lock);
-}
-
-/*
- * Release the spa config lock, notifying any waiters in the process.
- */
-void
-spa_config_exit(spa_t *spa, void *tag)
-{
- spa_config_lock_t *scl = &spa->spa_config_lock;
-
- mutex_enter(&scl->scl_lock);
-
- ASSERT(!refcount_is_zero(&scl->scl_count));
- if (refcount_remove(&scl->scl_count, tag) == 0) {
- cv_broadcast(&scl->scl_cv);
- scl->scl_writer = NULL; /* OK in either case */
- }
-
- mutex_exit(&scl->scl_lock);
-}
-
-/*
- * Returns true if the config lock is held in the given manner.
- */
-boolean_t
-spa_config_held(spa_t *spa, krw_t rw)
-{
- spa_config_lock_t *scl = &spa->spa_config_lock;
- boolean_t held;
-
- mutex_enter(&scl->scl_lock);
- if (rw == RW_WRITER)
- held = (scl->scl_writer == curthread);
- else
- held = !refcount_is_zero(&scl->scl_count);
- mutex_exit(&scl->scl_lock);
-
- return (held);
-}
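-
-/*
- * Illustrative sketch (editorial): because reads are always admitted while
- * at least one reader holds the lock, a thread may safely re-enter as a
- * reader, which a plain rwlock would forbid:
- *
- *	spa_config_enter(spa, RW_READER, tag1);
- *	ASSERT(spa_config_held(spa, RW_READER));
- *	spa_config_enter(spa, RW_READER, tag2);	(recursive enter is OK)
- *	spa_config_exit(spa, tag2);
- *	spa_config_exit(spa, tag1);
- */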
-
-/*
- * ==========================================================================
- * SPA vdev locking
- * ==========================================================================
- */
-
-/*
- * Lock the given spa_t for the purpose of adding or removing a vdev.
- * Grabs the global spa_namespace_lock plus the spa config lock for writing.
- * It returns the next transaction group for the spa_t.
- */
-uint64_t
-spa_vdev_enter(spa_t *spa)
-{
- /*
- * Suspend scrub activity while we mess with the config.
- */
- spa_scrub_suspend(spa);
-
- mutex_enter(&spa_namespace_lock);
-
- spa_config_enter(spa, RW_WRITER, spa);
-
- return (spa_last_synced_txg(spa) + 1);
-}
-
-/*
- * Unlock the spa_t after adding or removing a vdev. Besides undoing the
- * locking of spa_vdev_enter(), we also want to make sure the transactions have
- * synced to disk, and then update the global configuration cache with the new
- * information.
- */
-int
-spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
-{
- int config_changed = B_FALSE;
-
- ASSERT(txg > spa_last_synced_txg(spa));
-
- /*
- * Reassess the DTLs.
- */
- vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
-
- /*
- * If the config changed, notify the scrub thread that it must restart.
- */
- if (error == 0 && !list_is_empty(&spa->spa_dirty_list)) {
- config_changed = B_TRUE;
- spa_scrub_restart(spa, txg);
- }
-
- spa_config_exit(spa, spa);
-
- /*
- * Allow scrubbing to resume.
- */
- spa_scrub_resume(spa);
-
- /*
- * Note: this txg_wait_synced() is important because it ensures
- * that there won't be more than one config change per txg.
- * This allows us to use the txg as the generation number.
- */
- if (error == 0)
- txg_wait_synced(spa->spa_dsl_pool, txg);
-
- if (vd != NULL) {
- ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0);
- vdev_free(vd);
- }
-
- /*
- * If the config changed, update the config cache.
- */
- if (config_changed)
- spa_config_sync();
-
- mutex_exit(&spa_namespace_lock);
-
- return (error);
-}
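-
-/*
- * Illustrative sketch (editorial) of the enter/exit pairing above; the
- * real callers (e.g. spa_vdev_add()) live in spa.c:
- *
- *	uint64_t txg = spa_vdev_enter(spa);
- *	error = (modify the vdev tree for 'txg');
- *	return (spa_vdev_exit(spa, NULL, txg, error));
- */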
-
-/*
- * ==========================================================================
- * Miscellaneous functions
- * ==========================================================================
- */
-
-/*
- * Rename a spa_t.
- */
-int
-spa_rename(const char *name, const char *newname)
-{
- spa_t *spa;
- int err;
-
- /*
- * Lookup the spa_t and grab the config lock for writing. We need to
- * actually open the pool so that we can sync out the necessary labels.
- * It's OK to call spa_open() with the namespace lock held because we
- * allow recursive calls for other reasons.
- */
- mutex_enter(&spa_namespace_lock);
- if ((err = spa_open(name, &spa, FTAG)) != 0) {
- mutex_exit(&spa_namespace_lock);
- return (err);
- }
-
- spa_config_enter(spa, RW_WRITER, FTAG);
-
- avl_remove(&spa_namespace_avl, spa);
- spa_strfree(spa->spa_name);
- spa->spa_name = spa_strdup(newname);
- avl_add(&spa_namespace_avl, spa);
-
- /*
- * Sync all labels to disk with the new names by marking the root vdev
- * dirty and waiting for it to sync. It will pick up the new pool name
- * during the sync.
- */
- vdev_config_dirty(spa->spa_root_vdev);
-
- spa_config_exit(spa, FTAG);
-
- txg_wait_synced(spa->spa_dsl_pool, 0);
-
- /*
- * Sync the updated config cache.
- */
- spa_config_sync();
-
- spa_close(spa, FTAG);
-
- mutex_exit(&spa_namespace_lock);
-
- return (0);
-}
-
-
-/*
- * Determine whether a pool with given pool_guid exists. If device_guid is
- * non-zero, determine whether the pool exists *and* contains a device with the
- * specified device_guid.
- */
-boolean_t
-spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
-{
- spa_t *spa;
- avl_tree_t *t = &spa_namespace_avl;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
- if (spa->spa_state == POOL_STATE_UNINITIALIZED)
- continue;
- if (spa->spa_root_vdev == NULL)
- continue;
- if (spa_guid(spa) == pool_guid) {
- if (device_guid == 0)
- break;
-
- if (vdev_lookup_by_guid(spa->spa_root_vdev,
- device_guid) != NULL)
- break;
-
- /*
- * Check any devices we may be in the process of adding.
- */
- if (spa->spa_pending_vdev) {
- if (vdev_lookup_by_guid(spa->spa_pending_vdev,
- device_guid) != NULL)
- break;
- }
- }
- }
-
- return (spa != NULL);
-}
-
-char *
-spa_strdup(const char *s)
-{
- size_t len;
- char *new;
-
- len = strlen(s);
- new = kmem_alloc(len + 1, KM_SLEEP);
- bcopy(s, new, len);
- new[len] = '\0';
-
- return (new);
-}
-
-void
-spa_strfree(char *s)
-{
- kmem_free(s, strlen(s) + 1);
-}
-
-uint64_t
-spa_get_random(uint64_t range)
-{
- uint64_t r;
-
- ASSERT(range != 0);
-
- (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
-
- return (r % range);
-}
-
-void
-sprintf_blkptr(char *buf, int len, const blkptr_t *bp)
-{
- int d;
-
- if (bp == NULL) {
- (void) snprintf(buf, len, "<NULL>");
- return;
- }
-
- if (BP_IS_HOLE(bp)) {
- (void) snprintf(buf, len, "<hole>");
- return;
- }
-
- (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ",
- (u_longlong_t)BP_GET_LEVEL(bp),
- dmu_ot[BP_GET_TYPE(bp)].ot_name,
- (u_longlong_t)BP_GET_LSIZE(bp),
- (u_longlong_t)BP_GET_PSIZE(bp));
-
- for (d = 0; d < BP_GET_NDVAS(bp); d++) {
- const dva_t *dva = &bp->blk_dva[d];
- (void) snprintf(buf + strlen(buf), len - strlen(buf),
- "DVA[%d]=<%llu:%llx:%llx> ", d,
- (u_longlong_t)DVA_GET_VDEV(dva),
- (u_longlong_t)DVA_GET_OFFSET(dva),
- (u_longlong_t)DVA_GET_ASIZE(dva));
- }
-
- (void) snprintf(buf + strlen(buf), len - strlen(buf),
- "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
- zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
- zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
- BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
- BP_IS_GANG(bp) ? "gang" : "contiguous",
- (u_longlong_t)bp->blk_birth,
- (u_longlong_t)bp->blk_fill,
- (u_longlong_t)bp->blk_cksum.zc_word[0],
- (u_longlong_t)bp->blk_cksum.zc_word[1],
- (u_longlong_t)bp->blk_cksum.zc_word[2],
- (u_longlong_t)bp->blk_cksum.zc_word[3]);
-}
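-
-/*
- * Editorial note: the format above renders a block pointer roughly as
- * follows (all values invented for illustration):
- *
- * [L0 DMU dnode] 4000L/800P DVA[0]=<0:2000:800> DVA[1]=<1:9000:800>
- * fletcher4 lzjb LE contiguous birth=12 fill=31 cksum=a1:b2:c3:d4
- */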
-
-void
-spa_freeze(spa_t *spa)
-{
- uint64_t freeze_txg = 0;
-
- spa_config_enter(spa, RW_WRITER, FTAG);
- if (spa->spa_freeze_txg == UINT64_MAX) {
- freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
- spa->spa_freeze_txg = freeze_txg;
- }
- spa_config_exit(spa, FTAG);
- if (freeze_txg != 0)
- txg_wait_synced(spa_get_dsl(spa), freeze_txg);
-}
-
-void
-zfs_panic_recover(const char *fmt, ...)
-{
- va_list adx;
-
- va_start(adx, fmt);
- vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
- va_end(adx);
-}
-
-/*
- * ==========================================================================
- * Accessor functions
- * ==========================================================================
- */
-
-krwlock_t *
-spa_traverse_rwlock(spa_t *spa)
-{
- return (&spa->spa_traverse_lock);
-}
-
-int
-spa_traverse_wanted(spa_t *spa)
-{
- return (spa->spa_traverse_wanted);
-}
-
-dsl_pool_t *
-spa_get_dsl(spa_t *spa)
-{
- return (spa->spa_dsl_pool);
-}
-
-blkptr_t *
-spa_get_rootblkptr(spa_t *spa)
-{
- return (&spa->spa_ubsync.ub_rootbp);
-}
-
-void
-spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
-{
- spa->spa_uberblock.ub_rootbp = *bp;
-}
-
-void
-spa_altroot(spa_t *spa, char *buf, size_t buflen)
-{
- if (spa->spa_root == NULL)
- buf[0] = '\0';
- else
- (void) strncpy(buf, spa->spa_root, buflen);
-}
-
-int
-spa_sync_pass(spa_t *spa)
-{
- return (spa->spa_sync_pass);
-}
-
-char *
-spa_name(spa_t *spa)
-{
- /*
- * Accessing the name requires holding either the namespace lock or the
- * config lock, both of which are required to do a rename.
- */
- ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
- spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER));
-
- return (spa->spa_name);
-}
-
-uint64_t
-spa_guid(spa_t *spa)
-{
- /*
- * If we fail to parse the config during spa_load(), we can go through
- * the error path (which posts an ereport) and end up here with no root
- * vdev. We stash the original pool guid in 'spa_load_guid' to handle
- * this case.
- */
- if (spa->spa_root_vdev != NULL)
- return (spa->spa_root_vdev->vdev_guid);
- else
- return (spa->spa_load_guid);
-}
-
-uint64_t
-spa_last_synced_txg(spa_t *spa)
-{
- return (spa->spa_ubsync.ub_txg);
-}
-
-uint64_t
-spa_first_txg(spa_t *spa)
-{
- return (spa->spa_first_txg);
-}
-
-int
-spa_state(spa_t *spa)
-{
- return (spa->spa_state);
-}
-
-uint64_t
-spa_freeze_txg(spa_t *spa)
-{
- return (spa->spa_freeze_txg);
-}
-
-/*
- * In the future, this may select among different metaslab classes
- * depending on the zdp. For now, there's no such distinction.
- */
-metaslab_class_t *
-spa_metaslab_class_select(spa_t *spa)
-{
- return (spa->spa_normal_class);
-}
-
-/*
- * Return how much space is allocated in the pool (ie. sum of all asize)
- */
-uint64_t
-spa_get_alloc(spa_t *spa)
-{
- return (spa->spa_root_vdev->vdev_stat.vs_alloc);
-}
-
-/*
- * Return how much (raid-z inflated) space there is in the pool.
- */
-uint64_t
-spa_get_space(spa_t *spa)
-{
- return (spa->spa_root_vdev->vdev_stat.vs_space);
-}
-
-/*
- * Return the amount of raid-z-deflated space in the pool.
- */
-uint64_t
-spa_get_dspace(spa_t *spa)
-{
- if (spa->spa_deflate)
- return (spa->spa_root_vdev->vdev_stat.vs_dspace);
- else
- return (spa->spa_root_vdev->vdev_stat.vs_space);
-}
-
-/* ARGSUSED */
-uint64_t
-spa_get_asize(spa_t *spa, uint64_t lsize)
-{
- /*
- * For now, the worst case is 512-byte RAID-Z blocks, in which
- * case the space requirement is exactly 2x; so just assume that.
- * Add to this the fact that we can have up to 3 DVAs per bp, and
- * we have to multiply by a total of 6x.
- */
- return (lsize * 6);
-}
-
-uint64_t
-spa_version(spa_t *spa)
-{
- return (spa->spa_ubsync.ub_version);
-}
-
-int
-spa_max_replication(spa_t *spa)
-{
- /*
- * As of ZFS_VERSION == ZFS_VERSION_DITTO_BLOCKS, we are able to
- * handle BPs with more than one DVA allocated. Set our max
- * replication level accordingly.
- */
- if (spa_version(spa) < ZFS_VERSION_DITTO_BLOCKS)
- return (1);
- return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
-}
-
-uint64_t
-bp_get_dasize(spa_t *spa, const blkptr_t *bp)
-{
- int sz = 0, i;
-
- if (!spa->spa_deflate)
- return (BP_GET_ASIZE(bp));
-
- for (i = 0; i < SPA_DVAS_PER_BP; i++) {
- vdev_t *vd =
- vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
- sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> SPA_MINBLOCKSHIFT) *
- vd->vdev_deflate_ratio;
- }
- return (sz);
-}
-
-/*
- * ==========================================================================
- * Initialization and Termination
- * ==========================================================================
- */
-
-static int
-spa_name_compare(const void *a1, const void *a2)
-{
- const spa_t *s1 = a1;
- const spa_t *s2 = a2;
- int s;
-
- s = strcmp(s1->spa_name, s2->spa_name);
- if (s > 0)
- return (1);
- if (s < 0)
- return (-1);
- return (0);
-}
-
-int
-spa_busy(void)
-{
- return (spa_active_count);
-}
-
-void
-spa_init(int mode)
-{
- mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
-
- avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
- offsetof(spa_t, spa_avl));
-
- mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
-
- avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_spare_t),
- offsetof(spa_spare_t, spare_avl));
-
- spa_mode = mode;
-
- refcount_init();
- unique_init();
- zio_init();
- dmu_init();
- zil_init();
- spa_config_load();
-}
-
-void
-spa_fini(void)
-{
- spa_evict_all();
-
- zil_fini();
- dmu_fini();
- zio_fini();
- refcount_fini();
-
- avl_destroy(&spa_namespace_avl);
- avl_destroy(&spa_spare_avl);
-
- cv_destroy(&spa_namespace_cv);
- mutex_destroy(&spa_namespace_lock);
- mutex_destroy(&spa_spare_lock);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c
deleted file mode 100644
index 23313a9..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c
+++ /dev/null
@@ -1,501 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/zio.h>
-#include <sys/space_map.h>
-
-/*
- * Space map routines.
- * NOTE: caller is responsible for all locking.
- */
-static int
-space_map_seg_compare(const void *x1, const void *x2)
-{
- const space_seg_t *s1 = x1;
- const space_seg_t *s2 = x2;
-
- if (s1->ss_start < s2->ss_start) {
- if (s1->ss_end > s2->ss_start)
- return (0);
- return (-1);
- }
- if (s1->ss_start > s2->ss_start) {
- if (s1->ss_start < s2->ss_end)
- return (0);
- return (1);
- }
- return (0);
-}
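-
-/*
- * Editorial note: this comparator deliberately reports overlapping
- * segments as equal, so avl_find() with a <start, end> search key returns
- * any segment that intersects it; the add/remove/contains routines below
- * rely on exactly that property.
- */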
-
-void
-space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
- kmutex_t *lp)
-{
- bzero(sm, sizeof (*sm));
-
- cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL);
- avl_create(&sm->sm_root, space_map_seg_compare,
- sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
-
- sm->sm_start = start;
- sm->sm_size = size;
- sm->sm_shift = shift;
- sm->sm_lock = lp;
-}
-
-void
-space_map_destroy(space_map_t *sm)
-{
- ASSERT(!sm->sm_loaded && !sm->sm_loading);
- VERIFY3U(sm->sm_space, ==, 0);
- avl_destroy(&sm->sm_root);
- cv_destroy(&sm->sm_load_cv);
-}
-
-void
-space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
-{
- avl_index_t where;
- space_seg_t ssearch, *ss_before, *ss_after, *ss;
- uint64_t end = start + size;
- int merge_before, merge_after;
-
- ASSERT(MUTEX_HELD(sm->sm_lock));
- VERIFY(size != 0);
- VERIFY3U(start, >=, sm->sm_start);
- VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
- VERIFY(sm->sm_space + size <= sm->sm_size);
- VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
- VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
-
- ssearch.ss_start = start;
- ssearch.ss_end = end;
- ss = avl_find(&sm->sm_root, &ssearch, &where);
-
- if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) {
- zfs_panic_recover("zfs: allocating allocated segment"
- "(offset=%llu size=%llu)\n",
- (longlong_t)start, (longlong_t)size);
- return;
- }
-
- /* Make sure we don't overlap with either of our neighbors */
- VERIFY(ss == NULL);
-
- ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
- ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);
-
- merge_before = (ss_before != NULL && ss_before->ss_end == start);
- merge_after = (ss_after != NULL && ss_after->ss_start == end);
-
- if (merge_before && merge_after) {
- avl_remove(&sm->sm_root, ss_before);
- ss_after->ss_start = ss_before->ss_start;
- kmem_free(ss_before, sizeof (*ss_before));
- } else if (merge_before) {
- ss_before->ss_end = end;
- } else if (merge_after) {
- ss_after->ss_start = start;
- } else {
- ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
- ss->ss_start = start;
- ss->ss_end = end;
- avl_insert(&sm->sm_root, ss, where);
- }
-
- sm->sm_space += size;
-}
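-
-/*
- * Illustrative sketch (editorial): adjacent segments coalesce. With
- * sm_shift = 9 (512-byte granularity):
- *
- *	space_map_add(sm, 0x1000, 0x200);	adds [0x1000, 0x1200)
- *	space_map_add(sm, 0x1400, 0x200);	adds [0x1400, 0x1600)
- *	space_map_add(sm, 0x1200, 0x200);	bridges the gap; the tree
- *						now holds the single
- *						segment [0x1000, 0x1600)
- */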
-
-void
-space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
-{
- avl_index_t where;
- space_seg_t ssearch, *ss, *newseg;
- uint64_t end = start + size;
- int left_over, right_over;
-
- ASSERT(MUTEX_HELD(sm->sm_lock));
- VERIFY(size != 0);
- VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
- VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
-
- ssearch.ss_start = start;
- ssearch.ss_end = end;
- ss = avl_find(&sm->sm_root, &ssearch, &where);
-
- /* Make sure we completely overlap with someone */
- if (ss == NULL) {
- zfs_panic_recover("zfs: freeing free segment "
- "(offset=%llu size=%llu)",
- (longlong_t)start, (longlong_t)size);
- return;
- }
- VERIFY3U(ss->ss_start, <=, start);
- VERIFY3U(ss->ss_end, >=, end);
- VERIFY(sm->sm_space - size <= sm->sm_size);
-
- left_over = (ss->ss_start != start);
- right_over = (ss->ss_end != end);
-
- if (left_over && right_over) {
- newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
- newseg->ss_start = end;
- newseg->ss_end = ss->ss_end;
- ss->ss_end = start;
- avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
- } else if (left_over) {
- ss->ss_end = start;
- } else if (right_over) {
- ss->ss_start = end;
- } else {
- avl_remove(&sm->sm_root, ss);
- kmem_free(ss, sizeof (*ss));
- }
-
- sm->sm_space -= size;
-}
-
-int
-space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
-{
- avl_index_t where;
- space_seg_t ssearch, *ss;
- uint64_t end = start + size;
-
- ASSERT(MUTEX_HELD(sm->sm_lock));
- VERIFY(size != 0);
- VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
- VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
-
- ssearch.ss_start = start;
- ssearch.ss_end = end;
- ss = avl_find(&sm->sm_root, &ssearch, &where);
-
- return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end);
-}
-
-void
-space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
-{
- space_seg_t *ss;
- void *cookie = NULL;
-
- ASSERT(MUTEX_HELD(sm->sm_lock));
-
- while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
- if (func != NULL)
- func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
- kmem_free(ss, sizeof (*ss));
- }
- sm->sm_space = 0;
-}
-
-void
-space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
-{
- space_seg_t *ss;
-
- for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
- func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
-}
-
-void
-space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
-{
- avl_tree_t *t = &sm->sm_root;
- avl_index_t where;
- space_seg_t *ss, search;
- uint64_t end = start + size;
- uint64_t rm_start, rm_end;
-
- ASSERT(MUTEX_HELD(sm->sm_lock));
-
- search.ss_start = start;
- search.ss_end = start;
-
- for (;;) {
- ss = avl_find(t, &search, &where);
-
- if (ss == NULL)
- ss = avl_nearest(t, where, AVL_AFTER);
-
- if (ss == NULL || ss->ss_start >= end)
- break;
-
- rm_start = MAX(ss->ss_start, start);
- rm_end = MIN(ss->ss_end, end);
-
- space_map_remove(sm, rm_start, rm_end - rm_start);
- }
-}
-
-/*
- * Replace smd with the union of smd and sms.
- */
-void
-space_map_union(space_map_t *smd, space_map_t *sms)
-{
- avl_tree_t *t = &sms->sm_root;
- space_seg_t *ss;
-
- ASSERT(MUTEX_HELD(smd->sm_lock));
-
- /*
- * For each source segment, remove any intersections with the
- * destination, then add the source segment to the destination.
- */
- for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
- space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
- space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
- }
-}
-
-/*
- * Wait for any in-progress space_map_load() to complete.
- */
-void
-space_map_load_wait(space_map_t *sm)
-{
- ASSERT(MUTEX_HELD(sm->sm_lock));
-
- while (sm->sm_loading)
- cv_wait(&sm->sm_load_cv, sm->sm_lock);
-}
-
-/*
- * Note: space_map_load() will drop sm_lock across dmu_read() calls.
- * The caller must be OK with this.
- */
-int
-space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
- space_map_obj_t *smo, objset_t *os)
-{
- uint64_t *entry, *entry_map, *entry_map_end;
- uint64_t bufsize, size, offset, end, space;
- uint64_t mapstart = sm->sm_start;
-
- ASSERT(MUTEX_HELD(sm->sm_lock));
-
- space_map_load_wait(sm);
-
- if (sm->sm_loaded)
- return (0);
-
- sm->sm_loading = B_TRUE;
- end = smo->smo_objsize;
- space = smo->smo_alloc;
-
- ASSERT(sm->sm_ops == NULL);
- VERIFY3U(sm->sm_space, ==, 0);
-
- if (maptype == SM_FREE) {
- space_map_add(sm, sm->sm_start, sm->sm_size);
- space = sm->sm_size - space;
- }
-
- bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT;
- entry_map = zio_buf_alloc(bufsize);
-
- mutex_exit(sm->sm_lock);
- if (end > bufsize)
- dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize);
- mutex_enter(sm->sm_lock);
-
- for (offset = 0; offset < end; offset += bufsize) {
- size = MIN(end - offset, bufsize);
- VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
- VERIFY(size != 0);
-
- dprintf("object=%llu offset=%llx size=%llx\n",
- smo->smo_object, offset, size);
-
- mutex_exit(sm->sm_lock);
- VERIFY3U(dmu_read(os, smo->smo_object, offset, size,
- entry_map), ==, 0);
- mutex_enter(sm->sm_lock);
-
- entry_map_end = entry_map + (size / sizeof (uint64_t));
- for (entry = entry_map; entry < entry_map_end; entry++) {
- uint64_t e = *entry;
-
- if (SM_DEBUG_DECODE(e)) /* Skip debug entries */
- continue;
-
- (SM_TYPE_DECODE(e) == maptype ?
- space_map_add : space_map_remove)(sm,
- (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
- SM_RUN_DECODE(e) << sm->sm_shift);
- }
- }
- VERIFY3U(sm->sm_space, ==, space);
-
- zio_buf_free(entry_map, bufsize);
-
- sm->sm_loading = B_FALSE;
- sm->sm_loaded = B_TRUE;
- sm->sm_ops = ops;
-
- cv_broadcast(&sm->sm_load_cv);
-
- if (ops != NULL)
- ops->smop_load(sm);
-
- return (0);
-}
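-
-/*
- * Editorial note on the SM_FREE case above: the map is seeded with the
- * entire range and the on-disk entries then carve allocations out of it,
- * so the expected final space is sm_size - smo_alloc rather than
- * smo_alloc itself; the VERIFY3U() after the loop checks that inverted
- * total.
- */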
-
-void
-space_map_unload(space_map_t *sm)
-{
- ASSERT(MUTEX_HELD(sm->sm_lock));
-
- if (sm->sm_loaded && sm->sm_ops != NULL)
- sm->sm_ops->smop_unload(sm);
-
- sm->sm_loaded = B_FALSE;
- sm->sm_ops = NULL;
-
- space_map_vacate(sm, NULL, NULL);
-}
-
-uint64_t
-space_map_alloc(space_map_t *sm, uint64_t size)
-{
- uint64_t start;
-
- start = sm->sm_ops->smop_alloc(sm, size);
- if (start != -1ULL)
- space_map_remove(sm, start, size);
- return (start);
-}
-
-void
-space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
-{
- sm->sm_ops->smop_claim(sm, start, size);
- space_map_remove(sm, start, size);
-}
-
-void
-space_map_free(space_map_t *sm, uint64_t start, uint64_t size)
-{
- space_map_add(sm, start, size);
- sm->sm_ops->smop_free(sm, start, size);
-}
-
-/*
- * Note: space_map_sync() will drop sm_lock across dmu_write() calls.
- */
-void
-space_map_sync(space_map_t *sm, uint8_t maptype,
- space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_objset_spa(os);
- void *cookie = NULL;
- space_seg_t *ss;
- uint64_t bufsize, start, size, run_len;
- uint64_t *entry, *entry_map, *entry_map_end;
-
- ASSERT(MUTEX_HELD(sm->sm_lock));
-
- if (sm->sm_space == 0)
- return;
-
- dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
- smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
- maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
- sm->sm_space);
-
- if (maptype == SM_ALLOC)
- smo->smo_alloc += sm->sm_space;
- else
- smo->smo_alloc -= sm->sm_space;
-
- bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
- bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT);
- entry_map = zio_buf_alloc(bufsize);
- entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
- entry = entry_map;
-
- *entry++ = SM_DEBUG_ENCODE(1) |
- SM_DEBUG_ACTION_ENCODE(maptype) |
- SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
- SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
-
- while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
- size = ss->ss_end - ss->ss_start;
- start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
-
- sm->sm_space -= size;
- size >>= sm->sm_shift;
-
- while (size) {
- run_len = MIN(size, SM_RUN_MAX);
-
- if (entry == entry_map_end) {
- mutex_exit(sm->sm_lock);
- dmu_write(os, smo->smo_object, smo->smo_objsize,
- bufsize, entry_map, tx);
- mutex_enter(sm->sm_lock);
- smo->smo_objsize += bufsize;
- entry = entry_map;
- }
-
- *entry++ = SM_OFFSET_ENCODE(start) |
- SM_TYPE_ENCODE(maptype) |
- SM_RUN_ENCODE(run_len);
-
- start += run_len;
- size -= run_len;
- }
- kmem_free(ss, sizeof (*ss));
- }
-
- if (entry != entry_map) {
- size = (entry - entry_map) * sizeof (uint64_t);
- mutex_exit(sm->sm_lock);
- dmu_write(os, smo->smo_object, smo->smo_objsize,
- size, entry_map, tx);
- mutex_enter(sm->sm_lock);
- smo->smo_objsize += size;
- }
-
- zio_buf_free(entry_map, bufsize);
-
- VERIFY3U(sm->sm_space, ==, 0);
-}
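-
-/*
- * Illustrative sketch (editorial): each non-debug entry written above
- * packs an offset (in sm_shift units), an alloc/free type and a run
- * length into a single uint64_t; space_map_load() reverses the encoding:
- *
- *	uint64_t e = *entry;
- *	if (!SM_DEBUG_DECODE(e)) {
- *		uint64_t start = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
- *		    sm->sm_start;
- *		uint64_t size = SM_RUN_DECODE(e) << sm->sm_shift;
- *		boolean_t alloc = (SM_TYPE_DECODE(e) == SM_ALLOC);
- *	}
- */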
-
-void
-space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
-{
- VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0);
-
- smo->smo_objsize = 0;
- smo->smo_alloc = 0;
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
deleted file mode 100644
index f58ffc0..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ARC_H
-#define _SYS_ARC_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/zio.h>
-
-typedef struct arc_buf_hdr arc_buf_hdr_t;
-typedef struct arc_buf arc_buf_t;
-typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
-typedef void arc_byteswap_func_t(void *buf, size_t size);
-typedef int arc_evict_func_t(void *private);
-
-/* generic arc_done_func_t's which you can use */
-arc_done_func_t arc_bcopy_func;
-arc_done_func_t arc_getbuf_func;
-
-struct arc_buf {
- arc_buf_hdr_t *b_hdr;
- arc_buf_t *b_next;
- void *b_data;
- arc_evict_func_t *b_efunc;
- void *b_private;
-};
-
-typedef enum arc_buf_contents {
- ARC_BUFC_UNDEF, /* buffer contents undefined */
- ARC_BUFC_DATA, /* buffer contains data */
- ARC_BUFC_METADATA /* buffer contains metadata */
-} arc_buf_contents_t;
-/*
- * These are the flags we pass into calls to the arc
- */
-#define ARC_WAIT (1 << 1) /* perform I/O synchronously */
-#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */
-#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
-#define ARC_CACHED (1 << 4) /* I/O was already in cache */
-
-arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
- arc_buf_contents_t type);
-void arc_buf_add_ref(arc_buf_t *buf, void *tag);
-int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
-int arc_buf_size(arc_buf_t *buf);
-void arc_release(arc_buf_t *buf, void *tag);
-int arc_released(arc_buf_t *buf);
-int arc_has_callback(arc_buf_t *buf);
-void arc_buf_freeze(arc_buf_t *buf);
-void arc_buf_thaw(arc_buf_t *buf);
-#ifdef ZFS_DEBUG
-int arc_referenced(arc_buf_t *buf);
-#endif
-
-int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
- arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t *arc_flags, zbookmark_t *zb);
-zio_t *arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
- int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
- int flags, zbookmark_t *zb);
-int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, uint32_t arc_flags);
-int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
-
-void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
-int arc_buf_evict(arc_buf_t *buf);
-
-void arc_flush(void);
-void arc_tempreserve_clear(uint64_t tempreserve);
-int arc_tempreserve_space(uint64_t tempreserve);
-
-void arc_init(void);
-void arc_fini(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ARC_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
deleted file mode 100644
index b4c8376..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_BPLIST_H
-#define _SYS_BPLIST_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct bplist_phys {
- /*
- * This is the bonus buffer for the dead lists. The object's
-	 * contents are an array of bpl_entries blkptr_t's, representing
- * a total of bpl_bytes physical space.
- */
- uint64_t bpl_entries;
- uint64_t bpl_bytes;
- uint64_t bpl_comp;
- uint64_t bpl_uncomp;
-} bplist_phys_t;
-
-#define BPLIST_SIZE_V0 (2 * sizeof (uint64_t))
-
-typedef struct bplist_q {
- blkptr_t bpq_blk;
- void *bpq_next;
-} bplist_q_t;
-
-typedef struct bplist {
- kmutex_t bpl_lock;
- objset_t *bpl_mos;
- uint64_t bpl_object;
- uint8_t bpl_blockshift;
- uint8_t bpl_bpshift;
- uint8_t bpl_havecomp;
- bplist_q_t *bpl_queue;
- bplist_phys_t *bpl_phys;
- dmu_buf_t *bpl_dbuf;
- dmu_buf_t *bpl_cached_dbuf;
-} bplist_t;
-
-extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
-extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
-extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
-extern void bplist_close(bplist_t *bpl);
-extern boolean_t bplist_empty(bplist_t *bpl);
-extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
-extern int bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx);
-extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp);
-extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
-extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
-extern int bplist_space(bplist_t *bpl,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
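/*
 * Illustrative sketch (not part of the original header): walking an open
 * bplist with the cursor-style iterator.  bplist_iterate() fills *bp and
 * advances *itorp until the entries are exhausted, at which point it
 * returns a nonzero errno.
 */
static void
example_bplist_walk(bplist_t *bpl)
{
	uint64_t itor = 0;
	blkptr_t blk;

	while (bplist_iterate(bpl, &itor, &blk) == 0) {
		/* each entry is one dead block pointer; process it here */
	}
}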
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_BPLIST_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
deleted file mode 100644
index d33657b..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
+++ /dev/null
@@ -1,334 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DBUF_H
-#define _SYS_DBUF_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
-#include <sys/arc.h>
-#include <sys/zfs_context.h>
-#include <sys/refcount.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define DB_BONUS_BLKID (-1ULL)
-#define IN_DMU_SYNC 2
-
-/*
- * define flags for dbuf_read
- */
-
-#define DB_RF_MUST_SUCCEED (1 << 0)
-#define DB_RF_CANFAIL (1 << 1)
-#define DB_RF_HAVESTRUCT (1 << 2)
-#define DB_RF_NOPREFETCH (1 << 3)
-#define DB_RF_NEVERWAIT (1 << 4)
-#define DB_RF_CACHED (1 << 5)
-
-/*
- * The state transition diagram for dbufs looks like:
- *
- * +----> READ ----+
- * | |
- * | V
- * (alloc)-->UNCACHED CACHED-->EVICTING-->(free)
- * | ^
- * | |
- * +----> FILL ----+
- */
-typedef enum dbuf_states {
- DB_UNCACHED,
- DB_FILL,
- DB_READ,
- DB_CACHED,
- DB_EVICTING
-} dbuf_states_t;
-
-struct objset_impl;
-struct dnode;
-struct dmu_tx;
-
-/*
- * level = 0 means the user data
- * level = 1 means the single indirect block
- * etc.
- */
-
-#define LIST_LINK_INACTIVE(link) \
- ((link)->list_next == NULL && (link)->list_prev == NULL)
-
-struct dmu_buf_impl;
-
-typedef enum override_states {
- DR_NOT_OVERRIDDEN,
- DR_IN_DMU_SYNC,
- DR_OVERRIDDEN
-} override_states_t;
-
-typedef struct dbuf_dirty_record {
-	/* link on our parent's dirty list */
- list_node_t dr_dirty_node;
-
- /* transaction group this data will sync in */
- uint64_t dr_txg;
-
- /* zio of outstanding write IO */
- zio_t *dr_zio;
-
- /* pointer back to our dbuf */
- struct dmu_buf_impl *dr_dbuf;
-
- /* pointer to next dirty record */
- struct dbuf_dirty_record *dr_next;
-
- /* pointer to parent dirty record */
- struct dbuf_dirty_record *dr_parent;
-
- union dirty_types {
- struct dirty_indirect {
-
- /* protect access to list */
- kmutex_t dr_mtx;
-
- /* Our list of dirty children */
- list_t dr_children;
- } di;
- struct dirty_leaf {
-
- /*
- * dr_data is set when we dirty the buffer
- * so that we can retain the pointer even if it
- * gets COW'd in a subsequent transaction group.
- */
- arc_buf_t *dr_data;
- blkptr_t dr_overridden_by;
- override_states_t dr_override_state;
- } dl;
- } dt;
-} dbuf_dirty_record_t;
-
-typedef struct dmu_buf_impl {
- /*
- * The following members are immutable, with the exception of
- * db.db_data, which is protected by db_mtx.
- */
-
- /* the publicly visible structure */
- dmu_buf_t db;
-
- /* the objset we belong to */
- struct objset_impl *db_objset;
-
- /*
- * the dnode we belong to (NULL when evicted)
- */
- struct dnode *db_dnode;
-
- /*
- * our parent buffer; if the dnode points to us directly,
- * db_parent == db_dnode->dn_dbuf
- * only accessed by sync thread ???
- * (NULL when evicted)
- */
- struct dmu_buf_impl *db_parent;
-
- /*
- * link for hash table of all dmu_buf_impl_t's
- */
- struct dmu_buf_impl *db_hash_next;
-
- /* our block number */
- uint64_t db_blkid;
-
- /*
- * Pointer to the blkptr_t which points to us. May be NULL if we
- * don't have one yet. (NULL when evicted)
- */
- blkptr_t *db_blkptr;
-
- /*
- * Our indirection level. Data buffers have db_level==0.
- * Indirect buffers which point to data buffers have
-	 * db_level==1, etc.  Buffers which contain dnodes have
- * db_level==0, since the dnodes are stored in a file.
- */
- uint8_t db_level;
-
- /* db_mtx protects the members below */
- kmutex_t db_mtx;
-
- /*
- * Current state of the buffer
- */
- dbuf_states_t db_state;
-
- /*
- * Refcount accessed by dmu_buf_{hold,rele}.
- * If nonzero, the buffer can't be destroyed.
- * Protected by db_mtx.
- */
- refcount_t db_holds;
-
- /* buffer holding our data */
- arc_buf_t *db_buf;
-
- kcondvar_t db_changed;
- dbuf_dirty_record_t *db_data_pending;
-
- /* pointer to most recent dirty record for this buffer */
- dbuf_dirty_record_t *db_last_dirty;
-
- /*
-	 * Our link on the owner dnode's dn_dbufs list.
- * Protected by its dn_dbufs_mtx.
- */
- list_node_t db_link;
-
- /* Data which is unique to data (leaf) blocks: */
-
- /* stuff we store for the user (see dmu_buf_set_user) */
- void *db_user_ptr;
- void **db_user_data_ptr_ptr;
- dmu_buf_evict_func_t *db_evict_func;
-
- uint8_t db_immediate_evict;
- uint8_t db_freed_in_flight;
-
- uint8_t db_dirtycnt;
-} dmu_buf_impl_t;
-
-/* Note: the dbuf hash table is exposed only for the mdb module */
-#define DBUF_MUTEXES 256
-#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
-typedef struct dbuf_hash_table {
- uint64_t hash_table_mask;
- dmu_buf_impl_t **hash_table;
- kmutex_t hash_mutexes[DBUF_MUTEXES];
-} dbuf_hash_table_t;
-
-
-uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
-
-dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
-dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn);
-
-dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
-dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
- void *tag);
-int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
- void *tag, dmu_buf_impl_t **dbp);
-
-void dbuf_prefetch(struct dnode *dn, uint64_t blkid);
-
-void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
-uint64_t dbuf_refcount(dmu_buf_impl_t *db);
-
-void dbuf_rele(dmu_buf_impl_t *db, void *tag);
-
-dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
-
-int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
-void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
-void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
-dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-
-void dbuf_clear(dmu_buf_impl_t *db);
-void dbuf_evict(dmu_buf_impl_t *db);
-
-void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dbuf_unoverride(dbuf_dirty_record_t *dr);
-void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
-
-void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks,
- struct dmu_tx *);
-
-void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
-
-void dbuf_init(void);
-void dbuf_fini(void);
-
-#define DBUF_GET_BUFC_TYPE(db) \
- ((((db)->db_level > 0) || \
- (dmu_ot[(db)->db_dnode->dn_type].ot_metadata)) ? \
-	    ARC_BUFC_METADATA : ARC_BUFC_DATA)
-
-#ifdef ZFS_DEBUG
-
-/*
- * There should be a ## between the string literal and fmt, to make it
- * clear that we're joining two strings together, but gcc does not
- * support that preprocessor token.
- */
-#define dprintf_dbuf(dbuf, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char __db_buf[32]; \
- uint64_t __db_obj = (dbuf)->db.db_object; \
- if (__db_obj == DMU_META_DNODE_OBJECT) \
- (void) strcpy(__db_buf, "mdn"); \
- else \
- (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
- (u_longlong_t)__db_obj); \
- dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
- "obj=%s lvl=%u blkid=%lld " fmt, \
- __db_buf, (dbuf)->db_level, \
- (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
- } \
-_NOTE(CONSTCOND) } while (0)
-
-#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
- sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \
- dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
- kmem_free(__blkbuf, BP_SPRINTF_LEN); \
- } \
-_NOTE(CONSTCOND) } while (0)
-
-#define DBUF_VERIFY(db) dbuf_verify(db)
-
-#else
-
-#define dprintf_dbuf(db, fmt, ...)
-#define dprintf_dbuf_bp(db, bp, fmt, ...)
-#define DBUF_VERIFY(db)
-
-#endif
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DBUF_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
deleted file mode 100644
index 8c2a1fd..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
+++ /dev/null
@@ -1,587 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DMU_H
-#define _SYS_DMU_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * This file describes the interface that the DMU provides for its
- * consumers.
- *
- * The DMU also interacts with the SPA. That interface is described in
- * dmu_spa.h.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct uio;
-struct page;
-struct vnode;
-struct spa;
-struct zilog;
-struct zio;
-struct blkptr;
-struct zap_cursor;
-struct dsl_dataset;
-struct dsl_pool;
-struct dnode;
-struct drr_begin;
-struct drr_end;
-struct zbookmark;
-struct nvlist;
-struct objset_impl;
-struct file;
-
-typedef struct objset objset_t;
-typedef struct dmu_tx dmu_tx_t;
-typedef struct dsl_dir dsl_dir_t;
-
-typedef enum dmu_object_type {
- DMU_OT_NONE,
- /* general: */
- DMU_OT_OBJECT_DIRECTORY, /* ZAP */
- DMU_OT_OBJECT_ARRAY, /* UINT64 */
- DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
- DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
- DMU_OT_BPLIST, /* UINT64 */
- DMU_OT_BPLIST_HDR, /* UINT64 */
- /* spa: */
- DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
- DMU_OT_SPACE_MAP, /* UINT64 */
- /* zil: */
- DMU_OT_INTENT_LOG, /* UINT64 */
- /* dmu: */
- DMU_OT_DNODE, /* DNODE */
- DMU_OT_OBJSET, /* OBJSET */
- /* dsl: */
- DMU_OT_DSL_DIR, /* UINT64 */
- DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */
- DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */
- DMU_OT_DSL_PROPS, /* ZAP */
- DMU_OT_DSL_DATASET, /* UINT64 */
- /* zpl: */
- DMU_OT_ZNODE, /* ZNODE */
- DMU_OT_ACL, /* ACL */
- DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
- DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
- DMU_OT_MASTER_NODE, /* ZAP */
- DMU_OT_UNLINKED_SET, /* ZAP */
- /* zvol: */
- DMU_OT_ZVOL, /* UINT8 */
- DMU_OT_ZVOL_PROP, /* ZAP */
- /* other; for testing only! */
- DMU_OT_PLAIN_OTHER, /* UINT8 */
- DMU_OT_UINT64_OTHER, /* UINT64 */
- DMU_OT_ZAP_OTHER, /* ZAP */
- /* new object types: */
- DMU_OT_ERROR_LOG, /* ZAP */
- DMU_OT_SPA_HISTORY, /* UINT8 */
- DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
- DMU_OT_POOL_PROPS, /* ZAP */
-
- DMU_OT_NUMTYPES
-} dmu_object_type_t;
-
-typedef enum dmu_objset_type {
- DMU_OST_NONE,
- DMU_OST_META,
- DMU_OST_ZFS,
- DMU_OST_ZVOL,
- DMU_OST_OTHER, /* For testing only! */
- DMU_OST_ANY, /* Be careful! */
- DMU_OST_NUMTYPES
-} dmu_objset_type_t;
-
-void byteswap_uint64_array(void *buf, size_t size);
-void byteswap_uint32_array(void *buf, size_t size);
-void byteswap_uint16_array(void *buf, size_t size);
-void byteswap_uint8_array(void *buf, size_t size);
-void zap_byteswap(void *buf, size_t size);
-void zfs_acl_byteswap(void *buf, size_t size);
-void zfs_znode_byteswap(void *buf, size_t size);
-
-#define DS_MODE_NONE 0 /* invalid, to aid debugging */
-#define DS_MODE_STANDARD 1 /* normal access, no special needs */
-#define DS_MODE_PRIMARY 2 /* the "main" access, e.g. a mount */
-#define DS_MODE_EXCLUSIVE 3 /* exclusive access, e.g. to destroy */
-#define DS_MODE_LEVELS 4
-#define DS_MODE_LEVEL(x) ((x) & (DS_MODE_LEVELS - 1))
-#define DS_MODE_READONLY 0x8
-#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY)
-#define DS_MODE_INCONSISTENT 0x10
-#define DS_MODE_IS_INCONSISTENT(x) ((x) & DS_MODE_INCONSISTENT)
-
-#define DS_FIND_SNAPSHOTS (1<<0)
-#define DS_FIND_CHILDREN (1<<1)
-
-/*
- * The maximum number of bytes that can be accessed as part of one
- * operation, including metadata.
- */
-#define DMU_MAX_ACCESS (10<<20) /* 10MB */
-
-/*
- * Public routines to create, destroy, open, and close objsets.
- */
-int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
- objset_t **osp);
-void dmu_objset_close(objset_t *os);
-int dmu_objset_evict_dbufs(objset_t *os, int try);
-int dmu_objset_create(const char *name, dmu_objset_type_t type,
- objset_t *clone_parent,
- void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
-int dmu_objset_destroy(const char *name);
-int dmu_snapshots_destroy(char *fsname, char *snapname);
-int dmu_objset_rollback(const char *name);
-int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive);
-int dmu_objset_rename(const char *name, const char *newname,
- boolean_t recursive);
-int dmu_objset_find(char *name, int func(char *, void *), void *arg,
- int flags);
-void dmu_objset_byteswap(void *buf, size_t size);
-
-typedef struct dmu_buf {
- uint64_t db_object; /* object that this buffer is part of */
- uint64_t db_offset; /* byte offset in this object */
- uint64_t db_size; /* size of buffer in bytes */
- void *db_data; /* data in buffer */
-} dmu_buf_t;
-
-typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
-
-/*
- * Callback function to perform byte swapping on a block.
- */
-typedef void dmu_byteswap_func_t(void *buf, size_t size);
-
-/*
- * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
- */
-#define DMU_POOL_DIRECTORY_OBJECT 1
-#define DMU_POOL_CONFIG "config"
-#define DMU_POOL_ROOT_DATASET "root_dataset"
-#define DMU_POOL_SYNC_BPLIST "sync_bplist"
-#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
-#define DMU_POOL_ERRLOG_LAST "errlog_last"
-#define DMU_POOL_SPARES "spares"
-#define DMU_POOL_DEFLATE "deflate"
-#define DMU_POOL_HISTORY "history"
-#define DMU_POOL_PROPS "pool_props"
-
-/*
- * Allocate an object from this objset. The range of object numbers
- * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode.
- *
- * The transaction must be assigned to a txg. The newly allocated
- * object will be "held" in the transaction (ie. you can modify the
- * newly allocated object in this transaction).
- *
- * dmu_object_alloc() chooses an object and returns it in *objectp.
- *
- * dmu_object_claim() allocates a specific object number. If that
- * number is already allocated, it fails and returns EEXIST.
- *
- * Return 0 on success, or ENOSPC or EEXIST as specified above.
- */
-uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
-int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
-int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
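/*
 * Illustrative sketch (not part of the original header): allocating a new
 * object inside an already-assigned transaction.  A blocksize of 0 asks
 * for the default, and DMU_OT_NONE/0 requests no bonus buffer; the object
 * type here is just an illustrative choice.
 */
static uint64_t
example_object_alloc(objset_t *os, dmu_tx_t *tx)
{
	return (dmu_object_alloc(os, DMU_OT_PLAIN_FILE_CONTENTS, 0,
	    DMU_OT_NONE, 0, tx));
}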
-
-/*
- * Free an object from this objset.
- *
- * The object's data will be freed as well (ie. you don't need to call
- * dmu_free(object, 0, -1, tx)).
- *
- * The object need not be held in the transaction.
- *
- * If there are any holds on this object's buffers (via dmu_buf_hold()),
- * or tx holds on the object (via dmu_tx_hold_object()), you can not
- * free it; it fails and returns EBUSY.
- *
- * If the object is not allocated, it fails and returns ENOENT.
- *
- * Return 0 on success, or EBUSY or ENOENT as specified above.
- */
-int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
-
-/*
- * Find the next allocated or free object.
- *
- * The objectp parameter is in-out. It will be updated to be the next
- * object which is allocated. Ignore objects which have not been
- * modified since txg.
- *
- * XXX Can only be called on an objset with no dirty data.
- *
- * Returns 0 on success, or ENOENT if there are no more objects.
- */
-int dmu_object_next(objset_t *os, uint64_t *objectp,
- boolean_t hole, uint64_t txg);
-
-/*
- * Set the data blocksize for an object.
- *
- * The object cannot have any blocks allocated beyond the first.  If
- * the first block is allocated already, the new size must be greater
- * than the current block size. If these conditions are not met,
- * ENOTSUP will be returned.
- *
- * Returns 0 on success, or EBUSY if there are any holds on the object
- * contents, or ENOTSUP as described above.
- */
-int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
- int ibs, dmu_tx_t *tx);
-
-/*
- * Set the checksum property on a dnode. The new checksum algorithm will
- * apply to all newly written blocks; existing blocks will not be affected.
- */
-void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
- dmu_tx_t *tx);
-
-/*
- * Set the compress property on a dnode. The new compression algorithm will
- * apply to all newly written blocks; existing blocks will not be affected.
- */
-void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
- dmu_tx_t *tx);
-
-/*
- * Decide how many copies of a given block we should make. Can be from
- * 1 to SPA_DVAS_PER_BP.
- */
-int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb,
- dmu_object_type_t ot);
-/*
- * The bonus data is accessed more or less like a regular buffer.
- * You must dmu_bonus_hold() to get the buffer, which will give you a
- * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
- * data. As with any normal buffer, you must call dmu_buf_read() to
- * read db_data, dmu_buf_will_dirty() before modifying it, and the
- * object must be held in an assigned transaction before calling
- * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
- * buffer as well. You must release your hold with dmu_buf_rele().
- */
-int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
-int dmu_bonus_max(void);
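/*
 * Illustrative sketch (not part of the original header): the bonus-buffer
 * discipline described above -- hold, read db_data, and release with the
 * same tag.  FTAG comes from zfs_context.h; treating the bonus as a
 * single uint64_t is an assumption for illustration.
 */
static uint64_t
example_bonus_read(objset_t *os, uint64_t object)
{
	dmu_buf_t *db;
	uint64_t val;

	VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0);
	val = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);
	return (val);
}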
-
-/*
- * Obtain the DMU buffer from the specified object which contains the
- * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so
- * that it will remain in memory. You must release the hold with
- * dmu_buf_rele(). You mustn't access the dmu_buf_t after releasing your
- * hold. You must have a hold on any dmu_buf_t* you pass to the DMU.
- *
- * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
- * on the returned buffer before reading or writing the buffer's
- * db_data. The comments for those routines describe what particular
- * operations are valid after calling them.
- *
- * The object number must be a valid, allocated object number.
- */
-int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **);
-void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
-void dmu_buf_rele(dmu_buf_t *db, void *tag);
-uint64_t dmu_buf_refcount(dmu_buf_t *db);
-
-/*
- * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
- * range of an object. A pointer to an array of dmu_buf_t*'s is
- * returned (in *dbpp).
- *
- * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
- * frees the array. The hold on the array of buffers MUST be released
- * with dmu_buf_rele_array. You can NOT release the hold on each buffer
- * individually with dmu_buf_rele.
- */
-int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
- uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
-void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
-
-/*
- * Returns NULL on success, or the existing user ptr if it's already
- * been set.
- *
- * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
- *
- * user_data_ptr_ptr should be NULL, or a pointer to a pointer which
- * will be set to db->db_data when you are allowed to access it. Note
- * that db->db_data (the pointer) can change when you do dmu_buf_read(),
- * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
- * *user_data_ptr_ptr will be set to the new value when it changes.
- *
- * If non-NULL, pageout func will be called when this buffer is being
- * excised from the cache, so that you can clean up the data structure
- * pointed to by user_ptr.
- *
- * dmu_evict_user() will call the pageout func for all buffers in an
- * objset with a given pageout func.
- */
-void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
- dmu_buf_evict_func_t *pageout_func);
-/*
- * set_user_ie is the same as set_user, but request immediate eviction
- * when hold count goes to zero.
- */
-void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
- void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
-void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
- void *user_ptr, void *user_data_ptr_ptr,
- dmu_buf_evict_func_t *pageout_func);
-void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
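/*
 * Illustrative sketch (not part of the original header): attaching user
 * state to a buffer with an eviction callback.  The example_state_t type
 * and its fields are hypothetical.  dmu_buf_set_user() returns the
 * pre-existing user pointer if someone else set one first.
 */
typedef struct example_state {
	uint64_t es_object;
} example_state_t;

static void
example_evict_func(dmu_buf_t *db, void *user_ptr)
{
	/* the buffer is being excised from the cache; free our state */
	kmem_free(user_ptr, sizeof (example_state_t));
}

static void
example_attach_user(dmu_buf_t *db, uint64_t object)
{
	example_state_t *es = kmem_zalloc(sizeof (*es), KM_SLEEP);

	es->es_object = object;
	if (dmu_buf_set_user(db, es, NULL, example_evict_func) != NULL)
		kmem_free(es, sizeof (*es));	/* lost the race */
}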
-
-/*
- * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
- */
-void *dmu_buf_get_user(dmu_buf_t *db);
-
-/*
- * Indicate that you are going to modify the buffer's data (db_data).
- *
- * The transaction (tx) must be assigned to a txg (ie. you've called
- * dmu_tx_assign()). The buffer's object must be held in the tx
- * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
- */
-void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
-
-/*
- * You must create a transaction, then hold the objects which you will
- * (or might) modify as part of this transaction. Then you must assign
- * the transaction to a transaction group. Once the transaction has
- * been assigned, you can modify buffers which belong to held objects as
- * part of this transaction. You can't modify buffers before the
- * transaction has been assigned; you can't modify buffers which don't
- * belong to objects which this transaction holds; you can't hold
- * objects once the transaction has been assigned. You may hold an
- * object which you are going to free (with dmu_object_free()), but you
- * don't have to.
- *
- * You can abort the transaction before it has been assigned.
- *
- * Note that you may hold buffers (with dmu_buf_hold) at any time,
- * regardless of transaction state.
- */
-
-#define DMU_NEW_OBJECT (-1ULL)
-#define DMU_OBJECT_END (-1ULL)
-
-dmu_tx_t *dmu_tx_create(objset_t *os);
-void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
-void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
- uint64_t len);
-void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name);
-void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
-void dmu_tx_abort(dmu_tx_t *tx);
-int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
-void dmu_tx_wait(dmu_tx_t *tx);
-void dmu_tx_commit(dmu_tx_t *tx);
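/*
 * Illustrative sketch (not part of the original header): the full
 * create/hold/assign/modify/commit cycle described above, updating an
 * object's bonus buffer.  TXG_WAIT blocks until the tx can be assigned;
 * FTAG comes from zfs_context.h.
 */
static int
example_tx_bonus_update(objset_t *os, uint64_t object, uint64_t newval)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	dmu_buf_t *db;
	int err;

	dmu_tx_hold_bonus(tx, object);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);	/* never assigned; abort is legal */
		return (err);
	}
	VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0);
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = newval;
	dmu_buf_rele(db, FTAG);
	dmu_tx_commit(tx);
	return (0);
}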
-
-/*
- * Free up the data blocks for a defined range of a file. If size is
- * zero, the range from offset to end-of-file is freed.
- */
-int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t size, dmu_tx_t *tx);
-
-/*
- * Convenience functions.
- *
- * Canfail routines will return 0 on success, or an errno if there is a
- * nonrecoverable I/O error.
- */
-int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- void *buf);
-void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- const void *buf, dmu_tx_t *tx);
-int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
-int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
- dmu_tx_t *tx);
-int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t size, struct page *pp, dmu_tx_t *tx);
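/*
 * Illustrative sketch (not part of the original header): the convenience
 * routines in action -- a fallible read followed by a write under an
 * assigned transaction.  Offsets and sizes are arbitrary.
 */
static int
example_copy_first_block(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	char buf[512];
	int err;

	err = dmu_read(os, object, 0, sizeof (buf), buf);
	if (err == 0)
		dmu_write(os, object, sizeof (buf), sizeof (buf), buf, tx);
	return (err);
}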
-
-extern int zfs_prefetch_disable;
-
-/*
- * Asynchronously try to read in the data.
- */
-void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t len);
-
-typedef struct dmu_object_info {
- /* All sizes are in bytes. */
- uint32_t doi_data_block_size;
- uint32_t doi_metadata_block_size;
- uint64_t doi_bonus_size;
- dmu_object_type_t doi_type;
- dmu_object_type_t doi_bonus_type;
- uint8_t doi_indirection; /* 2 = dnode->indirect->data */
- uint8_t doi_checksum;
- uint8_t doi_compress;
- uint8_t doi_pad[5];
- /* Values below are number of 512-byte blocks. */
- uint64_t doi_physical_blks; /* data + metadata */
- uint64_t doi_max_block_offset;
-} dmu_object_info_t;
-
-typedef struct dmu_object_type_info {
- dmu_byteswap_func_t *ot_byteswap;
- boolean_t ot_metadata;
- char *ot_name;
-} dmu_object_type_info_t;
-
-extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
-
-/*
- * Get information on a DMU object.
- *
- * Return 0 on success or ENOENT if object is not allocated.
- *
- * If doi is NULL, the call just indicates whether the object exists.
- */
-int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
-void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
-void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
-void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
- u_longlong_t *nblk512);
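/*
 * Illustrative sketch (not part of the original header): using
 * dmu_object_info() both as an existence test and to fetch the data
 * block size.
 */
static boolean_t
example_object_exists(objset_t *os, uint64_t object, uint32_t *blkszp)
{
	dmu_object_info_t doi;

	if (dmu_object_info(os, object, &doi) != 0)
		return (B_FALSE);	/* ENOENT: not allocated */
	*blkszp = doi.doi_data_block_size;
	return (B_TRUE);
}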
-
-typedef struct dmu_objset_stats {
- uint64_t dds_num_clones; /* number of clones of this */
- uint64_t dds_creation_txg;
- dmu_objset_type_t dds_type;
- uint8_t dds_is_snapshot;
- uint8_t dds_inconsistent;
- char dds_clone_of[MAXNAMELEN];
-} dmu_objset_stats_t;
-
-/*
- * Get stats on a dataset.
- */
-void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
-
-/*
- * Add entries to the nvlist for all the objset's properties. See
- * zfs_prop_table[] and zfs(1m) for details on the properties.
- */
-void dmu_objset_stats(objset_t *os, struct nvlist *nv);
-
-/*
- * Get the space usage statistics for statvfs().
- *
- * refdbytes is the amount of space "referenced" by this objset.
- * availbytes is the amount of space available to this objset, taking
- * into account quotas & reservations, assuming that no other objsets
- * use the space first. These values correspond to the 'referenced' and
- * 'available' properties, described in the zfs(1m) manpage.
- *
- * usedobjs and availobjs are the number of objects currently allocated,
- * and available.
- */
-void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
- uint64_t *usedobjsp, uint64_t *availobjsp);
-
-/*
- * The fsid_guid is a 56-bit ID that can change to avoid collisions.
- * (Contrast with the ds_guid, which is a 64-bit ID that will never
- * change; because it cannot be regenerated, there is a small probability
- * that it will collide.)
- */
-uint64_t dmu_objset_fsid_guid(objset_t *os);
-
-int dmu_objset_is_snapshot(objset_t *os);
-
-extern struct spa *dmu_objset_spa(objset_t *os);
-extern struct zilog *dmu_objset_zil(objset_t *os);
-extern struct dsl_pool *dmu_objset_pool(objset_t *os);
-extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
-extern void dmu_objset_name(objset_t *os, char *buf);
-extern dmu_objset_type_t dmu_objset_type(objset_t *os);
-extern uint64_t dmu_objset_id(objset_t *os);
-extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
- uint64_t *id, uint64_t *offp);
-extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
- uint64_t *idp, uint64_t *offp);
-
-/*
- * Return the txg number for the given assigned transaction.
- */
-uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
-
-/*
- * Synchronous write.
- * If a parent zio is provided, this function initiates a write on the
- * provided buffer as a child of the parent zio.
- * In the absence of a parent zio, the write is completed synchronously.
- * At write completion, bp is filled with the block pointer of the
- * written block.
- * Note that while the data covered by this function will be on stable
- * storage when the write completes, this new data does not become a
- * permanent part of the file until the associated transaction commits.
- */
-typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg);
-int dmu_sync(struct zio *zio, dmu_buf_t *db,
- struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg);
-
-/*
- * Find the next hole or data block in the file, starting at *off.
- * Return the found offset in *off. Return ESRCH for end of file.
- */
-int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
- uint64_t *off);
-
-/*
- * Initial setup and final teardown.
- */
-extern void dmu_init(void);
-extern void dmu_fini(void);
-
-typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
- uint64_t object, uint64_t offset, int len);
-void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
- dmu_traverse_cb_t cb, void *arg);
-
-int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp);
-int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
- boolean_t force, struct file *fp, uint64_t voffset);
-
-/* CRC64 table */
-#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
-extern uint64_t zfs_crc64_table[256];
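/*
 * Illustrative sketch (not part of the original header): zfs_crc64_table
 * is filled in by the pool code at init time; this shows the standard
 * table construction for the reflected ECMA-182 polynomial.
 */
static void
example_crc64_init(void)
{
	int i, j;

	for (i = 0; i < 256; i++) {
		uint64_t ct = i;
		for (j = 0; j < 8; j++)
			ct = (ct >> 1) ^ (-(ct & 1) & ZFS_CRC64_POLY);
		zfs_crc64_table[i] = ct;
	}
}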
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DMU_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
deleted file mode 100644
index 807011e..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DMU_IMPL_H
-#define _SYS_DMU_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/txg_impl.h>
-#include <sys/zio.h>
-#include <sys/dnode.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * This is the locking strategy for the DMU. Numbers in parentheses are
- * cases that use that lock order, referenced below:
- *
- * ARC is self-contained
- * bplist is self-contained
- * refcount is self-contained
- * txg is self-contained (hopefully!)
- * zst_lock
- * zf_rwlock
- *
- * XXX try to improve evicting path?
- *
- * dp_config_rwlock > os_obj_lock > dn_struct_rwlock >
- * dn_dbufs_mtx > hash_mutexes > db_mtx > leafs
- *
- * dp_config_rwlock
- * must be held before: everything
- * protects dd namespace changes
- * protects property changes globally
- * held from:
- * dsl_dir_open/r:
- * dsl_dir_create_sync/w:
- * dsl_dir_sync_destroy/w:
- * dsl_dir_rename_sync/w:
- * dsl_prop_changed_notify/r:
- *
- * os_obj_lock
- * must be held before:
- * everything except dp_config_rwlock
- * protects os_obj_next
- * held from:
- * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock
- *
- * dn_struct_rwlock
- * must be held before:
- * everything except dp_config_rwlock and os_obj_lock
- * protects structure of dnode (eg. nlevels)
- * db_blkptr can change when syncing out change to nlevels
- * dn_maxblkid
- * dn_nlevels
- * dn_*blksz*
- * phys nlevels, maxblkid, physical blkptr_t's (?)
- * held from:
- * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch
- * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz)
- * dmu_tx_count_free:
- * dbuf_read_impl: db_mtx, dmu_zfetch()
- * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch()
- * dbuf_new_size: db_mtx
- * dbuf_dirty: db_mtx
- * dbuf_findbp: (callers, phys? - the real need)
- * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?)
- * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx
- * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp()
- * dnode_sync/w (increase_indirection): db_mtx (phys)
- * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*)
- * dnode_new_blkid/w: (dn_maxblkid)
- * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid)
- * dnode_next_offset: (phys)
- *
- * dn_dbufs_mtx
- * must be held before:
- * db_mtx, hash_mutexes
- * protects:
- * dn_dbufs
- * dn_evicted
- * held from:
- * dmu_evict_user: db_mtx (dn_dbufs)
- * dbuf_free_range: db_mtx (dn_dbufs)
- * dbuf_remove_ref: db_mtx, callees:
- * dbuf_hash_remove: hash_mutexes, db_mtx
- * dbuf_create: hash_mutexes, db_mtx (dn_dbufs)
- * dnode_set_blksz: (dn_dbufs)
- *
- * hash_mutexes (global)
- * must be held before:
- * db_mtx
- * protects dbuf_hash_table (global) and db_hash_next
- * held from:
- * dbuf_find: db_mtx
- * dbuf_hash_insert: db_mtx
- * dbuf_hash_remove: db_mtx
- *
- * db_mtx (meta-leaf)
- * must be held before:
- * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes)
- * protects:
- * db_state
- * db_holds
- * db_buf
- * db_changed
- * db_data_pending
- * db_dirtied
- * db_link
- * db_dirty_node (??)
- * db_dirtycnt
- * db_d.*
- * db.*
- * held from:
- * dbuf_dirty: dn_mtx, dn_dirty_mtx
- * dbuf_dirty->dsl_dir_willuse_space: dd_lock
- * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock
- * dbuf_undirty: dn_dirty_mtx (db_d)
- * dbuf_write_done: dn_dirty_mtx (db_state)
- * dbuf_*
- * dmu_buf_update_user: none (db_d)
- * dmu_evict_user: none (db_d) (maybe can eliminate)
- * dbuf_find: none (db_holds)
- * dbuf_hash_insert: none (db_holds)
- * dmu_buf_read_array_impl: none (db_state, db_changed)
- * dmu_sync: none (db_dirty_node, db_d)
- * dnode_reallocate: none (db)
- *
- * dn_mtx (leaf)
- * protects:
- * dn_dirty_dbufs
- * dn_ranges
- * phys accounting
- * dn_allocated_txg
- * dn_free_txg
- * dn_assigned_txg
- * dd_assigned_tx
- * dn_notxholds
- * dn_dirtyctx
- * dn_dirtyctx_firstset
- * (dn_phys copy fields?)
- * (dn_phys contents?)
- * held from:
- * dnode_*
- * dbuf_dirty: none
- * dbuf_sync: none (phys accounting)
- * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs)
- * dbuf_write_done: none (phys accounting)
- * dmu_object_info_from_dnode: none (accounting)
- * dmu_tx_commit: none
- * dmu_tx_hold_object_impl: none
- * dmu_tx_try_assign: dn_notxholds(cv)
- * dmu_tx_unassign: none
- *
- * dd_lock (leaf)
- * protects:
- * dd_prop_cbs
- * dd_sync_*
- * dd_used_bytes
- * dd_tempreserved
- * dd_space_towrite
- * dd_myname
- * dd_phys accounting?
- * held from:
- * dsl_dir_*
- * dsl_prop_changed_notify: none (dd_prop_cbs)
- * dsl_prop_register: none (dd_prop_cbs)
- * dsl_prop_unregister: none (dd_prop_cbs)
- * dsl_dataset_block_freeable: none (dd_sync_*)
- *
- * os_lock (leaf)
- * protects:
- * os_dirty_dnodes
- * os_free_dnodes
- * os_dnodes
- * os_downgraded_dbufs
- * dn_dirtyblksz
- * dn_dirty_link
- * held from:
- * dnode_create: none (os_dnodes)
- * dnode_destroy: none (os_dnodes)
- * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes)
- * dnode_free: none (dn_dirtyblksz, os_*_dnodes)
- *
- * ds_lock (leaf)
- * protects:
- * ds_user_ptr
- *	ds_user_evict_func
- * ds_open_refcount
- * ds_snapname
- * ds_phys accounting
- * held from:
- * dsl_dataset_*
- *
- * dr_mtx (leaf)
- * protects:
- * dr_children
- * held from:
- * dbuf_dirty
- * dbuf_undirty
- * dbuf_sync_indirect
- * dnode_new_blkid
- */
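/*
 * Illustrative sketch (not part of the original header): taking two of
 * the locks above in their documented order -- dn_struct_rwlock before
 * db_mtx.  (dmu_buf_impl_t is defined in dbuf.h; its use here is purely
 * illustrative and assumes that header is included.)
 */
static void
example_lock_order(dnode_t *dn, struct dmu_buf_impl *db)
{
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	mutex_enter(&db->db_mtx);
	/* ... examine dbuf state under both locks ... */
	mutex_exit(&db->db_mtx);
	rw_exit(&dn->dn_struct_rwlock);
}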
-
-struct objset;
-struct dmu_pool;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DMU_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
deleted file mode 100644
index 8293a3b..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DMU_OBJSET_H
-#define _SYS_DMU_OBJSET_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/arc.h>
-#include <sys/txg.h>
-#include <sys/zfs_context.h>
-#include <sys/dnode.h>
-#include <sys/zio.h>
-#include <sys/zil.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_dataset;
-struct dmu_tx;
-struct objset_impl;
-
-typedef struct objset_phys {
- dnode_phys_t os_meta_dnode;
- zil_header_t os_zil_header;
- uint64_t os_type;
- char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) -
- sizeof (uint64_t)];
-} objset_phys_t;
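/*
 * Editorial note (not part of the original header): the os_pad
 * arithmetic above sizes the structure to exactly 1K on disk, which can
 * be pinned down with a compile-time assertion:
 *
 *	CTASSERT(sizeof (objset_phys_t) == 1024);
 */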
-
-struct objset {
- struct objset_impl *os;
- int os_mode;
-};
-
-typedef struct objset_impl {
- /* Immutable: */
- struct dsl_dataset *os_dsl_dataset;
- spa_t *os_spa;
- arc_buf_t *os_phys_buf;
- objset_phys_t *os_phys;
- dnode_t *os_meta_dnode;
- zilog_t *os_zil;
- objset_t os;
- uint8_t os_checksum; /* can change, under dsl_dir's locks */
- uint8_t os_compress; /* can change, under dsl_dir's locks */
- uint8_t os_copies; /* can change, under dsl_dir's locks */
- uint8_t os_md_checksum;
- uint8_t os_md_compress;
-
- /* no lock needed: */
- struct dmu_tx *os_synctx; /* XXX sketchy */
- blkptr_t *os_rootbp;
-
- /* Protected by os_obj_lock */
- kmutex_t os_obj_lock;
- uint64_t os_obj_next;
-
- /* Protected by os_lock */
- kmutex_t os_lock;
- list_t os_dirty_dnodes[TXG_SIZE];
- list_t os_free_dnodes[TXG_SIZE];
- list_t os_dnodes;
- list_t os_downgraded_dbufs;
-} objset_impl_t;
-
-#define DMU_META_DNODE_OBJECT 0
-
-/* called from zpl */
-int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
- objset_t **osp);
-void dmu_objset_close(objset_t *os);
-int dmu_objset_create(const char *name, dmu_objset_type_t type,
- objset_t *clone_parent,
- void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
-int dmu_objset_destroy(const char *name);
-int dmu_objset_rollback(const char *name);
-int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive);
-void dmu_objset_stats(objset_t *os, nvlist_t *nv);
-void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
-void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
- uint64_t *usedobjsp, uint64_t *availobjsp);
-uint64_t dmu_objset_fsid_guid(objset_t *os);
-int dmu_objset_find(char *name, int func(char *, void *), void *arg,
- int flags);
-void dmu_objset_byteswap(void *buf, size_t size);
-int dmu_objset_evict_dbufs(objset_t *os, int try);
-
-/* called from dsl */
-void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx);
-objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
- blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
-int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
- objset_impl_t **osip);
-void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DMU_OBJSET_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
deleted file mode 100644
index ea9fa6c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DMU_TRAVERSE_H
-#define _SYS_DMU_TRAVERSE_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu.h>
-#include <sys/dnode.h>
-#include <sys/arc.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ADVANCE_POST 0 /* post-order traversal */
-#define ADVANCE_PRE 0x01 /* pre-order traversal */
-#define ADVANCE_PRUNE 0x02 /* prune by prev snapshot birth time */
-#define ADVANCE_DATA 0x04 /* read user data blocks */
-#define ADVANCE_HOLES 0x08 /* visit holes */
-#define ADVANCE_ZIL 0x10 /* visit intent log blocks */
-#define ADVANCE_NOLOCK 0x20 /* Don't grab SPA sync lock */
-
-#define ZB_NO_LEVEL -2
-#define ZB_MAXLEVEL 32 /* Next power of 2 >= DN_MAX_LEVELS */
-#define ZB_MAXBLKID (1ULL << 62)
-#define ZB_MAXOBJSET (1ULL << 62)
-#define ZB_MAXOBJECT (1ULL << 62)
-
-#define ZB_MOS_CACHE 0
-#define ZB_MDN_CACHE 1
-#define ZB_DN_CACHE 2
-#define ZB_DEPTH 3
-
-typedef struct zseg {
- uint64_t seg_mintxg;
- uint64_t seg_maxtxg;
- zbookmark_t seg_start;
- zbookmark_t seg_end;
- list_node_t seg_node;
-} zseg_t;
-
-typedef struct traverse_blk_cache {
- zbookmark_t bc_bookmark;
- blkptr_t bc_blkptr;
- void *bc_data;
- dnode_phys_t *bc_dnode;
- int bc_errno;
- int bc_pad1;
- uint64_t bc_pad2;
-} traverse_blk_cache_t;
-
-typedef int (blkptr_cb_t)(traverse_blk_cache_t *bc, spa_t *spa, void *arg);
-
-struct traverse_handle {
- spa_t *th_spa;
- blkptr_cb_t *th_func;
- void *th_arg;
- uint16_t th_advance;
- uint16_t th_locked;
- int th_zio_flags;
- list_t th_seglist;
- traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL];
- traverse_blk_cache_t th_zil_cache;
- uint64_t th_hits;
- uint64_t th_arc_hits;
- uint64_t th_reads;
- uint64_t th_callbacks;
- uint64_t th_syncs;
- uint64_t th_restarts;
- zbookmark_t th_noread;
- zbookmark_t th_lastcb;
-};
-
-int traverse_dsl_dataset(struct dsl_dataset *ds, uint64_t txg_start,
- int advance, blkptr_cb_t func, void *arg);
-
-traverse_handle_t *traverse_init(spa_t *spa, blkptr_cb_t *func, void *arg,
- int advance, int zio_flags);
-void traverse_fini(traverse_handle_t *th);
-
-void traverse_add_dnode(traverse_handle_t *th,
- uint64_t mintxg, uint64_t maxtxg, uint64_t objset, uint64_t object);
-void traverse_add_objset(traverse_handle_t *th,
- uint64_t mintxg, uint64_t maxtxg, uint64_t objset);
-void traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg);
-
-int traverse_more(traverse_handle_t *th);
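/*
 * Illustrative sketch (not part of the original header): a whole-pool
 * walk.  traverse_more() returns EAGAIN while segments remain; the
 * advance flags, zio flags, and open-ended txg range are illustrative
 * choices.
 */
static void
example_traverse_pool(spa_t *spa, blkptr_cb_t *func, void *arg)
{
	traverse_handle_t *th;

	th = traverse_init(spa, func, arg, ADVANCE_PRE, ZIO_FLAG_CANFAIL);
	traverse_add_pool(th, 0, -1ULL);
	while (traverse_more(th) == EAGAIN)
		continue;
	traverse_fini(th);
}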
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DMU_TRAVERSE_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
deleted file mode 100644
index 89f4799..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DMU_TX_H
-#define _SYS_DMU_TX_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/txg.h>
-#include <sys/refcount.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dmu_buf_impl;
-struct dmu_tx_hold;
-struct dnode_link;
-struct dsl_pool;
-struct dnode;
-struct dsl_dir;
-
-struct dmu_tx {
- /*
- * No synchronization is needed because a tx can only be handled
- * by one thread.
- */
- list_t tx_holds; /* list of dmu_tx_hold_t */
- objset_t *tx_objset;
- struct dsl_dir *tx_dir;
- struct dsl_pool *tx_pool;
- uint64_t tx_txg;
- uint64_t tx_lastsnap_txg;
- uint64_t tx_lasttried_txg;
- txg_handle_t tx_txgh;
- void *tx_tempreserve_cookie;
- struct dmu_tx_hold *tx_needassign_txh;
- uint8_t tx_anyobj;
- int tx_err;
-#ifdef ZFS_DEBUG
- uint64_t tx_space_towrite;
- uint64_t tx_space_tofree;
- uint64_t tx_space_tooverwrite;
- refcount_t tx_space_written;
- refcount_t tx_space_freed;
-#endif
-};
-
-enum dmu_tx_hold_type {
- THT_NEWOBJECT,
- THT_WRITE,
- THT_BONUS,
- THT_FREE,
- THT_ZAP,
- THT_SPACE,
- THT_NUMTYPES
-};
-
-typedef struct dmu_tx_hold {
- dmu_tx_t *txh_tx;
- list_node_t txh_node;
- struct dnode *txh_dnode;
- uint64_t txh_space_towrite;
- uint64_t txh_space_tofree;
- uint64_t txh_space_tooverwrite;
-#ifdef ZFS_DEBUG
- enum dmu_tx_hold_type txh_type;
- uint64_t txh_arg1;
- uint64_t txh_arg2;
-#endif
-} dmu_tx_hold_t;
-
-
-/*
- * These routines are defined in dmu.h, and are called by the user.
- */
-dmu_tx_t *dmu_tx_create(objset_t *dd);
-int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
-void dmu_tx_commit(dmu_tx_t *tx);
-void dmu_tx_abort(dmu_tx_t *tx);
-uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
-void dmu_tx_wait(dmu_tx_t *tx);
-
-/*
- * These routines are defined in dmu_spa.h, and are called by the SPA.
- */
-extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg);
-
-/*
- * These routines are only called by the DMU.
- */
-dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd);
-int dmu_tx_is_syncing(dmu_tx_t *tx);
-int dmu_tx_private_ok(dmu_tx_t *tx);
-void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object);
-void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta);
-void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db);
-int dmu_tx_holds(dmu_tx_t *tx, uint64_t object);
-void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space);
-
-#ifdef ZFS_DEBUG
-#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db)
-#else
-#define DMU_TX_DIRTY_BUF(tx, db)
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DMU_TX_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
deleted file mode 100644
index c94bced..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _DFETCH_H
-#define _DFETCH_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern uint64_t zfetch_array_rd_sz;
-
-struct dnode; /* so we can reference dnode */
-
-typedef enum zfetch_dirn {
- ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */
- ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */
-} zfetch_dirn_t;
-
-typedef struct zstream {
- uint64_t zst_offset; /* offset of starting block in range */
- uint64_t zst_len; /* length of range, in blocks */
- zfetch_dirn_t zst_direction; /* direction of prefetch */
- uint64_t zst_stride; /* length of stride, in blocks */
- uint64_t zst_ph_offset; /* prefetch offset, in blocks */
- uint64_t zst_cap; /* prefetch limit (cap), in blocks */
- kmutex_t zst_lock; /* protects stream */
- clock_t zst_last; /* lbolt of last prefetch */
- avl_node_t zst_node; /* embed avl node here */
-} zstream_t;
-
-typedef struct zfetch {
- krwlock_t zf_rwlock; /* protects zfetch structure */
-	list_t		zf_stream;	/* list of zstream_t's */
- struct dnode *zf_dnode; /* dnode that owns this zfetch */
- uint32_t zf_stream_cnt; /* # of active streams */
- uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */
-} zfetch_t;
-
-void dmu_zfetch_init(zfetch_t *, struct dnode *);
-void dmu_zfetch_rele(zfetch_t *);
-void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _DFETCH_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
deleted file mode 100644
index 327e538..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DNODE_H
-#define _SYS_DNODE_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/avl.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
-#include <sys/refcount.h>
-#include <sys/dmu_zfetch.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Flags.
- */
-#define DNODE_MUST_BE_ALLOCATED 1
-#define DNODE_MUST_BE_FREE 2
-
-/*
- * Fixed constants.
- */
-#define DNODE_SHIFT 9 /* 512 bytes */
-#define DN_MIN_INDBLKSHIFT 10 /* 1k */
-#define DN_MAX_INDBLKSHIFT 14 /* 16k */
-#define DNODE_BLOCK_SHIFT 14 /* 16k */
-#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
-#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
-#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
-
-/*
- * Derived constants.
- */
-#define DNODE_SIZE (1 << DNODE_SHIFT)
-#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
-#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
-#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
-
-#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
-#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
-#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
-
-/* The +2 here is a cheesy way to round up */
-#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
- (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
-
-#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
- (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
-
-#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
- (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
-
-#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift))
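-
-/*
- * Worked example (not in the original header): plugging in the constants
- * above, with SPA_BLKPTRSHIFT == 7 and SPA_MINBLOCKSHIFT == 9 from
- * sys/spa.h:
- *
- *	DNODE_SIZE       = 1 << 9          = 512 bytes
- *	DN_MAX_NBLKPTR   = (512 - 64) >> 7 = 3 block pointers
- *	DN_MAX_BONUSLEN  = 512 - 64 - 128  = 320 bytes
- *	DNODES_PER_BLOCK = 1 << (14 - 9)   = 32 dnodes per 16k block
- *	DN_MAX_LEVELS    = 2 + (64 - 9) / (10 - 7) = 20 levels
- */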
-
-struct dmu_buf_impl;
-struct objset_impl;
-struct zio;
-
-enum dnode_dirtycontext {
- DN_UNDIRTIED,
- DN_DIRTY_OPEN,
- DN_DIRTY_SYNC
-};
-
-/* Is dn_used in bytes? If not, it's in multiples of SPA_MINBLOCKSIZE */
-#define DNODE_FLAG_USED_BYTES (1<<0)
-
-typedef struct dnode_phys {
- uint8_t dn_type; /* dmu_object_type_t */
- uint8_t dn_indblkshift; /* ln2(indirect block size) */
- uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */
- uint8_t dn_nblkptr; /* length of dn_blkptr */
- uint8_t dn_bonustype; /* type of data in bonus buffer */
- uint8_t dn_checksum; /* ZIO_CHECKSUM type */
- uint8_t dn_compress; /* ZIO_COMPRESS type */
- uint8_t dn_flags; /* DNODE_FLAG_* */
- uint16_t dn_datablkszsec; /* data block size in 512b sectors */
- uint16_t dn_bonuslen; /* length of dn_bonus */
- uint8_t dn_pad2[4];
-
- /* accounting is protected by dn_dirty_mtx */
- uint64_t dn_maxblkid; /* largest allocated block ID */
- uint64_t dn_used; /* bytes (or sectors) of disk space */
-
- uint64_t dn_pad3[4];
-
- blkptr_t dn_blkptr[1];
- uint8_t dn_bonus[DN_MAX_BONUSLEN];
-} dnode_phys_t;
-
-typedef struct dnode {
- /*
- * dn_struct_rwlock protects the structure of the dnode,
- * including the number of levels of indirection (dn_nlevels),
- * dn_maxblkid, and dn_next_*
- */
- krwlock_t dn_struct_rwlock;
-
- /*
- * Our link on dataset's dd_dnodes list.
- * Protected by dd_accounting_mtx.
- */
- list_node_t dn_link;
-
- /* immutable: */
- struct objset_impl *dn_objset;
- uint64_t dn_object;
- struct dmu_buf_impl *dn_dbuf;
- dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
-
- /*
- * Copies of stuff in dn_phys. They're valid in the open
- * context (e.g., even before the dnode is first synced).
- * Where necessary, these are protected by dn_struct_rwlock.
- */
- dmu_object_type_t dn_type; /* object type */
- uint16_t dn_bonuslen; /* bonus length */
- uint8_t dn_bonustype; /* bonus type */
- uint8_t dn_nblkptr; /* number of blkptrs (immutable) */
- uint8_t dn_checksum; /* ZIO_CHECKSUM type */
- uint8_t dn_compress; /* ZIO_COMPRESS type */
- uint8_t dn_nlevels;
- uint8_t dn_indblkshift;
- uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
- uint16_t dn_datablkszsec; /* in 512b sectors */
- uint32_t dn_datablksz; /* in bytes */
- uint64_t dn_maxblkid;
- uint8_t dn_next_nlevels[TXG_SIZE];
- uint8_t dn_next_indblkshift[TXG_SIZE];
- uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
-
- /* protected by os_lock: */
- list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
-
- /* protected by dn_mtx: */
- kmutex_t dn_mtx;
- list_t dn_dirty_records[TXG_SIZE];
- avl_tree_t dn_ranges[TXG_SIZE];
- uint64_t dn_allocated_txg;
- uint64_t dn_free_txg;
- uint64_t dn_assigned_txg;
- kcondvar_t dn_notxholds;
- enum dnode_dirtycontext dn_dirtyctx;
- uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */
-
- /* protected by their own internal locking */
- refcount_t dn_tx_holds;
- refcount_t dn_holds;
-
- kmutex_t dn_dbufs_mtx;
- list_t dn_dbufs; /* linked list of descendent dbuf_t's */
- struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
-
- /* parent IO for current sync write */
- zio_t *dn_zio;
-
- /* holds prefetch structure */
- struct zfetch dn_zfetch;
-} dnode_t;
-
-typedef struct free_range {
- avl_node_t fr_node;
- uint64_t fr_blkid;
- uint64_t fr_nblks;
-} free_range_t;
-
-dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp,
- uint64_t object);
-void dnode_special_close(dnode_t *dn);
-
-int dnode_hold(struct objset_impl *dd, uint64_t object,
- void *ref, dnode_t **dnp);
-int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
- void *ref, dnode_t **dnp);
-void dnode_add_ref(dnode_t *dn, void *ref);
-void dnode_rele(dnode_t *dn, void *ref);
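-
-/*
- * Usage sketch of the hold/rele pairing (hypothetical caller; FTAG comes
- * from sys/refcount.h, included above):
- *
- *	dnode_t *dn;
- *	int err = dnode_hold(os, object, FTAG, &dn);
- *	if (err == 0) {
- *		... use dn ...
- *		dnode_rele(dn, FTAG);
- *	}
- */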
-void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
-void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
-void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-void dnode_free(dnode_t *dn, dmu_tx_t *tx);
-void dnode_byteswap(dnode_phys_t *dnp);
-void dnode_buf_byteswap(void *buf, size_t size);
-void dnode_verify(dnode_t *dn);
-int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
-uint64_t dnode_current_max_length(dnode_t *dn);
-void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
-void dnode_clear_range(dnode_t *dn, uint64_t blkid,
- uint64_t nblks, dmu_tx_t *tx);
-void dnode_diduse_space(dnode_t *dn, int64_t space);
-void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
-void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx);
-uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
-void dnode_init(void);
-void dnode_fini(void);
-int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl,
- uint64_t blkfill, uint64_t txg);
-int dnode_evict_dbufs(dnode_t *dn, int try);
-
-#ifdef ZFS_DEBUG
-
-/*
- * There should be a ## between the string literal and fmt, to make it
- * clear that we're joining two strings together, but gcc's preprocessor
- * does not accept that token in this position.
- */
-#define dprintf_dnode(dn, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char __db_buf[32]; \
- uint64_t __db_obj = (dn)->dn_object; \
- if (__db_obj == DMU_META_DNODE_OBJECT) \
- (void) strcpy(__db_buf, "mdn"); \
- else \
- (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
- (u_longlong_t)__db_obj);\
- dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
- __db_buf, __VA_ARGS__); \
- } \
-_NOTE(CONSTCOND) } while (0)
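-
-/*
- * Usage sketch (hypothetical call site, assuming a held dnode "dn"):
- *
- *	dprintf_dnode(dn, "blkid=%llu\n", (u_longlong_t)blkid);
- *
- * prints a line tagged "obj=<object number>" (or "obj=mdn" for the
- * meta-dnode) via dprintf_ds().
- */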
-
-#define DNODE_VERIFY(dn) dnode_verify(dn)
-#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx)
-
-#else
-
-#define dprintf_dnode(db, fmt, ...)
-#define DNODE_VERIFY(dn)
-#define FREE_VERIFY(db, start, end, tx)
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DNODE_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
deleted file mode 100644
index 8cfc1dc..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DSL_DATASET_H
-#define _SYS_DSL_DATASET_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
-#include <sys/bplist.h>
-#include <sys/dsl_synctask.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_dataset;
-struct dsl_dir;
-struct dsl_pool;
-
-typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
-
-#define DS_FLAG_INCONSISTENT (1ULL<<0)
-/*
- * NB: nopromote cannot yet be set, but we want support for it in this
- * on-disk version, so that we don't need to upgrade for it later. It
- * will be needed when we implement 'zfs split' (where the split off
- * clone should not be promoted).
- */
-#define DS_FLAG_NOPROMOTE (1ULL<<1)
-
-typedef struct dsl_dataset_phys {
- uint64_t ds_dir_obj;
- uint64_t ds_prev_snap_obj;
- uint64_t ds_prev_snap_txg;
- uint64_t ds_next_snap_obj;
- uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */
- uint64_t ds_num_children; /* clone/snap children; ==0 for head */
- uint64_t ds_creation_time; /* seconds since 1970 */
- uint64_t ds_creation_txg;
- uint64_t ds_deadlist_obj;
- uint64_t ds_used_bytes;
- uint64_t ds_compressed_bytes;
- uint64_t ds_uncompressed_bytes;
- uint64_t ds_unique_bytes; /* only relevant to snapshots */
- /*
- * The ds_fsid_guid is a 56-bit ID that can change to avoid
- * collisions. The ds_guid is a 64-bit ID that will never
- * change, so there is a small probability that it will collide.
- */
- uint64_t ds_fsid_guid;
- uint64_t ds_guid;
- uint64_t ds_flags;
- blkptr_t ds_bp;
- uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */
-} dsl_dataset_phys_t;
-
-typedef struct dsl_dataset {
- /* Immutable: */
- struct dsl_dir *ds_dir;
- dsl_dataset_phys_t *ds_phys;
- dmu_buf_t *ds_dbuf;
- uint64_t ds_object;
-
- /* only used in syncing context: */
- struct dsl_dataset *ds_prev; /* only valid for non-snapshots */
-
- /* has internal locking: */
- bplist_t ds_deadlist;
-
- /* protected by lock on pool's dp_dirty_datasets list */
- txg_node_t ds_dirty_link;
- list_node_t ds_synced_link;
-
- /*
- * ds_phys->ds_<accounting> is also protected by ds_lock.
- * Protected by ds_lock:
- */
- kmutex_t ds_lock;
- void *ds_user_ptr;
- dsl_dataset_evict_func_t *ds_user_evict_func;
- uint64_t ds_open_refcount;
-
- /* no locking; only for making guesses */
- uint64_t ds_trysnap_txg;
-
- /* Protected by ds_lock; keep at end of struct for better locality */
- char ds_snapname[MAXNAMELEN];
-} dsl_dataset_t;
-
-#define dsl_dataset_is_snapshot(ds) \
- ((ds)->ds_phys->ds_num_children != 0)
-
-int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
- void *tag, dsl_dataset_t **dsp);
-int dsl_dataset_open(const char *name, int mode, void *tag,
- dsl_dataset_t **dsp);
-int dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj,
- const char *tail, int mode, void *tag, dsl_dataset_t **);
-void dsl_dataset_name(dsl_dataset_t *ds, char *name);
-void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag);
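-
-/*
- * Usage sketch of the open/close pairing (hypothetical caller;
- * DS_MODE_STANDARD and FTAG are assumed to come from sys/dmu.h and
- * sys/refcount.h respectively):
- *
- *	dsl_dataset_t *ds;
- *	int err = dsl_dataset_open("tank/home", DS_MODE_STANDARD, FTAG, &ds);
- *	if (err == 0) {
- *		... use ds ...
- *		dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
- *	}
- */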
-uint64_t dsl_dataset_create_sync(dsl_dir_t *pds,
- const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx);
-int dsl_dataset_destroy(const char *name);
-int dsl_snapshots_destroy(char *fsname, char *snapname);
-dsl_checkfunc_t dsl_dataset_snapshot_check;
-dsl_syncfunc_t dsl_dataset_snapshot_sync;
-int dsl_dataset_rollback(dsl_dataset_t *ds);
-int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
-int dsl_dataset_promote(const char *name);
-
-void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
- void *p, dsl_dataset_evict_func_t func);
-void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds);
-
-blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
-void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-
-spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
-
-void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
-
-void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
- dmu_tx_t *tx);
-int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
-uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
-
-void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
-void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv);
-void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat);
-void dsl_dataset_space(dsl_dataset_t *ds,
- uint64_t *refdbytesp, uint64_t *availbytesp,
- uint64_t *usedobjsp, uint64_t *availobjsp);
-uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds);
-
-void dsl_dataset_create_root(struct dsl_pool *dp, uint64_t *ddobjp,
- dmu_tx_t *tx);
-
-int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
-
-#ifdef ZFS_DEBUG
-#define dprintf_ds(ds, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
- dsl_dataset_name(ds, __ds_name); \
- dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
- kmem_free(__ds_name, MAXNAMELEN); \
- } \
-_NOTE(CONSTCOND) } while (0)
-#else
-#define dprintf_ds(dd, fmt, ...)
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_DATASET_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
deleted file mode 100644
index e0595d3..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DSL_DIR_H
-#define _SYS_DSL_DIR_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_synctask.h>
-#include <sys/refcount.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_dataset;
-
-typedef struct dsl_dir_phys {
- uint64_t dd_creation_time; /* not actually used */
- uint64_t dd_head_dataset_obj;
- uint64_t dd_parent_obj;
- uint64_t dd_clone_parent_obj;
- uint64_t dd_child_dir_zapobj;
- /*
- * how much space our children are accounting for; for leaf
- * datasets, == physical space used by fs + snaps
- */
- uint64_t dd_used_bytes;
- uint64_t dd_compressed_bytes;
- uint64_t dd_uncompressed_bytes;
- /* Administrative quota setting */
- uint64_t dd_quota;
- /* Administrative reservation setting */
- uint64_t dd_reserved;
- uint64_t dd_props_zapobj;
- uint64_t dd_pad[21]; /* pad out to 256 bytes for good measure */
-} dsl_dir_phys_t;
-
-struct dsl_dir {
- /* These are immutable; no lock needed: */
- uint64_t dd_object;
- dsl_dir_phys_t *dd_phys;
- dmu_buf_t *dd_dbuf;
- dsl_pool_t *dd_pool;
-
- /* protected by lock on pool's dp_dirty_dirs list */
- txg_node_t dd_dirty_link;
-
- /* protected by dp_config_rwlock */
- dsl_dir_t *dd_parent;
-
- /* Protected by dd_lock */
- kmutex_t dd_lock;
- list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
-
- /* Accounting */
- /* reflects any changes to dd_phys->dd_used_bytes made this syncing */
- int64_t dd_used_bytes;
- /* gross estimate of space used by in-flight tx's */
- uint64_t dd_tempreserved[TXG_SIZE];
- /* amount of space we expect to write; == amount of dirty data */
- int64_t dd_space_towrite[TXG_SIZE];
-
- /* protected by dd_lock; keep at end of struct for better locality */
- char dd_myname[MAXNAMELEN];
-};
-
-void dsl_dir_close(dsl_dir_t *dd, void *tag);
-int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail);
-int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **,
- const char **tailp);
-int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
- const char *tail, void *tag, dsl_dir_t **);
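-
-/*
- * Usage sketch of the open/close pairing (hypothetical caller; "tail"
- * receives any unmatched remainder of the name):
- *
- *	dsl_dir_t *dd;
- *	const char *tail;
- *	int err = dsl_dir_open("tank/home", FTAG, &dd, &tail);
- *	if (err == 0) {
- *		... use dd ...
- *		dsl_dir_close(dd, FTAG);
- *	}
- */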
-void dsl_dir_name(dsl_dir_t *dd, char *buf);
-int dsl_dir_namelen(dsl_dir_t *dd);
-int dsl_dir_is_private(dsl_dir_t *dd);
-uint64_t dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx);
-void dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx);
-dsl_checkfunc_t dsl_dir_destroy_check;
-dsl_syncfunc_t dsl_dir_destroy_sync;
-void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv);
-uint64_t dsl_dir_space_available(dsl_dir_t *dd,
- dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
-void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
-void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
-int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
- uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx);
-void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx);
-void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx);
-void dsl_dir_diduse_space(dsl_dir_t *dd,
- int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
-int dsl_dir_set_quota(const char *ddname, uint64_t quota);
-int dsl_dir_set_reservation(const char *ddname, uint64_t reservation);
-int dsl_dir_rename(dsl_dir_t *dd, const char *newname);
-int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
-
-/* internal reserved dir name */
-#define MOS_DIR_NAME "$MOS"
-
-#ifdef ZFS_DEBUG
-#define dprintf_dd(dd, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char *__ds_name = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, \
- KM_SLEEP); \
- dsl_dir_name(dd, __ds_name); \
- dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \
- kmem_free(__ds_name, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); \
- } \
-_NOTE(CONSTCOND) } while (0)
-#else
-#define dprintf_dd(dd, fmt, ...)
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_DIR_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
deleted file mode 100644
index f7ec67a..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DSL_POOL_H
-#define _SYS_DSL_POOL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/txg_impl.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct objset;
-struct dsl_dir;
-
-typedef struct dsl_pool {
- /* Immutable */
- spa_t *dp_spa;
- struct objset *dp_meta_objset;
- struct dsl_dir *dp_root_dir;
- struct dsl_dir *dp_mos_dir;
- uint64_t dp_root_dir_obj;
-
- /* No lock needed - sync context only */
- blkptr_t dp_meta_rootbp;
- list_t dp_synced_objsets;
-
- /* Has its own locking */
- tx_state_t dp_tx;
- txg_list_t dp_dirty_datasets;
- txg_list_t dp_dirty_dirs;
- txg_list_t dp_sync_tasks;
-
- /*
- * Protects administrative changes (properties, namespace)
- * It is only held for write in syncing context. Therefore
- * syncing context does not need to ever have it for read, since
- * nobody else could possibly have it for write.
- */
- krwlock_t dp_config_rwlock;
-} dsl_pool_t;
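-
-/*
- * Sketch of the dp_config_rwlock protocol described above (hypothetical
- * reader in open context; syncing context skips the lock because it is
- * the only possible writer):
- *
- *	if (!dsl_pool_sync_context(dp))
- *		rw_enter(&dp->dp_config_rwlock, RW_READER);
- *	... read the namespace / properties ...
- *	if (!dsl_pool_sync_context(dp))
- *		rw_exit(&dp->dp_config_rwlock);
- */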
-
-int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
-void dsl_pool_close(dsl_pool_t *dp);
-dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg);
-void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
-void dsl_pool_zil_clean(dsl_pool_t *dp);
-int dsl_pool_sync_context(dsl_pool_t *dp);
-uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_POOL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
deleted file mode 100644
index d2debff..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DSL_PROP_H
-#define _SYS_DSL_PROP_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dsl_pool.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_dataset;
-
-/* The callback func may not call into the DMU or DSL! */
-typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval);
-
-typedef struct dsl_prop_cb_record {
- list_node_t cbr_node; /* link on dd_prop_cbs */
- struct dsl_dataset *cbr_ds;
- const char *cbr_propname;
- dsl_prop_changed_cb_t *cbr_func;
- void *cbr_arg;
-} dsl_prop_cb_record_t;
-
-int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
- dsl_prop_changed_cb_t *callback, void *cbarg);
-int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
- dsl_prop_changed_cb_t *callback, void *cbarg);
-int dsl_prop_numcb(struct dsl_dataset *ds);
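-
-/*
- * Registration sketch (hypothetical callback, state type, and property
- * name). Per the note above, the callback must not call back into the
- * DMU or DSL:
- *
- *	static void
- *	my_checksum_changed(void *arg, uint64_t newval)
- *	{
- *		((my_state_t *)arg)->ms_checksum = newval;
- *	}
- *
- *	(void) dsl_prop_register(ds, "checksum", my_checksum_changed, st);
- *	...
- *	(void) dsl_prop_unregister(ds, "checksum", my_checksum_changed, st);
- */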
-
-int dsl_prop_get(const char *ddname, const char *propname,
- int intsz, int numints, void *buf, char *setpoint);
-int dsl_prop_get_integer(const char *ddname, const char *propname,
- uint64_t *valuep, char *setpoint);
-int dsl_prop_get_all(objset_t *os, nvlist_t **nvp);
-
-int dsl_prop_set(const char *ddname, const char *propname,
- int intsz, int numints, const void *buf);
-int dsl_prop_set_dd(dsl_dir_t *dd, const char *propname,
- int intsz, int numints, const void *buf);
-
-void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
-void dsl_prop_nvlist_add_string(nvlist_t *nv,
- zfs_prop_t prop, const char *value);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_PROP_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
deleted file mode 100644
index e695b18..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DSL_SYNCTASK_H
-#define _SYS_DSL_SYNCTASK_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/txg.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_pool;
-
-typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *);
-typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *);
-
-typedef struct dsl_sync_task {
- list_node_t dst_node;
- dsl_checkfunc_t *dst_checkfunc;
- dsl_syncfunc_t *dst_syncfunc;
- void *dst_arg1;
- void *dst_arg2;
- int dst_err;
-} dsl_sync_task_t;
-
-typedef struct dsl_sync_task_group {
- txg_node_t dstg_node;
- list_t dstg_tasks;
- struct dsl_pool *dstg_pool;
- uint64_t dstg_txg;
- int dstg_err;
- int dstg_space;
-} dsl_sync_task_group_t;
-
-dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp);
-void dsl_sync_task_create(dsl_sync_task_group_t *dstg,
- dsl_checkfunc_t *, dsl_syncfunc_t *,
- void *arg1, void *arg2, int blocks_modified);
-int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg);
-void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg);
-void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx);
-
-int dsl_sync_task_do(struct dsl_pool *dp,
- dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
- void *arg1, void *arg2, int blocks_modified);
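-
-/*
- * Sketch of the usual pattern (hypothetical check/sync pair; can_do_it()
- * and do_it() are placeholders). The check function may fail; the sync
- * function commits the change and must not:
- *
- *	static int
- *	my_check(void *arg1, void *arg2, dmu_tx_t *tx)
- *	{
- *		return (can_do_it(arg1) ? 0 : EBUSY);
- *	}
- *
- *	static void
- *	my_sync(void *arg1, void *arg2, dmu_tx_t *tx)
- *	{
- *		do_it(arg1, tx);
- *	}
- *
- *	err = dsl_sync_task_do(dp, my_check, my_sync, ds, NULL, 0);
- */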
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_SYNCTASK_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
deleted file mode 100644
index 095dd3c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_METASLAB_H
-#define _SYS_METASLAB_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/space_map.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
-#include <sys/avl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct metaslab_class metaslab_class_t;
-typedef struct metaslab_group metaslab_group_t;
-
-extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
- uint64_t start, uint64_t size, uint64_t txg);
-extern void metaslab_fini(metaslab_t *msp);
-extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
-extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
-
-extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp,
- int ncopies, uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid);
-extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
- boolean_t now);
-extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
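-
-/*
- * Usage sketch of the allocation interfaces (hypothetical caller; the
- * real callers live in the ZIO pipeline, and B_FALSE for "now" deferring
- * the free to txg sync is an assumption):
- *
- *	error = metaslab_alloc(spa, psize, bp, ncopies, txg, NULL, B_FALSE);
- *	...
- *	metaslab_free(spa, bp, txg, B_FALSE);
- */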
-
-extern metaslab_class_t *metaslab_class_create(void);
-extern void metaslab_class_destroy(metaslab_class_t *mc);
-extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
-extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
-
-extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
- vdev_t *vd);
-extern void metaslab_group_destroy(metaslab_group_t *mg);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_METASLAB_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
deleted file mode 100644
index 5980cbc..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_METASLAB_IMPL_H
-#define _SYS_METASLAB_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/metaslab.h>
-#include <sys/space_map.h>
-#include <sys/vdev.h>
-#include <sys/txg.h>
-#include <sys/avl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct metaslab_class {
- metaslab_group_t *mc_rotor;
- uint64_t mc_allocated;
-};
-
-struct metaslab_group {
- kmutex_t mg_lock;
- avl_tree_t mg_metaslab_tree;
- uint64_t mg_aliquot;
- int64_t mg_bias;
- metaslab_class_t *mg_class;
- vdev_t *mg_vd;
- metaslab_group_t *mg_prev;
- metaslab_group_t *mg_next;
-};
-
-/*
- * Each metaslab's free space is tracked in a space map object in the MOS,
- * which is only updated in syncing context. Each time we sync a txg,
- * we append the allocs and frees from that txg to the space map object.
- * When the txg is done syncing, metaslab_sync_done() updates ms_smo
- * to ms_smo_syncing. Everything in ms_smo is always safe to allocate.
- */
-struct metaslab {
- kmutex_t ms_lock; /* metaslab lock */
- space_map_obj_t ms_smo; /* synced space map object */
- space_map_obj_t ms_smo_syncing; /* syncing space map object */
- space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */
- space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */
- space_map_t ms_map; /* in-core free space map */
- uint64_t ms_weight; /* weight vs. others in group */
- metaslab_group_t *ms_group; /* metaslab group */
- avl_node_t ms_group_node; /* node in metaslab group tree */
- txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
-};
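-
-/*
- * Sketch of the per-txg flow described above (driven from syncing
- * context elsewhere in the SPA; simplified):
- *
- *	metaslab_sync(msp, txg);	-- append txg's allocs/frees to
- *					   ms_smo_syncing
- *	... txg finishes syncing ...
- *	metaslab_sync_done(msp, txg);	-- roll ms_smo_syncing into ms_smo
- */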
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_METASLAB_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
deleted file mode 100644
index 4de1cae..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_REFCOUNT_H
-#define _SYS_REFCOUNT_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/list.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * If the reference is held only by the calling function and not any
- * particular object, use FTAG (which is a string) for the holder_tag.
- * Otherwise, use the object that holds the reference.
- */
-#define FTAG ((char *)__func__)
-
-#if defined(DEBUG) || !defined(_KERNEL)
-typedef struct reference {
- list_node_t ref_link;
- void *ref_holder;
- uint64_t ref_number;
- uint8_t *ref_removed;
-} reference_t;
-
-typedef struct refcount {
- kmutex_t rc_mtx;
- list_t rc_list;
- list_t rc_removed;
- int64_t rc_count;
- int64_t rc_removed_count;
-} refcount_t;
-
-/* Note: refcount_t should be initialized to zero before use. */
-
-void refcount_create(refcount_t *rc);
-void refcount_destroy(refcount_t *rc);
-void refcount_destroy_many(refcount_t *rc, uint64_t number);
-int refcount_is_zero(refcount_t *rc);
-int64_t refcount_count(refcount_t *rc);
-int64_t refcount_add(refcount_t *rc, void *holder_tag);
-int64_t refcount_remove(refcount_t *rc, void *holder_tag);
-int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
-int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
-
-void refcount_init(void);
-void refcount_fini(void);
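-
-/*
- * Holder-tagging sketch (hypothetical; "db" stands in for any object
- * that holds the reference):
- *
- *	(void) refcount_add(&dn->dn_holds, FTAG);	-- held by this function
- *	(void) refcount_add(&dn->dn_holds, db);		-- held by an object
- *	...
- *	(void) refcount_remove(&dn->dn_holds, db);
- *	(void) refcount_remove(&dn->dn_holds, FTAG);
- */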
-
-#else /* DEBUG */
-
-typedef struct refcount {
- uint64_t rc_count;
-} refcount_t;
-
-#define refcount_create(rc) ((rc)->rc_count = 0)
-#define refcount_destroy(rc) ((rc)->rc_count = 0)
-#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
-#define refcount_is_zero(rc) ((rc)->rc_count == 0)
-#define refcount_count(rc) ((rc)->rc_count)
-#define refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1)
-#define refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1)
-#define refcount_add_many(rc, number, holder) \
- atomic_add_64_nv(&(rc)->rc_count, (number))
-#define refcount_remove_many(rc, number, holder) \
- atomic_add_64_nv(&(rc)->rc_count, -(number))
-
-#define refcount_init()
-#define refcount_fini()
-
-#endif /* DEBUG */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_REFCOUNT_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
deleted file mode 100644
index f0eb2e1..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
+++ /dev/null
@@ -1,491 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_SPA_H
-#define _SYS_SPA_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/avl.h>
-#include <sys/zfs_context.h>
-#include <sys/nvpair.h>
-#include <sys/sysmacros.h>
-#include <sys/types.h>
-#include <sys/fs/zfs.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Forward references that lots of things need.
- */
-typedef struct spa spa_t;
-typedef struct vdev vdev_t;
-typedef struct metaslab metaslab_t;
-typedef struct zilog zilog_t;
-typedef struct traverse_handle traverse_handle_t;
-struct dsl_pool;
-
-/*
- * General-purpose 32-bit and 64-bit bitfield encodings.
- */
-#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
-#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
-#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
-#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
-
-#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
-#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
-
-#define BF32_SET(x, low, len, val) \
- ((x) ^= BF32_ENCODE(((x) >> (low)) ^ (val), low, len))
-#define BF64_SET(x, low, len, val) \
- ((x) ^= BF64_ENCODE(((x) >> (low)) ^ (val), low, len))
-
-#define BF32_GET_SB(x, low, len, shift, bias) \
- ((BF32_GET(x, low, len) + (bias)) << (shift))
-#define BF64_GET_SB(x, low, len, shift, bias) \
- ((BF64_GET(x, low, len) + (bias)) << (shift))
-
-#define BF32_SET_SB(x, low, len, shift, bias, val) \
- BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
-#define BF64_SET_SB(x, low, len, shift, bias, val) \
- BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
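-
-/*
- * Worked example: storing the value 5 in an 8-bit field at bit 32 of a
- * 64-bit word, then reading it back:
- *
- *	uint64_t w = 0;
- *	BF64_SET(w, 32, 8, 5);			-- w == 0x500000000ULL
- *	ASSERT(BF64_GET(w, 32, 8) == 5);
- *
- * The _SB ("shift/bias") variants additionally scale and bias the value,
- * e.g. for sizes stored in units of 512-byte sectors.
- */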
-
-/*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
- */
-#define SPA_MINBLOCKSHIFT 9
-#define SPA_MAXBLOCKSHIFT 17
-#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
-#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
-
-#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
-
-/*
- * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
- * The ASIZE encoding should be at least 64 times larger (6 more bits)
- * to support up to 4-way RAID-Z mirror mode with worst-case gang block
- * overhead, three DVAs per bp, plus one more bit in case we do anything
- * else that expands the ASIZE.
- */
-#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */
-#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
-#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
-
-/*
- * All SPA data is represented by 128-bit data virtual addresses (DVAs).
- * The members of the dva_t should be considered opaque outside the SPA.
- */
-typedef struct dva {
- uint64_t dva_word[2];
-} dva_t;
-
-/*
- * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
- */
-typedef struct zio_cksum {
- uint64_t zc_word[4];
-} zio_cksum_t;
-
-/*
- * Each block is described by its DVAs, time of birth, checksum, etc.
- * The word-by-word, bit-by-bit layout of the blkptr is as follows:
- *
- * 64 56 48 40 32 24 16 8 0
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 0 | vdev1 | GRID | ASIZE |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 1 |G| offset1 |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 2 | vdev2 | GRID | ASIZE |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 3 |G| offset2 |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 4 | vdev3 | GRID | ASIZE |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 5 |G| offset3 |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 7 | padding |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 8 | padding |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 9 | padding |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * a | birth txg |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * b | fill count |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * c | checksum[0] |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * d | checksum[1] |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * e | checksum[2] |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * f | checksum[3] |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- *
- * Legend:
- *
- * vdev virtual device ID
- * offset offset into virtual device
- * LSIZE logical size
- * PSIZE physical size (after compression)
- * ASIZE allocated size (including RAID-Z parity and gang block headers)
- * GRID RAID-Z layout information (reserved for future use)
- * cksum checksum function
- * comp compression function
- * G gang block indicator
- * E endianness
- * type DMU object type
- * lvl level of indirection
- * birth txg transaction group in which the block was born
- * fill count number of non-zero blocks under this bp
- * checksum[4] 256-bit checksum of the data this bp describes
- */
-typedef struct blkptr {
- dva_t blk_dva[3]; /* 128-bit Data Virtual Address */
- uint64_t blk_prop; /* size, compression, type, etc */
- uint64_t blk_pad[3]; /* Extra space for the future */
- uint64_t blk_birth; /* transaction group at birth */
- uint64_t blk_fill; /* fill count */
- zio_cksum_t blk_cksum; /* 256-bit checksum */
-} blkptr_t;
-
-#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
-#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
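-
-/*
- * Sketch tying the diagram above to the accessor macros below
- * (hypothetical bp). LSIZE and PSIZE are stored biased by one in units
- * of 512-byte sectors, so a 128K logical size is stored as 255:
- *
- *	uint64_t lsize = BP_GET_LSIZE(bp);	-- word 6, bits 0-15
- *	uint64_t psize = BP_GET_PSIZE(bp);	-- word 6, bits 16-31
- *	int gang = BP_IS_GANG(bp);		-- G bit of DVA 0
- */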
-
-/*
- * Macros to get and set fields in a bp or DVA.
- */
-#define DVA_GET_ASIZE(dva) \
- BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0)
-#define DVA_SET_ASIZE(dva, x) \
- BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x)
-
-#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
-#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
-
-#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32)
-#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x)
-
-#define DVA_GET_OFFSET(dva) \
- BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
-#define DVA_SET_OFFSET(dva, x) \
- BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
-
-#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1)
-#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
-
-#define BP_GET_LSIZE(bp) \
- (BP_IS_HOLE(bp) ? 0 : \
- BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1))
-#define BP_SET_LSIZE(bp, x) \
- BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
-
-#define BP_GET_PSIZE(bp) \
- BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
-#define BP_SET_PSIZE(bp, x) \
- BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
-
-#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
-#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
-
-#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
-#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
-
-#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
-#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
-
-#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
-#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
-
-#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
-#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
-
-#define BP_GET_ASIZE(bp) \
- (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
- DVA_GET_ASIZE(&(bp)->blk_dva[2]))
-
-#define BP_GET_UCSIZE(bp) \
- ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
- BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
-
-#define BP_GET_NDVAS(bp) \
- (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
- !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
- !!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
-
-#define BP_COUNT_GANG(bp) \
- (DVA_GET_GANG(&(bp)->blk_dva[0]) + \
- DVA_GET_GANG(&(bp)->blk_dva[1]) + \
- DVA_GET_GANG(&(bp)->blk_dva[2]))
-
-#define DVA_EQUAL(dva1, dva2) \
- ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
- (dva1)->dva_word[0] == (dva2)->dva_word[0])
-
-#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
- (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
- ((zc1).zc_word[1] - (zc2).zc_word[1]) | \
- ((zc1).zc_word[2] - (zc2).zc_word[2]) | \
- ((zc1).zc_word[3] - (zc2).zc_word[3])))
-
-
-#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)
-
-#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \
-{ \
- (zcp)->zc_word[0] = w0; \
- (zcp)->zc_word[1] = w1; \
- (zcp)->zc_word[2] = w2; \
- (zcp)->zc_word[3] = w3; \
-}
-
-#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
-#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
-#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
-#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
-
-#define BP_ZERO(bp) \
-{ \
- (bp)->blk_dva[0].dva_word[0] = 0; \
- (bp)->blk_dva[0].dva_word[1] = 0; \
- (bp)->blk_dva[1].dva_word[0] = 0; \
- (bp)->blk_dva[1].dva_word[1] = 0; \
- (bp)->blk_dva[2].dva_word[0] = 0; \
- (bp)->blk_dva[2].dva_word[1] = 0; \
- (bp)->blk_prop = 0; \
- (bp)->blk_pad[0] = 0; \
- (bp)->blk_pad[1] = 0; \
- (bp)->blk_pad[2] = 0; \
- (bp)->blk_birth = 0; \
- (bp)->blk_fill = 0; \
- ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
-}
-
-/*
- * Note: the byteorder is either 0 or -1, both of which are palindromes.
- * This simplifies the endianness handling a bit.
- */
-#if BYTE_ORDER == _BIG_ENDIAN
-#define ZFS_HOST_BYTEORDER (0ULL)
-#else
-#define ZFS_HOST_BYTEORDER (-1ULL)
-#endif
-
-#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
-
-#define BP_SPRINTF_LEN 320
-
-#include <sys/dmu.h>
-
-#define BP_GET_BUFC_TYPE(bp) \
- (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \
- ARC_BUFC_METADATA : ARC_BUFC_DATA)
-/*
- * Routines found in spa.c
- */
-
-/* state manipulation functions */
-extern int spa_open(const char *pool, spa_t **, void *tag);
-extern int spa_get_stats(const char *pool, nvlist_t **config,
- char *altroot, size_t buflen);
-extern int spa_create(const char *pool, nvlist_t *config, const char *altroot);
-extern int spa_import(const char *pool, nvlist_t *config, const char *altroot);
-extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
-extern int spa_destroy(char *pool);
-extern int spa_export(char *pool, nvlist_t **oldconfig);
-extern int spa_reset(char *pool);
-extern void spa_async_request(spa_t *spa, int flag);
-extern void spa_async_suspend(spa_t *spa);
-extern void spa_async_resume(spa_t *spa);
-extern spa_t *spa_inject_addref(char *pool);
-extern void spa_inject_delref(spa_t *spa);
-
-#define SPA_ASYNC_REOPEN 0x01
-#define SPA_ASYNC_REPLACE_DONE 0x02
-#define SPA_ASYNC_SCRUB 0x04
-#define SPA_ASYNC_RESILVER 0x08
-#define SPA_ASYNC_CONFIG_UPDATE 0x10
-
-/* device manipulation */
-extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
-extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
- int replacing);
-extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done);
-extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
-extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
-
-/* spare state (which is global across all pools) */
-extern void spa_spare_add(vdev_t *vd);
-extern void spa_spare_remove(vdev_t *vd);
-extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool);
-extern void spa_spare_activate(vdev_t *vd);
-
-/* scrubbing */
-extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force);
-extern void spa_scrub_suspend(spa_t *spa);
-extern void spa_scrub_resume(spa_t *spa);
-extern void spa_scrub_restart(spa_t *spa, uint64_t txg);
-
-/* spa syncing */
-extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
-extern void spa_sync_allpools(void);
-
-/*
- * SPA configuration functions in spa_config.c
- */
-
-#define SPA_CONFIG_UPDATE_POOL 0
-#define SPA_CONFIG_UPDATE_VDEVS 1
-
-extern void spa_config_sync(void);
-extern void spa_config_load(void);
-extern nvlist_t *spa_all_configs(uint64_t *);
-extern void spa_config_set(spa_t *spa, nvlist_t *config);
-extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
- int getstats);
-extern void spa_config_update(spa_t *spa, int what);
-
-/*
- * Miscellaneous SPA routines in spa_misc.c
- */
-
-/* Namespace manipulation */
-extern spa_t *spa_lookup(const char *name);
-extern spa_t *spa_add(const char *name, const char *altroot);
-extern void spa_remove(spa_t *spa);
-extern spa_t *spa_next(spa_t *prev);
-
-/* Refcount functions */
-extern void spa_open_ref(spa_t *spa, void *tag);
-extern void spa_close(spa_t *spa, void *tag);
-extern boolean_t spa_refcount_zero(spa_t *spa);
-
-/* Pool configuration lock */
-extern void spa_config_enter(spa_t *spa, krw_t rw, void *tag);
-extern void spa_config_exit(spa_t *spa, void *tag);
-extern boolean_t spa_config_held(spa_t *spa, krw_t rw);
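-
-/*
- * Usage sketch of the config lock pairing (hypothetical reader; FTAG
- * comes from sys/refcount.h):
- *
- *	spa_config_enter(spa, RW_READER, FTAG);
- *	... walk spa_root_vdev ...
- *	spa_config_exit(spa, FTAG);
- */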
-
-/* Pool vdev add/remove lock */
-extern uint64_t spa_vdev_enter(spa_t *spa);
-extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
-
-/* Accessor functions */
-extern krwlock_t *spa_traverse_rwlock(spa_t *spa);
-extern int spa_traverse_wanted(spa_t *spa);
-extern struct dsl_pool *spa_get_dsl(spa_t *spa);
-extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
-extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
-extern void spa_altroot(spa_t *, char *, size_t);
-extern int spa_sync_pass(spa_t *spa);
-extern char *spa_name(spa_t *spa);
-extern uint64_t spa_guid(spa_t *spa);
-extern uint64_t spa_last_synced_txg(spa_t *spa);
-extern uint64_t spa_first_txg(spa_t *spa);
-extern uint64_t spa_version(spa_t *spa);
-extern int spa_state(spa_t *spa);
-extern uint64_t spa_freeze_txg(spa_t *spa);
-struct metaslab_class;
-extern struct metaslab_class *spa_metaslab_class_select(spa_t *spa);
-extern uint64_t spa_get_alloc(spa_t *spa);
-extern uint64_t spa_get_space(spa_t *spa);
-extern uint64_t spa_get_dspace(spa_t *spa);
-extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
-extern int spa_max_replication(spa_t *spa);
-extern int spa_busy(void);
-
-/* Miscellaneous support routines */
-extern int spa_rename(const char *oldname, const char *newname);
-extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
-extern char *spa_strdup(const char *);
-extern void spa_strfree(char *);
-extern uint64_t spa_get_random(uint64_t range);
-extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp);
-extern void spa_freeze(spa_t *spa);
-extern void spa_upgrade(spa_t *spa);
-extern void spa_evict_all(void);
-extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid);
-extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
-extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp);
-
-/* history logging */
-extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
-extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
- char *his_buf);
-extern int spa_history_log(spa_t *spa, const char *his_buf,
- uint64_t pool_create);
-
-/* error handling */
-struct zbookmark;
-struct zio;
-extern void spa_log_error(spa_t *spa, struct zio *zio);
-extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
- struct zio *zio, uint64_t stateoroffset, uint64_t length);
-extern void zfs_post_ok(spa_t *spa, vdev_t *vd);
-extern uint64_t spa_get_errlog_size(spa_t *spa);
-extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
-extern void spa_errlog_rotate(spa_t *spa);
-extern void spa_errlog_drain(spa_t *spa);
-extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
-extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
-
-/* Initialization and termination */
-extern void spa_init(int flags);
-extern void spa_fini(void);
-
-/* properties */
-extern int spa_set_props(spa_t *spa, nvlist_t *nvp);
-extern int spa_get_props(spa_t *spa, nvlist_t **nvp);
-extern void spa_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
-extern boolean_t spa_has_bootfs(spa_t *spa);
-
-#ifdef ZFS_DEBUG
-#define dprintf_bp(bp, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
- sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \
- dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
- kmem_free(__blkbuf, BP_SPRINTF_LEN); \
- } \
-_NOTE(CONSTCOND) } while (0)
-#else
-#define dprintf_bp(bp, fmt, ...)
-#endif
-
-extern int spa_mode; /* mode, e.g. FREAD | FWRITE */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SPA_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
deleted file mode 100644
index 8c57123..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_SPA_IMPL_H
-#define _SYS_SPA_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/vdev.h>
-#include <sys/metaslab.h>
-#include <sys/dmu.h>
-#include <sys/dsl_pool.h>
-#include <sys/uberblock_impl.h>
-#include <sys/zfs_context.h>
-#include <sys/avl.h>
-#include <sys/refcount.h>
-#include <sys/bplist.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct spa_config_lock {
- kmutex_t scl_lock;
- refcount_t scl_count;
- kthread_t *scl_writer;
- kcondvar_t scl_cv;
-} spa_config_lock_t;
-
-typedef struct spa_error_entry {
- zbookmark_t se_bookmark;
- char *se_name;
- avl_node_t se_avl;
-} spa_error_entry_t;
-
-typedef struct spa_history_phys {
- uint64_t sh_pool_create_len; /* ending offset of zpool create */
- uint64_t sh_phys_max_off; /* physical EOF */
- uint64_t sh_bof; /* logical BOF */
- uint64_t sh_eof; /* logical EOF */
- uint64_t sh_records_lost; /* num of records overwritten */
-} spa_history_phys_t;
-
-typedef struct spa_props {
- nvlist_t *spa_props_nvp;
- list_node_t spa_list_node;
-} spa_props_t;
-
-struct spa {
- /*
- * Fields protected by spa_namespace_lock.
- */
- char *spa_name; /* pool name */
- avl_node_t spa_avl; /* node in spa_namespace_avl */
- nvlist_t *spa_config; /* last synced config */
- nvlist_t *spa_config_syncing; /* currently syncing config */
- uint64_t spa_config_txg; /* txg of last config change */
- kmutex_t spa_config_cache_lock; /* for spa_config RW_READER */
- int spa_sync_pass; /* iterate-to-convergence */
- int spa_state; /* pool state */
- int spa_inject_ref; /* injection references */
- uint8_t spa_traverse_wanted; /* traverse lock wanted */
- uint8_t spa_sync_on; /* sync threads are running */
- spa_load_state_t spa_load_state; /* current load operation */
- taskq_t *spa_zio_issue_taskq[ZIO_TYPES];
- taskq_t *spa_zio_intr_taskq[ZIO_TYPES];
- dsl_pool_t *spa_dsl_pool;
- metaslab_class_t *spa_normal_class; /* normal data class */
- uint64_t spa_first_txg; /* first txg after spa_open() */
- uint64_t spa_final_txg; /* txg of export/destroy */
- uint64_t spa_freeze_txg; /* freeze pool at this txg */
- objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
- txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
- vdev_t *spa_root_vdev; /* top-level vdev container */
- uint64_t spa_load_guid; /* initial guid for spa_load */
- list_t spa_dirty_list; /* vdevs with dirty labels */
- uint64_t spa_spares_object; /* MOS object for spare list */
- nvlist_t *spa_sparelist; /* cached spare config */
- vdev_t **spa_spares; /* available hot spares */
- int spa_nspares; /* number of hot spares */
- boolean_t spa_sync_spares; /* sync the spares list */
- uint64_t spa_config_object; /* MOS object for pool config */
- uint64_t spa_syncing_txg; /* txg currently syncing */
- uint64_t spa_sync_bplist_obj; /* object for deferred frees */
- bplist_t spa_sync_bplist; /* deferred-free bplist */
- krwlock_t spa_traverse_lock; /* traverse vs. spa_sync() */
- uberblock_t spa_ubsync; /* last synced uberblock */
- uberblock_t spa_uberblock; /* current uberblock */
- kmutex_t spa_scrub_lock; /* resilver/scrub lock */
- kthread_t *spa_scrub_thread; /* scrub/resilver thread */
- traverse_handle_t *spa_scrub_th; /* scrub traverse handle */
- uint64_t spa_scrub_restart_txg; /* need to restart */
- uint64_t spa_scrub_mintxg; /* min txg we'll scrub */
- uint64_t spa_scrub_maxtxg; /* max txg we'll scrub */
- uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
- uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */
- uint64_t spa_scrub_errors; /* scrub I/O error count */
- int spa_scrub_suspended; /* tell scrubber to suspend */
- kcondvar_t spa_scrub_cv; /* scrub thread state change */
- kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
- uint8_t spa_scrub_stop; /* tell scrubber to stop */
- uint8_t spa_scrub_active; /* active or suspended? */
- uint8_t spa_scrub_type; /* type of scrub we're doing */
- uint8_t spa_scrub_finished; /* indicator to rotate logs */
- kmutex_t spa_async_lock; /* protect async state */
- kthread_t *spa_async_thread; /* thread doing async task */
- int spa_async_suspended; /* async tasks suspended */
- kcondvar_t spa_async_cv; /* wait for thread_exit() */
- uint16_t spa_async_tasks; /* async task mask */
- char *spa_root; /* alternate root directory */
- kmutex_t spa_uberblock_lock; /* vdev_uberblock_load_done() */
- uint64_t spa_ena; /* spa-wide ereport ENA */
-	boolean_t	spa_last_open_failed;	/* true if last open failed */
- kmutex_t spa_errlog_lock; /* error log lock */
- uint64_t spa_errlog_last; /* last error log object */
- uint64_t spa_errlog_scrub; /* scrub error log object */
- kmutex_t spa_errlist_lock; /* error list/ereport lock */
- avl_tree_t spa_errlist_last; /* last error list */
- avl_tree_t spa_errlist_scrub; /* scrub error list */
- uint64_t spa_deflate; /* should we deflate? */
- uint64_t spa_history; /* history object */
- kmutex_t spa_history_lock; /* history lock */
- vdev_t *spa_pending_vdev; /* pending vdev additions */
- nvlist_t **spa_pending_spares; /* pending spare additions */
- uint_t spa_pending_nspares; /* # pending spares */
- kmutex_t spa_props_lock; /* property lock */
- uint64_t spa_pool_props_object; /* object for properties */
- uint64_t spa_bootfs; /* default boot filesystem */
- /*
-	 * spa_refcount must be the last element because it changes size based on
- * compilation options. In order for the MDB module to function
- * correctly, the other fields must remain in the same location.
- */
- spa_config_lock_t spa_config_lock; /* configuration changes */
- refcount_t spa_refcount; /* number of opens */
-};
-
-extern const char *spa_config_dir;
-extern kmutex_t spa_namespace_lock;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SPA_IMPL_H */
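
The comment above spa_refcount is worth illustrating: refcount_t grows when reference tracking is compiled in, so placing it (and the config lock) at the end keeps every earlier field at a stable offset for debuggers such as MDB. A minimal standalone sketch of the idea follows; the names are illustrative only, not the in-tree types.

/*
 * Standalone sketch: a field whose size depends on compile options
 * must come last so earlier offsets stay stable.
 */
#include <stdio.h>
#include <stddef.h>

#ifdef DEBUG_REFCOUNT
typedef struct refcount { long rc_count; void *rc_tracked[16]; } refcount_t;
#else
typedef struct refcount { long rc_count; } refcount_t;
#endif

struct pool {
	char		*p_name;	/* offset is the same either way */
	unsigned long	p_txg;		/* ditto */
	refcount_t	p_refcount;	/* size varies; keep it last */
};

int
main(void)
{
	/* Only p_refcount's own size changes; earlier offsets do not. */
	printf("p_name   at %zu\n", offsetof(struct pool, p_name));
	printf("p_txg    at %zu\n", offsetof(struct pool, p_txg));
	printf("refcount at %zu, sizeof %zu\n",
	    offsetof(struct pool, p_refcount), sizeof (refcount_t));
	return (0);
}

Compiled with and without -DDEBUG_REFCOUNT, only the total struct size and the last field's size change.
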
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
deleted file mode 100644
index db9daef..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_SPACE_MAP_H
-#define _SYS_SPACE_MAP_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/avl.h>
-#include <sys/dmu.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct space_map_ops space_map_ops_t;
-
-typedef struct space_map {
- avl_tree_t sm_root; /* AVL tree of map segments */
- uint64_t sm_space; /* sum of all segments in the map */
- uint64_t sm_start; /* start of map */
- uint64_t sm_size; /* size of map */
- uint8_t sm_shift; /* unit shift */
- uint8_t sm_pad[3]; /* unused */
- uint8_t sm_loaded; /* map loaded? */
- uint8_t sm_loading; /* map loading? */
- kcondvar_t sm_load_cv; /* map load completion */
- space_map_ops_t *sm_ops; /* space map block picker ops vector */
- void *sm_ppd; /* picker-private data */
- kmutex_t *sm_lock; /* pointer to lock that protects map */
-} space_map_t;
-
-typedef struct space_seg {
- avl_node_t ss_node; /* AVL node */
- uint64_t ss_start; /* starting offset of this segment */
- uint64_t ss_end; /* ending offset (non-inclusive) */
-} space_seg_t;
-
-typedef struct space_map_obj {
- uint64_t smo_object; /* on-disk space map object */
- uint64_t smo_objsize; /* size of the object */
- uint64_t smo_alloc; /* space allocated from the map */
-} space_map_obj_t;
-
-struct space_map_ops {
- void (*smop_load)(space_map_t *sm);
- void (*smop_unload)(space_map_t *sm);
- uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size);
- void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
- void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
-};
-
-/*
- * debug entry
- *
- * 1 3 10 50
- * ,---+--------+------------+---------------------------------.
- * | 1 | action | syncpass | txg (lower bits) |
- * `---+--------+------------+---------------------------------'
- * 63 62 60 59 50 49 0
- *
- *
- *
- * non-debug entry
- *
- * 1 47 1 15
- * ,-----------------------------------------------------------.
- * | 0 | offset (sm_shift units) | type | run |
- * `-----------------------------------------------------------'
- *   63  62                          16   15  14               0
- */
-
-/* All this stuff takes and returns bytes */
-#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1)
-#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15)
-#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
-#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
-#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47)
-#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47)
-#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1)
-#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1)
-
-#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3)
-#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3)
-
-#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10)
-#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10)
-
-#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50)
-#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50)
-
-#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
-
-#define SM_ALLOC 0x0
-#define SM_FREE 0x1
-
-/*
- * The data for a given space map can be kept on blocks of any size.
- * Larger blocks entail fewer i/o operations, but they also cause the
- * DMU to keep more data in-core, and also to waste more i/o bandwidth
- * when only a few blocks have changed since the last transaction group.
- * This could use a lot more research, but for now, set the freelist
- * block size to 4k (2^12).
- */
-#define SPACE_MAP_BLOCKSHIFT 12
-
-typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size);
-
-extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
- uint8_t shift, kmutex_t *lp);
-extern void space_map_destroy(space_map_t *sm);
-extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
-extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_vacate(space_map_t *sm,
- space_map_func_t *func, space_map_t *mdest);
-extern void space_map_walk(space_map_t *sm,
- space_map_func_t *func, space_map_t *mdest);
-extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_union(space_map_t *smd, space_map_t *sms);
-
-extern void space_map_load_wait(space_map_t *sm);
-extern int space_map_load(space_map_t *sm, space_map_ops_t *ops,
- uint8_t maptype, space_map_obj_t *smo, objset_t *os);
-extern void space_map_unload(space_map_t *sm);
-
-extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size);
-extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size);
-
-extern void space_map_sync(space_map_t *sm, uint8_t maptype,
- space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx);
-extern void space_map_truncate(space_map_obj_t *smo,
- objset_t *os, dmu_tx_t *tx);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SPACE_MAP_H */
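
The entry encoding documented above can be exercised on its own. Here is a minimal standalone sketch, with BF64_ENCODE/BF64_DECODE reimplemented locally (in the tree they come from the zfs_context headers), building and decoding a non-debug allocation entry:

/* Standalone sketch of the space map entry encoding. */
#include <stdio.h>
#include <stdint.h>

#define	BF64_DECODE(x, low, len)	(((x) >> (low)) & ((1ULL << (len)) - 1))
#define	BF64_ENCODE(x, low, len)	(((x) & ((1ULL << (len)) - 1)) << (low))

#define	SM_RUN_ENCODE(x)	BF64_ENCODE((x) - 1, 0, 15)
#define	SM_RUN_DECODE(x)	(BF64_DECODE(x, 0, 15) + 1)
#define	SM_TYPE_ENCODE(x)	BF64_ENCODE(x, 15, 1)
#define	SM_OFFSET_ENCODE(x)	BF64_ENCODE(x, 16, 47)
#define	SM_OFFSET_DECODE(x)	BF64_DECODE(x, 16, 47)
#define	SM_ALLOC		0x0

int
main(void)
{
	uint64_t offset = 12345;	/* in sm_shift units */
	uint64_t run = 8;		/* number of contiguous segments */

	/* debug bit (63) stays 0 for a normal entry */
	uint64_t e = SM_OFFSET_ENCODE(offset) | SM_TYPE_ENCODE(SM_ALLOC) |
	    SM_RUN_ENCODE(run);

	printf("entry=0x%016llx offset=%llu run=%llu\n",
	    (unsigned long long)e,
	    (unsigned long long)SM_OFFSET_DECODE(e),
	    (unsigned long long)SM_RUN_DECODE(e));
	return (0);
}

The +1/-1 in the run macros lets the 15-bit field describe runs of 1 through 32768 (SM_RUN_MAX) segments rather than wasting the zero encoding.
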
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
deleted file mode 100644
index dae129c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_TXG_H
-#define _SYS_TXG_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */
-#define TXG_SIZE 4 /* next power of 2 */
-#define TXG_MASK (TXG_SIZE - 1) /* mask for size */
-#define TXG_INITIAL TXG_SIZE /* initial txg */
-#define TXG_IDX (txg & TXG_MASK)
-
-#define TXG_WAIT 1ULL
-#define TXG_NOWAIT 2ULL
-
-typedef struct tx_cpu tx_cpu_t;
-
-typedef struct txg_handle {
- tx_cpu_t *th_cpu;
- uint64_t th_txg;
-} txg_handle_t;
-
-typedef struct txg_node {
- struct txg_node *tn_next[TXG_SIZE];
- uint8_t tn_member[TXG_SIZE];
-} txg_node_t;
-
-typedef struct txg_list {
- kmutex_t tl_lock;
- size_t tl_offset;
- txg_node_t *tl_head[TXG_SIZE];
-} txg_list_t;
-
-struct dsl_pool;
-
-extern void txg_init(struct dsl_pool *dp, uint64_t txg);
-extern void txg_fini(struct dsl_pool *dp);
-extern void txg_sync_start(struct dsl_pool *dp);
-extern void txg_sync_stop(struct dsl_pool *dp);
-extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp);
-extern void txg_rele_to_quiesce(txg_handle_t *txghp);
-extern void txg_rele_to_sync(txg_handle_t *txghp);
-extern void txg_suspend(struct dsl_pool *dp);
-extern void txg_resume(struct dsl_pool *dp);
-
-/*
- * Wait until the given transaction group has finished syncing.
- * Try to make this happen as soon as possible (eg. kick off any
- * necessary syncs immediately). If txg==0, wait for the currently open
- * txg to finish syncing.
- */
-extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg);
-
-/*
- * Wait until the given transaction group, or one after it, is
- * the open transaction group. Try to make this happen as soon
- * as possible (eg. kick off any necessary syncs immediately).
- * If txg == 0, wait for the next open txg.
- */
-extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg);
-
-/*
- * Returns TRUE if we are "backed up" waiting for the syncing
- * transaction to complete; otherwise returns FALSE.
- */
-extern int txg_stalled(struct dsl_pool *dp);
-
-/*
- * Per-txg object lists.
- */
-
-#define TXG_CLEAN(txg) ((txg) - 1)
-
-extern void txg_list_create(txg_list_t *tl, size_t offset);
-extern void txg_list_destroy(txg_list_t *tl);
-extern int txg_list_empty(txg_list_t *tl, uint64_t txg);
-extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
-extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
-extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg);
-extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
-extern void *txg_list_head(txg_list_t *tl, uint64_t txg);
-extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_TXG_H */
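
TXG_SIZE is TXG_CONCURRENT_STATES rounded up to a power of two, so that slot selection is a mask rather than a modulo. A standalone sketch showing that the three in-flight transaction groups always occupy distinct slots of the per-txg rings:

/* Standalone sketch: per-txg state indexed by (txg & TXG_MASK). */
#include <stdio.h>
#include <stdint.h>

#define	TXG_SIZE	4
#define	TXG_MASK	(TXG_SIZE - 1)

int
main(void)
{
	for (uint64_t syncing = 4; syncing < 8; syncing++) {
		uint64_t quiescing = syncing + 1;
		uint64_t open = syncing + 2;

		/* three consecutive txgs never collide in a 4-slot ring */
		printf("syncing %llu->slot %llu, quiescing %llu->slot %llu, "
		    "open %llu->slot %llu\n",
		    (unsigned long long)syncing,
		    (unsigned long long)(syncing & TXG_MASK),
		    (unsigned long long)quiescing,
		    (unsigned long long)(quiescing & TXG_MASK),
		    (unsigned long long)open,
		    (unsigned long long)(open & TXG_MASK));
	}
	return (0);
}
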
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
deleted file mode 100644
index 45a138a..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_TXG_IMPL_H
-#define _SYS_TXG_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/txg.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct tx_cpu {
- kmutex_t tc_lock;
- kcondvar_t tc_cv[TXG_SIZE];
- uint64_t tc_count[TXG_SIZE];
- char tc_pad[16];
-};
-
-typedef struct tx_state {
- tx_cpu_t *tx_cpu; /* protects right to enter txg */
- kmutex_t tx_sync_lock; /* protects tx_state_t */
- krwlock_t tx_suspend;
- uint64_t tx_open_txg; /* currently open txg id */
- uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
- uint64_t tx_syncing_txg; /* currently syncing txg id */
- uint64_t tx_synced_txg; /* last synced txg id */
-
- uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */
- uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */
-
- kcondvar_t tx_sync_more_cv;
- kcondvar_t tx_sync_done_cv;
- kcondvar_t tx_quiesce_more_cv;
- kcondvar_t tx_quiesce_done_cv;
- kcondvar_t tx_timeout_exit_cv;
- kcondvar_t tx_exit_cv; /* wait for all threads to exit */
-
- uint8_t tx_threads; /* number of threads */
- uint8_t tx_exiting; /* set when we're exiting */
-
- kthread_t *tx_sync_thread;
- kthread_t *tx_quiesce_thread;
- kthread_t *tx_timelimit_thread;
-} tx_state_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_TXG_IMPL_H */
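
The tx_cpu array implements scalable hold counting: each CPU counts holds on its own cache line (tc_pad separates them), and a txg has quiesced once the per-cpu counts sum to zero. A standalone sketch of that bookkeeping, with the locking and condition variables omitted:

/* Standalone sketch of per-cpu txg hold counting. */
#include <stdio.h>
#include <stdint.h>

#define	TXG_SIZE	4
#define	TXG_MASK	(TXG_SIZE - 1)
#define	NCPU		4

static uint64_t tc_count[NCPU][TXG_SIZE];	/* stand-in for tx_cpu_t */

static uint64_t
txg_holds(uint64_t txg)
{
	uint64_t sum = 0;

	for (int c = 0; c < NCPU; c++)
		sum += tc_count[c][txg & TXG_MASK];
	return (sum);
}

int
main(void)
{
	tc_count[0][5 & TXG_MASK] = 2;		/* two holds on cpu 0 */
	tc_count[3][5 & TXG_MASK] = 1;		/* one hold on cpu 3 */
	printf("txg 5 holds: %llu\n", (unsigned long long)txg_holds(5));

	tc_count[0][5 & TXG_MASK] = 0;
	tc_count[3][5 & TXG_MASK] = 0;
	printf("txg 5 quiesced: %s\n", txg_holds(5) == 0 ? "yes" : "no");
	return (0);
}
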
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
deleted file mode 100644
index 93d936a..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_UBERBLOCK_H
-#define _SYS_UBERBLOCK_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/vdev.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct uberblock uberblock_t;
-
-extern int uberblock_verify(uberblock_t *ub);
-extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_UBERBLOCK_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
deleted file mode 100644
index ab0f2dc..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_UBERBLOCK_IMPL_H
-#define _SYS_UBERBLOCK_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/uberblock.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * The uberblock version is incremented whenever an incompatible on-disk
- * format change is made to the SPA, DMU, or ZAP.
- *
- * Note: the first two fields should never be moved. When a storage pool
- * is opened, the uberblock must be read off the disk before the version
- * can be checked. If the ub_version field is moved, we may not detect
- * version mismatch. If the ub_magic field is moved, applications that
- * expect the magic number in the first word won't work.
- */
-#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
-#define UBERBLOCK_SHIFT 10 /* up to 1K */
-
-struct uberblock {
- uint64_t ub_magic; /* UBERBLOCK_MAGIC */
- uint64_t ub_version; /* ZFS_VERSION */
- uint64_t ub_txg; /* txg of last sync */
- uint64_t ub_guid_sum; /* sum of all vdev guids */
- uint64_t ub_timestamp; /* UTC time of last sync */
- blkptr_t ub_rootbp; /* MOS objset_phys_t */
-};
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_UBERBLOCK_IMPL_H */
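
Because ub_magic must stay in the first word, a reader can recognize both a valid uberblock and one written with the opposite endianness before interpreting anything else. A standalone sketch of that check (the in-tree uberblock_verify additionally byteswaps the remaining fields in the swapped case, omitted here):

/* Standalone sketch of the uberblock magic/endianness check. */
#include <stdio.h>
#include <stdint.h>

#define	UBERBLOCK_MAGIC	0x00bab10cULL	/* oo-ba-bloc! */

static uint64_t
bswap64(uint64_t x)
{
	x = (x >> 32) | (x << 32);
	x = ((x >> 16) & 0x0000ffff0000ffffULL) |
	    ((x & 0x0000ffff0000ffffULL) << 16);
	return (((x >> 8) & 0x00ff00ff00ff00ffULL) |
	    ((x & 0x00ff00ff00ff00ffULL) << 8));
}

static int
magic_ok(uint64_t ub_magic)
{
	return (ub_magic == UBERBLOCK_MAGIC ||
	    ub_magic == bswap64(UBERBLOCK_MAGIC));
}

int
main(void)
{
	printf("native:  %d\n", magic_ok(UBERBLOCK_MAGIC));
	printf("swapped: %d\n", magic_ok(bswap64(UBERBLOCK_MAGIC)));
	printf("garbage: %d\n", magic_ok(0xdeadbeefULL));
	return (0);
}
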
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
deleted file mode 100644
index c8c177e..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_UNIQUE_H
-#define _SYS_UNIQUE_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* The number of significant bits in each unique value. */
-#define UNIQUE_BITS 56
-
-void unique_init(void);
-
-/* Return a new unique value. */
-uint64_t unique_create(void);
-
-/* Return a unique value, which equals the one passed in if possible. */
-uint64_t unique_insert(uint64_t value);
-
-/* Indicate that this value no longer needs to be uniquified against. */
-void unique_remove(uint64_t value);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_UNIQUE_H */
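
Limiting unique values to 56 significant bits leaves the top byte of a 64-bit word free, so a unique value can sit alongside a small tag in one word. A standalone sketch of such packing; the tag field is hypothetical, purely to show why the headroom matters:

/* Standalone sketch: UNIQUE_BITS leaves 8 bits of headroom per word. */
#include <stdio.h>
#include <stdint.h>

#define	UNIQUE_BITS	56
#define	UNIQUE_MASK	((1ULL << UNIQUE_BITS) - 1)

int
main(void)
{
	uint64_t value = 0x1234567890abcdefULL & UNIQUE_MASK;
	uint8_t tag = 0x5a;		/* hypothetical top-byte payload */
	uint64_t packed = ((uint64_t)tag << UNIQUE_BITS) | value;

	printf("value=0x%014llx packed=0x%016llx\n",
	    (unsigned long long)value, (unsigned long long)packed);
	return (0);
}
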
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
deleted file mode 100644
index 3120811..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_VDEV_H
-#define _SYS_VDEV_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu.h>
-#include <sys/space_map.h>
-#include <sys/fs/zfs.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern boolean_t zfs_nocacheflush;
-
-/*
- * Fault injection modes.
- */
-#define VDEV_FAULT_NONE 0
-#define VDEV_FAULT_RANDOM 1
-#define VDEV_FAULT_COUNT 2
-
-extern int vdev_open(vdev_t *);
-extern int vdev_validate(vdev_t *);
-extern void vdev_close(vdev_t *);
-extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
-extern void vdev_init(vdev_t *, uint64_t txg);
-extern void vdev_reopen(vdev_t *);
-extern int vdev_validate_spare(vdev_t *);
-
-extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
-extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
-extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size);
-extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size);
-extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
- int scrub_done);
-
-extern const char *vdev_description(vdev_t *vd);
-
-extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
-extern void vdev_metaslab_fini(vdev_t *vd);
-
-extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
-extern void vdev_stat_update(zio_t *zio);
-extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
- boolean_t complete);
-extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
-extern void vdev_propagate_state(vdev_t *vd);
-extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
- vdev_aux_t aux);
-
-extern void vdev_space_update(vdev_t *vd, int64_t space_delta,
- int64_t alloc_delta);
-
-extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
-
-extern void vdev_io_start(zio_t *zio);
-extern void vdev_io_done(zio_t *zio);
-
-extern int vdev_online(spa_t *spa, uint64_t guid);
-extern int vdev_offline(spa_t *spa, uint64_t guid, int istmp);
-extern void vdev_clear(spa_t *spa, vdev_t *vd);
-
-extern int vdev_error_inject(vdev_t *vd, zio_t *zio);
-extern int vdev_is_dead(vdev_t *vd);
-
-extern void vdev_cache_init(vdev_t *vd);
-extern void vdev_cache_fini(vdev_t *vd);
-extern int vdev_cache_read(zio_t *zio);
-extern void vdev_cache_write(zio_t *zio);
-
-extern void vdev_queue_init(vdev_t *vd);
-extern void vdev_queue_fini(vdev_t *vd);
-extern zio_t *vdev_queue_io(zio_t *zio);
-extern void vdev_queue_io_done(zio_t *zio);
-
-extern void vdev_config_dirty(vdev_t *vd);
-extern void vdev_config_clean(vdev_t *vd);
-extern int vdev_config_sync(vdev_t *vd, uint64_t txg);
-
-extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
- boolean_t getstats, boolean_t isspare);
-
-/*
- * Label routines
- */
-struct uberblock;
-extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
-extern nvlist_t *vdev_label_read_config(vdev_t *vd);
-extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub);
-
-typedef enum {
- VDEV_LABEL_CREATE, /* create/add a new device */
- VDEV_LABEL_REPLACE, /* replace an existing device */
- VDEV_LABEL_SPARE, /* add a new hot spare */
- VDEV_LABEL_REMOVE /* remove an existing device */
-} vdev_labeltype_t;
-
-extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
deleted file mode 100644
index 95536a7..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_VDEV_DISK_H
-#define _SYS_VDEV_DISK_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/vdev.h>
-#ifdef _KERNEL
-#include <sys/sunldi.h>
-#include <sys/sunddi.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct vdev_disk {
- ddi_devid_t vd_devid;
- char *vd_minor;
- ldi_handle_t vd_lh;
-} vdev_disk_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_DISK_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
deleted file mode 100644
index cd49673..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_VDEV_FILE_H
-#define _SYS_VDEV_FILE_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/vdev.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct vdev_file {
- vnode_t *vf_vnode;
-} vdev_file_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_FILE_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
deleted file mode 100644
index aba7567..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
+++ /dev/null
@@ -1,298 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_VDEV_IMPL_H
-#define _SYS_VDEV_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/avl.h>
-#include <sys/dmu.h>
-#include <sys/metaslab.h>
-#include <sys/nvpair.h>
-#include <sys/space_map.h>
-#include <sys/vdev.h>
-#include <sys/dkio.h>
-#include <sys/uberblock_impl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Virtual device descriptors.
- *
- * All storage pool operations go through the virtual device framework,
- * which provides data replication and I/O scheduling.
- */
-
-/*
- * Forward declarations that lots of things need.
- */
-typedef struct vdev_queue vdev_queue_t;
-typedef struct vdev_cache vdev_cache_t;
-typedef struct vdev_cache_entry vdev_cache_entry_t;
-
-/*
- * Virtual device operations
- */
-typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift);
-typedef void vdev_close_func_t(vdev_t *vd);
-typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
-typedef void vdev_io_start_func_t(zio_t *zio);
-typedef void vdev_io_done_func_t(zio_t *zio);
-typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
-
-typedef struct vdev_ops {
- vdev_open_func_t *vdev_op_open;
- vdev_close_func_t *vdev_op_close;
- vdev_asize_func_t *vdev_op_asize;
- vdev_io_start_func_t *vdev_op_io_start;
- vdev_io_done_func_t *vdev_op_io_done;
- vdev_state_change_func_t *vdev_op_state_change;
- char vdev_op_type[16];
- boolean_t vdev_op_leaf;
-} vdev_ops_t;
-
-/*
- * Virtual device properties
- */
-struct vdev_cache_entry {
- char *ve_data;
- uint64_t ve_offset;
- uint64_t ve_lastused;
- avl_node_t ve_offset_node;
- avl_node_t ve_lastused_node;
- uint32_t ve_hits;
- uint16_t ve_missed_update;
- zio_t *ve_fill_io;
-};
-
-struct vdev_cache {
- avl_tree_t vc_offset_tree;
- avl_tree_t vc_lastused_tree;
- kmutex_t vc_lock;
-};
-
-struct vdev_queue {
- avl_tree_t vq_deadline_tree;
- avl_tree_t vq_read_tree;
- avl_tree_t vq_write_tree;
- avl_tree_t vq_pending_tree;
- kmutex_t vq_lock;
-};
-
-/*
- * Virtual device descriptor
- */
-struct vdev {
- /*
- * Common to all vdev types.
- */
- uint64_t vdev_id; /* child number in vdev parent */
- uint64_t vdev_guid; /* unique ID for this vdev */
- uint64_t vdev_guid_sum; /* self guid + all child guids */
- uint64_t vdev_asize; /* allocatable device capacity */
- uint64_t vdev_ashift; /* block alignment shift */
- uint64_t vdev_state; /* see VDEV_STATE_* #defines */
- uint64_t vdev_prevstate; /* used when reopening a vdev */
- vdev_ops_t *vdev_ops; /* vdev operations */
- spa_t *vdev_spa; /* spa for this vdev */
- void *vdev_tsd; /* type-specific data */
- vdev_t *vdev_top; /* top-level vdev */
- vdev_t *vdev_parent; /* parent vdev */
- vdev_t **vdev_child; /* array of children */
- uint64_t vdev_children; /* number of children */
- space_map_t vdev_dtl_map; /* dirty time log in-core state */
- space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */
- vdev_stat_t vdev_stat; /* virtual device statistics */
-
- /*
- * Top-level vdev state.
- */
- uint64_t vdev_ms_array; /* metaslab array object */
- uint64_t vdev_ms_shift; /* metaslab size shift */
- uint64_t vdev_ms_count; /* number of metaslabs */
- metaslab_group_t *vdev_mg; /* metaslab group */
- metaslab_t **vdev_ms; /* metaslab array */
- txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
- txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
- txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
- uint8_t vdev_reopen_wanted; /* async reopen wanted? */
- list_node_t vdev_dirty_node; /* config dirty list */
- uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
-
- /*
- * Leaf vdev state.
- */
- uint64_t vdev_psize; /* physical device capacity */
- space_map_obj_t vdev_dtl; /* dirty time log on-disk state */
- txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
- uint64_t vdev_wholedisk; /* true if this is a whole disk */
- uint64_t vdev_offline; /* device taken offline? */
- uint64_t vdev_nparity; /* number of parity devices for raidz */
- char *vdev_path; /* vdev path (if any) */
- char *vdev_devid; /* vdev devid (if any) */
-	uint64_t	vdev_fault_arg;	/* fault injection parameter */
- int vdev_fault_mask; /* zio types to fault */
- uint8_t vdev_fault_mode; /* fault injection mode */
- uint8_t vdev_cache_active; /* vdev_cache and vdev_queue */
- uint8_t vdev_tmpoffline; /* device taken offline temporarily? */
- uint8_t vdev_detached; /* device detached? */
- uint64_t vdev_isspare; /* was a hot spare */
- vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
- vdev_cache_t vdev_cache; /* physical block cache */
- uint64_t vdev_not_present; /* not present during import */
- hrtime_t vdev_last_try; /* last reopen time */
- boolean_t vdev_nowritecache; /* true if flushwritecache failed */
-
- /*
- * For DTrace to work in userland (libzpool) context, these fields must
- * remain at the end of the structure. DTrace will use the kernel's
- * CTF definition for 'struct vdev', and since the size of a kmutex_t is
-	 * larger in userland, the offsets for the remaining fields would be
- * incorrect.
- */
- kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */
- kmutex_t vdev_stat_lock; /* vdev_stat */
-};
-
-#define VDEV_SKIP_SIZE (8 << 10)
-#define VDEV_BOOT_HEADER_SIZE (8 << 10)
-#define VDEV_PHYS_SIZE (112 << 10)
-#define VDEV_UBERBLOCK_RING (128 << 10)
-
-#define VDEV_UBERBLOCK_SHIFT(vd) \
- MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)
-#define VDEV_UBERBLOCK_COUNT(vd) \
- (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
-#define VDEV_UBERBLOCK_OFFSET(vd, n) \
- offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
-#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
-
-/* ZFS boot block */
-#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL
-#define VDEV_BOOT_VERSION 1 /* version number */
-
-typedef struct vdev_boot_header {
- uint64_t vb_magic; /* VDEV_BOOT_MAGIC */
- uint64_t vb_version; /* VDEV_BOOT_VERSION */
- uint64_t vb_offset; /* start offset (bytes) */
- uint64_t vb_size; /* size (bytes) */
- char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)];
-} vdev_boot_header_t;
-
-typedef struct vdev_phys {
- char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
- zio_block_tail_t vp_zbt;
-} vdev_phys_t;
-
-typedef struct vdev_label {
- char vl_pad[VDEV_SKIP_SIZE]; /* 8K */
- vdev_boot_header_t vl_boot_header; /* 8K */
- vdev_phys_t vl_vdev_phys; /* 112K */
- char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
-} vdev_label_t; /* 256K total */
-
-/*
- * vdev_dirty() flags
- */
-#define VDD_METASLAB 0x01
-#define VDD_DTL 0x02
-
-/*
- * Size and offset of embedded boot loader region on each label.
- * The total size of the first two labels plus the boot area is 4MB.
- */
-#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
-#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */
-
-/*
- * Size of label regions at the start and end of each leaf device.
- */
-#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
-#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
-#define VDEV_LABELS 4
-
-#define VDEV_ALLOC_LOAD 0
-#define VDEV_ALLOC_ADD 1
-#define VDEV_ALLOC_SPARE 2
-
-/*
- * Allocate or free a vdev
- */
-extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
- vdev_t *parent, uint_t id, int alloctype);
-extern void vdev_free(vdev_t *vd);
-
-/*
- * Add or remove children and parents
- */
-extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
-extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
-extern void vdev_compact_children(vdev_t *pvd);
-extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
-extern void vdev_remove_parent(vdev_t *cvd);
-
-/*
- * vdev sync load and sync
- */
-extern void vdev_load(vdev_t *vd);
-extern void vdev_sync(vdev_t *vd, uint64_t txg);
-extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
-extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
-
-/*
- * Available vdev types.
- */
-extern vdev_ops_t vdev_root_ops;
-extern vdev_ops_t vdev_mirror_ops;
-extern vdev_ops_t vdev_replacing_ops;
-extern vdev_ops_t vdev_raidz_ops;
-#ifdef _KERNEL
-extern vdev_ops_t vdev_geom_ops;
-#else
-extern vdev_ops_t vdev_disk_ops;
-extern vdev_ops_t vdev_file_ops;
-#endif
-extern vdev_ops_t vdev_missing_ops;
-extern vdev_ops_t vdev_spare_ops;
-
-/*
- * Common size functions
- */
-extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
-extern uint64_t vdev_get_rsize(vdev_t *vd);
-
-/*
- * zdb uses this tunable, so it must be declared here to make lint happy.
- */
-extern int zfs_vdev_cache_size;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_IMPL_H */
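
The label geometry falls out of the constants above: 8K skip + 8K boot header + 112K nvlist + 128K uberblock ring = 256K per label, with two labels plus the 3.5M boot region at the front of the device and two more labels at the back. A standalone sketch computing the layout and the uberblock ring slotting for a few ashift values:

/* Standalone sketch of the vdev label geometry. */
#include <stdio.h>
#include <stdint.h>

#define	VDEV_SKIP_SIZE		(8 << 10)
#define	VDEV_BOOT_HEADER_SIZE	(8 << 10)
#define	VDEV_PHYS_SIZE		(112 << 10)
#define	VDEV_UBERBLOCK_RING	(128 << 10)
#define	VDEV_LABEL_SIZE		(VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE + \
				VDEV_PHYS_SIZE + VDEV_UBERBLOCK_RING)
#define	VDEV_BOOT_SIZE		(7ULL << 19)	/* 3.5M */
#define	UBERBLOCK_SHIFT		10

int
main(void)
{
	printf("label size: %d bytes (expect 256K)\n", VDEV_LABEL_SIZE);
	printf("start region: %llu bytes (2 labels + boot area)\n",
	    (unsigned long long)(2 * VDEV_LABEL_SIZE + VDEV_BOOT_SIZE));

	for (int ashift = 9; ashift <= 13; ashift++) {
		/* MAX(ashift, UBERBLOCK_SHIFT), as VDEV_UBERBLOCK_SHIFT */
		int shift = ashift > UBERBLOCK_SHIFT ? ashift :
		    UBERBLOCK_SHIFT;
		printf("ashift %d: %d uberblock slots of %d bytes\n",
		    ashift, VDEV_UBERBLOCK_RING >> shift, 1 << shift);
	}
	return (0);
}

Small-ashift devices get many 1K uberblock slots; a 4K-sector device gets 32 slots of 4K each from the same 128K ring.
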
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
deleted file mode 100644
index f89d938..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZAP_H
-#define _SYS_ZAP_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * ZAP - ZFS Attribute Processor
- *
- * The ZAP is a module which sits on top of the DMU (Data Management
- * Unit) and implements a higher-level storage primitive using DMU
- * objects. Its primary consumer is the ZPL (ZFS Posix Layer).
- *
- * A "zapobj" is a DMU object which the ZAP uses to store attributes.
- * Users should use only zap routines to access a zapobj - they should
- * not access the DMU object directly using DMU routines.
- *
- * The attributes stored in a zapobj are name-value pairs. The name is
- * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including
- * terminating NULL). The value is an array of integers, which may be
- * 1, 2, 4, or 8 bytes long. The total space used by the array (number
- * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes.
- * Note that an 8-byte integer value can be used to store the location
- * (object number) of another dmu object (which may be itself a zapobj).
- * Note that you can use a zero-length attribute to store a single bit
- * of information - the attribute is present or not.
- *
- * The ZAP routines are thread-safe. However, you must observe the
- * DMU's restriction that a transaction may not be operated on
- * concurrently.
- *
- * Any of the routines that return an int may return an I/O error (EIO
- * or ECHECKSUM).
- *
- *
- * Implementation / Performance Notes:
- *
- * The ZAP is intended to operate most efficiently on attributes with
- * short (49 bytes or less) names and single 8-byte values, for which
- * the microzap will be used. The ZAP should be efficient enough so
- * that the user does not need to cache these attributes.
- *
- * The ZAP's locking scheme makes its routines thread-safe. Operations
- * on different zapobjs will be processed concurrently. Operations on
- * the same zapobj which only read data will be processed concurrently.
- * Operations on the same zapobj which modify data will be processed
- * concurrently when there are many attributes in the zapobj (because
- * the ZAP uses per-block locking - more than 128 * (number of cpus)
- * small attributes will suffice).
- */
-
-/*
- * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C
- * strings) for the names of attributes, rather than a byte string
- * bounded by an explicit length. If some day we want to support names
- * in character sets which have embedded zeros (eg. UTF-16, UTF-32),
- * we'll have to add routines for using length-bounded strings.
- */
-
-#include <sys/dmu.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ZAP_MAXNAMELEN 256
-#define ZAP_MAXVALUELEN 1024
-
-/*
- * Create a new zapobj with no attributes and return its object number.
- */
-uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-
-/*
- * Create a new zapobj with no attributes from the given (unallocated)
- * object number.
- */
-int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-
-/*
- * The zapobj passed in must be a valid ZAP object for all of the
- * following routines.
- */
-
-/*
- * Destroy this zapobj and all its attributes.
- *
- * Frees the object number using dmu_object_free.
- */
-int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
-
-/*
- * Manipulate attributes.
- *
- * 'integer_size' is in bytes, and must be 1, 2, 4, or 8.
- */
-
-/*
- * Retrieve the contents of the attribute with the given name.
- *
- * If the requested attribute does not exist, the call will fail and
- * return ENOENT.
- *
- * If 'integer_size' is smaller than the attribute's integer size, the
- * call will fail and return EINVAL.
- *
- * If 'integer_size' is equal to or larger than the attribute's integer
- * size, the call will succeed and return 0.  When converting to a
- * larger integer size, the integers will be treated as unsigned (ie. no
- * sign-extension will be performed).
- *
- * 'num_integers' is the length (in integers) of 'buf'.
- *
- * If the attribute is longer than the buffer, as many integers as will
- * fit will be transferred to 'buf'. If the entire attribute was not
- * transferred, the call will return EOVERFLOW.
- */
-int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf);
-
-/*
- * Create an attribute with the given name and value.
- *
- * If an attribute with the given name already exists, the call will
- * fail and return EEXIST.
- */
-int zap_add(objset_t *ds, uint64_t zapobj, const char *name,
- int integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx);
-
-/*
- * Set the attribute with the given name to the given value. If an
- * attribute with the given name does not exist, it will be created. If
- * an attribute with the given name already exists, the previous value
- * will be overwritten. The integer_size may be different from the
- * existing attribute's integer size, in which case the attribute's
- * integer size will be updated to the new value.
- */
-int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
- int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
-
-/*
- * Get the length (in integers) and the integer size of the specified
- * attribute.
- *
- * If the requested attribute does not exist, the call will fail and
- * return ENOENT.
- */
-int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
- uint64_t *integer_size, uint64_t *num_integers);
-
-/*
- * Remove the specified attribute.
- *
- * If the specified attribute does not exist, the call will fail and
- * return ENOENT.
- */
-int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
-
-/*
- * Returns (in *count) the number of attributes in the specified zap
- * object.
- */
-int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
-
-
-/*
- * Returns (in name) the name of the entry whose value
- * (za_first_integer) is value, or ENOENT if not found. The string
- * pointed to by name must be at least 256 bytes long.
- */
-int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name);
-
-struct zap;
-struct zap_leaf;
-typedef struct zap_cursor {
- /* This structure is opaque! */
- objset_t *zc_objset;
- struct zap *zc_zap;
- struct zap_leaf *zc_leaf;
- uint64_t zc_zapobj;
- uint64_t zc_hash;
- uint32_t zc_cd;
-} zap_cursor_t;
-
-typedef struct {
- int za_integer_length;
- uint64_t za_num_integers;
- uint64_t za_first_integer; /* no sign extension for <8byte ints */
- char za_name[MAXNAMELEN];
-} zap_attribute_t;
-
-/*
- * The interface for listing all the attributes of a zapobj can be
- * thought of as cursor moving down a list of the attributes one by
- * one. The cookie returned by the zap_cursor_serialize routine is
- * persistent across system calls (and across reboot, even).
- */
-
-/*
- * Initialize a zap cursor, pointing to the "first" attribute of the
- * zapobj. You must _fini the cursor when you are done with it.
- */
-void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
-void zap_cursor_fini(zap_cursor_t *zc);
-
-/*
- * Get the attribute currently pointed to by the cursor. Returns
- * ENOENT if at the end of the attributes.
- */
-int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za);
-
-/*
- * Advance the cursor to the next attribute.
- */
-void zap_cursor_advance(zap_cursor_t *zc);
-
-/*
- * Get a persistent cookie pointing to the current position of the zap
- * cursor. The low 4 bits in the cookie are always zero, and thus can
- * be used to differentiate a serialized cookie from a different type
- * of value. The cookie will be less than 2^32 as long as there are
- * fewer than 2^22 (4.2 million) entries in the zap object.
- */
-uint64_t zap_cursor_serialize(zap_cursor_t *zc);
-
-/*
- * Initialize a zap cursor pointing to the position recorded by
- * zap_cursor_serialize (in the "serialized" argument). You can also
- * use a "serialized" argument of 0 to start at the beginning of the
- * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to
- * zap_cursor_init(...).)
- */
-void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds,
- uint64_t zapobj, uint64_t serialized);
-
-
-#define ZAP_HISTOGRAM_SIZE 10
-
-typedef struct zap_stats {
- /*
- * Size of the pointer table (in number of entries).
- * This is always a power of 2, or zero if it's a microzap.
- * In general, it should be considerably greater than zs_num_leafs.
- */
- uint64_t zs_ptrtbl_len;
-
- uint64_t zs_blocksize; /* size of zap blocks */
-
- /*
- * The number of blocks used. Note that some blocks may be
- * wasted because old ptrtbl's and large name/value blocks are
- * not reused. (Although their space is reclaimed, we don't
- * reuse those offsets in the object.)
- */
- uint64_t zs_num_blocks;
-
- /*
- * Pointer table values from zap_ptrtbl in the zap_phys_t
- */
- uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */
-	uint64_t zs_ptrtbl_blks_copied;	/* number of source blocks copied */
- uint64_t zs_ptrtbl_zt_blk; /* starting block number */
- uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */
- uint64_t zs_ptrtbl_zt_shift; /* bits to index it */
-
- /*
- * Values of the other members of the zap_phys_t
- */
- uint64_t zs_block_type; /* ZBT_HEADER */
- uint64_t zs_magic; /* ZAP_MAGIC */
- uint64_t zs_num_leafs; /* The number of leaf blocks */
- uint64_t zs_num_entries; /* The number of zap entries */
- uint64_t zs_salt; /* salt to stir into hash function */
-
- /*
- * Histograms. For all histograms, the last index
- * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater
- * than what can be represented. For example
-	 * zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number
-	 * of leafs with 45 or more entries.
- */
-
- /*
-	 * zs_leafs_with_2n_pointers[n] is the number of leafs with
- * 2^n pointers to it.
- */
- uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE];
-
- /*
-	 * zs_blocks_with_n5_entries[n] is the number of leafs with
- * [n*5, (n+1)*5) entries. In the current implementation, there
- * can be at most 55 entries in any block, but there may be
- * fewer if the name or value is large, or the block is not
- * completely full.
- */
- uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE];
-
- /*
-	 * zs_blocks_n_tenths_full[n] is the number of leafs whose
- * fullness is in the range [n/10, (n+1)/10).
- */
- uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE];
-
- /*
- * zs_entries_using_n_chunks[n] is the number of entries which
- * consume n 24-byte chunks. (Note, large names/values only use
- * one chunk, but contribute to zs_num_blocks_large.)
- */
- uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE];
-
- /*
- * zs_buckets_with_n_entries[n] is the number of buckets (each
- * leaf has 64 buckets) with n entries.
- * zs_buckets_with_n_entries[1] should be very close to
- * zs_num_entries.
- */
- uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE];
-} zap_stats_t;
-
-/*
- * Get statistics about a ZAP object. Note: you need to be aware of the
- * internal implementation of the ZAP to correctly interpret some of the
- * statistics. This interface shouldn't be relied on unless you really
- * know what you're doing.
- */
-int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZAP_H */
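
The cursor interface above is normally used as a for-loop idiom: init, retrieve until ENOENT, advance, fini. A usage sketch of that loop; it assumes the in-tree <sys/zap.h> and an already-open objset, so unlike the other sketches it is not compilable standalone, and printf stands in for whatever output mechanism the caller has:

/* Usage sketch: walk every attribute of a zapobj. */
static void
dump_zapobj(objset_t *os, uint64_t zapobj)
{
	zap_cursor_t zc;
	zap_attribute_t za;

	for (zap_cursor_init(&zc, os, zapobj);
	    zap_cursor_retrieve(&zc, &za) == 0;	/* ENOENT ends the walk */
	    zap_cursor_advance(&zc)) {
		printf("%s: %llu x %d-byte integers, first = %llu\n",
		    za.za_name,
		    (unsigned long long)za.za_num_integers,
		    za.za_integer_length,
		    (unsigned long long)za.za_first_integer);
	}
	zap_cursor_fini(&zc);
}
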
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
deleted file mode 100644
index 4e43f4a..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZAP_IMPL_H
-#define _SYS_ZAP_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zap.h>
-#include <sys/zfs_context.h>
-#include <sys/avl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern int fzap_default_block_shift;
-
-#define ZAP_MAGIC 0x2F52AB2ABULL
-
-#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift)
-
-#define ZAP_MAXCD (uint32_t)(-1)
-#define ZAP_HASHBITS 28
-#define MZAP_ENT_LEN 64
-#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
-#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
-#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
-
-typedef struct mzap_ent_phys {
- uint64_t mze_value;
- uint32_t mze_cd;
- uint16_t mze_pad; /* in case we want to chain them someday */
- char mze_name[MZAP_NAME_LEN];
-} mzap_ent_phys_t;
-
-typedef struct mzap_phys {
- uint64_t mz_block_type; /* ZBT_MICRO */
- uint64_t mz_salt;
- uint64_t mz_pad[6];
- mzap_ent_phys_t mz_chunk[1];
- /* actually variable size depending on block size */
-} mzap_phys_t;
-
-typedef struct mzap_ent {
- avl_node_t mze_node;
- int mze_chunkid;
- uint64_t mze_hash;
- mzap_ent_phys_t mze_phys;
-} mzap_ent_t;
-
-
-/*
- * The (fat) zap is stored in one object. It is an array of
- * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
- *
- * ptrtbl fits in first block:
- * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
- *
- * ptrtbl too big for first block:
- * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
- *
- */
-
-struct dmu_buf;
-struct zap_leaf;
-
-#define ZBT_LEAF ((1ULL << 63) + 0)
-#define ZBT_HEADER ((1ULL << 63) + 1)
-#define ZBT_MICRO ((1ULL << 63) + 3)
-/* any other values are ptrtbl blocks */
-
-/*
- * the embedded pointer table takes up half a block:
- * block size / entry size (2^3) / 2
- */
-#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1)
-
-/*
- * The embedded pointer table starts half-way through the block. Since
- * the pointer table itself is half the block, it starts at (64-bit)
- * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
- */
-#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
- ((uint64_t *)(zap)->zap_f.zap_phys) \
- [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
-
-/*
- * TAKE NOTE:
- * If zap_phys_t is modified, zap_byteswap() must be modified.
- */
-typedef struct zap_phys {
- uint64_t zap_block_type; /* ZBT_HEADER */
- uint64_t zap_magic; /* ZAP_MAGIC */
-
- struct zap_table_phys {
- uint64_t zt_blk; /* starting block number */
- uint64_t zt_numblks; /* number of blocks */
- uint64_t zt_shift; /* bits to index it */
- uint64_t zt_nextblk; /* next (larger) copy start block */
-		uint64_t zt_blks_copied;	/* number of source blocks copied */
- } zap_ptrtbl;
-
- uint64_t zap_freeblk; /* the next free block */
- uint64_t zap_num_leafs; /* number of leafs */
- uint64_t zap_num_entries; /* number of entries */
- uint64_t zap_salt; /* salt to stir into hash function */
- /*
- * This structure is followed by padding, and then the embedded
- * pointer table. The embedded pointer table takes up second
- * half of the block. It is accessed using the
- * ZAP_EMBEDDED_PTRTBL_ENT() macro.
- */
-} zap_phys_t;
-
-typedef struct zap_table_phys zap_table_phys_t;
-
-typedef struct zap {
- objset_t *zap_objset;
- uint64_t zap_object;
- struct dmu_buf *zap_dbuf;
- krwlock_t zap_rwlock;
- int zap_ismicro;
- uint64_t zap_salt;
- union {
- struct {
- zap_phys_t *zap_phys;
-
- /*
- * zap_num_entries_mtx protects
- * zap_num_entries
- */
- kmutex_t zap_num_entries_mtx;
- int zap_block_shift;
- } zap_fat;
- struct {
- mzap_phys_t *zap_phys;
- int16_t zap_num_entries;
- int16_t zap_num_chunks;
- int16_t zap_alloc_next;
- avl_tree_t zap_avl;
- } zap_micro;
- } zap_u;
-} zap_t;
-
-#define zap_f zap_u.zap_fat
-#define zap_m zap_u.zap_micro
-
-uint64_t zap_hash(zap_t *zap, const char *name);
-int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
- krw_t lti, int fatreader, zap_t **zapp);
-void zap_unlockdir(zap_t *zap);
-void zap_evict(dmu_buf_t *db, void *vmzap);
-
-#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
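ZAP_HASH_IDX() selects a pointer-table slot from the top n bits of the 64-bit hash; a small standalone illustration (assumed values, not from the source):

#include <stdint.h>

/* Same computation as ZAP_HASH_IDX(), written out as a function. */
static uint64_t
hash_idx(uint64_t hash, int n)
{
	return (n == 0 ? 0 : hash >> (64 - n));
}
/* e.g. with a 10-bit table, hash_idx(0xFFC0000000000000ULL, 10) == 0x3ff. */
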
-
-void fzap_byteswap(void *buf, size_t size);
-int fzap_count(zap_t *zap, uint64_t *count);
-int fzap_lookup(zap_t *zap, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf);
-int fzap_add(zap_t *zap, const char *name,
- uint64_t integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx);
-int fzap_update(zap_t *zap, const char *name,
- int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
-int fzap_length(zap_t *zap, const char *name,
- uint64_t *integer_size, uint64_t *num_integers);
-int fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx);
-int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
-void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
-void zap_put_leaf(struct zap_leaf *l);
-
-int fzap_add_cd(zap_t *zap, const char *name,
- uint64_t integer_size, uint64_t num_integers,
- const void *val, uint32_t cd, dmu_tx_t *tx);
-void fzap_upgrade(zap_t *zap, dmu_tx_t *tx);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZAP_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
deleted file mode 100644
index 147fb72..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZAP_LEAF_H
-#define _SYS_ZAP_LEAF_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct zap;
-
-#define ZAP_LEAF_MAGIC 0x2AB1EAF
-
-/* chunk size = 24 bytes */
-#define ZAP_LEAF_CHUNKSIZE 24
-
-/*
- * The amount of space available for chunks is:
- * block size (1<<l->l_bs) - hash entry size (2) * number of hash
- * entries - header space (2*chunksize)
- */
-#define ZAP_LEAF_NUMCHUNKS(l) \
- (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
- ZAP_LEAF_CHUNKSIZE - 2)
-
-/*
- * The amount of space within the chunk available for the array is:
- * chunk size - space for type (1) - space for next pointer (2)
- */
-#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)
-
-#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \
- (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
-
-/*
- * Low water mark: when there are only this many chunks free, start
- * growing the ptrtbl. Ideally, this should be larger than a
- * "reasonably-sized" entry. 20 chunks is more than enough for the
- * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value),
- * while still being only around 3% of the chunks in a 16k block.
- */
-#define ZAP_LEAF_LOW_WATER (20)
-
-/*
- * The leaf hash table has block size / 2^5 (32) number of entries,
- * which should be more than enough for the maximum number of entries,
- * which is less than block size / CHUNKSIZE (24) / minimum number of
- * chunks per entry (3).
- */
-#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5)
-#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l))
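Plugging an assumed 16K leaf (l_bs == 14) into the macros above makes the geometry concrete; a worked example, not authoritative:

#include <assert.h>

/* Illustrative arithmetic only; mirrors the ZAP_LEAF_* macros for l_bs == 14. */
static void
leaf_geometry_example(void)
{
	int bs = 14;
	int hash_entries = 1 << (bs - 5);			/* 512 */
	int nchunks = ((1 << bs) - 2 * hash_entries) / 24 - 2;	/* 638 */
	int name_chunks = (256 + 20) / 21;	/* MAXNAMELEN name -> 13 */
	int value_chunks = (8 + 20) / 21;	/* 8-byte value -> 1 */

	assert(hash_entries == 512 && nchunks == 638);
	/* 1 entry chunk + 13 name + 1 value = 15 < ZAP_LEAF_LOW_WATER (20). */
	assert(1 + name_chunks + value_chunks == 15);
}
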
-
-/*
- * The chunks start immediately after the hash table. The end of the
- * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
- * zap_leaf_chunk_t.
- */
-#define ZAP_LEAF_CHUNK(l, idx) \
- ((zap_leaf_chunk_t *) \
- ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
-#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
-
-typedef enum zap_chunk_type {
- ZAP_CHUNK_FREE = 253,
- ZAP_CHUNK_ENTRY = 252,
- ZAP_CHUNK_ARRAY = 251,
- ZAP_CHUNK_TYPE_MAX = 250
-} zap_chunk_type_t;
-
-/*
- * TAKE NOTE:
- * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
- */
-typedef struct zap_leaf_phys {
- struct zap_leaf_header {
- uint64_t lh_block_type; /* ZBT_LEAF */
- uint64_t lh_pad1;
- uint64_t lh_prefix; /* hash prefix of this leaf */
- uint32_t lh_magic; /* ZAP_LEAF_MAGIC */
- uint16_t lh_nfree; /* number of free chunks */
- uint16_t lh_nentries; /* number of entries */
- uint16_t lh_prefix_len; /* num bits used to id this */
-
-/* above is accessible to zap, below is zap_leaf private */
-
- uint16_t lh_freelist; /* chunk head of free list */
- uint8_t lh_pad2[12];
- } l_hdr; /* 2 24-byte chunks */
-
- /*
- * The header is followed by a hash table with
- * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is
- * followed by an array of ZAP_LEAF_NUMCHUNKS(zap)
- * zap_leaf_chunk structures. These structures are accessed
- * with the ZAP_LEAF_CHUNK() macro.
- */
-
- uint16_t l_hash[1];
-} zap_leaf_phys_t;
-
-typedef union zap_leaf_chunk {
- struct zap_leaf_entry {
- uint8_t le_type; /* always ZAP_CHUNK_ENTRY */
- uint8_t le_int_size; /* size of ints */
- uint16_t le_next; /* next entry in hash chain */
- uint16_t le_name_chunk; /* first chunk of the name */
- uint16_t le_name_length; /* bytes in name, incl null */
- uint16_t le_value_chunk; /* first chunk of the value */
- uint16_t le_value_length; /* value length in ints */
- uint32_t le_cd; /* collision differentiator */
- uint64_t le_hash; /* hash value of the name */
- } l_entry;
- struct zap_leaf_array {
- uint8_t la_type; /* always ZAP_CHUNK_ARRAY */
- uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
- uint16_t la_next; /* next blk or CHAIN_END */
- } l_array;
- struct zap_leaf_free {
- uint8_t lf_type; /* always ZAP_CHUNK_FREE */
- uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
- uint16_t lf_next; /* next in free list, or CHAIN_END */
- } l_free;
-} zap_leaf_chunk_t;
-
-typedef struct zap_leaf {
- krwlock_t l_rwlock; /* only used on head of chain */
- uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */
- int l_bs; /* block size shift */
- dmu_buf_t *l_dbuf;
- zap_leaf_phys_t *l_phys;
-} zap_leaf_t;
-
-
-typedef struct zap_entry_handle {
- /* below is set by zap_leaf.c and is public to zap.c */
- uint64_t zeh_num_integers;
- uint64_t zeh_hash;
- uint32_t zeh_cd;
- uint8_t zeh_integer_size;
-
- /* below is private to zap_leaf.c */
- uint16_t zeh_fakechunk;
- uint16_t *zeh_chunkp;
- zap_leaf_t *zeh_leaf;
-} zap_entry_handle_t;
-
-/*
- * Return a handle to the named entry, or ENOENT if not found. The hash
- * value must equal zap_hash(name).
- */
-extern int zap_leaf_lookup(zap_leaf_t *l,
- const char *name, uint64_t h, zap_entry_handle_t *zeh);
-
-/*
- * Return a handle to the entry with this hash+cd, or the entry with the
- * next closest hash+cd.
- */
-extern int zap_leaf_lookup_closest(zap_leaf_t *l,
- uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh);
-
-/*
- * Read the first num_integers in the attribute. Integer size
- * conversion will be done without sign extension. Return EINVAL if
- * integer_size is too small. Return EOVERFLOW if there are more than
- * num_integers in the attribute.
- */
-extern int zap_entry_read(const zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, void *buf);
-
-extern int zap_entry_read_name(const zap_entry_handle_t *zeh,
- uint16_t buflen, char *buf);
-
-/*
- * Replace the value of an existing entry.
- *
- * zap_entry_update may fail if it runs out of space (ENOSPC).
- */
-extern int zap_entry_update(zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, const void *buf);
-
-/*
- * Remove an entry.
- */
-extern void zap_entry_remove(zap_entry_handle_t *zeh);
-
-/*
- * Create an entry. An equal entry must not exist, and this entry must
- * belong in this leaf (according to its hash value). Fills in the
- * entry handle on success. Returns 0 on success or ENOSPC on failure.
- */
-extern int zap_entry_create(zap_leaf_t *l,
- const char *name, uint64_t h, uint32_t cd,
- uint8_t integer_size, uint64_t num_integers, const void *buf,
- zap_entry_handle_t *zeh);
-
-/*
- * Other stuff.
- */
-
-extern void zap_leaf_init(zap_leaf_t *l);
-extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len);
-extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl);
-extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZAP_LEAF_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
deleted file mode 100644
index 3250b76..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FS_ZFS_ACL_H
-#define _SYS_FS_ZFS_ACL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef _KERNEL
-#include <sys/cred.h>
-#endif
-#include <sys/acl.h>
-#include <sys/dmu.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct znode_phys;
-
-#define ACCESS_UNDETERMINED -1
-
-#define ACE_SLOT_CNT 6
-
-typedef struct zfs_znode_acl {
- uint64_t z_acl_extern_obj; /* ext acl pieces */
- uint32_t z_acl_count; /* Number of ACEs */
- uint16_t z_acl_version; /* acl version */
- uint16_t z_acl_pad; /* pad */
- ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */
-} zfs_znode_acl_t;
-
-#define ACL_DATA_ALLOCED 0x1
-
-/*
- * Max ACL size is a prepended deny for each entry plus the
- * canonical six ACEs tacked on at the end.
- */
-#define MAX_ACL_SIZE (MAX_ACL_ENTRIES * 2 + 6)
-
-typedef struct zfs_acl {
- int z_slots; /* number of allocated slots for ACEs */
- int z_acl_count;
- uint_t z_state;
- ace_t *z_acl;
-} zfs_acl_t;
-
-#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt))
-
-/*
- * Property values for acl_mode and acl_inherit.
- *
- * acl_mode can take discard, noallow, groupmask and passthrough.
- * whereas acl_inherit has secure instead of groupmask.
- */
-
-#define ZFS_ACL_DISCARD 0
-#define ZFS_ACL_NOALLOW 1
-#define ZFS_ACL_GROUPMASK 2
-#define ZFS_ACL_PASSTHROUGH 3
-#define ZFS_ACL_SECURE 4
-
-struct znode;
-
-#ifdef _KERNEL
-void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *,
- dmu_tx_t *, cred_t *);
-#ifdef TODO
-int zfs_getacl(struct znode *, vsecattr_t *, cred_t *);
-#endif
-int zfs_mode_update(struct znode *, uint64_t, dmu_tx_t *);
-#ifdef TODO
-int zfs_setacl(struct znode *, vsecattr_t *, cred_t *);
-#endif
-void zfs_acl_rele(void *);
-void zfs_ace_byteswap(ace_t *, int);
-extern int zfs_zaccess(struct znode *, int, cred_t *);
-extern int zfs_zaccess_rwx(struct znode *, mode_t, cred_t *);
-extern int zfs_acl_access(struct znode *, int, cred_t *);
-int zfs_acl_chmod_setattr(struct znode *, uint64_t, dmu_tx_t *);
-int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
-int zfs_zaccess_rename(struct znode *, struct znode *,
- struct znode *, struct znode *, cred_t *cr);
-int zfs_zaccess_v4_perm(struct znode *, int, cred_t *);
-void zfs_acl_free(zfs_acl_t *);
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_ACL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
deleted file mode 100644
index 4deeb3c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZFS_CONTEXT_H
-#define _SYS_ZFS_CONTEXT_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/param.h>
-#include <sys/stdint.h>
-#include <sys/note.h>
-#include <sys/kernel.h>
-#include <sys/debug.h>
-#include <sys/systm.h>
-#include <sys/proc.h>
-#include <sys/sysmacros.h>
-#include <sys/bitmap.h>
-#include <sys/cmn_err.h>
-#include <sys/kmem.h>
-#include <sys/taskq.h>
-#include <sys/conf.h>
-#include <sys/mutex.h>
-#include <sys/rwlock.h>
-#include <sys/random.h>
-#include <sys/byteorder.h>
-#include <sys/list.h>
-#include <sys/uio.h>
-#include <sys/dirent.h>
-#include <sys/time.h>
-#include <sys/fcntl.h>
-#include <sys/limits.h>
-#include <sys/string.h>
-#include <sys/bio.h>
-#include <sys/buf.h>
-#include <sys/cred.h>
-#include <sys/sdt.h>
-#include <sys/file.h>
-#include <sys/vfs.h>
-#include <sys/sysctl.h>
-#include <sys/sbuf.h>
-#include <sys/priv.h>
-#include <sys/kdb.h>
-#include <sys/ktr.h>
-#include <sys/stack.h>
-#include <sys/lockf.h>
-#include <sys/policy.h>
-#include <sys/zone.h>
-#include <sys/eventhandler.h>
-#include <sys/misc.h>
-#include <sys/zfs_debug.h>
-
-#include <machine/stdarg.h>
-
-#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <vm/vm_object.h>
-#include <vm/vm_pager.h>
-#include <vm/vm_kern.h>
-#include <vm/vm_map.h>
-/* There is a clash: vm_map.h defines the two macros below and vdev_cache.c uses them. */
-#ifdef min_offset
-#undef min_offset
-#endif
-#ifdef max_offset
-#undef max_offset
-#endif
-#include <vm/vm_extern.h>
-#include <vm/vnode_pager.h>
-
-#define CPU_SEQID (curcpu)
-
-#ifdef __cplusplus
-}
-#endif
-
-extern int zfs_debug_level;
-extern struct mtx zfs_debug_mtx;
-#define ZFS_LOG(lvl, ...) do { \
- if (((lvl) & 0xff) <= zfs_debug_level) { \
- mtx_lock(&zfs_debug_mtx); \
- printf("%s:%u[%d]: ", __func__, __LINE__, (lvl)); \
- printf(__VA_ARGS__); \
- printf("\n"); \
- if ((lvl) & 0x100) \
- kdb_backtrace(); \
- mtx_unlock(&zfs_debug_mtx); \
- } \
-} while (0)
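A hedged usage sketch: the low byte of the level gates the message against zfs_debug_level, and the 0x100 bit additionally requests a backtrace (the error variable is hypothetical):

/* Illustrative only; assumes this header is included. */
static void
zfs_log_example(int error)
{
	ZFS_LOG(1, "open failed, error=%d", error);	/* shown if level >= 1 */
	ZFS_LOG(2 | 0x100, "rare path");	/* shown if level >= 2, plus backtrace */
}
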
-
-#endif /* _SYS_ZFS_CONTEXT_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
deleted file mode 100644
index a676533..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _ZFS_CTLDIR_H
-#define _ZFS_CTLDIR_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/vnode.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/zfs_znode.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ZFS_CTLDIR_NAME ".zfs"
-
-#define zfs_has_ctldir(zdp) \
- ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \
- ((zdp)->z_zfsvfs->z_ctldir != NULL))
-#define zfs_show_ctldir(zdp) \
- (zfs_has_ctldir(zdp) && \
- ((zdp)->z_zfsvfs->z_show_ctldir))
-
-void zfsctl_create(zfsvfs_t *);
-void zfsctl_destroy(zfsvfs_t *);
-vnode_t *zfsctl_root(znode_t *);
-void zfsctl_init(void);
-void zfsctl_fini(void);
-
-int zfsctl_rename_snapshot(const char *from, const char *to);
-int zfsctl_destroy_snapshot(const char *snapname, int force);
-int zfsctl_umount_snapshots(vfs_t *, int, cred_t *);
-
-int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
- int flags, vnode_t *rdir, cred_t *cr);
-
-int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
-
-#define ZFSCTL_INO_ROOT 0x1
-#define ZFSCTL_INO_SNAPDIR 0x2
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZFS_CTLDIR_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
deleted file mode 100644
index 450ac1c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZFS_DEBUG_H
-#define _SYS_ZFS_DEBUG_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef TRUE
-#define TRUE 1
-#endif
-
-#ifndef FALSE
-#define FALSE 0
-#endif
-
-/*
- * ZFS debugging
- */
-
-#if defined(DEBUG) || !defined(_KERNEL)
-#define ZFS_DEBUG
-#endif
-
-extern int zfs_flags;
-
-#define ZFS_DEBUG_DPRINTF 0x0001
-#define ZFS_DEBUG_DBUF_VERIFY 0x0002
-#define ZFS_DEBUG_DNODE_VERIFY 0x0004
-#define ZFS_DEBUG_SNAPNAMES 0x0008
-#define ZFS_DEBUG_MODIFY 0x0010
-
-#ifdef ZFS_DEBUG
-extern void __dprintf(const char *file, const char *func,
- int line, const char *fmt, ...);
-#define dprintf(...) \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) \
- __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__)
-#else
-#define dprintf(...) ((void)0)
-#endif /* ZFS_DEBUG */
-
-extern void zfs_panic_recover(const char *fmt, ...);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZFS_DEBUG_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
deleted file mode 100644
index f60d614..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FS_ZFS_DIR_H
-#define _SYS_FS_ZFS_DIR_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/zfs_znode.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* zfs_dirent_lock() flags */
-#define ZNEW 0x0001 /* entry should not exist */
-#define ZEXISTS 0x0002 /* entry should exist */
-#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */
-#define ZXATTR 0x0008 /* we want the xattr dir */
-#define ZRENAMING 0x0010 /* znode is being renamed */
-
-/* mknode flags */
-#define IS_ROOT_NODE 0x01 /* create a root node */
-#define IS_XATTR 0x02 /* create an extended attribute node */
-#define IS_REPLAY 0x04 /* we are replaying intent log */
-
-extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
- int);
-extern void zfs_dirent_unlock(zfs_dirlock_t *);
-extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int);
-extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
- boolean_t *);
-extern int zfs_dirlook(znode_t *, char *, vnode_t **);
-extern void zfs_mknode(znode_t *, vattr_t *, uint64_t *,
- dmu_tx_t *, cred_t *, uint_t, znode_t **, int);
-extern void zfs_rmnode(znode_t *);
-extern boolean_t zfs_dirempty(znode_t *);
-extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
-extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);
-extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr);
-extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *, int);
-extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_DIR_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
deleted file mode 100644
index 61a0a9e..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZFS_IOCTL_H
-#define _SYS_ZFS_IOCTL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/cred.h>
-#include <sys/dmu.h>
-#include <sys/zio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Property values for snapdir
- */
-#define ZFS_SNAPDIR_HIDDEN 0
-#define ZFS_SNAPDIR_VISIBLE 1
-
-#define DMU_BACKUP_VERSION (1ULL)
-#define DMU_BACKUP_MAGIC 0x2F5bacbacULL
-
-/*
- * zfs ioctl command structure
- */
-typedef struct dmu_replay_record {
- enum {
- DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
- DRR_WRITE, DRR_FREE, DRR_END,
- } drr_type;
- uint32_t drr_pad;
- union {
- struct drr_begin {
- uint64_t drr_magic;
- uint64_t drr_version;
- uint64_t drr_creation_time;
- dmu_objset_type_t drr_type;
- uint32_t drr_pad;
- uint64_t drr_toguid;
- uint64_t drr_fromguid;
- char drr_toname[MAXNAMELEN];
- } drr_begin;
- struct drr_end {
- zio_cksum_t drr_checksum;
- } drr_end;
- struct drr_object {
- uint64_t drr_object;
- dmu_object_type_t drr_type;
- dmu_object_type_t drr_bonustype;
- uint32_t drr_blksz;
- uint32_t drr_bonuslen;
- uint8_t drr_checksum;
- uint8_t drr_compress;
- uint8_t drr_pad[6];
- /* bonus content follows */
- } drr_object;
- struct drr_freeobjects {
- uint64_t drr_firstobj;
- uint64_t drr_numobjs;
- } drr_freeobjects;
- struct drr_write {
- uint64_t drr_object;
- dmu_object_type_t drr_type;
- uint32_t drr_pad;
- uint64_t drr_offset;
- uint64_t drr_length;
- /* content follows */
- } drr_write;
- struct drr_free {
- uint64_t drr_object;
- uint64_t drr_offset;
- uint64_t drr_length;
- } drr_free;
- } drr_u;
-} dmu_replay_record_t;
-
-typedef struct zinject_record {
- uint64_t zi_objset;
- uint64_t zi_object;
- uint64_t zi_start;
- uint64_t zi_end;
- uint64_t zi_guid;
- uint32_t zi_level;
- uint32_t zi_error;
- uint64_t zi_type;
- uint32_t zi_freq;
-} zinject_record_t;
-
-#define ZINJECT_NULL 0x1
-#define ZINJECT_FLUSH_ARC 0x2
-#define ZINJECT_UNLOAD_SPA 0x4
-
-typedef struct zfs_cmd {
- char zc_name[MAXPATHLEN];
- char zc_value[MAXPATHLEN * 2];
- uint64_t zc_guid;
- uint64_t zc_nvlist_src; /* really (char *) */
- uint64_t zc_nvlist_src_size;
- uint64_t zc_nvlist_dst; /* really (char *) */
- uint64_t zc_nvlist_dst_size;
- uint64_t zc_cookie;
- uint64_t zc_cred;
- uint64_t zc_dev;
- uint64_t zc_objset_type;
- uint64_t zc_history; /* really (char *) */
- uint64_t zc_history_len;
- uint64_t zc_history_offset;
- uint64_t zc_obj;
- uint64_t zc_jailid;
- dmu_objset_stats_t zc_objset_stats;
- struct drr_begin zc_begin_record;
- zinject_record_t zc_inject_record;
-} zfs_cmd_t;
-
-#ifdef _KERNEL
-typedef struct zfs_create_data {
- cred_t *zc_cred;
- dev_t zc_dev;
- nvlist_t *zc_props;
-} zfs_create_data_t;
-#endif
-
-#define ZVOL_MAX_MINOR (1 << 16)
-#define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1)
-
-#ifdef _KERNEL
-
-extern int zfs_secpolicy_write(const char *dataset, cred_t *cr);
-extern int zfs_busy(void);
-extern int zfs_unmount_snap(char *, void *);
-
-#endif /* _KERNEL */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZFS_IOCTL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
deleted file mode 100644
index f302b66..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FS_ZFS_RLOCK_H
-#define _SYS_FS_ZFS_RLOCK_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef _KERNEL
-
-#include <sys/zfs_znode.h>
-
-typedef enum {
- RL_READER,
- RL_WRITER,
- RL_APPEND
-} rl_type_t;
-
-typedef struct rl {
- znode_t *r_zp; /* znode this lock applies to */
- avl_node_t r_node; /* avl node link */
- uint64_t r_off; /* file range offset */
- uint64_t r_len; /* file range length */
- uint_t r_cnt; /* range reference count in tree */
- rl_type_t r_type; /* range type */
- kcondvar_t r_wr_cv; /* cv for waiting writers */
- kcondvar_t r_rd_cv; /* cv for waiting readers */
- uint8_t r_proxy; /* acting for original range */
- uint8_t r_write_wanted; /* writer wants to lock this range */
- uint8_t r_read_wanted; /* reader wants to lock this range */
-} rl_t;
-
-/*
- * Lock a range (offset, length) as either shared (READER)
- * or exclusive (WRITER or APPEND). APPEND is a special type that
- * is converted to WRITER, locking from the current end of file
- * onward. zfs_range_lock() returns the range lock structure.
- */
-rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type);
-
-/*
- * Unlock range and destroy range lock structure.
- */
-void zfs_range_unlock(rl_t *rl);
-
-/*
- * Reduce a range locked as RL_WRITER from the whole file to the specified range.
- * Asserts the whole file was previously locked.
- */
-void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len);
-
-/*
- * AVL comparison function used to compare range locks
- */
-int zfs_range_compare(const void *arg1, const void *arg2);
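A hedged sketch of the calling pattern this API implies for a read (the copy logic in the middle is hypothetical):

/* Illustrative read path: lock, check size under the lock, copy, unlock. */
static void
read_range_example(znode_t *zp, uint64_t off, uint64_t len)
{
	rl_t *rl = zfs_range_lock(zp, off, len, RL_READER);

	/* ... compare off against zp->z_phys->zp_size, copy data out ... */

	zfs_range_unlock(rl);
}
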
-
-#endif /* _KERNEL */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_RLOCK_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
deleted file mode 100644
index aa82cc1..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FS_ZFS_VFSOPS_H
-#define _SYS_FS_ZFS_VFSOPS_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/list.h>
-#include <sys/vfs.h>
-#include <sys/zil.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct zfsvfs zfsvfs_t;
-
-struct zfsvfs {
- vfs_t *z_vfs; /* generic fs struct */
- zfsvfs_t *z_parent; /* parent fs */
- objset_t *z_os; /* objset reference */
- uint64_t z_root; /* id of root znode */
- uint64_t z_unlinkedobj; /* id of unlinked zapobj */
- uint64_t z_max_blksz; /* maximum block size for files */
- uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */
- zilog_t *z_log; /* intent log pointer */
- uint_t z_acl_mode; /* acl chmod/mode behavior */
- uint_t z_acl_inherit; /* acl inheritance behavior */
- boolean_t z_atime; /* enable atimes mount option */
- boolean_t z_unmounted1; /* unmounted phase 1 */
- boolean_t z_unmounted2; /* unmounted phase 2 */
- uint32_t z_op_cnt; /* vnode/vfs operations ref count */
- krwlock_t z_um_lock; /* rw lock for umount phase 2 */
- list_t z_all_znodes; /* all vnodes in the fs */
- kmutex_t z_znodes_lock; /* lock for z_all_znodes */
- vnode_t *z_ctldir; /* .zfs directory pointer */
- boolean_t z_show_ctldir; /* expose .zfs in the root dir */
- boolean_t z_issnap; /* true if this is a snapshot */
-#define ZFS_OBJ_MTX_SZ 64
- kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
-};
-
-/*
- * The total file ID size is limited to 12 bytes (including the length
- * field) in the NFSv2 protocol. For historical reasons, this same limit
- * is currently being imposed by the Solaris NFSv3 implementation...
- * although the protocol actually permits a maximum of 64 bytes. It will
- * not be possible to expand beyond 12 bytes without abandoning support
- * of NFSv2 and making some changes to the Solaris NFSv3 implementation.
- *
- * For the time being, we will partition up the available space as follows:
- * 2 bytes fid length (required)
- * 6 bytes object number (48 bits)
- * 4 bytes generation number (32 bits)
- * We reserve only 48 bits for the object number, as this is the limit
- * currently defined and imposed by the DMU.
- */
-typedef struct zfid_short {
- uint16_t zf_len;
- uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */
- uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */
-} zfid_short_t;
-
-typedef struct zfid_long {
- zfid_short_t z_fid;
- uint8_t zf_setid[6]; /* setid[i] = setid >> (8 * i) */
- uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */
-} zfid_long_t;
-
-#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t))
-#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t))
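The obj[i] = obj >> (8 * i) convention in the comments above is little-endian byte packing; a standalone sketch (not from the source):

#include <stdint.h>

/* Illustrative: pack a 48-bit object number into zf_object[]. */
static void
zfid_pack_object(uint8_t zf_object[6], uint64_t obj)
{
	int i;

	for (i = 0; i < 6; i++)
		zf_object[i] = (uint8_t)(obj >> (8 * i));
}
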
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_VFSOPS_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
deleted file mode 100644
index c9c317e..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
+++ /dev/null
@@ -1,298 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FS_ZFS_ZNODE_H
-#define _SYS_FS_ZFS_ZNODE_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef _KERNEL
-#include <sys/list.h>
-#include <sys/dmu.h>
-#include <sys/zfs_vfsops.h>
-#endif
-#include <sys/zfs_acl.h>
-#include <sys/zil.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Define special zfs pflags
- */
-#define ZFS_XATTR 0x1 /* is an extended attribute */
-#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */
-#define ZFS_ACL_TRIVIAL 0x4 /* file's ACL is trivial */
-
-#define MASTER_NODE_OBJ 1
-
-/*
- * special attributes for master node.
- */
-
-#define ZFS_FSID "FSID"
-#define ZFS_UNLINKED_SET "DELETE_QUEUE"
-#define ZFS_ROOT_OBJ "ROOT"
-#define ZPL_VERSION_OBJ "VERSION"
-#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE"
-#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS"
-
-#define ZFS_FLAG_BLOCKPERPAGE 0x1
-#define ZFS_FLAG_NOGROWBLOCKS 0x2
-
-/*
- * ZPL version - rev'd whenever an incompatible on-disk format change
- * occurs. Independent of SPA/DMU/ZAP versioning.
- */
-
-#define ZPL_VERSION 1ULL
-
-#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
-
-/* Path component length */
-/*
- * The generic fs code uses MAXNAMELEN to represent
- * what the largest component length is. Unfortunately,
- * this length includes the terminating NULL. ZFS needs
- * to tell the users via pathconf() and statvfs() what the
- * true maximum length of a component is, excluding the NULL.
- */
-#define ZFS_MAXNAMELEN (MAXNAMELEN - 1)
-
-/*
- * The directory entry has the type (currently unused on Solaris) in the
- * top 4 bits, and the object number in the low 48 bits. The "middle"
- * 12 bits are unused.
- */
-#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4)
-#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
-#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)type << 60) | obj)
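A round-trip through the packing above, written with plain shifts (the values are arbitrary examples):

#include <stdint.h>
#include <assert.h>

/* Illustrative only: type 8 in the top 4 bits, object 0x1234 in the low 48. */
static void
dirent_pack_example(void)
{
	uint64_t de = ((uint64_t)8 << 60) | 0x1234ULL;

	assert((de >> 60) == 8);			/* ZFS_DIRENT_TYPE */
	assert((de & ((1ULL << 48) - 1)) == 0x1234);	/* ZFS_DIRENT_OBJ */
}
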
-
-
-/*
- * This is the persistent portion of the znode. It is stored
- * in the "bonus buffer" of the file. Short symbolic links
- * are also stored in the bonus buffer.
- */
-typedef struct znode_phys {
- uint64_t zp_atime[2]; /* 0 - last file access time */
- uint64_t zp_mtime[2]; /* 16 - last file modification time */
- uint64_t zp_ctime[2]; /* 32 - last file change time */
- uint64_t zp_crtime[2]; /* 48 - creation time */
- uint64_t zp_gen; /* 64 - generation (txg of creation) */
- uint64_t zp_mode; /* 72 - file mode bits */
- uint64_t zp_size; /* 80 - size of file */
- uint64_t zp_parent; /* 88 - directory parent (`..') */
- uint64_t zp_links; /* 96 - number of links to file */
- uint64_t zp_xattr; /* 104 - DMU object for xattrs */
- uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
- uint64_t zp_flags; /* 120 - persistent flags */
- uint64_t zp_uid; /* 128 - file owner */
- uint64_t zp_gid; /* 136 - owning group */
- uint64_t zp_pad[4]; /* 144 - future */
- zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */
- /*
- * Data may pad out any remaining bytes in the znode buffer, eg:
- *
- * |<---------------------- dnode_phys (512) ------------------------>|
- * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
- * |<---- znode (264) ---->|<---- data (56) ---->|
- *
- * At present, we only use this space to store symbolic links.
- */
-} znode_phys_t;
-
-/*
- * Directory entry locks control access to directory entries.
- * They are used to protect creates, deletes, and renames.
- * Each directory znode has a mutex and a list of locked names.
- */
-#ifdef _KERNEL
-typedef struct zfs_dirlock {
- char *dl_name; /* directory entry being locked */
- uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */
- uint16_t dl_namesize; /* set if dl_name was allocated */
- kcondvar_t dl_cv; /* wait for entry to be unlocked */
- struct znode *dl_dzp; /* directory znode */
- struct zfs_dirlock *dl_next; /* next in z_dirlocks list */
-} zfs_dirlock_t;
-
-typedef struct znode {
- struct zfsvfs *z_zfsvfs;
- vnode_t *z_vnode;
- uint64_t z_id; /* object ID for this znode */
- kmutex_t z_lock; /* znode modification lock */
- krwlock_t z_map_lock; /* page map lock */
- krwlock_t z_parent_lock; /* parent lock for directories */
- krwlock_t z_name_lock; /* "master" lock for dirent locks */
- zfs_dirlock_t *z_dirlocks; /* directory entry lock list */
- kmutex_t z_range_lock; /* protects changes to z_range_avl */
- avl_tree_t z_range_avl; /* avl tree of file range locks */
- uint8_t z_unlinked; /* file has been unlinked */
- uint8_t z_atime_dirty; /* atime needs to be synced */
- uint8_t z_dbuf_held; /* Is z_dbuf already held? */
- uint8_t z_zn_prefetch; /* Prefetch znodes? */
- uint_t z_blksz; /* block size in bytes */
- uint_t z_seq; /* modification sequence number */
- uint64_t z_mapcnt; /* number of pages mapped to file */
- uint64_t z_last_itx; /* last ZIL itx on this znode */
- uint32_t z_sync_cnt; /* synchronous open count */
- kmutex_t z_acl_lock; /* acl data lock */
- list_node_t z_link_node; /* all znodes in fs link */
- struct lockf *z_lockf; /* Head of byte-level lock list. */
- /*
- * These are dmu managed fields.
- */
- znode_phys_t *z_phys; /* pointer to persistent znode */
- dmu_buf_t *z_dbuf; /* buffer containing the z_phys */
-} znode_t;
-
-
-/*
- * Range locking rules
- * --------------------
- * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole
- * file range needs to be locked as RL_WRITER. Only then can the pages be
- * freed and zp_size reset; zp_size must be updated while the range lock is held.
- * 2. For writes and punching holes (zfs_write & zfs_space) just the range
- * being written or freed needs to be locked as RL_WRITER.
- * Multiple writes at the end of the file must coordinate zp_size updates
- * to ensure data isn't lost. A compare and swap loop is currently used
- * to ensure the file size is at least the offset last written.
- * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being
- * read needs to be locked as RL_READER. A check against zp_size can then
- * be made for reading beyond end of file.
- */
-
-/*
- * Convert between znode pointers and vnode pointers
- */
-#define ZTOV(ZP) ((ZP)->z_vnode)
-#define VTOZ(VP) ((znode_t *)(VP)->v_data)
-
-/*
- * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation.
- * ZFS_EXIT() must be called before exiting the vop.
- */
-#define ZFS_ENTER(zfsvfs) \
- { \
- atomic_add_32(&(zfsvfs)->z_op_cnt, 1); \
- if ((zfsvfs)->z_unmounted1) { \
- ZFS_EXIT(zfsvfs); \
- return (EIO); \
- } \
- }
-#define ZFS_EXIT(zfsvfs) atomic_add_32(&(zfsvfs)->z_op_cnt, -1)
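A hedged skeleton of how a vnode operation brackets itself with these macros (the operation body is hypothetical):

/* Illustrative vop shape; ZFS_ENTER itself returns EIO during teardown. */
static int
example_vop(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	/* ... perform the operation ... */
	ZFS_EXIT(zfsvfs);
	return (0);
}
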
-
-/*
- * Macros for dealing with dmu_buf_hold
- */
-#define ZFS_OBJ_HASH(obj_num) (obj_num & (ZFS_OBJ_MTX_SZ - 1))
-#define ZFS_OBJ_MUTEX(zp) \
- (&zp->z_zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(zp->z_id)])
-#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \
- mutex_enter(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)])
-
-#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \
- mutex_exit(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)])
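The object number hashes into one of the ZFS_OBJ_MTX_SZ mutexes; a hedged sketch of the hold pattern (the work inside is hypothetical):

/* Illustrative: serialize znode setup for one object id. */
static void
hold_object_example(zfsvfs_t *zfsvfs, uint64_t obj_num)
{
	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
	/* ... look up or create the znode for obj_num ... */
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
}
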
-
-/*
- * Macros to encode/decode ZFS stored time values from/to struct timespec
- */
-#define ZFS_TIME_ENCODE(tp, stmp) \
-{ \
- stmp[0] = (uint64_t)(tp)->tv_sec; \
- stmp[1] = (uint64_t)(tp)->tv_nsec; \
-}
-
-#define ZFS_TIME_DECODE(tp, stmp) \
-{ \
- (tp)->tv_sec = (time_t)stmp[0]; \
- (tp)->tv_nsec = (long)stmp[1]; \
-}
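A round-trip through the two macros above (the timestamp values are arbitrary):

#include <stdint.h>
#include <sys/time.h>

/* Illustrative only. */
static void
time_codec_example(void)
{
	struct timespec ts = { 1234567890, 42 };
	struct timespec out;
	uint64_t stmp[2];

	ZFS_TIME_ENCODE(&ts, stmp);	/* stmp[0] = sec, stmp[1] = nsec */
	ZFS_TIME_DECODE(&out, stmp);	/* out now equals ts */
}
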
-
-/*
- * Timestamp defines
- */
-#define ACCESSED (AT_ATIME)
-#define STATE_CHANGED (AT_CTIME)
-#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME)
-
-#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
- if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
- zfs_time_stamper(zp, ACCESSED, NULL)
-
-extern int zfs_init_fs(zfsvfs_t *, znode_t **, cred_t *);
-extern void zfs_set_dataprop(objset_t *);
-extern void zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx);
-extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *);
-extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *);
-extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
-extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
-extern void zfs_znode_init(void);
-extern void zfs_znode_fini(void);
-extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
-extern void zfs_zinactive(znode_t *);
-extern void zfs_znode_delete(znode_t *, dmu_tx_t *);
-extern void zfs_znode_free(znode_t *);
-extern void zfs_remove_op_tables();
-extern int zfs_create_op_tables();
-extern dev_t zfs_cmpldev(uint64_t);
-
-extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, znode_t *zp, char *name);
-extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, char *name);
-extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, znode_t *zp, char *name);
-extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, znode_t *zp, char *name, char *link);
-extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
-extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, offset_t off, ssize_t len, int ioflag);
-extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, uint64_t off, uint64_t len);
-extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, vattr_t *vap, uint_t mask_applied);
-#ifndef ZFS_NO_ACL
-extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, int aclcnt, ace_t *z_ace);
-#endif
-
-extern zil_get_data_t zfs_get_data;
-extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
-extern int zfsfstype;
-
-#endif /* _KERNEL */
-
-extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_ZNODE_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
deleted file mode 100644
index 947ba9f..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZIL_H
-#define _SYS_ZIL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Intent log format:
- *
- * Each objset has its own intent log. The log header (zil_header_t)
- * for objset N's intent log is kept in the Nth object of the SPA's
- * intent_log objset. The log header points to a chain of log blocks,
- * each of which contains log records (i.e., transactions) followed by
- * a log block trailer (zil_trailer_t). The format of a log record
- * depends on the record (or transaction) type, but all records begin
- * with a common structure that defines the type, length, and txg.
- */
-
-/*
- * Intent log header - this on disk structure holds fields to manage
- * the log. All fields are 64 bit to easily handle cross architectures.
- */
-typedef struct zil_header {
- uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
- uint64_t zh_replay_seq; /* highest replayed sequence number */
- blkptr_t zh_log; /* log chain */
- uint64_t zh_claim_seq; /* highest claimed sequence number */
- uint64_t zh_pad[5];
-} zil_header_t;
-
-/*
- * Log block trailer - structure at the end of the header and each log block
- *
- * The zit_bt contains a zbt_cksum which for the intent log is
- * the sequence number of this log block. A seq of 0 is invalid.
- * The zbt_cksum is checked by the SPA against the sequence
- * number passed in the blk_cksum field of the blkptr_t
- */
-typedef struct zil_trailer {
- uint64_t zit_pad;
- blkptr_t zit_next_blk; /* next block in chain */
- uint64_t zit_nused; /* bytes in log block used */
- zio_block_tail_t zit_bt; /* block trailer */
-} zil_trailer_t;
-
-#define ZIL_MIN_BLKSZ 4096ULL
-#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE
-#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t))
-
-/*
- * The words of a log block checksum.
- */
-#define ZIL_ZC_GUID_0 0
-#define ZIL_ZC_GUID_1 1
-#define ZIL_ZC_OBJSET 2
-#define ZIL_ZC_SEQ 3
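Assuming the usual four-word zio_cksum_t layout with a zc_word[] array (an assumption; zio.h is not reproduced here), these indices name the words of the log block checksum, e.g.:

/* Hypothetical sketch: read the embedded sequence number of a log block. */
static uint64_t
log_block_seq(const zil_trailer_t *zit)
{
	return (zit->zit_bt.zbt_cksum.zc_word[ZIL_ZC_SEQ]);
}
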
-
-/*
- * Intent log transaction types and record structures
- */
-#define TX_CREATE 1 /* Create file */
-#define TX_MKDIR 2 /* Make directory */
-#define TX_MKXATTR 3 /* Make XATTR directory */
-#define TX_SYMLINK 4 /* Create symbolic link to a file */
-#define TX_REMOVE 5 /* Remove file */
-#define TX_RMDIR 6 /* Remove directory */
-#define TX_LINK 7 /* Create hard link to a file */
-#define TX_RENAME 8 /* Rename a file */
-#define TX_WRITE 9 /* File write */
-#define TX_TRUNCATE 10 /* Truncate a file */
-#define TX_SETATTR 11 /* Set file attributes */
-#define TX_ACL 12 /* Set acl */
-#define TX_MAX_TYPE 13 /* Max transaction type */
-
-/*
- * Format of log records.
- * The fields are carefully defined to allow them to be aligned
- * and sized the same on sparc & intel architectures.
- * Each log record has a common structure at the beginning.
- *
- * Note, lrc_seq holds two different sequence numbers. Whilst in memory
- * it contains the transaction sequence number. The log record on
- * disk holds a sequence number covering all log records, which is used to
- * ensure we don't replay the same record. The two sequence numbers are
- * different because the transactions can now be pushed out of order.
- */
-typedef struct { /* common log record header */
- uint64_t lrc_txtype; /* intent log transaction type */
- uint64_t lrc_reclen; /* transaction record length */
- uint64_t lrc_txg; /* dmu transaction group number */
- uint64_t lrc_seq; /* see comment above */
-} lr_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_doid; /* object id of directory */
- uint64_t lr_foid; /* object id of created file object */
- uint64_t lr_mode; /* mode of object */
- uint64_t lr_uid; /* uid of object */
- uint64_t lr_gid; /* gid of object */
- uint64_t lr_gen; /* generation (txg of creation) */
- uint64_t lr_crtime[2]; /* creation time */
- uint64_t lr_rdev; /* rdev of object to create */
- /* name of object to create follows this */
- /* for symlinks, link content follows name */
-} lr_create_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_doid; /* obj id of directory */
- /* name of object to remove follows this */
-} lr_remove_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_doid; /* obj id of directory */
- uint64_t lr_link_obj; /* obj id of link */
- /* name of object to link follows this */
-} lr_link_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_sdoid; /* obj id of source directory */
- uint64_t lr_tdoid; /* obj id of target directory */
- /* 2 strings: names of source and destination follow this */
-} lr_rename_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_foid; /* file object to write */
- uint64_t lr_offset; /* offset to write to */
- uint64_t lr_length; /* user data length to write */
- uint64_t lr_blkoff; /* offset represented by lr_blkptr */
- blkptr_t lr_blkptr; /* spa block pointer for replay */
- /* write data will follow for small writes */
-} lr_write_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_foid; /* object id of file to truncate */
- uint64_t lr_offset; /* offset to truncate from */
- uint64_t lr_length; /* length to truncate */
-} lr_truncate_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_foid; /* file object to change attributes */
- uint64_t lr_mask; /* mask of attributes to set */
- uint64_t lr_mode; /* mode to set */
- uint64_t lr_uid; /* uid to set */
- uint64_t lr_gid; /* gid to set */
- uint64_t lr_size; /* size to set */
- uint64_t lr_atime[2]; /* access time */
- uint64_t lr_mtime[2]; /* modification time */
-} lr_setattr_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_foid; /* obj id of file */
- uint64_t lr_aclcnt; /* number of acl entries */
- /* lr_aclcnt number of ace_t entries follow this */
-} lr_acl_t;
-
-/*
- * ZIL structure definitions, interface function prototype and globals.
- */
-
-/*
- * ZFS intent log transaction structure
- */
-typedef enum {
- WR_INDIRECT, /* indirect - a large write; dmu_sync() the data */
- /* and put the blkptr in the log, rather than the data */
- WR_COPIED, /* immediate - data is copied into lr_write_t */
- WR_NEED_COPY, /* immediate - data needs to be copied if pushed */
-} itx_wr_state_t;
-
-typedef struct itx {
- list_node_t itx_node; /* linkage on zl_itx_list */
- void *itx_private; /* type-specific opaque data */
- itx_wr_state_t itx_wr_state; /* write state */
- uint8_t itx_sync; /* synchronous transaction */
- lr_t itx_lr; /* common part of log record */
- /* followed by type-specific part of lr_xx_t and its immediate data */
-} itx_t;
-
-
-/*
- * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done()
- * to handle the cleanup of the dmu_sync() buffer write
- */
-typedef struct {
- zilog_t *zgd_zilog; /* zilog */
- blkptr_t *zgd_bp; /* block pointer */
- struct rl *zgd_rl; /* range lock */
-} zgd_t;
-
-
-typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
- uint64_t txg);
-typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
- uint64_t txg);
-typedef int zil_replay_func_t();
-typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
-
-extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
- zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
-
-extern void zil_init(void);
-extern void zil_fini(void);
-
-extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys);
-extern void zil_free(zilog_t *zilog);
-
-extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
-extern void zil_close(zilog_t *zilog);
-
-extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp,
- zil_replay_func_t *replay_func[TX_MAX_TYPE]);
-extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
-
-extern itx_t *zil_itx_create(int txtype, size_t lrsize);
-extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
-
-extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
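A hedged sketch of the itx lifecycle implied by the three calls above (TX_REMOVE and lr_remove_t are defined earlier in this header; the helper itself is hypothetical):

/* Illustrative only: queue a remove record, then force it to the log. */
static void
log_remove_example(zilog_t *zilog, dmu_tx_t *tx, uint64_t doid,
    const char *name, uint64_t foid)
{
	size_t namelen = strlen(name) + 1;
	itx_t *itx = zil_itx_create(TX_REMOVE, sizeof (lr_remove_t) + namelen);
	lr_remove_t *lr = (lr_remove_t *)&itx->itx_lr;
	uint64_t seq;

	lr->lr_doid = doid;
	bcopy(name, (char *)(lr + 1), namelen);	/* name follows the record */
	seq = zil_itx_assign(zilog, itx, tx);	/* queue on the in-memory list */
	zil_commit(zilog, seq, foid);	/* typically at fsync, after tx commit */
}
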
-
-extern int zil_claim(char *osname, void *txarg);
-extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
-extern void zil_clean(zilog_t *zilog);
-extern int zil_is_committed(zilog_t *zilog);
-
-extern int zil_suspend(zilog_t *zilog);
-extern void zil_resume(zilog_t *zilog);
-
-extern void zil_add_vdev(zilog_t *zilog, uint64_t vdev);
-
-extern int zil_disable;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZIL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
deleted file mode 100644
index 3ecf4e4..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZIL_IMPL_H
-#define _SYS_ZIL_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zil.h>
-#include <sys/dmu_objset.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Log write buffer.
- */
-typedef struct lwb {
- zilog_t *lwb_zilog; /* back pointer to log struct */
- blkptr_t lwb_blk; /* on disk address of this log blk */
- int lwb_nused; /* # used bytes in buffer */
- int lwb_sz; /* size of block and buffer */
- char *lwb_buf; /* log write buffer */
- zio_t *lwb_zio; /* zio for this buffer */
- uint64_t lwb_max_txg; /* highest txg in this lwb */
- txg_handle_t lwb_txgh; /* txg handle for txg_exit() */
- list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
-} lwb_t;
-
-/*
- * Vdev flushing: We use a bitmap of size ZIL_VDEV_BMSZ bytes.
- * Any vdev numbers beyond that use a linked list of zil_vdev_t structures.
- */
-
-#define ZIL_VDEV_BMSZ 16 /* 16 * 8 = 128 vdevs */
-typedef struct zil_vdev {
- uint64_t vdev; /* device written */
- list_node_t vdev_seq_node; /* zilog->zl_vdev_list linkage */
-} zil_vdev_t;
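
A sketch of the dispatch the comment above describes; the authoritative logic is zil_add_vdev() (declared in sys/zil.h), and the bitmap and list fields live in struct zilog below:

	if (vdev < ZIL_VDEV_BMSZ * 8) {
		zilog->zl_vdev_bmap[vdev >> 3] |= 1 << (vdev & 7);
	} else {
		zil_vdev_t *zv = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP);
		zv->vdev = vdev;
		list_insert_tail(&zilog->zl_vdev_list, zv);
	}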
-
-/*
- * Stable storage intent log management structure. One per dataset.
- */
-struct zilog {
- kmutex_t zl_lock; /* protects most zilog_t fields */
- struct dsl_pool *zl_dmu_pool; /* DSL pool */
- spa_t *zl_spa; /* handle for read/write log */
- const zil_header_t *zl_header; /* log header buffer */
- objset_t *zl_os; /* object set we're logging */
- zil_get_data_t *zl_get_data; /* callback to get object content */
- zio_t *zl_root_zio; /* log writer root zio */
- uint64_t zl_itx_seq; /* next itx sequence number */
-	uint64_t	zl_commit_seq;	/* committed up to this number */
- uint64_t zl_lr_seq; /* log record sequence number */
- uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
- uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */
- uint32_t zl_suspend; /* log suspend count */
- kcondvar_t zl_cv_writer; /* log writer thread completion */
- kcondvar_t zl_cv_suspend; /* log suspend completion */
- uint8_t zl_suspending; /* log is currently suspending */
- uint8_t zl_keep_first; /* keep first log block in destroy */
- uint8_t zl_stop_replay; /* don't replay any further */
- uint8_t zl_stop_sync; /* for debugging */
- uint8_t zl_writer; /* boolean: write setup in progress */
- uint8_t zl_log_error; /* boolean: log write error */
- list_t zl_itx_list; /* in-memory itx list */
- uint64_t zl_itx_list_sz; /* total size of records on list */
- uint64_t zl_cur_used; /* current commit log size used */
- uint64_t zl_prev_used; /* previous commit log size used */
- list_t zl_lwb_list; /* in-flight log write list */
- list_t zl_vdev_list; /* list of [vdev, seq] pairs */
- uint8_t zl_vdev_bmap[ZIL_VDEV_BMSZ]; /* bitmap of vdevs */
- taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */
- avl_tree_t zl_dva_tree; /* track DVAs during log parse */
- clock_t zl_replay_time; /* lbolt of when replay started */
- uint64_t zl_replay_blks; /* number of log blocks replayed */
-};
-
-typedef struct zil_dva_node {
- dva_t zn_dva;
- avl_node_t zn_node;
-} zil_dva_node_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZIL_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
deleted file mode 100644
index b026ae6..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ /dev/null
@@ -1,366 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _ZIO_H
-#define _ZIO_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/avl.h>
-#include <sys/dkio.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio_impl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define	ZBT_MAGIC	0x210da7ab10c7a11ULL	/* zio data block tail */
-
-typedef struct zio_block_tail {
- uint64_t zbt_magic; /* for validation, endianness */
- zio_cksum_t zbt_cksum; /* 256-bit checksum */
-} zio_block_tail_t;
-
-/*
- * Gang block headers are self-checksumming and contain an array
- * of block pointers.
- */
-#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
-#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
-#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_block_tail_t) - \
- (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
- sizeof (uint64_t))
-
-#define ZIO_GET_IOSIZE(zio) \
- (BP_IS_GANG((zio)->io_bp) ? \
- SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp))
-
-typedef struct zio_gbh {
- blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
- uint64_t zg_filler[SPA_GBH_FILLER];
- zio_block_tail_t zg_tail;
-} zio_gbh_phys_t;
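
To make the macros concrete: assuming the usual 512-byte SPA_MINBLOCKSIZE and 128-byte blkptr_t, and with zio_block_tail_t at 40 bytes (the 8-byte magic plus the 256-bit checksum), SPA_GBH_NBLKPTRS = (512 - 40) / 128 = 3 and SPA_GBH_FILLER = (512 - 40 - 3 * 128) / 8 = 11, so 3 * 128 + 11 * 8 + 40 fills the 512-byte gang block exactly.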
-
-enum zio_checksum {
- ZIO_CHECKSUM_INHERIT = 0,
- ZIO_CHECKSUM_ON,
- ZIO_CHECKSUM_OFF,
- ZIO_CHECKSUM_LABEL,
- ZIO_CHECKSUM_GANG_HEADER,
- ZIO_CHECKSUM_ZILOG,
- ZIO_CHECKSUM_FLETCHER_2,
- ZIO_CHECKSUM_FLETCHER_4,
- ZIO_CHECKSUM_SHA256,
- ZIO_CHECKSUM_FUNCTIONS
-};
-
-#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2
-#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
-
-enum zio_compress {
- ZIO_COMPRESS_INHERIT = 0,
- ZIO_COMPRESS_ON,
- ZIO_COMPRESS_OFF,
- ZIO_COMPRESS_LZJB,
- ZIO_COMPRESS_EMPTY,
- ZIO_COMPRESS_GZIP_1,
- ZIO_COMPRESS_GZIP_2,
- ZIO_COMPRESS_GZIP_3,
- ZIO_COMPRESS_GZIP_4,
- ZIO_COMPRESS_GZIP_5,
- ZIO_COMPRESS_GZIP_6,
- ZIO_COMPRESS_GZIP_7,
- ZIO_COMPRESS_GZIP_8,
- ZIO_COMPRESS_GZIP_9,
- ZIO_COMPRESS_FUNCTIONS
-};
-
-#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
-#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
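
The INHERIT and ON values resolve against the parent dataset's setting. A minimal sketch of that resolution, mirroring what zio_checksum_select() (declared later in this header) is expected to do; the compress variant is analogous with the ZIO_COMPRESS_* constants:

	static uint8_t
	prop_select_sketch(uint8_t child, uint8_t parent)
	{
		if (child == ZIO_CHECKSUM_INHERIT)
			return (parent);
		if (child == ZIO_CHECKSUM_ON)
			return (ZIO_CHECKSUM_ON_VALUE);	/* fletcher2 */
		return (child);
	}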
-
-#define ZIO_PRIORITY_NOW (zio_priority_table[0])
-#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1])
-#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2])
-#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3])
-#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4])
-#define ZIO_PRIORITY_FREE (zio_priority_table[5])
-#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6])
-#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7])
-#define ZIO_PRIORITY_RESILVER (zio_priority_table[8])
-#define ZIO_PRIORITY_SCRUB (zio_priority_table[9])
-#define ZIO_PRIORITY_TABLE_SIZE 10
-
-#define ZIO_FLAG_MUSTSUCCEED 0x00000
-#define ZIO_FLAG_CANFAIL 0x00001
-#define ZIO_FLAG_FAILFAST 0x00002
-#define ZIO_FLAG_CONFIG_HELD 0x00004
-#define ZIO_FLAG_CONFIG_GRABBED 0x00008
-
-#define ZIO_FLAG_DONT_CACHE 0x00010
-#define ZIO_FLAG_DONT_QUEUE 0x00020
-#define ZIO_FLAG_DONT_PROPAGATE 0x00040
-#define ZIO_FLAG_DONT_RETRY 0x00080
-
-#define ZIO_FLAG_PHYSICAL 0x00100
-#define ZIO_FLAG_IO_BYPASS 0x00200
-#define ZIO_FLAG_IO_REPAIR 0x00400
-#define ZIO_FLAG_SPECULATIVE 0x00800
-
-#define ZIO_FLAG_RESILVER 0x01000
-#define ZIO_FLAG_SCRUB 0x02000
-#define ZIO_FLAG_SCRUB_THREAD 0x04000
-#define ZIO_FLAG_SUBBLOCK 0x08000
-
-#define ZIO_FLAG_NOBOOKMARK 0x10000
-#define ZIO_FLAG_USER 0x20000
-
-#define ZIO_FLAG_METADATA 0x40000
-
-#define ZIO_FLAG_GANG_INHERIT \
- (ZIO_FLAG_CANFAIL | \
- ZIO_FLAG_FAILFAST | \
- ZIO_FLAG_CONFIG_HELD | \
- ZIO_FLAG_DONT_RETRY | \
- ZIO_FLAG_IO_REPAIR | \
- ZIO_FLAG_SPECULATIVE | \
- ZIO_FLAG_RESILVER | \
- ZIO_FLAG_SCRUB | \
- ZIO_FLAG_SCRUB_THREAD)
-
-#define ZIO_FLAG_VDEV_INHERIT \
- (ZIO_FLAG_GANG_INHERIT | \
- ZIO_FLAG_DONT_CACHE | \
- ZIO_FLAG_PHYSICAL)
-
-/*
- * We'll take the EILSEQ (Illegal byte sequence) errno
- * to indicate checksum errors.
- */
-#define ECKSUM EILSEQ
-
-typedef struct zio zio_t;
-typedef void zio_done_func_t(zio_t *zio);
-
-extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
-extern char *zio_type_name[ZIO_TYPES];
-
-/*
- * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
- * identifies any block in the pool. By convention, the meta-objset (MOS)
- * is objset 0, the meta-dnode is object 0, the root block (objset_phys_t) is
- * level -1 of the meta-dnode, and intent log blocks (which are chained
- * off the root block) have blkid == sequence number. In summary:
- *
- * mos is objset 0
- * meta-dnode is object 0
- * root block is <objset, 0, -1, 0>
- * intent log is <objset, 0, -1, ZIL sequence number>
- *
- * Note: this structure is called a bookmark because its first purpose was
- * to remember where to resume a pool-wide traverse. The absolute ordering
- * for block visitation during traversal is defined in compare_bookmark().
- *
- * Note: this structure is passed between userland and the kernel.
- * Therefore it must not change size or alignment between 32/64 bit
- * compilation options.
- */
-typedef struct zbookmark {
- uint64_t zb_objset;
- uint64_t zb_object;
- int64_t zb_level;
- uint64_t zb_blkid;
-} zbookmark_t;
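
The authoritative ordering is defined in compare_bookmark(); purely as an illustration of a total order over the four-tuple (the in-tree function may order the components differently), a lexicographic comparison looks like:

	static int
	zbookmark_compare_sketch(const zbookmark_t *a, const zbookmark_t *b)
	{
		if (a->zb_objset != b->zb_objset)
			return (a->zb_objset < b->zb_objset ? -1 : 1);
		if (a->zb_object != b->zb_object)
			return (a->zb_object < b->zb_object ? -1 : 1);
		if (a->zb_level != b->zb_level)
			return (a->zb_level < b->zb_level ? -1 : 1);
		if (a->zb_blkid != b->zb_blkid)
			return (a->zb_blkid < b->zb_blkid ? -1 : 1);
		return (0);
	}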
-
-struct zio {
- /* Core information about this I/O */
- zio_t *io_parent;
- zio_t *io_root;
- spa_t *io_spa;
- zbookmark_t io_bookmark;
- enum zio_checksum io_checksum;
- enum zio_compress io_compress;
- int io_ndvas;
- uint64_t io_txg;
- blkptr_t *io_bp;
- blkptr_t io_bp_copy;
- zio_t *io_child;
- zio_t *io_sibling_prev;
- zio_t *io_sibling_next;
- zio_transform_t *io_transform_stack;
- zio_t *io_logical;
-
- /* Callback info */
- zio_done_func_t *io_ready;
- zio_done_func_t *io_done;
- void *io_private;
- blkptr_t io_bp_orig;
-
- /* Data represented by this I/O */
- void *io_data;
- uint64_t io_size;
-
- /* Stuff for the vdev stack */
- vdev_t *io_vd;
- void *io_vsd;
- uint64_t io_offset;
- uint64_t io_deadline;
- uint64_t io_timestamp;
- avl_node_t io_offset_node;
- avl_node_t io_deadline_node;
- avl_tree_t *io_vdev_tree;
- zio_t *io_delegate_list;
- zio_t *io_delegate_next;
-
- /* Internal pipeline state */
- int io_flags;
- enum zio_type io_type;
- enum zio_stage io_stage;
- uint8_t io_stalled;
- uint8_t io_priority;
- struct dk_callback io_dk_callback;
- int io_cmd;
- int io_retries;
- int io_error;
- uint32_t io_numerrors;
- uint32_t io_pipeline;
- uint32_t io_async_stages;
- uint64_t io_children_notready;
- uint64_t io_children_notdone;
- void *io_waiter;
- kmutex_t io_lock;
- kcondvar_t io_cv;
-
- /* FMA state */
- uint64_t io_ena;
-};
-
-extern zio_t *zio_null(zio_t *pio, spa_t *spa,
- zio_done_func_t *done, void *private, int flags);
-
-extern zio_t *zio_root(spa_t *spa,
- zio_done_func_t *done, void *private, int flags);
-
-extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
- uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags, zbookmark_t *zb);
-
-extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
- int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
- int flags, zbookmark_t *zb);
-
-extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
- uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags,
- zbookmark_t *zb);
-
-extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private);
-
-extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private);
-
-extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *private, int priority, int flags);
-
-extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
- uint64_t size, void *data, int checksum,
- zio_done_func_t *done, void *private, int priority, int flags);
-
-extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
- uint64_t size, void *data, int checksum,
- zio_done_func_t *done, void *private, int priority, int flags);
-
-extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp,
- blkptr_t *old_bp, uint64_t txg);
-extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);
-
-extern int zio_wait(zio_t *zio);
-extern void zio_nowait(zio_t *zio);
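
A hedged usage sketch tying these together: issue a logical read and block for completion. The priority, flags, and use of the logical size are illustrative assumptions; spa, bp, data, and zb are assumed to be in scope.

	zio_t *zio = zio_read(NULL, spa, bp, data, BP_GET_LSIZE(bp),
	    NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb);
	int error = zio_wait(zio);	/* blocks until the zio is done */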
-
-extern void *zio_buf_alloc(size_t size);
-extern void zio_buf_free(void *buf, size_t size);
-extern void *zio_data_buf_alloc(size_t size);
-extern void zio_data_buf_free(void *buf, size_t size);
-
-/*
- * Move an I/O to the next stage of the pipeline and execute that stage.
- * There's no locking on io_stage because there's no legitimate way for
- * multiple threads to be attempting to process the same I/O.
- */
-extern void zio_next_stage(zio_t *zio);
-extern void zio_next_stage_async(zio_t *zio);
-extern void zio_wait_children_done(zio_t *zio);
-
-/*
- * Delegate I/O to a child vdev.
- */
-extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
- uint64_t offset, void *data, uint64_t size, int type, int priority,
- int flags, zio_done_func_t *done, void *private);
-
-extern void zio_vdev_io_bypass(zio_t *zio);
-extern void zio_vdev_io_reissue(zio_t *zio);
-extern void zio_vdev_io_redone(zio_t *zio);
-
-extern void zio_checksum_verified(zio_t *zio);
-extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp);
-
-extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
-extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
-
-boolean_t zio_should_retry(zio_t *zio);
-
-/*
- * Initial setup and teardown.
- */
-extern void zio_init(void);
-extern void zio_fini(void);
-
-/*
- * Fault injection
- */
-struct zinject_record;
-extern uint32_t zio_injection_enabled;
-extern int zio_inject_fault(char *name, int flags, int *id,
- struct zinject_record *record);
-extern int zio_inject_list_next(int *id, char *name, size_t buflen,
- struct zinject_record *record);
-extern int zio_clear_fault(int id);
-extern int zio_handle_fault_injection(zio_t *zio, int error);
-extern int zio_handle_device_injection(vdev_t *vd, int error);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZIO_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
deleted file mode 100644
index bb7bd41..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZIO_CHECKSUM_H
-#define _SYS_ZIO_CHECKSUM_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Signature for checksum functions.
- */
-typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
-
-/*
- * Information about each checksum function.
- */
-typedef struct zio_checksum_info {
- zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
- int ci_correctable; /* number of correctable bits */
- int ci_zbt; /* uses zio block tail? */
- char *ci_name; /* descriptive name */
-} zio_checksum_info_t;
-
-extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
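
Consumers index this table by algorithm and byteorder; a minimal sketch, assuming byteswap is 0 for native-endian data and 1 for byteswapped data:

	static void
	checksum_dispatch_sketch(uint_t cksum, int byteswap,
	    const void *data, uint64_t size, zio_cksum_t *zcp)
	{
		zio_checksum_info_t *ci = &zio_checksum_table[cksum];

		ci->ci_func[byteswap](data, size, zcp);
	}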
-
-/*
- * Checksum routines.
- */
-extern zio_checksum_t fletcher_2_native;
-extern zio_checksum_t fletcher_4_native;
-extern zio_checksum_t fletcher_4_incremental_native;
-
-extern zio_checksum_t fletcher_2_byteswap;
-extern zio_checksum_t fletcher_4_byteswap;
-extern zio_checksum_t fletcher_4_incremental_byteswap;
-
-extern zio_checksum_t zio_checksum_SHA256;
-
-extern void zio_checksum(uint_t checksum, zio_cksum_t *zcp,
- void *data, uint64_t size);
-extern int zio_checksum_error(zio_t *zio);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZIO_CHECKSUM_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
deleted file mode 100644
index 66ee8d4..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZIO_COMPRESS_H
-#define _SYS_ZIO_COMPRESS_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Common signature for all zio compress/decompress functions.
- */
-typedef size_t zio_compress_func_t(void *src, void *dst,
- size_t s_len, size_t d_len, int);
-typedef int zio_decompress_func_t(void *src, void *dst,
- size_t s_len, size_t d_len, int);
-
-/*
- * Information about each compression function.
- */
-typedef struct zio_compress_info {
- zio_compress_func_t *ci_compress; /* compression function */
- zio_decompress_func_t *ci_decompress; /* decompression function */
- int ci_level; /* level parameter */
- char *ci_name; /* algorithm name */
-} zio_compress_info_t;
-
-extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS];
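
A minimal dispatch sketch showing how ci_level lets the nine gzip entries in zio.h share a single gzip_compress() implementation; the convention for signaling that data did not shrink is left to zio_compress_data() and not assumed here:

	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
	size_t c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);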
-
-/*
- * Compression routines.
- */
-extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-
-/*
- * Compress and decompress data if necessary.
- */
-extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize,
- void **destp, uint64_t *destsizep, uint64_t *destbufsizep);
-extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
- void *dest, uint64_t destsize);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZIO_COMPRESS_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
deleted file mode 100644
index d2ddbc3..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _ZIO_IMPL_H
-#define _ZIO_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/zio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * I/O Groups: pipeline stage definitions.
- */
-
-typedef enum zio_stage {
- ZIO_STAGE_OPEN = 0, /* RWFCI */
- ZIO_STAGE_WAIT_CHILDREN_READY, /* RWFCI */
-
- ZIO_STAGE_WRITE_COMPRESS, /* -W--- */
- ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */
-
- ZIO_STAGE_GANG_PIPELINE, /* -WFC- */
-
- ZIO_STAGE_GET_GANG_HEADER, /* -WFC- */
- ZIO_STAGE_REWRITE_GANG_MEMBERS, /* -W--- */
- ZIO_STAGE_FREE_GANG_MEMBERS, /* --F-- */
- ZIO_STAGE_CLAIM_GANG_MEMBERS, /* ---C- */
-
- ZIO_STAGE_DVA_ALLOCATE, /* -W--- */
- ZIO_STAGE_DVA_FREE, /* --F-- */
- ZIO_STAGE_DVA_CLAIM, /* ---C- */
-
- ZIO_STAGE_GANG_CHECKSUM_GENERATE, /* -W--- */
-
- ZIO_STAGE_READY, /* RWFCI */
-
- ZIO_STAGE_VDEV_IO_START, /* RW--I */
- ZIO_STAGE_VDEV_IO_DONE, /* RW--I */
- ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */
-
- ZIO_STAGE_WAIT_CHILDREN_DONE, /* RWFCI */
-
- ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */
- ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */
- ZIO_STAGE_READ_DECOMPRESS, /* R---- */
-
- ZIO_STAGE_DONE /* RWFCI */
-} zio_stage_t;
-
-/*
- * The stages for which there's some performance value in going async.
- * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well.
- */
-#define ZIO_ASYNC_PIPELINE_STAGES \
- ((1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
- (1U << ZIO_STAGE_VDEV_IO_DONE) | \
- (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
- (1U << ZIO_STAGE_READ_DECOMPRESS))
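
A sketch of how a pipeline bitmask is walked; the in-tree logic lives in zio_next_stage()/zio_next_stage_async(), so this is illustration only:

	static zio_stage_t
	next_stage_sketch(uint32_t pipeline, zio_stage_t cur)
	{
		zio_stage_t s;

		for (s = cur + 1; s < ZIO_STAGE_DONE; s++)
			if (pipeline & (1U << s))
				return (s);
		return (ZIO_STAGE_DONE);
	}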
-
-#define ZIO_VDEV_IO_PIPELINE \
- ((1U << ZIO_STAGE_VDEV_IO_START) | \
- (1U << ZIO_STAGE_VDEV_IO_DONE) | \
- (1U << ZIO_STAGE_VDEV_IO_ASSESS))
-
-#define ZIO_READ_PHYS_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_READY) | \
- ZIO_VDEV_IO_PIPELINE | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
- (1U << ZIO_STAGE_DONE))
-
-#define ZIO_READ_PIPELINE \
- ZIO_READ_PHYS_PIPELINE
-
-#define ZIO_WRITE_PHYS_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
- (1U << ZIO_STAGE_READY) | \
- ZIO_VDEV_IO_PIPELINE | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_DONE))
-
-#define ZIO_WRITE_COMMON_PIPELINE \
- ZIO_WRITE_PHYS_PIPELINE
-
-#define ZIO_WRITE_PIPELINE \
- ((1U << ZIO_STAGE_WRITE_COMPRESS) | \
- ZIO_WRITE_COMMON_PIPELINE)
-
-#define ZIO_GANG_STAGES \
- ((1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
- (1U << ZIO_STAGE_READ_GANG_MEMBERS))
-
-#define ZIO_REWRITE_PIPELINE \
- ((1U << ZIO_STAGE_GANG_PIPELINE) | \
- (1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
- ZIO_WRITE_COMMON_PIPELINE)
-
-#define ZIO_WRITE_ALLOCATE_PIPELINE \
- ((1U << ZIO_STAGE_DVA_ALLOCATE) | \
- ZIO_WRITE_COMMON_PIPELINE)
-
-#define ZIO_GANG_FREE_STAGES \
- ((1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_FREE_GANG_MEMBERS))
-
-#define ZIO_FREE_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_GANG_PIPELINE) | \
- (1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_DVA_FREE) | \
- (1U << ZIO_STAGE_READY) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_DONE))
-
-#define ZIO_CLAIM_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_GANG_PIPELINE) | \
- (1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_DVA_CLAIM) | \
- (1U << ZIO_STAGE_READY) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_DONE))
-
-#define ZIO_IOCTL_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_READY) | \
- ZIO_VDEV_IO_PIPELINE | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_DONE))
-
-#define ZIO_WAIT_FOR_CHILDREN_PIPELINE \
- ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_READY) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_DONE))
-
-#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \
- ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_DONE))
-
-#define ZIO_VDEV_CHILD_PIPELINE \
- (ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE | \
- ZIO_VDEV_IO_PIPELINE)
-
-#define ZIO_ERROR_PIPELINE_MASK \
- ZIO_WAIT_FOR_CHILDREN_PIPELINE
-
-typedef struct zio_transform zio_transform_t;
-struct zio_transform {
- void *zt_data;
- uint64_t zt_size;
- uint64_t zt_bufsize;
- zio_transform_t *zt_next;
-};
-
-extern void zio_inject_init(void);
-extern void zio_inject_fini(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZIO_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
deleted file mode 100644
index df85824..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZVOL_H
-#define _SYS_ZVOL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef _KERNEL
-extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
-extern int zvol_check_volblocksize(uint64_t volblocksize);
-extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
-extern void zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx);
-extern int zvol_create_minor(const char *, dev_t);
-extern int zvol_remove_minor(const char *);
-extern int zvol_set_volsize(const char *, dev_t, uint64_t);
-extern int zvol_set_volblocksize(const char *, uint64_t);
-
-extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
-extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
-#ifndef __FreeBSD__
-extern int zvol_strategy(buf_t *bp);
-extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr);
-extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
-extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
-extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
-#endif
-extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
- int *rvalp);
-extern int zvol_busy(void);
-extern void zvol_init(void);
-extern void zvol_fini(void);
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZVOL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/contrib/opensolaris/uts/common/fs/zfs/txg.c
deleted file mode 100644
index 844beb6..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/txg.c
+++ /dev/null
@@ -1,611 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/txg_impl.h>
-#include <sys/dmu_impl.h>
-#include <sys/dsl_pool.h>
-#include <sys/callb.h>
-
-/*
- * Pool-wide transaction groups.
- */
-
-static void txg_sync_thread(void *arg);
-static void txg_quiesce_thread(void *arg);
-static void txg_timelimit_thread(void *arg);
-
-int txg_time = 5; /* max 5 seconds worth of delta per txg */
-
-/*
- * Prepare the txg subsystem.
- */
-void
-txg_init(dsl_pool_t *dp, uint64_t txg)
-{
- tx_state_t *tx = &dp->dp_tx;
- int c, i;
- bzero(tx, sizeof (tx_state_t));
-
- tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
- for (c = 0; c < max_ncpus; c++) {
- mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
- for (i = 0; i < TXG_SIZE; i++)
- cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL);
- }
-
- rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL);
- mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tx->tx_timeout_exit_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);
-
- tx->tx_open_txg = txg;
-}
-
-/*
- * Close down the txg subsystem.
- */
-void
-txg_fini(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- int c, i;
-
- ASSERT(tx->tx_threads == 0);
-
- cv_destroy(&tx->tx_exit_cv);
- cv_destroy(&tx->tx_timeout_exit_cv);
- cv_destroy(&tx->tx_quiesce_done_cv);
- cv_destroy(&tx->tx_quiesce_more_cv);
- cv_destroy(&tx->tx_sync_done_cv);
- cv_destroy(&tx->tx_sync_more_cv);
- rw_destroy(&tx->tx_suspend);
- mutex_destroy(&tx->tx_sync_lock);
-
- for (c = 0; c < max_ncpus; c++) {
- for (i = 0; i < TXG_SIZE; i++)
- cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
- mutex_destroy(&tx->tx_cpu[c].tc_lock);
- }
-
- kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
-
- bzero(tx, sizeof (tx_state_t));
-}
-
-/*
- * Start syncing transaction groups.
- */
-void
-txg_sync_start(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
-
- mutex_enter(&tx->tx_sync_lock);
-
- dprintf("pool %p\n", dp);
-
- ASSERT(tx->tx_threads == 0);
-
- tx->tx_threads = 3;
-
- tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
- dp, 0, &p0, TS_RUN, minclsyspri);
-
- tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
- dp, 0, &p0, TS_RUN, minclsyspri);
-
- tx->tx_timelimit_thread = thread_create(NULL, 0, txg_timelimit_thread,
- dp, 0, &p0, TS_RUN, minclsyspri);
-
- mutex_exit(&tx->tx_sync_lock);
-}
-
-static void
-txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
-{
- CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
- mutex_enter(&tx->tx_sync_lock);
-}
-
-static void
-txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
-{
- ASSERT(*tpp != NULL);
- *tpp = NULL;
- tx->tx_threads--;
- cv_broadcast(&tx->tx_exit_cv);
- CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */
- thread_exit();
-}
-
-static void
-txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, int secmax)
-{
- CALLB_CPR_SAFE_BEGIN(cpr);
-
- if (secmax)
- (void) cv_timedwait(cv, &tx->tx_sync_lock, secmax * hz);
- else
- cv_wait(cv, &tx->tx_sync_lock);
-
- CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
-}
-
-/*
- * Stop syncing transaction groups.
- */
-void
-txg_sync_stop(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
-
- dprintf("pool %p\n", dp);
- /*
- * Finish off any work in progress.
- */
- ASSERT(tx->tx_threads == 3);
- txg_wait_synced(dp, 0);
-
- /*
- * Wake all 3 sync threads (one per state) and wait for them to die.
- */
- mutex_enter(&tx->tx_sync_lock);
-
- ASSERT(tx->tx_threads == 3);
-
- tx->tx_exiting = 1;
-
- cv_broadcast(&tx->tx_quiesce_more_cv);
- cv_broadcast(&tx->tx_quiesce_done_cv);
- cv_broadcast(&tx->tx_sync_more_cv);
- cv_broadcast(&tx->tx_timeout_exit_cv);
-
- while (tx->tx_threads != 0)
- cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
-
- tx->tx_exiting = 0;
-
- mutex_exit(&tx->tx_sync_lock);
-}
-
-uint64_t
-txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
-{
- tx_state_t *tx = &dp->dp_tx;
- tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
- uint64_t txg;
-
- mutex_enter(&tc->tc_lock);
-
- txg = tx->tx_open_txg;
- tc->tc_count[txg & TXG_MASK]++;
-
- th->th_cpu = tc;
- th->th_txg = txg;
-
- return (txg);
-}
-
-void
-txg_rele_to_quiesce(txg_handle_t *th)
-{
- tx_cpu_t *tc = th->th_cpu;
-
- mutex_exit(&tc->tc_lock);
-}
-
-void
-txg_rele_to_sync(txg_handle_t *th)
-{
- tx_cpu_t *tc = th->th_cpu;
- int g = th->th_txg & TXG_MASK;
-
- mutex_enter(&tc->tc_lock);
- ASSERT(tc->tc_count[g] != 0);
- if (--tc->tc_count[g] == 0)
- cv_broadcast(&tc->tc_cv[g]);
- mutex_exit(&tc->tc_lock);
-
- th->th_cpu = NULL; /* defensive */
-}
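
The hold protocol these three functions implement, as a caller (for example the DMU transaction code) would use it; a sketch assuming dp is in scope:

	txg_handle_t th;
	uint64_t txg;

	txg = txg_hold_open(dp, &th);	/* returns with tc_lock held */
	txg_rele_to_quiesce(&th);	/* drop tc_lock; the hold remains */
	/* ... dirty data in this open txg ... */
	txg_rele_to_sync(&th);		/* drop the hold; txg may quiesce */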
-
-static void
-txg_quiesce(dsl_pool_t *dp, uint64_t txg)
-{
- tx_state_t *tx = &dp->dp_tx;
- int g = txg & TXG_MASK;
- int c;
-
- /*
- * Grab all tx_cpu locks so nobody else can get into this txg.
- */
- for (c = 0; c < max_ncpus; c++)
- mutex_enter(&tx->tx_cpu[c].tc_lock);
-
- ASSERT(txg == tx->tx_open_txg);
- tx->tx_open_txg++;
-
- /*
- * Now that we've incremented tx_open_txg, we can let threads
- * enter the next transaction group.
- */
- for (c = 0; c < max_ncpus; c++)
- mutex_exit(&tx->tx_cpu[c].tc_lock);
-
- /*
- * Quiesce the transaction group by waiting for everyone to txg_exit().
- */
- for (c = 0; c < max_ncpus; c++) {
- tx_cpu_t *tc = &tx->tx_cpu[c];
- mutex_enter(&tc->tc_lock);
- while (tc->tc_count[g] != 0)
- cv_wait(&tc->tc_cv[g], &tc->tc_lock);
- mutex_exit(&tc->tc_lock);
- }
-}
-
-static void
-txg_sync_thread(void *arg)
-{
- dsl_pool_t *dp = arg;
- tx_state_t *tx = &dp->dp_tx;
- callb_cpr_t cpr;
-
- txg_thread_enter(tx, &cpr);
-
- for (;;) {
- uint64_t txg;
-
- /*
- * We sync when there's someone waiting on us, or the
- * quiesce thread has handed off a txg to us.
- */
- while (!tx->tx_exiting &&
- tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
- tx->tx_quiesced_txg == 0) {
- dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
- tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
- txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, 0);
- }
-
- /*
- * Wait until the quiesce thread hands off a txg to us,
- * prompting it to do so if necessary.
- */
- while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
- if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
- tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
- cv_broadcast(&tx->tx_quiesce_more_cv);
- txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
- }
-
- if (tx->tx_exiting)
- txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
-
- rw_enter(&tx->tx_suspend, RW_WRITER);
-
- /*
- * Consume the quiesced txg which has been handed off to
- * us. This may cause the quiescing thread to now be
- * able to quiesce another txg, so we must signal it.
- */
- txg = tx->tx_quiesced_txg;
- tx->tx_quiesced_txg = 0;
- tx->tx_syncing_txg = txg;
- cv_broadcast(&tx->tx_quiesce_more_cv);
- rw_exit(&tx->tx_suspend);
-
- dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
- txg, tx->tx_quiesce_txg_waiting,
- tx->tx_sync_txg_waiting);
- mutex_exit(&tx->tx_sync_lock);
- spa_sync(dp->dp_spa, txg);
- mutex_enter(&tx->tx_sync_lock);
- rw_enter(&tx->tx_suspend, RW_WRITER);
- tx->tx_synced_txg = txg;
- tx->tx_syncing_txg = 0;
- rw_exit(&tx->tx_suspend);
- cv_broadcast(&tx->tx_sync_done_cv);
- }
-}
-
-static void
-txg_quiesce_thread(void *arg)
-{
- dsl_pool_t *dp = arg;
- tx_state_t *tx = &dp->dp_tx;
- callb_cpr_t cpr;
-
- txg_thread_enter(tx, &cpr);
-
- for (;;) {
- uint64_t txg;
-
- /*
- * We quiesce when there's someone waiting on us.
- * However, we can only have one txg in "quiescing" or
- * "quiesced, waiting to sync" state. So we wait until
- * the "quiesced, waiting to sync" txg has been consumed
- * by the sync thread.
- */
- while (!tx->tx_exiting &&
- (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
- tx->tx_quiesced_txg != 0))
- txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
-
- if (tx->tx_exiting)
- txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
-
- txg = tx->tx_open_txg;
- dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
- txg, tx->tx_quiesce_txg_waiting,
- tx->tx_sync_txg_waiting);
- mutex_exit(&tx->tx_sync_lock);
- txg_quiesce(dp, txg);
- mutex_enter(&tx->tx_sync_lock);
-
- /*
- * Hand this txg off to the sync thread.
- */
- dprintf("quiesce done, handing off txg %llu\n", txg);
- tx->tx_quiesced_txg = txg;
- cv_broadcast(&tx->tx_sync_more_cv);
- cv_broadcast(&tx->tx_quiesce_done_cv);
- }
-}
-
-void
-txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
-{
- tx_state_t *tx = &dp->dp_tx;
-
- mutex_enter(&tx->tx_sync_lock);
- ASSERT(tx->tx_threads == 3);
- if (txg == 0)
- txg = tx->tx_open_txg;
- if (tx->tx_sync_txg_waiting < txg)
- tx->tx_sync_txg_waiting = txg;
- dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
- txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
- while (tx->tx_synced_txg < txg) {
- dprintf("broadcasting sync more "
- "tx_synced=%llu waiting=%llu dp=%p\n",
- tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
- cv_broadcast(&tx->tx_sync_more_cv);
- cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
- }
- mutex_exit(&tx->tx_sync_lock);
-}
-
-void
-txg_wait_open(dsl_pool_t *dp, uint64_t txg)
-{
- tx_state_t *tx = &dp->dp_tx;
-
- mutex_enter(&tx->tx_sync_lock);
- ASSERT(tx->tx_threads == 3);
- if (txg == 0)
- txg = tx->tx_open_txg + 1;
- if (tx->tx_quiesce_txg_waiting < txg)
- tx->tx_quiesce_txg_waiting = txg;
- dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
- txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
- while (tx->tx_open_txg < txg) {
- cv_broadcast(&tx->tx_quiesce_more_cv);
- cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
- }
- mutex_exit(&tx->tx_sync_lock);
-}
-
-static void
-txg_timelimit_thread(void *arg)
-{
- dsl_pool_t *dp = arg;
- tx_state_t *tx = &dp->dp_tx;
- callb_cpr_t cpr;
-
- txg_thread_enter(tx, &cpr);
-
- while (!tx->tx_exiting) {
- uint64_t txg = tx->tx_open_txg + 1;
-
- txg_thread_wait(tx, &cpr, &tx->tx_timeout_exit_cv, txg_time);
-
- if (tx->tx_quiesce_txg_waiting < txg)
- tx->tx_quiesce_txg_waiting = txg;
-
- while (!tx->tx_exiting && tx->tx_open_txg < txg) {
- dprintf("pushing out %llu\n", txg);
- cv_broadcast(&tx->tx_quiesce_more_cv);
- txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
- }
- }
- txg_thread_exit(tx, &cpr, &tx->tx_timelimit_thread);
-}
-
-int
-txg_stalled(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
-}
-
-void
-txg_suspend(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- /* XXX some code paths suspend when they are already suspended! */
- rw_enter(&tx->tx_suspend, RW_READER);
-}
-
-void
-txg_resume(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- rw_exit(&tx->tx_suspend);
-}
-
-/*
- * Per-txg object lists.
- */
-void
-txg_list_create(txg_list_t *tl, size_t offset)
-{
- int t;
-
- mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
-
- tl->tl_offset = offset;
-
- for (t = 0; t < TXG_SIZE; t++)
- tl->tl_head[t] = NULL;
-}
-
-void
-txg_list_destroy(txg_list_t *tl)
-{
- int t;
-
- for (t = 0; t < TXG_SIZE; t++)
- ASSERT(txg_list_empty(tl, t));
-
- mutex_destroy(&tl->tl_lock);
-}
-
-int
-txg_list_empty(txg_list_t *tl, uint64_t txg)
-{
- return (tl->tl_head[txg & TXG_MASK] == NULL);
-}
-
-/*
- * Add an entry to the list.
- * Returns 0 if it's a new entry, 1 if it's already there.
- */
-int
-txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
- int already_on_list;
-
- mutex_enter(&tl->tl_lock);
- already_on_list = tn->tn_member[t];
- if (!already_on_list) {
- tn->tn_member[t] = 1;
- tn->tn_next[t] = tl->tl_head[t];
- tl->tl_head[t] = tn;
- }
- mutex_exit(&tl->tl_lock);
-
- return (already_on_list);
-}
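
Usage sketch for the facility: a client embeds a txg_node_t in its own structure and registers objects per txg, much as vdev.c does below with metaslabs. The widget type here is hypothetical.

	typedef struct widget {
		txg_node_t w_txg_node;
		/* ... payload ... */
	} widget_t;

	widget_t myw, *w;
	txg_list_t tl;

	txg_list_create(&tl, offsetof(widget_t, w_txg_node));
	(void) txg_list_add(&tl, &myw, txg);	/* returns 0: newly added */
	while ((w = txg_list_remove(&tl, txg)) != NULL) {
		/* process w for this txg */
	}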
-
-/*
- * Remove the head of the list and return it.
- */
-void *
-txg_list_remove(txg_list_t *tl, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn;
- void *p = NULL;
-
- mutex_enter(&tl->tl_lock);
- if ((tn = tl->tl_head[t]) != NULL) {
- p = (char *)tn - tl->tl_offset;
- tl->tl_head[t] = tn->tn_next[t];
- tn->tn_next[t] = NULL;
- tn->tn_member[t] = 0;
- }
- mutex_exit(&tl->tl_lock);
-
- return (p);
-}
-
-/*
- * Remove a specific item from the list and return it.
- */
-void *
-txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn, **tp;
-
- mutex_enter(&tl->tl_lock);
-
- for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
- if ((char *)tn - tl->tl_offset == p) {
- *tp = tn->tn_next[t];
- tn->tn_next[t] = NULL;
- tn->tn_member[t] = 0;
- mutex_exit(&tl->tl_lock);
- return (p);
- }
- }
-
- mutex_exit(&tl->tl_lock);
-
- return (NULL);
-}
-
-int
-txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
-
- return (tn->tn_member[t]);
-}
-
-/*
- * Walk a txg list -- only safe if you know it's not changing.
- */
-void *
-txg_list_head(txg_list_t *tl, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn = tl->tl_head[t];
-
- return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
-}
-
-void *
-txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
-
- tn = tn->tn_next[t];
-
- return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c b/sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
deleted file mode 100644
index 34d7e0c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/uberblock_impl.h>
-#include <sys/vdev_impl.h>
-
-int
-uberblock_verify(uberblock_t *ub)
-{
- if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC))
- byteswap_uint64_array(ub, sizeof (uberblock_t));
-
- if (ub->ub_magic != UBERBLOCK_MAGIC)
- return (EINVAL);
-
- return (0);
-}
-
-/*
- * Update the uberblock and return a boolean value indicating whether
- * anything changed in this transaction group.
- */
-int
-uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
-{
- ASSERT(ub->ub_txg < txg);
-
- /*
- * We explicitly do not set ub_version here, so that older versions
- * continue to be written with the previous uberblock version.
- */
- ub->ub_magic = UBERBLOCK_MAGIC;
- ub->ub_txg = txg;
- ub->ub_guid_sum = rvd->vdev_guid_sum;
- ub->ub_timestamp = gethrestime_sec();
-
- return (ub->ub_rootbp.blk_birth == txg);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/unique.c b/sys/contrib/opensolaris/uts/common/fs/zfs/unique.c
deleted file mode 100644
index b52e729..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/unique.c
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/avl.h>
-#include <sys/unique.h>
-
-static avl_tree_t unique_avl;
-static kmutex_t unique_mtx; /* Lock never initialized. */
-SX_SYSINIT(unique, &unique_mtx, "unique lock");
-
-typedef struct unique {
- avl_node_t un_link;
- uint64_t un_value;
-} unique_t;
-
-#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1)
-
-static int
-unique_compare(const void *a, const void *b)
-{
- const unique_t *una = a;
- const unique_t *unb = b;
-
- if (una->un_value < unb->un_value)
- return (-1);
- if (una->un_value > unb->un_value)
- return (+1);
- return (0);
-}
-
-void
-unique_init(void)
-{
- avl_create(&unique_avl, unique_compare,
- sizeof (unique_t), offsetof(unique_t, un_link));
-}
-
-uint64_t
-unique_create(void)
-{
- return (unique_insert(0));
-}
-
-uint64_t
-unique_insert(uint64_t value)
-{
- avl_index_t idx;
- unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP);
-
- un->un_value = value;
-
- mutex_enter(&unique_mtx);
- while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK ||
- avl_find(&unique_avl, un, &idx)) {
- mutex_exit(&unique_mtx);
- (void) random_get_pseudo_bytes((void*)&un->un_value,
- sizeof (un->un_value));
- un->un_value &= UNIQUE_MASK;
- mutex_enter(&unique_mtx);
- }
-
- avl_insert(&unique_avl, un, idx);
- mutex_exit(&unique_mtx);
-
- return (un->un_value);
-}
-
-void
-unique_remove(uint64_t value)
-{
- unique_t un_tofind;
- unique_t *un;
-
- un_tofind.un_value = value;
- mutex_enter(&unique_mtx);
- un = avl_find(&unique_avl, &un_tofind, NULL);
- if (un != NULL) {
- avl_remove(&unique_avl, un);
- kmem_free(un, sizeof (unique_t));
- }
- mutex_exit(&unique_mtx);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c
deleted file mode 100644
index b966099..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ /dev/null
@@ -1,1915 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/fm/fs/zfs.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/vdev_impl.h>
-#include <sys/uberblock_impl.h>
-#include <sys/metaslab.h>
-#include <sys/metaslab_impl.h>
-#include <sys/space_map.h>
-#include <sys/zio.h>
-#include <sys/zap.h>
-#include <sys/fs/zfs.h>
-
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
-
-/*
- * Virtual device management.
- */
-
-static vdev_ops_t *vdev_ops_table[] = {
- &vdev_root_ops,
- &vdev_raidz_ops,
- &vdev_mirror_ops,
- &vdev_replacing_ops,
- &vdev_spare_ops,
-#ifdef _KERNEL
- &vdev_geom_ops,
-#else
- &vdev_disk_ops,
- &vdev_file_ops,
-#endif
- &vdev_missing_ops,
- NULL
-};
-
-/* maximum scrub/resilver I/O queue */
-int zfs_scrub_limit = 70;
-
-/*
- * Given a vdev type, return the appropriate ops vector.
- */
-static vdev_ops_t *
-vdev_getops(const char *type)
-{
- vdev_ops_t *ops, **opspp;
-
- for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
- if (strcmp(ops->vdev_op_type, type) == 0)
- break;
-
- return (ops);
-}
-
-/*
- * Default asize function: return the MAX of psize with the asize of
- * all children. This is what's used by anything other than RAID-Z.
- */
-uint64_t
-vdev_default_asize(vdev_t *vd, uint64_t psize)
-{
- uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
- uint64_t csize;
- uint64_t c;
-
- for (c = 0; c < vd->vdev_children; c++) {
- csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
- asize = MAX(asize, csize);
- }
-
- return (asize);
-}
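
Concretely: with a top-level ashift of 9 (512-byte sectors, an assumption for illustration), a psize of 1000 bytes rounds up to P2ROUNDUP(1000, 512) = 1024; if the children then report asizes of 1024 and 1536, the function returns 1536.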
-
-/*
- * Get the replaceable or attachable device size.
- * If the parent is a mirror or raidz, the replaceable size is the minimum
- * psize of all its children. For the rest, just return our own psize.
- *
- * e.g.
- * psize rsize
- * root - -
- * mirror/raidz - -
- * disk1 20g 20g
- * disk2 40g 20g
- * disk3 80g 80g
- */
-uint64_t
-vdev_get_rsize(vdev_t *vd)
-{
- vdev_t *pvd, *cvd;
- uint64_t c, rsize;
-
- pvd = vd->vdev_parent;
-
- /*
- * If our parent is NULL or the root, just return our own psize.
- */
- if (pvd == NULL || pvd->vdev_parent == NULL)
- return (vd->vdev_psize);
-
- rsize = 0;
-
- for (c = 0; c < pvd->vdev_children; c++) {
- cvd = pvd->vdev_child[c];
- rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
- }
-
- return (rsize);
-}
-
-vdev_t *
-vdev_lookup_top(spa_t *spa, uint64_t vdev)
-{
- vdev_t *rvd = spa->spa_root_vdev;
-
- if (vdev < rvd->vdev_children)
- return (rvd->vdev_child[vdev]);
-
- return (NULL);
-}
-
-vdev_t *
-vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
-{
- int c;
- vdev_t *mvd;
-
- if (vd->vdev_guid == guid)
- return (vd);
-
- for (c = 0; c < vd->vdev_children; c++)
- if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
- NULL)
- return (mvd);
-
- return (NULL);
-}
-
-void
-vdev_add_child(vdev_t *pvd, vdev_t *cvd)
-{
- size_t oldsize, newsize;
- uint64_t id = cvd->vdev_id;
- vdev_t **newchild;
-
- ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
- ASSERT(cvd->vdev_parent == NULL);
-
- cvd->vdev_parent = pvd;
-
- if (pvd == NULL)
- return;
-
- ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
-
- oldsize = pvd->vdev_children * sizeof (vdev_t *);
- pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
- newsize = pvd->vdev_children * sizeof (vdev_t *);
-
- newchild = kmem_zalloc(newsize, KM_SLEEP);
- if (pvd->vdev_child != NULL) {
- bcopy(pvd->vdev_child, newchild, oldsize);
- kmem_free(pvd->vdev_child, oldsize);
- }
-
- pvd->vdev_child = newchild;
- pvd->vdev_child[id] = cvd;
-
- cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
- ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
-
- /*
- * Walk up all ancestors to update guid sum.
- */
- for (; pvd != NULL; pvd = pvd->vdev_parent)
- pvd->vdev_guid_sum += cvd->vdev_guid_sum;
-
- if (cvd->vdev_ops->vdev_op_leaf)
- cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
-}
-
-void
-vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
-{
- int c;
- uint_t id = cvd->vdev_id;
-
- ASSERT(cvd->vdev_parent == pvd);
-
- if (pvd == NULL)
- return;
-
- ASSERT(id < pvd->vdev_children);
- ASSERT(pvd->vdev_child[id] == cvd);
-
- pvd->vdev_child[id] = NULL;
- cvd->vdev_parent = NULL;
-
- for (c = 0; c < pvd->vdev_children; c++)
- if (pvd->vdev_child[c])
- break;
-
- if (c == pvd->vdev_children) {
- kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
- pvd->vdev_child = NULL;
- pvd->vdev_children = 0;
- }
-
- /*
- * Walk up all ancestors to update guid sum.
- */
- for (; pvd != NULL; pvd = pvd->vdev_parent)
- pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
-
- if (cvd->vdev_ops->vdev_op_leaf)
- cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
-}
-
-/*
- * Remove any holes in the child array.
- */
-void
-vdev_compact_children(vdev_t *pvd)
-{
- vdev_t **newchild, *cvd;
- int oldc = pvd->vdev_children;
- int newc, c;
-
- ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));
-
- for (c = newc = 0; c < oldc; c++)
- if (pvd->vdev_child[c])
- newc++;
-
- newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
-
- for (c = newc = 0; c < oldc; c++) {
- if ((cvd = pvd->vdev_child[c]) != NULL) {
- newchild[newc] = cvd;
- cvd->vdev_id = newc++;
- }
- }
-
- kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
- pvd->vdev_child = newchild;
- pvd->vdev_children = newc;
-}
-
-/*
- * Allocate and minimally initialize a vdev_t.
- */
-static vdev_t *
-vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
-{
- vdev_t *vd;
-
- vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
-
- if (spa->spa_root_vdev == NULL) {
- ASSERT(ops == &vdev_root_ops);
- spa->spa_root_vdev = vd;
- }
-
- if (guid == 0) {
- if (spa->spa_root_vdev == vd) {
- /*
- * The root vdev's guid will also be the pool guid,
- * which must be unique among all pools.
- */
- while (guid == 0 || spa_guid_exists(guid, 0))
- guid = spa_get_random(-1ULL);
- } else {
- /*
- * Any other vdev's guid must be unique within the pool.
- */
- while (guid == 0 ||
- spa_guid_exists(spa_guid(spa), guid))
- guid = spa_get_random(-1ULL);
- }
- ASSERT(!spa_guid_exists(spa_guid(spa), guid));
- }
-
- vd->vdev_spa = spa;
- vd->vdev_id = id;
- vd->vdev_guid = guid;
- vd->vdev_guid_sum = guid;
- vd->vdev_ops = ops;
- vd->vdev_state = VDEV_STATE_CLOSED;
-
- mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
- space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
- space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
- txg_list_create(&vd->vdev_ms_list,
- offsetof(struct metaslab, ms_txg_node));
- txg_list_create(&vd->vdev_dtl_list,
- offsetof(struct vdev, vdev_dtl_node));
- vd->vdev_stat.vs_timestamp = gethrtime();
-
- return (vd);
-}
-
-/*
- * Free a vdev_t that has been removed from service.
- */
-static void
-vdev_free_common(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
-
- if (vd->vdev_path)
- spa_strfree(vd->vdev_path);
- if (vd->vdev_devid)
- spa_strfree(vd->vdev_devid);
-
- if (vd->vdev_isspare)
- spa_spare_remove(vd);
-
- txg_list_destroy(&vd->vdev_ms_list);
- txg_list_destroy(&vd->vdev_dtl_list);
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_unload(&vd->vdev_dtl_map);
- space_map_destroy(&vd->vdev_dtl_map);
- space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
- space_map_destroy(&vd->vdev_dtl_scrub);
- mutex_exit(&vd->vdev_dtl_lock);
- mutex_destroy(&vd->vdev_dtl_lock);
- mutex_destroy(&vd->vdev_stat_lock);
-
- if (vd == spa->spa_root_vdev)
- spa->spa_root_vdev = NULL;
-
- kmem_free(vd, sizeof (vdev_t));
-}
-
-/*
- * Allocate a new vdev. The 'alloctype' is used to control whether we are
- * creating a new vdev or loading an existing one - the behavior is slightly
- * different for each case.
- */
-int
-vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
- int alloctype)
-{
- vdev_ops_t *ops;
- char *type;
- uint64_t guid = 0;
- vdev_t *vd;
-
- ASSERT(spa_config_held(spa, RW_WRITER));
-
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
- return (EINVAL);
-
- if ((ops = vdev_getops(type)) == NULL)
- return (EINVAL);
-
- /*
- * If this is a load, get the vdev guid from the nvlist.
- * Otherwise, vdev_alloc_common() will generate one for us.
- */
- if (alloctype == VDEV_ALLOC_LOAD) {
- uint64_t label_id;
-
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
- label_id != id)
- return (EINVAL);
-
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
- return (EINVAL);
- } else if (alloctype == VDEV_ALLOC_SPARE) {
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
- return (EINVAL);
- }
-
- /*
- * The first allocated vdev must be of type 'root'.
- */
- if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
- return (EINVAL);
-
- vd = vdev_alloc_common(spa, id, guid, ops);
-
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
- vd->vdev_path = spa_strdup(vd->vdev_path);
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
- vd->vdev_devid = spa_strdup(vd->vdev_devid);
-
- /*
- * Set the nparity property for RAID-Z vdevs.
- */
- if (ops == &vdev_raidz_ops) {
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
- &vd->vdev_nparity) == 0) {
- /*
- * Currently, we can only support 2 parity devices.
- */
- if (vd->vdev_nparity > 2)
- return (EINVAL);
- /*
- * Older versions can only support 1 parity device.
- */
- if (vd->vdev_nparity == 2 &&
- spa_version(spa) < ZFS_VERSION_RAID6)
- return (ENOTSUP);
-
- } else {
- /*
- * We require the parity to be specified for SPAs that
- * support multiple parity levels.
- */
- if (spa_version(spa) >= ZFS_VERSION_RAID6)
- return (EINVAL);
-
- /*
- * Otherwise, we default to 1 parity device for RAID-Z.
- */
- vd->vdev_nparity = 1;
- }
- } else {
- vd->vdev_nparity = 0;
- }
-
- /*
- * Set the whole_disk property. If it's not specified, leave the value
- * as -1.
- */
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
- &vd->vdev_wholedisk) != 0)
- vd->vdev_wholedisk = -1ULL;
-
- /*
- * Look for the 'not present' flag. This will only be set if the device
- * was not present at the time of import.
- */
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
- &vd->vdev_not_present);
-
- /*
- * Get the alignment requirement.
- */
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
-
- /*
- * If we're a top-level vdev, try to load the allocation parameters.
- */
- if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
- &vd->vdev_ms_array);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
- &vd->vdev_ms_shift);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
- &vd->vdev_asize);
- }
-
- /*
- * If we're a leaf vdev, try to load the DTL object and offline state.
- */
- if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
- &vd->vdev_dtl.smo_object);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
- &vd->vdev_offline);
- }
-
- /*
- * Add ourselves to the parent's list of children.
- */
- vdev_add_child(parent, vd);
-
- *vdp = vd;
-
- return (0);
-}
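A minimal usage sketch (illustrative only, not from the original source; the device path is a placeholder and error handling is elided): constructing the nvlist that vdev_alloc() consumes for a new disk leaf. With alloctype VDEV_ALLOC_ADD no guid is looked up in the nvlist, so vdev_alloc_common() mints one.

	nvlist_t *nv;
	vdev_t *vd;

	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) == 0);
	VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
	    "/dev/dsk/c0t0d0s0") == 0);		/* placeholder path */

	if (vdev_alloc(spa, &vd, nv, spa->spa_root_vdev, 0,
	    VDEV_ALLOC_ADD) == 0) {
		/* vd is now a child of the root vdev with a fresh guid */
	}
	nvlist_free(nv);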
-
-void
-vdev_free(vdev_t *vd)
-{
- int c;
-
- /*
- * vdev_free() implies closing the vdev first. This is simpler than
- * trying to ensure complicated semantics for all callers.
- */
- vdev_close(vd);
-
- ASSERT(!list_link_active(&vd->vdev_dirty_node));
-
- /*
- * Free all children.
- */
- for (c = 0; c < vd->vdev_children; c++)
- vdev_free(vd->vdev_child[c]);
-
- ASSERT(vd->vdev_child == NULL);
- ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
-
- /*
- * Discard allocation state.
- */
- if (vd == vd->vdev_top)
- vdev_metaslab_fini(vd);
-
- ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
- ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
- ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
-
- /*
- * Remove this vdev from its parent's child list.
- */
- vdev_remove_child(vd->vdev_parent, vd);
-
- ASSERT(vd->vdev_parent == NULL);
-
- vdev_free_common(vd);
-}
-
-/*
- * Transfer top-level vdev state from svd to tvd.
- */
-static void
-vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
-{
- spa_t *spa = svd->vdev_spa;
- metaslab_t *msp;
- vdev_t *vd;
- int t;
-
- ASSERT(tvd == tvd->vdev_top);
-
- tvd->vdev_ms_array = svd->vdev_ms_array;
- tvd->vdev_ms_shift = svd->vdev_ms_shift;
- tvd->vdev_ms_count = svd->vdev_ms_count;
-
- svd->vdev_ms_array = 0;
- svd->vdev_ms_shift = 0;
- svd->vdev_ms_count = 0;
-
- tvd->vdev_mg = svd->vdev_mg;
- tvd->vdev_ms = svd->vdev_ms;
-
- svd->vdev_mg = NULL;
- svd->vdev_ms = NULL;
-
- if (tvd->vdev_mg != NULL)
- tvd->vdev_mg->mg_vd = tvd;
-
- tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
- tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
- tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
-
- svd->vdev_stat.vs_alloc = 0;
- svd->vdev_stat.vs_space = 0;
- svd->vdev_stat.vs_dspace = 0;
-
- for (t = 0; t < TXG_SIZE; t++) {
- while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
- (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
- while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
- (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
- if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
- (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
- }
-
- if (list_link_active(&svd->vdev_dirty_node)) {
- vdev_config_clean(svd);
- vdev_config_dirty(tvd);
- }
-
- tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted;
- svd->vdev_reopen_wanted = 0;
-
- tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
- svd->vdev_deflate_ratio = 0;
-}
-
-static void
-vdev_top_update(vdev_t *tvd, vdev_t *vd)
-{
- int c;
-
- if (vd == NULL)
- return;
-
- vd->vdev_top = tvd;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_top_update(tvd, vd->vdev_child[c]);
-}
-
-/*
- * Add a mirror/replacing vdev above an existing vdev.
- */
-vdev_t *
-vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
-{
- spa_t *spa = cvd->vdev_spa;
- vdev_t *pvd = cvd->vdev_parent;
- vdev_t *mvd;
-
- ASSERT(spa_config_held(spa, RW_WRITER));
-
- mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
-
- mvd->vdev_asize = cvd->vdev_asize;
- mvd->vdev_ashift = cvd->vdev_ashift;
- mvd->vdev_state = cvd->vdev_state;
-
- vdev_remove_child(pvd, cvd);
- vdev_add_child(pvd, mvd);
- cvd->vdev_id = mvd->vdev_children;
- vdev_add_child(mvd, cvd);
- vdev_top_update(cvd->vdev_top, cvd->vdev_top);
-
- if (mvd == mvd->vdev_top)
- vdev_top_transfer(cvd, mvd);
-
- return (mvd);
-}
-
-/*
- * Remove a 1-way mirror/replacing vdev from the tree.
- */
-void
-vdev_remove_parent(vdev_t *cvd)
-{
- vdev_t *mvd = cvd->vdev_parent;
- vdev_t *pvd = mvd->vdev_parent;
-
- ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
-
- ASSERT(mvd->vdev_children == 1);
- ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
- mvd->vdev_ops == &vdev_replacing_ops ||
- mvd->vdev_ops == &vdev_spare_ops);
- cvd->vdev_ashift = mvd->vdev_ashift;
-
- vdev_remove_child(mvd, cvd);
- vdev_remove_child(pvd, mvd);
- cvd->vdev_id = mvd->vdev_id;
- vdev_add_child(pvd, cvd);
- /*
- * If we created a new toplevel vdev, then we need to change the child's
- * vdev GUID to match the old toplevel vdev. Otherwise, we could have
- * detached an offline device, and when we go to import the pool we'll
- * think we have two toplevel vdevs, instead of a different version of
- * the same toplevel vdev.
- */
- if (cvd->vdev_top == cvd) {
- pvd->vdev_guid_sum -= cvd->vdev_guid;
- cvd->vdev_guid_sum -= cvd->vdev_guid;
- cvd->vdev_guid = mvd->vdev_guid;
- cvd->vdev_guid_sum += mvd->vdev_guid;
- pvd->vdev_guid_sum += cvd->vdev_guid;
- }
- vdev_top_update(cvd->vdev_top, cvd->vdev_top);
-
- if (cvd == cvd->vdev_top)
- vdev_top_transfer(mvd, cvd);
-
- ASSERT(mvd->vdev_children == 0);
- vdev_free(mvd);
-}
-
-int
-vdev_metaslab_init(vdev_t *vd, uint64_t txg)
-{
- spa_t *spa = vd->vdev_spa;
- objset_t *mos = spa->spa_meta_objset;
- metaslab_class_t *mc = spa_metaslab_class_select(spa);
- uint64_t m;
- uint64_t oldc = vd->vdev_ms_count;
- uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
- metaslab_t **mspp;
- int error;
-
- if (vd->vdev_ms_shift == 0) /* not being allocated from yet */
- return (0);
-
- dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
-
- ASSERT(oldc <= newc);
-
- if (vd->vdev_mg == NULL)
- vd->vdev_mg = metaslab_group_create(mc, vd);
-
- mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
-
- if (oldc != 0) {
- bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
- kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
- }
-
- vd->vdev_ms = mspp;
- vd->vdev_ms_count = newc;
-
- for (m = oldc; m < newc; m++) {
- space_map_obj_t smo = { 0, 0, 0 };
- if (txg == 0) {
- uint64_t object = 0;
- error = dmu_read(mos, vd->vdev_ms_array,
- m * sizeof (uint64_t), sizeof (uint64_t), &object);
- if (error)
- return (error);
- if (object != 0) {
- dmu_buf_t *db;
- error = dmu_bonus_hold(mos, object, FTAG, &db);
- if (error)
- return (error);
- ASSERT3U(db->db_size, ==, sizeof (smo));
- bcopy(db->db_data, &smo, db->db_size);
- ASSERT3U(smo.smo_object, ==, object);
- dmu_buf_rele(db, FTAG);
- }
- }
- vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
- m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
- }
-
- return (0);
-}
-
-void
-vdev_metaslab_fini(vdev_t *vd)
-{
- uint64_t m;
- uint64_t count = vd->vdev_ms_count;
-
- if (vd->vdev_ms != NULL) {
- for (m = 0; m < count; m++)
- if (vd->vdev_ms[m] != NULL)
- metaslab_fini(vd->vdev_ms[m]);
- kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
- vd->vdev_ms = NULL;
- }
-}
-
-/*
- * Prepare a virtual device for access.
- */
-int
-vdev_open(vdev_t *vd)
-{
- int error;
- int c;
- uint64_t osize = 0;
- uint64_t asize, psize;
- uint64_t ashift = 0;
-
- ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
- vd->vdev_state == VDEV_STATE_CANT_OPEN ||
- vd->vdev_state == VDEV_STATE_OFFLINE);
-
- if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
- vd->vdev_fault_arg >>= 1;
- else
- vd->vdev_fault_mode = VDEV_FAULT_NONE;
-
- vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
-
- if (vd->vdev_ops->vdev_op_leaf) {
- vdev_cache_init(vd);
- vdev_queue_init(vd);
- vd->vdev_cache_active = B_TRUE;
- }
-
- if (vd->vdev_offline) {
- ASSERT(vd->vdev_children == 0);
- vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
- return (ENXIO);
- }
-
- error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
-
- if (zio_injection_enabled && error == 0)
- error = zio_handle_device_injection(vd, ENXIO);
-
- dprintf("%s = %d, osize %llu, state = %d\n",
- vdev_description(vd), error, osize, vd->vdev_state);
-
- if (error) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- vd->vdev_stat.vs_aux);
- return (error);
- }
-
- vd->vdev_state = VDEV_STATE_HEALTHY;
-
- for (c = 0; c < vd->vdev_children; c++)
- if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
- VDEV_AUX_NONE);
- break;
- }
-
- osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
-
- if (vd->vdev_children == 0) {
- if (osize < SPA_MINDEVSIZE) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_TOO_SMALL);
- return (EOVERFLOW);
- }
- psize = osize;
- asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
- } else {
- if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
- (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_TOO_SMALL);
- return (EOVERFLOW);
- }
- psize = 0;
- asize = osize;
- }
-
- vd->vdev_psize = psize;
-
- if (vd->vdev_asize == 0) {
- /*
- * This is the first-ever open, so use the computed values.
- * For testing purposes, a higher ashift can be requested.
- */
- vd->vdev_asize = asize;
- vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
- } else {
- /*
- * Make sure the alignment requirement hasn't increased.
- */
- if (ashift > vd->vdev_top->vdev_ashift) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LABEL);
- return (EINVAL);
- }
-
- /*
- * Make sure the device hasn't shrunk.
- */
- if (asize < vd->vdev_asize) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LABEL);
- return (EINVAL);
- }
-
- /*
- * If all children are healthy and the asize has increased,
- * then we've experienced dynamic LUN growth.
- */
- if (vd->vdev_state == VDEV_STATE_HEALTHY &&
- asize > vd->vdev_asize) {
- vd->vdev_asize = asize;
- }
- }
-
- /*
- * If this is a top-level vdev, compute the raidz-deflation
- * ratio. Note, we hard-code in 128k (1<<17) because it is the
- * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE
- * changes, this algorithm must never change, or we will
- * inconsistently account for existing bp's.
- */
- if (vd->vdev_top == vd) {
- vd->vdev_deflate_ratio = (1<<17) /
- (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT);
- }
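A worked example (not in the original): for a plain disk, vdev_psize_to_asize() is the identity, so vdev_deflate_ratio = (1<<17) / ((1<<17) >> SPA_MINBLOCKSHIFT) = 131072 / 256 = 512. For a hypothetical RAID-Z vdev that inflates a 128K psize to 160K of asize, the ratio becomes 131072 / (163840 >> 9) = 131072 / 320 = 409 after integer division. vdev_space_update() later multiplies (delta >> SPA_MINBLOCKSHIFT) by this ratio to convert raw space deltas into deflated space.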
-
- /*
- * This allows the ZFS DE to close cases appropriately. If a device
- * goes away and later returns, we want to close the associated case.
- * But it's not enough to simply post this only when a device goes from
- * CANT_OPEN -> HEALTHY. If we reboot the system and the device is
- * back, we also need to close the case (otherwise we will try to replay
- * it). So we have to post this notifier every time. Since this only
- * occurs during pool open or error recovery, this should not be an
- * issue.
- */
- zfs_post_ok(vd->vdev_spa, vd);
-
- return (0);
-}
-
-/*
- * Called once the vdevs are all opened, this routine validates the label
- * contents. This needs to be done before vdev_load() so that we don't
- * inadvertently do repair I/Os to the wrong device, and so that vdev_reopen()
- * won't succeed if the device has been changed underneath.
- *
- * This function will only return failure if one of the vdevs indicates that it
- * has since been destroyed or exported. This is only possible if
- * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
- * will be updated but the function will return 0.
- */
-int
-vdev_validate(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- int c;
- nvlist_t *label;
- uint64_t guid;
- uint64_t state;
-
- for (c = 0; c < vd->vdev_children; c++)
- if (vdev_validate(vd->vdev_child[c]) != 0)
- return (EBADF);
-
- /*
- * If the device has already failed, or was marked offline, don't do
- * any further validation. Otherwise, label I/O will fail and we will
- * overwrite the previous state.
- */
- if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) {
-
- if ((label = vdev_label_read_config(vd)) == NULL) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LABEL);
- return (0);
- }
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
- &guid) != 0 || guid != spa_guid(spa)) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- return (0);
- }
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
- &guid) != 0 || guid != vd->vdev_guid) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- return (0);
- }
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
- &state) != 0) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- return (0);
- }
-
- nvlist_free(label);
-
- if (spa->spa_load_state == SPA_LOAD_OPEN &&
- state != POOL_STATE_ACTIVE)
- return (EBADF);
- }
-
- /*
- * If we were able to open and validate a vdev that was previously
- * marked permanently unavailable, clear that state now.
- */
- if (vd->vdev_not_present)
- vd->vdev_not_present = 0;
-
- return (0);
-}
-
-/*
- * Close a virtual device.
- */
-void
-vdev_close(vdev_t *vd)
-{
- vd->vdev_ops->vdev_op_close(vd);
-
- if (vd->vdev_cache_active) {
- vdev_cache_fini(vd);
- vdev_queue_fini(vd);
- vd->vdev_cache_active = B_FALSE;
- }
-
- /*
- * We record the previous state before we close it, so that if we are
- * doing a reopen(), we don't generate FMA ereports if we notice that
- * it's still faulted.
- */
- vd->vdev_prevstate = vd->vdev_state;
-
- if (vd->vdev_offline)
- vd->vdev_state = VDEV_STATE_OFFLINE;
- else
- vd->vdev_state = VDEV_STATE_CLOSED;
- vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
-}
-
-void
-vdev_reopen(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
-
- ASSERT(spa_config_held(spa, RW_WRITER));
-
- vdev_close(vd);
- (void) vdev_open(vd);
-
- /*
- * Call vdev_validate() here to make sure we have the same device.
- * Otherwise, a device with an invalid label could be successfully
- * opened in response to vdev_reopen().
- *
- * The downside to this is that if the user is simply experimenting by
- * overwriting an entire disk, we'll fault the device rather than
- * demonstrate self-healing capabilities. On the other hand, with
- * proper FMA integration, the series of errors we'd see from the device
- * would result in a faulted device anyway. Given that this doesn't
- * model any real-world corruption, it's better to catch this here and
- * correctly identify that the device has either changed beneath us, or
- * is corrupted beyond recognition.
- */
- (void) vdev_validate(vd);
-
- /*
- * Reassess root vdev's health.
- */
- vdev_propagate_state(spa->spa_root_vdev);
-}
-
-int
-vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
-{
- int error;
-
- /*
- * Normally, partial opens (e.g. of a mirror) are allowed.
- * For a create, however, we want to fail the request if
- * there are any components we can't open.
- */
- error = vdev_open(vd);
-
- if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
- vdev_close(vd);
- return (error ? error : ENXIO);
- }
-
- /*
- * Recursively initialize all labels.
- */
- if ((error = vdev_label_init(vd, txg, isreplacing ?
- VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
- vdev_close(vd);
- return (error);
- }
-
- return (0);
-}
-
-/*
- * This is the latter half of vdev_create(). It is distinct because it
- * involves initiating transactions in order to do metaslab creation.
- * For creation, we want to try to create all vdevs at once and then undo it
- * if anything fails; this is much harder if we have pending transactions.
- */
-void
-vdev_init(vdev_t *vd, uint64_t txg)
-{
- /*
- * Aim for roughly 200 metaslabs per vdev.
- */
- vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
- vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
-
- /*
- * Initialize the vdev's metaslabs. This can't fail because
- * there's nothing to read when creating all new metaslabs.
- */
- VERIFY(vdev_metaslab_init(vd, txg) == 0);
-}
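A worked example (not in the original): for a 1TB (2^40 byte) vdev, vdev_asize / 200 is about 5.5 * 10^9, and highbit() of that is 33, so each metaslab spans 2^33 bytes (8GB) and the vdev gets 2^40 >> 33 = 128 metaslabs, which is "roughly 200" after rounding to a power of two. The MAX() clamp keeps ms_shift at or above SPA_MAXBLOCKSHIFT (17), so a metaslab is never smaller than one 128K block.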
-
-void
-vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
-{
- ASSERT(vd == vd->vdev_top);
- ASSERT(ISP2(flags));
-
- if (flags & VDD_METASLAB)
- (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
-
- if (flags & VDD_DTL)
- (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
-
- (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
-}
-
-void
-vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
-{
- mutex_enter(sm->sm_lock);
- if (!space_map_contains(sm, txg, size))
- space_map_add(sm, txg, size);
- mutex_exit(sm->sm_lock);
-}
-
-int
-vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
-{
- int dirty;
-
- /*
- * Quick test without the lock -- covers the common case that
- * there are no dirty time segments.
- */
- if (sm->sm_space == 0)
- return (0);
-
- mutex_enter(sm->sm_lock);
- dirty = space_map_contains(sm, txg, size);
- mutex_exit(sm->sm_lock);
-
- return (dirty);
-}
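These two helpers reuse the space-map machinery over transaction-group space rather than offset space: a DTL ("dirty time log") segment [txg, txg + size) records txgs whose writes this vdev may be missing. A typical call, as vdev_stat_update() below makes for an unrepaired write error (sketch):

	/* remember that this leaf may be missing the data written in 'txg' */
	vdev_dtl_dirty(&vd->vdev_dtl_map, txg, 1);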
-
-/*
- * Reassess DTLs after a config change or scrub completion.
- */
-void
-vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
-{
- spa_t *spa = vd->vdev_spa;
- int c;
-
- ASSERT(spa_config_held(spa, RW_WRITER));
-
- if (vd->vdev_children == 0) {
- mutex_enter(&vd->vdev_dtl_lock);
- /*
- * We've successfully scrubbed everything up to scrub_txg.
- * Therefore, excise all old DTLs up to that point, then
- * fold in the DTLs for everything we couldn't scrub.
- */
- if (scrub_txg != 0) {
- space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
- space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
- }
- if (scrub_done)
- space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
- mutex_exit(&vd->vdev_dtl_lock);
- if (txg != 0)
- vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
- return;
- }
-
- /*
- * Make sure the DTLs are always correct under the scrub lock.
- */
- if (vd == spa->spa_root_vdev)
- mutex_enter(&spa->spa_scrub_lock);
-
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
- space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
- mutex_exit(&vd->vdev_dtl_lock);
-
- for (c = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
- vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
- space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
- mutex_exit(&vd->vdev_dtl_lock);
- }
-
- if (vd == spa->spa_root_vdev)
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-static int
-vdev_dtl_load(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- space_map_obj_t *smo = &vd->vdev_dtl;
- objset_t *mos = spa->spa_meta_objset;
- dmu_buf_t *db;
- int error;
-
- ASSERT(vd->vdev_children == 0);
-
- if (smo->smo_object == 0)
- return (0);
-
- if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
- return (error);
-
- ASSERT3U(db->db_size, ==, sizeof (*smo));
- bcopy(db->db_data, smo, db->db_size);
- dmu_buf_rele(db, FTAG);
-
- mutex_enter(&vd->vdev_dtl_lock);
- error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos);
- mutex_exit(&vd->vdev_dtl_lock);
-
- return (error);
-}
-
-void
-vdev_dtl_sync(vdev_t *vd, uint64_t txg)
-{
- spa_t *spa = vd->vdev_spa;
- space_map_obj_t *smo = &vd->vdev_dtl;
- space_map_t *sm = &vd->vdev_dtl_map;
- objset_t *mos = spa->spa_meta_objset;
- space_map_t smsync;
- kmutex_t smlock;
- dmu_buf_t *db;
- dmu_tx_t *tx;
-
- dprintf("%s in txg %llu pass %d\n",
- vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
-
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
-
- if (vd->vdev_detached) {
- if (smo->smo_object != 0) {
- int err = dmu_object_free(mos, smo->smo_object, tx);
- ASSERT3U(err, ==, 0);
- smo->smo_object = 0;
- }
- dmu_tx_commit(tx);
- dprintf("detach %s committed in txg %llu\n",
- vdev_description(vd), txg);
- return;
- }
-
- if (smo->smo_object == 0) {
- ASSERT(smo->smo_objsize == 0);
- ASSERT(smo->smo_alloc == 0);
- smo->smo_object = dmu_object_alloc(mos,
- DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
- DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
- ASSERT(smo->smo_object != 0);
- vdev_config_dirty(vd->vdev_top);
- }
-
- mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
-
- space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
- &smlock);
-
- mutex_enter(&smlock);
-
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_walk(sm, space_map_add, &smsync);
- mutex_exit(&vd->vdev_dtl_lock);
-
- space_map_truncate(smo, mos, tx);
- space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
-
- space_map_destroy(&smsync);
-
- mutex_exit(&smlock);
- mutex_destroy(&smlock);
-
- VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
- dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, ==, sizeof (*smo));
- bcopy(smo, db->db_data, db->db_size);
- dmu_buf_rele(db, FTAG);
-
- dmu_tx_commit(tx);
-}
-
-void
-vdev_load(vdev_t *vd)
-{
- int c;
-
- /*
- * Recursively load all children.
- */
- for (c = 0; c < vd->vdev_children; c++)
- vdev_load(vd->vdev_child[c]);
-
- /*
- * If this is a top-level vdev, initialize its metaslabs.
- */
- if (vd == vd->vdev_top &&
- (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
- vdev_metaslab_init(vd, 0) != 0))
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
-
- /*
- * If this is a leaf vdev, load its DTL.
- */
- if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
-}
-
-/*
- * This special case of vdev_validate() is used for hot spares. Its sole purpose
- * is to set the vdev state for the associated vdev. To do this, we make sure
- * that we can open the underlying device, then try to read the label, and make
- * sure that the label is sane and that it hasn't been repurposed to another
- * pool.
- */
-int
-vdev_validate_spare(vdev_t *vd)
-{
- nvlist_t *label;
- uint64_t guid, version;
- uint64_t state;
-
- if ((label = vdev_label_read_config(vd)) == NULL) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- return (-1);
- }
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
- version > ZFS_VERSION ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
- guid != vd->vdev_guid ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- return (-1);
- }
-
- spa_spare_add(vd);
-
- /*
- * We don't actually check the pool state here. If it's in fact in
- * use by another pool, we update this fact on the fly when requested.
- */
- nvlist_free(label);
- return (0);
-}
-
-void
-vdev_sync_done(vdev_t *vd, uint64_t txg)
-{
- metaslab_t *msp;
-
- dprintf("%s txg %llu\n", vdev_description(vd), txg);
-
- while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
- metaslab_sync_done(msp, txg);
-}
-
-void
-vdev_sync(vdev_t *vd, uint64_t txg)
-{
- spa_t *spa = vd->vdev_spa;
- vdev_t *lvd;
- metaslab_t *msp;
- dmu_tx_t *tx;
-
- dprintf("%s txg %llu pass %d\n",
- vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
-
- if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
- ASSERT(vd == vd->vdev_top);
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
- vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
- DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
- ASSERT(vd->vdev_ms_array != 0);
- vdev_config_dirty(vd);
- dmu_tx_commit(tx);
- }
-
- while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
- metaslab_sync(msp, txg);
- (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
- }
-
- while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
- vdev_dtl_sync(lvd, txg);
-
- (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
-}
-
-uint64_t
-vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
-{
- return (vd->vdev_ops->vdev_op_asize(vd, psize));
-}
-
-void
-vdev_io_start(zio_t *zio)
-{
- zio->io_vd->vdev_ops->vdev_op_io_start(zio);
-}
-
-void
-vdev_io_done(zio_t *zio)
-{
- zio->io_vd->vdev_ops->vdev_op_io_done(zio);
-}
-
-const char *
-vdev_description(vdev_t *vd)
-{
- if (vd == NULL || vd->vdev_ops == NULL)
- return ("<unknown>");
-
- if (vd->vdev_path != NULL)
- return (vd->vdev_path);
-
- if (vd->vdev_parent == NULL)
- return (spa_name(vd->vdev_spa));
-
- return (vd->vdev_ops->vdev_op_type);
-}
-
-int
-vdev_online(spa_t *spa, uint64_t guid)
-{
- vdev_t *rvd, *vd;
- uint64_t txg;
-
- txg = spa_vdev_enter(spa);
-
- rvd = spa->spa_root_vdev;
-
- if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
- return (spa_vdev_exit(spa, NULL, txg, ENODEV));
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- dprintf("ONLINE: %s\n", vdev_description(vd));
-
- vd->vdev_offline = B_FALSE;
- vd->vdev_tmpoffline = B_FALSE;
- vdev_reopen(vd->vdev_top);
-
- vdev_config_dirty(vd->vdev_top);
-
- (void) spa_vdev_exit(spa, NULL, txg, 0);
-
- VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
-
- return (0);
-}
-
-int
-vdev_offline(spa_t *spa, uint64_t guid, int istmp)
-{
- vdev_t *rvd, *vd;
- uint64_t txg;
-
- txg = spa_vdev_enter(spa);
-
- rvd = spa->spa_root_vdev;
-
- if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
- return (spa_vdev_exit(spa, NULL, txg, ENODEV));
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- dprintf("OFFLINE: %s\n", vdev_description(vd));
-
- /*
- * If the device isn't already offline, try to offline it.
- */
- if (!vd->vdev_offline) {
- /*
- * If this device's top-level vdev has a non-empty DTL,
- * don't allow the device to be offlined.
- *
- * XXX -- make this more precise by allowing the offline
- * as long as the remaining devices don't have any DTL holes.
- */
- if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
- return (spa_vdev_exit(spa, NULL, txg, EBUSY));
-
- /*
- * Offline this device and reopen its top-level vdev.
- * If this action results in the top-level vdev becoming
- * unusable, undo it and fail the request.
- */
- vd->vdev_offline = B_TRUE;
- vdev_reopen(vd->vdev_top);
- if (vdev_is_dead(vd->vdev_top)) {
- vd->vdev_offline = B_FALSE;
- vdev_reopen(vd->vdev_top);
- return (spa_vdev_exit(spa, NULL, txg, EBUSY));
- }
- }
-
- vd->vdev_tmpoffline = istmp;
-
- vdev_config_dirty(vd->vdev_top);
-
- return (spa_vdev_exit(spa, NULL, txg, 0));
-}
-
-/*
- * Clear the error counts associated with this vdev. Unlike vdev_online() and
- * vdev_offline(), we assume the spa config is locked. We also clear all
- * children. If 'vd' is NULL, then the user wants to clear all vdevs.
- */
-void
-vdev_clear(spa_t *spa, vdev_t *vd)
-{
- int c;
-
- if (vd == NULL)
- vd = spa->spa_root_vdev;
-
- vd->vdev_stat.vs_read_errors = 0;
- vd->vdev_stat.vs_write_errors = 0;
- vd->vdev_stat.vs_checksum_errors = 0;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_clear(spa, vd->vdev_child[c]);
-}
-
-int
-vdev_is_dead(vdev_t *vd)
-{
- return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
-}
-
-int
-vdev_error_inject(vdev_t *vd, zio_t *zio)
-{
- int error = 0;
-
- if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
- return (0);
-
- if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
- return (0);
-
- switch (vd->vdev_fault_mode) {
- case VDEV_FAULT_RANDOM:
- if (spa_get_random(vd->vdev_fault_arg) == 0)
- error = EIO;
- break;
-
- case VDEV_FAULT_COUNT:
- if ((int64_t)--vd->vdev_fault_arg <= 0)
- vd->vdev_fault_mode = VDEV_FAULT_NONE;
- error = EIO;
- break;
- }
-
- if (error != 0) {
- dprintf("returning %d for type %d on %s state %d offset %llx\n",
- error, zio->io_type, vdev_description(vd),
- vd->vdev_state, zio->io_offset);
- }
-
- return (error);
-}
-
-/*
- * Get statistics for the given vdev.
- */
-void
-vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
-{
- vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
- int c, t;
-
- mutex_enter(&vd->vdev_stat_lock);
- bcopy(&vd->vdev_stat, vs, sizeof (*vs));
- vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
- vs->vs_state = vd->vdev_state;
- vs->vs_rsize = vdev_get_rsize(vd);
- mutex_exit(&vd->vdev_stat_lock);
-
- /*
- * If we're getting stats on the root vdev, aggregate the I/O counts
- * over all top-level vdevs (i.e. the direct children of the root).
- */
- if (vd == rvd) {
- for (c = 0; c < rvd->vdev_children; c++) {
- vdev_t *cvd = rvd->vdev_child[c];
- vdev_stat_t *cvs = &cvd->vdev_stat;
-
- mutex_enter(&vd->vdev_stat_lock);
- for (t = 0; t < ZIO_TYPES; t++) {
- vs->vs_ops[t] += cvs->vs_ops[t];
- vs->vs_bytes[t] += cvs->vs_bytes[t];
- }
- vs->vs_read_errors += cvs->vs_read_errors;
- vs->vs_write_errors += cvs->vs_write_errors;
- vs->vs_checksum_errors += cvs->vs_checksum_errors;
- vs->vs_scrub_examined += cvs->vs_scrub_examined;
- vs->vs_scrub_errors += cvs->vs_scrub_errors;
- mutex_exit(&vd->vdev_stat_lock);
- }
- }
-}
-
-void
-vdev_stat_update(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_t *pvd;
- uint64_t txg = zio->io_txg;
- vdev_stat_t *vs = &vd->vdev_stat;
- zio_type_t type = zio->io_type;
- int flags = zio->io_flags;
-
- if (zio->io_error == 0) {
- if (!(flags & ZIO_FLAG_IO_BYPASS)) {
- mutex_enter(&vd->vdev_stat_lock);
- vs->vs_ops[type]++;
- vs->vs_bytes[type] += zio->io_size;
- mutex_exit(&vd->vdev_stat_lock);
- }
- if ((flags & ZIO_FLAG_IO_REPAIR) &&
- zio->io_delegate_list == NULL) {
- mutex_enter(&vd->vdev_stat_lock);
- if (flags & ZIO_FLAG_SCRUB_THREAD)
- vs->vs_scrub_repaired += zio->io_size;
- else
- vs->vs_self_healed += zio->io_size;
- mutex_exit(&vd->vdev_stat_lock);
- }
- return;
- }
-
- if (flags & ZIO_FLAG_SPECULATIVE)
- return;
-
- if (!vdev_is_dead(vd)) {
- mutex_enter(&vd->vdev_stat_lock);
- if (type == ZIO_TYPE_READ) {
- if (zio->io_error == ECKSUM)
- vs->vs_checksum_errors++;
- else
- vs->vs_read_errors++;
- }
- if (type == ZIO_TYPE_WRITE)
- vs->vs_write_errors++;
- mutex_exit(&vd->vdev_stat_lock);
- }
-
- if (type == ZIO_TYPE_WRITE) {
- if (txg == 0 || vd->vdev_children != 0)
- return;
- if (flags & ZIO_FLAG_SCRUB_THREAD) {
- ASSERT(flags & ZIO_FLAG_IO_REPAIR);
- for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
- vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
- }
- if (!(flags & ZIO_FLAG_IO_REPAIR)) {
- if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
- return;
- vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
- for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
- vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
- }
- }
-}
-
-void
-vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
-{
- int c;
- vdev_stat_t *vs = &vd->vdev_stat;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
-
- mutex_enter(&vd->vdev_stat_lock);
-
- if (type == POOL_SCRUB_NONE) {
- /*
- * Update completion and end time. Leave everything else alone
- * so we can report what happened during the previous scrub.
- */
- vs->vs_scrub_complete = complete;
- vs->vs_scrub_end = gethrestime_sec();
- } else {
- vs->vs_scrub_type = type;
- vs->vs_scrub_complete = 0;
- vs->vs_scrub_examined = 0;
- vs->vs_scrub_repaired = 0;
- vs->vs_scrub_errors = 0;
- vs->vs_scrub_start = gethrestime_sec();
- vs->vs_scrub_end = 0;
- }
-
- mutex_exit(&vd->vdev_stat_lock);
-}
-
-/*
- * Update the in-core space usage stats for this vdev and the root vdev.
- */
-void
-vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta)
-{
- ASSERT(vd == vd->vdev_top);
- int64_t dspace_delta = space_delta;
-
- do {
- if (vd->vdev_ms_count) {
- /*
- * If this is a top-level vdev, apply the
- * inverse of its psize-to-asize (i.e. RAID-Z)
- * space-expansion factor. We must calculate
- * this here and not at the root vdev because
- * the root vdev's psize-to-asize is simply the
- * max of its children's, thus not accurate
- * enough for us.
- */
- ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
- dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
- vd->vdev_deflate_ratio;
- }
-
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_space += space_delta;
- vd->vdev_stat.vs_alloc += alloc_delta;
- vd->vdev_stat.vs_dspace += dspace_delta;
- mutex_exit(&vd->vdev_stat_lock);
- } while ((vd = vd->vdev_parent) != NULL);
-}
-
-/*
- * Mark a top-level vdev's config as dirty, placing it on the dirty list
- * so that it will be written out next time the vdev configuration is synced.
- * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
- */
-void
-vdev_config_dirty(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- int c;
-
- /*
- * The dirty list is protected by the config lock. The caller must
- * either hold the config lock as writer, or must be the sync thread
- * (which holds the lock as reader). There's only one sync thread,
- * so this is sufficient to ensure mutual exclusion.
- */
- ASSERT(spa_config_held(spa, RW_WRITER) ||
- dsl_pool_sync_context(spa_get_dsl(spa)));
-
- if (vd == rvd) {
- for (c = 0; c < rvd->vdev_children; c++)
- vdev_config_dirty(rvd->vdev_child[c]);
- } else {
- ASSERT(vd == vd->vdev_top);
-
- if (!list_link_active(&vd->vdev_dirty_node))
- list_insert_head(&spa->spa_dirty_list, vd);
- }
-}
-
-void
-vdev_config_clean(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
-
- ASSERT(spa_config_held(spa, RW_WRITER) ||
- dsl_pool_sync_context(spa_get_dsl(spa)));
-
- ASSERT(list_link_active(&vd->vdev_dirty_node));
- list_remove(&spa->spa_dirty_list, vd);
-}
-
-void
-vdev_propagate_state(vdev_t *vd)
-{
- vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
- int degraded = 0, faulted = 0;
- int corrupted = 0;
- int c;
- vdev_t *child;
-
- for (c = 0; c < vd->vdev_children; c++) {
- child = vd->vdev_child[c];
- if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
- faulted++;
- else if (child->vdev_state == VDEV_STATE_DEGRADED)
- degraded++;
-
- if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
- corrupted++;
- }
-
- vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
-
- /*
- * Root special: if there is a toplevel vdev that cannot be
- * opened due to corrupted metadata, then propagate the root
- * vdev's aux state as 'corrupt' rather than 'insufficient
- * replicas'.
- */
- if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN)
- vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
-}
-
-/*
- * Set a vdev's state. If this is during an open, we don't update the parent
- * state, because we're in the process of opening children depth-first.
- * Otherwise, we propagate the change to the parent.
- *
- * If this routine places a device in a faulted state, an appropriate ereport is
- * generated.
- */
-void
-vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
-{
- uint64_t save_state;
-
- if (state == vd->vdev_state) {
- vd->vdev_stat.vs_aux = aux;
- return;
- }
-
- save_state = vd->vdev_state;
-
- vd->vdev_state = state;
- vd->vdev_stat.vs_aux = aux;
-
- /*
- * If we are setting the vdev state to anything but an open state, then
- * always close the underlying device. Otherwise, we keep accessible
- * but invalid devices open forever. We don't call vdev_close() itself,
- * because that implies some extra checks (offline, etc) that we don't
- * want here. This is limited to leaf devices, because otherwise
- * closing the device will affect other children.
- */
- if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
- vd->vdev_ops->vdev_op_close(vd);
-
- if (state == VDEV_STATE_CANT_OPEN) {
- /*
- * If we fail to open a vdev during an import, we mark it as
- * "not available", which signifies that it was never there to
- * begin with. Failure to open such a device is not considered
- * an error.
- */
- if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT &&
- vd->vdev_ops->vdev_op_leaf)
- vd->vdev_not_present = 1;
-
- /*
- * Post the appropriate ereport. If the 'prevstate' field is
- * set to something other than VDEV_STATE_UNKNOWN, it indicates
- * that this is part of a vdev_reopen(). In this case, we don't
- * want to post the ereport if the device was already in the
- * CANT_OPEN state beforehand.
- */
- if (vd->vdev_prevstate != state && !vd->vdev_not_present &&
- vd != vd->vdev_spa->spa_root_vdev) {
- const char *class;
-
- switch (aux) {
- case VDEV_AUX_OPEN_FAILED:
- class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
- break;
- case VDEV_AUX_CORRUPT_DATA:
- class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
- break;
- case VDEV_AUX_NO_REPLICAS:
- class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
- break;
- case VDEV_AUX_BAD_GUID_SUM:
- class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
- break;
- case VDEV_AUX_TOO_SMALL:
- class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
- break;
- case VDEV_AUX_BAD_LABEL:
- class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
- break;
- default:
- class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
- }
-
- zfs_ereport_post(class, vd->vdev_spa,
- vd, NULL, save_state, 0);
- }
- }
-
- if (isopen)
- return;
-
- if (vd->vdev_parent != NULL)
- vdev_propagate_state(vd->vdev_parent);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
deleted file mode 100644
index 4e419b6..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
+++ /dev/null
@@ -1,394 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-
-/*
- * Virtual device read-ahead caching.
- *
- * This file implements a simple LRU read-ahead cache. When the DMU reads
- * a given block, it will often want other, nearby blocks soon thereafter.
- * We take advantage of this by reading a larger disk region and caching
- * the result. In the best case, this can turn 256 back-to-back 512-byte
- * reads into a single 128k read followed by 255 cache hits; this reduces
- * latency dramatically. In the worst case, it can turn an isolated 512-byte
- * read into a 128k read, which doesn't affect latency all that much but is
- * terribly wasteful of bandwidth. A more intelligent version of the cache
- * could keep track of access patterns and not do read-ahead unless it sees
- * at least two temporally close I/Os to the same region. It could also
- * take advantage of semantic information about the I/O. And it could use
- * something faster than an AVL tree; that was chosen solely for convenience.
- *
- * There are five cache operations: allocate, fill, read, write, evict.
- *
- * (1) Allocate. This reserves a cache entry for the specified region.
- * We separate the allocate and fill operations so that multiple threads
- * don't generate I/O for the same cache miss.
- *
- * (2) Fill. When the I/O for a cache miss completes, the fill routine
- * places the data in the previously allocated cache entry.
- *
- * (3) Read. Read data from the cache.
- *
- * (4) Write. Update cache contents after write completion.
- *
- * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry
- * if the total cache size exceeds zfs_vdev_cache_size.
- */
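A worked example using the defaults below (not in the original): with zfs_vdev_cache_bshift = 16 the cache block size (VCBS) is 64K, so a 512-byte read at offset 200192 is looked up under the aligned line offset P2ALIGN(200192, 65536) = 196608 and, on a hit, satisfied by copying from the cached line at phase 200192 - 196608 = 3584. Note that the 256-read/128k figure above corresponds to a bshift of 17; at the shipped default of 16, one fill covers 128 back-to-back 512-byte reads.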
-
-/*
- * These tunables are for performance analysis.
- */
-/*
- * All I/Os smaller than zfs_vdev_cache_max will be turned into
- * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka the software
- * track buffer). At most zfs_vdev_cache_size bytes will be kept in each
- * vdev's vdev_cache.
- */
-int zfs_vdev_cache_max = 1<<14;
-int zfs_vdev_cache_size = 10ULL << 20;
-int zfs_vdev_cache_bshift = 16;
-
-SYSCTL_DECL(_vfs_zfs_vdev);
-SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
-TUNABLE_INT("vfs.zfs.vdev.cache.max", &zfs_vdev_cache_max);
-SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN,
- &zfs_vdev_cache_max, 0, "Maximum I/O request size that increases read size");
-TUNABLE_INT("vfs.zfs.vdev.cache.size", &zfs_vdev_cache_size);
-SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN,
- &zfs_vdev_cache_size, 0, "Size of VDEV cache");
-
-#define VCBS (1 << zfs_vdev_cache_bshift)
-
-static int
-vdev_cache_offset_compare(const void *a1, const void *a2)
-{
- const vdev_cache_entry_t *ve1 = a1;
- const vdev_cache_entry_t *ve2 = a2;
-
- if (ve1->ve_offset < ve2->ve_offset)
- return (-1);
- if (ve1->ve_offset > ve2->ve_offset)
- return (1);
- return (0);
-}
-
-static int
-vdev_cache_lastused_compare(const void *a1, const void *a2)
-{
- const vdev_cache_entry_t *ve1 = a1;
- const vdev_cache_entry_t *ve2 = a2;
-
- if (ve1->ve_lastused < ve2->ve_lastused)
- return (-1);
- if (ve1->ve_lastused > ve2->ve_lastused)
- return (1);
-
- /*
- * Among equally old entries, sort by offset to ensure uniqueness.
- */
- return (vdev_cache_offset_compare(a1, a2));
-}
-
-/*
- * Evict the specified entry from the cache.
- */
-static void
-vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
-{
- ASSERT(MUTEX_HELD(&vc->vc_lock));
- ASSERT(ve->ve_fill_io == NULL);
- ASSERT(ve->ve_data != NULL);
-
- dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n",
- vc, ve->ve_offset, ve->ve_lastused, LBOLT - ve->ve_lastused,
- ve->ve_hits, ve->ve_missed_update);
-
- avl_remove(&vc->vc_lastused_tree, ve);
- avl_remove(&vc->vc_offset_tree, ve);
- zio_buf_free(ve->ve_data, VCBS);
- kmem_free(ve, sizeof (vdev_cache_entry_t));
-}
-
-/*
- * Allocate an entry in the cache. At this point we don't have the data;
- * we're just creating a placeholder so that multiple threads don't all
- * go off and read the same blocks.
- */
-static vdev_cache_entry_t *
-vdev_cache_allocate(zio_t *zio)
-{
- vdev_cache_t *vc = &zio->io_vd->vdev_cache;
- uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
- vdev_cache_entry_t *ve;
-
- ASSERT(MUTEX_HELD(&vc->vc_lock));
-
- if (zfs_vdev_cache_size == 0)
- return (NULL);
-
- /*
- * If adding a new entry would exceed the cache size,
- * evict the oldest entry (LRU).
- */
- if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
- zfs_vdev_cache_size) {
- ve = avl_first(&vc->vc_lastused_tree);
- if (ve->ve_fill_io != NULL) {
- dprintf("can't evict in %p, still filling\n", vc);
- return (NULL);
- }
- ASSERT(ve->ve_hits != 0);
- vdev_cache_evict(vc, ve);
- }
-
- ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
- ve->ve_offset = offset;
- ve->ve_lastused = LBOLT;
- ve->ve_data = zio_buf_alloc(VCBS);
-
- avl_add(&vc->vc_offset_tree, ve);
- avl_add(&vc->vc_lastused_tree, ve);
-
- return (ve);
-}
-
-static void
-vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
-{
- uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
-
- ASSERT(MUTEX_HELD(&vc->vc_lock));
- ASSERT(ve->ve_fill_io == NULL);
-
- if (ve->ve_lastused != LBOLT) {
- avl_remove(&vc->vc_lastused_tree, ve);
- ve->ve_lastused = LBOLT;
- avl_add(&vc->vc_lastused_tree, ve);
- }
-
- ve->ve_hits++;
- bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
-}
-
-/*
- * Fill a previously allocated cache entry with data.
- */
-static void
-vdev_cache_fill(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_cache_t *vc = &vd->vdev_cache;
- vdev_cache_entry_t *ve = zio->io_private;
- zio_t *dio;
-
- ASSERT(zio->io_size == VCBS);
-
- /*
- * Add data to the cache.
- */
- mutex_enter(&vc->vc_lock);
-
- ASSERT(ve->ve_fill_io == zio);
- ASSERT(ve->ve_offset == zio->io_offset);
- ASSERT(ve->ve_data == zio->io_data);
-
- ve->ve_fill_io = NULL;
-
- /*
- * Even if this cache line was invalidated by a missed write update,
- * any reads that were queued up before the missed update are still
- * valid, so we can satisfy them from this line before we evict it.
- */
- for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next)
- vdev_cache_hit(vc, ve, dio);
-
- if (zio->io_error || ve->ve_missed_update)
- vdev_cache_evict(vc, ve);
-
- mutex_exit(&vc->vc_lock);
-
- while ((dio = zio->io_delegate_list) != NULL) {
- zio->io_delegate_list = dio->io_delegate_next;
- dio->io_delegate_next = NULL;
- dio->io_error = zio->io_error;
- zio_next_stage(dio);
- }
-}
-
-/*
- * Read data from the cache. Returns 0 on cache hit, errno on a miss.
- */
-int
-vdev_cache_read(zio_t *zio)
-{
- vdev_cache_t *vc = &zio->io_vd->vdev_cache;
- vdev_cache_entry_t *ve, ve_search;
- uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
- uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
- zio_t *fio;
-
- ASSERT(zio->io_type == ZIO_TYPE_READ);
-
- if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
- return (EINVAL);
-
- if (zio->io_size > zfs_vdev_cache_max)
- return (EOVERFLOW);
-
- /*
- * If the I/O straddles two or more cache blocks, don't cache it.
- */
- if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS))
- return (EXDEV);
-
- ASSERT(cache_phase + zio->io_size <= VCBS);
-
- mutex_enter(&vc->vc_lock);
-
- ve_search.ve_offset = cache_offset;
- ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);
-
- if (ve != NULL) {
- if (ve->ve_missed_update) {
- mutex_exit(&vc->vc_lock);
- return (ESTALE);
- }
-
- if ((fio = ve->ve_fill_io) != NULL) {
- zio->io_delegate_next = fio->io_delegate_list;
- fio->io_delegate_list = zio;
- zio_vdev_io_bypass(zio);
- mutex_exit(&vc->vc_lock);
- return (0);
- }
-
- vdev_cache_hit(vc, ve, zio);
- zio_vdev_io_bypass(zio);
-
- mutex_exit(&vc->vc_lock);
- zio_next_stage(zio);
- return (0);
- }
-
- ve = vdev_cache_allocate(zio);
-
- if (ve == NULL) {
- mutex_exit(&vc->vc_lock);
- return (ENOMEM);
- }
-
- fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
- ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
- ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
- vdev_cache_fill, ve);
-
- ve->ve_fill_io = fio;
- fio->io_delegate_list = zio;
- zio_vdev_io_bypass(zio);
-
- mutex_exit(&vc->vc_lock);
- zio_nowait(fio);
-
- return (0);
-}
-
-/*
- * Update cache contents upon write completion.
- */
-void
-vdev_cache_write(zio_t *zio)
-{
- vdev_cache_t *vc = &zio->io_vd->vdev_cache;
- vdev_cache_entry_t *ve, ve_search;
- uint64_t io_start = zio->io_offset;
- uint64_t io_end = io_start + zio->io_size;
- uint64_t min_offset = P2ALIGN(io_start, VCBS);
- uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
- avl_index_t where;
-
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
-
- mutex_enter(&vc->vc_lock);
-
- ve_search.ve_offset = min_offset;
- ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
-
- if (ve == NULL)
- ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
-
- while (ve != NULL && ve->ve_offset < max_offset) {
- uint64_t start = MAX(ve->ve_offset, io_start);
- uint64_t end = MIN(ve->ve_offset + VCBS, io_end);
-
- if (ve->ve_fill_io != NULL) {
- ve->ve_missed_update = 1;
- } else {
- bcopy((char *)zio->io_data + start - io_start,
- ve->ve_data + start - ve->ve_offset, end - start);
- }
- ve = AVL_NEXT(&vc->vc_offset_tree, ve);
- }
- mutex_exit(&vc->vc_lock);
-}
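A worked example (not in the original): a 16K write at offset 60K (io_start = 61440, io_end = 77824) with VCBS = 64K yields min_offset = P2ALIGN(61440, 65536) = 0 and max_offset = P2ROUNDUP(77824, 65536) = 131072, so the loop can visit the cache lines at 0 and 64K. For the line at 0, start = 61440 and end = 65536, refreshing that line's final 4K; for the line at 64K, start = 65536 and end = 77824, refreshing its first 12K. A line still being filled is instead flagged ve_missed_update and is evicted when the fill completes.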
-
-void
-vdev_cache_init(vdev_t *vd)
-{
- vdev_cache_t *vc = &vd->vdev_cache;
-
- mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
-
- avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
- sizeof (vdev_cache_entry_t),
- offsetof(struct vdev_cache_entry, ve_offset_node));
-
- avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
- sizeof (vdev_cache_entry_t),
- offsetof(struct vdev_cache_entry, ve_lastused_node));
-}
-
-void
-vdev_cache_fini(vdev_t *vd)
-{
- vdev_cache_t *vc = &vd->vdev_cache;
- vdev_cache_entry_t *ve;
-
- mutex_enter(&vc->vc_lock);
- while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
- vdev_cache_evict(vc, ve);
- mutex_exit(&vc->vc_lock);
-
- avl_destroy(&vc->vc_offset_tree);
- avl_destroy(&vc->vc_lastused_tree);
-
- mutex_destroy(&vc->vc_lock);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
deleted file mode 100644
index b965b1c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
+++ /dev/null
@@ -1,363 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_disk.h>
-#include <sys/vdev_impl.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio.h>
-#include <sys/sunldi.h>
-
-/*
- * Virtual device vector for disks.
- */
-
-extern ldi_ident_t zfs_li;
-
-typedef struct vdev_disk_buf {
- buf_t vdb_buf;
- zio_t *vdb_io;
-} vdev_disk_buf_t;
-
-static int
-vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
-{
- vdev_disk_t *dvd;
- struct dk_minfo dkm;
- int error;
-
- /*
- * We must have a pathname, and it must be absolute.
- */
- if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (EINVAL);
- }
-
- dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
-
- /*
- * When opening a disk device, we want to preserve the user's original
- * intent. We always want to open the device by the path the user gave
- * us, even if it is one of multiple paths to the same device. But we
- * also want to be able to survive disks being removed/recabled.
- * Therefore the sequence of opening devices is:
- *
- * 1. Try opening the device by path. For legacy pools without the
- * 'whole_disk' property, attempt to fix the path by appending 's0'.
- *
- * 2. If the devid of the device matches the stored value, return
- * success.
- *
- * 3. Otherwise, the device may have moved. Try opening the device
- * by the devid instead.
- *
- */
- if (vd->vdev_devid != NULL) {
- if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
- &dvd->vd_minor) != 0) {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (EINVAL);
- }
- }
-
- error = EINVAL; /* presume failure */
-
- if (vd->vdev_path != NULL) {
- ddi_devid_t devid;
-
- if (vd->vdev_wholedisk == -1ULL) {
- size_t len = strlen(vd->vdev_path) + 3;
- char *buf = kmem_alloc(len, KM_SLEEP);
- ldi_handle_t lh;
-
- (void) snprintf(buf, len, "%ss0", vd->vdev_path);
-
- if (ldi_open_by_name(buf, spa_mode, kcred,
- &lh, zfs_li) == 0) {
- spa_strfree(vd->vdev_path);
- vd->vdev_path = buf;
- vd->vdev_wholedisk = 1ULL;
- (void) ldi_close(lh, spa_mode, kcred);
- } else {
- kmem_free(buf, len);
- }
- }
-
- error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred,
- &dvd->vd_lh, zfs_li);
-
- /*
- * Compare the devid to the stored value.
- */
- if (error == 0 && vd->vdev_devid != NULL &&
- ldi_get_devid(dvd->vd_lh, &devid) == 0) {
- if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
- error = EINVAL;
- (void) ldi_close(dvd->vd_lh, spa_mode, kcred);
- dvd->vd_lh = NULL;
- }
- ddi_devid_free(devid);
- }
-
- /*
- * If we succeeded in opening the device, but 'vdev_wholedisk'
- * is not yet set, then this must be a slice.
- */
- if (error == 0 && vd->vdev_wholedisk == -1ULL)
- vd->vdev_wholedisk = 0;
- }
-
- /*
- * If we were unable to open by path, or the devid check fails, open by
- * devid instead.
- */
- if (error != 0 && vd->vdev_devid != NULL)
- error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
- spa_mode, kcred, &dvd->vd_lh, zfs_li);
-
- if (error) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- return (error);
- }
-
- /*
- * Determine the actual size of the device.
- */
- if (ldi_get_size(dvd->vd_lh, psize) != 0) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- return (EINVAL);
- }
-
- /*
- * If we own the whole disk, try to enable disk write caching.
- * We ignore errors because it's OK if we can't do it.
- */
- if (vd->vdev_wholedisk == 1) {
- int wce = 1;
- (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
- FKIOCTL, kcred, NULL);
- }
-
- /*
- * Determine the device's minimum transfer size.
- * If the ioctl isn't supported, assume DEV_BSIZE.
- */
- if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm,
- FKIOCTL, kcred, NULL) != 0)
- dkm.dki_lbsize = DEV_BSIZE;
-
- *ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1;
-
- /*
- * Clear the nowritecache bit, so that on a vdev_reopen() we will
- * try again.
- */
- vd->vdev_nowritecache = B_FALSE;
-
- return (0);
-}
-
-static void
-vdev_disk_close(vdev_t *vd)
-{
- vdev_disk_t *dvd = vd->vdev_tsd;
-
- if (dvd == NULL)
- return;
-
- dprintf("removing disk %s, devid %s\n",
- vd->vdev_path ? vd->vdev_path : "<none>",
- vd->vdev_devid ? vd->vdev_devid : "<none>");
-
- if (dvd->vd_minor != NULL)
- ddi_devid_str_free(dvd->vd_minor);
-
- if (dvd->vd_devid != NULL)
- ddi_devid_free(dvd->vd_devid);
-
- if (dvd->vd_lh != NULL)
- (void) ldi_close(dvd->vd_lh, spa_mode, kcred);
-
- kmem_free(dvd, sizeof (vdev_disk_t));
- vd->vdev_tsd = NULL;
-}
-
-static void
-vdev_disk_io_intr(buf_t *bp)
-{
- vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
- zio_t *zio = vdb->vdb_io;
-
- if ((zio->io_error = geterror(bp)) == 0 && bp->b_resid != 0)
- zio->io_error = EIO;
-
- kmem_free(vdb, sizeof (vdev_disk_buf_t));
-
- zio_next_stage_async(zio);
-}
-
-static void
-vdev_disk_ioctl_done(void *zio_arg, int error)
-{
- zio_t *zio = zio_arg;
-
- zio->io_error = error;
-
- zio_next_stage_async(zio);
-}
-
-static void
-vdev_disk_io_start(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_disk_t *dvd = vd->vdev_tsd;
- vdev_disk_buf_t *vdb;
- buf_t *bp;
- int flags, error;
-
- if (zio->io_type == ZIO_TYPE_IOCTL) {
- zio_vdev_io_bypass(zio);
-
- /* XXPOLICY */
- if (vdev_is_dead(vd)) {
- zio->io_error = ENXIO;
- zio_next_stage_async(zio);
- return;
- }
-
- switch (zio->io_cmd) {
-
- case DKIOCFLUSHWRITECACHE:
-
- if (zfs_nocacheflush)
- break;
-
- if (vd->vdev_nowritecache) {
- zio->io_error = ENOTSUP;
- break;
- }
-
- zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done;
- zio->io_dk_callback.dkc_cookie = zio;
-
- error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
- (uintptr_t)&zio->io_dk_callback,
- FKIOCTL, kcred, NULL);
-
- if (error == 0) {
- /*
-					 * The ioctl will be done asynchronously,
- * and will call vdev_disk_ioctl_done()
- * upon completion.
- */
- return;
- } else if (error == ENOTSUP) {
- /*
- * If we get ENOTSUP, we know that no future
- * attempts will ever succeed. In this case we
- * set a persistent bit so that we don't bother
- * with the ioctl in the future.
- */
- vd->vdev_nowritecache = B_TRUE;
- }
- zio->io_error = error;
-
- break;
-
- default:
- zio->io_error = ENOTSUP;
- }
-
- zio_next_stage_async(zio);
- return;
- }
-
- if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
- return;
-
- if ((zio = vdev_queue_io(zio)) == NULL)
- return;
-
- flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
- flags |= B_BUSY | B_NOCACHE;
- if (zio->io_flags & ZIO_FLAG_FAILFAST)
- flags |= B_FAILFAST;
-
- vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);
-
- vdb->vdb_io = zio;
- bp = &vdb->vdb_buf;
-
- bioinit(bp);
- bp->b_flags = flags;
- bp->b_bcount = zio->io_size;
- bp->b_un.b_addr = zio->io_data;
- bp->b_lblkno = lbtodb(zio->io_offset);
- bp->b_bufsize = zio->io_size;
- bp->b_iodone = (int (*)())vdev_disk_io_intr;
-
- /* XXPOLICY */
- error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
- if (error) {
- zio->io_error = error;
- bioerror(bp, error);
- bp->b_resid = bp->b_bcount;
- bp->b_iodone(bp);
- return;
- }
-
- error = ldi_strategy(dvd->vd_lh, bp);
- /* ldi_strategy() will return non-zero only on programming errors */
- ASSERT(error == 0);
-}
-
-static void
-vdev_disk_io_done(zio_t *zio)
-{
- vdev_queue_io_done(zio);
-
- if (zio->io_type == ZIO_TYPE_WRITE)
- vdev_cache_write(zio);
-
- if (zio_injection_enabled && zio->io_error == 0)
- zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
-
- zio_next_stage(zio);
-}
-
-vdev_ops_t vdev_disk_ops = {
- vdev_disk_open,
- vdev_disk_close,
- vdev_default_asize,
- vdev_disk_io_start,
- vdev_disk_io_done,
- NULL,
- VDEV_TYPE_DISK, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
deleted file mode 100644
index b8e79f8..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_file.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/fs/zfs.h>
-
-/*
- * Virtual device vector for files.
- */
-
-static int
-vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
-{
- vdev_file_t *vf;
- vnode_t *vp;
- vattr_t vattr;
- int error;
-
- /*
- * We must have a pathname, and it must be absolute.
- */
- if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (EINVAL);
- }
-
- vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
-
- /*
- * We always open the files from the root of the global zone, even if
- * we're in a local zone. If the user has gotten to this point, the
- * administrator has already decided that the pool should be available
- * to local zone users, so the underlying devices should be as well.
- */
- ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
- error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode | FOFFMAX,
- 0, &vp, 0, 0, rootdir);
-
- if (error) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- return (error);
- }
-
- vf->vf_vnode = vp;
-
-#ifdef _KERNEL
- /*
- * Make sure it's a regular file.
- */
- if (vp->v_type != VREG) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- return (ENODEV);
- }
-#endif
-
- /*
- * Determine the physical size of the file.
- */
- vattr.va_mask = AT_SIZE;
- error = VOP_GETATTR(vp, &vattr, 0, kcred);
- if (error) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- return (error);
- }
-
- *psize = vattr.va_size;
- *ashift = SPA_MINBLOCKSHIFT;
-
- return (0);
-}
-
-static void
-vdev_file_close(vdev_t *vd)
-{
- vdev_file_t *vf = vd->vdev_tsd;
-
- if (vf == NULL)
- return;
-
- if (vf->vf_vnode != NULL) {
- (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred);
- (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred);
- VN_RELE(vf->vf_vnode);
- }
-
- kmem_free(vf, sizeof (vdev_file_t));
- vd->vdev_tsd = NULL;
-}
-
-static void
-vdev_file_io_start(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_file_t *vf = vd->vdev_tsd;
- ssize_t resid;
- int error;
-
- if (zio->io_type == ZIO_TYPE_IOCTL) {
- zio_vdev_io_bypass(zio);
-
- /* XXPOLICY */
- if (vdev_is_dead(vd)) {
- zio->io_error = ENXIO;
- zio_next_stage_async(zio);
- return;
- }
-
- switch (zio->io_cmd) {
- case DKIOCFLUSHWRITECACHE:
- zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
- kcred);
- dprintf("fsync(%s) = %d\n", vdev_description(vd),
- zio->io_error);
- break;
- default:
- zio->io_error = ENOTSUP;
- }
-
- zio_next_stage_async(zio);
- return;
- }
-
- /*
- * In the kernel, don't bother double-caching, but in userland,
- * we want to test the vdev_cache code.
- */
-#ifndef _KERNEL
- if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
- return;
-#endif
-
- if ((zio = vdev_queue_io(zio)) == NULL)
- return;
-
- /* XXPOLICY */
- error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
- if (error) {
- zio->io_error = error;
- zio_next_stage_async(zio);
- return;
- }
-
- zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
- UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
- zio->io_size, zio->io_offset, UIO_SYSSPACE,
- 0, RLIM64_INFINITY, kcred, &resid);
-
- if (resid != 0 && zio->io_error == 0)
- zio->io_error = ENOSPC;
-
- zio_next_stage_async(zio);
-}
-
-static void
-vdev_file_io_done(zio_t *zio)
-{
- vdev_queue_io_done(zio);
-
-#ifndef _KERNEL
- if (zio->io_type == ZIO_TYPE_WRITE)
- vdev_cache_write(zio);
-#endif
-
- if (zio_injection_enabled && zio->io_error == 0)
- zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
-
- zio_next_stage(zio);
-}
-
-vdev_ops_t vdev_file_ops = {
- vdev_file_open,
- vdev_file_close,
- vdev_default_asize,
- vdev_file_io_start,
- vdev_file_io_done,
- NULL,
- VDEV_TYPE_FILE, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
-
-/*
- * From userland we access disks just like files.
- */
-#ifndef _KERNEL
-
-vdev_ops_t vdev_disk_ops = {
- vdev_file_open,
- vdev_file_close,
- vdev_default_asize,
- vdev_file_io_start,
- vdev_file_io_done,
- NULL,
- VDEV_TYPE_DISK, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
-
-#endif
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
deleted file mode 100644
index eebc911..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
+++ /dev/null
@@ -1,583 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/bio.h>
-#include <sys/disk.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio.h>
-#include <geom/geom.h>
-#include <geom/geom_int.h>
-
-/*
- * Virtual device vector for GEOM.
- */
-
-struct g_class zfs_vdev_class = {
- .name = "ZFS::VDEV",
- .version = G_VERSION,
-};
-
-DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
-
-typedef struct vdev_geom_ctx {
- struct g_consumer *gc_consumer;
- int gc_state;
- struct bio_queue_head gc_queue;
- struct mtx gc_queue_mtx;
-} vdev_geom_ctx_t;
-
-static void
-vdev_geom_release(vdev_t *vd)
-{
- vdev_geom_ctx_t *ctx;
-
- ctx = vd->vdev_tsd;
- vd->vdev_tsd = NULL;
-
- mtx_lock(&ctx->gc_queue_mtx);
- ctx->gc_state = 1;
- wakeup_one(&ctx->gc_queue);
- while (ctx->gc_state != 2)
- msleep(&ctx->gc_state, &ctx->gc_queue_mtx, 0, "vgeom:w", 0);
- mtx_unlock(&ctx->gc_queue_mtx);
- mtx_destroy(&ctx->gc_queue_mtx);
- kmem_free(ctx, sizeof(*ctx));
-}
-
-static void
-vdev_geom_orphan(struct g_consumer *cp)
-{
- struct g_geom *gp;
- vdev_t *vd;
- int error;
-
- g_topology_assert();
-
- vd = cp->private;
- gp = cp->geom;
- error = cp->provider->error;
-
- ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
- if (cp->acr + cp->acw + cp->ace > 0)
- g_access(cp, -cp->acr, -cp->acw, -cp->ace);
- ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
- g_detach(cp);
- g_destroy_consumer(cp);
- /* Destroy geom if there are no consumers left. */
- if (LIST_EMPTY(&gp->consumer)) {
- ZFS_LOG(1, "Destroyed geom %s.", gp->name);
- g_wither_geom(gp, error);
- }
- vdev_geom_release(vd);
- /* Both methods below work, but in a bit different way. */
-#if 0
- vd->vdev_reopen_wanted = 1;
-#else
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux);
-#endif
-}
-
-static struct g_consumer *
-vdev_geom_attach(struct g_provider *pp, int write)
-{
- struct g_geom *gp;
- struct g_consumer *cp;
-
- g_topology_assert();
-
- ZFS_LOG(1, "Attaching to %s.", pp->name);
- /* Do we have geom already? No? Create one. */
- LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
- if (gp->flags & G_GEOM_WITHER)
- continue;
- if (strcmp(gp->name, "zfs::vdev") != 0)
- continue;
- break;
- }
- if (gp == NULL) {
- gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
- gp->orphan = vdev_geom_orphan;
- cp = g_new_consumer(gp);
- if (g_attach(cp, pp) != 0) {
- g_wither_geom(gp, ENXIO);
- return (NULL);
- }
- if (g_access(cp, 1, write, 1) != 0) {
- g_wither_geom(gp, ENXIO);
- return (NULL);
- }
- ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
- } else {
- /* Check if we are already connected to this provider. */
- LIST_FOREACH(cp, &gp->consumer, consumer) {
- if (cp->provider == pp) {
- ZFS_LOG(1, "Found consumer for %s.", pp->name);
- break;
- }
- }
- if (cp == NULL) {
- cp = g_new_consumer(gp);
- if (g_attach(cp, pp) != 0) {
- g_destroy_consumer(cp);
- return (NULL);
- }
- if (g_access(cp, 1, write, 1) != 0) {
- g_detach(cp);
- g_destroy_consumer(cp);
- return (NULL);
- }
- ZFS_LOG(1, "Created consumer for %s.", pp->name);
- } else {
- if (g_access(cp, 1, cp->acw > 0 ? 0 : write, 1) != 0)
- return (NULL);
- ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
- }
- }
- return (cp);
-}
-
-static void
-vdev_geom_detach(void *arg, int flag __unused)
-{
- struct g_geom *gp;
- struct g_consumer *cp;
-
- g_topology_assert();
- cp = arg;
- gp = cp->geom;
-
- ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
- g_access(cp, -1, 0, -1);
- /* Destroy consumer on last close. */
- if (cp->acr == 0 && cp->ace == 0) {
- ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
- if (cp->acw > 0)
- g_access(cp, 0, -cp->acw, 0);
- g_detach(cp);
- g_destroy_consumer(cp);
- }
- /* Destroy geom if there are no consumers left. */
- if (LIST_EMPTY(&gp->consumer)) {
- ZFS_LOG(1, "Destroyed geom %s.", gp->name);
- g_wither_geom(gp, ENXIO);
- }
-}
-
-static void
-vdev_geom_worker(void *arg)
-{
- vdev_geom_ctx_t *ctx;
- zio_t *zio;
- struct bio *bp;
-
- ctx = arg;
- for (;;) {
- mtx_lock(&ctx->gc_queue_mtx);
- bp = bioq_takefirst(&ctx->gc_queue);
- if (bp == NULL) {
- if (ctx->gc_state == 1) {
- ctx->gc_state = 2;
- wakeup_one(&ctx->gc_state);
- mtx_unlock(&ctx->gc_queue_mtx);
- kproc_exit(0);
- }
- msleep(&ctx->gc_queue, &ctx->gc_queue_mtx,
- PRIBIO | PDROP, "vgeom:io", 0);
- continue;
- }
- mtx_unlock(&ctx->gc_queue_mtx);
- zio = bp->bio_caller1;
- zio->io_error = bp->bio_error;
- if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == ENOTSUP) {
- vdev_t *vd;
-
- /*
- * If we get ENOTSUP, we know that no future
- * attempts will ever succeed. In this case we
- * set a persistent bit so that we don't bother
-			 * with the flush in the future.
- */
- vd = zio->io_vd;
- vd->vdev_nowritecache = B_TRUE;
- }
- g_destroy_bio(bp);
- zio_next_stage_async(zio);
- }
-}
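
The handoff between vdev_geom_io_intr() below and this worker is a textbook producer/consumer queue: the GEOM done callback may run in a context where it cannot safely complete the zio, so it only enqueues the finished bio and wakes the worker, which calls zio_next_stage_async() from thread context. Here is a portable sketch of the same pattern using pthreads in place of FreeBSD's mtx/msleep/wakeup_one; every name in it is invented for illustration, and a simple LIFO list stands in for the FIFO bioq.

```c
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

static struct node *head;
static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int draining;

static void
enqueue(int id)				/* analog of vdev_geom_io_intr() */
{
	struct node *n = malloc(sizeof (*n));

	n->id = id;
	pthread_mutex_lock(&mtx);
	n->next = head;
	head = n;
	pthread_cond_signal(&cv);	/* analog of wakeup_one() */
	pthread_mutex_unlock(&mtx);
}

static void *
worker(void *arg)			/* analog of vdev_geom_worker() */
{
	(void)arg;
	for (;;) {
		pthread_mutex_lock(&mtx);
		while (head == NULL && !draining)
			pthread_cond_wait(&cv, &mtx);	/* analog of msleep() */
		if (head == NULL) {
			pthread_mutex_unlock(&mtx);
			return (NULL);	/* analog of kproc_exit() */
		}
		struct node *n = head;
		head = n->next;
		pthread_mutex_unlock(&mtx);
		printf("completing bio %d\n", n->id);	/* zio_next_stage_async() */
		free(n);
	}
}

int
main(void)
{
	pthread_t t;
	int i;

	pthread_create(&t, NULL, worker, NULL);
	for (i = 0; i < 3; i++)
		enqueue(i);
	pthread_mutex_lock(&mtx);
	draining = 1;			/* analog of gc_state = 1 */
	pthread_cond_broadcast(&cv);
	pthread_mutex_unlock(&mtx);
	pthread_join(t, NULL);
	return (0);
}
```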
-
-static char *
-vdev_geom_get_id(struct g_consumer *cp)
-{
- char *id;
- int len;
-
- g_topology_assert_not();
- len = DISK_IDENT_SIZE;
- id = kmem_zalloc(len, KM_SLEEP);
- if (g_io_getattr("GEOM::ident", cp, &len, id) != 0) {
- kmem_free(id, DISK_IDENT_SIZE);
- return (NULL);
- }
- return (id);
-}
-
-static void
-vdev_geom_free_id(char *id)
-{
-
- if (id != NULL)
- kmem_free(id, DISK_IDENT_SIZE);
-}
-
-struct vdev_geom_find {
- const char *id;
- int write;
- struct g_consumer *cp;
-};
-
-static void
-vdev_geom_taste_orphan(struct g_consumer *cp)
-{
-
- KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
- cp->provider->name));
-}
-
-static void
-vdev_geom_attach_by_id_event(void *arg, int flags __unused)
-{
- struct vdev_geom_find *ap;
- struct g_class *mp;
- struct g_geom *gp, *zgp;
- struct g_provider *pp;
- struct g_consumer *zcp;
- char *id;
-
- g_topology_assert();
-
- ap = arg;
-
- zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
-	/* This orphan function should never be called. */
- zgp->orphan = vdev_geom_taste_orphan;
- zcp = g_new_consumer(zgp);
-
- LIST_FOREACH(mp, &g_classes, class) {
- if (mp == &zfs_vdev_class)
- continue;
- LIST_FOREACH(gp, &mp->geom, geom) {
- if (gp->flags & G_GEOM_WITHER)
- continue;
- LIST_FOREACH(pp, &gp->provider, provider) {
- if (pp->flags & G_PF_WITHER)
- continue;
- g_attach(zcp, pp);
- if (g_access(zcp, 1, 0, 0) != 0) {
- g_detach(zcp);
- continue;
- }
- g_topology_unlock();
- id = vdev_geom_get_id(zcp);
- g_topology_lock();
- g_access(zcp, -1, 0, 0);
- g_detach(zcp);
- if (id == NULL || strcmp(id, ap->id) != 0) {
- vdev_geom_free_id(id);
- continue;
- }
- vdev_geom_free_id(id);
- ap->cp = vdev_geom_attach(pp, ap->write);
- if (ap->cp == NULL) {
- printf("ZFS WARNING: Cannot open %s "
- "for writting.\n", pp->name);
- continue;
- }
- goto end;
- }
- }
- }
- ap->cp = NULL;
-end:
- g_destroy_consumer(zcp);
- g_destroy_geom(zgp);
-}
-
-static struct g_consumer *
-vdev_geom_attach_by_id(const char *id, int write)
-{
- struct vdev_geom_find *ap;
- struct g_consumer *cp;
-
- ap = kmem_zalloc(sizeof(*ap), KM_SLEEP);
- ap->id = id;
- ap->write = write;
- g_waitfor_event(vdev_geom_attach_by_id_event, ap, M_WAITOK, NULL);
- cp = ap->cp;
- kmem_free(ap, sizeof(*ap));
- return (cp);
-}
-
-static int
-vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
-{
- vdev_geom_ctx_t *ctx;
- struct g_provider *pp;
- struct g_consumer *cp;
- char *id = NULL;
- int owned;
-
- /*
- * We must have a pathname, and it must be absolute.
- */
- if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (EINVAL);
- }
-
- if ((owned = mtx_owned(&Giant)))
- mtx_unlock(&Giant);
- cp = NULL;
- g_topology_lock();
- pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
- if (pp != NULL) {
- ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
- cp = vdev_geom_attach(pp, !!(spa_mode & FWRITE));
- if (cp != NULL && vd->vdev_devid != NULL) {
- g_topology_unlock();
- id = vdev_geom_get_id(cp);
- g_topology_lock();
- if (id == NULL || strcmp(id, vd->vdev_devid) != 0) {
- vdev_geom_detach(cp, 0);
- cp = NULL;
- ZFS_LOG(1, "ID mismatch for provider %s: "
- "[%s]!=[%s].", vd->vdev_path,
- vd->vdev_devid, id);
- goto next;
- }
- ZFS_LOG(1, "ID match for provider %s.", vd->vdev_path);
- }
- }
-next:
- g_topology_unlock();
- vdev_geom_free_id(id);
- if (cp == NULL && vd->vdev_devid != NULL) {
- ZFS_LOG(1, "Searching by ID [%s].", vd->vdev_devid);
- cp = vdev_geom_attach_by_id(vd->vdev_devid,
- !!(spa_mode & FWRITE));
- if (cp != NULL) {
- size_t len = strlen(cp->provider->name) + 6; /* 6 == strlen("/dev/") + 1 */
- char *buf = kmem_alloc(len, KM_SLEEP);
-
- snprintf(buf, len, "/dev/%s", cp->provider->name);
- spa_strfree(vd->vdev_path);
- vd->vdev_path = buf;
-
- ZFS_LOG(1, "Attach by ID [%s] succeeded, provider %s.",
- vd->vdev_devid, vd->vdev_path);
- }
- }
- if (owned)
- mtx_lock(&Giant);
- if (cp == NULL) {
- ZFS_LOG(1, "Provider %s (id=[%s]) not found.", vd->vdev_path,
- vd->vdev_devid);
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- return (EACCES);
- }
- pp = cp->provider;
-
- /*
- * Determine the actual size of the device.
- */
- *psize = pp->mediasize;
-
- /*
- * Determine the device's minimum transfer size.
- */
- *ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
-
- /*
- * Clear the nowritecache bit, so that on a vdev_reopen() we will
- * try again.
- */
- vd->vdev_nowritecache = B_FALSE;
-
- cp->private = vd;
-
- ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP);
- bioq_init(&ctx->gc_queue);
- mtx_init(&ctx->gc_queue_mtx, "zfs:vdev:geom:queue", NULL, MTX_DEF);
- ctx->gc_consumer = cp;
- ctx->gc_state = 0;
-
- vd->vdev_tsd = ctx;
-
- kproc_create(vdev_geom_worker, ctx, NULL, 0, 0, "vdev:worker %s",
- pp->name);
-
- return (0);
-}
-
-static void
-vdev_geom_close(vdev_t *vd)
-{
- vdev_geom_ctx_t *ctx;
- struct g_consumer *cp;
-
- if ((ctx = vd->vdev_tsd) == NULL)
- return;
- if ((cp = ctx->gc_consumer) == NULL)
- return;
- vdev_geom_release(vd);
- g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL);
-}
-
-static void
-vdev_geom_io_intr(struct bio *bp)
-{
- vdev_geom_ctx_t *ctx;
- zio_t *zio;
-
- zio = bp->bio_caller1;
- ctx = zio->io_vd->vdev_tsd;
-
- mtx_lock(&ctx->gc_queue_mtx);
- bioq_insert_tail(&ctx->gc_queue, bp);
- wakeup_one(&ctx->gc_queue);
- mtx_unlock(&ctx->gc_queue_mtx);
-}
-
-static void
-vdev_geom_io_start(zio_t *zio)
-{
- vdev_t *vd;
- vdev_geom_ctx_t *ctx;
- struct g_consumer *cp;
- struct bio *bp;
- int error;
-
- cp = NULL;
-
- vd = zio->io_vd;
- ctx = vd->vdev_tsd;
- if (ctx != NULL)
- cp = ctx->gc_consumer;
-
- if (zio->io_type == ZIO_TYPE_IOCTL) {
- zio_vdev_io_bypass(zio);
-
- /* XXPOLICY */
- if (vdev_is_dead(vd)) {
- zio->io_error = ENXIO;
- zio_next_stage_async(zio);
- return;
- }
-
- switch (zio->io_cmd) {
-
- case DKIOCFLUSHWRITECACHE:
- if (vd->vdev_nowritecache) {
- zio->io_error = ENOTSUP;
- break;
- }
-
- goto sendreq;
- default:
- zio->io_error = ENOTSUP;
- }
-
- zio_next_stage_async(zio);
- return;
- }
-
- if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
- return;
-
- if ((zio = vdev_queue_io(zio)) == NULL)
- return;
-
-sendreq:
-
- error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
- if (error == 0 && cp == NULL)
- error = ENXIO;
- if (error) {
- zio->io_error = error;
- zio_next_stage_async(zio);
- return;
- }
-
- bp = g_alloc_bio();
- bp->bio_caller1 = zio;
- switch (zio->io_type) {
- case ZIO_TYPE_READ:
- case ZIO_TYPE_WRITE:
- bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
- bp->bio_data = zio->io_data;
- bp->bio_offset = zio->io_offset;
- bp->bio_length = zio->io_size;
- break;
- case ZIO_TYPE_IOCTL:
- bp->bio_cmd = BIO_FLUSH;
- bp->bio_data = NULL;
- bp->bio_offset = cp->provider->mediasize;
- bp->bio_length = 0;
- break;
- }
- bp->bio_done = vdev_geom_io_intr;
-
- g_io_request(bp, cp);
-}
-
-static void
-vdev_geom_io_done(zio_t *zio)
-{
- vdev_queue_io_done(zio);
-
- if (zio->io_type == ZIO_TYPE_WRITE)
- vdev_cache_write(zio);
-
- if (zio_injection_enabled && zio->io_error == 0)
- zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
-
- zio_next_stage(zio);
-}
-
-vdev_ops_t vdev_geom_ops = {
- vdev_geom_open,
- vdev_geom_close,
- vdev_default_asize,
- vdev_geom_io_start,
- vdev_geom_io_done,
- NULL,
- VDEV_TYPE_DISK, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
deleted file mode 100644
index 9d9f555..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
+++ /dev/null
@@ -1,1011 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * Virtual Device Labels
- * ---------------------
- *
- * The vdev label serves several distinct purposes:
- *
- * 1. Uniquely identify this device as part of a ZFS pool and confirm its
- * identity within the pool.
- *
- * 2. Verify that all the devices given in a configuration are present
- * within the pool.
- *
- * 3. Determine the uberblock for the pool.
- *
- * 4. In case of an import operation, determine the configuration of the
- * toplevel vdev of which it is a part.
- *
- * 5. If an import operation cannot find all the devices in the pool,
- * provide enough information to the administrator to determine which
- * devices are missing.
- *
- * It is important to note that while the kernel is responsible for writing the
- * label, it only consumes the information in the first three cases. The
- * latter information is only consumed in userland when determining the
- * configuration to import a pool.
- *
- *
- * Label Organization
- * ------------------
- *
- * Before describing the contents of the label, it's important to understand how
- * the labels are written and updated with respect to the uberblock.
- *
- * When the pool configuration is altered, either because it was newly created
- * or a device was added, we want to update all the labels such that we can deal
- * with fatal failure at any point. To this end, each disk has two labels which
- * are updated before and after the uberblock is synced. Assuming we have
- * labels and an uberblock with the following transaction groups:
- *
- * L1 UB L2
- * +------+ +------+ +------+
- * | | | | | |
- * | t10 | | t10 | | t10 |
- * | | | | | |
- * +------+ +------+ +------+
- *
- * In this stable state, the labels and the uberblock were all updated within
- * the same transaction group (10). Each label is mirrored and checksummed, so
- * that we can detect when we fail partway through writing the label.
- *
- * In order to identify which labels are valid, the labels are written in the
- * following manner:
- *
- * 1. For each vdev, update 'L1' to the new label
- * 2. Update the uberblock
- * 3. For each vdev, update 'L2' to the new label
- *
- * Given arbitrary failure, we can determine the correct label to use based on
- * the transaction group. If we fail after updating L1 but before updating the
- * UB, we will notice that L1's transaction group is greater than the uberblock,
- * so L2 must be valid. If we fail after writing the uberblock but before
- * writing L2, we will notice that L2's transaction group is less than L1, and
- * therefore L1 is valid.
- *
- * An added complexity is that not every label is updated when the config
- * is synced. If we add a single device, we do not want to have to re-write
- * every label for every device in the pool. This means that both L1 and L2 may
- * be older than the pool uberblock, because the necessary information is stored
- * on another vdev.
- *
- *
- * On-disk Format
- * --------------
- *
- * The vdev label consists of two distinct parts, and is wrapped within the
- * vdev_label_t structure. The label includes 8k of padding to permit legacy
- * VTOC disk labels; this padding is otherwise ignored.
- *
- * The first half of the label is a packed nvlist which contains pool wide
- * properties, per-vdev properties, and configuration information. It is
- * described in more detail below.
- *
- * The latter half of the label consists of a redundant array of uberblocks.
- * These uberblocks are updated whenever a transaction group is committed,
- * or when the configuration is updated. When a pool is loaded, we scan each
- * vdev for the 'best' uberblock.
- *
- *
- * Configuration Information
- * -------------------------
- *
- * The nvlist describing the pool and vdev contains the following elements:
- *
- * version ZFS on-disk version
- * name Pool name
- * state Pool state
- * txg Transaction group in which this label was written
- * pool_guid Unique identifier for this pool
- *	vdev_tree	An nvlist describing the vdev tree.
- *
- * Each leaf device label also contains the following:
- *
- * top_guid Unique ID for top-level vdev in which this is contained
- * guid Unique ID for the leaf vdev
- *
- * The 'vs' configuration follows the format described in 'spa_config.c'.
- */
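
The crash-recovery rule from the 'Label Organization' section above (write L1, then the uberblock, then L2, and let the transaction groups arbitrate) fits in a few lines of C. The following is an illustrative sketch, not ZFS code, exercising the three possible outcomes:

```c
#include <stdio.h>
#include <stdint.h>

static const char *
valid_label(uint64_t l1_txg, uint64_t ub_txg, uint64_t l2_txg)
{
	if (l1_txg > ub_txg)
		return ("L2");	/* died after writing L1, before the uberblock */
	if (l2_txg < l1_txg)
		return ("L1");	/* died after the uberblock, before writing L2 */
	return ("both");	/* stable: L1, uberblock and L2 all agree */
}

int
main(void)
{
	printf("%s\n", valid_label(11, 10, 10));	/* L2 */
	printf("%s\n", valid_label(11, 11, 10));	/* L1 */
	printf("%s\n", valid_label(10, 10, 10));	/* both */
	return (0);
}
```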
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/dmu.h>
-#include <sys/zap.h>
-#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
-#include <sys/uberblock_impl.h>
-#include <sys/metaslab.h>
-#include <sys/zio.h>
-#include <sys/fs/zfs.h>
-
-/*
- * Basic routines to read and write from a vdev label.
- * Used throughout the rest of this file.
- */
-uint64_t
-vdev_label_offset(uint64_t psize, int l, uint64_t offset)
-{
- ASSERT(offset < sizeof (vdev_label_t));
-
- return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
- 0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
-}
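
As a worked example of the arithmetic above: VDEV_LABELS is 4, and assuming the usual 256K sizeof (vdev_label_t), labels L0 and L1 sit at the front of the device and L2 and L3 at the back. This standalone sketch restates the formula with those values as local constants and prints the four base offsets for a hypothetical 1GB vdev:

```c
#include <stdio.h>
#include <stdint.h>

#define LABEL_SIZE	(256ULL * 1024)	/* assumed sizeof (vdev_label_t) */
#define VDEV_LABELS	4

static uint64_t
label_offset(uint64_t psize, int l, uint64_t offset)
{
	return (offset + l * LABEL_SIZE + (l < VDEV_LABELS / 2 ?
	    0 : psize - VDEV_LABELS * LABEL_SIZE));
}

int
main(void)
{
	uint64_t psize = 1ULL << 30;	/* hypothetical 1 GB device */
	int l;

	/* Prints 0, 256K, psize - 512K, psize - 256K. */
	for (l = 0; l < VDEV_LABELS; l++)
		printf("L%d at %llu\n", l,
		    (unsigned long long)label_offset(psize, l, 0));
	return (0);
}
```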
-
-static void
-vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
- uint64_t size, zio_done_func_t *done, void *private)
-{
- ASSERT(vd->vdev_children == 0);
-
- zio_nowait(zio_read_phys(zio, vd,
- vdev_label_offset(vd->vdev_psize, l, offset),
- size, buf, ZIO_CHECKSUM_LABEL, done, private,
- ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
-}
-
-static void
-vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
- uint64_t size, zio_done_func_t *done, void *private)
-{
- ASSERT(vd->vdev_children == 0);
-
- zio_nowait(zio_write_phys(zio, vd,
- vdev_label_offset(vd->vdev_psize, l, offset),
- size, buf, ZIO_CHECKSUM_LABEL, done, private,
- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL));
-}
-
-/*
- * Generate the nvlist representing this vdev's config.
- */
-nvlist_t *
-vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
- boolean_t isspare)
-{
- nvlist_t *nv = NULL;
-
- VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
- vd->vdev_ops->vdev_op_type) == 0);
- if (!isspare)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
- == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
-
- if (vd->vdev_path != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
- vd->vdev_path) == 0);
-
- if (vd->vdev_devid != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID,
- vd->vdev_devid) == 0);
-
- if (vd->vdev_nparity != 0) {
- ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
- VDEV_TYPE_RAIDZ) == 0);
-
- /*
- * Make sure someone hasn't managed to sneak a fancy new vdev
- * into a crufty old storage pool.
- */
- ASSERT(vd->vdev_nparity == 1 ||
- (vd->vdev_nparity == 2 &&
- spa_version(spa) >= ZFS_VERSION_RAID6));
-
- /*
- * Note that we'll add the nparity tag even on storage pools
- * that only support a single parity device -- older software
- * will just ignore it.
- */
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY,
- vd->vdev_nparity) == 0);
- }
-
- if (vd->vdev_wholedisk != -1ULL)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
- vd->vdev_wholedisk) == 0);
-
- if (vd->vdev_not_present)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0);
-
- if (vd->vdev_isspare)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
-
- if (!isspare && vd == vd->vdev_top) {
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
- vd->vdev_ms_array) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
- vd->vdev_ms_shift) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT,
- vd->vdev_ashift) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
- vd->vdev_asize) == 0);
- }
-
- if (vd->vdev_dtl.smo_object != 0)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
- vd->vdev_dtl.smo_object) == 0);
-
- if (getstats) {
- vdev_stat_t vs;
- vdev_get_stats(vd, &vs);
- VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
- (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
- }
-
- if (!vd->vdev_ops->vdev_op_leaf) {
- nvlist_t **child;
- int c;
-
- child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
- KM_SLEEP);
-
- for (c = 0; c < vd->vdev_children; c++)
- child[c] = vdev_config_generate(spa, vd->vdev_child[c],
- getstats, isspare);
-
- VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- child, vd->vdev_children) == 0);
-
- for (c = 0; c < vd->vdev_children; c++)
- nvlist_free(child[c]);
-
- kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
-
- } else {
- if (vd->vdev_offline && !vd->vdev_tmpoffline)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
- B_TRUE) == 0);
- else
- (void) nvlist_remove(nv, ZPOOL_CONFIG_OFFLINE,
- DATA_TYPE_UINT64);
- }
-
- return (nv);
-}
-
-nvlist_t *
-vdev_label_read_config(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- nvlist_t *config = NULL;
- vdev_phys_t *vp;
- zio_t *zio;
- int l;
-
- ASSERT(spa_config_held(spa, RW_READER));
-
- if (vdev_is_dead(vd))
- return (NULL);
-
- vp = zio_buf_alloc(sizeof (vdev_phys_t));
-
- for (l = 0; l < VDEV_LABELS; l++) {
-
- zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL |
- ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CONFIG_HELD);
-
- vdev_label_read(zio, vd, l, vp,
- offsetof(vdev_label_t, vl_vdev_phys),
- sizeof (vdev_phys_t), NULL, NULL);
-
- if (zio_wait(zio) == 0 &&
- nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
- &config, 0) == 0)
- break;
-
- if (config != NULL) {
- nvlist_free(config);
- config = NULL;
- }
- }
-
- zio_buf_free(vp, sizeof (vdev_phys_t));
-
- return (config);
-}
-
-/*
- * Determine if a device is in use. The 'spare_guid' parameter will be filled
- * in with the device guid if this spare is active elsewhere on the system.
- */
-static boolean_t
-vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
- uint64_t *spare_guid)
-{
- spa_t *spa = vd->vdev_spa;
- uint64_t state, pool_guid, device_guid, txg, spare_pool;
- uint64_t vdtxg = 0;
- nvlist_t *label;
-
- if (spare_guid)
- *spare_guid = 0ULL;
-
- /*
- * Read the label, if any, and perform some basic sanity checks.
- */
- if ((label = vdev_label_read_config(vd)) == NULL)
- return (B_FALSE);
-
- (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
- &vdtxg);
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
- &state) != 0 ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
- &device_guid) != 0) {
- nvlist_free(label);
- return (B_FALSE);
- }
-
- if (state != POOL_STATE_SPARE &&
- (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
- &pool_guid) != 0 ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
- &txg) != 0)) {
- nvlist_free(label);
- return (B_FALSE);
- }
-
- nvlist_free(label);
-
- /*
- * Check to see if this device indeed belongs to the pool it claims to
- * be a part of. The only way this is allowed is if the device is a hot
- * spare (which we check for later on).
- */
- if (state != POOL_STATE_SPARE &&
- !spa_guid_exists(pool_guid, device_guid) &&
- !spa_spare_exists(device_guid, NULL))
- return (B_FALSE);
-
- /*
-	 * If the transaction group is zero, then this is an initialized (but
- * unused) label. This is only an error if the create transaction
- * on-disk is the same as the one we're using now, in which case the
- * user has attempted to add the same vdev multiple times in the same
- * transaction.
- */
- if (state != POOL_STATE_SPARE && txg == 0 && vdtxg == crtxg)
- return (B_TRUE);
-
- /*
- * Check to see if this is a spare device. We do an explicit check for
- * spa_has_spare() here because it may be on our pending list of spares
- * to add.
- */
- if (spa_spare_exists(device_guid, &spare_pool) ||
- spa_has_spare(spa, device_guid)) {
- if (spare_guid)
- *spare_guid = device_guid;
-
- switch (reason) {
- case VDEV_LABEL_CREATE:
- return (B_TRUE);
-
- case VDEV_LABEL_REPLACE:
- return (!spa_has_spare(spa, device_guid) ||
- spare_pool != 0ULL);
-
- case VDEV_LABEL_SPARE:
- return (spa_has_spare(spa, device_guid));
- }
- }
-
- /*
- * If the device is marked ACTIVE, then this device is in use by another
- * pool on the system.
- */
- return (state == POOL_STATE_ACTIVE);
-}
-
-/*
- * Initialize a vdev label. We check to make sure each leaf device is not in
- * use and is writable. We put down an initial label which we will later
- * overwrite with a complete label. Note that it's important to do this
- * sequentially, not in parallel, so that we catch cases of multiple use of the
- * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
- * itself.
- */
-int
-vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
-{
- spa_t *spa = vd->vdev_spa;
- nvlist_t *label;
- vdev_phys_t *vp;
- vdev_boot_header_t *vb;
- uberblock_t *ub;
- zio_t *zio;
- int l, c, n;
- char *buf;
- size_t buflen;
- int error;
- uint64_t spare_guid;
-
- ASSERT(spa_config_held(spa, RW_WRITER));
-
- for (c = 0; c < vd->vdev_children; c++)
- if ((error = vdev_label_init(vd->vdev_child[c],
- crtxg, reason)) != 0)
- return (error);
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (0);
-
- /*
- * Dead vdevs cannot be initialized.
- */
- if (vdev_is_dead(vd))
- return (EIO);
-
- /*
- * Determine if the vdev is in use.
- */
- if (reason != VDEV_LABEL_REMOVE &&
- vdev_inuse(vd, crtxg, reason, &spare_guid))
- return (EBUSY);
-
- ASSERT(reason != VDEV_LABEL_REMOVE ||
- vdev_inuse(vd, crtxg, reason, NULL));
-
- /*
- * If this is a request to add or replace a spare that is in use
- * elsewhere on the system, then we must update the guid (which was
- * initialized to a random value) to reflect the actual GUID (which is
- * shared between multiple pools).
- */
- if (reason != VDEV_LABEL_REMOVE && spare_guid != 0ULL) {
- vdev_t *pvd = vd->vdev_parent;
-
- for (; pvd != NULL; pvd = pvd->vdev_parent) {
- pvd->vdev_guid_sum -= vd->vdev_guid;
- pvd->vdev_guid_sum += spare_guid;
- }
-
- vd->vdev_guid = vd->vdev_guid_sum = spare_guid;
-
- /*
-		 * If this is a replacement, then we want to fall through to the
- * rest of the code. If we're adding a spare, then it's already
- * labelled appropriately and we can just return.
- */
- if (reason == VDEV_LABEL_SPARE)
- return (0);
- ASSERT(reason == VDEV_LABEL_REPLACE);
- }
-
- /*
- * Initialize its label.
- */
- vp = zio_buf_alloc(sizeof (vdev_phys_t));
- bzero(vp, sizeof (vdev_phys_t));
-
- /*
- * Generate a label describing the pool and our top-level vdev.
- * We mark it as being from txg 0 to indicate that it's not
- * really part of an active pool just yet. The labels will
- * be written again with a meaningful txg by spa_sync().
- */
- if (reason == VDEV_LABEL_SPARE ||
- (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) {
- /*
- * For inactive hot spares, we generate a special label that
-		 * identifies it as a mutually shared hot spare. We write the
- * label if we are adding a hot spare, or if we are removing an
- * active hot spare (in which case we want to revert the
- * labels).
- */
- VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
- spa_version(spa)) == 0);
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
- POOL_STATE_SPARE) == 0);
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
- vd->vdev_guid) == 0);
- } else {
- label = spa_config_generate(spa, vd, 0ULL, B_FALSE);
-
- /*
- * Add our creation time. This allows us to detect multiple
- * vdev uses as described above, and automatically expires if we
- * fail.
- */
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
- crtxg) == 0);
- }
-
- buf = vp->vp_nvlist;
- buflen = sizeof (vp->vp_nvlist);
-
- error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
- if (error != 0) {
- nvlist_free(label);
- zio_buf_free(vp, sizeof (vdev_phys_t));
- /* EFAULT means nvlist_pack ran out of room */
- return (error == EFAULT ? ENAMETOOLONG : EINVAL);
- }
-
- /*
- * Initialize boot block header.
- */
- vb = zio_buf_alloc(sizeof (vdev_boot_header_t));
- bzero(vb, sizeof (vdev_boot_header_t));
- vb->vb_magic = VDEV_BOOT_MAGIC;
- vb->vb_version = VDEV_BOOT_VERSION;
- vb->vb_offset = VDEV_BOOT_OFFSET;
- vb->vb_size = VDEV_BOOT_SIZE;
-
- /*
- * Initialize uberblock template.
- */
- ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
- bzero(ub, VDEV_UBERBLOCK_SIZE(vd));
- *ub = spa->spa_uberblock;
- ub->ub_txg = 0;
-
- /*
- * Write everything in parallel.
- */
- zio = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
-
- for (l = 0; l < VDEV_LABELS; l++) {
-
- vdev_label_write(zio, vd, l, vp,
- offsetof(vdev_label_t, vl_vdev_phys),
- sizeof (vdev_phys_t), NULL, NULL);
-
- vdev_label_write(zio, vd, l, vb,
- offsetof(vdev_label_t, vl_boot_header),
- sizeof (vdev_boot_header_t), NULL, NULL);
-
- for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
- vdev_label_write(zio, vd, l, ub,
- VDEV_UBERBLOCK_OFFSET(vd, n),
- VDEV_UBERBLOCK_SIZE(vd), NULL, NULL);
- }
- }
-
- error = zio_wait(zio);
-
- nvlist_free(label);
- zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
- zio_buf_free(vb, sizeof (vdev_boot_header_t));
- zio_buf_free(vp, sizeof (vdev_phys_t));
-
- /*
- * If this vdev hasn't been previously identified as a spare, then we
- * mark it as such only if a) we are labelling it as a spare, or b) it
- * exists as a spare elsewhere in the system.
- */
- if (error == 0 && !vd->vdev_isspare &&
- (reason == VDEV_LABEL_SPARE ||
- spa_spare_exists(vd->vdev_guid, NULL)))
- spa_spare_add(vd);
-
- return (error);
-}
-
-/*
- * ==========================================================================
- * uberblock load/sync
- * ==========================================================================
- */
-
-/*
- * Consider the following situation: txg is safely synced to disk. We've
- * written the first uberblock for txg + 1, and then we lose power. When we
- * come back up, we fail to see the uberblock for txg + 1 because, say,
- * it was on a mirrored device and the replica to which we wrote txg + 1
- * is now offline. If we then make some changes and sync txg + 1, and then
- * the missing replica comes back, then for a few seconds we'll have two
- * conflicting uberblocks on disk with the same txg. The solution is simple:
- * among uberblocks with equal txg, choose the one with the latest timestamp.
- */
-static int
-vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
-{
- if (ub1->ub_txg < ub2->ub_txg)
- return (-1);
- if (ub1->ub_txg > ub2->ub_txg)
- return (1);
-
- if (ub1->ub_timestamp < ub2->ub_timestamp)
- return (-1);
- if (ub1->ub_timestamp > ub2->ub_timestamp)
- return (1);
-
- return (0);
-}
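
For example, in the mirror scenario sketched above, two on-disk uberblocks can share a txg, and only the timestamp breaks the tie. Below is a self-contained sketch with a stripped-down struct carrying just the two fields the comparison reads:

```c
#include <assert.h>
#include <stdint.h>

struct ub { uint64_t ub_txg, ub_timestamp; };

static int
ub_compare(const struct ub *u1, const struct ub *u2)
{
	if (u1->ub_txg != u2->ub_txg)
		return (u1->ub_txg < u2->ub_txg ? -1 : 1);
	if (u1->ub_timestamp != u2->ub_timestamp)
		return (u1->ub_timestamp < u2->ub_timestamp ? -1 : 1);
	return (0);
}

int
main(void)
{
	struct ub a = { 100, 5000 }, b = { 100, 5007 };

	assert(ub_compare(&a, &b) < 0);	/* same txg: later timestamp wins */
	return (0);
}
```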
-
-static void
-vdev_uberblock_load_done(zio_t *zio)
-{
- uberblock_t *ub = zio->io_data;
- uberblock_t *ubbest = zio->io_private;
- spa_t *spa = zio->io_spa;
-
- ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd));
-
- if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
- mutex_enter(&spa->spa_uberblock_lock);
- if (vdev_uberblock_compare(ub, ubbest) > 0)
- *ubbest = *ub;
- mutex_exit(&spa->spa_uberblock_lock);
- }
-
- zio_buf_free(zio->io_data, zio->io_size);
-}
-
-void
-vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
-{
- int l, c, n;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_uberblock_load(zio, vd->vdev_child[c], ubbest);
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return;
-
- if (vdev_is_dead(vd))
- return;
-
- for (l = 0; l < VDEV_LABELS; l++) {
- for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
- vdev_label_read(zio, vd, l,
- zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)),
- VDEV_UBERBLOCK_OFFSET(vd, n),
- VDEV_UBERBLOCK_SIZE(vd),
- vdev_uberblock_load_done, ubbest);
- }
- }
-}
-
-/*
- * Write the uberblock to both labels of all leaves of the specified vdev.
- * We only get credit for writes to known-visible vdevs; see spa_vdev_add().
- */
-static void
-vdev_uberblock_sync_done(zio_t *zio)
-{
- uint64_t *good_writes = zio->io_root->io_private;
-
- if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
- atomic_add_64(good_writes, 1);
-}
-
-static void
-vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, uint64_t txg)
-{
- int l, c, n;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_uberblock_sync(zio, ub, vd->vdev_child[c], txg);
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return;
-
- if (vdev_is_dead(vd))
- return;
-
- n = txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
-
- ASSERT(ub->ub_txg == txg);
-
- for (l = 0; l < VDEV_LABELS; l++)
- vdev_label_write(zio, vd, l, ub,
- VDEV_UBERBLOCK_OFFSET(vd, n),
- VDEV_UBERBLOCK_SIZE(vd),
- vdev_uberblock_sync_done, NULL);
-
- dprintf("vdev %s in txg %llu\n", vdev_description(vd), txg);
-}
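
The slot computation n = txg & (VDEV_UBERBLOCK_COUNT(vd) - 1) above treats the uberblock array as a ring indexed by txg, so a torn write can only damage the slot of the txg in flight, never the previous good uberblock. A small sketch, assuming a count of 128 (in practice the count is a power of two whose exact value depends on the vdev's ashift):

```c
#include <stdio.h>
#include <stdint.h>

#define UB_COUNT	128	/* assumed VDEV_UBERBLOCK_COUNT; a power of two */

int
main(void)
{
	uint64_t txg;

	/* txgs 126..130 land in slots 126, 127, 0, 1, 2. */
	for (txg = 126; txg <= 130; txg++)
		printf("txg %llu -> slot %llu\n",
		    (unsigned long long)txg,
		    (unsigned long long)(txg & (UB_COUNT - 1)));
	return (0);
}
```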
-
-static int
-vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *vd, uint64_t txg)
-{
- uberblock_t *ubbuf;
- size_t size = vd->vdev_top ? VDEV_UBERBLOCK_SIZE(vd) : SPA_MAXBLOCKSIZE;
- uint64_t *good_writes;
- zio_t *zio;
- int error;
-
- ubbuf = zio_buf_alloc(size);
- bzero(ubbuf, size);
- *ubbuf = *ub;
-
- good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
-
- zio = zio_root(spa, NULL, good_writes,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
-
- vdev_uberblock_sync(zio, ubbuf, vd, txg);
-
- error = zio_wait(zio);
-
- if (error && *good_writes != 0) {
- dprintf("partial success: good_writes = %llu\n", *good_writes);
- error = 0;
- }
-
- /*
- * It's possible to have no good writes and no error if every vdev is in
- * the CANT_OPEN state.
- */
- if (*good_writes == 0 && error == 0)
- error = EIO;
-
- kmem_free(good_writes, sizeof (uint64_t));
- zio_buf_free(ubbuf, size);
-
- return (error);
-}
-
-/*
- * Sync out an individual vdev.
- */
-static void
-vdev_sync_label_done(zio_t *zio)
-{
- uint64_t *good_writes = zio->io_root->io_private;
-
- if (zio->io_error == 0)
- atomic_add_64(good_writes, 1);
-}
-
-static void
-vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg)
-{
- nvlist_t *label;
- vdev_phys_t *vp;
- char *buf;
- size_t buflen;
- int c;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_sync_label(zio, vd->vdev_child[c], l, txg);
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return;
-
- if (vdev_is_dead(vd))
- return;
-
- /*
- * Generate a label describing the top-level config to which we belong.
- */
- label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
-
- vp = zio_buf_alloc(sizeof (vdev_phys_t));
- bzero(vp, sizeof (vdev_phys_t));
-
- buf = vp->vp_nvlist;
- buflen = sizeof (vp->vp_nvlist);
-
- if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0)
- vdev_label_write(zio, vd, l, vp,
- offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t),
- vdev_sync_label_done, NULL);
-
- zio_buf_free(vp, sizeof (vdev_phys_t));
- nvlist_free(label);
-
- dprintf("%s label %d txg %llu\n", vdev_description(vd), l, txg);
-}
-
-static int
-vdev_sync_labels(vdev_t *vd, int l, uint64_t txg)
-{
- uint64_t *good_writes;
- zio_t *zio;
- int error;
-
- ASSERT(vd == vd->vdev_top);
-
- good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
-
- zio = zio_root(vd->vdev_spa, NULL, good_writes,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
-
- /*
- * Recursively kick off writes to all labels.
- */
- vdev_sync_label(zio, vd, l, txg);
-
- error = zio_wait(zio);
-
- if (error && *good_writes != 0) {
- dprintf("partial success: good_writes = %llu\n", *good_writes);
- error = 0;
- }
-
- if (*good_writes == 0 && error == 0)
- error = ENODEV;
-
- kmem_free(good_writes, sizeof (uint64_t));
-
- return (error);
-}
-
-/*
- * Sync the entire vdev configuration.
- *
- * The order of operations is carefully crafted to ensure that
- * if the system panics or loses power at any time, the state on disk
- * is still transactionally consistent. The in-line comments below
- * describe the failure semantics at each stage.
- *
- * Moreover, it is designed to be idempotent: if vdev_config_sync() fails
- * at any time, you can just call it again, and it will resume its work.
- */
-int
-vdev_config_sync(vdev_t *uvd, uint64_t txg)
-{
- spa_t *spa = uvd->vdev_spa;
- uberblock_t *ub = &spa->spa_uberblock;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd;
- zio_t *zio;
- int l, error;
-
- ASSERT(ub->ub_txg <= txg);
-
- /*
- * If this isn't a resync due to I/O errors, and nothing changed
- * in this transaction group, and the vdev configuration hasn't changed,
- * then there's nothing to do.
- */
- if (ub->ub_txg < txg && uberblock_update(ub, rvd, txg) == B_FALSE &&
- list_is_empty(&spa->spa_dirty_list)) {
- dprintf("nothing to sync in %s in txg %llu\n",
- spa_name(spa), txg);
- return (0);
- }
-
- if (txg > spa_freeze_txg(spa))
- return (0);
-
- ASSERT(txg <= spa->spa_final_txg);
-
- dprintf("syncing %s txg %llu\n", spa_name(spa), txg);
-
- /*
- * Flush the write cache of every disk that's been written to
- * in this transaction group. This ensures that all blocks
- * written in this txg will be committed to stable storage
- * before any uberblock that references them.
- */
- zio = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
- for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
- vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) {
- zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
- NULL, NULL, ZIO_PRIORITY_NOW,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
- }
- (void) zio_wait(zio);
-
- /*
- * Sync out the even labels (L0, L2) for every dirty vdev. If the
- * system dies in the middle of this process, that's OK: all of the
- * even labels that made it to disk will be newer than any uberblock,
- * and will therefore be considered invalid. The odd labels (L1, L3),
- * which have not yet been touched, will still be valid.
- */
- for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
- vd = list_next(&spa->spa_dirty_list, vd)) {
- for (l = 0; l < VDEV_LABELS; l++) {
- if (l & 1)
- continue;
- if ((error = vdev_sync_labels(vd, l, txg)) != 0)
- return (error);
- }
- }
-
- /*
- * Flush the new labels to disk. This ensures that all even-label
- * updates are committed to stable storage before the uberblock update.
- */
- zio = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
- for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
- vd = list_next(&spa->spa_dirty_list, vd)) {
- zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
- NULL, NULL, ZIO_PRIORITY_NOW,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
- }
- (void) zio_wait(zio);
-
- /*
- * Sync the uberblocks to all vdevs in the tree specified by uvd.
- * If the system dies in the middle of this step, there are two cases
- * to consider, and the on-disk state is consistent either way:
- *
- * (1) If none of the new uberblocks made it to disk, then the
- * previous uberblock will be the newest, and the odd labels
- * (which had not yet been touched) will be valid with respect
- * to that uberblock.
- *
- * (2) If one or more new uberblocks made it to disk, then they
- * will be the newest, and the even labels (which had all
- * been successfully committed) will be valid with respect
- * to the new uberblocks.
- */
- if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0)
- return (error);
-
- /*
- * Flush the uberblocks to disk. This ensures that the odd labels
- * are no longer needed (because the new uberblocks and the even
- * labels are safely on disk), so it is safe to overwrite them.
- */
- (void) zio_wait(zio_ioctl(NULL, spa, uvd, DKIOCFLUSHWRITECACHE,
- NULL, NULL, ZIO_PRIORITY_NOW,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
-
- /*
- * Sync out odd labels for every dirty vdev. If the system dies
- * in the middle of this process, the even labels and the new
- * uberblocks will suffice to open the pool. The next time
- * the pool is opened, the first thing we'll do -- before any
- * user data is modified -- is mark every vdev dirty so that
- * all labels will be brought up to date.
- */
- for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
- vd = list_next(&spa->spa_dirty_list, vd)) {
- for (l = 0; l < VDEV_LABELS; l++) {
- if ((l & 1) == 0)
- continue;
- if ((error = vdev_sync_labels(vd, l, txg)) != 0)
- return (error);
- }
- }
-
- /*
- * Flush the new labels to disk. This ensures that all odd-label
- * updates are committed to stable storage before the next
- * transaction group begins.
- */
- zio = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
- for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
- vd = list_next(&spa->spa_dirty_list, vd)) {
- zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
- NULL, NULL, ZIO_PRIORITY_NOW,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
- }
- (void) zio_wait(zio);
-
- return (0);
-}
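
Stripped of the ZFS machinery, the ordering this function enforces is a strict alternation of writes and cache flushes. The sketch below reduces each phase to a print; the invariant is that each write phase is made durable before any phase that depends on it begins:

```c
#include <stdio.h>

static void
phase(const char *what)
{
	printf("%s\n", what);
}

int
main(void)
{
	phase("1. flush caches of every disk written in this txg");
	phase("2. write even labels (L0, L2) on all dirty vdevs");
	phase("3. flush the even-label writes");
	phase("4. write the uberblocks");
	phase("5. flush the uberblock writes");
	phase("6. write odd labels (L1, L3) on all dirty vdevs");
	phase("7. flush the odd-label writes");
	return (0);
}
```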
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
deleted file mode 100644
index 73d1a83..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
+++ /dev/null
@@ -1,495 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/fs/zfs.h>
-
-/*
- * Virtual device vector for mirroring.
- */
-
-typedef struct mirror_child {
- vdev_t *mc_vd;
- uint64_t mc_offset;
- int mc_error;
- short mc_tried;
- short mc_skipped;
-} mirror_child_t;
-
-typedef struct mirror_map {
- int mm_children;
- int mm_replacing;
- int mm_preferred;
- int mm_root;
- mirror_child_t mm_child[1];
-} mirror_map_t;
-
-int vdev_mirror_shift = 21;
-
-static mirror_map_t *
-vdev_mirror_map_alloc(zio_t *zio)
-{
- mirror_map_t *mm = NULL;
- mirror_child_t *mc;
- vdev_t *vd = zio->io_vd;
- int c, d;
-
- if (vd == NULL) {
- dva_t *dva = zio->io_bp->blk_dva;
- spa_t *spa = zio->io_spa;
-
- c = BP_GET_NDVAS(zio->io_bp);
-
- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
- mm->mm_children = c;
- mm->mm_replacing = B_FALSE;
- mm->mm_preferred = spa_get_random(c);
- mm->mm_root = B_TRUE;
-
- /*
- * Check the other, lower-index DVAs to see if they're on
- * the same vdev as the child we picked. If they are, use
- * them since they are likely to have been allocated from
- * the primary metaslab in use at the time, and hence are
- * more likely to have locality with single-copy data.
- */
- for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
- if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
- mm->mm_preferred = d;
- }
-
- for (c = 0; c < mm->mm_children; c++) {
- mc = &mm->mm_child[c];
-
- mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
- mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
- }
- } else {
- c = vd->vdev_children;
-
- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
- mm->mm_children = c;
- mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
- vd->vdev_ops == &vdev_spare_ops);
- mm->mm_preferred = mm->mm_replacing ? 0 :
- (zio->io_offset >> vdev_mirror_shift) % c;
- mm->mm_root = B_FALSE;
-
- for (c = 0; c < mm->mm_children; c++) {
- mc = &mm->mm_child[c];
- mc->mc_vd = vd->vdev_child[c];
- mc->mc_offset = zio->io_offset;
- }
- }
-
- zio->io_vsd = mm;
- return (mm);
-}
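
The preferred-child computation above spreads reads across a non-replacing mirror in chunks of 1 << vdev_mirror_shift bytes (2MB), so sequential readers fan out over the disks. A standalone sketch showing which child a hypothetical two-way mirror prefers at successive offsets:

```c
#include <stdio.h>
#include <stdint.h>

static int vdev_mirror_shift = 21;	/* 2MB regions, as above */

int
main(void)
{
	uint64_t off;
	int children = 2;		/* hypothetical two-way mirror */

	/* Offsets 0, 2, 4, 6 MB prefer children 0, 1, 0, 1. */
	for (off = 0; off < (8ULL << 20); off += (2ULL << 20))
		printf("offset %lluMB -> child %d\n",
		    (unsigned long long)(off >> 20),
		    (int)((off >> vdev_mirror_shift) % children));
	return (0);
}
```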
-
-static void
-vdev_mirror_map_free(zio_t *zio)
-{
- mirror_map_t *mm = zio->io_vsd;
-
- kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
- zio->io_vsd = NULL;
-}
-
-static int
-vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
-{
- vdev_t *cvd;
- uint64_t c;
- int numerrors = 0;
- int ret, lasterror = 0;
-
- if (vd->vdev_children == 0) {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (EINVAL);
- }
-
- for (c = 0; c < vd->vdev_children; c++) {
- cvd = vd->vdev_child[c];
-
- if ((ret = vdev_open(cvd)) != 0) {
- lasterror = ret;
- numerrors++;
- continue;
- }
-
- *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
- *ashift = MAX(*ashift, cvd->vdev_ashift);
- }
-
- if (numerrors == vd->vdev_children) {
- vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
- return (lasterror);
- }
-
- return (0);
-}
-
-static void
-vdev_mirror_close(vdev_t *vd)
-{
- uint64_t c;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_close(vd->vdev_child[c]);
-}
-
-static void
-vdev_mirror_child_done(zio_t *zio)
-{
- mirror_child_t *mc = zio->io_private;
-
- mc->mc_error = zio->io_error;
- mc->mc_tried = 1;
- mc->mc_skipped = 0;
-}
-
-static void
-vdev_mirror_scrub_done(zio_t *zio)
-{
- mirror_child_t *mc = zio->io_private;
-
- if (zio->io_error == 0) {
- zio_t *pio = zio->io_parent;
- mutex_enter(&pio->io_lock);
- ASSERT3U(zio->io_size, >=, pio->io_size);
- bcopy(zio->io_data, pio->io_data, pio->io_size);
- mutex_exit(&pio->io_lock);
- }
-
- zio_buf_free(zio->io_data, zio->io_size);
-
- mc->mc_error = zio->io_error;
- mc->mc_tried = 1;
- mc->mc_skipped = 0;
-}
-
-static void
-vdev_mirror_repair_done(zio_t *zio)
-{
- ASSERT(zio->io_private == zio->io_parent);
- vdev_mirror_map_free(zio->io_private);
-}
-
-/*
- * Try to find a child whose DTL doesn't contain the block we want to read.
- * If we can't, try the read on any vdev we haven't already tried.
- */
-static int
-vdev_mirror_child_select(zio_t *zio)
-{
- mirror_map_t *mm = zio->io_vsd;
- mirror_child_t *mc;
- uint64_t txg = zio->io_txg;
- int i, c;
-
- ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
-
- /*
- * Try to find a child whose DTL doesn't contain the block to read.
- * If a child is known to be completely inaccessible (indicated by
- * vdev_is_dead() returning B_TRUE), don't even try.
- */
- for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
- if (c >= mm->mm_children)
- c = 0;
- mc = &mm->mm_child[c];
- if (mc->mc_tried || mc->mc_skipped)
- continue;
- if (vdev_is_dead(mc->mc_vd)) {
- mc->mc_error = ENXIO;
- mc->mc_tried = 1; /* don't even try */
- mc->mc_skipped = 1;
- continue;
- }
- if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1))
- return (c);
- mc->mc_error = ESTALE;
- mc->mc_skipped = 1;
- }
-
- /*
- * Every device is either missing or has this txg in its DTL.
- * Look for any child we haven't already tried before giving up.
- */
- for (c = 0; c < mm->mm_children; c++)
- if (!mm->mm_child[c].mc_tried)
- return (c);
-
- /*
- * Every child failed. There's no place left to look.
- */
- return (-1);
-}
-
-static void
-vdev_mirror_io_start(zio_t *zio)
-{
- mirror_map_t *mm;
- mirror_child_t *mc;
- int c, children;
-
- mm = vdev_mirror_map_alloc(zio);
-
- if (zio->io_type == ZIO_TYPE_READ) {
- if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
- /*
- * For scrubbing reads we need to allocate a read
- * buffer for each child and issue reads to all
- * children. If any child succeeds, it will copy its
- * data into zio->io_data in vdev_mirror_scrub_done.
- */
- for (c = 0; c < mm->mm_children; c++) {
- mc = &mm->mm_child[c];
- zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset,
- zio_buf_alloc(zio->io_size), zio->io_size,
- zio->io_type, zio->io_priority,
- ZIO_FLAG_CANFAIL,
- vdev_mirror_scrub_done, mc));
- }
- zio_wait_children_done(zio);
- return;
- }
- /*
- * For normal reads just pick one child.
- */
- c = vdev_mirror_child_select(zio);
- children = (c >= 0);
- } else {
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
-
- /*
- * If this is a resilvering I/O to a replacing vdev,
- * only the last child should be written -- unless the
- * first child happens to have a DTL entry here as well.
- * All other writes go to all children.
- */
- if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing &&
- !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map,
- zio->io_txg, 1)) {
- c = mm->mm_children - 1;
- children = 1;
- } else {
- c = 0;
- children = mm->mm_children;
- }
- }
-
- while (children--) {
- mc = &mm->mm_child[c];
- zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset,
- zio->io_data, zio->io_size, zio->io_type, zio->io_priority,
- ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc));
- c++;
- }
-
- zio_wait_children_done(zio);
-}
-
-static void
-vdev_mirror_io_done(zio_t *zio)
-{
- mirror_map_t *mm = zio->io_vsd;
- mirror_child_t *mc;
- int c;
- int good_copies = 0;
- int unexpected_errors = 0;
-
- zio->io_error = 0;
- zio->io_numerrors = 0;
-
- for (c = 0; c < mm->mm_children; c++) {
- mc = &mm->mm_child[c];
-
- if (mc->mc_tried && mc->mc_error == 0) {
- good_copies++;
- continue;
- }
-
- /*
- * We preserve any EIOs because those may be worth retrying,
- * whereas ECKSUM and ENXIO are more likely to be persistent.
- */
- if (mc->mc_error) {
- if (zio->io_error != EIO)
- zio->io_error = mc->mc_error;
- if (!mc->mc_skipped)
- unexpected_errors++;
- zio->io_numerrors++;
- }
- }
-
- if (zio->io_type == ZIO_TYPE_WRITE) {
- /*
- * XXX -- for now, treat partial writes as success.
- * XXX -- For a replacing vdev, we need to make sure the
- * new child succeeds.
- */
- /* XXPOLICY */
- if (good_copies != 0)
- zio->io_error = 0;
- vdev_mirror_map_free(zio);
- zio_next_stage(zio);
- return;
- }
-
- ASSERT(zio->io_type == ZIO_TYPE_READ);
-
- /*
- * If we don't have a good copy yet, keep trying other children.
- */
- /* XXPOLICY */
- if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
- ASSERT(c >= 0 && c < mm->mm_children);
- mc = &mm->mm_child[c];
- dprintf("retrying i/o (err=%d) on child %s\n",
- zio->io_error, vdev_description(mc->mc_vd));
- zio->io_error = 0;
- zio_vdev_io_redone(zio);
- zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
- ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
- vdev_mirror_child_done, mc));
- zio_wait_children_done(zio);
- return;
- }
-
- /* XXPOLICY */
- if (good_copies)
- zio->io_error = 0;
- else
- ASSERT(zio->io_error != 0);
-
- if (good_copies && (spa_mode & FWRITE) &&
- (unexpected_errors ||
- (zio->io_flags & ZIO_FLAG_RESILVER) ||
- ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
- zio_t *rio;
-
- /*
- * Use the good data we have in hand to repair damaged children.
- *
- * We issue all repair I/Os as children of 'rio' to arrange
- * that vdev_mirror_map_free(zio) will be invoked after all
- * repairs complete, but before we advance to the next stage.
- */
- rio = zio_null(zio, zio->io_spa,
- vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL);
-
- for (c = 0; c < mm->mm_children; c++) {
- /*
- * Don't rewrite known good children.
- * Not only is it unnecessary, it could
- * actually be harmful: if the system lost
- * power while rewriting the only good copy,
- * there would be no good copies left!
- */
- mc = &mm->mm_child[c];
-
- if (mc->mc_error == 0) {
- if (mc->mc_tried)
- continue;
- if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
- !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
- zio->io_txg, 1))
- continue;
- mc->mc_error = ESTALE;
- }
-
- dprintf("resilvered %s @ 0x%llx error %d\n",
- vdev_description(mc->mc_vd), mc->mc_offset,
- mc->mc_error);
-
- zio_nowait(zio_vdev_child_io(rio, zio->io_bp, mc->mc_vd,
- mc->mc_offset, zio->io_data, zio->io_size,
- ZIO_TYPE_WRITE, zio->io_priority,
- ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
- ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
- }
-
- zio_nowait(rio);
- zio_wait_children_done(zio);
- return;
- }
-
- vdev_mirror_map_free(zio);
- zio_next_stage(zio);
-}
-
-static void
-vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
-{
- if (faulted == vd->vdev_children)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_NO_REPLICAS);
- else if (degraded + faulted != 0)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
- else
- vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
-}
-
-vdev_ops_t vdev_mirror_ops = {
- vdev_mirror_open,
- vdev_mirror_close,
- vdev_default_asize,
- vdev_mirror_io_start,
- vdev_mirror_io_done,
- vdev_mirror_state_change,
- VDEV_TYPE_MIRROR, /* name of this vdev type */
- B_FALSE /* not a leaf vdev */
-};
-
-vdev_ops_t vdev_replacing_ops = {
- vdev_mirror_open,
- vdev_mirror_close,
- vdev_default_asize,
- vdev_mirror_io_start,
- vdev_mirror_io_done,
- vdev_mirror_state_change,
- VDEV_TYPE_REPLACING, /* name of this vdev type */
- B_FALSE /* not a leaf vdev */
-};
-
-vdev_ops_t vdev_spare_ops = {
- vdev_mirror_open,
- vdev_mirror_close,
- vdev_default_asize,
- vdev_mirror_io_start,
- vdev_mirror_io_done,
- vdev_mirror_state_change,
- VDEV_TYPE_SPARE, /* name of this vdev type */
- B_FALSE /* not a leaf vdev */
-};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
deleted file mode 100644
index b35f4a5..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * The 'missing' vdev is a special vdev type used only during import. It
- * signifies a placeholder in the root vdev for some vdev that we know is
- * missing. We pass it down to the kernel to allow the rest of the
- * configuration to be parsed and an attempt made to open all available devices.
- * Because its GUID is always 0, we know that the guid sum will mismatch and we
- * won't be able to open the pool anyway.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio.h>
-
-/* ARGSUSED */
-static int
-vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
-{
- /*
- * Really this should just fail. But then the root vdev will be in the
- * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is
- * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
- * will fail the GUID sum check before ever trying to open the pool.
- */
- *psize = SPA_MINDEVSIZE;
- *ashift = SPA_MINBLOCKSHIFT;
- return (0);
-}
-
-/* ARGSUSED */
-static void
-vdev_missing_close(vdev_t *vd)
-{
-}
-
-/* ARGSUSED */
-static void
-vdev_missing_io_start(zio_t *zio)
-{
- zio->io_error = ENOTSUP;
- zio_next_stage_async(zio);
-}
-
-/* ARGSUSED */
-static void
-vdev_missing_io_done(zio_t *zio)
-{
- zio_next_stage(zio);
-}
-
-vdev_ops_t vdev_missing_ops = {
- vdev_missing_open,
- vdev_missing_close,
- vdev_default_asize,
- vdev_missing_io_start,
- vdev_missing_io_done,
- NULL,
- VDEV_TYPE_MISSING, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
deleted file mode 100644
index 8ef524f..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/avl.h>
-
-/*
- * These tunables are for performance analysis.
- */
-/*
- * zfs_vdev_max_pending is the maximum number of i/os concurrently
- * pending to each device. zfs_vdev_min_pending is the initial number
- * of i/os pending to each device (before it starts ramping up to
- * max_pending).
- */
-int zfs_vdev_max_pending = 35;
-int zfs_vdev_min_pending = 4;
-
-/* deadline = pri + (LBOLT >> time_shift) */
-int zfs_vdev_time_shift = 6;
-
-/* exponential I/O issue ramp-up rate */
-int zfs_vdev_ramp_rate = 2;
-
-/*
- * i/os will be aggregated into a single large i/o up to
- * zfs_vdev_aggregation_limit bytes long.
- */
-int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
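-
-/*
- * Illustrative sketch (editorial, not from the original source): the
- * deadline formula above, stated as a function. An I/O queued earlier,
- * or with a numerically lower (more urgent) priority, computes a
- * smaller deadline and therefore sorts first in vq_deadline_tree.
- * example_deadline() is a hypothetical name for this sketch only.
- */
-static uint64_t
-example_deadline(uint64_t timestamp, uint64_t priority)
-{
-	/* deadline = pri + (timestamp >> zfs_vdev_time_shift) */
-	return (priority + (timestamp >> 6));
-}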
-
-/*
- * Virtual device vector for disk I/O scheduling.
- */
-int
-vdev_queue_deadline_compare(const void *x1, const void *x2)
-{
- const zio_t *z1 = x1;
- const zio_t *z2 = x2;
-
- if (z1->io_deadline < z2->io_deadline)
- return (-1);
- if (z1->io_deadline > z2->io_deadline)
- return (1);
-
- if (z1->io_offset < z2->io_offset)
- return (-1);
- if (z1->io_offset > z2->io_offset)
- return (1);
-
- if (z1 < z2)
- return (-1);
- if (z1 > z2)
- return (1);
-
- return (0);
-}
-
-int
-vdev_queue_offset_compare(const void *x1, const void *x2)
-{
- const zio_t *z1 = x1;
- const zio_t *z2 = x2;
-
- if (z1->io_offset < z2->io_offset)
- return (-1);
- if (z1->io_offset > z2->io_offset)
- return (1);
-
- if (z1 < z2)
- return (-1);
- if (z1 > z2)
- return (1);
-
- return (0);
-}
-
-void
-vdev_queue_init(vdev_t *vd)
-{
- vdev_queue_t *vq = &vd->vdev_queue;
-
- mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
-
- avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
- sizeof (zio_t), offsetof(struct zio, io_deadline_node));
-
- avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
- sizeof (zio_t), offsetof(struct zio, io_offset_node));
-
- avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
- sizeof (zio_t), offsetof(struct zio, io_offset_node));
-
- avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
- sizeof (zio_t), offsetof(struct zio, io_offset_node));
-}
-
-void
-vdev_queue_fini(vdev_t *vd)
-{
- vdev_queue_t *vq = &vd->vdev_queue;
-
- avl_destroy(&vq->vq_deadline_tree);
- avl_destroy(&vq->vq_read_tree);
- avl_destroy(&vq->vq_write_tree);
- avl_destroy(&vq->vq_pending_tree);
-
- mutex_destroy(&vq->vq_lock);
-}
-
-static void
-vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
-{
- avl_add(&vq->vq_deadline_tree, zio);
- avl_add(zio->io_vdev_tree, zio);
-}
-
-static void
-vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
-{
- avl_remove(&vq->vq_deadline_tree, zio);
- avl_remove(zio->io_vdev_tree, zio);
-}
-
-static void
-vdev_queue_agg_io_done(zio_t *aio)
-{
- zio_t *dio;
- uint64_t offset = 0;
-
- while ((dio = aio->io_delegate_list) != NULL) {
- if (aio->io_type == ZIO_TYPE_READ)
- bcopy((char *)aio->io_data + offset, dio->io_data,
- dio->io_size);
- offset += dio->io_size;
- aio->io_delegate_list = dio->io_delegate_next;
- dio->io_delegate_next = NULL;
- dio->io_error = aio->io_error;
- zio_next_stage(dio);
- }
- ASSERT3U(offset, ==, aio->io_size);
-
- zio_buf_free(aio->io_data, aio->io_size);
-}
-
-#define IS_ADJACENT(io, nio) \
- ((io)->io_offset + (io)->io_size == (nio)->io_offset)
-
-typedef void zio_issue_func_t(zio_t *);
-
-static zio_t *
-vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
- zio_issue_func_t **funcp)
-{
- zio_t *fio, *lio, *aio, *dio;
- avl_tree_t *tree;
- uint64_t size;
-
- ASSERT(MUTEX_HELD(&vq->vq_lock));
-
- *funcp = NULL;
-
- if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
- avl_numnodes(&vq->vq_deadline_tree) == 0)
- return (NULL);
-
- fio = lio = avl_first(&vq->vq_deadline_tree);
-
- tree = fio->io_vdev_tree;
- size = fio->io_size;
-
- while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
- size + dio->io_size <= zfs_vdev_aggregation_limit) {
- dio->io_delegate_next = fio;
- fio = dio;
- size += dio->io_size;
- }
-
- while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
- size + dio->io_size <= zfs_vdev_aggregation_limit) {
- lio->io_delegate_next = dio;
- lio = dio;
- size += dio->io_size;
- }
-
- if (fio != lio) {
- char *buf = zio_buf_alloc(size);
- uint64_t offset = 0;
- int nagg = 0;
-
- ASSERT(size <= zfs_vdev_aggregation_limit);
-
- aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
- fio->io_offset, buf, size, fio->io_type,
- ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
- ZIO_FLAG_NOBOOKMARK,
- vdev_queue_agg_io_done, NULL);
-
- aio->io_delegate_list = fio;
-
- for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
- ASSERT(dio->io_type == aio->io_type);
- ASSERT(dio->io_vdev_tree == tree);
- if (dio->io_type == ZIO_TYPE_WRITE)
- bcopy(dio->io_data, buf + offset, dio->io_size);
- offset += dio->io_size;
- vdev_queue_io_remove(vq, dio);
- zio_vdev_io_bypass(dio);
- nagg++;
- }
-
- ASSERT(offset == size);
-
- dprintf("%5s T=%llu off=%8llx agg=%3d "
- "old=%5llx new=%5llx\n",
- zio_type_name[fio->io_type],
- fio->io_deadline, fio->io_offset, nagg, fio->io_size, size);
-
- avl_add(&vq->vq_pending_tree, aio);
-
- *funcp = zio_nowait;
- return (aio);
- }
-
- ASSERT(fio->io_vdev_tree == tree);
- vdev_queue_io_remove(vq, fio);
-
- avl_add(&vq->vq_pending_tree, fio);
-
- *funcp = zio_next_stage;
-
- return (fio);
-}
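-
-/*
- * Illustrative sketch (editorial, not from the original source): the
- * forward half of the aggregation walk above, restated over a plain
- * array sorted by offset. Starting from one I/O we extend the run
- * while the next entry is exactly adjacent (IS_ADJACENT) and the
- * total stays within the limit; the real code also extends backward
- * with AVL_PREV. All names here are hypothetical.
- */
-struct ex_io {
-	uint64_t offset;
-	uint64_t size;
-};
-
-/* Returns how many entries starting at ios[first] would coalesce. */
-static int
-example_aggregate(const struct ex_io *ios, int nios, int first,
-    uint64_t limit, uint64_t *aggsize)
-{
-	int n = 1;
-	uint64_t size = ios[first].size;
-
-	while (first + n < nios &&
-	    ios[first + n - 1].offset + ios[first + n - 1].size ==
-	    ios[first + n].offset &&
-	    size + ios[first + n].size <= limit) {
-		size += ios[first + n].size;
-		n++;
-	}
-	*aggsize = size;
-	return (n);
-}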
-
-zio_t *
-vdev_queue_io(zio_t *zio)
-{
- vdev_queue_t *vq = &zio->io_vd->vdev_queue;
- zio_t *nio;
- zio_issue_func_t *func;
-
- ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
-
- if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
- return (zio);
-
- zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
-
- if (zio->io_type == ZIO_TYPE_READ)
- zio->io_vdev_tree = &vq->vq_read_tree;
- else
- zio->io_vdev_tree = &vq->vq_write_tree;
-
- mutex_enter(&vq->vq_lock);
-
- zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
- zio->io_priority;
-
- vdev_queue_io_add(vq, zio);
-
- nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func);
-
- mutex_exit(&vq->vq_lock);
-
- if (nio == NULL || func != zio_nowait)
- return (nio);
-
- func(nio);
- return (NULL);
-}
-
-void
-vdev_queue_io_done(zio_t *zio)
-{
- vdev_queue_t *vq = &zio->io_vd->vdev_queue;
- zio_t *nio;
- zio_issue_func_t *func;
- int i;
-
- mutex_enter(&vq->vq_lock);
-
- avl_remove(&vq->vq_pending_tree, zio);
-
- for (i = 0; i < zfs_vdev_ramp_rate; i++) {
- nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func);
- if (nio == NULL)
- break;
- mutex_exit(&vq->vq_lock);
- if (func == zio_next_stage)
- zio_vdev_io_reissue(nio);
- func(nio);
- mutex_enter(&vq->vq_lock);
- }
-
- mutex_exit(&vq->vq_lock);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
deleted file mode 100644
index 0c86630..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
+++ /dev/null
@@ -1,1237 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/fs/zfs.h>
-#include <sys/fm/fs/zfs.h>
-
-/*
- * Virtual device vector for RAID-Z.
- *
- * This vdev supports both single and double parity. For single parity, we
- * use a simple XOR of all the data columns. For double parity, we use both
- * the simple XOR as well as a technique described in "The mathematics of
- * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
- * over the integers expressable in a single byte. Briefly, the operations on
- * the field are defined as follows:
- *
- * o addition (+) is represented by a bitwise XOR
- * o subtraction (-) is therefore identical to addition: A + B = A - B
- * o multiplication of A by 2 is defined by the following bitwise expression:
- * (A * 2)_7 = A_6
- * (A * 2)_6 = A_5
- * (A * 2)_5 = A_4
- * (A * 2)_4 = A_3 + A_7
- * (A * 2)_3 = A_2 + A_7
- * (A * 2)_2 = A_1 + A_7
- * (A * 2)_1 = A_0
- * (A * 2)_0 = A_7
- *
- * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
- *
- * Observe that any number in the field (except for 0) can be expressed as a
- * power of 2 -- a generator for the field. We store a table of the powers of
- * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
- * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
- * than field addition). The inverse of a field element A (A^-1) is A^254.
- *
- * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
- * can be expressed by field operations:
- *
- * P = D_0 + D_1 + ... + D_n-2 + D_n-1
- * Q = 2^(n-1) * D_0 + 2^(n-2) * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
- * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
- *
- * See the reconstruction code below for how P and Q can be used individually or
- * in concert to recover missing data columns.
- */
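-
-/*
- * Illustrative sketch (editorial, not from the original source): the
- * field operations described above, runnable in isolation. gf_mul2()
- * is exactly the multiply-by-2 expression from the comment; gf_mul()
- * multiplies by shift-and-xor, which agrees with the pow2/log2 table
- * method used below. All names here are hypothetical.
- */
-#include <assert.h>
-#include <stdint.h>
-
-static uint8_t
-gf_mul2(uint8_t a)
-{
-	return ((uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0)));
-}
-
-static uint8_t
-gf_mul(uint8_t a, uint8_t b)
-{
-	uint8_t p = 0;
-
-	while (b != 0) {
-		if (b & 1)
-			p ^= a;
-		a = gf_mul2(a);
-		b >>= 1;
-	}
-	return (p);
-}
-
-static void
-example_gf_checks(void)
-{
-	/* Addition is XOR, so subtraction is identical to addition. */
-	assert((0x53 ^ 0x53) == 0);
-	/* 2 generates the field: 2^8 = 2^4 * 2^4 = 0x1d. */
-	assert(gf_mul(0x10, 0x10) == 0x1d);
-	/* The inverse of A is A^254: here A = 2, and 2^254 = 0x8e. */
-	assert(gf_mul(0x02, 0x8e) == 0x01);
-}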
-
-typedef struct raidz_col {
- uint64_t rc_devidx; /* child device index for I/O */
- uint64_t rc_offset; /* device offset */
- uint64_t rc_size; /* I/O size */
- void *rc_data; /* I/O data */
- int rc_error; /* I/O error for this device */
- uint8_t rc_tried; /* Did we attempt this I/O column? */
- uint8_t rc_skipped; /* Did we skip this I/O column? */
-} raidz_col_t;
-
-typedef struct raidz_map {
- uint64_t rm_cols; /* Column count */
- uint64_t rm_bigcols; /* Number of oversized columns */
- uint64_t rm_asize; /* Actual total I/O size */
- uint64_t rm_missingdata; /* Count of missing data devices */
- uint64_t rm_missingparity; /* Count of missing parity devices */
- uint64_t rm_firstdatacol; /* First data column/parity count */
- raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
-} raidz_map_t;
-
-#define VDEV_RAIDZ_P 0
-#define VDEV_RAIDZ_Q 1
-
-#define VDEV_RAIDZ_MAXPARITY 2
-
-#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
-
-/*
- * These two tables represent powers and logs of 2 in the Galois field defined
- * above. These values were computed by repeatedly multiplying by 2 as above.
- */
-static const uint8_t vdev_raidz_pow2[256] = {
- 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
- 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
- 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
- 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
- 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
- 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
- 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
- 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
- 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
- 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
- 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
- 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
- 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
- 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
- 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
- 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
- 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
- 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
- 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
- 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
- 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
- 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
- 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
- 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
- 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
- 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
- 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
- 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
- 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
- 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
- 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
- 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
-};
-static const uint8_t vdev_raidz_log2[256] = {
- 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
- 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
- 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
- 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
- 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
- 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
- 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
- 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
- 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
- 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
- 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
- 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
- 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
- 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
- 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
- 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
- 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
- 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
- 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
- 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
- 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
- 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
- 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
- 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
- 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
- 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
- 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
- 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
- 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
- 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
- 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
- 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
-};
-
-/*
- * Multiply a given number by 2 raised to the given power.
- */
-static uint8_t
-vdev_raidz_exp2(uint_t a, int exp)
-{
- if (a == 0)
- return (0);
-
- ASSERT(exp >= 0);
- ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
-
- exp += vdev_raidz_log2[a];
- if (exp > 255)
- exp -= 255;
-
- return (vdev_raidz_pow2[exp]);
-}
-
-static raidz_map_t *
-vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
- uint64_t nparity)
-{
- raidz_map_t *rm;
- uint64_t b = zio->io_offset >> unit_shift;
- uint64_t s = zio->io_size >> unit_shift;
- uint64_t f = b % dcols;
- uint64_t o = (b / dcols) << unit_shift;
- uint64_t q, r, c, bc, col, acols, coff, devidx;
-
- q = s / (dcols - nparity);
- r = s - q * (dcols - nparity);
- bc = (r == 0 ? 0 : r + nparity);
-
- acols = (q == 0 ? bc : dcols);
-
- rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
-
- rm->rm_cols = acols;
- rm->rm_bigcols = bc;
- rm->rm_asize = 0;
- rm->rm_missingdata = 0;
- rm->rm_missingparity = 0;
- rm->rm_firstdatacol = nparity;
-
- for (c = 0; c < acols; c++) {
- col = f + c;
- coff = o;
- if (col >= dcols) {
- col -= dcols;
- coff += 1ULL << unit_shift;
- }
- rm->rm_col[c].rc_devidx = col;
- rm->rm_col[c].rc_offset = coff;
- rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
- rm->rm_col[c].rc_data = NULL;
- rm->rm_col[c].rc_error = 0;
- rm->rm_col[c].rc_tried = 0;
- rm->rm_col[c].rc_skipped = 0;
- rm->rm_asize += rm->rm_col[c].rc_size;
- }
-
- rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
-
- for (c = 0; c < rm->rm_firstdatacol; c++)
- rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
-
- rm->rm_col[c].rc_data = zio->io_data;
-
- for (c = c + 1; c < acols; c++)
- rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
- rm->rm_col[c - 1].rc_size;
-
- /*
- * If all data stored spans all columns, there's a danger that parity
- * will always be on the same device and, since parity isn't read
- * during normal operation, that that device's I/O bandwidth won't be
- * used effectively. We therefore switch the parity every 1MB.
- *
- * ... at least that was, ostensibly, the theory. As a practical
- * matter, unless we juggle the parity between all devices evenly, we
- * won't see any benefit. Further, occasional writes that aren't a
- * multiple of the LCM of the number of children and the minimum
- * stripe width are sufficient to avoid pessimal behavior.
- * Unfortunately, this decision created an implicit on-disk format
- * requirement that we need to support for all eternity, but only
- * for single-parity RAID-Z.
- */
- ASSERT(rm->rm_cols >= 2);
- ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
-
- if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
- devidx = rm->rm_col[0].rc_devidx;
- o = rm->rm_col[0].rc_offset;
- rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
- rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
- rm->rm_col[1].rc_devidx = devidx;
- rm->rm_col[1].rc_offset = o;
- }
-
- zio->io_vsd = rm;
- return (rm);
-}
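-
-/*
- * Illustrative sketch (editorial, not from the original source): the
- * map geometry above worked through for one concrete case. For a
- * 5-wide raidz1 (dcols = 5, nparity = 1) with 512-byte units
- * (unit_shift = 9), a 3KB write at offset 0 has s = 6 units, so
- * q = 6 / 4 = 1 full row, r = 2 leftover units, bc = r + nparity = 3
- * "big" columns, and acols = 5: columns 0-2 get q + 1 = 2 units each
- * and columns 3-4 get 1, i.e. one 2-unit parity column (column 0)
- * plus the 6 data units. example_raidz_col_size() is a hypothetical
- * name for this sketch only.
- */
-static uint64_t
-example_raidz_col_size(uint64_t s, uint64_t dcols, uint64_t nparity,
-    uint64_t c, uint64_t unit_shift)
-{
-	uint64_t q = s / (dcols - nparity);
-	uint64_t r = s - q * (dcols - nparity);
-	uint64_t bc = (r == 0 ? 0 : r + nparity);
-
-	/* The same expression used for rc_size above. */
-	return ((q + (c < bc)) << unit_shift);
-}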
-
-static void
-vdev_raidz_map_free(zio_t *zio)
-{
- raidz_map_t *rm = zio->io_vsd;
- int c;
-
- for (c = 0; c < rm->rm_firstdatacol; c++)
- zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
-
- kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
- zio->io_vsd = NULL;
-}
-
-static void
-vdev_raidz_generate_parity_p(raidz_map_t *rm)
-{
- uint64_t *p, *src, pcount, ccount, i;
- int c;
-
- pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
-
- if (c == rm->rm_firstdatacol) {
- ASSERT(ccount == pcount);
- for (i = 0; i < ccount; i++, p++, src++) {
- *p = *src;
- }
- } else {
- ASSERT(ccount <= pcount);
- for (i = 0; i < ccount; i++, p++, src++) {
- *p ^= *src;
- }
- }
- }
-}
-
-static void
-vdev_raidz_generate_parity_pq(raidz_map_t *rm)
-{
- uint64_t *q, *p, *src, pcount, ccount, mask, i;
- int c;
-
- pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
-
- if (c == rm->rm_firstdatacol) {
- ASSERT(ccount == pcount || ccount == 0);
- for (i = 0; i < ccount; i++, p++, q++, src++) {
- *q = *src;
- *p = *src;
- }
- for (; i < pcount; i++, p++, q++, src++) {
- *q = 0;
- *p = 0;
- }
- } else {
- ASSERT(ccount <= pcount);
-
- /*
- * Rather than multiplying each byte individually (as
- * described above), we are able to handle 8 at once
- * by generating a mask based on the high bit in each
- * byte and using that to conditionally XOR in 0x1d.
- */
- for (i = 0; i < ccount; i++, p++, q++, src++) {
- mask = *q & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
- *q ^= *src;
- *p ^= *src;
- }
-
- /*
- * Treat short columns as though they are full of 0s.
- */
- for (; i < pcount; i++, q++) {
- mask = *q & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
- }
- }
- }
-}
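-
-/*
- * Illustrative sketch (editorial, not from the original source): the
- * eight-bytes-at-once multiply-by-2 used above, in isolation. The
- * mask isolates each byte's high bit, (mask << 1) - (mask >> 7)
- * smears that bit across its byte (0x80 becomes 0xff), and the final
- * AND selects the reduction constant 0x1d exactly in the bytes where
- * the per-byte multiply would have reduced modulo the generator
- * polynomial. example_mul2x8() is a hypothetical name for this
- * sketch only.
- */
-static uint64_t
-example_mul2x8(uint64_t q)
-{
-	uint64_t mask = q & 0x8080808080808080ULL;
-
-	mask = (mask << 1) - (mask >> 7);
-	return (((q << 1) & 0xfefefefefefefefeULL) ^
-	    (mask & 0x1d1d1d1d1d1d1d1dULL));
-}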
-
-static void
-vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
-{
- uint64_t *dst, *src, xcount, ccount, count, i;
- int c;
-
- xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
- ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
- ASSERT(xcount > 0);
-
- src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- dst = rm->rm_col[x].rc_data;
- for (i = 0; i < xcount; i++, dst++, src++) {
- *dst = *src;
- }
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- dst = rm->rm_col[x].rc_data;
-
- if (c == x)
- continue;
-
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
- count = MIN(ccount, xcount);
-
- for (i = 0; i < count; i++, dst++, src++) {
- *dst ^= *src;
- }
- }
-}
-
-static void
-vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
-{
- uint64_t *dst, *src, xcount, ccount, count, mask, i;
- uint8_t *b;
- int c, j, exp;
-
- xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
- ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- dst = rm->rm_col[x].rc_data;
-
- if (c == x)
- ccount = 0;
- else
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
-
- count = MIN(ccount, xcount);
-
- if (c == rm->rm_firstdatacol) {
- for (i = 0; i < count; i++, dst++, src++) {
- *dst = *src;
- }
- for (; i < xcount; i++, dst++) {
- *dst = 0;
- }
-
- } else {
- /*
- * For an explanation of this, see the comment in
- * vdev_raidz_generate_parity_pq() above.
- */
- for (i = 0; i < count; i++, dst++, src++) {
- mask = *dst & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
- *dst ^= *src;
- }
-
- for (; i < xcount; i++, dst++) {
- mask = *dst & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
- }
- }
- }
-
- src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- dst = rm->rm_col[x].rc_data;
- exp = 255 - (rm->rm_cols - 1 - x);
-
- for (i = 0; i < xcount; i++, dst++, src++) {
- *dst ^= *src;
- for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
- *b = vdev_raidz_exp2(*b, exp);
- }
- }
-}
-
-static void
-vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
-{
- uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
- void *pdata, *qdata;
- uint64_t xsize, ysize, i;
-
- ASSERT(x < y);
- ASSERT(x >= rm->rm_firstdatacol);
- ASSERT(y < rm->rm_cols);
-
- ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
-
- /*
- * Move the parity data aside -- we're going to compute parity as
- * though columns x and y were full of zeros -- Pxy and Qxy. We want to
- * reuse the parity generation mechanism without trashing the actual
- * parity so we make those columns appear to be full of zeros by
- * setting their lengths to zero.
- */
- pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- xsize = rm->rm_col[x].rc_size;
- ysize = rm->rm_col[y].rc_size;
-
- rm->rm_col[VDEV_RAIDZ_P].rc_data =
- zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
- rm->rm_col[VDEV_RAIDZ_Q].rc_data =
- zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
- rm->rm_col[x].rc_size = 0;
- rm->rm_col[y].rc_size = 0;
-
- vdev_raidz_generate_parity_pq(rm);
-
- rm->rm_col[x].rc_size = xsize;
- rm->rm_col[y].rc_size = ysize;
-
- p = pdata;
- q = qdata;
- pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- xd = rm->rm_col[x].rc_data;
- yd = rm->rm_col[y].rc_data;
-
- /*
- * We now have:
- * Pxy = P + D_x + D_y
- * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
- *
- * We can then solve for D_x:
- * D_x = A * (P + Pxy) + B * (Q + Qxy)
- * where
- * A = 2^(x - y) * (2^(x - y) + 1)^-1
- * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
- *
- * With D_x in hand, we can easily solve for D_y:
- * D_y = P + Pxy + D_x
- */
-
- a = vdev_raidz_pow2[255 + x - y];
- b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
- tmp = 255 - vdev_raidz_log2[a ^ 1];
-
- aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
- bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
-
- for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
- *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
- vdev_raidz_exp2(*q ^ *qxy, bexp);
-
- if (i < ysize)
- *yd = *p ^ *pxy ^ *xd;
- }
-
- zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
- rm->rm_col[VDEV_RAIDZ_P].rc_size);
- zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
-
- /*
- * Restore the saved parity data.
- */
- rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
- rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
-}
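-
-/*
- * Illustrative sketch (editorial, not from the original source): the
- * two-erasure algebra above on a toy three-data-column stripe, using
- * a shift-and-xor GF(2^8) multiply instead of the pow2/log2 tables.
- * With Q coefficients 4, 2, 1 (2^2, 2^1, 2^0) and data d0, d1, d2:
- *
- *	P = d0 ^ d1 ^ d2
- *	Q = 4*d0 ^ 2*d1 ^ 1*d2
- *
- * If d0 and d1 are lost, recomputing parity with them zeroed gives
- * Pxy = d2 and Qxy = d2, so u = P ^ Pxy = d0 ^ d1 and
- * v = Q ^ Qxy = 4*d0 ^ 2*d1. Then v ^ 2*u = (4 ^ 2)*d0 = 6*d0, so
- * d0 = 6^-1 * (v ^ 2*u) and d1 = u ^ d0. All names are hypothetical.
- */
-#include <assert.h>
-#include <stdint.h>
-
-static uint8_t
-ex_gf_mul(uint8_t a, uint8_t b)
-{
-	uint8_t p = 0;
-
-	while (b != 0) {
-		if (b & 1)
-			p ^= a;
-		a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
-		b >>= 1;
-	}
-	return (p);
-}
-
-static uint8_t
-ex_gf_inv(uint8_t a)
-{
-	uint8_t r = 1;
-	int i;
-
-	for (i = 0; i < 254; i++)	/* A^-1 = A^254 */
-		r = ex_gf_mul(r, a);
-	return (r);
-}
-
-/* Works for any bytes, e.g. example_reconstruct_pq(0xde, 0xad, 0xbe). */
-static void
-example_reconstruct_pq(uint8_t d0, uint8_t d1, uint8_t d2)
-{
-	uint8_t p = d0 ^ d1 ^ d2;
-	uint8_t q = ex_gf_mul(4, d0) ^ ex_gf_mul(2, d1) ^ d2;
-	uint8_t u = p ^ d2;	/* P ^ Pxy */
-	uint8_t v = q ^ d2;	/* Q ^ Qxy */
-	uint8_t x0 = ex_gf_mul(ex_gf_inv(6), (uint8_t)(v ^ ex_gf_mul(2, u)));
-	uint8_t x1 = u ^ x0;
-
-	assert(x0 == d0 && x1 == d1);
-}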
-
-
-static int
-vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
-{
- vdev_t *cvd;
- uint64_t nparity = vd->vdev_nparity;
- int c, error;
- int lasterror = 0;
- int numerrors = 0;
-
- ASSERT(nparity > 0);
-
- if (nparity > VDEV_RAIDZ_MAXPARITY ||
- vd->vdev_children < nparity + 1) {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (EINVAL);
- }
-
- for (c = 0; c < vd->vdev_children; c++) {
- cvd = vd->vdev_child[c];
-
- if ((error = vdev_open(cvd)) != 0) {
- lasterror = error;
- numerrors++;
- continue;
- }
-
- *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
- *ashift = MAX(*ashift, cvd->vdev_ashift);
- }
-
- *asize *= vd->vdev_children;
-
- if (numerrors > nparity) {
- vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
- return (lasterror);
- }
-
- return (0);
-}
-
-static void
-vdev_raidz_close(vdev_t *vd)
-{
- int c;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_close(vd->vdev_child[c]);
-}
-
-static uint64_t
-vdev_raidz_asize(vdev_t *vd, uint64_t psize)
-{
- uint64_t asize;
- uint64_t ashift = vd->vdev_top->vdev_ashift;
- uint64_t cols = vd->vdev_children;
- uint64_t nparity = vd->vdev_nparity;
-
- asize = ((psize - 1) >> ashift) + 1;
- asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
- asize = roundup(asize, nparity + 1) << ashift;
-
- return (asize);
-}
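-
-/*
- * Illustrative sketch (editorial, not from the original source): the
- * asize formula above worked through. For a 5-wide raidz1 (cols = 5,
- * nparity = 1) with ashift = 9, a 3KB psize is 6 sectors of data;
- * parity adds ceil(6 / 4) = 2 sectors; rounding up to a multiple of
- * nparity + 1 = 2 leaves 8 sectors, so asize = 8 << 9 = 4KB -- which
- * matches the rm_asize computed for the same stripe in
- * vdev_raidz_map_alloc() above. example_raidz_asize() is a
- * hypothetical name for this sketch only.
- */
-static uint64_t
-example_raidz_asize(uint64_t psize, uint64_t ashift, uint64_t cols,
-    uint64_t nparity)
-{
-	uint64_t asize = ((psize - 1) >> ashift) + 1;
-
-	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
-	asize = ((asize + nparity) / (nparity + 1)) * (nparity + 1);
-	return (asize << ashift);
-}
-/* example_raidz_asize(3072, 9, 5, 1) == 4096 */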
-
-static void
-vdev_raidz_child_done(zio_t *zio)
-{
- raidz_col_t *rc = zio->io_private;
-
- rc->rc_error = zio->io_error;
- rc->rc_tried = 1;
- rc->rc_skipped = 0;
-}
-
-static void
-vdev_raidz_repair_done(zio_t *zio)
-{
- ASSERT(zio->io_private == zio->io_parent);
- vdev_raidz_map_free(zio->io_private);
-}
-
-static void
-vdev_raidz_io_start(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_t *tvd = vd->vdev_top;
- vdev_t *cvd;
- blkptr_t *bp = zio->io_bp;
- raidz_map_t *rm;
- raidz_col_t *rc;
- int c;
-
- rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
- vd->vdev_nparity);
-
- ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
-
- if (zio->io_type == ZIO_TYPE_WRITE) {
- /*
- * Generate RAID parity in the first virtual columns.
- */
- if (rm->rm_firstdatacol == 1)
- vdev_raidz_generate_parity_p(rm);
- else
- vdev_raidz_generate_parity_pq(rm);
-
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_data, rc->rc_size,
- zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
- vdev_raidz_child_done, rc));
- }
- zio_wait_children_done(zio);
- return;
- }
-
- ASSERT(zio->io_type == ZIO_TYPE_READ);
-
- /*
- * Iterate over the columns in reverse order so that we hit the parity
- * last -- any errors along the way will force us to read the parity
- * data.
- */
- for (c = rm->rm_cols - 1; c >= 0; c--) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
- if (vdev_is_dead(cvd)) {
- if (c >= rm->rm_firstdatacol)
- rm->rm_missingdata++;
- else
- rm->rm_missingparity++;
- rc->rc_error = ENXIO;
- rc->rc_tried = 1; /* don't even try */
- rc->rc_skipped = 1;
- continue;
- }
- if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
- if (c >= rm->rm_firstdatacol)
- rm->rm_missingdata++;
- else
- rm->rm_missingparity++;
- rc->rc_error = ESTALE;
- rc->rc_skipped = 1;
- continue;
- }
- if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
- (zio->io_flags & ZIO_FLAG_SCRUB)) {
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_data, rc->rc_size,
- zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
- vdev_raidz_child_done, rc));
- }
- }
-
- zio_wait_children_done(zio);
-}
-
-/*
- * Report a checksum error for a child of a RAID-Z device.
- */
-static void
-raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
-{
- vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
- dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
- vdev_description(vd));
-
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&vd->vdev_stat_lock);
- }
-
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
- zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
- zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
-}
-
-/*
- * Generate the parity from the data columns. If we tried and were able to
- * read the parity without error, verify that the generated parity matches the
- * data we read. If it doesn't, we fire off a checksum error. Return the
- * number of such failures.
- */
-static int
-raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
-{
- void *orig[VDEV_RAIDZ_MAXPARITY];
- int c, ret = 0;
- raidz_col_t *rc;
-
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- rc = &rm->rm_col[c];
- if (!rc->rc_tried || rc->rc_error != 0)
- continue;
- orig[c] = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig[c], rc->rc_size);
- }
-
- if (rm->rm_firstdatacol == 1)
- vdev_raidz_generate_parity_p(rm);
- else
- vdev_raidz_generate_parity_pq(rm);
-
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- rc = &rm->rm_col[c];
- if (!rc->rc_tried || rc->rc_error != 0)
- continue;
- if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
- raidz_checksum_error(zio, rc);
- rc->rc_error = ECKSUM;
- ret++;
- }
- zio_buf_free(orig[c], rc->rc_size);
- }
-
- return (ret);
-}
-
-static uint64_t raidz_corrected_p;
-static uint64_t raidz_corrected_q;
-static uint64_t raidz_corrected_pq;
-
-static void
-vdev_raidz_io_done(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_t *cvd;
- raidz_map_t *rm = zio->io_vsd;
- raidz_col_t *rc, *rc1;
- int unexpected_errors = 0;
- int parity_errors = 0;
- int parity_untried = 0;
- int data_errors = 0;
- int n, c, c1;
-
- ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
-
- zio->io_error = 0;
- zio->io_numerrors = 0;
-
- ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
- ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
-
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
-
- /*
- * We preserve any EIOs because those may be worth retrying,
- * whereas ECKSUM and ENXIO are more likely to be persistent.
- */
- if (rc->rc_error) {
- if (zio->io_error != EIO)
- zio->io_error = rc->rc_error;
-
- if (c < rm->rm_firstdatacol)
- parity_errors++;
- else
- data_errors++;
-
- if (!rc->rc_skipped)
- unexpected_errors++;
-
- zio->io_numerrors++;
- } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
- parity_untried++;
- }
- }
-
- if (zio->io_type == ZIO_TYPE_WRITE) {
- /*
- * If this is not a failfast write, and we were able to
- * write enough columns to reconstruct the data, good enough.
- */
- /* XXPOLICY */
- if (zio->io_numerrors <= rm->rm_firstdatacol &&
- !(zio->io_flags & ZIO_FLAG_FAILFAST))
- zio->io_error = 0;
-
- vdev_raidz_map_free(zio);
- zio_next_stage(zio);
- return;
- }
-
- ASSERT(zio->io_type == ZIO_TYPE_READ);
- /*
- * There are three potential phases for a read:
- * 1. produce valid data from the columns read
- * 2. read all disks and try again
- * 3. perform combinatorial reconstruction
- *
- * Each phase is progressively both more expensive and less likely to
- * occur. If we encounter more errors than we can repair or all phases
- * fail, we have no choice but to return an error.
- */
-
- /*
- * If the number of errors we saw was correctable -- less than or equal
- * to the number of parity disks read -- attempt to produce data that
- * has a valid checksum. Naturally, this case applies in the absence of
- * any errors.
- */
- if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
- switch (data_errors) {
- case 0:
- if (zio_checksum_error(zio) == 0) {
- zio->io_error = 0;
-
- /*
- * If we read parity information (unnecessarily
- * as it happens since no reconstruction was
- * needed) regenerate and verify the parity.
- * We also regenerate parity when resilvering
- * so we can write it out to the failed device
- * later.
- */
- if (parity_errors + parity_untried <
- rm->rm_firstdatacol ||
- (zio->io_flags & ZIO_FLAG_RESILVER)) {
- n = raidz_parity_verify(zio, rm);
- unexpected_errors += n;
- ASSERT(parity_errors + n <=
- rm->rm_firstdatacol);
- }
- goto done;
- }
- break;
-
- case 1:
- /*
- * We either attempt to read all the parity columns or
- * none of them. If we didn't try to read parity, we
- * wouldn't be here in the correctable case. There must
- * also have been fewer parity errors than parity
- * columns or, again, we wouldn't be in this code path.
- */
- ASSERT(parity_untried == 0);
- ASSERT(parity_errors < rm->rm_firstdatacol);
-
- /*
- * Find the column that reported the error.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0)
- break;
- }
- ASSERT(c != rm->rm_cols);
- ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
- rc->rc_error == ESTALE);
-
- if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
- vdev_raidz_reconstruct_p(rm, c);
- } else {
- ASSERT(rm->rm_firstdatacol > 1);
- vdev_raidz_reconstruct_q(rm, c);
- }
-
- if (zio_checksum_error(zio) == 0) {
- zio->io_error = 0;
- if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
- atomic_inc_64(&raidz_corrected_p);
- else
- atomic_inc_64(&raidz_corrected_q);
-
- /*
- * If there's more than one parity disk that
- * was successfully read, confirm that the
- * other parity disk produced the correct data.
- * This routine is suboptimal in that it
- * regenerates both the parity we wish to test
- * and the parity we just used to
- * perform the reconstruction, but this should
- * be a relatively uncommon case, and can be
- * optimized if it becomes a problem.
- * We also regenerate parity when resilvering
- * so we can write it out to the failed device
- * later.
- */
- if (parity_errors < rm->rm_firstdatacol - 1 ||
- (zio->io_flags & ZIO_FLAG_RESILVER)) {
- n = raidz_parity_verify(zio, rm);
- unexpected_errors += n;
- ASSERT(parity_errors + n <=
- rm->rm_firstdatacol);
- }
-
- goto done;
- }
- break;
-
- case 2:
- /*
- * Two data column errors require double parity.
- */
- ASSERT(rm->rm_firstdatacol == 2);
-
- /*
- * Find the two columns that reported errors.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0)
- break;
- }
- ASSERT(c != rm->rm_cols);
- ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
- rc->rc_error == ESTALE);
-
- for (c1 = c++; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0)
- break;
- }
- ASSERT(c != rm->rm_cols);
- ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
- rc->rc_error == ESTALE);
-
- vdev_raidz_reconstruct_pq(rm, c1, c);
-
- if (zio_checksum_error(zio) == 0) {
- zio->io_error = 0;
- atomic_inc_64(&raidz_corrected_pq);
-
- goto done;
- }
- break;
-
- default:
- ASSERT(rm->rm_firstdatacol <= 2);
- ASSERT(0);
- }
- }
-
- /*
- * This isn't a typical situation -- either we got a read error or
- * a child silently returned bad data. Read every block so we can
- * try again with as much data and parity as we can track down. If
- * we've already been through once before, all children will be marked
- * as tried so we'll proceed to combinatorial reconstruction.
- */
- unexpected_errors = 1;
- rm->rm_missingdata = 0;
- rm->rm_missingparity = 0;
-
- for (c = 0; c < rm->rm_cols; c++) {
- if (rm->rm_col[c].rc_tried)
- continue;
-
- zio->io_error = 0;
- zio_vdev_io_redone(zio);
- do {
- rc = &rm->rm_col[c];
- if (rc->rc_tried)
- continue;
- zio_nowait(zio_vdev_child_io(zio, NULL,
- vd->vdev_child[rc->rc_devidx],
- rc->rc_offset, rc->rc_data, rc->rc_size,
- zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
- vdev_raidz_child_done, rc));
- } while (++c < rm->rm_cols);
- dprintf("rereading\n");
- zio_wait_children_done(zio);
- return;
- }
-
- /*
- * At this point we've attempted to reconstruct the data given the
- * errors we detected, and we've attempted to read all columns. There
- * must, therefore, be one or more additional problems -- silent errors
- * resulting in invalid data rather than explicit I/O errors resulting
- * in absent data. Before we attempt combinatorial reconstruction, make
- * sure we have a chance of coming up with the right answer.
- */
- if (zio->io_numerrors >= rm->rm_firstdatacol) {
- ASSERT(zio->io_error != 0);
- goto done;
- }
-
- if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
- /*
- * Attempt to reconstruct the data from parity P.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- void *orig;
- rc = &rm->rm_col[c];
-
- orig = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig, rc->rc_size);
- vdev_raidz_reconstruct_p(rm, c);
-
- if (zio_checksum_error(zio) == 0) {
- zio_buf_free(orig, rc->rc_size);
- zio->io_error = 0;
- atomic_inc_64(&raidz_corrected_p);
-
- /*
- * If this child didn't know that it returned
- * bad data, inform it.
- */
- if (rc->rc_tried && rc->rc_error == 0)
- raidz_checksum_error(zio, rc);
- rc->rc_error = ECKSUM;
- goto done;
- }
-
- bcopy(orig, rc->rc_data, rc->rc_size);
- zio_buf_free(orig, rc->rc_size);
- }
- }
-
- if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
- /*
- * Attempt to reconstruct the data from parity Q.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- void *orig;
- rc = &rm->rm_col[c];
-
- orig = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig, rc->rc_size);
- vdev_raidz_reconstruct_q(rm, c);
-
- if (zio_checksum_error(zio) == 0) {
- zio_buf_free(orig, rc->rc_size);
- zio->io_error = 0;
- atomic_inc_64(&raidz_corrected_q);
-
- /*
- * If this child didn't know that it returned
- * bad data, inform it.
- */
- if (rc->rc_tried && rc->rc_error == 0)
- raidz_checksum_error(zio, rc);
- rc->rc_error = ECKSUM;
- goto done;
- }
-
- bcopy(orig, rc->rc_data, rc->rc_size);
- zio_buf_free(orig, rc->rc_size);
- }
- }
-
- if (rm->rm_firstdatacol > 1 &&
- rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
- rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
- /*
- * Attempt to reconstruct the data from both P and Q.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
- void *orig, *orig1;
- rc = &rm->rm_col[c];
-
- orig = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig, rc->rc_size);
-
- for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
- rc1 = &rm->rm_col[c1];
-
- orig1 = zio_buf_alloc(rc1->rc_size);
- bcopy(rc1->rc_data, orig1, rc1->rc_size);
-
- vdev_raidz_reconstruct_pq(rm, c, c1);
-
- if (zio_checksum_error(zio) == 0) {
- zio_buf_free(orig, rc->rc_size);
- zio_buf_free(orig1, rc1->rc_size);
- zio->io_error = 0;
- atomic_inc_64(&raidz_corrected_pq);
-
- /*
- * If these children didn't know they
- * returned bad data, inform them.
- */
- if (rc->rc_tried && rc->rc_error == 0)
- raidz_checksum_error(zio, rc);
- if (rc1->rc_tried && rc1->rc_error == 0)
- raidz_checksum_error(zio, rc1);
-
- rc->rc_error = ECKSUM;
- rc1->rc_error = ECKSUM;
-
- goto done;
- }
-
- bcopy(orig1, rc1->rc_data, rc1->rc_size);
- zio_buf_free(orig1, rc1->rc_size);
- }
-
- bcopy(orig, rc->rc_data, rc->rc_size);
- zio_buf_free(orig, rc->rc_size);
- }
- }
-
- /*
- * All combinations failed to checksum. Generate checksum ereports for
- * all children.
- */
- zio->io_error = ECKSUM;
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
- zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
- rc->rc_offset, rc->rc_size);
- }
- }
-
-done:
- zio_checksum_verified(zio);
-
- if (zio->io_error == 0 && (spa_mode & FWRITE) &&
- (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
- zio_t *rio;
-
- /*
- * Use the good data we have in hand to repair damaged children.
- *
- * We issue all repair I/Os as children of 'rio' to arrange
- * that vdev_raidz_map_free(zio) will be invoked after all
- * repairs complete, but before we advance to the next stage.
- */
- rio = zio_null(zio, zio->io_spa,
- vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
-
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
-
- if (rc->rc_error == 0)
- continue;
-
- dprintf("%s resilvered %s @ 0x%llx error %d\n",
- vdev_description(vd),
- vdev_description(cvd),
- zio->io_offset, rc->rc_error);
-
- zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
- rc->rc_offset, rc->rc_data, rc->rc_size,
- ZIO_TYPE_WRITE, zio->io_priority,
- ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
- ZIO_FLAG_CANFAIL, NULL, NULL));
- }
-
- zio_nowait(rio);
- zio_wait_children_done(zio);
- return;
- }
-
- vdev_raidz_map_free(zio);
- zio_next_stage(zio);
-}
-
-static void
-vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
-{
- if (faulted > vd->vdev_nparity)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_NO_REPLICAS);
- else if (degraded + faulted != 0)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
- else
- vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
-}
-
-vdev_ops_t vdev_raidz_ops = {
- vdev_raidz_open,
- vdev_raidz_close,
- vdev_raidz_asize,
- vdev_raidz_io_start,
- vdev_raidz_io_done,
- vdev_raidz_state_change,
- VDEV_TYPE_RAIDZ, /* name of this vdev type */
- B_FALSE /* not a leaf vdev */
-};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
deleted file mode 100644
index 0e8752c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/fs/zfs.h>
-
-/*
- * Virtual device vector for the pool's root vdev.
- */
-
-/*
- * We should be able to tolerate one failure with absolutely no damage
- * to our metadata. Two failures will take out space maps, a bunch of
- * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy
- * place to live. When we get smarter, we can liberalize this policy.
- * e.g. If we haven't lost two consecutive top-level vdevs, then we are
- * probably fine. Adding bean counters during alloc/free can make this
- * future guesswork more accurate.
- */
-/*ARGSUSED*/
-static int
-too_many_errors(vdev_t *vd, int numerrors)
-{
- return (numerrors > 0);
-}
-
-static int
-vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
-{
- vdev_t *cvd;
- int c, error;
- int lasterror = 0;
- int numerrors = 0;
-
- if (vd->vdev_children == 0) {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (EINVAL);
- }
-
- for (c = 0; c < vd->vdev_children; c++) {
- cvd = vd->vdev_child[c];
-
- if ((error = vdev_open(cvd)) != 0) {
- lasterror = error;
- numerrors++;
- continue;
- }
- }
-
- if (too_many_errors(vd, numerrors)) {
- vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
- return (lasterror);
- }
-
- *asize = 0;
- *ashift = 0;
-
- return (0);
-}
-
-static void
-vdev_root_close(vdev_t *vd)
-{
- int c;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_close(vd->vdev_child[c]);
-}
-
-static void
-vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
-{
- if (too_many_errors(vd, faulted))
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_NO_REPLICAS);
- else if (degraded != 0)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
- else
- vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
-}
-
-vdev_ops_t vdev_root_ops = {
- vdev_root_open,
- vdev_root_close,
- vdev_default_asize,
- NULL, /* io_start - not applicable to the root */
- NULL, /* io_done - not applicable to the root */
- vdev_root_state_change,
- VDEV_TYPE_ROOT, /* name of this vdev type */
- B_FALSE /* not a leaf vdev */
-};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zap.c
deleted file mode 100644
index 4246ec0..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zap.c
+++ /dev/null
@@ -1,1071 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-
-/*
- * This file contains the top half of the zfs directory structure
- * implementation. The bottom half is in zap_leaf.c.
- *
- * The zdir is an extendable hash data structure. There is a table of
- * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
- * each a constant size and hold a variable number of directory entries.
- * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
- *
- * The pointer table holds a power of 2 number of pointers.
- * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
- * by the pointer at index i in the table holds entries whose hash value
- * has a zd_prefix_len-bit prefix.
- */
-
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/zfs_context.h>
-#include <sys/zap.h>
-#include <sys/refcount.h>
-#include <sys/zap_impl.h>
-#include <sys/zap_leaf.h>
-#include <sys/zfs_znode.h>
-
-int fzap_default_block_shift = 14; /* 16k blocksize */
-
-static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
-static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
-
-
-void
-fzap_byteswap(void *vbuf, size_t size)
-{
- uint64_t block_type;
-
- block_type = *(uint64_t *)vbuf;
-
- if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
- zap_leaf_byteswap(vbuf, size);
- else {
- /* it's a ptrtbl block */
- byteswap_uint64_array(vbuf, size);
- }
-}
-
-void
-fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
-{
- dmu_buf_t *db;
- zap_leaf_t *l;
- int i;
- zap_phys_t *zp;
-
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- zap->zap_ismicro = FALSE;
-
- (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
- &zap->zap_f.zap_phys, zap_evict);
-
- mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, MUTEX_DEFAULT, 0);
- zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1;
-
- zp = zap->zap_f.zap_phys;
- /*
- * explicitly zero it since it might be coming from an
- * initialized microzap
- */
- bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
- zp->zap_block_type = ZBT_HEADER;
- zp->zap_magic = ZAP_MAGIC;
-
- zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
-
- zp->zap_freeblk = 2; /* block 1 will be the first leaf */
- zp->zap_num_leafs = 1;
- zp->zap_num_entries = 0;
- zp->zap_salt = zap->zap_salt;
-
- /* block 1 will be the first leaf */
- for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
- ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
-
- /*
- * set up block 1 - the first leaf
- */
- VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db));
- dmu_buf_will_dirty(db, tx);
-
- l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
- l->l_dbuf = db;
- l->l_phys = db->db_data;
-
- zap_leaf_init(l);
-
- kmem_free(l, sizeof (zap_leaf_t));
- dmu_buf_rele(db, FTAG);
-}
-
-static int
-zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
-{
- if (RW_WRITE_HELD(&zap->zap_rwlock))
- return (1);
- if (rw_tryupgrade(&zap->zap_rwlock)) {
- dmu_buf_will_dirty(zap->zap_dbuf, tx);
- return (1);
- }
- return (0);
-}
-
-/*
- * Generic routines for dealing with the pointer & cookie tables.
- */
-
-static int
-zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
- void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
- dmu_tx_t *tx)
-{
- uint64_t b, newblk;
- dmu_buf_t *db_old, *db_new;
- int err;
- int bs = FZAP_BLOCK_SHIFT(zap);
- int hepb = 1<<(bs-4);
- /* hepb = half the number of entries in a block */
-
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- ASSERT(tbl->zt_blk != 0);
- ASSERT(tbl->zt_numblks > 0);
-
- if (tbl->zt_nextblk != 0) {
- newblk = tbl->zt_nextblk;
- } else {
- newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
- tbl->zt_nextblk = newblk;
- ASSERT3U(tbl->zt_blks_copied, ==, 0);
- dmu_prefetch(zap->zap_objset, zap->zap_object,
- tbl->zt_blk << bs, tbl->zt_numblks << bs);
- }
-
- /*
- * Copy the ptrtbl from the old to new location.
- */
-
- b = tbl->zt_blks_copied;
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + b) << bs, FTAG, &db_old);
- if (err)
- return (err);
-
- /* first half of entries in old[b] go to new[2*b+0] */
- VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (newblk + 2*b+0) << bs, FTAG, &db_new));
- dmu_buf_will_dirty(db_new, tx);
- transfer_func(db_old->db_data, db_new->db_data, hepb);
- dmu_buf_rele(db_new, FTAG);
-
- /* second half of entries in old[b] go to new[2*b+1] */
- VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (newblk + 2*b+1) << bs, FTAG, &db_new));
- dmu_buf_will_dirty(db_new, tx);
- transfer_func((uint64_t *)db_old->db_data + hepb,
- db_new->db_data, hepb);
- dmu_buf_rele(db_new, FTAG);
-
- dmu_buf_rele(db_old, FTAG);
-
- tbl->zt_blks_copied++;
-
- dprintf("copied block %llu of %llu\n",
- tbl->zt_blks_copied, tbl->zt_numblks);
-
- if (tbl->zt_blks_copied == tbl->zt_numblks) {
- (void) dmu_free_range(zap->zap_objset, zap->zap_object,
- tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
-
- tbl->zt_blk = newblk;
- tbl->zt_numblks *= 2;
- tbl->zt_shift++;
- tbl->zt_nextblk = 0;
- tbl->zt_blks_copied = 0;
-
- dprintf("finished; numblocks now %llu (%lluk entries)\n",
- tbl->zt_numblks, 1<<(tbl->zt_shift-10));
- }
-
- return (0);
-}
-
-static int
-zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
- dmu_tx_t *tx)
-{
- int err;
- uint64_t blk, off;
- int bs = FZAP_BLOCK_SHIFT(zap);
- dmu_buf_t *db;
-
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- ASSERT(tbl->zt_blk != 0);
-
- dprintf("storing %llx at index %llx\n", val, idx);
-
- blk = idx >> (bs-3);
- off = idx & ((1<<(bs-3))-1);
-
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + blk) << bs, FTAG, &db);
- if (err)
- return (err);
- dmu_buf_will_dirty(db, tx);
-
- if (tbl->zt_nextblk != 0) {
- uint64_t idx2 = idx * 2;
- uint64_t blk2 = idx2 >> (bs-3);
- uint64_t off2 = idx2 & ((1<<(bs-3))-1);
- dmu_buf_t *db2;
-
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_nextblk + blk2) << bs, FTAG, &db2);
- if (err) {
- dmu_buf_rele(db, FTAG);
- return (err);
- }
- dmu_buf_will_dirty(db2, tx);
- ((uint64_t *)db2->db_data)[off2] = val;
- ((uint64_t *)db2->db_data)[off2+1] = val;
- dmu_buf_rele(db2, FTAG);
- }
-
- ((uint64_t *)db->db_data)[off] = val;
- dmu_buf_rele(db, FTAG);
-
- return (0);
-}
-
-static int
-zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
-{
- uint64_t blk, off;
- int err;
- dmu_buf_t *db;
- int bs = FZAP_BLOCK_SHIFT(zap);
-
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
- blk = idx >> (bs-3);
- off = idx & ((1<<(bs-3))-1);
-
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + blk) << bs, FTAG, &db);
- if (err)
- return (err);
- *valp = ((uint64_t *)db->db_data)[off];
- dmu_buf_rele(db, FTAG);
-
- if (tbl->zt_nextblk != 0) {
- /*
-		 * Read the nextblk for the sake of i/o error checking,
-		 * so that zap_table_load() will catch errors on behalf
-		 * of zap_table_store().
- */
- blk = (idx*2) >> (bs-3);
-
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_nextblk + blk) << bs, FTAG, &db);
- dmu_buf_rele(db, FTAG);
- }
- return (err);
-}
-
-/*
- * Routines for growing the ptrtbl.
- */
-
-static void
-zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
-{
- int i;
- for (i = 0; i < n; i++) {
- uint64_t lb = src[i];
- dst[2*i+0] = lb;
- dst[2*i+1] = lb;
- }
-}
-
-static int
-zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
-{
- /* In case things go horribly wrong. */
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2)
- return (ENOSPC);
-
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
- /*
- * We are outgrowing the "embedded" ptrtbl (the one
- * stored in the header block). Give it its own entire
- * block, which will double the size of the ptrtbl.
- */
- uint64_t newblk;
- dmu_buf_t *db_new;
- int err;
-
- ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
- ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
- ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
-
- newblk = zap_allocate_blocks(zap, 1);
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new);
- if (err)
- return (err);
- dmu_buf_will_dirty(db_new, tx);
- zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
- db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
- dmu_buf_rele(db_new, FTAG);
-
- zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
- zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
- zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
-
- ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
- zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
- (FZAP_BLOCK_SHIFT(zap)-3));
-
- return (0);
- } else {
- return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
- zap_ptrtbl_transfer, tx));
- }
-}
-
-static void
-zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
-{
- dmu_buf_will_dirty(zap->zap_dbuf, tx);
- mutex_enter(&zap->zap_f.zap_num_entries_mtx);
- ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
- zap->zap_f.zap_phys->zap_num_entries += delta;
- mutex_exit(&zap->zap_f.zap_num_entries_mtx);
-}
-
-static uint64_t
-zap_allocate_blocks(zap_t *zap, int nblocks)
-{
- uint64_t newblk;
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- newblk = zap->zap_f.zap_phys->zap_freeblk;
- zap->zap_f.zap_phys->zap_freeblk += nblocks;
- return (newblk);
-}
-
-static zap_leaf_t *
-zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
-{
- void *winner;
- zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
-
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
- rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0);
- rw_enter(&l->l_rwlock, RW_WRITER);
- l->l_blkid = zap_allocate_blocks(zap, 1);
- l->l_dbuf = NULL;
- l->l_phys = NULL;
-
- VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf));
- winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
- ASSERT(winner == NULL);
- dmu_buf_will_dirty(l->l_dbuf, tx);
-
- zap_leaf_init(l);
-
- zap->zap_f.zap_phys->zap_num_leafs++;
-
- return (l);
-}
-
-int
-fzap_count(zap_t *zap, uint64_t *count)
-{
- ASSERT(!zap->zap_ismicro);
- mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
- *count = zap->zap_f.zap_phys->zap_num_entries;
- mutex_exit(&zap->zap_f.zap_num_entries_mtx);
- return (0);
-}
-
-/*
- * Routines for obtaining zap_leaf_t's
- */
-
-void
-zap_put_leaf(zap_leaf_t *l)
-{
- rw_exit(&l->l_rwlock);
- dmu_buf_rele(l->l_dbuf, NULL);
-}
-
-_NOTE(ARGSUSED(0))
-static void
-zap_leaf_pageout(dmu_buf_t *db, void *vl)
-{
- zap_leaf_t *l = vl;
-
- rw_destroy(&l->l_rwlock);
- kmem_free(l, sizeof (zap_leaf_t));
-}
-
-static zap_leaf_t *
-zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
-{
- zap_leaf_t *l, *winner;
-
- ASSERT(blkid != 0);
-
- l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
- rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0);
- rw_enter(&l->l_rwlock, RW_WRITER);
- l->l_blkid = blkid;
- l->l_bs = highbit(db->db_size)-1;
- l->l_dbuf = db;
- l->l_phys = NULL;
-
- winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
-
- rw_exit(&l->l_rwlock);
- if (winner != NULL) {
- /* someone else set it first */
- zap_leaf_pageout(NULL, l);
- l = winner;
- }
-
- /*
-	 * lh_pad1 was previously used for the next leaf in the leaf
-	 * chain.  There should be no chained leaves (as we have removed
-	 * support for them).
- */
- ASSERT3U(l->l_phys->l_hdr.lh_pad1, ==, 0);
-
- /*
- * There should be more hash entries than there can be
- * chunks to put in the hash table
- */
- ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
-
- /* The chunks should begin at the end of the hash table */
- ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
- &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
-
- /* The chunks should end at the end of the block */
- ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
- (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size);
-
- return (l);
-}
-
-static int
-zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
- zap_leaf_t **lp)
-{
- dmu_buf_t *db;
- zap_leaf_t *l;
- int bs = FZAP_BLOCK_SHIFT(zap);
- int err;
-
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- blkid << bs, NULL, &db);
- if (err)
- return (err);
-
- ASSERT3U(db->db_object, ==, zap->zap_object);
- ASSERT3U(db->db_offset, ==, blkid << bs);
- ASSERT3U(db->db_size, ==, 1 << bs);
- ASSERT(blkid != 0);
-
- l = dmu_buf_get_user(db);
-
- if (l == NULL)
- l = zap_open_leaf(blkid, db);
-
- rw_enter(&l->l_rwlock, lt);
- /*
- * Must lock before dirtying, otherwise l->l_phys could change,
- * causing ASSERT below to fail.
- */
- if (lt == RW_WRITER)
- dmu_buf_will_dirty(db, tx);
- ASSERT3U(l->l_blkid, ==, blkid);
- ASSERT3P(l->l_dbuf, ==, db);
- ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
- ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF);
- ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
-
- *lp = l;
- return (0);
-}
-
-static int
-zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
-{
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
- ASSERT3U(idx, <,
- (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
- *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
- return (0);
- } else {
- return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
- idx, valp));
- }
-}
-
-static int
-zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
-{
- ASSERT(tx != NULL);
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
- ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
- return (0);
- } else {
- return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
- idx, blk, tx));
- }
-}
-
-static int
-zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
-{
- uint64_t idx, blk;
- int err;
-
- ASSERT(zap->zap_dbuf == NULL ||
- zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
- ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
- idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
- err = zap_idx_to_blk(zap, idx, &blk);
- if (err != 0)
- return (err);
- err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
-
- ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) ==
- (*lp)->l_phys->l_hdr.lh_prefix);
- return (err);
-}
-
-static int
-zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx,
- zap_leaf_t **lp)
-{
- zap_leaf_t *nl;
- int prefix_diff, i, err;
- uint64_t sibling;
- int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
-
- ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
- ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
- l->l_phys->l_hdr.lh_prefix);
-
- if (zap_tryupgradedir(zap, tx) == 0 ||
- old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
- /* We failed to upgrade, or need to grow the pointer table */
- objset_t *os = zap->zap_objset;
- uint64_t object = zap->zap_object;
-
- zap_put_leaf(l);
- zap_unlockdir(zap);
- err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap);
- if (err)
- return (err);
- ASSERT(!zap->zap_ismicro);
-
- while (old_prefix_len ==
- zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
- err = zap_grow_ptrtbl(zap, tx);
- if (err)
- return (err);
- }
-
- err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
- if (err)
- return (err);
-
- if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) {
- /* it split while our locks were down */
- *lp = l;
- return (0);
- }
- }
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
- ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
- l->l_phys->l_hdr.lh_prefix);
-
- prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
- (old_prefix_len + 1);
- sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
-
- /* check for i/o errors before doing zap_leaf_split */
- for (i = 0; i < (1ULL<<prefix_diff); i++) {
- uint64_t blk;
- err = zap_idx_to_blk(zap, sibling+i, &blk);
- if (err)
- return (err);
- ASSERT3U(blk, ==, l->l_blkid);
- }
-
- nl = zap_create_leaf(zap, tx);
- zap_leaf_split(l, nl);
-
- /* set sibling pointers */
- for (i = 0; i < (1ULL<<prefix_diff); i++) {
- err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
- ASSERT3U(err, ==, 0); /* we checked for i/o errors above */
- }
-
- if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) {
- /* we want the sibling */
- zap_put_leaf(l);
- *lp = nl;
- } else {
- zap_put_leaf(nl);
- *lp = l;
- }
-
- return (0);
-}
-
-static void
-zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx)
-{
- int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
- int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift &&
- l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
-
- zap_put_leaf(l);
-
- if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) {
- int err;
-
- /*
- * We are in the middle of growing the pointer table, or
- * this leaf will soon make us grow it.
- */
- if (zap_tryupgradedir(zap, tx) == 0) {
- objset_t *os = zap->zap_objset;
- uint64_t zapobj = zap->zap_object;
-
- zap_unlockdir(zap);
- err = zap_lockdir(os, zapobj, tx,
- RW_WRITER, FALSE, &zap);
- if (err)
- return;
- }
-
- /* could have finished growing while our locks were down */
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift)
- (void) zap_grow_ptrtbl(zap, tx);
- }
-}
-
-
-static int
-fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers)
-{
- if (name && strlen(name) > ZAP_MAXNAMELEN)
- return (E2BIG);
-
- /* Only integer sizes supported by C */
- switch (integer_size) {
- case 1:
- case 2:
- case 4:
- case 8:
- break;
- default:
- return (EINVAL);
- }
-
- if (integer_size * num_integers > ZAP_MAXVALUELEN)
- return (E2BIG);
-
- return (0);
-}
-
-/*
- * Routines for manipulating attributes.
- */
-int
-fzap_lookup(zap_t *zap, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf)
-{
- zap_leaf_t *l;
- int err;
- uint64_t hash;
- zap_entry_handle_t zeh;
-
- err = fzap_checksize(name, integer_size, num_integers);
- if (err != 0)
- return (err);
-
- hash = zap_hash(zap, name);
- err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l);
- if (err != 0)
- return (err);
- err = zap_leaf_lookup(l, name, hash, &zeh);
- if (err == 0)
- err = zap_entry_read(&zeh, integer_size, num_integers, buf);
-
- zap_put_leaf(l);
- return (err);
-}
-
-int
-fzap_add_cd(zap_t *zap, const char *name,
- uint64_t integer_size, uint64_t num_integers,
- const void *val, uint32_t cd, dmu_tx_t *tx)
-{
- zap_leaf_t *l;
- uint64_t hash;
- int err;
- zap_entry_handle_t zeh;
-
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- ASSERT(!zap->zap_ismicro);
- ASSERT(fzap_checksize(name, integer_size, num_integers) == 0);
-
- hash = zap_hash(zap, name);
- err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
- if (err != 0)
- return (err);
-retry:
- err = zap_leaf_lookup(l, name, hash, &zeh);
- if (err == 0) {
- err = EEXIST;
- goto out;
- }
- if (err != ENOENT)
- goto out;
-
- err = zap_entry_create(l, name, hash, cd,
- integer_size, num_integers, val, &zeh);
-
- if (err == 0) {
- zap_increment_num_entries(zap, 1, tx);
- } else if (err == EAGAIN) {
- err = zap_expand_leaf(zap, l, hash, tx, &l);
- if (err == 0)
- goto retry;
- }
-
-out:
- zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
- return (err);
-}
-
-int
-fzap_add(zap_t *zap, const char *name,
- uint64_t integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx)
-{
- int err = fzap_checksize(name, integer_size, num_integers);
- if (err != 0)
- return (err);
-
- return (fzap_add_cd(zap, name, integer_size, num_integers,
- val, ZAP_MAXCD, tx));
-}
-
-int
-fzap_update(zap_t *zap, const char *name,
- int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
-{
- zap_leaf_t *l;
- uint64_t hash;
- int err, create;
- zap_entry_handle_t zeh;
-
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- err = fzap_checksize(name, integer_size, num_integers);
- if (err != 0)
- return (err);
-
- hash = zap_hash(zap, name);
- err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
- if (err != 0)
- return (err);
-retry:
- err = zap_leaf_lookup(l, name, hash, &zeh);
- create = (err == ENOENT);
- ASSERT(err == 0 || err == ENOENT);
-
- /* XXX If this leaf is chained, split it if we can. */
-
- if (create) {
- err = zap_entry_create(l, name, hash, ZAP_MAXCD,
- integer_size, num_integers, val, &zeh);
- if (err == 0)
- zap_increment_num_entries(zap, 1, tx);
- } else {
- err = zap_entry_update(&zeh, integer_size, num_integers, val);
- }
-
- if (err == EAGAIN) {
- err = zap_expand_leaf(zap, l, hash, tx, &l);
- if (err == 0)
- goto retry;
- }
-
- zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
- return (err);
-}
-
-int
-fzap_length(zap_t *zap, const char *name,
- uint64_t *integer_size, uint64_t *num_integers)
-{
- zap_leaf_t *l;
- int err;
- uint64_t hash;
- zap_entry_handle_t zeh;
-
- hash = zap_hash(zap, name);
- err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l);
- if (err != 0)
- return (err);
- err = zap_leaf_lookup(l, name, hash, &zeh);
- if (err != 0)
- goto out;
-
- if (integer_size)
- *integer_size = zeh.zeh_integer_size;
- if (num_integers)
- *num_integers = zeh.zeh_num_integers;
-out:
- zap_put_leaf(l);
- return (err);
-}
-
-int
-fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx)
-{
- zap_leaf_t *l;
- uint64_t hash;
- int err;
- zap_entry_handle_t zeh;
-
- hash = zap_hash(zap, name);
- err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
- if (err != 0)
- return (err);
- err = zap_leaf_lookup(l, name, hash, &zeh);
- if (err == 0) {
- zap_entry_remove(&zeh);
- zap_increment_num_entries(zap, -1, tx);
- }
- zap_put_leaf(l);
- dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n",
- zap->zap_objset, zap->zap_object, name, err);
- return (err);
-}
-
-int
-zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name)
-{
- zap_cursor_t zc;
- zap_attribute_t *za;
- int err;
-
- za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
- for (zap_cursor_init(&zc, os, zapobj);
- (err = zap_cursor_retrieve(&zc, za)) == 0;
- zap_cursor_advance(&zc)) {
- if (ZFS_DIRENT_OBJ(za->za_first_integer) == value) {
- (void) strcpy(name, za->za_name);
- break;
- }
- }
- zap_cursor_fini(&zc);
- kmem_free(za, sizeof (zap_attribute_t));
- return (err);
-}
-
-
-/*
- * Routines for iterating over the attributes.
- */
-
-int
-fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
-{
- int err = ENOENT;
- zap_entry_handle_t zeh;
- zap_leaf_t *l;
-
- /* retrieve the next entry at or after zc_hash/zc_cd */
- /* if no entry, return ENOENT */
-
- if (zc->zc_leaf &&
- (ZAP_HASH_IDX(zc->zc_hash,
- zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) !=
- zc->zc_leaf->l_phys->l_hdr.lh_prefix)) {
- rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
- zap_put_leaf(zc->zc_leaf);
- zc->zc_leaf = NULL;
- }
-
-again:
- if (zc->zc_leaf == NULL) {
- err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
- &zc->zc_leaf);
- if (err != 0)
- return (err);
- } else {
- rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
- }
- l = zc->zc_leaf;
-
- err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
-
- if (err == ENOENT) {
- uint64_t nocare =
- (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1;
- zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
- zc->zc_cd = 0;
- if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) {
- zc->zc_hash = -1ULL;
- } else {
- zap_put_leaf(zc->zc_leaf);
- zc->zc_leaf = NULL;
- goto again;
- }
- }
-
- if (err == 0) {
- zc->zc_hash = zeh.zeh_hash;
- zc->zc_cd = zeh.zeh_cd;
- za->za_integer_length = zeh.zeh_integer_size;
- za->za_num_integers = zeh.zeh_num_integers;
- if (zeh.zeh_num_integers == 0) {
- za->za_first_integer = 0;
- } else {
- err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
- ASSERT(err == 0 || err == EOVERFLOW);
- }
- err = zap_entry_read_name(&zeh,
- sizeof (za->za_name), za->za_name);
- ASSERT(err == 0);
- }
- rw_exit(&zc->zc_leaf->l_rwlock);
- return (err);
-}
-
-
-static void
-zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
-{
- int i, err;
- uint64_t lastblk = 0;
-
- /*
- * NB: if a leaf has more pointers than an entire ptrtbl block
- * can hold, then it'll be accounted for more than once, since
- * we won't have lastblk.
- */
- for (i = 0; i < len; i++) {
- zap_leaf_t *l;
-
- if (tbl[i] == lastblk)
- continue;
- lastblk = tbl[i];
-
- err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
- if (err == 0) {
- zap_leaf_stats(zap, l, zs);
- zap_put_leaf(l);
- }
- }
-}
-
-void
-fzap_get_stats(zap_t *zap, zap_stats_t *zs)
-{
- int bs = FZAP_BLOCK_SHIFT(zap);
- zs->zs_blocksize = 1ULL << bs;
-
- /*
- * Set zap_phys_t fields
- */
- zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
- zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
- zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
- zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type;
- zs->zs_magic = zap->zap_f.zap_phys->zap_magic;
- zs->zs_salt = zap->zap_f.zap_phys->zap_salt;
-
- /*
- * Set zap_ptrtbl fields
- */
- zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
- zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk;
- zs->zs_ptrtbl_blks_copied =
- zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied;
- zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk;
- zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
- zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
-
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
- /* the ptrtbl is entirely in the header block. */
- zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
- 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
- } else {
- int b;
-
- dmu_prefetch(zap->zap_objset, zap->zap_object,
- zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs,
- zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs);
-
- for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
- b++) {
- dmu_buf_t *db;
- int err;
-
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
- FTAG, &db);
- if (err == 0) {
- zap_stats_ptrtbl(zap, db->db_data,
- 1<<(bs-3), zs);
- dmu_buf_rele(db, FTAG);
- }
- }
- }
-}
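
zap_grow_ptrtbl() and zap_table_grow() above double the pointer table by
copying every old entry into two adjacent slots (zap_ptrtbl_transfer), one
old block per call, so each hash keeps resolving through the top zt_shift
bits to the same leaf until that leaf actually splits. A self-contained
sketch of that invariant, using toy sizes in place of the block-sized
tables:

#include <stdint.h>
#include <stdio.h>

#define OLD_SHIFT	2			/* toy table: 4 entries */

static uint64_t
hash_idx(uint64_t h, int shift)
{
	/* the top 'shift' bits of a 64-bit hash pick the table slot */
	return (h >> (64 - shift));
}

int
main(void)
{
	uint64_t oldtbl[1 << OLD_SHIFT] = { 10, 10, 11, 12 };
	uint64_t newtbl[1 << (OLD_SHIFT + 1)];
	uint64_t h = 0xC000000000000000ULL;	/* top bits 11... */
	int i;

	/* the transfer step: old[i] feeds both new[2i] and new[2i+1] */
	for (i = 0; i < (1 << OLD_SHIFT); i++) {
		newtbl[2 * i + 0] = oldtbl[i];
		newtbl[2 * i + 1] = oldtbl[i];
	}

	printf("before: leaf %llu, after: leaf %llu\n",
	    (unsigned long long)oldtbl[hash_idx(h, OLD_SHIFT)],
	    (unsigned long long)newtbl[hash_idx(h, OLD_SHIFT + 1)]);
	return (0);
}
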
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
deleted file mode 100644
index 5dff514..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
+++ /dev/null
@@ -1,741 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * The 512-byte leaf is broken into 32 16-byte chunks.
- * Chunk number n means l_chunk[n], even though the header precedes it.
- * The names are stored null-terminated.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/zap.h>
-#include <sys/zap_impl.h>
-#include <sys/zap_leaf.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
-
-#define CHAIN_END 0xffff /* end of the chunk chain */
-
-/* half the (current) minimum block size */
-#define MAX_ARRAY_BYTES (8<<10)
-
-#define LEAF_HASH(l, h) \
- ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
- ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))
-
-#define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
-
-
-static void
-zap_memset(void *a, int c, size_t n)
-{
- char *cp = a;
- char *cpend = cp + n;
-
- while (cp < cpend)
- *cp++ = c;
-}
-
-static void
-stv(int len, void *addr, uint64_t value)
-{
- switch (len) {
- case 1:
- *(uint8_t *)addr = value;
- return;
- case 2:
- *(uint16_t *)addr = value;
- return;
- case 4:
- *(uint32_t *)addr = value;
- return;
- case 8:
- *(uint64_t *)addr = value;
- return;
- }
- ASSERT(!"bad int len");
-}
-
-static uint64_t
-ldv(int len, const void *addr)
-{
- switch (len) {
- case 1:
- return (*(uint8_t *)addr);
- case 2:
- return (*(uint16_t *)addr);
- case 4:
- return (*(uint32_t *)addr);
- case 8:
- return (*(uint64_t *)addr);
- }
- ASSERT(!"bad int len");
- return (0xFEEDFACEDEADBEEFULL);
-}
-
-void
-zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
-{
- int i;
- zap_leaf_t l;
- l.l_bs = highbit(size)-1;
- l.l_phys = buf;
-
- buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type);
- buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix);
- buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic);
- buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree);
- buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries);
- buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
- buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
-
- for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
- buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
-
- for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
- zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
- struct zap_leaf_entry *le;
-
- switch (lc->l_free.lf_type) {
- case ZAP_CHUNK_ENTRY:
- le = &lc->l_entry;
-
- le->le_type = BSWAP_8(le->le_type);
- le->le_int_size = BSWAP_8(le->le_int_size);
- le->le_next = BSWAP_16(le->le_next);
- le->le_name_chunk = BSWAP_16(le->le_name_chunk);
- le->le_name_length = BSWAP_16(le->le_name_length);
- le->le_value_chunk = BSWAP_16(le->le_value_chunk);
- le->le_value_length = BSWAP_16(le->le_value_length);
- le->le_cd = BSWAP_32(le->le_cd);
- le->le_hash = BSWAP_64(le->le_hash);
- break;
- case ZAP_CHUNK_FREE:
- lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type);
- lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next);
- break;
- case ZAP_CHUNK_ARRAY:
- lc->l_array.la_type = BSWAP_8(lc->l_array.la_type);
- lc->l_array.la_next = BSWAP_16(lc->l_array.la_next);
- /* la_array doesn't need swapping */
- break;
- default:
- ASSERT(!"bad leaf type");
- }
- }
-}
-
-void
-zap_leaf_init(zap_leaf_t *l)
-{
- int i;
-
- l->l_bs = highbit(l->l_dbuf->db_size)-1;
- zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
- zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
- for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
- ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
- ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
- }
- ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
- l->l_phys->l_hdr.lh_block_type = ZBT_LEAF;
- l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
- l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
-}
-
-/*
- * Routines which manipulate leaf chunks (l_chunk[]).
- */
-
-static uint16_t
-zap_leaf_chunk_alloc(zap_leaf_t *l)
-{
- int chunk;
-
- ASSERT(l->l_phys->l_hdr.lh_nfree > 0);
-
- chunk = l->l_phys->l_hdr.lh_freelist;
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
-
- l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
-
- l->l_phys->l_hdr.lh_nfree--;
-
- return (chunk);
-}
-
-static void
-zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
-{
- struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
- ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
-
- zlf->lf_type = ZAP_CHUNK_FREE;
- zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
- bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
- l->l_phys->l_hdr.lh_freelist = chunk;
-
- l->l_phys->l_hdr.lh_nfree++;
-}
-
-/*
- * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
- */
-
-static uint16_t
-zap_leaf_array_create(zap_leaf_t *l, const char *buf,
- int integer_size, int num_integers)
-{
- uint16_t chunk_head;
- uint16_t *chunkp = &chunk_head;
- int byten = 0;
- uint64_t value;
- int shift = (integer_size-1)*8;
- int len = num_integers;
-
- ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
-
- while (len > 0) {
- uint16_t chunk = zap_leaf_chunk_alloc(l);
- struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
- int i;
-
- la->la_type = ZAP_CHUNK_ARRAY;
- for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
- if (byten == 0)
- value = ldv(integer_size, buf);
- la->la_array[i] = value >> shift;
- value <<= 8;
- if (++byten == integer_size) {
- byten = 0;
- buf += integer_size;
- if (--len == 0)
- break;
- }
- }
-
- *chunkp = chunk;
- chunkp = &la->la_next;
- }
- *chunkp = CHAIN_END;
-
- return (chunk_head);
-}
-
-static void
-zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
-{
- uint16_t chunk = *chunkp;
-
- *chunkp = CHAIN_END;
-
- while (chunk != CHAIN_END) {
- int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
- ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
- ZAP_CHUNK_ARRAY);
- zap_leaf_chunk_free(l, chunk);
- chunk = nextchunk;
- }
-}
-
-/* array_len and buf_len are in integers, not bytes */
-static void
-zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
- int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
- char *buf)
-{
- int len = MIN(array_len, buf_len);
- int byten = 0;
- uint64_t value = 0;
-
- ASSERT3U(array_int_len, <=, buf_int_len);
-
- /* Fast path for one 8-byte integer */
- if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
- struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
- uint8_t *ip = la->la_array;
- uint64_t *buf64 = (uint64_t *)buf;
-
- *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
- (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
- (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
- (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
- return;
- }
-
- /* Fast path for an array of 1-byte integers (eg. the entry name) */
- if (array_int_len == 1 && buf_int_len == 1 &&
- buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
- while (chunk != CHAIN_END) {
- struct zap_leaf_array *la =
- &ZAP_LEAF_CHUNK(l, chunk).l_array;
- bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES);
- buf += ZAP_LEAF_ARRAY_BYTES;
- chunk = la->la_next;
- }
- return;
- }
-
- while (len > 0) {
- struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
- int i;
-
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
- value = (value << 8) | la->la_array[i];
- byten++;
- if (byten == array_int_len) {
- stv(buf_int_len, buf, value);
- byten = 0;
- len--;
- if (len == 0)
- return;
- buf += buf_int_len;
- }
- }
- chunk = la->la_next;
- }
-}
-
-/*
- * Only to be used on 8-bit arrays.
- * array_len is actual len in bytes (not encoded le_value_length).
- * buf is null-terminated.
- */
-static int
-zap_leaf_array_equal(zap_leaf_t *l, int chunk,
- int array_len, const char *buf)
-{
- int bseen = 0;
-
- while (bseen < array_len) {
- struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
- int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- if (bcmp(la->la_array, buf + bseen, toread))
- break;
- chunk = la->la_next;
- bseen += toread;
- }
- return (bseen == array_len);
-}
-
-/*
- * Routines which manipulate leaf entries.
- */
-
-int
-zap_leaf_lookup(zap_leaf_t *l,
- const char *name, uint64_t h, zap_entry_handle_t *zeh)
-{
- uint16_t *chunkp;
- struct zap_leaf_entry *le;
-
- ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
-
- for (chunkp = LEAF_HASH_ENTPTR(l, h);
- *chunkp != CHAIN_END; chunkp = &le->le_next) {
- uint16_t chunk = *chunkp;
- le = ZAP_LEAF_ENTRY(l, chunk);
-
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
- if (le->le_hash != h)
- continue;
-
- if (zap_leaf_array_equal(l, le->le_name_chunk,
- le->le_name_length, name)) {
- zeh->zeh_num_integers = le->le_value_length;
- zeh->zeh_integer_size = le->le_int_size;
- zeh->zeh_cd = le->le_cd;
- zeh->zeh_hash = le->le_hash;
- zeh->zeh_chunkp = chunkp;
- zeh->zeh_leaf = l;
- return (0);
- }
- }
-
- return (ENOENT);
-}
-
-/* Return (h1,cd1 >= h2,cd2) */
-#define HCD_GTEQ(h1, cd1, h2, cd2) \
- ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
-
-int
-zap_leaf_lookup_closest(zap_leaf_t *l,
- uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
-{
- uint16_t chunk;
- uint64_t besth = -1ULL;
- uint32_t bestcd = ZAP_MAXCD;
- uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
- uint16_t lh;
- struct zap_leaf_entry *le;
-
- ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
-
- for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
- for (chunk = l->l_phys->l_hash[lh];
- chunk != CHAIN_END; chunk = le->le_next) {
- le = ZAP_LEAF_ENTRY(l, chunk);
-
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
- if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
- HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
- ASSERT3U(bestlh, >=, lh);
- bestlh = lh;
- besth = le->le_hash;
- bestcd = le->le_cd;
-
- zeh->zeh_num_integers = le->le_value_length;
- zeh->zeh_integer_size = le->le_int_size;
- zeh->zeh_cd = le->le_cd;
- zeh->zeh_hash = le->le_hash;
- zeh->zeh_fakechunk = chunk;
- zeh->zeh_chunkp = &zeh->zeh_fakechunk;
- zeh->zeh_leaf = l;
- }
- }
- }
-
- return (bestcd == ZAP_MAXCD ? ENOENT : 0);
-}
-
-int
-zap_entry_read(const zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, void *buf)
-{
- struct zap_leaf_entry *le =
- ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
- ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
- if (le->le_int_size > integer_size)
- return (EINVAL);
-
- zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_int_size,
- le->le_value_length, integer_size, num_integers, buf);
-
- if (zeh->zeh_num_integers > num_integers)
- return (EOVERFLOW);
- return (0);
-
-}
-
-int
-zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
-{
- struct zap_leaf_entry *le =
- ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
- ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
- zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
- le->le_name_length, 1, buflen, buf);
- if (le->le_name_length > buflen)
- return (EOVERFLOW);
- return (0);
-}
-
-int
-zap_entry_update(zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, const void *buf)
-{
- int delta_chunks;
- zap_leaf_t *l = zeh->zeh_leaf;
- struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
-
- delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
- ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * le->le_int_size);
-
- if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
- return (EAGAIN);
-
- /*
-	 * We should search other chained leaves (via
-	 * zap_entry_remove/create?); otherwise returning EAGAIN will
- * just send us into an infinite loop if we have to chain
- * another leaf block, rather than being able to split this
- * block.
- */
-
- zap_leaf_array_free(l, &le->le_value_chunk);
- le->le_value_chunk =
- zap_leaf_array_create(l, buf, integer_size, num_integers);
- le->le_value_length = num_integers;
- le->le_int_size = integer_size;
- return (0);
-}
-
-void
-zap_entry_remove(zap_entry_handle_t *zeh)
-{
- uint16_t entry_chunk;
- struct zap_leaf_entry *le;
- zap_leaf_t *l = zeh->zeh_leaf;
-
- ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
-
- entry_chunk = *zeh->zeh_chunkp;
- le = ZAP_LEAF_ENTRY(l, entry_chunk);
- ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
- zap_leaf_array_free(l, &le->le_name_chunk);
- zap_leaf_array_free(l, &le->le_value_chunk);
-
- *zeh->zeh_chunkp = le->le_next;
- zap_leaf_chunk_free(l, entry_chunk);
-
- l->l_phys->l_hdr.lh_nentries--;
-}
-
-int
-zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
- uint8_t integer_size, uint64_t num_integers, const void *buf,
- zap_entry_handle_t *zeh)
-{
- uint16_t chunk;
- uint16_t *chunkp;
- struct zap_leaf_entry *le;
- uint64_t namelen, valuelen;
- int numchunks;
-
- valuelen = integer_size * num_integers;
- namelen = strlen(name) + 1;
- ASSERT(namelen >= 2);
-
- numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) +
- ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
- if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
- return (E2BIG);
-
- if (cd == ZAP_MAXCD) {
- for (cd = 0; cd < ZAP_MAXCD; cd++) {
- for (chunk = *LEAF_HASH_ENTPTR(l, h);
- chunk != CHAIN_END; chunk = le->le_next) {
- le = ZAP_LEAF_ENTRY(l, chunk);
- if (le->le_hash == h &&
- le->le_cd == cd) {
- break;
- }
- }
- /* If this cd is not in use, we are good. */
- if (chunk == CHAIN_END)
- break;
- }
- /* If we tried all the cd's, we lose. */
- if (cd == ZAP_MAXCD)
- return (ENOSPC);
- }
-
- if (l->l_phys->l_hdr.lh_nfree < numchunks)
- return (EAGAIN);
-
- /* make the entry */
- chunk = zap_leaf_chunk_alloc(l);
- le = ZAP_LEAF_ENTRY(l, chunk);
- le->le_type = ZAP_CHUNK_ENTRY;
- le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen);
- le->le_name_length = namelen;
- le->le_value_chunk =
- zap_leaf_array_create(l, buf, integer_size, num_integers);
- le->le_value_length = num_integers;
- le->le_int_size = integer_size;
- le->le_hash = h;
- le->le_cd = cd;
-
- /* link it into the hash chain */
- chunkp = LEAF_HASH_ENTPTR(l, h);
- le->le_next = *chunkp;
- *chunkp = chunk;
-
- l->l_phys->l_hdr.lh_nentries++;
-
- zeh->zeh_leaf = l;
- zeh->zeh_num_integers = num_integers;
- zeh->zeh_integer_size = le->le_int_size;
- zeh->zeh_cd = le->le_cd;
- zeh->zeh_hash = le->le_hash;
- zeh->zeh_chunkp = chunkp;
-
- return (0);
-}
-
-/*
- * Routines for transferring entries between leaves.
- */
-
-static void
-zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
-{
- struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
- uint16_t *ptr = LEAF_HASH_ENTPTR(l, le->le_hash);
- le->le_next = *ptr;
- *ptr = entry;
-}
-
-static uint16_t
-zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
-{
- uint16_t new_chunk;
- uint16_t *nchunkp = &new_chunk;
-
- while (chunk != CHAIN_END) {
- uint16_t nchunk = zap_leaf_chunk_alloc(nl);
- struct zap_leaf_array *nla =
- &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
- struct zap_leaf_array *la =
- &ZAP_LEAF_CHUNK(l, chunk).l_array;
- int nextchunk = la->la_next;
-
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
-
- *nla = *la; /* structure assignment */
-
- zap_leaf_chunk_free(l, chunk);
- chunk = nextchunk;
- *nchunkp = nchunk;
- nchunkp = &nla->la_next;
- }
- *nchunkp = CHAIN_END;
- return (new_chunk);
-}
-
-static void
-zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
-{
- struct zap_leaf_entry *le, *nle;
- uint16_t chunk;
-
- le = ZAP_LEAF_ENTRY(l, entry);
- ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
- chunk = zap_leaf_chunk_alloc(nl);
- nle = ZAP_LEAF_ENTRY(nl, chunk);
- *nle = *le; /* structure assignment */
-
- zap_leaf_rehash_entry(nl, chunk);
-
- nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
- nle->le_value_chunk =
- zap_leaf_transfer_array(l, le->le_value_chunk, nl);
-
- zap_leaf_chunk_free(l, entry);
-
- l->l_phys->l_hdr.lh_nentries--;
- nl->l_phys->l_hdr.lh_nentries++;
-}
-
-/*
- * Transfer the entries whose hash prefix ends in 1 to the new leaf.
- */
-void
-zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl)
-{
- int i;
- int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;
-
- /* set new prefix and prefix_len */
- l->l_phys->l_hdr.lh_prefix <<= 1;
- l->l_phys->l_hdr.lh_prefix_len++;
- nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
- nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
-
- /* break existing hash chains */
- zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
-
- /*
- * Transfer entries whose hash bit 'bit' is set to nl; rehash
- * the remaining entries
- *
- * NB: We could find entries via the hashtable instead. That
- * would be O(hashents+numents) rather than O(numblks+numents),
- * but this accesses memory more sequentially, and when we're
- * called, the block is usually pretty full.
- */
- for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
- struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
- if (le->le_type != ZAP_CHUNK_ENTRY)
- continue;
-
- if (le->le_hash & (1ULL << bit))
- zap_leaf_transfer_entry(l, i, nl);
- else
- zap_leaf_rehash_entry(l, i);
- }
-}
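
zap_leaf_split() above extends the prefix by one bit: the old leaf keeps
entries whose next hash bit is 0 and the new sibling (prefix | 1) takes
those whose bit is 1. A small sketch of that routing decision, with a toy
prefix and hash (the values are illustrative only):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t prefix = 0x2;		/* old leaf covered prefix "10" */
	int prefix_len = 2;
	int bit = 64 - 1 - prefix_len;	/* next undecided hash bit */
	uint64_t h = 0xA000000000000000ULL;	/* top bits 1010... */

	/* after the split: old leaf is "100", sibling is "101" */
	uint64_t old_prefix = prefix << 1;
	uint64_t new_prefix = old_prefix | 1;

	printf("entry goes to %s leaf (prefix %llx vs %llx)\n",
	    (h & (1ULL << bit)) ? "new" : "old",
	    (unsigned long long)old_prefix,
	    (unsigned long long)new_prefix);
	return (0);
}
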
-
-void
-zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
-{
- int i, n;
-
- n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
- l->l_phys->l_hdr.lh_prefix_len;
- n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
- zs->zs_leafs_with_2n_pointers[n]++;
-
-
- n = l->l_phys->l_hdr.lh_nentries/5;
- n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
- zs->zs_blocks_with_n5_entries[n]++;
-
- n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
- l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
- (1<<FZAP_BLOCK_SHIFT(zap));
- n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
- zs->zs_blocks_n_tenths_full[n]++;
-
- for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
- int nentries = 0;
- int chunk = l->l_phys->l_hash[i];
-
- while (chunk != CHAIN_END) {
- struct zap_leaf_entry *le =
- ZAP_LEAF_ENTRY(l, chunk);
-
- n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) +
- ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length *
- le->le_int_size);
- n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
- zs->zs_entries_using_n_chunks[n]++;
-
- chunk = le->le_next;
- nentries++;
- }
-
- n = nentries;
- n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
- zs->zs_buckets_with_n_entries[n]++;
- }
-}
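
zap_leaf_array_create() and zap_leaf_array_read() above spill integers
into the chunk byte arrays most-significant byte first, so the stored
form is independent of host endianness. A self-contained sketch of that
packing and the matching read-back (the 4-byte round trip is
illustrative only):

#include <stdint.h>
#include <stdio.h>

static void
pack_be(uint8_t *dst, uint64_t value, int integer_size)
{
	int shift = (integer_size - 1) * 8;
	int i;

	for (i = 0; i < integer_size; i++) {
		dst[i] = value >> shift;	/* MSB first */
		value <<= 8;
	}
}

static uint64_t
unpack_be(const uint8_t *src, int integer_size)
{
	uint64_t value = 0;
	int i;

	for (i = 0; i < integer_size; i++)
		value = (value << 8) | src[i];
	return (value);
}

int
main(void)
{
	uint8_t buf[4];

	pack_be(buf, 0x11223344, 4);
	printf("%02x %02x %02x %02x -> %llx\n", buf[0], buf[1], buf[2],
	    buf[3], (unsigned long long)unpack_be(buf, 4));
	return (0);
}
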
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
deleted file mode 100644
index 9a882a5..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
+++ /dev/null
@@ -1,857 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/zfs_context.h>
-#include <sys/zap.h>
-#include <sys/refcount.h>
-#include <sys/zap_impl.h>
-#include <sys/zap_leaf.h>
-#include <sys/avl.h>
-
-
-static void mzap_upgrade(zap_t *zap, dmu_tx_t *tx);
-
-
-static void
-mzap_byteswap(mzap_phys_t *buf, size_t size)
-{
- int i, max;
- buf->mz_block_type = BSWAP_64(buf->mz_block_type);
- buf->mz_salt = BSWAP_64(buf->mz_salt);
- max = (size / MZAP_ENT_LEN) - 1;
- for (i = 0; i < max; i++) {
- buf->mz_chunk[i].mze_value =
- BSWAP_64(buf->mz_chunk[i].mze_value);
- buf->mz_chunk[i].mze_cd =
- BSWAP_32(buf->mz_chunk[i].mze_cd);
- }
-}
-
-void
-zap_byteswap(void *buf, size_t size)
-{
- uint64_t block_type;
-
- block_type = *(uint64_t *)buf;
-
- if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
- /* ASSERT(magic == ZAP_LEAF_MAGIC); */
- mzap_byteswap(buf, size);
- } else {
- fzap_byteswap(buf, size);
- }
-}
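
zap_byteswap() above dispatches on the first word of the block: it is a
type magic, so seeing the byte-swapped magic identifies a buffer written
on the other endianness. A toy demonstration of that test (TOY_MAGIC is
a stand-in, not the real ZBT_MICRO value):

#include <stdint.h>
#include <stdio.h>

#define TOY_MAGIC	0x0000000000000003ULL	/* stand-in block type */

static uint64_t
bswap64(uint64_t x)
{
	uint64_t r = 0;
	int i;

	for (i = 0; i < 8; i++)
		r = (r << 8) | ((x >> (8 * i)) & 0xFF);
	return (r);
}

int
main(void)
{
	uint64_t first_word = bswap64(TOY_MAGIC);	/* "foreign" block */

	if (first_word == TOY_MAGIC)
		printf("native block\n");
	else if (first_word == bswap64(TOY_MAGIC))
		printf("foreign block: byteswap everything\n");
	return (0);
}
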
-
-static int
-mze_compare(const void *arg1, const void *arg2)
-{
- const mzap_ent_t *mze1 = arg1;
- const mzap_ent_t *mze2 = arg2;
-
- if (mze1->mze_hash > mze2->mze_hash)
- return (+1);
- if (mze1->mze_hash < mze2->mze_hash)
- return (-1);
- if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd)
- return (+1);
- if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd)
- return (-1);
- return (0);
-}
-
-static void
-mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
-{
- mzap_ent_t *mze;
-
- ASSERT(zap->zap_ismicro);
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- ASSERT(mzep->mze_cd < ZAP_MAXCD);
- ASSERT3U(zap_hash(zap, mzep->mze_name), ==, hash);
-
- mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
- mze->mze_chunkid = chunkid;
- mze->mze_hash = hash;
- mze->mze_phys = *mzep;
- avl_add(&zap->zap_m.zap_avl, mze);
-}
-
-static mzap_ent_t *
-mze_find(zap_t *zap, const char *name, uint64_t hash)
-{
- mzap_ent_t mze_tofind;
- mzap_ent_t *mze;
- avl_index_t idx;
- avl_tree_t *avl = &zap->zap_m.zap_avl;
-
- ASSERT(zap->zap_ismicro);
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- ASSERT3U(zap_hash(zap, name), ==, hash);
-
- if (strlen(name) >= sizeof (mze_tofind.mze_phys.mze_name))
- return (NULL);
-
- mze_tofind.mze_hash = hash;
- mze_tofind.mze_phys.mze_cd = 0;
-
- mze = avl_find(avl, &mze_tofind, &idx);
- if (mze == NULL)
- mze = avl_nearest(avl, idx, AVL_AFTER);
- for (; mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
- if (strcmp(name, mze->mze_phys.mze_name) == 0)
- return (mze);
- }
- return (NULL);
-}
-
-static uint32_t
-mze_find_unused_cd(zap_t *zap, uint64_t hash)
-{
- mzap_ent_t mze_tofind;
- mzap_ent_t *mze;
- avl_index_t idx;
- avl_tree_t *avl = &zap->zap_m.zap_avl;
- uint32_t cd;
-
- ASSERT(zap->zap_ismicro);
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
- mze_tofind.mze_hash = hash;
- mze_tofind.mze_phys.mze_cd = 0;
-
- cd = 0;
- for (mze = avl_find(avl, &mze_tofind, &idx);
- mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
- if (mze->mze_phys.mze_cd != cd)
- break;
- cd++;
- }
-
- return (cd);
-}
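
mze_find_unused_cd() above relies on same-hash entries being kept
AVL-sorted by cd, so the first gap in the sequence 0, 1, 2, ... is the
lowest free collision differentiator. The same walk over a plain sorted
array, as a sketch:

#include <stdint.h>
#include <stdio.h>

static uint32_t
find_unused_cd(const uint32_t *cds_sorted, int n)
{
	uint32_t cd = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (cds_sorted[i] != cd)
			break;		/* found a gap */
		cd++;
	}
	return (cd);
}

int
main(void)
{
	uint32_t cds[] = { 0, 1, 3 };	/* cd 2 was removed earlier */

	printf("next cd: %u\n", find_unused_cd(cds, 3));
	return (0);
}
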
-
-static void
-mze_remove(zap_t *zap, mzap_ent_t *mze)
-{
- ASSERT(zap->zap_ismicro);
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
- avl_remove(&zap->zap_m.zap_avl, mze);
- kmem_free(mze, sizeof (mzap_ent_t));
-}
-
-static void
-mze_destroy(zap_t *zap)
-{
- mzap_ent_t *mze;
- void *avlcookie = NULL;
-
- while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))
- kmem_free(mze, sizeof (mzap_ent_t));
- avl_destroy(&zap->zap_m.zap_avl);
-}
-
-static zap_t *
-mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
-{
- zap_t *winner;
- zap_t *zap;
- int i;
-
- ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
-
- zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
- rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, 0);
- rw_enter(&zap->zap_rwlock, RW_WRITER);
- zap->zap_objset = os;
- zap->zap_object = obj;
- zap->zap_dbuf = db;
-
- if (((uint64_t *)db->db_data)[0] != ZBT_MICRO) {
- mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL,
- MUTEX_DEFAULT, 0);
- zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
- } else {
- zap->zap_ismicro = TRUE;
- }
-
- /*
- * Make sure that zap_ismicro is set before we let others see
- * it, because zap_lockdir() checks zap_ismicro without the lock
- * held.
- */
- winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);
-
- if (winner != NULL) {
- rw_exit(&zap->zap_rwlock);
- rw_destroy(&zap->zap_rwlock);
- if (!zap->zap_ismicro)
- mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
- kmem_free(zap, sizeof (zap_t));
- return (winner);
- }
-
- if (zap->zap_ismicro) {
- zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
- zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
- avl_create(&zap->zap_m.zap_avl, mze_compare,
- sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
-
- for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
- mzap_ent_phys_t *mze =
- &zap->zap_m.zap_phys->mz_chunk[i];
- if (mze->mze_name[0]) {
- zap->zap_m.zap_num_entries++;
- mze_insert(zap, i,
- zap_hash(zap, mze->mze_name), mze);
- }
- }
- } else {
- zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
-
- ASSERT3U(sizeof (struct zap_leaf_header), ==,
- 2*ZAP_LEAF_CHUNKSIZE);
-
- /*
- * The embedded pointer table should not overlap the
- * other members.
- */
- ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
- &zap->zap_f.zap_phys->zap_salt);
-
- /*
- * The embedded pointer table should end at the end of
- * the block
- */
- ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
- 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
- (uintptr_t)zap->zap_f.zap_phys, ==,
- zap->zap_dbuf->db_size);
- }
- rw_exit(&zap->zap_rwlock);
- return (zap);
-}
-
-int
-zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
- krw_t lti, int fatreader, zap_t **zapp)
-{
- zap_t *zap;
- dmu_buf_t *db;
- krw_t lt;
- int err;
-
- *zapp = NULL;
-
- err = dmu_buf_hold(os, obj, 0, NULL, &db);
- if (err)
- return (err);
-
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(db, &doi);
- ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
- }
-#endif
-
- zap = dmu_buf_get_user(db);
- if (zap == NULL)
- zap = mzap_open(os, obj, db);
-
- /*
- * We're checking zap_ismicro without the lock held, in order to
- * tell what type of lock we want. Once we have some sort of
- * lock, see if it really is the right type. In practice this
- * can only be different if it was upgraded from micro to fat,
- * and micro wanted WRITER but fat only needs READER.
- */
- lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
- rw_enter(&zap->zap_rwlock, lt);
- if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
- /* it was upgraded, now we only need reader */
- ASSERT(lt == RW_WRITER);
-		ASSERT(RW_READER ==
-		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
- rw_downgrade(&zap->zap_rwlock);
- lt = RW_READER;
- }
-
- zap->zap_objset = os;
-
- if (lt == RW_WRITER)
- dmu_buf_will_dirty(db, tx);
-
- ASSERT3P(zap->zap_dbuf, ==, db);
-
- ASSERT(!zap->zap_ismicro ||
- zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
- if (zap->zap_ismicro && tx &&
- zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
- uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
- if (newsz > MZAP_MAX_BLKSZ) {
- dprintf("upgrading obj %llu: num_entries=%u\n",
- obj, zap->zap_m.zap_num_entries);
- mzap_upgrade(zap, tx);
- *zapp = zap;
- return (0);
- }
- err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
- ASSERT3U(err, ==, 0);
- zap->zap_m.zap_num_chunks =
- db->db_size / MZAP_ENT_LEN - 1;
- }
-
- *zapp = zap;
- return (0);
-}
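
The tail of zap_lockdir() above grows a full microzap by SPA_MINBLOCKSIZE
at a time and only pays for the fatzap upgrade once the next growth would
exceed MZAP_MAX_BLKSZ. A sketch of that decision loop; the TOY_ constants
are illustrative placeholders, not the real header values:

#include <stdio.h>

#define TOY_MINBLOCKSIZE	512
#define TOY_MZAP_MAX_BLKSZ	4096

int
main(void)
{
	int blksz = 512;

	while (blksz + TOY_MINBLOCKSIZE <= TOY_MZAP_MAX_BLKSZ) {
		blksz += TOY_MINBLOCKSIZE;	/* cheap in-place growth */
		printf("grew microzap block to %d\n", blksz);
	}
	printf("next growth would exceed %d: upgrade to fatzap\n",
	    TOY_MZAP_MAX_BLKSZ);
	return (0);
}
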
-
-void
-zap_unlockdir(zap_t *zap)
-{
- rw_exit(&zap->zap_rwlock);
- dmu_buf_rele(zap->zap_dbuf, NULL);
-}
-
-static void
-mzap_upgrade(zap_t *zap, dmu_tx_t *tx)
-{
- mzap_phys_t *mzp;
- int i, sz, nchunks, err;
-
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
- sz = zap->zap_dbuf->db_size;
- mzp = kmem_alloc(sz, KM_SLEEP);
- bcopy(zap->zap_dbuf->db_data, mzp, sz);
- nchunks = zap->zap_m.zap_num_chunks;
-
- err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
- 1ULL << fzap_default_block_shift, 0, tx);
- ASSERT(err == 0);
-
- dprintf("upgrading obj=%llu with %u chunks\n",
- zap->zap_object, nchunks);
- mze_destroy(zap);
-
- fzap_upgrade(zap, tx);
-
- for (i = 0; i < nchunks; i++) {
- int err;
- mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
- if (mze->mze_name[0] == 0)
- continue;
- dprintf("adding %s=%llu\n",
- mze->mze_name, mze->mze_value);
- err = fzap_add_cd(zap,
- mze->mze_name, 8, 1, &mze->mze_value,
- mze->mze_cd, tx);
- ASSERT3U(err, ==, 0);
- }
- kmem_free(mzp, sz);
-}
-
-uint64_t
-zap_hash(zap_t *zap, const char *name)
-{
- const uint8_t *cp;
- uint8_t c;
- uint64_t crc = zap->zap_salt;
-
- ASSERT(crc != 0);
- ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
- for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
-
- /*
- * Only use 28 bits, since we need 4 bits in the cookie for the
- * collision differentiator. We MUST use the high bits, since
-	 * those are the ones that we first pay attention to when
-	 * choosing the bucket.
- */
- crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
-
- return (crc);
-}
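
zap_hash() above is a salted, table-driven CRC-64 that keeps only the top
ZAP_HASHBITS bits of the result. A self-contained sketch, assuming the
reflected ECMA-182 polynomial used as ZFS_CRC64_POLY and an arbitrary
example salt:

#include <stdint.h>
#include <stdio.h>

#define POLY		0xC96C5795D7870F42ULL	/* ZFS_CRC64_POLY */
#define HASHBITS	28			/* ZAP_HASHBITS */

static uint64_t crc64_table[256];

static void
crc64_init(void)
{
	int i, j;

	for (i = 0; i < 256; i++) {
		uint64_t c = i;

		for (j = 0; j < 8; j++)
			c = (c >> 1) ^ (-(c & 1) & POLY);
		crc64_table[i] = c;
	}
}

static uint64_t
toy_zap_hash(uint64_t salt, const char *name)
{
	uint64_t crc = salt;
	const uint8_t *cp;

	for (cp = (const uint8_t *)name; *cp != '\0'; cp++)
		crc = (crc >> 8) ^ crc64_table[(crc ^ *cp) & 0xFF];

	/* keep the high 28 bits; the rest is reserved for the cd */
	return (crc & ~((1ULL << (64 - HASHBITS)) - 1));
}

int
main(void)
{
	crc64_init();
	printf("%016llx\n",
	    (unsigned long long)toy_zap_hash(0x2f52ab1ULL, "filename"));
	return (0);
}
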
-
-
-static void
-mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx)
-{
- dmu_buf_t *db;
- mzap_phys_t *zp;
-
- VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db));
-
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(db, &doi);
- ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
- }
-#endif
-
- dmu_buf_will_dirty(db, tx);
- zp = db->db_data;
- zp->mz_block_type = ZBT_MICRO;
- zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
- ASSERT(zp->mz_salt != 0);
- dmu_buf_rele(db, FTAG);
-}
-
-int
-zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- int err;
-
- err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
- if (err != 0)
- return (err);
- mzap_create_impl(os, obj, tx);
- return (0);
-}
-
-uint64_t
-zap_create(objset_t *os, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
-
- mzap_create_impl(os, obj, tx);
- return (obj);
-}
-
-int
-zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
-{
- /*
- * dmu_object_free will free the object number and free the
- * data. Freeing the data will cause our pageout function to be
- * called, which will destroy our data (zap_leaf_t's and zap_t).
- */
-
- return (dmu_object_free(os, zapobj, tx));
-}
-
-_NOTE(ARGSUSED(0))
-void
-zap_evict(dmu_buf_t *db, void *vzap)
-{
- zap_t *zap = vzap;
-
- rw_destroy(&zap->zap_rwlock);
-
- if (zap->zap_ismicro)
- mze_destroy(zap);
- else
- mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
-
- kmem_free(zap, sizeof (zap_t));
-}
-
-int
-zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
-{
- zap_t *zap;
- int err;
-
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
- if (err)
- return (err);
- if (!zap->zap_ismicro) {
- err = fzap_count(zap, count);
- } else {
- *count = zap->zap_m.zap_num_entries;
- }
- zap_unlockdir(zap);
- return (err);
-}
-
-/*
- * Routines for manipulating attributes.
- */
-
-int
-zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf)
-{
- zap_t *zap;
- int err;
- mzap_ent_t *mze;
-
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
- if (err)
- return (err);
- if (!zap->zap_ismicro) {
- err = fzap_lookup(zap, name,
- integer_size, num_integers, buf);
- } else {
- mze = mze_find(zap, name, zap_hash(zap, name));
- if (mze == NULL) {
- err = ENOENT;
- } else {
- if (num_integers < 1)
- err = EOVERFLOW;
- else if (integer_size != 8)
- err = EINVAL;
- else
- *(uint64_t *)buf = mze->mze_phys.mze_value;
- }
- }
- zap_unlockdir(zap);
- return (err);
-}
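-
-/*
- * Sketch, not in the original file: the common consumer pattern for
- * a single 64-bit attribute; the names here are placeholders.
- */
-#if 0 /* example only */
- uint64_t value;
-
- err = zap_lookup(os, zapobj, "example", 8, 1, &value);
- if (err == 0)
- dprintf("example = %llu\n", value);
-#endif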
-
-int
-zap_length(objset_t *os, uint64_t zapobj, const char *name,
- uint64_t *integer_size, uint64_t *num_integers)
-{
- zap_t *zap;
- int err;
- mzap_ent_t *mze;
-
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
- if (err)
- return (err);
- if (!zap->zap_ismicro) {
- err = fzap_length(zap, name, integer_size, num_integers);
- } else {
- mze = mze_find(zap, name, zap_hash(zap, name));
- if (mze == NULL) {
- err = ENOENT;
- } else {
- if (integer_size)
- *integer_size = 8;
- if (num_integers)
- *num_integers = 1;
- }
- }
- zap_unlockdir(zap);
- return (err);
-}
-
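-/*
- * Insert an entry into the microzap, scanning forward from the
- * allocation rotor (zap_alloc_next) for a free chunk and wrapping
- * around once before giving up.
- */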
-static void
-mzap_addent(zap_t *zap, const char *name, uint64_t hash, uint64_t value)
-{
- int i;
- int start = zap->zap_m.zap_alloc_next;
- uint32_t cd;
-
- dprintf("obj=%llu %s=%llu\n", zap->zap_object, name, value);
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
-#ifdef ZFS_DEBUG
- for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
- mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
- ASSERT(strcmp(name, mze->mze_name) != 0);
- }
-#endif
-
- cd = mze_find_unused_cd(zap, hash);
- /* given the limited size of the microzap, this can't happen */
- ASSERT(cd != ZAP_MAXCD);
-
-again:
- for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
- mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
- if (mze->mze_name[0] == 0) {
- mze->mze_value = value;
- mze->mze_cd = cd;
- (void) strcpy(mze->mze_name, name);
- zap->zap_m.zap_num_entries++;
- zap->zap_m.zap_alloc_next = i+1;
- if (zap->zap_m.zap_alloc_next ==
- zap->zap_m.zap_num_chunks)
- zap->zap_m.zap_alloc_next = 0;
- mze_insert(zap, i, hash, mze);
- return;
- }
- }
- if (start != 0) {
- start = 0;
- goto again;
- }
- ASSERT(!"out of entries!");
-}
-
-int
-zap_add(objset_t *os, uint64_t zapobj, const char *name,
- int integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx)
-{
- zap_t *zap;
- int err;
- mzap_ent_t *mze;
- const uint64_t *intval = val;
- uint64_t hash;
-
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
- if (err)
- return (err);
- if (!zap->zap_ismicro) {
- err = fzap_add(zap, name, integer_size, num_integers, val, tx);
- } else if (integer_size != 8 || num_integers != 1 ||
- strlen(name) >= MZAP_NAME_LEN) {
- dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
- zapobj, integer_size, num_integers, name);
- mzap_upgrade(zap, tx);
- err = fzap_add(zap, name, integer_size, num_integers, val, tx);
- } else {
- hash = zap_hash(zap, name);
- mze = mze_find(zap, name, hash);
- if (mze != NULL) {
- err = EEXIST;
- } else {
- mzap_addent(zap, name, hash, *intval);
- }
- }
- zap_unlockdir(zap);
- return (err);
-}
-
-int
-zap_update(objset_t *os, uint64_t zapobj, const char *name,
- int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
-{
- zap_t *zap;
- mzap_ent_t *mze;
- const uint64_t *intval = val;
- uint64_t hash;
- int err;
-
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
- if (err)
- return (err);
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- if (!zap->zap_ismicro) {
- err = fzap_update(zap, name,
- integer_size, num_integers, val, tx);
- } else if (integer_size != 8 || num_integers != 1 ||
- strlen(name) >= MZAP_NAME_LEN) {
- dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
- zapobj, integer_size, num_integers, name);
- mzap_upgrade(zap, tx);
- err = fzap_update(zap, name,
- integer_size, num_integers, val, tx);
- } else {
- hash = zap_hash(zap, name);
- mze = mze_find(zap, name, hash);
- if (mze != NULL) {
- mze->mze_phys.mze_value = *intval;
- zap->zap_m.zap_phys->mz_chunk
- [mze->mze_chunkid].mze_value = *intval;
- } else {
- mzap_addent(zap, name, hash, *intval);
- }
- }
- zap_unlockdir(zap);
- return (err);
-}
-
-int
-zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
-{
- zap_t *zap;
- int err;
- mzap_ent_t *mze;
-
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
- if (err)
- return (err);
- if (!zap->zap_ismicro) {
- err = fzap_remove(zap, name, tx);
- } else {
- mze = mze_find(zap, name, zap_hash(zap, name));
- if (mze == NULL) {
- dprintf("fail: %s\n", name);
- err = ENOENT;
- } else {
- dprintf("success: %s\n", name);
- zap->zap_m.zap_num_entries--;
- bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
- sizeof (mzap_ent_phys_t));
- mze_remove(zap, mze);
- }
- }
- zap_unlockdir(zap);
- return (err);
-}
-
-
-/*
- * Routines for iterating over the attributes.
- */
-
-/*
- * We want to keep the high 32 bits of the cursor zero if we can, so
- * that 32-bit programs can access this. So use a small hash value so
- * we can fit 4 bits of cd into the 32-bit cursor.
- *
- * [ 32 zero bits | 4-bit collision differentiator | 28-bit hash value ]
- */
-void
-zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
- uint64_t serialized)
-{
- zc->zc_objset = os;
- zc->zc_zap = NULL;
- zc->zc_leaf = NULL;
- zc->zc_zapobj = zapobj;
- if (serialized == -1ULL) {
- zc->zc_hash = -1ULL;
- zc->zc_cd = 0;
- } else {
- zc->zc_hash = serialized << (64-ZAP_HASHBITS);
- zc->zc_cd = serialized >> ZAP_HASHBITS;
- if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */
- zc->zc_cd = 0;
- }
-}
-
-void
-zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
-{
- zap_cursor_init_serialized(zc, os, zapobj, 0);
-}
-
-void
-zap_cursor_fini(zap_cursor_t *zc)
-{
- if (zc->zc_zap) {
- rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
- zap_unlockdir(zc->zc_zap);
- zc->zc_zap = NULL;
- }
- if (zc->zc_leaf) {
- rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
- zap_put_leaf(zc->zc_leaf);
- zc->zc_leaf = NULL;
- }
- zc->zc_objset = NULL;
-}
-
-uint64_t
-zap_cursor_serialize(zap_cursor_t *zc)
-{
- if (zc->zc_hash == -1ULL)
- return (-1ULL);
- ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
- ASSERT(zc->zc_cd < ZAP_MAXCD);
- return ((zc->zc_hash >> (64-ZAP_HASHBITS)) |
- ((uint64_t)zc->zc_cd << ZAP_HASHBITS));
-}
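-
-/*
- * Sketch, not in the original file: a serialized cursor fits in the
- * low 32 bits, so it can be handed to a 32-bit consumer and later
- * rehydrated to resume iteration exactly where it left off.
- */
-#if 0 /* example only */
- uint64_t cookie = zap_cursor_serialize(&zc);
- zap_cursor_fini(&zc);
- /* ... later ... */
- zap_cursor_init_serialized(&zc, os, zapobj, cookie);
-#endif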
-
-int
-zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
-{
- int err;
- avl_index_t idx;
- mzap_ent_t mze_tofind;
- mzap_ent_t *mze;
-
- if (zc->zc_hash == -1ULL)
- return (ENOENT);
-
- if (zc->zc_zap == NULL) {
- err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
- RW_READER, TRUE, &zc->zc_zap);
- if (err)
- return (err);
- } else {
- rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
- }
- if (!zc->zc_zap->zap_ismicro) {
- err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
- } else {
- err = ENOENT;
-
- mze_tofind.mze_hash = zc->zc_hash;
- mze_tofind.mze_phys.mze_cd = zc->zc_cd;
-
- mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
- ASSERT(mze == NULL || 0 == bcmp(&mze->mze_phys,
- &zc->zc_zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
- sizeof (mze->mze_phys)));
- if (mze == NULL) {
- mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
- idx, AVL_AFTER);
- }
- if (mze) {
- za->za_integer_length = 8;
- za->za_num_integers = 1;
- za->za_first_integer = mze->mze_phys.mze_value;
- (void) strcpy(za->za_name, mze->mze_phys.mze_name);
- zc->zc_hash = mze->mze_hash;
- zc->zc_cd = mze->mze_phys.mze_cd;
- err = 0;
- } else {
- zc->zc_hash = -1ULL;
- }
- }
- rw_exit(&zc->zc_zap->zap_rwlock);
- return (err);
-}
-
-void
-zap_cursor_advance(zap_cursor_t *zc)
-{
- if (zc->zc_hash == -1ULL)
- return;
- zc->zc_cd++;
- if (zc->zc_cd >= ZAP_MAXCD) {
- zc->zc_cd = 0;
- zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS);
- if (zc->zc_hash == 0) /* EOF */
- zc->zc_hash = -1ULL;
- }
-}
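-
-/*
- * Sketch, not in the original file: the canonical iteration pattern
- * over a ZAP object using the cursor routines above.
- */
-#if 0 /* example only */
- zap_cursor_t zc;
- zap_attribute_t za;
-
- for (zap_cursor_init(&zc, os, zapobj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc))
- dprintf("%s = %llu\n", za.za_name, za.za_first_integer);
- zap_cursor_fini(&zc);
-#endif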
-
-int
-zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
-{
- int err;
- zap_t *zap;
-
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
- if (err)
- return (err);
-
- bzero(zs, sizeof (zap_stats_t));
-
- if (zap->zap_ismicro) {
- zs->zs_blocksize = zap->zap_dbuf->db_size;
- zs->zs_num_entries = zap->zap_m.zap_num_entries;
- zs->zs_num_blocks = 1;
- } else {
- fzap_get_stats(zap, zs);
- }
- zap_unlockdir(zap);
- return (0);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf
deleted file mode 100644
index 0988190..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf
+++ /dev/null
@@ -1,28 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
-#
-# ident "%Z%%M% %I% %E% SMI"
-#
-name="zfs" parent="pseudo";
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
deleted file mode 100644
index dd94618..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
+++ /dev/null
@@ -1,1608 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/resource.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/stat.h>
-#include <sys/kmem.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/unistd.h>
-#include <sys/sdt.h>
-#include <sys/fs/zfs.h>
-#include <sys/policy.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/dmu.h>
-#include <sys/zap.h>
-#include <acl/acl_common.h>
-
-#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
-#define DENY ACE_ACCESS_DENIED_ACE_TYPE
-
-#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
-#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
- ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
-#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
- ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
-#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
- ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
-#define WRITE_MASK (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS| \
- ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|ACE_WRITE_OWNER)
-
-#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
- ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
-
-#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
- ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
-
-#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
- ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE)
-
-#define SECURE_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER)
-
-#define OGE_PAD 6 /* traditional owner/group/everyone ACES */
-
-static int zfs_ace_can_use(znode_t *zp, ace_t *);
-
-static zfs_acl_t *
-zfs_acl_alloc(int slots)
-{
- zfs_acl_t *aclp;
-
- aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
- if (slots != 0) {
- aclp->z_acl = kmem_alloc(ZFS_ACL_SIZE(slots), KM_SLEEP);
- aclp->z_acl_count = 0;
- aclp->z_state = ACL_DATA_ALLOCED;
- } else {
- aclp->z_state = 0;
- }
- aclp->z_slots = slots;
- return (aclp);
-}
-
-void
-zfs_acl_free(zfs_acl_t *aclp)
-{
- if (aclp->z_state == ACL_DATA_ALLOCED) {
- kmem_free(aclp->z_acl, ZFS_ACL_SIZE(aclp->z_slots));
- }
- kmem_free(aclp, sizeof (zfs_acl_t));
-}
-
-static uint32_t
-zfs_v4_to_unix(uint32_t access_mask)
-{
- uint32_t new_mask = 0;
-
- /*
- * This is used for mapping v4 permissions into permissions
- * that can be passed to secpolicy_vnode_access()
- */
- if (access_mask & (ACE_READ_DATA | ACE_LIST_DIRECTORY |
- ACE_READ_ATTRIBUTES | ACE_READ_ACL))
- new_mask |= S_IROTH;
- if (access_mask & (ACE_WRITE_DATA | ACE_APPEND_DATA |
- ACE_WRITE_ATTRIBUTES | ACE_ADD_FILE | ACE_WRITE_NAMED_ATTRS))
- new_mask |= S_IWOTH;
- if (access_mask & (ACE_EXECUTE | ACE_READ_NAMED_ATTRS))
- new_mask |= S_IXOTH;
-
- return (new_mask);
-}
-
-/*
- * Convert unix access mask to v4 access mask
- */
-static uint32_t
-zfs_unix_to_v4(uint32_t access_mask)
-{
- uint32_t new_mask = 0;
-
- if (access_mask & 01)
- new_mask |= ACE_EXECUTE;
- if (access_mask & 02)
- new_mask |= ACE_WRITE_DATA;
- if (access_mask & 04)
- new_mask |= ACE_READ_DATA;
- return (new_mask);
-}
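-
-/*
- * Sketch, not in the original file: zfs_unix_to_v4() takes a 3-bit
- * rwx triplet (e.g. mode >> 6 for the owner bits), so a request for
- * owner read+write maps as below.
- */
-#if 0 /* example only */
- /* 06 == rw-: yields ACE_READ_DATA | ACE_WRITE_DATA */
- uint32_t v4_mode = zfs_unix_to_v4((S_IRUSR | S_IWUSR) >> 6);
-#endif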
-
-static void
-zfs_set_ace(ace_t *zacep, uint32_t access_mask, int access_type,
- uid_t uid, int entry_type)
-{
- zacep->a_access_mask = access_mask;
- zacep->a_type = access_type;
- zacep->a_who = uid;
- zacep->a_flags = entry_type;
-}
-
-static uint64_t
-zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
-{
- int i;
- int entry_type;
- mode_t mode = (zp->z_phys->zp_mode &
- (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
- mode_t seen = 0;
- ace_t *acep;
-
- for (i = 0, acep = aclp->z_acl;
- i != aclp->z_acl_count; i++, acep++) {
- entry_type = (acep->a_flags & ACE_TYPE_FLAGS);
- if (entry_type == ACE_OWNER) {
- if ((acep->a_access_mask & ACE_READ_DATA) &&
- (!(seen & S_IRUSR))) {
- seen |= S_IRUSR;
- if (acep->a_type == ALLOW) {
- mode |= S_IRUSR;
- }
- }
- if ((acep->a_access_mask & ACE_WRITE_DATA) &&
- (!(seen & S_IWUSR))) {
- seen |= S_IWUSR;
- if (acep->a_type == ALLOW) {
- mode |= S_IWUSR;
- }
- }
- if ((acep->a_access_mask & ACE_EXECUTE) &&
- (!(seen & S_IXUSR))) {
- seen |= S_IXUSR;
- if (acep->a_type == ALLOW) {
- mode |= S_IXUSR;
- }
- }
- } else if (entry_type == OWNING_GROUP) {
- if ((acep->a_access_mask & ACE_READ_DATA) &&
- (!(seen & S_IRGRP))) {
- seen |= S_IRGRP;
- if (acep->a_type == ALLOW) {
- mode |= S_IRGRP;
- }
- }
- if ((acep->a_access_mask & ACE_WRITE_DATA) &&
- (!(seen & S_IWGRP))) {
- seen |= S_IWGRP;
- if (acep->a_type == ALLOW) {
- mode |= S_IWGRP;
- }
- }
- if ((acep->a_access_mask & ACE_EXECUTE) &&
- (!(seen & S_IXGRP))) {
- seen |= S_IXGRP;
- if (acep->a_type == ALLOW) {
- mode |= S_IXGRP;
- }
- }
- } else if (entry_type == ACE_EVERYONE) {
- if ((acep->a_access_mask & ACE_READ_DATA)) {
- if (!(seen & S_IRUSR)) {
- seen |= S_IRUSR;
- if (acep->a_type == ALLOW) {
- mode |= S_IRUSR;
- }
- }
- if (!(seen & S_IRGRP)) {
- seen |= S_IRGRP;
- if (acep->a_type == ALLOW) {
- mode |= S_IRGRP;
- }
- }
- if (!(seen & S_IROTH)) {
- seen |= S_IROTH;
- if (acep->a_type == ALLOW) {
- mode |= S_IROTH;
- }
- }
- }
- if ((acep->a_access_mask & ACE_WRITE_DATA)) {
- if (!(seen & S_IWUSR)) {
- seen |= S_IWUSR;
- if (acep->a_type == ALLOW) {
- mode |= S_IWUSR;
- }
- }
- if (!(seen & S_IWGRP)) {
- seen |= S_IWGRP;
- if (acep->a_type == ALLOW) {
- mode |= S_IWGRP;
- }
- }
- if (!(seen & S_IWOTH)) {
- seen |= S_IWOTH;
- if (acep->a_type == ALLOW) {
- mode |= S_IWOTH;
- }
- }
- }
- if ((acep->a_access_mask & ACE_EXECUTE)) {
- if (!(seen & S_IXUSR)) {
- seen |= S_IXUSR;
- if (acep->a_type == ALLOW) {
- mode |= S_IXUSR;
- }
- }
- if (!(seen & S_IXGRP)) {
- seen |= S_IXGRP;
- if (acep->a_type == ALLOW) {
- mode |= S_IXGRP;
- }
- }
- if (!(seen & S_IXOTH)) {
- seen |= S_IXOTH;
- if (acep->a_type == ALLOW) {
- mode |= S_IXOTH;
- }
- }
- }
- }
- }
- return (mode);
-}
-
-static zfs_acl_t *
-zfs_acl_node_read_internal(znode_t *zp)
-{
- zfs_acl_t *aclp;
-
- aclp = zfs_acl_alloc(0);
- aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
- aclp->z_acl = &zp->z_phys->zp_acl.z_ace_data[0];
-
- return (aclp);
-}
-
-/*
- * Read an external acl object.
- */
-static int
-zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp)
-{
- uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj;
- zfs_acl_t *aclp;
- int error;
-
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
-
- if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
- *aclpp = zfs_acl_node_read_internal(zp);
- return (0);
- }
-
- aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_count);
-
- error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
- ZFS_ACL_SIZE(zp->z_phys->zp_acl.z_acl_count), aclp->z_acl);
- if (error != 0) {
- zfs_acl_free(aclp);
- return (error);
- }
-
- aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
-
- *aclpp = aclp;
- return (0);
-}
-
-static boolean_t
-zfs_acl_valid(znode_t *zp, ace_t *uace, int aclcnt, int *inherit)
-{
- ace_t *acep;
- int i;
-
- *inherit = 0;
-
- if (aclcnt > MAX_ACL_ENTRIES || aclcnt <= 0) {
- return (B_FALSE);
- }
-
- for (i = 0, acep = uace; i != aclcnt; i++, acep++) {
-
- /*
- * first check type of entry
- */
-
- switch (acep->a_flags & ACE_TYPE_FLAGS) {
- case ACE_OWNER:
- acep->a_who = -1;
- break;
- case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
- case ACE_IDENTIFIER_GROUP:
- if (acep->a_flags & ACE_GROUP) {
- acep->a_who = -1;
- }
- break;
- case ACE_EVERYONE:
- acep->a_who = -1;
- break;
- }
-
- /*
- * next check inheritance level flags
- */
-
- if (acep->a_type != ALLOW && acep->a_type != DENY)
- return (B_FALSE);
-
- /*
- * Only directories should have inheritance flags.
- */
- if (ZTOV(zp)->v_type != VDIR && (acep->a_flags &
- (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE|
- ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE))) {
- return (B_FALSE);
- }
-
- if (acep->a_flags &
- (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))
- *inherit = 1;
-
- if (acep->a_flags &
- (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
- if ((acep->a_flags & (ACE_FILE_INHERIT_ACE|
- ACE_DIRECTORY_INHERIT_ACE)) == 0) {
- return (B_FALSE);
- }
- }
- }
-
- return (B_TRUE);
-}
-/*
- * Common code for setting ACLs.
- *
- * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
- * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's
- * already checked the acl and knows whether to inherit.
- */
-int
-zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, dmu_tx_t *tx, int *ihp)
-{
- int inherit = 0;
- int error;
- znode_phys_t *zphys = zp->z_phys;
- zfs_znode_acl_t *zacl = &zphys->zp_acl;
- uint32_t acl_phys_size = ZFS_ACL_SIZE(aclp->z_acl_count);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint64_t aoid = zphys->zp_acl.z_acl_extern_obj;
-
- ASSERT(MUTEX_HELD(&zp->z_lock));
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
-
- if (ihp)
- inherit = *ihp; /* already determined by caller */
- else if (!zfs_acl_valid(zp, aclp->z_acl,
- aclp->z_acl_count, &inherit)) {
- return (EINVAL);
- }
-
- dmu_buf_will_dirty(zp->z_dbuf, tx);
-
- /*
- * Will ACL fit internally?
- */
- if (aclp->z_acl_count > ACE_SLOT_CNT) {
- if (aoid == 0) {
- aoid = dmu_object_alloc(zfsvfs->z_os,
- DMU_OT_ACL, acl_phys_size, DMU_OT_NONE, 0, tx);
- } else {
- (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid,
- acl_phys_size, 0, tx);
- }
- zphys->zp_acl.z_acl_extern_obj = aoid;
- zphys->zp_acl.z_acl_count = aclp->z_acl_count;
- dmu_write(zfsvfs->z_os, aoid, 0,
- acl_phys_size, aclp->z_acl, tx);
- } else {
- /*
- * Migrating back embedded?
- */
- if (zphys->zp_acl.z_acl_extern_obj) {
- error = dmu_object_free(zfsvfs->z_os,
- zp->z_phys->zp_acl.z_acl_extern_obj, tx);
- if (error)
- return (error);
- zphys->zp_acl.z_acl_extern_obj = 0;
- }
- bcopy(aclp->z_acl, zacl->z_ace_data,
- aclp->z_acl_count * sizeof (ace_t));
- zacl->z_acl_count = aclp->z_acl_count;
- }
-
- zp->z_phys->zp_flags &= ~(ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE);
- if (inherit) {
- zp->z_phys->zp_flags |= ZFS_INHERIT_ACE;
- } else if (ace_trivial(zacl->z_ace_data, zacl->z_acl_count) == 0) {
- zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL;
- }
-
- zphys->zp_mode = zfs_mode_compute(zp, aclp);
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
-
- return (0);
-}
-
-/*
- * Create space for slots_needed ACEs to be appended
- * to aclp.
- */
-static void
-zfs_acl_append(zfs_acl_t *aclp, int slots_needed)
-{
- ace_t *newacep;
- ace_t *oldaclp;
- int slot_cnt;
- int slots_left = aclp->z_slots - aclp->z_acl_count;
-
- if (aclp->z_state == ACL_DATA_ALLOCED)
- ASSERT(aclp->z_slots >= aclp->z_acl_count);
- if (slots_left < slots_needed || aclp->z_state != ACL_DATA_ALLOCED) {
- slot_cnt = aclp->z_slots + 1 + (slots_needed - slots_left);
- newacep = kmem_alloc(ZFS_ACL_SIZE(slot_cnt), KM_SLEEP);
- bcopy(aclp->z_acl, newacep,
- ZFS_ACL_SIZE(aclp->z_acl_count));
- oldaclp = aclp->z_acl;
- if (aclp->z_state == ACL_DATA_ALLOCED)
- kmem_free(oldaclp, ZFS_ACL_SIZE(aclp->z_slots));
- aclp->z_acl = newacep;
- aclp->z_slots = slot_cnt;
- aclp->z_state = ACL_DATA_ALLOCED;
- }
-}
-
-/*
- * Remove "slot" ACE from aclp
- */
-static void
-zfs_ace_remove(zfs_acl_t *aclp, int slot)
-{
- if (aclp->z_acl_count > 1) {
- (void) memmove(&aclp->z_acl[slot],
- &aclp->z_acl[slot +1], sizeof (ace_t) *
- (--aclp->z_acl_count - slot));
- } else
- aclp->z_acl_count--;
-}
-
-/*
- * Update access mask for prepended ACE
- *
- * This applies the "groupmask" value for aclmode property.
- */
-static void
-zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner)
-{
-
- int rmask, wmask, xmask;
- int user_ace;
-
- user_ace = (!(acep->a_flags &
- (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP)));
-
- if (user_ace && (acep->a_who == owner)) {
- rmask = S_IRUSR;
- wmask = S_IWUSR;
- xmask = S_IXUSR;
- } else {
- rmask = S_IRGRP;
- wmask = S_IWGRP;
- xmask = S_IXGRP;
- }
-
- if (origacep->a_access_mask & ACE_READ_DATA) {
- if (mode & rmask)
- acep->a_access_mask &= ~ACE_READ_DATA;
- else
- acep->a_access_mask |= ACE_READ_DATA;
- }
-
- if (origacep->a_access_mask & ACE_WRITE_DATA) {
- if (mode & wmask)
- acep->a_access_mask &= ~ACE_WRITE_DATA;
- else
- acep->a_access_mask |= ACE_WRITE_DATA;
- }
-
- if (origacep->a_access_mask & ACE_APPEND_DATA) {
- if (mode & wmask)
- acep->a_access_mask &= ~ACE_APPEND_DATA;
- else
- acep->a_access_mask |= ACE_APPEND_DATA;
- }
-
- if (origacep->a_access_mask & ACE_EXECUTE) {
- if (mode & xmask)
- acep->a_access_mask &= ~ACE_EXECUTE;
- else
- acep->a_access_mask |= ACE_EXECUTE;
- }
-}
-
-/*
- * Apply mode to canonical six ACEs.
- */
-static void
-zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode)
-{
- int cnt;
- ace_t *acep;
-
- cnt = aclp->z_acl_count -1;
- acep = aclp->z_acl;
-
- /*
- * Fixup final ACEs to match the mode
- */
-
- ASSERT(cnt >= 5);
- adjust_ace_pair(&acep[cnt - 1], mode); /* everyone@ */
- adjust_ace_pair(&acep[cnt - 3], (mode & 0070) >> 3); /* group@ */
- adjust_ace_pair(&acep[cnt - 5], (mode & 0700) >> 6); /* owner@ */
-}
-
-
-static int
-zfs_acl_ace_match(ace_t *acep, int allow_deny, int type, int mask)
-{
- return (acep->a_access_mask == mask && acep->a_type == allow_deny &&
- ((acep->a_flags & ACE_TYPE_FLAGS) == type));
-}
-
-/*
- * Can prepended ACE be reused?
- */
-static int
-zfs_reuse_deny(ace_t *acep, int i)
-{
- int okay_masks;
-
- if (i < 1)
- return (B_FALSE);
-
- if (acep[i-1].a_type != DENY)
- return (B_FALSE);
-
- if (acep[i-1].a_flags != (acep[i].a_flags & ACE_IDENTIFIER_GROUP))
- return (B_FALSE);
-
- okay_masks = (acep[i].a_access_mask & OKAY_MASK_BITS);
-
- if (acep[i-1].a_access_mask & ~okay_masks)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-/*
- * Create space to prepend an ACE
- */
-static void
-zfs_acl_prepend(zfs_acl_t *aclp, int i)
-{
- ace_t *oldaclp = NULL;
- ace_t *to, *from;
- int slots_left = aclp->z_slots - aclp->z_acl_count;
- int oldslots;
- int need_free = 0;
-
- if (aclp->z_state == ACL_DATA_ALLOCED)
- ASSERT(aclp->z_slots >= aclp->z_acl_count);
-
- if (slots_left == 0 || aclp->z_state != ACL_DATA_ALLOCED) {
-
- to = kmem_alloc(ZFS_ACL_SIZE(aclp->z_acl_count +
- OGE_PAD), KM_SLEEP);
- if (aclp->z_state == ACL_DATA_ALLOCED)
- need_free++;
- from = aclp->z_acl;
- oldaclp = aclp->z_acl;
- (void) memmove(to, from,
- sizeof (ace_t) * aclp->z_acl_count);
- aclp->z_state = ACL_DATA_ALLOCED;
- } else {
- from = aclp->z_acl;
- to = aclp->z_acl;
- }
-
- (void) memmove(&to[i + 1], &from[i],
- sizeof (ace_t) * (aclp->z_acl_count - i));
-
- if (oldaclp) {
- aclp->z_acl = to;
- oldslots = aclp->z_slots;
- aclp->z_slots = aclp->z_acl_count + OGE_PAD;
- if (need_free)
- kmem_free(oldaclp, ZFS_ACL_SIZE(oldslots));
- }
-
-}
-
-/*
- * Prepend deny ACE
- */
-static void
-zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, int i,
- mode_t mode)
-{
- ace_t *acep;
-
- zfs_acl_prepend(aclp, i);
-
- acep = aclp->z_acl;
- zfs_set_ace(&acep[i], 0, DENY, acep[i + 1].a_who,
- (acep[i + 1].a_flags & ACE_TYPE_FLAGS));
- zfs_acl_prepend_fixup(&acep[i], &acep[i+1], mode, zp->z_phys->zp_uid);
- aclp->z_acl_count++;
-}
-
-/*
- * Split an inherited ACE into inherit_only ACE
- * and original ACE with inheritance flags stripped off.
- */
-static void
-zfs_acl_split_ace(zfs_acl_t *aclp, int i)
-{
- ace_t *acep;
-
- zfs_acl_prepend(aclp, i);
- acep = aclp->z_acl;
- acep[i] = acep[i + 1];
- acep[i].a_flags |= ACE_INHERIT_ONLY_ACE;
- acep[i + 1].a_flags &= ~ALL_INHERIT;
- aclp->z_acl_count++;
-}
-
-/*
- * Are the ACEs starting at index i the canonical six ACEs?
- */
-static int
-zfs_have_canonical_six(zfs_acl_t *aclp, int i)
-{
- ace_t *acep = aclp->z_acl;
-
- if ((zfs_acl_ace_match(&acep[i],
- DENY, ACE_OWNER, 0) &&
- zfs_acl_ace_match(&acep[i + 1], ALLOW, ACE_OWNER,
- OWNER_ALLOW_MASK) && zfs_acl_ace_match(&acep[i + 2],
- DENY, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 3],
- ALLOW, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 4],
- DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) &&
- zfs_acl_ace_match(&acep[i + 5], ALLOW, ACE_EVERYONE,
- EVERYONE_ALLOW_MASK))) {
- return (1);
- } else {
- return (0);
- }
-}
-
-/*
- * Apply step 1g to group entries.
- *
- * Need to deal with corner case where group may have
- * greater permissions than owner. If so then limit
- * group permissions, based on what extra permissions
- * group has.
- */
-static void
-zfs_fixup_group_entries(ace_t *acep, mode_t mode)
-{
- mode_t extramode = (mode >> 3) & 07;
- mode_t ownermode = (mode >> 6);
-
- if (acep[0].a_flags & ACE_IDENTIFIER_GROUP) {
-
- extramode &= ~ownermode;
-
- if (extramode) {
- if (extramode & 04) {
- acep[0].a_access_mask &= ~ACE_READ_DATA;
- acep[1].a_access_mask &= ~ACE_READ_DATA;
- }
- if (extramode & 02) {
- acep[0].a_access_mask &=
- ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- acep[1].a_access_mask &=
- ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- }
- if (extramode & 01) {
- acep[0].a_access_mask &= ~ACE_EXECUTE;
- acep[1].a_access_mask &= ~ACE_EXECUTE;
- }
- }
- }
-}
-
-/*
- * Apply the chmod algorithm as described
- * in PSARC/2002/240
- */
-static int
-zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp,
- dmu_tx_t *tx)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- ace_t *acep;
- int i;
- int error;
- int entry_type;
- int reuse_deny;
- int need_canonical_six = 1;
- int inherit = 0;
- int iflags;
-
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
- ASSERT(MUTEX_HELD(&zp->z_lock));
-
- i = 0;
- while (i < aclp->z_acl_count) {
- acep = aclp->z_acl;
- entry_type = (acep[i].a_flags & ACE_TYPE_FLAGS);
- iflags = (acep[i].a_flags & ALL_INHERIT);
-
- if ((acep[i].a_type != ALLOW && acep[i].a_type != DENY) ||
- (iflags & ACE_INHERIT_ONLY_ACE)) {
- i++;
- if (iflags)
- inherit = 1;
- continue;
- }
-
-
- if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) {
- zfs_ace_remove(aclp, i);
- continue;
- }
-
- /*
- * Need to split ace into two?
- */
- if ((iflags & (ACE_FILE_INHERIT_ACE|
- ACE_DIRECTORY_INHERIT_ACE)) &&
- (!(iflags & ACE_INHERIT_ONLY_ACE))) {
- zfs_acl_split_ace(aclp, i);
- i++;
- inherit = 1;
- continue;
- }
-
- if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
- (entry_type == OWNING_GROUP)) {
- acep[i].a_access_mask &= ~OGE_CLEAR;
- i++;
- continue;
-
- } else {
- if (acep[i].a_type == ALLOW) {
-
- /*
- * Check preceding ACE if any, to see
- * if we need to prepend a DENY ACE.
- * This is only applicable when the acl_mode
- * property == groupmask.
- */
- if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) {
-
- reuse_deny = zfs_reuse_deny(acep, i);
-
- if (reuse_deny == B_FALSE) {
- zfs_acl_prepend_deny(zp, aclp,
- i, mode);
- i++;
- acep = aclp->z_acl;
- } else {
- zfs_acl_prepend_fixup(
- &acep[i - 1],
- &acep[i], mode,
- zp->z_phys->zp_uid);
- }
- zfs_fixup_group_entries(&acep[i - 1],
- mode);
- }
- }
- i++;
- }
- }
-
- /*
- * Check the last six ACEs, if we have six.
- */
-
- if (aclp->z_acl_count >= 6) {
- i = aclp->z_acl_count - 6;
-
- if (zfs_have_canonical_six(aclp, i)) {
- need_canonical_six = 0;
- }
- }
-
- if (need_canonical_six) {
-
- zfs_acl_append(aclp, 6);
- i = aclp->z_acl_count;
- acep = aclp->z_acl;
- zfs_set_ace(&acep[i++], 0, DENY, -1, ACE_OWNER);
- zfs_set_ace(&acep[i++], OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER);
- zfs_set_ace(&acep[i++], 0, DENY, -1, OWNING_GROUP);
- zfs_set_ace(&acep[i++], 0, ALLOW, -1, OWNING_GROUP);
- zfs_set_ace(&acep[i++], EVERYONE_DENY_MASK,
- DENY, -1, ACE_EVERYONE);
- zfs_set_ace(&acep[i++], EVERYONE_ALLOW_MASK,
- ALLOW, -1, ACE_EVERYONE);
- aclp->z_acl_count += 6;
- }
-
- zfs_acl_fixup_canonical_six(aclp, mode);
-
- zp->z_phys->zp_mode = mode;
- error = zfs_aclset_common(zp, aclp, tx, &inherit);
- return (error);
-}
-
-
-int
-zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx)
-{
- zfs_acl_t *aclp = NULL;
- int error;
-
- ASSERT(MUTEX_HELD(&zp->z_lock));
- mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_node_read(zp, &aclp);
- if (error == 0)
- error = zfs_acl_chmod(zp, mode, aclp, tx);
- mutex_exit(&zp->z_acl_lock);
- if (aclp)
- zfs_acl_free(aclp);
- return (error);
-}
-
-/*
- * strip off write_owner and write_acl
- */
-static void
-zfs_securemode_update(zfsvfs_t *zfsvfs, ace_t *acep)
-{
- if ((zfsvfs->z_acl_inherit == ZFS_ACL_SECURE) &&
- (acep->a_type == ALLOW))
- acep->a_access_mask &= ~SECURE_CLEAR;
-}
-
-/*
- * inherit inheritable ACEs from parent
- */
-static zfs_acl_t *
-zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- ace_t *pacep;
- ace_t *acep;
- int ace_cnt = 0;
- int pace_cnt;
- int i, j;
- zfs_acl_t *aclp = NULL;
-
- i = j = 0;
- pace_cnt = paclp->z_acl_count;
- pacep = paclp->z_acl;
- if (zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) {
- for (i = 0; i != pace_cnt; i++) {
-
- if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW &&
- pacep[i].a_type == ALLOW)
- continue;
-
- if (zfs_ace_can_use(zp, &pacep[i])) {
- ace_cnt++;
- if (!(pacep[i].a_flags &
- ACE_NO_PROPAGATE_INHERIT_ACE))
- ace_cnt++;
- }
- }
- }
-
- aclp = zfs_acl_alloc(ace_cnt + OGE_PAD);
- if (ace_cnt && zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) {
- acep = aclp->z_acl;
- pacep = paclp->z_acl;
- for (i = 0; i != pace_cnt; i++) {
-
- if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW &&
- pacep[i].a_type == ALLOW)
- continue;
-
- if (zfs_ace_can_use(zp, &pacep[i])) {
-
- /*
- * Now create entry for inherited ace
- */
-
- acep[j] = pacep[i];
-
- /*
- * When AUDIT/ALARM a_types are supported
- * they should be inherited here.
- */
-
- if ((pacep[i].a_flags &
- ACE_NO_PROPAGATE_INHERIT_ACE) ||
- (ZTOV(zp)->v_type != VDIR)) {
- acep[j].a_flags &= ~ALL_INHERIT;
- zfs_securemode_update(zfsvfs, &acep[j]);
- j++;
- continue;
- }
-
- ASSERT(ZTOV(zp)->v_type == VDIR);
-
- /*
- * If we are inheriting an ACE targeted for
- * only files, then make sure inherit_only
- * is on for future propagation.
- */
- if ((pacep[i].a_flags & (ACE_FILE_INHERIT_ACE |
- ACE_DIRECTORY_INHERIT_ACE)) !=
- ACE_FILE_INHERIT_ACE) {
- j++;
- acep[j] = acep[j-1];
- acep[j-1].a_flags |=
- ACE_INHERIT_ONLY_ACE;
- acep[j].a_flags &= ~ALL_INHERIT;
- } else {
- acep[j].a_flags |= ACE_INHERIT_ONLY_ACE;
- }
- zfs_securemode_update(zfsvfs, &acep[j]);
- j++;
- }
- }
- }
- aclp->z_acl_count = j;
- ASSERT(aclp->z_slots >= aclp->z_acl_count);
-
- return (aclp);
-}
-
-/*
- * Create a file system object's initial permissions,
- * including inheritable ACEs.
- */
-void
-zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
- vattr_t *vap, dmu_tx_t *tx, cred_t *cr)
-{
- uint64_t mode;
- uid_t uid;
- gid_t gid;
- int error;
- int pull_down;
- zfs_acl_t *aclp, *paclp;
-
- mode = MAKEIMODE(vap->va_type, vap->va_mode);
-
- /*
- * Determine uid and gid.
- */
- if ((flag & (IS_ROOT_NODE | IS_REPLAY)) ||
- ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
- uid = vap->va_uid;
- gid = vap->va_gid;
- } else {
- uid = crgetuid(cr);
- if ((vap->va_mask & AT_GID) &&
- ((vap->va_gid == parent->z_phys->zp_gid) ||
- groupmember(vap->va_gid, cr) ||
- secpolicy_vnode_create_gid(cr) == 0))
- gid = vap->va_gid;
- else
-#ifdef __FreeBSD__
- gid = parent->z_phys->zp_gid;
-#else
- gid = (parent->z_phys->zp_mode & S_ISGID) ?
- parent->z_phys->zp_gid : crgetgid(cr);
-#endif
- }
-
- /*
- * If we're creating a directory, and the parent directory has the
- * set-GID bit set, set it on the new directory.
- * Otherwise, if the user is neither privileged nor a member of the
- * file's new group, clear the file's set-GID bit.
- */
-
- if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR))
- mode |= S_ISGID;
- else {
- if ((mode & S_ISGID) &&
- secpolicy_vnode_setids_setgids(cr, gid) != 0)
- mode &= ~S_ISGID;
- }
-
- zp->z_phys->zp_uid = uid;
- zp->z_phys->zp_gid = gid;
- zp->z_phys->zp_mode = mode;
-
- mutex_enter(&parent->z_lock);
- pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE);
- if (pull_down) {
- mutex_enter(&parent->z_acl_lock);
- VERIFY(0 == zfs_acl_node_read(parent, &paclp));
- mutex_exit(&parent->z_acl_lock);
- aclp = zfs_acl_inherit(zp, paclp);
- zfs_acl_free(paclp);
- } else {
- aclp = zfs_acl_alloc(6);
- }
- mutex_exit(&parent->z_lock);
- mutex_enter(&zp->z_lock);
- mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_chmod(zp, mode, aclp, tx);
- mutex_exit(&zp->z_lock);
- mutex_exit(&zp->z_acl_lock);
- ASSERT3U(error, ==, 0);
- zfs_acl_free(aclp);
-}
-
-/*
- * Should ACE be inherited?
- */
-static int
-zfs_ace_can_use(znode_t *zp, ace_t *acep)
-{
- int vtype = ZTOV(zp)->v_type;
-
- int iflags = (acep->a_flags & ALL_INHERIT);
-
- if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
- return (1);
- else if (iflags & ACE_FILE_INHERIT_ACE)
- return (!((vtype == VDIR) &&
- (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
- return (0);
-}
-
-#ifdef TODO
-/*
- * Retrieve a file's ACL
- */
-int
-zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr)
-{
- zfs_acl_t *aclp;
- ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
- int error;
-
- if (error = zfs_zaccess(zp, ACE_READ_ACL, cr)) {
- /*
- * If owner of file then allow reading of the
- * ACL.
- */
- if (crgetuid(cr) != zp->z_phys->zp_uid)
- return (error);
- }
-
- if (mask == 0)
- return (ENOSYS);
-
- mutex_enter(&zp->z_acl_lock);
-
- error = zfs_acl_node_read(zp, &aclp);
- if (error != 0) {
- mutex_exit(&zp->z_acl_lock);
- return (error);
- }
-
-
- if (mask & VSA_ACECNT) {
- vsecp->vsa_aclcnt = aclp->z_acl_count;
- }
-
- if (mask & VSA_ACE) {
- vsecp->vsa_aclentp = kmem_alloc(aclp->z_acl_count *
- sizeof (ace_t), KM_SLEEP);
- bcopy(aclp->z_acl, vsecp->vsa_aclentp,
- aclp->z_acl_count * sizeof (ace_t));
- }
-
- mutex_exit(&zp->z_acl_lock);
-
- zfs_acl_free(aclp);
-
- return (0);
-}
-#endif /* TODO */
-
-#ifdef TODO
-/*
- * Set a file's ACL
- */
-int
-zfs_setacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- ace_t *acep = vsecp->vsa_aclentp;
- int aclcnt = vsecp->vsa_aclcnt;
- ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
- dmu_tx_t *tx;
- int error;
- int inherit;
- zfs_acl_t *aclp;
-
- if (mask == 0)
- return (EINVAL);
-
- if (!zfs_acl_valid(zp, acep, aclcnt, &inherit))
- return (EINVAL);
-top:
- error = zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr);
- if (error == EACCES || error == ACCESS_UNDETERMINED) {
- if ((error = secpolicy_vnode_setdac(cr,
- zp->z_phys->zp_uid)) != 0) {
- return (error);
- }
- } else if (error) {
- return (error == EROFS ? error : EPERM);
- }
-
- mutex_enter(&zp->z_lock);
- mutex_enter(&zp->z_acl_lock);
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
-
- if (zp->z_phys->zp_acl.z_acl_extern_obj) {
- dmu_tx_hold_write(tx, zp->z_phys->zp_acl.z_acl_extern_obj,
- 0, ZFS_ACL_SIZE(aclcnt));
- } else if (aclcnt > ACE_SLOT_CNT) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ZFS_ACL_SIZE(aclcnt));
- }
-
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- mutex_exit(&zp->z_acl_lock);
- mutex_exit(&zp->z_lock);
-
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- return (error);
- }
-
- aclp = zfs_acl_alloc(aclcnt);
- bcopy(acep, aclp->z_acl, sizeof (ace_t) * aclcnt);
- aclp->z_acl_count = aclcnt;
- error = zfs_aclset_common(zp, aclp, tx, &inherit);
- ASSERT(error == 0);
-
- zfs_acl_free(aclp);
- zfs_log_acl(zilog, tx, TX_ACL, zp, aclcnt, acep);
- dmu_tx_commit(tx);
-done:
- mutex_exit(&zp->z_acl_lock);
- mutex_exit(&zp->z_lock);
-
- return (error);
-}
-#endif /* TODO */
-
-static int
-zfs_ace_access(ace_t *zacep, int *working_mode)
-{
- if (*working_mode == 0) {
- return (0);
- }
-
- if (zacep->a_access_mask & *working_mode) {
- if (zacep->a_type == ALLOW) {
- *working_mode &=
- ~(*working_mode & zacep->a_access_mask);
- if (*working_mode == 0)
- return (0);
- } else if (zacep->a_type == DENY) {
- return (EACCES);
- }
- }
-
- /*
- * Haven't been specifically denied at this point,
- * so return UNDETERMINED.
- */
-
- return (ACCESS_UNDETERMINED);
-}
-
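-/*
- * Worked example, not in the original file: with *working_mode ==
- * (ACE_READ_DATA|ACE_WRITE_DATA), an ALLOW ace granting
- * ACE_READ_DATA clears that bit and returns ACCESS_UNDETERMINED;
- * a later DENY ace naming ACE_WRITE_DATA then ends the walk in
- * zfs_zaccess_common() with EACCES.
- */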
-
-static int
-zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr)
-{
- zfs_acl_t *aclp;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- ace_t *zacep;
- gid_t gid;
- int cnt;
- int i;
- int error;
- int access_deny = ACCESS_UNDETERMINED;
- uint_t entry_type;
- uid_t uid = crgetuid(cr);
-
- if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */
- *working_mode = 0;
- return (0);
- }
-
- *working_mode = v4_mode;
-
- if ((v4_mode & WRITE_MASK) &&
- (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
- (!IS_DEVVP(ZTOV(zp)))) {
- return (EROFS);
- }
-
- mutex_enter(&zp->z_acl_lock);
-
- error = zfs_acl_node_read(zp, &aclp);
- if (error != 0) {
- mutex_exit(&zp->z_acl_lock);
- return (error);
- }
-
-
- zacep = aclp->z_acl;
- cnt = aclp->z_acl_count;
-
- for (i = 0; i != cnt; i++) {
-
- DTRACE_PROBE2(zfs__access__common,
- ace_t *, &zacep[i], int, *working_mode);
-
- if (zacep[i].a_flags & ACE_INHERIT_ONLY_ACE)
- continue;
-
- entry_type = (zacep[i].a_flags & ACE_TYPE_FLAGS);
- switch (entry_type) {
- case ACE_OWNER:
- if (uid == zp->z_phys->zp_uid) {
- access_deny = zfs_ace_access(&zacep[i],
- working_mode);
- }
- break;
- case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
- case ACE_IDENTIFIER_GROUP:
- /*
- * Owning group gid is in znode not ACL
- */
- if (entry_type == (ACE_IDENTIFIER_GROUP | ACE_GROUP))
- gid = zp->z_phys->zp_gid;
- else
- gid = zacep[i].a_who;
-
- if (groupmember(gid, cr)) {
- access_deny = zfs_ace_access(&zacep[i],
- working_mode);
- }
- break;
- case ACE_EVERYONE:
- access_deny = zfs_ace_access(&zacep[i], working_mode);
- break;
-
- /* USER Entry */
- default:
- if (entry_type == 0) {
- if (uid == zacep[i].a_who) {
- access_deny = zfs_ace_access(&zacep[i],
- working_mode);
- }
- break;
- }
- zfs_acl_free(aclp);
- mutex_exit(&zp->z_acl_lock);
- return (EIO);
- }
-
- if (access_deny != ACCESS_UNDETERMINED)
- break;
- }
-
- mutex_exit(&zp->z_acl_lock);
- zfs_acl_free(aclp);
-
- return (access_deny);
-}
-
-
-/*
- * Determine whether access should be granted/denied, invoking the
- * least-priv subsystem when a deny is determined.
- */
-int
-zfs_zaccess(znode_t *zp, int mode, cred_t *cr)
-{
- int working_mode;
- int error;
- int is_attr;
- znode_t *xzp;
- znode_t *check_zp = zp;
-
- is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) &&
- (ZTOV(zp)->v_type == VDIR));
-
- /*
- * If attribute then validate against base file
- */
- if (is_attr) {
- if ((error = zfs_zget(zp->z_zfsvfs,
- zp->z_phys->zp_parent, &xzp)) != 0) {
- return (error);
- }
- check_zp = xzp;
- /*
- * fixup mode to map to xattr perms
- */
-
- if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
- mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- mode |= ACE_WRITE_NAMED_ATTRS;
- }
-
- if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
- mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
- mode |= ACE_READ_NAMED_ATTRS;
- }
- }
-
- error = zfs_zaccess_common(check_zp, mode, &working_mode, cr);
-
- if (error == EROFS) {
- if (is_attr)
- VN_RELE(ZTOV(xzp));
- return (error);
- }
-
- if (error || working_mode) {
- working_mode = (zfs_v4_to_unix(working_mode) << 6);
- error = secpolicy_vnode_access(cr, ZTOV(check_zp),
- check_zp->z_phys->zp_uid, working_mode);
- }
-
- if (is_attr)
- VN_RELE(ZTOV(xzp));
-
- return (error);
-}
-
-/*
- * Special zaccess function to check for a special nfsv4 perm.
- * It doesn't call secpolicy_vnode_access() on failure, since that
- * would probably be the wrong policy function to call;
- * instead it's up to the caller to handle that situation.
- */
-
-int
-zfs_zaccess_v4_perm(znode_t *zp, int mode, cred_t *cr)
-{
- int working_mode = 0;
- return (zfs_zaccess_common(zp, mode, &working_mode, cr));
-}
-
-/*
- * Translate traditional unix VREAD/VWRITE/VEXEC mode into
- * native ACL format and call zfs_zaccess()
- */
-int
-zfs_zaccess_rwx(znode_t *zp, mode_t mode, cred_t *cr)
-{
- int v4_mode = zfs_unix_to_v4(mode >> 6);
-
- return (zfs_zaccess(zp, v4_mode, cr));
-}
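-
-/*
- * Sketch, not in the original file: a caller holding only the
- * traditional unix bits, e.g. checking write access before an
- * update, goes through zfs_zaccess_rwx().
- */
-#if 0 /* example only */
- error = zfs_zaccess_rwx(zp, VWRITE, cr);
- if (error)
- return (error);
-#endif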
-
-static int
-zfs_delete_final_check(znode_t *zp, znode_t *dzp, cred_t *cr)
-{
- int error;
-
- error = secpolicy_vnode_access(cr, ZTOV(zp),
- dzp->z_phys->zp_uid, S_IWRITE|S_IEXEC);
-
- if (error == 0)
- error = zfs_sticky_remove_access(dzp, zp, cr);
-
- return (error);
-}
-
-/*
- * Determine whether access should be granted/denied, without
- * consulting the least-priv subsystem.
- *
- *
- * The following chart is the recommended NFSv4 enforcement for
- * ability to delete an object.
- *
- * -------------------------------------------------------
- * | Parent Dir | Target Object Permissions |
- * | permissions | |
- * -------------------------------------------------------
- * | | ACL Allows | ACL Denies| Delete |
- * | | Delete | Delete | unspecified|
- * -------------------------------------------------------
- * | ACL Allows | Permit | Permit | Permit |
- * | DELETE_CHILD | |
- * -------------------------------------------------------
- * | ACL Denies | Permit | Deny | Deny |
- * | DELETE_CHILD | | | |
- * -------------------------------------------------------
- * | ACL specifies | | | |
- * | only allow | Permit | Permit | Permit |
- * | write and | | | |
- * | execute | | | |
- * -------------------------------------------------------
- * | ACL denies | | | |
- * | write and | Permit | Deny | Deny |
- * | execute | | | |
- * -------------------------------------------------------
- * ^
- * |
- * No search privilege, can't even look up file?
- *
- */
-int
-zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
-{
- int dzp_working_mode = 0;
- int zp_working_mode = 0;
- int dzp_error, zp_error;
-
- /*
- * Arghh, this check is going to require a couple of questions
- * to be asked. We want specific DELETE permissions to
- * take precedence over WRITE/EXECUTE. We don't
- * want an ACL such as this to mess us up.
- * user:joe:write_data:deny,user:joe:delete:allow
- *
- * However, deny permissions may ultimately be overridden
- * by secpolicy_vnode_access().
- */
-
- dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
- &dzp_working_mode, cr);
- zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, cr);
-
- if (dzp_error == EROFS || zp_error == EROFS)
- return (dzp_error);
-
- /*
- * First check the first row.
- * We only need to see if parent Allows delete_child
- */
- if ((dzp_working_mode & ACE_DELETE_CHILD) == 0)
- return (0);
-
- /*
- * Second row
- * we already have the necessary information in
- * zp_working_mode, zp_error and dzp_error.
- */
-
- if ((zp_working_mode & ACE_DELETE) == 0)
- return (0);
-
- /*
- * Now zp_error should be either EACCES, which indicates
- * a "deny" delete entry, or ACCESS_UNDETERMINED if no "delete"
- * entry exists on the target.
- *
- * dzp_error should be either EACCES, which indicates a "deny"
- * entry for delete_child, or ACCESS_UNDETERMINED if no delete_child
- * entry exists. If the value is EACCES then we are done
- * and zfs_delete_final_check() will make the final decision
- * regarding whether to allow the delete.
- */
-
- ASSERT(zp_error != 0 && dzp_error != 0);
- if (dzp_error == EACCES)
- return (zfs_delete_final_check(zp, dzp, cr));
-
- /*
- * Third Row
- * Only need to check for write/execute on parent
- */
-
- dzp_error = zfs_zaccess_common(dzp, ACE_WRITE_DATA|ACE_EXECUTE,
- &dzp_working_mode, cr);
-
- if (dzp_error == EROFS)
- return (dzp_error);
-
- if ((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) == 0)
- return (zfs_sticky_remove_access(dzp, zp, cr));
-
- /*
- * Fourth Row
- */
-
- if (((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) != 0) &&
- ((zp_working_mode & ACE_DELETE) == 0))
- return (zfs_sticky_remove_access(dzp, zp, cr));
-
- return (zfs_delete_final_check(zp, dzp, cr));
-}
-
-int
-zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
- znode_t *tzp, cred_t *cr)
-{
- int add_perm;
- int error;
-
- add_perm = (ZTOV(szp)->v_type == VDIR) ?
- ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
-
- /*
- * Rename permissions are combination of delete permission +
- * add file/subdir permission.
- */
-
- /*
- * first make sure we do the delete portion.
- *
- * If that succeeds then check for add_file/add_subdir permissions
- */
-
- if (error = zfs_zaccess_delete(sdzp, szp, cr))
- return (error);
-
- /*
- * If we have a tzp, see if we can delete it.
- */
- if (tzp) {
- if (error = zfs_zaccess_delete(tdzp, tzp, cr))
- return (error);
- }
-
- /*
- * Now check for add permissions
- */
- error = zfs_zaccess(tdzp, add_perm, cr);
-
- return (error);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
deleted file mode 100644
index c8450d4..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/vfs.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_acl.h>
-
-void
-zfs_ace_byteswap(ace_t *ace, int ace_cnt)
-{
- int i;
-
- for (i = 0; i != ace_cnt; i++, ace++) {
- ace->a_who = BSWAP_32(ace->a_who);
- ace->a_access_mask = BSWAP_32(ace->a_access_mask);
- ace->a_flags = BSWAP_16(ace->a_flags);
- ace->a_type = BSWAP_16(ace->a_type);
- }
-}
-
-/* ARGSUSED */
-void
-zfs_acl_byteswap(void *buf, size_t size)
-{
- int cnt;
-
- /*
- * Arggh, since we don't know how many ACEs are in
- * the array, we have to swap the entire block
- */
-
- cnt = size / sizeof (ace_t);
-
- zfs_ace_byteswap((ace_t *)buf, cnt);
-}
-
-void
-zfs_znode_byteswap(void *buf, size_t size)
-{
- znode_phys_t *zp = buf;
-
- ASSERT(size >= sizeof (znode_phys_t));
-
- zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]);
- zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]);
- zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]);
- zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]);
- zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]);
- zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]);
- zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]);
- zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]);
- zp->zp_gen = BSWAP_64(zp->zp_gen);
- zp->zp_mode = BSWAP_64(zp->zp_mode);
- zp->zp_size = BSWAP_64(zp->zp_size);
- zp->zp_parent = BSWAP_64(zp->zp_parent);
- zp->zp_links = BSWAP_64(zp->zp_links);
- zp->zp_xattr = BSWAP_64(zp->zp_xattr);
- zp->zp_rdev = BSWAP_64(zp->zp_rdev);
- zp->zp_flags = BSWAP_64(zp->zp_flags);
- zp->zp_uid = BSWAP_64(zp->zp_uid);
- zp->zp_gid = BSWAP_64(zp->zp_gid);
- zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]);
- zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]);
- zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]);
- zp->zp_pad[3] = BSWAP_64(zp->zp_pad[3]);
-
- zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj);
- zp->zp_acl.z_acl_count = BSWAP_32(zp->zp_acl.z_acl_count);
- zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version);
- zp->zp_acl.z_acl_pad = BSWAP_16(zp->zp_acl.z_acl_pad);
- zfs_ace_byteswap(&zp->zp_acl.z_ace_data[0], ACE_SLOT_CNT);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
deleted file mode 100644
index 0c2fb02..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
+++ /dev/null
@@ -1,1119 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * ZFS control directory (a.k.a. ".zfs")
- *
- * This directory provides a common location for all ZFS meta-objects.
- * Currently, this is only the 'snapshot' directory, but this may expand in the
- * future. The elements are built using the GFS primitives, as the hierarchy
- * does not actually exist on disk.
- *
- * For 'snapshot', we don't want to have all snapshots always mounted, because
- * this would take up a huge amount of space in /etc/mnttab. We have three
- * types of objects:
- *
- * ctldir ------> snapshotdir -------> snapshot
- * |
- * |
- * V
- * mounted fs
- *
- * The 'snapshot' node contains just enough information to lookup '..' and act
- * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
- * perform an automount of the underlying filesystem and return the
- * corresponding vnode.
- *
- * All mounts are handled automatically by the kernel, but unmounts are
- * (currently) handled from user land. The main reason is that there is no
- * reliable way to auto-unmount the filesystem when it's "no longer in use".
- * When the user unmounts a filesystem, we call zfsctl_unmount(), which
- * unmounts any snapshots within the snapshot directory.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/namei.h>
-#include <sys/gfs.h>
-#include <sys/stat.h>
-#include <sys/dmu.h>
-#include <sys/mount.h>
-
-typedef struct {
- char *se_name;
- vnode_t *se_root;
- avl_node_t se_node;
-} zfs_snapentry_t;
-
-static int
-snapentry_compare(const void *a, const void *b)
-{
- const zfs_snapentry_t *sa = a;
- const zfs_snapentry_t *sb = b;
- int ret = strcmp(sa->se_name, sb->se_name);
-
- if (ret < 0)
- return (-1);
- else if (ret > 0)
- return (1);
- else
- return (0);
-}
-
-static struct vop_vector zfsctl_ops_root;
-static struct vop_vector zfsctl_ops_snapdir;
-static struct vop_vector zfsctl_ops_snapshot;
-
-static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
-static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
-
-typedef struct zfsctl_node {
- gfs_dir_t zc_gfs_private;
- uint64_t zc_id;
- timestruc_t zc_cmtime; /* ctime and mtime, always the same */
-} zfsctl_node_t;
-
-typedef struct zfsctl_snapdir {
- zfsctl_node_t sd_node;
- kmutex_t sd_lock;
- avl_tree_t sd_snaps;
-} zfsctl_snapdir_t;
-
-/*
- * Root directory elements. We have only a single static entry, 'snapshot'.
- */
-static gfs_dirent_t zfsctl_root_entries[] = {
- { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
- { NULL }
-};
-
-/* include . and .. in the calculation */
-#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \
- sizeof (gfs_dirent_t)) + 1)
-
-
-/*
- * Initialize the various GFS pieces we'll need to create and manipulate .zfs
- * directories. This is called from the ZFS init routine, and initializes the
- * vnode ops vectors that we'll be using.
- */
-void
-zfsctl_init(void)
-{
-}
-
-void
-zfsctl_fini(void)
-{
-}
-
-/*
- * Return the inode number associated with the 'snapshot' directory.
- */
-/* ARGSUSED */
-static ino64_t
-zfsctl_root_inode_cb(vnode_t *vp, int index)
-{
- ASSERT(index == 0);
- return (ZFSCTL_INO_SNAPDIR);
-}
-
-/*
- * Create the '.zfs' directory. This directory is cached as part of the VFS
- * structure. This results in a hold on the vfs_t. The code in zfs_umount()
- * therefore checks against a vfs_count of 2 instead of 1. This reference
- * is removed when the ctldir is destroyed in the unmount.
- */
-void
-zfsctl_create(zfsvfs_t *zfsvfs)
-{
- vnode_t *vp, *rvp;
- zfsctl_node_t *zcp;
-
- ASSERT(zfsvfs->z_ctldir == NULL);
-
- vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
- &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
- zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
- zcp = vp->v_data;
- zcp->zc_id = ZFSCTL_INO_ROOT;
-
- VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp, curthread) == 0);
- ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
- VN_URELE(rvp);
-
- /*
- * We're only faking the fact that we have a root of a filesystem for
- * the sake of the GFS interfaces. Undo the flag manipulation it did
- * for us.
- */
- vp->v_vflag &= ~VV_ROOT;
-
- zfsvfs->z_ctldir = vp;
-}
-
-/*
- * Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
- * There might still be more references if we were force unmounted, but only
- * new zfs_inactive() calls can occur and they don't reference .zfs
- */
-void
-zfsctl_destroy(zfsvfs_t *zfsvfs)
-{
- VN_RELE(zfsvfs->z_ctldir);
- zfsvfs->z_ctldir = NULL;
-}
-
-/*
- * Given a root znode, retrieve the associated .zfs directory.
- * Add a hold to the vnode and return it.
- */
-vnode_t *
-zfsctl_root(znode_t *zp)
-{
- ASSERT(zfs_has_ctldir(zp));
- VN_HOLD(zp->z_zfsvfs->z_ctldir);
- return (zp->z_zfsvfs->z_ctldir);
-}
-
-/*
- * Common open routine. Disallow any write access.
- */
-/* ARGSUSED */
-static int
-zfsctl_common_open(struct vop_open_args *ap)
-{
- int flags = ap->a_mode;
-
- if (flags & FWRITE)
- return (EACCES);
-
- return (0);
-}
-
-/*
- * Common close routine. Nothing to do here.
- */
-/* ARGSUSED */
-static int
-zfsctl_common_close(struct vop_close_args *ap)
-{
- return (0);
-}
-
-/*
- * Common access routine. Disallow writes.
- */
-/* ARGSUSED */
-static int
-zfsctl_common_access(ap)
- struct vop_access_args /* {
- struct vnode *a_vp;
- int a_mode;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- int mode = ap->a_mode;
-
- if (mode & VWRITE)
- return (EACCES);
-
- return (0);
-}
-
-/*
- * Common getattr function. Fill in basic information.
- */
-static void
-zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
-{
- zfsctl_node_t *zcp = vp->v_data;
- timestruc_t now;
-
- vap->va_uid = 0;
- vap->va_gid = 0;
- vap->va_rdev = 0;
- /*
- * We are a purely virtual object, so we have no
- * blocksize or allocated blocks.
- */
- vap->va_blksize = 0;
- vap->va_nblocks = 0;
- vap->va_seq = 0;
- vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
- vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
- S_IROTH | S_IXOTH;
- vap->va_type = VDIR;
- /*
- * We live in the now (for atime).
- */
- gethrestime(&now);
- vap->va_atime = now;
- vap->va_mtime = vap->va_ctime = vap->va_birthtime = zcp->zc_cmtime;
- /* FreeBSD: Reset chflags(2) flags. */
- vap->va_flags = 0;
-}
-
-static int
-zfsctl_common_fid(ap)
- struct vop_fid_args /* {
- struct vnode *a_vp;
- struct fid *a_fid;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- fid_t *fidp = (void *)ap->a_fid;
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- zfsctl_node_t *zcp = vp->v_data;
- uint64_t object = zcp->zc_id;
- zfid_short_t *zfid;
- int i;
-
- ZFS_ENTER(zfsvfs);
-
- fidp->fid_len = SHORT_FID_LEN;
-
- zfid = (zfid_short_t *)fidp;
-
- zfid->zf_len = SHORT_FID_LEN;
-
- for (i = 0; i < sizeof (zfid->zf_object); i++)
- zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
-
- /* .zfs znodes always have a generation number of 0 */
- for (i = 0; i < sizeof (zfid->zf_gen); i++)
- zfid->zf_gen[i] = 0;
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-static int
-zfsctl_common_reclaim(ap)
- struct vop_reclaim_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
-
- /*
- * Destroy the vm object and flush associated pages.
- */
- vnode_destroy_vobject(vp);
- VI_LOCK(vp);
- vp->v_data = NULL;
- VI_UNLOCK(vp);
- return (0);
-}
-
-/*
- * .zfs inode namespace
- *
- * We need to generate unique inode numbers for all files and directories
- * within the .zfs pseudo-filesystem. We use the following scheme:
- *
- *     ENTRY                   ZFSCTL_INODE
- *     .zfs                    1
- *     .zfs/snapshot           2
- *     .zfs/snapshot/<snap>    objectid(snap)
- */
-
-#define ZFSCTL_INO_SNAP(id) (id)
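-
-/*
- * For illustration, per the table above: the '.zfs' root is created with
- * ZFSCTL_INO_ROOT (1), zfsctl_root_inode_cb() returns ZFSCTL_INO_SNAPDIR
- * (2) for 'snapshot', and a snapshot whose objset id is, say, 37 is given
- * inode ZFSCTL_INO_SNAP(37) == 37.
- */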
-
-/*
- * Get root directory attributes.
- */
-/* ARGSUSED */
-static int
-zfsctl_root_getattr(ap)
- struct vop_getattr_args /* {
- struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- struct vnode *vp = ap->a_vp;
- struct vattr *vap = ap->a_vap;
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-
- ZFS_ENTER(zfsvfs);
- vap->va_nodeid = ZFSCTL_INO_ROOT;
- vap->va_nlink = vap->va_size = NROOT_ENTRIES;
-
- zfsctl_common_getattr(vp, vap);
- ZFS_EXIT(zfsvfs);
-
- return (0);
-}
-
-/*
- * Special case the handling of "..".
- */
-/* ARGSUSED */
-int
-zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
- int flags, vnode_t *rdir, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
- int err;
-
- ZFS_ENTER(zfsvfs);
-
- if (strcmp(nm, "..") == 0) {
- err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp, curthread);
- if (err == 0)
- VOP_UNLOCK(*vpp, 0);
- } else {
- err = gfs_dir_lookup(dvp, nm, vpp);
- }
-
- ZFS_EXIT(zfsvfs);
-
- return (err);
-}
-
-/*
- * Special case the handling of "..".
- */
-/* ARGSUSED */
-int
-zfsctl_root_lookup_vop(ap)
- struct vop_lookup_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- vnode_t *dvp = ap->a_dvp;
- vnode_t **vpp = ap->a_vpp;
- cred_t *cr = ap->a_cnp->cn_cred;
- int flags = ap->a_cnp->cn_flags;
- int nameiop = ap->a_cnp->cn_nameiop;
- char nm[NAME_MAX + 1];
- int err;
-
- if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE))
- return (EOPNOTSUPP);
-
- ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
- strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
-
- err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr);
- if (err == 0 && (nm[0] != '.' || nm[1] != '\0'))
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
-
- return (err);
-}
-
-static struct vop_vector zfsctl_ops_root = {
- .vop_default = &default_vnodeops,
- .vop_open = zfsctl_common_open,
- .vop_close = zfsctl_common_close,
- .vop_ioctl = VOP_EINVAL,
- .vop_getattr = zfsctl_root_getattr,
- .vop_access = zfsctl_common_access,
- .vop_readdir = gfs_vop_readdir,
- .vop_lookup = zfsctl_root_lookup_vop,
- .vop_inactive = gfs_vop_inactive,
- .vop_reclaim = zfsctl_common_reclaim,
- .vop_fid = zfsctl_common_fid,
-};
-
-static int
-zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
-{
- objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
-
- dmu_objset_name(os, zname);
- if (strlen(zname) + 1 + strlen(name) >= len)
- return (ENAMETOOLONG);
- (void) strcat(zname, "@");
- (void) strcat(zname, name);
- return (0);
-}
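-
-/*
- * For example (hypothetical names): if the objset behind 'vp' is named
- * "tank/home" and 'name' is "monday", 'zname' becomes "tank/home@monday".
- * ENAMETOOLONG is returned when the combined name will not fit in 'len'
- * bytes.
- */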
-
-static int
-zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
-{
- zfsctl_snapdir_t *sdp = dvp->v_data;
- zfs_snapentry_t search, *sep;
- struct vop_inactive_args ap;
- avl_index_t where;
- int err;
-
- ASSERT(MUTEX_HELD(&sdp->sd_lock));
-
- search.se_name = (char *)name;
- if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
- return (ENOENT);
-
- ASSERT(vn_ismntpt(sep->se_root));
-
- /* this will be dropped by dounmount() */
- if ((err = vn_vfswlock(sep->se_root)) != 0)
- return (err);
-
- err = dounmount(vn_mountedvfs(sep->se_root), force, curthread);
- if (err)
- return (err);
- ASSERT(sep->se_root->v_count == 1);
- ap.a_vp = sep->se_root;
- gfs_vop_inactive(&ap);
-
- avl_remove(&sdp->sd_snaps, sep);
- kmem_free(sep->se_name, strlen(sep->se_name) + 1);
- kmem_free(sep, sizeof (zfs_snapentry_t));
-
- return (0);
-}
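-
-/*
- * (Called with sd_lock held. zfsctl_umount_snapshots() below performs
- * the same unmount/teardown sequence inline for every mounted snapshot
- * under '.zfs/snapshot'.)
- */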
-
-#if 0
-static void
-zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
-{
- avl_index_t where;
- vfs_t *vfsp;
- refstr_t *pathref;
- char newpath[MAXNAMELEN];
- char *tail;
-
- ASSERT(MUTEX_HELD(&sdp->sd_lock));
- ASSERT(sep != NULL);
-
- vfsp = vn_mountedvfs(sep->se_root);
- ASSERT(vfsp != NULL);
-
- vfs_lock_wait(vfsp);
-
- /*
- * Change the name in the AVL tree.
- */
- avl_remove(&sdp->sd_snaps, sep);
- kmem_free(sep->se_name, strlen(sep->se_name) + 1);
- sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
- (void) strcpy(sep->se_name, nm);
- VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
- avl_insert(&sdp->sd_snaps, sep, where);
-
- /*
- * Change the current mountpoint info:
- * - update the tail of the mntpoint path
- * - update the tail of the resource path
- */
- pathref = vfs_getmntpoint(vfsp);
- (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
- VERIFY((tail = strrchr(newpath, '/')) != NULL);
- *(tail+1) = '\0';
- ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
- (void) strcat(newpath, nm);
- refstr_rele(pathref);
- vfs_setmntpoint(vfsp, newpath);
-
- pathref = vfs_getresource(vfsp);
- (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
- VERIFY((tail = strrchr(newpath, '@')) != NULL);
- *(tail+1) = '\0';
- ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
- (void) strcat(newpath, nm);
- refstr_rele(pathref);
- vfs_setresource(vfsp, newpath);
-
- vfs_unlock(vfsp);
-}
-#endif
-
-#if 0
-static int
-zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
- cred_t *cr)
-{
- zfsctl_snapdir_t *sdp = sdvp->v_data;
- zfs_snapentry_t search, *sep;
- avl_index_t where;
- char from[MAXNAMELEN], to[MAXNAMELEN];
- int err;
-
- err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
- if (err)
- return (err);
- err = zfs_secpolicy_write(from, cr);
- if (err)
- return (err);
-
- /*
- * Cannot move snapshots out of the snapdir.
- */
- if (sdvp != tdvp)
- return (EINVAL);
-
- if (strcmp(snm, tnm) == 0)
- return (0);
-
- err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
- if (err)
- return (err);
-
- mutex_enter(&sdp->sd_lock);
-
- search.se_name = (char *)snm;
- if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
- mutex_exit(&sdp->sd_lock);
- return (ENOENT);
- }
-
- err = dmu_objset_rename(from, to, B_FALSE);
- if (err == 0)
- zfsctl_rename_snap(sdp, sep, tnm);
-
- mutex_exit(&sdp->sd_lock);
-
- return (err);
-}
-#endif
-
-#if 0
-/* ARGSUSED */
-static int
-zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
-{
- zfsctl_snapdir_t *sdp = dvp->v_data;
- char snapname[MAXNAMELEN];
- int err;
-
- err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
- if (err)
- return (err);
- err = zfs_secpolicy_write(snapname, cr);
- if (err)
- return (err);
-
- mutex_enter(&sdp->sd_lock);
-
- err = zfsctl_unmount_snap(dvp, name, 0, cr);
- if (err) {
- mutex_exit(&sdp->sd_lock);
- return (err);
- }
-
- err = dmu_objset_destroy(snapname);
-
- mutex_exit(&sdp->sd_lock);
-
- return (err);
-}
-#endif
-
-/*
- * Lookup entry point for the 'snapshot' directory. Try to open the
- * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
- * Perform a mount of the associated dataset on top of the vnode.
- */
-/* ARGSUSED */
-int
-zfsctl_snapdir_lookup(ap)
- struct vop_lookup_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- vnode_t *dvp = ap->a_dvp;
- vnode_t **vpp = ap->a_vpp;
- char nm[NAME_MAX + 1];
- zfsctl_snapdir_t *sdp = dvp->v_data;
- objset_t *snap;
- char snapname[MAXNAMELEN];
- char *mountpoint;
- zfs_snapentry_t *sep, search;
- size_t mountpoint_len;
- avl_index_t where;
- zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
- int err;
-
- ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
- strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
-
- ASSERT(dvp->v_type == VDIR);
-
- if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
- return (0);
-
- *vpp = NULL;
-
- /*
- * If we get a recursive call, that means we got called
- * from the domount() code while it was trying to look up the
- * spec (which looks like a local path for zfs). We need to
- * add some flag to domount() to tell it not to do this lookup.
- */
- if (MUTEX_HELD(&sdp->sd_lock))
- return (ENOENT);
-
- ZFS_ENTER(zfsvfs);
-
- mutex_enter(&sdp->sd_lock);
- search.se_name = (char *)nm;
- if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
- *vpp = sep->se_root;
- VN_HOLD(*vpp);
- if ((*vpp)->v_mountedhere == NULL) {
- /*
- * The snapshot was unmounted behind our backs;
- * try to remount it.
- */
- goto domount;
- }
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- mutex_exit(&sdp->sd_lock);
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- /*
- * The requested snapshot is not currently mounted, look it up.
- */
- err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
- if (err) {
- mutex_exit(&sdp->sd_lock);
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- if (dmu_objset_open(snapname, DMU_OST_ZFS,
- DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
- mutex_exit(&sdp->sd_lock);
- ZFS_EXIT(zfsvfs);
- return (ENOENT);
- }
-
- sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
- sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
- (void) strcpy(sep->se_name, nm);
- *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
- VN_HOLD(*vpp);
- avl_insert(&sdp->sd_snaps, sep, where);
-
- dmu_objset_close(snap);
-domount:
- mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
- strlen("/.zfs/snapshot/") + strlen(nm) + 1;
- mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
- (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
- dvp->v_vfsp->mnt_stat.f_mntonname, nm);
- err = domount(curthread, *vpp, "zfs", mountpoint, snapname, 0);
- kmem_free(mountpoint, mountpoint_len);
- /* FreeBSD: This line was moved from below to avoid a lock recursion. */
- if (err == 0)
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- mutex_exit(&sdp->sd_lock);
-
- /*
- * If we had an error, drop our hold on the vnode and
- * zfsctl_snapshot_inactive() will clean up.
- */
- if (err) {
- VN_RELE(*vpp);
- *vpp = NULL;
- }
- return (err);
-}
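-
-/*
- * Example of the mountpoint constructed above (hypothetical names): for
- * a filesystem mounted on "/tank" and a snapshot named "monday", the
- * snapshot dataset is mounted on "/tank/.zfs/snapshot/monday".
- */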
-
-/* ARGSUSED */
-static int
-zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
- offset_t *offp, offset_t *nextp, void *data)
-{
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- char snapname[MAXNAMELEN];
- uint64_t id, cookie;
-
- ZFS_ENTER(zfsvfs);
-
- cookie = *offp;
- if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
- &cookie) == ENOENT) {
- *eofp = 1;
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- (void) strcpy(dp->d_name, snapname);
- dp->d_ino = ZFSCTL_INO_SNAP(id);
- *nextp = cookie;
-
- ZFS_EXIT(zfsvfs);
-
- return (0);
-}
-
-vnode_t *
-zfsctl_mknode_snapdir(vnode_t *pvp)
-{
- vnode_t *vp;
- zfsctl_snapdir_t *sdp;
-
- vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp,
- &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
- zfsctl_snapdir_readdir_cb, NULL);
- sdp = vp->v_data;
- sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
- sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
- mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&sdp->sd_snaps, snapentry_compare,
- sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
- return (vp);
-}
-
-/* ARGSUSED */
-static int
-zfsctl_snapdir_getattr(ap)
- struct vop_getattr_args /* {
- struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- struct vnode *vp = ap->a_vp;
- struct vattr *vap = ap->a_vap;
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- zfsctl_snapdir_t *sdp = vp->v_data;
-
- ZFS_ENTER(zfsvfs);
- zfsctl_common_getattr(vp, vap);
- vap->va_nodeid = gfs_file_inode(vp);
- vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
- ZFS_EXIT(zfsvfs);
-
- return (0);
-}
-
-/* ARGSUSED */
-static int
-zfsctl_snapdir_inactive(ap)
- struct vop_inactive_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- zfsctl_snapdir_t *sdp = vp->v_data;
- void *private;
-
- private = gfs_dir_inactive(vp);
- if (private != NULL) {
- ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
- mutex_destroy(&sdp->sd_lock);
- avl_destroy(&sdp->sd_snaps);
- kmem_free(private, sizeof (zfsctl_snapdir_t));
- }
- return (0);
-}
-
-static struct vop_vector zfsctl_ops_snapdir = {
- .vop_default = &default_vnodeops,
- .vop_open = zfsctl_common_open,
- .vop_close = zfsctl_common_close,
- .vop_ioctl = VOP_EINVAL,
- .vop_getattr = zfsctl_snapdir_getattr,
- .vop_access = zfsctl_common_access,
- .vop_readdir = gfs_vop_readdir,
- .vop_lookup = zfsctl_snapdir_lookup,
- .vop_inactive = zfsctl_snapdir_inactive,
- .vop_reclaim = zfsctl_common_reclaim,
- .vop_fid = zfsctl_common_fid,
-};
-
-static vnode_t *
-zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
-{
- vnode_t *vp;
- zfsctl_node_t *zcp;
-
- vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
- &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
- zcp = vp->v_data;
- zcp->zc_id = objset;
-
- return (vp);
-}
-
-static int
-zfsctl_snapshot_inactive(ap)
- struct vop_inactive_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- struct vop_inactive_args iap;
- zfsctl_snapdir_t *sdp;
- zfs_snapentry_t *sep, *next;
- int locked;
- vnode_t *dvp;
-
- VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
- sdp = dvp->v_data;
- VOP_UNLOCK(dvp, 0);
-
- if (!(locked = MUTEX_HELD(&sdp->sd_lock)))
- mutex_enter(&sdp->sd_lock);
-
- if (vp->v_count > 1) {
- if (!locked)
- mutex_exit(&sdp->sd_lock);
- return (0);
- }
- ASSERT(!vn_ismntpt(vp));
-
- sep = avl_first(&sdp->sd_snaps);
- while (sep != NULL) {
- next = AVL_NEXT(&sdp->sd_snaps, sep);
-
- if (sep->se_root == vp) {
- avl_remove(&sdp->sd_snaps, sep);
- kmem_free(sep->se_name, strlen(sep->se_name) + 1);
- kmem_free(sep, sizeof (zfs_snapentry_t));
- break;
- }
- sep = next;
- }
- ASSERT(sep != NULL);
-
- if (!locked)
- mutex_exit(&sdp->sd_lock);
- VN_RELE(dvp);
-
- /*
- * Dispose of the vnode for the snapshot mount point.
- * This is safe to do because once this entry has been removed
- * from the AVL tree, it can't be found again, so cannot become
- * "active". If we lookup the same name again we will end up
- * creating a new vnode.
- */
- iap.a_vp = vp;
- return (gfs_vop_inactive(&iap));
-}
-
-static int
-zfsctl_traverse_begin(vnode_t **vpp, int lktype, kthread_t *td)
-{
-
- VN_HOLD(*vpp);
- /* Snapshot should be already mounted, but just in case. */
- if (vn_mountedvfs(*vpp) == NULL)
- return (ENOENT);
- return (traverse(vpp, lktype));
-}
-
-static void
-zfsctl_traverse_end(vnode_t *vp, int err)
-{
-
- if (err == 0)
- vput(vp);
- else
- VN_RELE(vp);
-}
-
-static int
-zfsctl_snapshot_getattr(ap)
- struct vop_getattr_args /* {
- struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- int err;
-
- err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY, ap->a_td);
- if (err == 0)
- err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td);
- zfsctl_traverse_end(vp, err);
- return (err);
-}
-
-static int
-zfsctl_snapshot_fid(ap)
- struct vop_fid_args /* {
- struct vnode *a_vp;
- struct fid *a_fid;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- int err;
-
- err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY, curthread);
- if (err == 0)
- err = VOP_VPTOFH(vp, (void *)ap->a_fid);
- zfsctl_traverse_end(vp, err);
- return (err);
-}
-
-/*
- * These VP's should never see the light of day. They should always
- * be covered.
- */
-static struct vop_vector zfsctl_ops_snapshot = {
- .vop_default = &default_vnodeops,
- .vop_inactive = zfsctl_snapshot_inactive,
- .vop_reclaim = zfsctl_common_reclaim,
- .vop_getattr = zfsctl_snapshot_getattr,
- .vop_fid = zfsctl_snapshot_fid,
-};
-
-int
-zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- vnode_t *dvp, *vp;
- zfsctl_snapdir_t *sdp;
- zfsctl_node_t *zcp;
- zfs_snapentry_t *sep;
- int error;
-
- ASSERT(zfsvfs->z_ctldir != NULL);
- error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
- NULL, 0, NULL, kcred);
- if (error != 0)
- return (error);
- sdp = dvp->v_data;
-
- mutex_enter(&sdp->sd_lock);
- sep = avl_first(&sdp->sd_snaps);
- while (sep != NULL) {
- vp = sep->se_root;
- zcp = vp->v_data;
- if (zcp->zc_id == objsetid)
- break;
-
- sep = AVL_NEXT(&sdp->sd_snaps, sep);
- }
-
- if (sep != NULL) {
- VN_HOLD(vp);
- error = traverse(&vp, LK_SHARED | LK_RETRY);
- if (error == 0) {
- if (vp == sep->se_root)
- error = EINVAL;
- else
- *zfsvfsp = VTOZ(vp)->z_zfsvfs;
- }
- mutex_exit(&sdp->sd_lock);
- if (error == 0)
- VN_URELE(vp);
- else
- VN_RELE(vp);
- } else {
- error = EINVAL;
- mutex_exit(&sdp->sd_lock);
- }
-
- VN_RELE(dvp);
-
- return (error);
-}
-
-/*
- * Unmount any snapshots for the given filesystem. This is called from
- * zfs_umount() - if we have a ctldir, then go through and unmount all the
- * snapshots.
- */
-int
-zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
-{
- struct vop_inactive_args ap;
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- vnode_t *dvp, *svp;
- zfsctl_snapdir_t *sdp;
- zfs_snapentry_t *sep, *next;
- int error;
-
- ASSERT(zfsvfs->z_ctldir != NULL);
- error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
- NULL, 0, NULL, cr);
- if (error != 0)
- return (error);
- sdp = dvp->v_data;
-
- mutex_enter(&sdp->sd_lock);
-
- sep = avl_first(&sdp->sd_snaps);
- while (sep != NULL) {
- svp = sep->se_root;
- next = AVL_NEXT(&sdp->sd_snaps, sep);
-
- /*
- * If this snapshot is not mounted, then it must
- * have just been unmounted by somebody else, and
- * will be cleaned up by zfsctl_snapdir_inactive().
- */
- if (vn_ismntpt(svp)) {
- if ((error = vn_vfswlock(svp)) != 0)
- goto out;
-
- /*
- * Increase usecount, so dounmount() won't vrele() it
- * to 0 and call zfsctl_snapdir_inactive().
- */
- VN_HOLD(svp);
- vfsp = vn_mountedvfs(svp);
- mtx_lock(&Giant);
- error = dounmount(vfsp, fflags, curthread);
- mtx_unlock(&Giant);
- if (error != 0) {
- VN_RELE(svp);
- goto out;
- }
-
- avl_remove(&sdp->sd_snaps, sep);
- kmem_free(sep->se_name, strlen(sep->se_name) + 1);
- kmem_free(sep, sizeof (zfs_snapentry_t));
-
- /*
- * We can't use VN_RELE(), as that will try to
- * invoke zfsctl_snapdir_inactive(), and that
- * would lead to an attempt to re-grab the sd_lock.
- */
- ASSERT3U(svp->v_count, ==, 1);
- ap.a_vp = svp;
- gfs_vop_inactive(&ap);
- }
- sep = next;
- }
-out:
- mutex_exit(&sdp->sd_lock);
- VN_RELE(dvp);
-
- return (error);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
deleted file mode 100644
index f233b8f..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
+++ /dev/null
@@ -1,797 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/resource.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/kmem.h>
-#include <sys/uio.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/stat.h>
-#include <sys/unistd.h>
-#include <sys/random.h>
-#include <sys/policy.h>
-#include <sys/kcondvar.h>
-#include <sys/callb.h>
-#include <sys/smp.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/fs/zfs.h>
-#include <sys/zap.h>
-#include <sys/dmu.h>
-#include <sys/atomic.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/dnlc.h>
-
-/*
- * Lock a directory entry. A dirlock on <dzp, name> protects that name
- * in dzp's directory zap object. As long as you hold a dirlock, you can
- * assume two things: (1) dzp cannot be reaped, and (2) no other thread
- * can change the zap entry for (i.e. link or unlink) this name.
- *
- * Input arguments:
- *     dzp     - znode for directory
- *     name    - name of entry to lock
- *     flag    - ZNEW: if the entry already exists, fail with EEXIST.
- *               ZEXISTS: if the entry does not exist, fail with ENOENT.
- *               ZSHARED: allow concurrent access with other ZSHARED callers.
- *               ZXATTR: we want dzp's xattr directory
- *
- * Output arguments:
- *     zpp     - pointer to the znode for the entry (NULL if there isn't one)
- *     dlpp    - pointer to the dirlock for this entry (NULL on error)
- *
- * Return value: 0 on success or errno on failure.
- *
- * NOTE: Always checks for, and rejects, '.' and '..'.
- */
-int
-zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
- int flag)
-{
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zfs_dirlock_t *dl;
- uint64_t zoid;
- int error;
- vnode_t *vp;
-
- *zpp = NULL;
- *dlpp = NULL;
-
- /*
- * Verify that we are not trying to lock '.', '..', or '.zfs'
- */
- if (name[0] == '.' &&
- (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
- zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
- return (EEXIST);
-
- /*
- * Wait until there are no locks on this name.
- */
- rw_enter(&dzp->z_name_lock, RW_READER);
- mutex_enter(&dzp->z_lock);
- for (;;) {
- if (dzp->z_unlinked) {
- mutex_exit(&dzp->z_lock);
- rw_exit(&dzp->z_name_lock);
- return (ENOENT);
- }
- for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next)
- if (strcmp(name, dl->dl_name) == 0)
- break;
- if (dl == NULL) {
- /*
- * Allocate a new dirlock and add it to the list.
- */
- dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
- cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
- dl->dl_name = name;
- dl->dl_sharecnt = 0;
- dl->dl_namesize = 0;
- dl->dl_dzp = dzp;
- dl->dl_next = dzp->z_dirlocks;
- dzp->z_dirlocks = dl;
- break;
- }
- if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
- break;
- cv_wait(&dl->dl_cv, &dzp->z_lock);
- }
-
- if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
- /*
- * We're the second shared reference to dl. Make a copy of
- * dl_name in case the first thread goes away before we do.
- * Note that we initialize the new name before storing its
- * pointer into dl_name, because the first thread may load
- * dl->dl_name at any time. He'll either see the old value,
- * which is his, or the new shared copy; either is OK.
- */
- dl->dl_namesize = strlen(dl->dl_name) + 1;
- name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
- bcopy(dl->dl_name, name, dl->dl_namesize);
- dl->dl_name = name;
- }
-
- mutex_exit(&dzp->z_lock);
-
- /*
- * We have a dirlock on the name. (Note that it is the dirlock,
- * not the dzp's z_lock, that protects the name in the zap object.)
- * See if there's an object by this name; if so, put a hold on it.
- */
- if (flag & ZXATTR) {
- zoid = dzp->z_phys->zp_xattr;
- error = (zoid == 0 ? ENOENT : 0);
- } else {
- vp = dnlc_lookup(ZTOV(dzp), name);
- if (vp == DNLC_NO_VNODE) {
- VN_RELE(vp);
- error = ENOENT;
- } else if (vp) {
- if (flag & ZNEW) {
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- return (EEXIST);
- }
- *dlpp = dl;
- *zpp = VTOZ(vp);
- return (0);
- } else {
- error = zap_lookup(zfsvfs->z_os, dzp->z_id, name,
- 8, 1, &zoid);
- zoid = ZFS_DIRENT_OBJ(zoid);
- if (error == ENOENT)
- dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
- }
- }
- if (error) {
- if (error != ENOENT || (flag & ZEXISTS)) {
- zfs_dirent_unlock(dl);
- return (error);
- }
- } else {
- if (flag & ZNEW) {
- zfs_dirent_unlock(dl);
- return (EEXIST);
- }
- error = zfs_zget(zfsvfs, zoid, zpp);
- if (error) {
- zfs_dirent_unlock(dl);
- return (error);
- }
- if (!(flag & ZXATTR))
- dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
- }
-
- *dlpp = dl;
-
- return (0);
-}
-
-/*
- * Unlock this directory entry and wake anyone who was waiting for it.
- */
-void
-zfs_dirent_unlock(zfs_dirlock_t *dl)
-{
- znode_t *dzp = dl->dl_dzp;
- zfs_dirlock_t **prev_dl, *cur_dl;
-
- mutex_enter(&dzp->z_lock);
- rw_exit(&dzp->z_name_lock);
- if (dl->dl_sharecnt > 1) {
- dl->dl_sharecnt--;
- mutex_exit(&dzp->z_lock);
- return;
- }
- prev_dl = &dzp->z_dirlocks;
- while ((cur_dl = *prev_dl) != dl)
- prev_dl = &cur_dl->dl_next;
- *prev_dl = dl->dl_next;
- cv_broadcast(&dl->dl_cv);
- mutex_exit(&dzp->z_lock);
-
- if (dl->dl_namesize != 0)
- kmem_free(dl->dl_name, dl->dl_namesize);
- cv_destroy(&dl->dl_cv);
- kmem_free(dl, sizeof (*dl));
-}
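-
-/*
- * Illustrative lock/unlock pairing (a sketch; error handling
- * abbreviated). zfs_dirlook() below follows this pattern:
- *
- *     zfs_dirlock_t *dl;
- *     znode_t *zp;
- *
- *     error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED);
- *     if (error == 0) {
- *             *vpp = ZTOV(zp);
- *             zfs_dirent_unlock(dl);
- *     }
- */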
-
-/*
- * Look up an entry in a directory.
- *
- * NOTE: '.' and '..' are handled as special cases because
- * no directory entries are actually stored for them. If this is
- * the root of a filesystem, then '.zfs' is also treated as a
- * special pseudo-directory.
- */
-int
-zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp)
-{
- zfs_dirlock_t *dl;
- znode_t *zp;
- int error = 0;
-
- if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
- *vpp = ZTOV(dzp);
- VN_HOLD(*vpp);
- } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- /*
- * If we are a snapshot mounted under .zfs, return
- * the vp for the snapshot directory.
- */
- if (dzp->z_phys->zp_parent == dzp->z_id &&
- zfsvfs->z_parent != zfsvfs) {
- error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
- "snapshot", vpp, NULL, 0, NULL, kcred);
- return (error);
- }
- rw_enter(&dzp->z_parent_lock, RW_READER);
- error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
- if (error == 0)
- *vpp = ZTOV(zp);
- rw_exit(&dzp->z_parent_lock);
- } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
- *vpp = zfsctl_root(dzp);
- } else {
- error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED);
- if (error == 0) {
- *vpp = ZTOV(zp);
- zfs_dirent_unlock(dl);
- dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
- }
- }
-
- return (error);
-}
-
-static char *
-zfs_unlinked_hexname(char namebuf[17], uint64_t x)
-{
- char *name = &namebuf[16];
- const char digits[16] = "0123456789abcdef";
-
- *name = '\0';
- do {
- *--name = digits[x & 0xf];
- x >>= 4;
- } while (x != 0);
-
- return (name);
-}
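-
-/*
- * For example: zfs_unlinked_hexname(buf, 0x1a2b) returns "1a2b", and an
- * id of 0 yields "0" (the do/while emits at least one digit). The 17-byte
- * buffer covers the 16 hex digits of a 64-bit id plus the terminating NUL.
- */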
-
-/*
- * unlinked Set (formerly known as the "delete queue") Error Handling
- *
- * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
- * don't specify the name of the entry that we will be manipulating. We
- * also fib and say that we won't be adding any new entries to the
- * unlinked set, even though we might (this is to lower the minimum file
- * size that can be deleted in a full filesystem). So on the small
- * chance that the nlink list is using a fat zap (i.e. has more than
- * 2000 entries), we *may* not pre-read a block that's needed.
- * Therefore it is remotely possible for some of the assertions
- * regarding the unlinked set below to fail due to i/o error. On a
- * nondebug system, this will result in the space being leaked.
- */
-void
-zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- char obj_name[17];
- int error;
-
- ASSERT(zp->z_unlinked);
- ASSERT3U(zp->z_phys->zp_links, ==, 0);
-
- error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
- zfs_unlinked_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx);
- ASSERT3U(error, ==, 0);
-}
-
-/*
- * Clean up any znodes that had no links when we either crashed or
- * (force) umounted the file system.
- */
-void
-zfs_unlinked_drain(zfsvfs_t *zfsvfs)
-{
- zap_cursor_t zc;
- zap_attribute_t zap;
- dmu_object_info_t doi;
- znode_t *zp;
- int error;
-
- /*
- * Iterate over the contents of the unlinked set.
- */
- for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
- zap_cursor_retrieve(&zc, &zap) == 0;
- zap_cursor_advance(&zc)) {
-
- /*
- * See what kind of object we have in the list
- */
-
- error = dmu_object_info(zfsvfs->z_os,
- zap.za_first_integer, &doi);
- if (error != 0)
- continue;
-
- ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
- (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
- /*
- * We need to re-mark these list entries for deletion,
- * so we pull them back into core and set zp->z_unlinked.
- */
- error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
-
- /*
- * We may pick up znodes that are already marked for deletion.
- * This could happen during the purge of an extended attribute
- * directory. All we need to do is skip over them, since they
- * are already in the system marked z_unlinked.
- */
- if (error != 0)
- continue;
-
- zp->z_unlinked = B_TRUE;
- VN_RELE(ZTOV(zp));
- }
- zap_cursor_fini(&zc);
-}
-
-/*
- * Delete the entire contents of a directory. Return a count
- * of the number of entries that could not be deleted.
- *
- * NOTE: this function assumes that the directory is inactive,
- * so there is no need to lock its entries before deletion.
- * Also, it assumes the directory's contents are *only* regular
- * files.
- */
-static int
-zfs_purgedir(znode_t *dzp)
-{
- zap_cursor_t zc;
- zap_attribute_t zap;
- znode_t *xzp;
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zfs_dirlock_t dl;
- int skipped = 0;
- int error;
-
- for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
- (error = zap_cursor_retrieve(&zc, &zap)) == 0;
- zap_cursor_advance(&zc)) {
- error = zfs_zget(zfsvfs,
- ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
- ASSERT3U(error, ==, 0);
-
- ASSERT((ZTOV(xzp)->v_type == VREG) ||
- (ZTOV(xzp)->v_type == VLNK));
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, dzp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
- dmu_tx_hold_bonus(tx, xzp->z_id);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- VN_RELE(ZTOV(xzp));
- skipped += 1;
- continue;
- }
- bzero(&dl, sizeof (dl));
- dl.dl_dzp = dzp;
- dl.dl_name = zap.za_name;
-
- error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
- ASSERT3U(error, ==, 0);
- dmu_tx_commit(tx);
-
- VN_RELE(ZTOV(xzp));
- }
- zap_cursor_fini(&zc);
- ASSERT(error == ENOENT);
- return (skipped);
-}
-
-void
-zfs_rmnode(znode_t *zp)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- objset_t *os = zfsvfs->z_os;
- znode_t *xzp = NULL;
- char obj_name[17];
- dmu_tx_t *tx;
- uint64_t acl_obj;
- int error;
- int vfslocked;
-
- vfslocked = VFS_LOCK_GIANT(zfsvfs->z_vfs);
-
- ASSERT(zp->z_phys->zp_links == 0);
-
- /*
- * If this is an attribute directory, purge its contents.
- */
- if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
- (zp->z_phys->zp_flags & ZFS_XATTR)) {
- if (zfs_purgedir(zp) != 0) {
- /*
- * Not enough space to delete some xattrs.
- * Leave it on the unlinked set.
- */
- VFS_UNLOCK_GIANT(vfslocked);
- return;
- }
- }
-
- /*
- * If the file has extended attributes, we're going to unlink
- * the xattr dir.
- */
- if (zp->z_phys->zp_xattr) {
- error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
- ASSERT(error == 0);
- }
-
- acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
-
- /*
- * Set up the transaction.
- */
- tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- if (xzp) {
- dmu_tx_hold_bonus(tx, xzp->z_id);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
- }
- if (acl_obj)
- dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- /*
- * Not enough space to delete the file. Leave it in the
- * unlinked set, leaking it until the fs is remounted (at
- * which point we'll call zfs_unlinked_drain() to process it).
- */
- dmu_tx_abort(tx);
- VFS_UNLOCK_GIANT(vfslocked);
- return;
- }
-
- if (xzp) {
- dmu_buf_will_dirty(xzp->z_dbuf, tx);
- mutex_enter(&xzp->z_lock);
- xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
- xzp->z_phys->zp_links = 0; /* no more links to it */
- mutex_exit(&xzp->z_lock);
- zfs_unlinked_add(xzp, tx);
- }
-
- /* Remove this znode from the unlinked set */
- error = zap_remove(os, zfsvfs->z_unlinkedobj,
- zfs_unlinked_hexname(obj_name, zp->z_id), tx);
- ASSERT3U(error, ==, 0);
-
- zfs_znode_delete(zp, tx);
-
- dmu_tx_commit(tx);
-
- if (xzp)
- VN_RELE(ZTOV(xzp));
- VFS_UNLOCK_GIANT(vfslocked);
-}
-
-/*
- * Link zp into dl. Can only fail if zp has been unlinked.
- */
-int
-zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
-{
- znode_t *dzp = dl->dl_dzp;
- vnode_t *vp = ZTOV(zp);
- uint64_t value;
- int zp_is_dir = (vp->v_type == VDIR);
- int error;
-
- dmu_buf_will_dirty(zp->z_dbuf, tx);
- mutex_enter(&zp->z_lock);
-
- if (!(flag & ZRENAMING)) {
- if (zp->z_unlinked) { /* no new links to unlinked zp */
- ASSERT(!(flag & (ZNEW | ZEXISTS)));
- mutex_exit(&zp->z_lock);
- return (ENOENT);
- }
- zp->z_phys->zp_links++;
- }
- zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */
-
- if (!(flag & ZNEW))
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
- mutex_exit(&zp->z_lock);
-
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
- mutex_enter(&dzp->z_lock);
- dzp->z_phys->zp_size++; /* one dirent added */
- dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */
- zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
- mutex_exit(&dzp->z_lock);
-
- /*
- * MacOS X will fill in the 4-bit object type here.
- */
- value = ZFS_DIRENT_MAKE(IFTODT(zp->z_phys->zp_mode), zp->z_id);
- error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
- 8, 1, &value, tx);
- ASSERT(error == 0);
-
- dnlc_update(ZTOV(dzp), dl->dl_name, vp);
-
- return (0);
-}
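-
-/*
- * For illustration: the zap value stored above packs the entry's object
- * number together with IFTODT()'s directory-entry type in the high bits;
- * ZFS_DIRENT_OBJ() (used by zfs_dirent_lock() and zfs_purgedir()) masks
- * the type back off to recover the bare object number.
- */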
-
-/*
- * Unlink zp from dl, and mark zp for deletion if this was the last link.
- * Can fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY).
- * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
- * If it's non-NULL, we use it to indicate whether the znode needs deletion,
- * and it's the caller's job to do it.
- */
-int
-zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
- boolean_t *unlinkedp)
-{
- znode_t *dzp = dl->dl_dzp;
- vnode_t *vp = ZTOV(zp);
- int zp_is_dir = (vp->v_type == VDIR);
- boolean_t unlinked = B_FALSE;
- int error;
-
- dnlc_remove(ZTOV(dzp), dl->dl_name);
-
- if (!(flag & ZRENAMING)) {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
-
- if (vn_vfswlock(vp)) /* prevent new mounts on zp */
- return (EBUSY);
-
- if (vn_ismntpt(vp)) { /* don't remove mount point */
- vn_vfsunlock(vp);
- return (EBUSY);
- }
-
- mutex_enter(&zp->z_lock);
- if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */
- mutex_exit(&zp->z_lock);
- vn_vfsunlock(vp);
- return (ENOTEMPTY);
- }
- if (zp->z_phys->zp_links <= zp_is_dir) {
- zfs_panic_recover("zfs: link count on vnode %p is %u, "
- "should be at least %u", zp->z_vnode,
- (int)zp->z_phys->zp_links,
- zp_is_dir + 1);
- zp->z_phys->zp_links = zp_is_dir + 1;
- }
- if (--zp->z_phys->zp_links == zp_is_dir) {
- zp->z_unlinked = B_TRUE;
- zp->z_phys->zp_links = 0;
- unlinked = B_TRUE;
- } else {
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
- }
- mutex_exit(&zp->z_lock);
- vn_vfsunlock(vp);
- }
-
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
- mutex_enter(&dzp->z_lock);
- dzp->z_phys->zp_size--; /* one dirent removed */
- dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */
- zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
- mutex_exit(&dzp->z_lock);
-
- error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, tx);
- ASSERT(error == 0);
-
- if (unlinkedp != NULL)
- *unlinkedp = unlinked;
- else if (unlinked)
- zfs_unlinked_add(zp, tx);
-
- return (0);
-}
-
-/*
- * Indicate whether the directory is empty. Works with or without z_lock
- * held, but can only be considered a hint in the latter case. Returns true
- * if only "." and ".." remain and there's no work in progress.
- */
-boolean_t
-zfs_dirempty(znode_t *dzp)
-{
- return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0);
-}
-
-int
-zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- znode_t *xzp;
- dmu_tx_t *tx;
- uint64_t xoid;
- int error;
-
- *xvpp = NULL;
-
- if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, cr))
- return (error);
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- return (error);
- }
- zfs_mknode(zp, vap, &xoid, tx, cr, IS_XATTR, &xzp, 0);
- ASSERT(xzp->z_id == xoid);
- ASSERT(xzp->z_phys->zp_parent == zp->z_id);
- dmu_buf_will_dirty(zp->z_dbuf, tx);
- zp->z_phys->zp_xattr = xoid;
-
- (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "");
- dmu_tx_commit(tx);
-
- *xvpp = ZTOV(xzp);
-
- return (0);
-}
-
-/*
- * Return a znode for the extended attribute directory for zp.
- * ** If the directory does not already exist, it is created **
- *
- * IN:  zp      - znode to obtain attribute directory from
- *      cr      - credentials of caller
- *      flags   - flags from the VOP_LOOKUP call
- *
- * OUT: xvpp    - pointer to the vnode of the extended attribute directory
- *
- * RETURN: 0 on success
- *         error number on failure
- */
-int
-zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- znode_t *xzp;
- zfs_dirlock_t *dl;
- vattr_t va;
- int error;
-top:
- error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR);
- if (error)
- return (error);
-
- if (xzp != NULL) {
- *xvpp = ZTOV(xzp);
- zfs_dirent_unlock(dl);
- return (0);
- }
-
- ASSERT(zp->z_phys->zp_xattr == 0);
-
-#ifdef TODO
- if (!(flags & CREATE_XATTR_DIR)) {
- zfs_dirent_unlock(dl);
- return (ENOENT);
- }
-#endif
-
- if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
- zfs_dirent_unlock(dl);
- return (EROFS);
- }
-
- /*
- * The ability to 'create' files in an attribute
- * directory comes from the write_xattr permission on the base file.
- *
- * The ability to 'search' an attribute directory requires
- * read_xattr permission on the base file.
- *
- * Once in a directory the ability to read/write attributes
- * is controlled by the permissions on the attribute file.
- */
- va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
- va.va_type = VDIR;
- va.va_mode = S_IFDIR | S_ISVTX | 0777;
- va.va_uid = (uid_t)zp->z_phys->zp_uid;
- va.va_gid = (gid_t)zp->z_phys->zp_gid;
-
- error = zfs_make_xattrdir(zp, &va, xvpp, cr);
- zfs_dirent_unlock(dl);
-
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- /* NB: we already did dmu_tx_wait() if necessary */
- goto top;
- }
-
- return (error);
-}
-
-/*
- * Decide whether it is okay to remove within a sticky directory.
- *
- * In sticky directories, write access is not sufficient;
- * you can remove entries from a directory only if:
- *
- *     you own the directory,
- *     you own the entry,
- *     the entry is a plain file and you have write access,
- *     or you are privileged (checked in secpolicy...).
- *
- * The function returns 0 if remove access is granted.
- */
-int
-zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
-{
- uid_t uid;
-
- if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */
- return (0);
-
- if ((zdp->z_phys->zp_mode & S_ISVTX) == 0 ||
- (uid = crgetuid(cr)) == zdp->z_phys->zp_uid ||
- uid == zp->z_phys->zp_uid ||
- (ZTOV(zp)->v_type == VREG &&
- zfs_zaccess(zp, ACE_WRITE_DATA, cr) == 0))
- return (0);
- else
- return (secpolicy_vnode_remove(cr));
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
deleted file mode 100644
index e2385a0..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
+++ /dev/null
@@ -1,335 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-
-#include <sys/fm/fs/zfs.h>
-#include <sys/fm/protocol.h>
-#include <sys/fm/util.h>
-
-#ifdef _KERNEL
-/* Including sys/bus.h is just too hard, so I declare what I need here. */
-extern void devctl_notify(const char *__system, const char *__subsystem,
- const char *__type, const char *__data);
-#endif
-
-/*
- * This general routine is responsible for generating all the different ZFS
- * ereports. The payload is dependent on the class, and which arguments are
- * supplied to the function:
- *
- *     EREPORT                 POOL    VDEV    IO
- *     block                   X       X       X
- *     data                    X               X
- *     device                  X       X
- *     pool                    X
- *
- * If we are in a loading state, all errors are chained together by the same
- * SPA-wide ENA.
- *
- * For isolated I/O requests, we get the ENA from the zio_t. The propagation
- * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
- * to chain together all ereports associated with a logical piece of data. For
- * read I/Os, there are basically three 'types' of I/O, which form a roughly
- * layered diagram:
- *
- *     +---------------+
- *     | Aggregate I/O |  No associated logical data or device
- *     +---------------+
- *             |
- *             V
- *     +---------------+  Reads associated with a piece of logical data.
- *     |   Read I/O    |  This includes reads on behalf of RAID-Z,
- *     +---------------+  mirrors, gang blocks, retries, etc.
- *             |
- *             V
- *     +---------------+  Reads associated with a particular device, but
- *     |  Physical I/O |  no logical data. Issued as part of vdev caching
- *     +---------------+  and I/O aggregation.
- *
- * Note that 'physical I/O' here is not the same terminology as used in the rest
- * of ZIO. Typically, 'physical I/O' simply means that there is no attached
- * block pointer. But I/O with no associated block pointer can still be related
- * to a logical piece of data (i.e. RAID-Z requests).
- *
- * Purely physical I/Os always have unique ENAs. They are not related to a
- * particular piece of logical data, and therefore cannot be chained together.
- * We still generate an ereport, but the DE doesn't correlate it with any
- * logical piece of data. When such an I/O fails, the delegated I/O requests
- * will issue a retry, which will trigger the 'real' ereport with the correct
- * ENA.
- *
- * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
- * When a new logical I/O is issued, we set this to point to itself. Child I/Os
- * then inherit this pointer, so that when it is first set subsequent failures
- * will use the same ENA. If a physical I/O is issued (by passing the
- * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a
- * unique ENA will be generated. For an aggregate I/O, this pointer is set to
- * NULL, and no ereport will be generated (since it doesn't actually correspond
- * to any particular device or piece of data).
- */
-void
-zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
- uint64_t stateoroffset, uint64_t size)
-{
-#ifdef _KERNEL
- char buf[1024];
- struct sbuf sb;
- struct timespec ts;
-
- /*
- * If we are doing a spa_tryimport(), ignore errors.
- */
- if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
- return;
-
- /*
- * If we are in the middle of opening a pool, and the previous attempt
- * failed, don't bother logging any new ereports - we're just going to
- * get the same diagnosis anyway.
- */
- if (spa->spa_load_state != SPA_LOAD_NONE &&
- spa->spa_last_open_failed)
- return;
-
- /*
- * Ignore any errors from I/Os that we are going to retry anyway - we
- * only generate errors from the final failure.
- */
- if (zio && zio_should_retry(zio))
- return;
-
- /*
- * If this is not a read or write zio, ignore the error. This can occur
- * if the DKIOCFLUSHWRITECACHE ioctl fails.
- */
- if (zio && zio->io_type != ZIO_TYPE_READ &&
- zio->io_type != ZIO_TYPE_WRITE)
- return;
-
- nanotime(&ts);
-
- sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
- sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);
-
- /*
- * Serialize ereport generation
- */
- mutex_enter(&spa->spa_errlist_lock);
-
-#if 0
- /*
- * Determine the ENA to use for this event. If we are in a loading
- * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
- * a root zio-wide ENA. Otherwise, simply use a unique ENA.
- */
- if (spa->spa_load_state != SPA_LOAD_NONE) {
-#if 0
- if (spa->spa_ena == 0)
- spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
-#endif
- ena = spa->spa_ena;
- } else if (zio != NULL && zio->io_logical != NULL) {
-#if 0
- if (zio->io_logical->io_ena == 0)
- zio->io_logical->io_ena =
- fm_ena_generate(0, FM_ENA_FMT1);
-#endif
- ena = zio->io_logical->io_ena;
- } else {
-#if 0
- ena = fm_ena_generate(0, FM_ENA_FMT1);
-#else
- ena = 0;
-#endif
- }
-#endif
-
- /*
- * Construct the full class, detector, and other standard FMA fields.
- */
- sbuf_printf(&sb, " ereport_version=%u", FM_EREPORT_VERSION);
- sbuf_printf(&sb, " class=%s.%s", ZFS_ERROR_CLASS, subclass);
-
- sbuf_printf(&sb, " zfs_scheme_version=%u", FM_ZFS_SCHEME_VERSION);
-
- /*
- * Construct the per-ereport payload, depending on which parameters are
- * passed in.
- */
-
- /*
- * Generic payload members common to all ereports.
- *
- * The direct reference to spa_name is used rather than spa_name()
- * because of the asynchronous nature of the zio pipeline. spa_name()
- * asserts that the config lock is held in some form. This is always
- * the case in I/O context, but because the check for RW_WRITER compares
- * against 'curthread', we may be in an asynchronous context and blow
- * this assert. Rather than loosen this assert, we acknowledge that all
- * contexts in which this function is called (pool open, I/O) are safe,
- * and dereference the name directly.
- */
- sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa->spa_name);
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
- spa_guid(spa));
- sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT,
- spa->spa_load_state);
-
- if (vd != NULL) {
- vdev_t *pvd = vd->vdev_parent;
-
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
- vd->vdev_guid);
- sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
- vd->vdev_ops->vdev_op_type);
- if (vd->vdev_path)
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path);
- if (vd->vdev_devid)
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid);
-
- if (pvd != NULL) {
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, pvd->vdev_guid);
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
- pvd->vdev_ops->vdev_op_type);
- if (pvd->vdev_path)
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
- pvd->vdev_path);
- if (pvd->vdev_devid)
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
- pvd->vdev_devid);
- }
- }
-
- if (zio != NULL) {
- /*
- * Payload common to all I/Os.
- */
- sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
- zio->io_error);
-
- /*
- * If the 'size' parameter is non-zero, it indicates this is a
- * RAID-Z or other I/O where the physical offset and length are
- * provided for us, instead of within the zio_t.
- */
- if (vd != NULL) {
- if (size) {
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
- stateoroffset);
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, size);
- } else {
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
- zio->io_offset);
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
- zio->io_size);
- }
- }
-
- /*
- * Payload for I/Os with corresponding logical information.
- */
- if (zio->io_logical != NULL) {
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
- zio->io_logical->io_bookmark.zb_object);
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
- zio->io_logical->io_bookmark.zb_level);
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
- zio->io_logical->io_bookmark.zb_blkid);
- }
- } else if (vd != NULL) {
- /*
- * If we have a vdev but no zio, this is a device fault, and the
- * 'stateoroffset' parameter indicates the previous state of the
- * vdev.
- */
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
- stateoroffset);
- }
- mutex_exit(&spa->spa_errlist_lock);
-
- sbuf_finish(&sb);
- ZFS_LOG(1, "%s", sbuf_data(&sb));
- devctl_notify("ZFS", spa->spa_name, subclass, sbuf_data(&sb));
- if (sbuf_overflowed(&sb))
- printf("ZFS WARNING: sbuf overflowed\n");
- sbuf_delete(&sb);
-#endif
-}
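-
-/*
- * Rough shape of the string handed to devctl_notify() above: one long
- * space-separated line of key=value pairs (values elided here):
- *
- *     time=<sec>.<nsec> ereport_version=<n> class=<error class>.<subclass>
- *     zfs_scheme_version=<n>, followed by the pool payload and, when
- *     present, the vdev and zio payloads.
- */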
-
-/*
- * The 'resource.fs.zfs.ok' event is an internal signal that the associated
- * resource (pool or disk) has been identified by ZFS as healthy. This will
- * then trigger the DE to close the associated case, if any.
- */
-void
-zfs_post_ok(spa_t *spa, vdev_t *vd)
-{
-#ifdef _KERNEL
- char buf[1024];
- char class[64];
- struct sbuf sb;
- struct timespec ts;
-
- nanotime(&ts);
-
- sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
- sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);
-
- snprintf(class, sizeof(class), "%s.%s.%s", FM_RSRC_RESOURCE,
- ZFS_ERROR_CLASS, FM_RESOURCE_OK);
- sbuf_printf(&sb, " %s=%hhu", FM_VERSION, FM_RSRC_VERSION);
- sbuf_printf(&sb, " %s=%s", FM_CLASS, class);
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
- spa_guid(spa));
- if (vd)
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
- vd->vdev_guid);
- sbuf_finish(&sb);
- devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb));
- if (sbuf_overflowed(&sb))
- printf("ZFS WARNING: sbuf overflowed\n");
- sbuf_delete(&sb);
-#endif
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
deleted file mode 100644
index c9424be..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
+++ /dev/null
@@ -1,1826 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/conf.h>
-#include <sys/kernel.h>
-#include <sys/lock.h>
-#include <sys/malloc.h>
-#include <sys/mutex.h>
-#include <sys/proc.h>
-#include <sys/errno.h>
-#include <sys/uio.h>
-#include <sys/buf.h>
-#include <sys/file.h>
-#include <sys/kmem.h>
-#include <sys/conf.h>
-#include <sys/cmn_err.h>
-#include <sys/stat.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zap.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
-#include <sys/dmu.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/sunddi.h>
-#include <sys/policy.h>
-#include <sys/zone.h>
-#include <sys/nvpair.h>
-#include <sys/mount.h>
-#include <sys/taskqueue.h>
-#include <sys/sdt.h>
-#include <sys/varargs.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zvol.h>
-
-#include "zfs_namecheck.h"
-#include "zfs_prop.h"
-
-CTASSERT(sizeof(zfs_cmd_t) <= PAGE_SIZE);
-
-static struct cdev *zfsdev;
-
-extern void zfs_init(void);
-extern void zfs_fini(void);
-
-typedef int zfs_ioc_func_t(zfs_cmd_t *);
-typedef int zfs_secpolicy_func_t(const char *, cred_t *);
-
-typedef struct zfs_ioc_vec {
- zfs_ioc_func_t *zvec_func;
- zfs_secpolicy_func_t *zvec_secpolicy;
- enum {
- no_name,
- pool_name,
- dataset_name
- } zvec_namecheck;
-} zfs_ioc_vec_t;
-
-/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiny */
-void
-__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
-{
- const char *newfile;
- char buf[256];
- va_list adx;
-
- /*
- * Get rid of annoying "../common/" prefix to filename.
- */
- newfile = strrchr(file, '/');
- if (newfile != NULL) {
- newfile = newfile + 1; /* Get rid of leading / */
- } else {
- newfile = file;
- }
-
- va_start(adx, fmt);
- (void) vsnprintf(buf, sizeof (buf), fmt, adx);
- va_end(adx);
-
- /*
- * To get this data, use the zfs-dprintf probe as so:
- * dtrace -q -n 'zfs-dprintf \
- * /stringof(arg0) == "dbuf.c"/ \
- * {printf("%s: %s", stringof(arg1), stringof(arg3))}'
- * arg0 = file name
- * arg1 = function name
- * arg2 = line number
- * arg3 = message
- */
- DTRACE_PROBE4(zfs__dprintf,
- char *, newfile, char *, func, int, line, char *, buf);
-}
-
-/*
- * Policy for top-level read operations (list pools). Requires no privileges,
- * and can be used in the local zone, as there is no associated dataset.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_none(const char *unused1, cred_t *cr)
-{
- return (0);
-}
-
-/*
- * Policy for dataset read operations (list children, get statistics). Requires
- * no privileges, but must be visible in the local zone.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_read(const char *dataset, cred_t *cr)
-{
- if (INGLOBALZONE(curproc) ||
- zone_dataset_visible(dataset, NULL))
- return (0);
-
- return (ENOENT);
-}
-
-static int
-zfs_dozonecheck(const char *dataset, cred_t *cr)
-{
- uint64_t zoned;
- int writable = 1;
-
- /*
- * The dataset must be visible by this zone -- check this first
- * so they don't see EPERM on something they shouldn't know about.
- */
- if (!INGLOBALZONE(curproc) &&
- !zone_dataset_visible(dataset, &writable))
- return (ENOENT);
-
- if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL))
- return (ENOENT);
-
- if (INGLOBALZONE(curproc)) {
- /*
- * If the fs is zoned, only root can access it from the
- * global zone.
- */
- if (secpolicy_zfs(cr) && zoned)
- return (EPERM);
- } else {
- /*
- * If we are in a local zone, the 'zoned' property must be set.
- */
- if (!zoned)
- return (EPERM);
-
- /* must be writable by this zone */
- if (!writable)
- return (EPERM);
- }
- return (0);
-}
-
-/*
- * Policy for dataset write operations (create children, set properties, etc).
- * Requires SYS_MOUNT privilege, and must be writable in the local zone.
- */
-int
-zfs_secpolicy_write(const char *dataset, cred_t *cr)
-{
- int error;
-
- if (error = zfs_dozonecheck(dataset, cr))
- return (error);
-
- return (secpolicy_zfs(cr));
-}
-
-/*
- * Policy for operations that want to write a dataset's parent:
- * create, destroy, snapshot, clone, restore.
- */
-static int
-zfs_secpolicy_parent(const char *dataset, cred_t *cr)
-{
- char parentname[MAXNAMELEN];
- char *cp;
-
- /*
- * Strip the trailing '@snapshot' or '/child' component from the name to get the parent.
- */
- (void) strncpy(parentname, dataset, sizeof (parentname));
- cp = strrchr(parentname, '@');
- if (cp != NULL) {
- cp[0] = '\0';
- } else {
- cp = strrchr(parentname, '/');
- if (cp == NULL)
- return (ENOENT);
- cp[0] = '\0';
- }
-
- return (zfs_secpolicy_write(parentname, cr));
-}
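
As an illustration of the truncation above, a minimal userland sketch follows; parent_name() is a hypothetical helper, not part of this file. Given "tank/home/user@snap" it yields "tank/home/user", given "tank/home/user" it yields "tank/home", and a bare pool name has no parent.

    #include <string.h>

    /* Hypothetical sketch of the parent-name derivation used above. */
    static int
    parent_name(const char *dataset, char *buf, size_t buflen)
    {
            char *cp;

            (void) strlcpy(buf, dataset, buflen);
            if ((cp = strrchr(buf, '@')) == NULL &&
                (cp = strrchr(buf, '/')) == NULL)
                    return (-1);    /* a bare pool name has no parent */
            *cp = '\0';             /* truncate at '@' or at the last '/' */
            return (0);
    }
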
-
-/*
- * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires
- * SYS_CONFIG privilege, which is not available in a local zone.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_config(const char *unused, cred_t *cr)
-{
- if (secpolicy_sys_config(cr, B_FALSE) != 0)
- return (EPERM);
-
- return (0);
-}
-
-/*
- * Policy for fault injection. Requires all privileges.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_inject(const char *unused, cred_t *cr)
-{
- return (secpolicy_zinject(cr));
-}
-
-/*
- * Policy for dataset backup operations (sendbackup).
- * Requires SYS_MOUNT privilege, and must be writable in the local zone.
- */
-static int
-zfs_secpolicy_operator(const char *dataset, cred_t *cr)
-{
- int writable = 1;
-
- if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable))
- return (ENOENT);
- if (secpolicy_zfs(cr) != 0 && !groupmember(GID_OPERATOR, cr))
- return (EPERM);
- return (0);
-}
-
-/*
- * Returns the nvlist as specified by the user in the zfs_cmd_t.
- */
-static int
-get_nvlist(zfs_cmd_t *zc, nvlist_t **nvp)
-{
- char *packed;
- size_t size;
- int error;
- nvlist_t *config = NULL;
-
- /*
- * Read in and unpack the user-supplied nvlist.
- */
- if ((size = zc->zc_nvlist_src_size) == 0)
- return (EINVAL);
-
- packed = kmem_alloc(size, KM_SLEEP);
-
- if ((error = xcopyin((void *)(uintptr_t)zc->zc_nvlist_src, packed,
- size)) != 0) {
- kmem_free(packed, size);
- return (error);
- }
-
- if ((error = nvlist_unpack(packed, size, &config, 0)) != 0) {
- kmem_free(packed, size);
- return (error);
- }
-
- kmem_free(packed, size);
-
- *nvp = config;
- return (0);
-}
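
For context, a hedged sketch of the userland side that fills zc_nvlist_src for this routine; fill_src_nvlist() is a hypothetical helper, while the libnvpair calls and the zc field names match the real interfaces.

    #include <libnvpair.h>

    /* Pack an nvlist and point zc_nvlist_src at it (hypothetical helper). */
    static int
    fill_src_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
    {
            char *packed = NULL;
            size_t size = 0;
            int err;

            /* libnvpair allocates 'packed' when it comes in as NULL. */
            if ((err = nvlist_pack(nvl, &packed, &size,
                NV_ENCODE_NATIVE, 0)) != 0)
                    return (err);
            zc->zc_nvlist_src = (uint64_t)(uintptr_t)packed;
            zc->zc_nvlist_src_size = size;
            return (0);     /* the caller frees 'packed' after the ioctl */
    }
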
-
-static int
-put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
-{
- char *packed = NULL;
- size_t size;
- int error;
-
- VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0);
-
- if (size > zc->zc_nvlist_dst_size) {
- /*
- * Solaris returns ENOMEM here, because on Solaris the updated
- * zc_nvlist_dst_size is passed back to userland even when the
- * ioctl(2) fails. That is not the case on FreeBSD, so we
- * return 0; the kernel then copies zc_nvlist_dst_size back out
- * and userland can discover that a bigger buffer is needed.
- */
- error = 0;
- } else {
- VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
- KM_SLEEP) == 0);
- error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
- size);
- kmem_free(packed, size);
- }
-
- zc->zc_nvlist_dst_size = size;
- return (error);
-}
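
The comment above describes a size handshake; here is a hedged sketch of the matching userland retry loop. fetch_dst_nvlist() is a hypothetical helper, not libzfs API, but the zc_nvlist_dst fields and libnvpair calls are the real ones.

    #include <sys/ioctl.h>
    #include <errno.h>
    #include <stdlib.h>
    #include <libnvpair.h>

    static int
    fetch_dst_nvlist(int fd, unsigned long ioc, zfs_cmd_t *zc, nvlist_t **nvlp)
    {
            size_t size = 1024;
            char *buf;
            int err;

            for (;;) {
                    if ((buf = malloc(size)) == NULL)
                            return (ENOMEM);
                    zc->zc_nvlist_dst = (uint64_t)(uintptr_t)buf;
                    zc->zc_nvlist_dst_size = size;
                    if (ioctl(fd, ioc, zc) != 0) {
                            err = errno;
                            free(buf);
                            return (err);
                    }
                    if (zc->zc_nvlist_dst_size <= size)
                            break;                  /* data was copied out */
                    size = zc->zc_nvlist_dst_size;  /* grow and retry */
                    free(buf);
            }
            err = nvlist_unpack(buf, zc->zc_nvlist_dst_size, nvlp, 0);
            free(buf);
            return (err);
    }
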
-
-static int
-zfs_ioc_pool_create(zfs_cmd_t *zc)
-{
- int error;
- nvlist_t *config;
-
- if ((error = get_nvlist(zc, &config)) != 0)
- return (error);
-
- error = spa_create(zc->zc_name, config, zc->zc_value[0] == '\0' ?
- NULL : zc->zc_value);
-
- nvlist_free(config);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_destroy(zfs_cmd_t *zc)
-{
- return (spa_destroy(zc->zc_name));
-}
-
-static int
-zfs_ioc_pool_import(zfs_cmd_t *zc)
-{
- int error;
- nvlist_t *config;
- uint64_t guid;
-
- if ((error = get_nvlist(zc, &config)) != 0)
- return (error);
-
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
- guid != zc->zc_guid)
- error = EINVAL;
- else
- error = spa_import(zc->zc_name, config,
- zc->zc_value[0] == '\0' ? NULL : zc->zc_value);
-
- nvlist_free(config);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_export(zfs_cmd_t *zc)
-{
- return (spa_export(zc->zc_name, NULL));
-}
-
-static int
-zfs_ioc_pool_configs(zfs_cmd_t *zc)
-{
- nvlist_t *configs;
- int error;
-
- if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
- return (EEXIST);
-
- error = put_nvlist(zc, configs);
-
- nvlist_free(configs);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_stats(zfs_cmd_t *zc)
-{
- nvlist_t *config;
- int error;
- int ret = 0;
-
- error = spa_get_stats(zc->zc_name, &config, zc->zc_value,
- sizeof (zc->zc_value));
-
- if (config != NULL) {
- ret = put_nvlist(zc, config);
- nvlist_free(config);
-
- /*
- * The config may be present even if 'error' is non-zero.
- * In this case we return success, and preserve the real errno
- * in 'zc_cookie'.
- */
- zc->zc_cookie = error;
- } else {
- ret = error;
- }
-
- return (ret);
-}
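
A hypothetical caller illustrating the zc_cookie convention above, reusing the fetch_dst_nvlist() sketch from earlier; ZFS_IOC_POOL_STATS names the corresponding ioctl, and pool_stats() itself is illustrative only.

    static int
    pool_stats(int fd, const char *pool, nvlist_t **config)
    {
            zfs_cmd_t zc = { 0 };
            int err;

            (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
            if ((err = fetch_dst_nvlist(fd, ZFS_IOC_POOL_STATS, &zc,
                config)) != 0)
                    return (err);
            /* A config came back; zc_cookie holds the real open error. */
            return ((int)zc.zc_cookie);
    }
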
-
-/*
- * Try to import the given pool, returning pool stats as appropriate so that
- * user land knows which devices are available and overall pool health.
- */
-static int
-zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
-{
- nvlist_t *tryconfig, *config;
- int error;
-
- if ((error = get_nvlist(zc, &tryconfig)) != 0)
- return (error);
-
- config = spa_tryimport(tryconfig);
-
- nvlist_free(tryconfig);
-
- if (config == NULL)
- return (EINVAL);
-
- error = put_nvlist(zc, config);
- nvlist_free(config);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_scrub(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- error = spa_scrub(spa, zc->zc_cookie, B_FALSE);
-
- spa_close(spa, FTAG);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_freeze(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- error = spa_open(zc->zc_name, &spa, FTAG);
- if (error == 0) {
- spa_freeze(spa);
- spa_close(spa, FTAG);
- }
- return (error);
-}
-
-static int
-zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- spa_upgrade(spa);
-
- spa_close(spa, FTAG);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_get_history(zfs_cmd_t *zc)
-{
- spa_t *spa;
- char *hist_buf;
- uint64_t size;
- int error;
-
- if ((size = zc->zc_history_len) == 0)
- return (EINVAL);
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) {
- spa_close(spa, FTAG);
- return (ENOTSUP);
- }
-
- hist_buf = kmem_alloc(size, KM_SLEEP);
- if ((error = spa_history_get(spa, &zc->zc_history_offset,
- &zc->zc_history_len, hist_buf)) == 0) {
- error = xcopyout(hist_buf, (char *)(uintptr_t)zc->zc_history,
- zc->zc_history_len);
- }
-
- spa_close(spa, FTAG);
- kmem_free(hist_buf, size);
- return (error);
-}
-
-static int
-zfs_ioc_pool_log_history(zfs_cmd_t *zc)
-{
- spa_t *spa;
- char *history_str = NULL;
- size_t size;
- int error;
-
- size = zc->zc_history_len;
- if (size == 0 || size > HIS_MAX_RECORD_LEN)
- return (EINVAL);
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) {
- spa_close(spa, FTAG);
- return (ENOTSUP);
- }
-
- /* add one for the terminating NUL */
- size++;
- history_str = kmem_alloc(size, KM_SLEEP);
- if ((error = xcopyin((void *)(uintptr_t)zc->zc_history, history_str,
- size)) != 0) {
- spa_close(spa, FTAG);
- kmem_free(history_str, size);
- return (error);
- }
- history_str[size - 1] = '\0';
-
- error = spa_history_log(spa, history_str, zc->zc_history_offset);
-
- spa_close(spa, FTAG);
- kmem_free(history_str, size);
-
- return (error);
-}
-
-static int
-zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
-{
- int error;
-
- if (error = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value))
- return (error);
-
- return (0);
-}
-
-static int
-zfs_ioc_obj_to_path(zfs_cmd_t *zc)
-{
- objset_t *osp;
- int error;
-
- if ((error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS,
- DS_MODE_NONE | DS_MODE_READONLY, &osp)) != 0)
- return (error);
-
- error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value,
- sizeof (zc->zc_value));
- dmu_objset_close(osp);
-
- return (error);
-}
-
-static int
-zfs_ioc_vdev_add(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
- nvlist_t *config;
-
- error = spa_open(zc->zc_name, &spa, FTAG);
- if (error != 0)
- return (error);
-
- /*
- * A root pool with concatenated devices is not supported.
- * Thus, we cannot add a device to a root pool that has only
- * one device.
- */
- if (spa->spa_root_vdev->vdev_children == 1 && spa->spa_bootfs != 0) {
- spa_close(spa, FTAG);
- return (EDOM);
- }
-
- if ((error = get_nvlist(zc, &config)) == 0) {
- error = spa_vdev_add(spa, config);
- nvlist_free(config);
- }
-
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_remove(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- error = spa_open(zc->zc_name, &spa, FTAG);
- if (error != 0)
- return (error);
- error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_online(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
- error = vdev_online(spa, zc->zc_guid);
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_offline(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int istmp = zc->zc_cookie;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
- error = vdev_offline(spa, zc->zc_guid, istmp);
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_attach(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int replacing = zc->zc_cookie;
- nvlist_t *config;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- if ((error = get_nvlist(zc, &config)) == 0) {
- error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
- nvlist_free(config);
- }
-
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_detach(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE);
-
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
-{
- spa_t *spa;
- char *path = zc->zc_value;
- uint64_t guid = zc->zc_guid;
- int error;
-
- error = spa_open(zc->zc_name, &spa, FTAG);
- if (error != 0)
- return (error);
-
- error = spa_vdev_setpath(spa, guid, path);
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_objset_stats(zfs_cmd_t *zc)
-{
- objset_t *os = NULL;
- int error;
- nvlist_t *nv;
-
-retry:
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_STANDARD | DS_MODE_READONLY, &os);
- if (error != 0) {
- /*
- * This is ugly: dmu_objset_open() can return EBUSY if
- * the objset is held exclusively. Fortunately this hold is
- * only for a short while, so we retry here.
- * This avoids user code having to handle EBUSY,
- * for example for a "zfs list".
- */
- if (error == EBUSY) {
- delay(1);
- goto retry;
- }
- return (error);
- }
-
- dmu_objset_fast_stat(os, &zc->zc_objset_stats);
-
- if (zc->zc_nvlist_dst != 0 &&
- (error = dsl_prop_get_all(os, &nv)) == 0) {
- dmu_objset_stats(os, nv);
- /*
- * NB: zvol_get_stats() will read the objset contents,
- * which we aren't supposed to do with a
- * DS_MODE_STANDARD open, because it could be
- * inconsistent. So this is a bit of a workaround...
- */
- if (!zc->zc_objset_stats.dds_inconsistent &&
- dmu_objset_type(os) == DMU_OST_ZVOL)
- VERIFY(zvol_get_stats(os, nv) == 0);
- error = put_nvlist(zc, nv);
- nvlist_free(nv);
- }
-
- spa_altroot(dmu_objset_spa(os), zc->zc_value, sizeof (zc->zc_value));
-
- dmu_objset_close(os);
- if (error == ENOMEM)
- error = 0;
- return (error);
-}
-
-static int
-zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
-{
- objset_t *os;
- int error;
- char *p;
-
-retry:
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_STANDARD | DS_MODE_READONLY, &os);
- if (error != 0) {
- /*
- * This is ugly: dmu_objset_open() can return EBUSY if
- * the objset is held exclusively. Fortunately this hold is
- * only for a short while, so we retry here.
- * This avoids user code having to handle EBUSY,
- * for example for a "zfs list".
- */
- if (error == EBUSY) {
- delay(1);
- goto retry;
- }
- if (error == ENOENT)
- error = ESRCH;
- return (error);
- }
-
- p = strrchr(zc->zc_name, '/');
- if (p == NULL || p[1] != '\0')
- (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
- p = zc->zc_name + strlen(zc->zc_name);
-
- do {
- error = dmu_dir_list_next(os,
- sizeof (zc->zc_name) - (p - zc->zc_name), p,
- NULL, &zc->zc_cookie);
- if (error == ENOENT)
- error = ESRCH;
- } while (error == 0 && !INGLOBALZONE(curproc) &&
- !zone_dataset_visible(zc->zc_name, NULL));
-
- /*
- * If it's a hidden dataset (i.e. one with a '$' in its name), don't
- * try to get stats for it. Userland will skip over it.
- */
- if (error == 0 && strchr(zc->zc_name, '$') == NULL)
- error = zfs_ioc_objset_stats(zc); /* fill in the stats */
-
- dmu_objset_close(os);
- return (error);
-}
-
-static int
-zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
-{
- objset_t *os;
- int error;
-
-retry:
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_STANDARD | DS_MODE_READONLY, &os);
- if (error != 0) {
- /*
- * This is ugly: dmu_objset_open() can return EBUSY if
- * the objset is held exclusively. Fortunately this hold is
- * only for a short while, so we retry here.
- * This avoids user code having to handle EBUSY,
- * for example for a "zfs list".
- */
- if (error == EBUSY) {
- delay(1);
- goto retry;
- }
- if (error == ENOENT)
- error = ESRCH;
- return (error);
- }
-
- /*
- * A dataset name of maximum length cannot have any snapshots,
- * so exit immediately.
- */
- if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
- dmu_objset_close(os);
- return (ESRCH);
- }
-
- error = dmu_snapshot_list_next(os,
- sizeof (zc->zc_name) - strlen(zc->zc_name),
- zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie);
- if (error == ENOENT)
- error = ESRCH;
-
- if (error == 0)
- error = zfs_ioc_objset_stats(zc); /* fill in the stats */
-
- dmu_objset_close(os);
- return (error);
-}
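
The strlcat() check above leans on a property of strlcat(3): it returns the total length it tried to create, so a result greater than or equal to the buffer size signals truncation. A generic sketch of that idiom, with a hypothetical helper:

    #include <string.h>

    /* Generic truncation-detection idiom (illustrative only). */
    static int
    append_snap_sep(char *buf, size_t buflen)
    {
            /*
             * strlcat(3) returns the length it tried to create; a result
             * >= buflen means the '@' did not fit and buf was already at
             * its maximum length.
             */
            if (strlcat(buf, "@", buflen) >= buflen)
                    return (ESRCH);
            return (0);
    }
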
-
-static int
-zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl)
-{
- nvpair_t *elem;
- int error;
- const char *propname;
- zfs_prop_t prop;
- uint64_t intval;
- char *strval;
- char buf[MAXNAMELEN];
- const char *p;
- spa_t *spa;
-
- elem = NULL;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- propname = nvpair_name(elem);
-
- if ((prop = zfs_name_to_prop(propname)) ==
- ZFS_PROP_INVAL) {
- /*
- * If this is a user-defined property, it must be a
- * string, and there is no further validation to do.
- */
- if (!zfs_prop_user(propname) ||
- nvpair_type(elem) != DATA_TYPE_STRING)
- return (EINVAL);
-
- VERIFY(nvpair_value_string(elem, &strval) == 0);
- error = dsl_prop_set(name, propname, 1,
- strlen(strval) + 1, strval);
- if (error == 0)
- continue;
- else
- return (error);
- }
-
- /*
- * Check permissions for special properties.
- */
- switch (prop) {
- case ZFS_PROP_ZONED:
- /*
- * Disallow setting of 'zoned' from within a local zone.
- */
- if (!INGLOBALZONE(curproc))
- return (EPERM);
- break;
-
- case ZFS_PROP_QUOTA:
- if (error = zfs_dozonecheck(name, cr))
- return (error);
-
- if (!INGLOBALZONE(curproc)) {
- uint64_t zoned;
- char setpoint[MAXNAMELEN];
- int dslen;
- /*
- * Unprivileged users are allowed to modify the
- * quota on things *under* (i.e. contained by)
- * the thing they own; e.g. a jail that owns
- * tank/jail may set the quota on tank/jail/data,
- * but not on tank/jail itself.
- */
- if (dsl_prop_get_integer(name, "jailed", &zoned,
- setpoint))
- return (EPERM);
- if (!zoned) /* this shouldn't happen */
- return (EPERM);
- dslen = strlen(name);
- if (dslen <= strlen(setpoint))
- return (EPERM);
- }
- break;
-
- case ZFS_PROP_COMPRESSION:
- /*
- * If the user specified gzip compression, make sure
- * the SPA supports it. We ignore any errors here since
- * we'll catch them later.
- */
- if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
- nvpair_value_uint64(elem, &intval) == 0 &&
- intval >= ZIO_COMPRESS_GZIP_1 &&
- intval <= ZIO_COMPRESS_GZIP_9) {
- if ((p = strchr(name, '/')) == NULL) {
- p = name;
- } else {
- bcopy(name, buf, p - name);
- buf[p - name] = '\0';
- p = buf;
- }
-
- if (spa_open(p, &spa, FTAG) == 0) {
- if (spa_version(spa) <
- ZFS_VERSION_GZIP_COMPRESSION) {
- spa_close(spa, FTAG);
- return (ENOTSUP);
- }
-
- spa_close(spa, FTAG);
- }
- }
- break;
- }
-
- switch (prop) {
- case ZFS_PROP_QUOTA:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dir_set_quota(name,
- intval)) != 0)
- return (error);
- break;
-
- case ZFS_PROP_RESERVATION:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dir_set_reservation(name,
- intval)) != 0)
- return (error);
- break;
-
- case ZFS_PROP_VOLSIZE:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = zvol_set_volsize(name, dev,
- intval)) != 0)
- return (error);
- break;
-
- case ZFS_PROP_VOLBLOCKSIZE:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = zvol_set_volblocksize(name,
- intval)) != 0)
- return (error);
- break;
-
- default:
- if (nvpair_type(elem) == DATA_TYPE_STRING) {
- if (zfs_prop_get_type(prop) !=
- prop_type_string)
- return (EINVAL);
- VERIFY(nvpair_value_string(elem, &strval) == 0);
- if ((error = dsl_prop_set(name,
- nvpair_name(elem), 1, strlen(strval) + 1,
- strval)) != 0)
- return (error);
- } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
- const char *unused;
-
- VERIFY(nvpair_value_uint64(elem, &intval) == 0);
-
- switch (zfs_prop_get_type(prop)) {
- case prop_type_number:
- break;
- case prop_type_boolean:
- if (intval > 1)
- return (EINVAL);
- break;
- case prop_type_string:
- return (EINVAL);
- case prop_type_index:
- if (zfs_prop_index_to_string(prop,
- intval, &unused) != 0)
- return (EINVAL);
- break;
- default:
- cmn_err(CE_PANIC, "unknown property "
- "type");
- break;
- }
-
- if ((error = dsl_prop_set(name, propname,
- 8, 1, &intval)) != 0)
- return (error);
- } else {
- return (EINVAL);
- }
- break;
- }
- }
-
- return (0);
-}
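
For reference, a hedged sketch of how a caller might build the nvlist this routine consumes. make_props() and the "com.example:owner" property are hypothetical; zfs_prop_to_name() and the libnvpair calls are real interfaces.

    #include <libnvpair.h>

    static nvlist_t *
    make_props(void)
    {
            nvlist_t *nvl = NULL;

            if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
                    return (NULL);
            /* Native property: numeric, keyed by its canonical name. */
            (void) nvlist_add_uint64(nvl,
                zfs_prop_to_name(ZFS_PROP_QUOTA), 10ULL << 30);
            /* User-defined property: must be a string (enforced above). */
            (void) nvlist_add_string(nvl, "com.example:owner", "alice");
            return (nvl);
    }
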
-
-static int
-zfs_ioc_set_prop(zfs_cmd_t *zc)
-{
- nvlist_t *nvl;
- int error;
- zfs_prop_t prop;
-
- /*
- * If zc_value is set, then this is an attempt to inherit a value.
- * Otherwise, zc_nvlist refers to a list of properties to set.
- */
- if (zc->zc_value[0] != '\0') {
- if (!zfs_prop_user(zc->zc_value) &&
- ((prop = zfs_name_to_prop(zc->zc_value)) ==
- ZFS_PROP_INVAL ||
- !zfs_prop_inheritable(prop)))
- return (EINVAL);
-
- return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL));
- }
-
- if ((error = get_nvlist(zc, &nvl)) != 0)
- return (error);
-
- error = zfs_set_prop_nvlist(zc->zc_name, zc->zc_dev,
- (cred_t *)(uintptr_t)zc->zc_cred, nvl);
- nvlist_free(nvl);
- return (error);
-}
-
-static int
-zfs_ioc_pool_set_props(zfs_cmd_t *zc)
-{
- nvlist_t *nvl;
- int error, reset_bootfs = 0;
- uint64_t objnum;
- zpool_prop_t prop;
- nvpair_t *elem;
- char *propname, *strval;
- spa_t *spa;
- vdev_t *rvdev;
- char *vdev_type;
- objset_t *os;
-
- if ((error = get_nvlist(zc, &nvl)) != 0)
- return (error);
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
- nvlist_free(nvl);
- return (error);
- }
-
- if (spa_version(spa) < ZFS_VERSION_BOOTFS) {
- nvlist_free(nvl);
- spa_close(spa, FTAG);
- return (ENOTSUP);
- }
-
- elem = NULL;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
-
- propname = nvpair_name(elem);
-
- if ((prop = zpool_name_to_prop(propname)) ==
- ZFS_PROP_INVAL) {
- nvlist_free(nvl);
- spa_close(spa, FTAG);
- return (EINVAL);
- }
-
- switch (prop) {
- case ZFS_PROP_BOOTFS:
- /*
- * A bootable filesystem cannot be on a RAIDZ pool, nor on a
- * striped pool with more than one device.
- */
- rvdev = spa->spa_root_vdev;
- vdev_type =
- rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
- if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
- (strcmp(vdev_type, VDEV_TYPE_MIRROR) != 0 &&
- rvdev->vdev_children > 1)) {
- error = ENOTSUP;
- break;
- }
-
- reset_bootfs = 1;
-
- VERIFY(nvpair_value_string(elem, &strval) == 0);
- if (strval == NULL || strval[0] == '\0') {
- objnum =
- zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
- break;
- }
-
- if (error = dmu_objset_open(strval, DMU_OST_ZFS,
- DS_MODE_STANDARD | DS_MODE_READONLY, &os))
- break;
- objnum = dmu_objset_id(os);
- dmu_objset_close(os);
- break;
-
- default:
- error = EINVAL;
- }
-
- if (error)
- break;
- }
- if (error == 0) {
- if (reset_bootfs) {
- VERIFY(nvlist_remove(nvl,
- zpool_prop_to_name(ZFS_PROP_BOOTFS),
- DATA_TYPE_STRING) == 0);
- VERIFY(nvlist_add_uint64(nvl,
- zpool_prop_to_name(ZFS_PROP_BOOTFS), objnum) == 0);
- }
- error = spa_set_props(spa, nvl);
- }
-
- nvlist_free(nvl);
- spa_close(spa, FTAG);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_get_props(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
- nvlist_t *nvp = NULL;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- error = spa_get_props(spa, &nvp);
-
- if (error == 0 && zc->zc_nvlist_dst != 0)
- error = put_nvlist(zc, nvp);
- else if (error == 0)
- error = EFAULT; /* don't mask a real spa_get_props() error */
-
- spa_close(spa, FTAG);
-
- if (nvp)
- nvlist_free(nvp);
- return (error);
-}
-
-static int
-zfs_ioc_create_minor(zfs_cmd_t *zc)
-{
- return (zvol_create_minor(zc->zc_name, zc->zc_dev));
-}
-
-static int
-zfs_ioc_remove_minor(zfs_cmd_t *zc)
-{
- return (zvol_remove_minor(zc->zc_name));
-}
-
-/*
- * Search the vfs list for a specified resource. Returns a pointer to it
- * or NULL if no suitable entry is found. The caller of this routine
- * is responsible for releasing the returned vfs pointer.
- */
-static vfs_t *
-zfs_get_vfs(const char *resource)
-{
- vfs_t *vfsp;
-
- mtx_lock(&mountlist_mtx);
- TAILQ_FOREACH(vfsp, &mountlist, mnt_list) {
- if (strcmp(vfsp->mnt_stat.f_mntfromname, resource) == 0) {
- VFS_HOLD(vfsp);
- break;
- }
- }
- mtx_unlock(&mountlist_mtx);
- return (vfsp);
-}
-
-static void
-zfs_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
-{
- zfs_create_data_t *zc = arg;
-
- zfs_create_fs(os, (cred_t *)(uintptr_t)zc->zc_cred, tx);
-}
-
-static int
-zfs_ioc_create(zfs_cmd_t *zc)
-{
- objset_t *clone;
- int error = 0;
- zfs_create_data_t cbdata = { 0 };
- void (*cbfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
- dmu_objset_type_t type = zc->zc_objset_type;
-
- switch (type) {
-
- case DMU_OST_ZFS:
- cbfunc = zfs_create_cb;
- break;
-
- case DMU_OST_ZVOL:
- cbfunc = zvol_create_cb;
- break;
-
- default:
- cbfunc = NULL;
- }
- if (strchr(zc->zc_name, '@'))
- return (EINVAL);
-
- if (zc->zc_nvlist_src != 0 &&
- (error = get_nvlist(zc, &cbdata.zc_props)) != 0)
- return (error);
-
- cbdata.zc_cred = (cred_t *)(uintptr_t)zc->zc_cred;
- cbdata.zc_dev = (dev_t)zc->zc_dev;
-
- if (zc->zc_value[0] != '\0') {
- /*
- * We're creating a clone of an existing snapshot.
- */
- zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
- if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) {
- nvlist_free(cbdata.zc_props);
- return (EINVAL);
- }
-
- error = dmu_objset_open(zc->zc_value, type,
- DS_MODE_STANDARD | DS_MODE_READONLY, &clone);
- if (error) {
- nvlist_free(cbdata.zc_props);
- return (error);
- }
- error = dmu_objset_create(zc->zc_name, type, clone, NULL, NULL);
- dmu_objset_close(clone);
- } else {
- if (cbfunc == NULL) {
- nvlist_free(cbdata.zc_props);
- return (EINVAL);
- }
-
- if (type == DMU_OST_ZVOL) {
- uint64_t volsize, volblocksize;
-
- if (cbdata.zc_props == NULL ||
- nvlist_lookup_uint64(cbdata.zc_props,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE),
- &volsize) != 0) {
- nvlist_free(cbdata.zc_props);
- return (EINVAL);
- }
-
- if ((error = nvlist_lookup_uint64(cbdata.zc_props,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
- &volblocksize)) != 0 && error != ENOENT) {
- nvlist_free(cbdata.zc_props);
- return (EINVAL);
- }
-
- if (error != 0)
- volblocksize = zfs_prop_default_numeric(
- ZFS_PROP_VOLBLOCKSIZE);
-
- if ((error = zvol_check_volblocksize(
- volblocksize)) != 0 ||
- (error = zvol_check_volsize(volsize,
- volblocksize)) != 0) {
- nvlist_free(cbdata.zc_props);
- return (error);
- }
- }
-
- error = dmu_objset_create(zc->zc_name, type, NULL, cbfunc,
- &cbdata);
- }
-
- /*
- * It would be nice to do this atomically.
- */
- if (error == 0) {
- if ((error = zfs_set_prop_nvlist(zc->zc_name,
- zc->zc_dev, (cred_t *)(uintptr_t)zc->zc_cred,
- cbdata.zc_props)) != 0)
- (void) dmu_objset_destroy(zc->zc_name);
- }
-
- nvlist_free(cbdata.zc_props);
- return (error);
-}
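
Putting the volume checks above in context, a hypothetical userland request that would exercise this path; create_zvol() is a sketch, not libzfs code, though ZFS_IOC_CREATE, the zc fields, and the libnvpair calls are the real interfaces (includes as in the earlier sketches).

    static int
    create_zvol(int fd, const char *name, uint64_t volsize)
    {
            zfs_cmd_t zc = { 0 };
            nvlist_t *props;
            char *packed = NULL;
            size_t size = 0;
            int err;

            if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
                    return (ENOMEM);
            /* volsize is mandatory for zvols; volblocksize may default. */
            (void) nvlist_add_uint64(props,
                zfs_prop_to_name(ZFS_PROP_VOLSIZE), volsize);
            if ((err = nvlist_pack(props, &packed, &size,
                NV_ENCODE_NATIVE, 0)) == 0) {
                    (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
                    zc.zc_objset_type = DMU_OST_ZVOL;
                    zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
                    zc.zc_nvlist_src_size = size;
                    err = ioctl(fd, ZFS_IOC_CREATE, &zc) == 0 ? 0 : errno;
                    free(packed);
            }
            nvlist_free(props);
            return (err);
    }
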
-
-static int
-zfs_ioc_snapshot(zfs_cmd_t *zc)
-{
- if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
- return (EINVAL);
- return (dmu_objset_snapshot(zc->zc_name,
- zc->zc_value, zc->zc_cookie));
-}
-
-int
-zfs_unmount_snap(char *name, void *arg)
-{
- char *snapname = arg;
- char *cp;
- vfs_t *vfsp = NULL;
-
- /*
- * Snapshots (which are under .zfs control) must be unmounted
- * before they can be destroyed.
- */
-
- if (snapname) {
- (void) strcat(name, "@");
- (void) strcat(name, snapname);
- vfsp = zfs_get_vfs(name);
- cp = strchr(name, '@');
- *cp = '\0';
- } else if (strchr(name, '@')) {
- vfsp = zfs_get_vfs(name);
- }
-
- if (vfsp) {
- /*
- * Always force the unmount for snapshots.
- */
- int flag = MS_FORCE;
- int err;
-
- if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) {
- VFS_RELE(vfsp);
- return (err);
- }
- VFS_RELE(vfsp);
- mtx_lock(&Giant); /* dounmount() */
- dounmount(vfsp, flag, curthread);
- mtx_unlock(&Giant); /* dounmount() */
- }
- return (0);
-}
-
-static int
-zfs_ioc_destroy_snaps(zfs_cmd_t *zc)
-{
- int err;
-
- if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
- return (EINVAL);
- err = dmu_objset_find(zc->zc_name,
- zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN);
- if (err)
- return (err);
- return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value));
-}
-
-static int
-zfs_ioc_destroy(zfs_cmd_t *zc)
-{
- if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) {
- int err = zfs_unmount_snap(zc->zc_name, NULL);
- if (err)
- return (err);
- }
-
- return (dmu_objset_destroy(zc->zc_name));
-}
-
-static int
-zfs_ioc_rollback(zfs_cmd_t *zc)
-{
- return (dmu_objset_rollback(zc->zc_name));
-}
-
-static int
-zfs_ioc_rename(zfs_cmd_t *zc)
-{
- int recursive = zc->zc_cookie & 1;
-
- zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
- if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0)
- return (EINVAL);
-
- /*
- * Unmount snapshot unless we're doing a recursive rename,
- * in which case the dataset code figures out which snapshots
- * to unmount.
- */
- if (!recursive && strchr(zc->zc_name, '@') != NULL &&
- zc->zc_objset_type == DMU_OST_ZFS) {
- int err = zfs_unmount_snap(zc->zc_name, NULL);
- if (err)
- return (err);
- }
-
- return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive));
-}
-
-static int
-zfs_ioc_recvbackup(zfs_cmd_t *zc)
-{
- kthread_t *td = curthread;
- struct file *fp;
- int error;
- offset_t new_off;
-
- if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
- strchr(zc->zc_value, '@') == NULL)
- return (EINVAL);
-
- error = fget_read(td, zc->zc_cookie, &fp);
- if (error)
- return (error);
-
- error = dmu_recvbackup(zc->zc_value, &zc->zc_begin_record,
- &zc->zc_cookie, (boolean_t)zc->zc_guid, fp,
- fp->f_offset);
-
- new_off = fp->f_offset + zc->zc_cookie;
- fp->f_offset = new_off;
-
- fdrop(fp, td);
- return (error);
-}
-
-static int
-zfs_ioc_sendbackup(zfs_cmd_t *zc)
-{
- kthread_t *td = curthread;
- struct file *fp;
- objset_t *fromsnap = NULL;
- objset_t *tosnap;
- int error, fd;
-
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap);
- if (error)
- return (error);
-
- if (zc->zc_value[0] != '\0') {
- char buf[MAXPATHLEN];
- char *cp;
-
- (void) strncpy(buf, zc->zc_name, sizeof (buf));
- cp = strchr(buf, '@');
- if (cp)
- *(cp+1) = 0;
- (void) strlcat(buf, zc->zc_value, sizeof (buf));
- error = dmu_objset_open(buf, DMU_OST_ANY,
- DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap);
- if (error) {
- dmu_objset_close(tosnap);
- return (error);
- }
- }
-
- fd = zc->zc_cookie;
- error = fget_write(td, fd, &fp);
- if (error) {
- dmu_objset_close(tosnap);
- if (fromsnap)
- dmu_objset_close(fromsnap);
- return (error);
- }
-
- error = dmu_sendbackup(tosnap, fromsnap, fp);
-
- fdrop(fp, td);
- if (fromsnap)
- dmu_objset_close(fromsnap);
- dmu_objset_close(tosnap);
- return (error);
-}
-
-static int
-zfs_ioc_inject_fault(zfs_cmd_t *zc)
-{
- int id, error;
-
- error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
- &zc->zc_inject_record);
-
- if (error == 0)
- zc->zc_guid = (uint64_t)id;
-
- return (error);
-}
-
-static int
-zfs_ioc_clear_fault(zfs_cmd_t *zc)
-{
- return (zio_clear_fault((int)zc->zc_guid));
-}
-
-static int
-zfs_ioc_inject_list_next(zfs_cmd_t *zc)
-{
- int id = (int)zc->zc_guid;
- int error;
-
- error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name),
- &zc->zc_inject_record);
-
- zc->zc_guid = id;
-
- return (error);
-}
-
-static int
-zfs_ioc_error_log(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
- size_t count = (size_t)zc->zc_nvlist_dst_size;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
- &count);
- if (error == 0)
- zc->zc_nvlist_dst_size = count;
- else
- zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
-
- spa_close(spa, FTAG);
-
- return (error);
-}
-
-static int
-zfs_ioc_clear(zfs_cmd_t *zc)
-{
- spa_t *spa;
- vdev_t *vd;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- spa_config_enter(spa, RW_WRITER, FTAG);
-
- if (zc->zc_guid == 0) {
- vd = NULL;
- } else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) {
- spa_config_exit(spa, FTAG);
- spa_close(spa, FTAG);
- return (ENODEV);
- }
-
- vdev_clear(spa, vd);
-
- spa_config_exit(spa, FTAG);
-
- spa_close(spa, FTAG);
-
- return (0);
-}
-
-static int
-zfs_ioc_promote(zfs_cmd_t *zc)
-{
- char *cp;
-
- /*
- * We don't need to unmount *all* the origin fs's snapshots, but
- * it's easier.
- */
- cp = strchr(zc->zc_value, '@');
- if (cp)
- *cp = '\0';
- (void) dmu_objset_find(zc->zc_value,
- zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS);
- return (dsl_dataset_promote(zc->zc_name));
-}
-
-static int
-zfs_ioc_jail(zfs_cmd_t *zc)
-{
-
- return (zone_dataset_attach((cred_t *)(uintptr_t)zc->zc_cred,
- zc->zc_name, (int)zc->zc_jailid));
-}
-
-static int
-zfs_ioc_unjail(zfs_cmd_t *zc)
-{
-
- return (zone_dataset_detach((cred_t *)(uintptr_t)zc->zc_cred,
- zc->zc_name, (int)zc->zc_jailid));
-}
-
-static zfs_ioc_vec_t zfs_ioc_vec[] = {
- { zfs_ioc_pool_create, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_destroy, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_import, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_export, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_configs, zfs_secpolicy_none, no_name },
- { zfs_ioc_pool_stats, zfs_secpolicy_read, pool_name },
- { zfs_ioc_pool_tryimport, zfs_secpolicy_config, no_name },
- { zfs_ioc_pool_scrub, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_freeze, zfs_secpolicy_config, no_name },
- { zfs_ioc_pool_upgrade, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_get_history, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_log_history, zfs_secpolicy_config, pool_name },
- { zfs_ioc_vdev_add, zfs_secpolicy_config, pool_name },
- { zfs_ioc_vdev_remove, zfs_secpolicy_config, pool_name },
- { zfs_ioc_vdev_online, zfs_secpolicy_config, pool_name },
- { zfs_ioc_vdev_offline, zfs_secpolicy_config, pool_name },
- { zfs_ioc_vdev_attach, zfs_secpolicy_config, pool_name },
- { zfs_ioc_vdev_detach, zfs_secpolicy_config, pool_name },
- { zfs_ioc_vdev_setpath, zfs_secpolicy_config, pool_name },
- { zfs_ioc_objset_stats, zfs_secpolicy_read, dataset_name },
- { zfs_ioc_dataset_list_next, zfs_secpolicy_read, dataset_name },
- { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, dataset_name },
- { zfs_ioc_set_prop, zfs_secpolicy_write, dataset_name },
- { zfs_ioc_create_minor, zfs_secpolicy_config, dataset_name },
- { zfs_ioc_remove_minor, zfs_secpolicy_config, dataset_name },
- { zfs_ioc_create, zfs_secpolicy_parent, dataset_name },
- { zfs_ioc_destroy, zfs_secpolicy_parent, dataset_name },
- { zfs_ioc_rollback, zfs_secpolicy_write, dataset_name },
- { zfs_ioc_rename, zfs_secpolicy_write, dataset_name },
- { zfs_ioc_recvbackup, zfs_secpolicy_write, dataset_name },
- { zfs_ioc_sendbackup, zfs_secpolicy_operator, dataset_name },
- { zfs_ioc_inject_fault, zfs_secpolicy_inject, no_name },
- { zfs_ioc_clear_fault, zfs_secpolicy_inject, no_name },
- { zfs_ioc_inject_list_next, zfs_secpolicy_inject, no_name },
- { zfs_ioc_error_log, zfs_secpolicy_inject, pool_name },
- { zfs_ioc_clear, zfs_secpolicy_config, pool_name },
- { zfs_ioc_promote, zfs_secpolicy_write, dataset_name },
- { zfs_ioc_destroy_snaps, zfs_secpolicy_write, dataset_name },
- { zfs_ioc_snapshot, zfs_secpolicy_operator, dataset_name },
- { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, pool_name },
- { zfs_ioc_obj_to_path, zfs_secpolicy_config, no_name },
- { zfs_ioc_pool_set_props, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_get_props, zfs_secpolicy_read, pool_name },
- { zfs_ioc_jail, zfs_secpolicy_config, dataset_name },
- { zfs_ioc_unjail, zfs_secpolicy_config, dataset_name }
-};
-
-static int
-zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
- struct thread *td)
-{
- zfs_cmd_t *zc = (void *)addr;
- uint_t vec;
- int error;
-
- vec = ZFS_IOC(cmd);
-
- if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
- return (EINVAL);
-
- zc->zc_cred = (uintptr_t)td->td_ucred;
- zc->zc_dev = (uintptr_t)dev;
- error = zfs_ioc_vec[vec].zvec_secpolicy(zc->zc_name, td->td_ucred);
-
- /*
- * Ensure that all pool/dataset names are valid before we pass down to
- * the lower layers.
- */
- if (error == 0) {
- zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
- switch (zfs_ioc_vec[vec].zvec_namecheck) {
- case pool_name:
- if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
- error = EINVAL;
- break;
-
- case dataset_name:
- if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
- error = EINVAL;
- break;
-
- case no_name:
- break;
- }
- }
-
- if (error == 0)
- error = zfs_ioc_vec[vec].zvec_func(zc);
-
- return (error);
-}
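
Every request funnels through the dispatch above, so a minimal client needs only the control node and a filled-in zfs_cmd_t. A hedged sketch; zfs_simple_ioctl() is hypothetical and assumes the zfs_cmd_t definition from sys/zfs_ioctl.h.

    #include <sys/ioctl.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <errno.h>
    #include <string.h>

    static int
    zfs_simple_ioctl(unsigned long ioc, const char *name)
    {
            zfs_cmd_t zc = { 0 };
            int fd, err;

            if ((fd = open("/dev/zfs", O_RDWR)) < 0)
                    return (errno);
            (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
            /* The vector's secpolicy and namecheck run before zvec_func. */
            err = ioctl(fd, ioc, &zc) == 0 ? 0 : errno;
            (void) close(fd);
            return (err);
    }
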
-
-/*
- * OK, so this is a little weird.
- *
- * /dev/zfs is the control node, i.e. minor 0.
- * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
- *
- * /dev/zfs has basically nothing to do except serve up ioctls,
- * so most of the standard driver entry points are in zvol.c.
- */
-static struct cdevsw zfs_cdevsw = {
- .d_version = D_VERSION,
- .d_ioctl = zfsdev_ioctl,
- .d_name = ZFS_DEV_NAME
-};
-
-static void
-zfsdev_init(void)
-{
- zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0660,
- ZFS_DEV_NAME);
-}
-
-static void
-zfsdev_fini(void)
-{
- if (zfsdev != NULL)
- destroy_dev(zfsdev);
-}
-
-static struct task zfs_start_task;
-static struct root_hold_token *zfs_root_token;
-
-static void
-zfs_start(void *context __unused, int pending __unused)
-{
-
- zfsdev_init();
- spa_init(FREAD | FWRITE);
- zfs_init();
- zvol_init();
- printf("ZFS storage pool version " ZFS_VERSION_STRING "\n");
- root_mount_rel(zfs_root_token);
-}
-
-static int
-zfs_modevent(module_t mod, int type, void *unused __unused)
-{
- int error;
-
- error = EOPNOTSUPP;
- switch (type) {
- case MOD_LOAD:
- zfs_root_token = root_mount_hold("ZFS");
- printf("WARNING: ZFS is considered to be an experimental "
- "feature in FreeBSD.\n");
- TASK_INIT(&zfs_start_task, 0, zfs_start, NULL);
- taskqueue_enqueue(taskqueue_thread, &zfs_start_task);
- error = 0;
- break;
- case MOD_UNLOAD:
- if (spa_busy() || zfs_busy() || zvol_busy() ||
- zio_injection_enabled) {
- error = EBUSY;
- break;
- }
- zvol_fini();
- zfs_fini();
- spa_fini();
- zfsdev_fini();
- error = 0;
- break;
- }
- return (error);
-}
-
-static moduledata_t zfs_mod = {
- "zfsctrl",
- zfs_modevent,
- 0
-};
-DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY);
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
deleted file mode 100644
index dde9ec1..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
+++ /dev/null
@@ -1,349 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/cmn_err.h>
-#include <sys/kmem.h>
-#include <sys/file.h>
-#include <sys/vfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_dir.h>
-#include <sys/zil.h>
-#include <sys/byteorder.h>
-#include <sys/policy.h>
-#include <sys/stat.h>
-#include <sys/acl.h>
-#include <sys/dmu.h>
-#include <sys/spa.h>
-
-/*
- * All the functions in this file are used to construct the log entries
- * to record transactions. They allocate an intent log transaction
- * structure (itx_t) and save within it all the information necessary to
- * possibly replay the transaction. The itx is then assigned a sequence
- * number and inserted in the in-memory list anchored in the zilog.
- */
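
Every logger below follows the same three-step shape; a compressed sketch, illustrative only, with lr_xxx_t standing in for the record type at hand:

    /*
     * Common shape of the zfs_log_*() functions that follow:
     *
     *      itx = zil_itx_create(txtype, sizeof (lr_xxx_t) + payload_len);
     *      lr = (lr_xxx_t *)&itx->itx_lr;
     *      ...fill in lr fields; bcopy names/payload after the record...
     *      seq = zil_itx_assign(zilog, itx, tx);
     *      zp->z_last_itx = seq;   (remembered to filter replay)
     */
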
-
-/*
- * zfs_log_create() is used to handle TX_CREATE, TX_MKDIR and TX_MKXATTR
- * transactions.
- */
-void
-zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, znode_t *zp, char *name)
-{
- itx_t *itx;
- uint64_t seq;
- lr_create_t *lr;
- size_t namesize = strlen(name) + 1;
-
- if (zilog == NULL)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
- lr = (lr_create_t *)&itx->itx_lr;
- lr->lr_doid = dzp->z_id;
- lr->lr_foid = zp->z_id;
- lr->lr_mode = zp->z_phys->zp_mode;
- lr->lr_uid = zp->z_phys->zp_uid;
- lr->lr_gid = zp->z_phys->zp_gid;
- lr->lr_gen = zp->z_phys->zp_gen;
- lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
- lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
- lr->lr_rdev = zp->z_phys->zp_rdev;
- bcopy(name, (char *)(lr + 1), namesize);
-
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
- zp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions.
- */
-void
-zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, char *name)
-{
- itx_t *itx;
- uint64_t seq;
- lr_remove_t *lr;
- size_t namesize = strlen(name) + 1;
-
- if (zilog == NULL)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
- lr = (lr_remove_t *)&itx->itx_lr;
- lr->lr_doid = dzp->z_id;
- bcopy(name, (char *)(lr + 1), namesize);
-
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_link() handles TX_LINK transactions.
- */
-void
-zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, znode_t *zp, char *name)
-{
- itx_t *itx;
- uint64_t seq;
- lr_link_t *lr;
- size_t namesize = strlen(name) + 1;
-
- if (zilog == NULL)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
- lr = (lr_link_t *)&itx->itx_lr;
- lr->lr_doid = dzp->z_id;
- lr->lr_link_obj = zp->z_id;
- bcopy(name, (char *)(lr + 1), namesize);
-
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
- zp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_symlink() handles TX_SYMLINK transactions.
- */
-void
-zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, znode_t *zp, char *name, char *link)
-{
- itx_t *itx;
- uint64_t seq;
- lr_create_t *lr;
- size_t namesize = strlen(name) + 1;
- size_t linksize = strlen(link) + 1;
-
- if (zilog == NULL)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
- lr = (lr_create_t *)&itx->itx_lr;
- lr->lr_doid = dzp->z_id;
- lr->lr_foid = zp->z_id;
- lr->lr_mode = zp->z_phys->zp_mode;
- lr->lr_uid = zp->z_phys->zp_uid;
- lr->lr_gid = zp->z_phys->zp_gid;
- lr->lr_gen = zp->z_phys->zp_gen;
- lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
- lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
- bcopy(name, (char *)(lr + 1), namesize);
- bcopy(link, (char *)(lr + 1) + namesize, linksize);
-
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
- zp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_rename() handles TX_RENAME transactions.
- */
-void
-zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
-{
- itx_t *itx;
- uint64_t seq;
- lr_rename_t *lr;
- size_t snamesize = strlen(sname) + 1;
- size_t dnamesize = strlen(dname) + 1;
-
- if (zilog == NULL)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
- lr = (lr_rename_t *)&itx->itx_lr;
- lr->lr_sdoid = sdzp->z_id;
- lr->lr_tdoid = tdzp->z_id;
- bcopy(sname, (char *)(lr + 1), snamesize);
- bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
-
- seq = zil_itx_assign(zilog, itx, tx);
- sdzp->z_last_itx = seq;
- tdzp->z_last_itx = seq;
- szp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_write() handles TX_WRITE transactions.
- */
-ssize_t zfs_immediate_write_sz = 32768;
-
-void
-zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, offset_t off, ssize_t len, int ioflag)
-{
- itx_t *itx;
- uint64_t seq;
- lr_write_t *lr;
- itx_wr_state_t write_state;
- int err;
-
- if (zilog == NULL || zp->z_unlinked)
- return;
-
- /*
- * Writes are handled in three different ways:
- *
- * WR_INDIRECT:
- * If the write is greater than zfs_immediate_write_sz then,
- * later, *if* we need to log the write, dmu_sync() is used to
- * write the block immediately and its block pointer is put in
- * the log record.
- * WR_COPIED:
- * If we know we'll immediately be committing the
- * transaction (FDSYNC (O_DSYNC)), then we allocate a larger
- * log record here for the data and copy the data in.
- * WR_NEED_COPY:
- * Otherwise we don't allocate a buffer, and *if* we need to
- * flush the write later then a buffer is allocated and
- * we retrieve the data using the dmu.
- *
- * For example, with the default 32K threshold an 80K write is
- * logged WR_INDIRECT regardless of O_DSYNC, an 8K O_DSYNC
- * write WR_COPIED, and an 8K asynchronous write WR_NEED_COPY.
- */
- if (len > zfs_immediate_write_sz)
- write_state = WR_INDIRECT;
- else if (ioflag & FDSYNC)
- write_state = WR_COPIED;
- else
- write_state = WR_NEED_COPY;
-
- itx = zil_itx_create(txtype, sizeof (*lr) +
- (write_state == WR_COPIED ? len : 0));
- lr = (lr_write_t *)&itx->itx_lr;
- if (write_state == WR_COPIED) {
- err = dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1);
- if (err) {
- kmem_free(itx, offsetof(itx_t, itx_lr) +
- itx->itx_lr.lrc_reclen);
- itx = zil_itx_create(txtype, sizeof (*lr));
- lr = (lr_write_t *)&itx->itx_lr;
- write_state = WR_NEED_COPY;
- }
- }
-
- itx->itx_wr_state = write_state;
- lr->lr_foid = zp->z_id;
- lr->lr_offset = off;
- lr->lr_length = len;
- lr->lr_blkoff = 0;
- BP_ZERO(&lr->lr_blkptr);
-
- itx->itx_private = zp->z_zfsvfs;
-
- itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_truncate() handles TX_TRUNCATE transactions.
- */
-void
-zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, uint64_t off, uint64_t len)
-{
- itx_t *itx;
- uint64_t seq;
- lr_truncate_t *lr;
-
- if (zilog == NULL || zp->z_unlinked)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr));
- lr = (lr_truncate_t *)&itx->itx_lr;
- lr->lr_foid = zp->z_id;
- lr->lr_offset = off;
- lr->lr_length = len;
-
- itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_setattr() handles TX_SETATTR transactions.
- */
-void
-zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, vattr_t *vap, uint_t mask_applied)
-{
- itx_t *itx;
- uint64_t seq;
- lr_setattr_t *lr;
-
- if (zilog == NULL || zp->z_unlinked)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr));
- lr = (lr_setattr_t *)&itx->itx_lr;
- lr->lr_foid = zp->z_id;
- lr->lr_mask = (uint64_t)mask_applied;
- lr->lr_mode = (uint64_t)vap->va_mode;
- lr->lr_uid = (uint64_t)vap->va_uid;
- lr->lr_gid = (uint64_t)vap->va_gid;
- lr->lr_size = (uint64_t)vap->va_size;
- ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
- ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
-
- itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_acl() handles TX_ACL transactions.
- */
-void
-zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, int aclcnt, ace_t *z_ace)
-{
- itx_t *itx;
- uint64_t seq;
- lr_acl_t *lr;
-
- if (zilog == NULL || zp->z_unlinked)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + aclcnt * sizeof (ace_t));
- lr = (lr_acl_t *)&itx->itx_lr;
- lr->lr_foid = zp->z_id;
- lr->lr_aclcnt = (uint64_t)aclcnt;
- bcopy(z_ace, (ace_t *)(lr + 1), aclcnt * sizeof (ace_t));
-
- itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
deleted file mode 100644
index 2be3093..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
+++ /dev/null
@@ -1,430 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/cmn_err.h>
-#include <sys/kmem.h>
-#include <sys/file.h>
-#include <sys/fcntl.h>
-#include <sys/vfs.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/spa.h>
-#include <sys/zil.h>
-#include <sys/byteorder.h>
-#include <sys/stat.h>
-#include <sys/acl.h>
-#include <sys/atomic.h>
-#include <sys/cred.h>
-#include <sys/namei.h>
-
-/*
- * Functions to replay ZFS intent log (ZIL) records.
- * The functions are called through a function vector (zfs_replay_vector)
- * which is indexed by the transaction type.
- */
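
A sketch of how the vector at the bottom of this file is consumed; illustrative only, since the real walk over the log records is driven by the ZIL replay machinery:

    /*
     * Dispatch sketch:
     *
     *      uint64_t txtype = lr->lrc_txtype;
     *      if (txtype >= TX_MAX_TYPE)
     *              txtype = 0;     (routes to zfs_replay_error)
     *      error = zfs_replay_vector[txtype](zfsvfs, lr, byteswap);
     */
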
-
-static void
-zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
- uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
-{
- VATTR_NULL(vap);
- vap->va_mask = (uint_t)mask;
- vap->va_type = IFTOVT(mode);
- vap->va_mode = mode & MODEMASK;
- vap->va_uid = (uid_t)uid;
- vap->va_gid = (gid_t)gid;
- vap->va_rdev = zfs_cmpldev(rdev);
- vap->va_nodeid = nodeid;
-}
-
-/* ARGSUSED */
-static int
-zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
-{
- return (ENOTSUP);
-}
-
-static int
-zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
-{
- char *name = (char *)(lr + 1); /* name follows lr_create_t */
- char *link; /* symlink content follows name */
- znode_t *dzp;
- vnode_t *vp = NULL;
- vattr_t va;
- struct componentname cn;
- int error;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
- return (error);
-
- zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID,
- lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
-
- /*
- * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
- * eventually end up in zfs_mknode(), which assigns the object's
- * creation time and generation number. The generic VOP_CREATE()
- * doesn't have either concept, so we smuggle the values inside
- * the vattr's otherwise unused va_ctime and va_nblocks fields.
- */
- ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime);
- va.va_nblocks = lr->lr_gen;
-
- cn.cn_nameptr = name;
- cn.cn_cred = kcred;
- cn.cn_thread = curthread;
- cn.cn_flags = SAVENAME;
-
- vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
- switch ((int)lr->lr_common.lrc_txtype) {
- case TX_CREATE:
- error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &va);
- break;
- case TX_MKDIR:
- error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &va);
- break;
- case TX_MKXATTR:
- error = zfs_make_xattrdir(dzp, &va, &vp, kcred);
- break;
- case TX_SYMLINK:
- link = name + strlen(name) + 1;
- error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &va, link);
- break;
- default:
- error = ENOTSUP;
- }
- VOP_UNLOCK(ZTOV(dzp), 0);
-
- if (error == 0 && vp != NULL) {
- VOP_UNLOCK(vp, 0);
- VN_RELE(vp);
- }
-
- VN_RELE(ZTOV(dzp));
-
- return (error);
-}
-
-static int
-zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
-{
- char *name = (char *)(lr + 1); /* name follows lr_remove_t */
- znode_t *dzp;
- struct componentname cn;
- vnode_t *vp;
- int error;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
- return (error);
-
- bzero(&cn, sizeof(cn));
- cn.cn_nameptr = name;
- cn.cn_namelen = strlen(name);
- cn.cn_nameiop = DELETE;
- cn.cn_flags = ISLASTCN | SAVENAME;
- cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
- cn.cn_cred = kcred;
- cn.cn_thread = curthread;
- vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
- error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn);
- if (error != 0) {
- VOP_UNLOCK(ZTOV(dzp), 0);
- goto fail;
- }
-
- switch ((int)lr->lr_common.lrc_txtype) {
- case TX_REMOVE:
- error = VOP_REMOVE(ZTOV(dzp), vp, &cn);
- break;
- case TX_RMDIR:
- error = VOP_RMDIR(ZTOV(dzp), vp, &cn);
- break;
- default:
- error = ENOTSUP;
- }
- vput(vp);
- VOP_UNLOCK(ZTOV(dzp), 0);
-fail:
- VN_RELE(ZTOV(dzp));
-
- return (error);
-}
-
-static int
-zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
-{
- char *name = (char *)(lr + 1); /* name follows lr_link_t */
- znode_t *dzp, *zp;
- struct componentname cn;
- int error;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
- return (error);
-
- if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
- VN_RELE(ZTOV(dzp));
- return (error);
- }
-
- cn.cn_nameptr = name;
- cn.cn_cred = kcred;
- cn.cn_thread = curthread;
- cn.cn_flags = SAVENAME;
-
- vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
- vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
- error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn);
- VOP_UNLOCK(ZTOV(zp), 0);
- VOP_UNLOCK(ZTOV(dzp), 0);
-
- VN_RELE(ZTOV(zp));
- VN_RELE(ZTOV(dzp));
-
- return (error);
-}
-
-static int
-zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
-{
- char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
- char *tname = sname + strlen(sname) + 1;
- znode_t *sdzp, *tdzp;
- struct componentname scn, tcn;
- vnode_t *svp, *tvp;
- kthread_t *td = curthread;
- int error;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
- return (error);
-
- if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
- VN_RELE(ZTOV(sdzp));
- return (error);
- }
-
- svp = tvp = NULL;
-
- bzero(&scn, sizeof(scn));
- scn.cn_nameptr = sname;
- scn.cn_namelen = strlen(sname);
- scn.cn_nameiop = DELETE;
- scn.cn_flags = ISLASTCN | SAVENAME;
- scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
- scn.cn_cred = kcred;
- scn.cn_thread = td;
- vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY);
- error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn);
- VOP_UNLOCK(ZTOV(sdzp), 0);
- if (error != 0)
- goto fail;
- VOP_UNLOCK(svp, 0);
-
- bzero(&tcn, sizeof(tcn));
- tcn.cn_nameptr = tname;
- tcn.cn_namelen = strlen(tname);
- tcn.cn_nameiop = RENAME;
- tcn.cn_flags = ISLASTCN | SAVENAME;
- tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
- tcn.cn_cred = kcred;
- tcn.cn_thread = td;
- vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY);
- error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn);
- if (error == EJUSTRETURN)
- tvp = NULL;
- else if (error != 0) {
- VOP_UNLOCK(ZTOV(tdzp), 0);
- goto fail;
- }
-
- error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn);
- return (error);
-fail:
- if (svp != NULL)
- vrele(svp);
- if (tvp != NULL)
- vrele(tvp);
- VN_RELE(ZTOV(tdzp));
- VN_RELE(ZTOV(sdzp));
-
- return (error);
-}
-
-static int
-zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
-{
- char *data = (char *)(lr + 1); /* data follows lr_write_t */
- znode_t *zp;
- int error;
- ssize_t resid;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log writes out of order, it's possible the
- * file has been removed. In this case just drop the write
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
- return (error);
- }
-
- error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
- lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
-
- VN_RELE(ZTOV(zp));
-
- return (error);
-}
-
-static int
-zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
-{
-
- ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
- return (EOPNOTSUPP);
-}
-
-static int
-zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
-{
- znode_t *zp;
- vattr_t va;
- vnode_t *vp;
- int error;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log setattrs out of order, it's possible the
- * file has been removed. In this case just drop the setattr
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
- return (error);
- }
-
- zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode,
- lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
-
- va.va_size = lr->lr_size;
- ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime);
- ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime);
-
- vp = ZTOV(zp);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- error = VOP_SETATTR(vp, &va, kcred, curthread);
- VOP_UNLOCK(vp, 0);
- VN_RELE(vp);
-
- return (error);
-}
-
-static int
-zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
-{
- ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */
-#ifdef TODO
- vsecattr_t vsa;
-#endif
- znode_t *zp;
- int error;
-
- if (byteswap) {
- byteswap_uint64_array(lr, sizeof (*lr));
- zfs_ace_byteswap(ace, lr->lr_aclcnt);
- }
-
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log acls out of order, it's possible the
- * file has been removed. In this case just drop the acl
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
- return (error);
- }
-
-#ifdef TODO
- bzero(&vsa, sizeof (vsa));
- vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
- vsa.vsa_aclcnt = lr->lr_aclcnt;
- vsa.vsa_aclentp = ace;
-
- error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred);
-#else
- error = EOPNOTSUPP;
-#endif
-
- VN_RELE(ZTOV(zp));
-
- return (error);
-}
-
-/*
- * Callback vectors for replaying records
- */
-zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
- zfs_replay_error, /* 0 no such transaction type */
- zfs_replay_create, /* TX_CREATE */
- zfs_replay_create, /* TX_MKDIR */
- zfs_replay_create, /* TX_MKXATTR */
- zfs_replay_create, /* TX_SYMLINK */
- zfs_replay_remove, /* TX_REMOVE */
- zfs_replay_remove, /* TX_RMDIR */
- zfs_replay_link, /* TX_LINK */
- zfs_replay_rename, /* TX_RENAME */
- zfs_replay_write, /* TX_WRITE */
- zfs_replay_truncate, /* TX_TRUNCATE */
- zfs_replay_setattr, /* TX_SETATTR */
- zfs_replay_acl, /* TX_ACL */
-};
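-
-/*
- * Editorial sketch (not part of the original file): roughly how the ZIL
- * replay loop dispatches through zfs_replay_vector[] above.  The lr_t
- * header field (lrc_txtype) and BSWAP_64() are assumed from zil.h and
- * byteorder.h; the real dispatch lives in zil_replay() and is more
- * involved.
- */
-#if 0
-static int
-zfs_replay_dispatch(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
-{
-	uint64_t txtype;
-
-	txtype = byteswap ? BSWAP_64(lr->lrc_txtype) : lr->lrc_txtype;
-	if (txtype >= TX_MAX_TYPE)
-		txtype = 0;	/* routes to zfs_replay_error() */
-	return (zfs_replay_vector[txtype](zfsvfs, lr, byteswap));
-}
-#endif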
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
deleted file mode 100644
index 07ec0f6..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
+++ /dev/null
@@ -1,594 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * This file contains the code to implement file range locking in
- * ZFS, although there isn't much that is specific to ZFS (all that comes
- * to mind is support for growing the blocksize).
- *
- * Interface
- * ---------
- * Defined in zfs_rlock.h but essentially:
- * rl = zfs_range_lock(zp, off, len, lock_type);
- * zfs_range_unlock(rl);
- * zfs_range_reduce(rl, off, len);
- *
- * AVL tree
- * --------
- * An AVL tree is used to maintain the state of the existing ranges
- * that are locked for exclusive (writer) or shared (reader) use.
- * The starting range offset is used for searching and sorting the tree.
- *
- * Common case
- * -----------
- * The (hopefully) usual case is of no overlaps or contention for
- * locks. On entry to zfs_range_lock() an rl_t is allocated; the tree
- * is searched, no overlap is found, and *this* rl_t is placed in the tree.
- *
- * Overlaps/Reference counting/Proxy locks
- * ---------------------------------------
- * The avl code only allows one node at a particular offset. Also, it's very
- * inefficient to search through all previous entries looking for overlaps
- * (because the very 1st in the ordered list might be at offset 0 but
- * cover the whole file).
- * So this implementation uses reference counts and proxy range locks.
- * Firstly, only reader locks use reference counts and proxy locks,
- * because writer locks are exclusive.
- * When a reader lock overlaps with another then a proxy lock is created
- * for that range and replaces the original lock. If the overlap
- * is exact then the reference count of the proxy is simply incremented.
- * Otherwise, the proxy lock is split into smaller lock ranges and
- * new proxy locks are created for the non-overlapping ranges.
- * The reference counts are adjusted accordingly.
- * Meanwhile, the original lock is kept around (this is the caller's handle)
- * and its offset and length are used when releasing the lock.
- *
- * Thread coordination
- * -------------------
- * In order to make wakeups efficient and to ensure multiple continuous
- * readers on a range don't starve a writer for the same range lock,
- * two condition variables are allocated in each rl_t.
- * If a writer (or reader) can't get a range it initialises the writer
- * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
- * and waits on that cv. When a thread unlocks that range it wakes up all
- * writers then all readers before destroying the lock.
- *
- * Append mode writes
- * ------------------
- * Append mode writes need to lock a range at the end of a file.
- * The offset of the end of the file is determined under the
- * range locking mutex, and the lock type converted from RL_APPEND to
- * RL_WRITER and the range locked.
- *
- * Grow block handling
- * -------------------
- * ZFS supports multiple block sizes, currently up to 128K. The smallest
- * block size is used for the file, which is grown as needed. During this
- * growth all other writers and readers must be excluded.
- * So if the block size needs to be grown then the whole file is
- * exclusively locked, and later the caller reduces the lock
- * range to just the range to be written using zfs_range_reduce().
- */
-
-#include <sys/zfs_rlock.h>
-
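-/*
- * Editorial sketch (not part of the original file): typical use of the
- * interface documented above.  The znode_t *zp is assumed to come from
- * the caller; offsets and lengths are illustrative only.
- */
-#if 0
-static void
-zfs_rlock_usage_example(znode_t *zp)
-{
-	rl_t *rl;
-
-	/* Shared lock while reading 8K at offset 0. */
-	rl = zfs_range_lock(zp, 0, 8192, RL_READER);
-	/* ... copy the data out ... */
-	zfs_range_unlock(rl);
-
-	/*
-	 * Append-mode write: the actual offset is resolved under
-	 * z_range_lock and the lock is converted to RL_WRITER.
-	 */
-	rl = zfs_range_lock(zp, 0, 4096, RL_APPEND);
-	/* ... write at rl->r_off ... */
-	zfs_range_unlock(rl);
-}
-#endif
-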
-/*
- * Check if a write lock can be grabbed, or wait and recheck until available.
- */
-static void
-zfs_range_lock_writer(znode_t *zp, rl_t *new)
-{
- avl_tree_t *tree = &zp->z_range_avl;
- rl_t *rl;
- avl_index_t where;
- uint64_t end_size;
- uint64_t off = new->r_off;
- uint64_t len = new->r_len;
-
- for (;;) {
- /*
- * Range locking is also used by zvol and uses a
- * dummied up znode. However, for zvol, we don't need to
- * append or grow blocksize, and besides we don't have
- * a z_phys or z_zfsvfs - so skip that processing.
- *
- * Yes, this is ugly, and would be solved by not handling
- * grow or append in range lock code. If that was done then
- * we could make the range locking code generically available
- * to other non-zfs consumers.
- */
- if (zp->z_vnode) { /* caller is ZPL */
- /*
- * If in append mode pick up the current end of file.
- * This is done under z_range_lock to avoid races.
- */
- if (new->r_type == RL_APPEND)
- new->r_off = zp->z_phys->zp_size;
-
- /*
- * If we need to grow the block size then grab the whole
- * file range. This is also done under z_range_lock to
- * avoid races.
- */
- end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
- if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
- zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
- new->r_off = 0;
- new->r_len = UINT64_MAX;
- }
- }
-
- /*
- * First check for the usual case of no locks
- */
- if (avl_numnodes(tree) == 0) {
- new->r_type = RL_WRITER; /* convert to writer */
- avl_add(tree, new);
- return;
- }
-
- /*
- * Look for any locks in the range.
- */
- rl = avl_find(tree, new, &where);
- if (rl)
- goto wait; /* already locked at same offset */
-
- rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
- if (rl && (rl->r_off < new->r_off + new->r_len))
- goto wait;
-
- rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
- if (rl && rl->r_off + rl->r_len > new->r_off)
- goto wait;
-
- new->r_type = RL_WRITER; /* convert possible RL_APPEND */
- avl_insert(tree, new, where);
- return;
-wait:
- if (!rl->r_write_wanted) {
- cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
- rl->r_write_wanted = B_TRUE;
- }
- cv_wait(&rl->r_wr_cv, &zp->z_range_lock);
-
- /* reset to original */
- new->r_off = off;
- new->r_len = len;
- }
-}
-
-/*
- * If this is an original (non-proxy) lock then replace it by
- * a proxy and return the proxy.
- */
-static rl_t *
-zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
-{
- rl_t *proxy;
-
- if (rl->r_proxy)
- return (rl); /* already a proxy */
-
- ASSERT3U(rl->r_cnt, ==, 1);
- ASSERT(rl->r_write_wanted == B_FALSE);
- ASSERT(rl->r_read_wanted == B_FALSE);
- avl_remove(tree, rl);
- rl->r_cnt = 0;
-
- /* create a proxy range lock */
- proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- proxy->r_off = rl->r_off;
- proxy->r_len = rl->r_len;
- proxy->r_cnt = 1;
- proxy->r_type = RL_READER;
- proxy->r_proxy = B_TRUE;
- proxy->r_write_wanted = B_FALSE;
- proxy->r_read_wanted = B_FALSE;
- avl_add(tree, proxy);
-
- return (proxy);
-}
-
-/*
- * Split the range lock at the supplied offset
- * returning the *front* proxy.
- */
-static rl_t *
-zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
-{
- rl_t *front, *rear;
-
- ASSERT3U(rl->r_len, >, 1);
- ASSERT3U(off, >, rl->r_off);
- ASSERT3U(off, <, rl->r_off + rl->r_len);
- ASSERT(rl->r_write_wanted == B_FALSE);
- ASSERT(rl->r_read_wanted == B_FALSE);
-
- /* create the rear proxy range lock */
- rear = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- rear->r_off = off;
- rear->r_len = rl->r_off + rl->r_len - off;
- rear->r_cnt = rl->r_cnt;
- rear->r_type = RL_READER;
- rear->r_proxy = B_TRUE;
- rear->r_write_wanted = B_FALSE;
- rear->r_read_wanted = B_FALSE;
-
- front = zfs_range_proxify(tree, rl);
- front->r_len = off - rl->r_off;
-
- avl_insert_here(tree, rear, front, AVL_AFTER);
- return (front);
-}
-
-/*
- * Create and add a new proxy range lock for the supplied range.
- */
-static void
-zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
-{
- rl_t *rl;
-
- ASSERT(len);
- rl = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- rl->r_off = off;
- rl->r_len = len;
- rl->r_cnt = 1;
- rl->r_type = RL_READER;
- rl->r_proxy = B_TRUE;
- rl->r_write_wanted = B_FALSE;
- rl->r_read_wanted = B_FALSE;
- avl_add(tree, rl);
-}
-
-static void
-zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
-{
- rl_t *next;
- uint64_t off = new->r_off;
- uint64_t len = new->r_len;
-
- /*
- * prev arrives either:
- * - pointing to an entry at the same offset
- * - pointing to the entry with the closest previous offset whose
- * range may overlap with the new range
- * - null, if there were no ranges starting before the new one
- */
- if (prev) {
- if (prev->r_off + prev->r_len <= off) {
- prev = NULL;
- } else if (prev->r_off != off) {
- /*
- * convert to proxy if needed then
- * split this entry and bump ref count
- */
- prev = zfs_range_split(tree, prev, off);
- prev = AVL_NEXT(tree, prev); /* move to rear range */
- }
- }
- ASSERT((prev == NULL) || (prev->r_off == off));
-
- if (prev)
- next = prev;
- else
- next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
-
- if (next == NULL || off + len <= next->r_off) {
- /* no overlaps, use the original new rl_t in the tree */
- avl_insert(tree, new, where);
- return;
- }
-
- if (off < next->r_off) {
- /* Add a proxy for initial range before the overlap */
- zfs_range_new_proxy(tree, off, next->r_off - off);
- }
-
- new->r_cnt = 0; /* will use proxies in tree */
- /*
- * We now search forward through the ranges, until we go past the end
- * of the new range. For each entry we make it a proxy if it
-	 * isn't already, then bump its reference count. If there are any
-	 * gaps between the ranges then we create a new proxy range.
- */
- for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
- if (off + len <= next->r_off)
- break;
- if (prev && prev->r_off + prev->r_len < next->r_off) {
- /* there's a gap */
- ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
- zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
- next->r_off - (prev->r_off + prev->r_len));
- }
- if (off + len == next->r_off + next->r_len) {
- /* exact overlap with end */
- next = zfs_range_proxify(tree, next);
- next->r_cnt++;
- return;
- }
- if (off + len < next->r_off + next->r_len) {
- /* new range ends in the middle of this block */
- next = zfs_range_split(tree, next, off + len);
- next->r_cnt++;
- return;
- }
- ASSERT3U(off + len, >, next->r_off + next->r_len);
- next = zfs_range_proxify(tree, next);
- next->r_cnt++;
- }
-
- /* Add the remaining end range. */
- zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
- (off + len) - (prev->r_off + prev->r_len));
-}
-
-/*
- * Check if a reader lock can be grabbed, or wait and recheck until available.
- */
-static void
-zfs_range_lock_reader(znode_t *zp, rl_t *new)
-{
- avl_tree_t *tree = &zp->z_range_avl;
- rl_t *prev, *next;
- avl_index_t where;
- uint64_t off = new->r_off;
- uint64_t len = new->r_len;
-
- /*
- * Look for any writer locks in the range.
- */
-retry:
- prev = avl_find(tree, new, &where);
- if (prev == NULL)
- prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
-
- /*
- * Check the previous range for a writer lock overlap.
- */
- if (prev && (off < prev->r_off + prev->r_len)) {
- if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
- if (!prev->r_read_wanted) {
- cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
- prev->r_read_wanted = B_TRUE;
- }
- cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
- goto retry;
- }
- if (off + len < prev->r_off + prev->r_len)
- goto got_lock;
- }
-
- /*
-	 * Search through the following ranges to see if there's
-	 * any write lock overlap.
- */
- if (prev)
- next = AVL_NEXT(tree, prev);
- else
- next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
- for (; next; next = AVL_NEXT(tree, next)) {
- if (off + len <= next->r_off)
- goto got_lock;
- if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
- if (!next->r_read_wanted) {
- cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
- next->r_read_wanted = B_TRUE;
- }
- cv_wait(&next->r_rd_cv, &zp->z_range_lock);
- goto retry;
- }
- if (off + len <= next->r_off + next->r_len)
- goto got_lock;
- }
-
-got_lock:
- /*
- * Add the read lock, which may involve splitting existing
- * locks and bumping ref counts (r_cnt).
- */
- zfs_range_add_reader(tree, new, prev, where);
-}
-
-/*
- * Lock a range (offset, length) as either shared (RL_READER)
- * or exclusive (RL_WRITER). Returns the range lock structure
- * for later unlocking, or for reducing the range (if the entire file
- * was previously locked as RL_WRITER).
- */
-rl_t *
-zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
-{
- rl_t *new;
-
- ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
-
- new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- new->r_zp = zp;
- new->r_off = off;
- new->r_len = len;
- new->r_cnt = 1; /* assume it's going to be in the tree */
- new->r_type = type;
- new->r_proxy = B_FALSE;
- new->r_write_wanted = B_FALSE;
- new->r_read_wanted = B_FALSE;
-
- mutex_enter(&zp->z_range_lock);
- if (type == RL_READER) {
- /*
- * First check for the usual case of no locks
- */
- if (avl_numnodes(&zp->z_range_avl) == 0)
- avl_add(&zp->z_range_avl, new);
- else
- zfs_range_lock_reader(zp, new);
- } else
- zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */
- mutex_exit(&zp->z_range_lock);
- return (new);
-}
-
-/*
- * Unlock a reader lock
- */
-static void
-zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
-{
- avl_tree_t *tree = &zp->z_range_avl;
- rl_t *rl, *next;
- uint64_t len;
-
- /*
- * The common case is when the remove entry is in the tree
-	 * (cnt == 1), meaning there have been no other reader locks overlapping
- * with this one. Otherwise the remove entry will have been
- * removed from the tree and replaced by proxies (one or
- * more ranges mapping to the entire range).
- */
- if (remove->r_cnt == 1) {
- avl_remove(tree, remove);
- if (remove->r_write_wanted)
- cv_broadcast(&remove->r_wr_cv);
- if (remove->r_read_wanted)
- cv_broadcast(&remove->r_rd_cv);
- } else {
- ASSERT3U(remove->r_cnt, ==, 0);
- ASSERT3U(remove->r_write_wanted, ==, 0);
- ASSERT3U(remove->r_read_wanted, ==, 0);
- /*
- * Find start proxy representing this reader lock,
- * then decrement ref count on all proxies
- * that make up this range, freeing them as needed.
- */
- rl = avl_find(tree, remove, NULL);
- ASSERT(rl);
- ASSERT(rl->r_cnt);
- ASSERT(rl->r_type == RL_READER);
- for (len = remove->r_len; len != 0; rl = next) {
- len -= rl->r_len;
- if (len) {
- next = AVL_NEXT(tree, rl);
- ASSERT(next);
- ASSERT(rl->r_off + rl->r_len == next->r_off);
- ASSERT(next->r_cnt);
- ASSERT(next->r_type == RL_READER);
- }
- rl->r_cnt--;
- if (rl->r_cnt == 0) {
- avl_remove(tree, rl);
- if (rl->r_write_wanted)
- cv_broadcast(&rl->r_wr_cv);
- if (rl->r_read_wanted)
- cv_broadcast(&rl->r_rd_cv);
- kmem_free(rl, sizeof (rl_t));
- }
- }
- }
- kmem_free(remove, sizeof (rl_t));
-}
-
-/*
- * Unlock range and destroy range lock structure.
- */
-void
-zfs_range_unlock(rl_t *rl)
-{
- znode_t *zp = rl->r_zp;
-
- ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
- ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
- ASSERT(!rl->r_proxy);
-
- mutex_enter(&zp->z_range_lock);
- if (rl->r_type == RL_WRITER) {
- /* writer locks can't be shared or split */
- avl_remove(&zp->z_range_avl, rl);
- mutex_exit(&zp->z_range_lock);
- if (rl->r_write_wanted) {
- cv_broadcast(&rl->r_wr_cv);
- cv_destroy(&rl->r_wr_cv);
- }
- if (rl->r_read_wanted) {
- cv_broadcast(&rl->r_rd_cv);
- cv_destroy(&rl->r_rd_cv);
- }
- kmem_free(rl, sizeof (rl_t));
- } else {
- /*
- * lock may be shared, let zfs_range_unlock_reader()
- * release the lock and free the rl_t
- */
- zfs_range_unlock_reader(zp, rl);
- mutex_exit(&zp->z_range_lock);
- }
-}
-
-/*
- * Reduce range locked as RL_WRITER from whole file to specified range.
- * Asserts the whole file is exclusively locked and so there's only one
- * entry in the tree.
- */
-void
-zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
-{
- znode_t *zp = rl->r_zp;
-
- /* Ensure there are no other locks */
- ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
- ASSERT(rl->r_off == 0);
- ASSERT(rl->r_type == RL_WRITER);
- ASSERT(!rl->r_proxy);
- ASSERT3U(rl->r_len, ==, UINT64_MAX);
- ASSERT3U(rl->r_cnt, ==, 1);
-
- mutex_enter(&zp->z_range_lock);
- rl->r_off = off;
- rl->r_len = len;
- mutex_exit(&zp->z_range_lock);
- if (rl->r_write_wanted)
- cv_broadcast(&rl->r_wr_cv);
- if (rl->r_read_wanted)
- cv_broadcast(&rl->r_rd_cv);
-}
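-
-/*
- * Editorial sketch (not part of the original file): the grow-block pattern
- * described in the header comment.  A writer whose lock was expanded to
- * the whole file (r_len == UINT64_MAX) grows the block size and then
- * shrinks the lock back to the range actually being written; zp, off and
- * len are assumed from the caller.
- */
-#if 0
-	rl_t *rl;
-
-	rl = zfs_range_lock(zp, off, len, RL_WRITER);
-	if (rl->r_len == UINT64_MAX) {
-		/* ... grow zp's block size under the whole-file lock ... */
-		zfs_range_reduce(rl, off, len);
-	}
-	/* ... dmu_write() the range ... */
-	zfs_range_unlock(rl);
-#endif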
-
-/*
- * AVL comparison function used to order range locks
- * Locks are ordered on the start offset of the range.
- */
-int
-zfs_range_compare(const void *arg1, const void *arg2)
-{
- const rl_t *rl1 = arg1;
- const rl_t *rl2 = arg2;
-
- if (rl1->r_off > rl2->r_off)
- return (1);
- if (rl1->r_off < rl2->r_off)
- return (-1);
- return (0);
-}
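-
-/*
- * Editorial sketch (not part of the original file): how the comparator
- * above is wired up when a znode's range-lock tree is initialized.  The
- * avl node field name (r_node) is assumed from zfs_rlock.h.
- */
-#if 0
-	avl_create(&zp->z_range_avl, zfs_range_compare,
-	    sizeof (rl_t), offsetof(rl_t, r_node));
-#endif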
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
deleted file mode 100644
index 28f3293..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ /dev/null
@@ -1,1021 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/sysmacros.h>
-#include <sys/kmem.h>
-#include <sys/acl.h>
-#include <sys/vnode.h>
-#include <sys/vfs.h>
-#include <sys/mntent.h>
-#include <sys/mount.h>
-#include <sys/cmn_err.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_dir.h>
-#include <sys/zil.h>
-#include <sys/fs/zfs.h>
-#include <sys/dmu.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_dataset.h>
-#include <sys/spa.h>
-#include <sys/zap.h>
-#include <sys/varargs.h>
-#include <sys/policy.h>
-#include <sys/atomic.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/sunddi.h>
-#include <sys/dnlc.h>
-
-struct mtx zfs_debug_mtx;
-MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
-SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
-int zfs_debug_level = 0;
-TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
- "Debug level");
-
-static int zfs_mount(vfs_t *vfsp, kthread_t *td);
-static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td);
-static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td);
-static int zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td);
-static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
-static int zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td);
-static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
-static void zfs_objset_close(zfsvfs_t *zfsvfs);
-static void zfs_freevfs(vfs_t *vfsp);
-
-static struct vfsops zfs_vfsops = {
- .vfs_mount = zfs_mount,
- .vfs_unmount = zfs_umount,
- .vfs_root = zfs_root,
- .vfs_statfs = zfs_statfs,
- .vfs_vget = zfs_vget,
- .vfs_sync = zfs_sync,
- .vfs_fhtovp = zfs_fhtovp,
-};
-
-VFS_SET(zfs_vfsops, zfs, VFCF_JAIL);
-
-/*
- * We need to keep a count of active fs's.
- * This is necessary to prevent our module
- * from being unloaded after a umount -f.
- */
-static uint32_t zfs_active_fs_count = 0;
-
-/*ARGSUSED*/
-static int
-zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td)
-{
-
- /*
- * Data integrity is job one. We don't want a compromised kernel
- * writing to the storage pool, so we never sync during panic.
- */
- if (panicstr)
- return (0);
-
- if (vfsp != NULL) {
- /*
- * Sync a specific filesystem.
- */
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- int error;
-
- error = vfs_stdsync(vfsp, waitfor, td);
- if (error != 0)
- return (error);
-
- ZFS_ENTER(zfsvfs);
- if (zfsvfs->z_log != NULL)
- zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
- else
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
- ZFS_EXIT(zfsvfs);
- } else {
- /*
- * Sync all ZFS filesystems. This is what happens when you
- * run sync(1M). Unlike other filesystems, ZFS honors the
- * request by waiting for all pools to commit all dirty data.
- */
- spa_sync_allpools();
- }
-
- return (0);
-}
-
-static void
-atime_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- if (newval == TRUE) {
- zfsvfs->z_atime = TRUE;
- zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
- } else {
- zfsvfs->z_atime = FALSE;
- zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
- }
-}
-
-static void
-xattr_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- if (newval == TRUE) {
- /* XXX locking on vfs_flag? */
-#ifdef TODO
- zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
-#endif
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
- } else {
- /* XXX locking on vfs_flag? */
-#ifdef TODO
- zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
-#endif
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
- }
-}
-
-static void
-blksz_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- if (newval < SPA_MINBLOCKSIZE ||
- newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
- newval = SPA_MAXBLOCKSIZE;
-
- zfsvfs->z_max_blksz = newval;
- zfsvfs->z_vfs->vfs_bsize = newval;
-}
-
-static void
-readonly_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- if (newval) {
- /* XXX locking on vfs_flag? */
- zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
- } else {
- /* XXX locking on vfs_flag? */
- zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
- }
-}
-
-static void
-setuid_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- if (newval == FALSE) {
- zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
- } else {
- zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
- }
-}
-
-static void
-exec_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- if (newval == FALSE) {
- zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
- } else {
- zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
- vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
- vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
- }
-}
-
-static void
-snapdir_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- zfsvfs->z_show_ctldir = newval;
-}
-
-static void
-acl_mode_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- zfsvfs->z_acl_mode = newval;
-}
-
-static void
-acl_inherit_changed_cb(void *arg, uint64_t newval)
-{
- zfsvfs_t *zfsvfs = arg;
-
- zfsvfs->z_acl_inherit = newval;
-}
-
-static int
-zfs_refresh_properties(vfs_t *vfsp)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
-
- /*
- * Remount operations default to "rw" unless "ro" is explicitly
- * specified.
- */
- if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
- readonly_changed_cb(zfsvfs, B_TRUE);
- } else {
- if (!dmu_objset_is_snapshot(zfsvfs->z_os))
- readonly_changed_cb(zfsvfs, B_FALSE);
- else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
- return (EROFS);
- }
-
- if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
- setuid_changed_cb(zfsvfs, B_FALSE);
- } else {
- if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
- setuid_changed_cb(zfsvfs, B_FALSE);
- else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
- setuid_changed_cb(zfsvfs, B_TRUE);
- }
-
- if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
- exec_changed_cb(zfsvfs, B_FALSE);
- else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
- exec_changed_cb(zfsvfs, B_TRUE);
-
- if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
- atime_changed_cb(zfsvfs, B_TRUE);
- else if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
- atime_changed_cb(zfsvfs, B_FALSE);
-
- if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
- xattr_changed_cb(zfsvfs, B_TRUE);
- else if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL))
- xattr_changed_cb(zfsvfs, B_FALSE);
-
- return (0);
-}
-
-static int
-zfs_register_callbacks(vfs_t *vfsp)
-{
- struct dsl_dataset *ds = NULL;
- objset_t *os = NULL;
- zfsvfs_t *zfsvfs = NULL;
- int readonly, do_readonly = FALSE;
- int setuid, do_setuid = FALSE;
- int exec, do_exec = FALSE;
- int xattr, do_xattr = FALSE;
- int error = 0;
-
- ASSERT(vfsp);
- zfsvfs = vfsp->vfs_data;
- ASSERT(zfsvfs);
- os = zfsvfs->z_os;
-
- /*
- * The act of registering our callbacks will destroy any mount
- * options we may have. In order to enable temporary overrides
- * of mount options, we stash away the current values and
- * restore them after we register the callbacks.
- */
- if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
- readonly = B_TRUE;
- do_readonly = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
- readonly = B_FALSE;
- do_readonly = B_TRUE;
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
- setuid = B_FALSE;
- do_setuid = B_TRUE;
- } else {
- if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
- setuid = B_FALSE;
- do_setuid = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
- setuid = B_TRUE;
- do_setuid = B_TRUE;
- }
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
- exec = B_FALSE;
- do_exec = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
- exec = B_TRUE;
- do_exec = B_TRUE;
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
- xattr = B_FALSE;
- do_xattr = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
- xattr = B_TRUE;
- do_xattr = B_TRUE;
- }
-
- /*
- * Register property callbacks.
- *
- * It would probably be fine to just check for i/o error from
- * the first prop_register(), but I guess I like to go
- * overboard...
- */
- ds = dmu_objset_ds(os);
- error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "xattr", xattr_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "recordsize", blksz_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "readonly", readonly_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "setuid", setuid_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "exec", exec_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "snapdir", snapdir_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "aclmode", acl_mode_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "aclinherit", acl_inherit_changed_cb, zfsvfs);
- if (error)
- goto unregister;
-
- /*
- * Invoke our callbacks to restore temporary mount options.
- */
- if (do_readonly)
- readonly_changed_cb(zfsvfs, readonly);
- if (do_setuid)
- setuid_changed_cb(zfsvfs, setuid);
- if (do_exec)
- exec_changed_cb(zfsvfs, exec);
- if (do_xattr)
- xattr_changed_cb(zfsvfs, xattr);
-
- return (0);
-
-unregister:
- /*
- * We may attempt to unregister some callbacks that are not
- * registered, but this is OK; it will simply return ENOMSG,
- * which we will ignore.
- */
- (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
- zfsvfs);
- return (error);
-}
-
-static int
-zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td)
-{
- cred_t *cr = td->td_ucred;
- uint64_t recordsize, readonly;
- int error = 0;
- int mode;
- zfsvfs_t *zfsvfs;
- znode_t *zp = NULL;
-
- ASSERT(vfsp);
- ASSERT(osname);
-
- /*
- * Initialize the zfs-specific filesystem structure.
- * Should probably make this a kmem cache, shuffle fields,
- * and just bzero up to z_hold_mtx[].
- */
- zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
- zfsvfs->z_vfs = vfsp;
- zfsvfs->z_parent = zfsvfs;
- zfsvfs->z_assign = TXG_NOWAIT;
- zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
- zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
-
- mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
- offsetof(znode_t, z_link_node));
- rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL);
-
- if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
- NULL))
- goto out;
- zfsvfs->z_vfs->vfs_bsize = recordsize;
-
- vfsp->vfs_data = zfsvfs;
- vfsp->mnt_flag |= MNT_LOCAL;
- vfsp->mnt_kern_flag |= MNTK_MPSAFE;
- vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
-
- if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
- goto out;
-
- if (readonly)
- mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
- else
- mode = DS_MODE_PRIMARY;
-
- error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
- if (error == EROFS) {
- mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
- error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
- &zfsvfs->z_os);
- }
-
- if (error)
- goto out;
-
- if (error = zfs_init_fs(zfsvfs, &zp, cr))
- goto out;
-
- if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
- uint64_t xattr;
-
- ASSERT(mode & DS_MODE_READONLY);
- atime_changed_cb(zfsvfs, B_FALSE);
- readonly_changed_cb(zfsvfs, B_TRUE);
- if (error = dsl_prop_get_integer(osname, "xattr", &xattr, NULL))
- goto out;
- xattr_changed_cb(zfsvfs, xattr);
- zfsvfs->z_issnap = B_TRUE;
- } else {
- error = zfs_register_callbacks(vfsp);
- if (error)
- goto out;
-
- zfs_unlinked_drain(zfsvfs);
-
- /*
- * Parse and replay the intent log.
- */
- zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
- zfs_replay_vector);
-
- if (!zil_disable)
- zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
- }
-
- vfs_mountedfrom(vfsp, osname);
-
- if (!zfsvfs->z_issnap)
- zfsctl_create(zfsvfs);
-out:
- if (error) {
- if (zfsvfs->z_os)
- dmu_objset_close(zfsvfs->z_os);
- rw_destroy(&zfsvfs->z_um_lock);
- mutex_destroy(&zfsvfs->z_znodes_lock);
- kmem_free(zfsvfs, sizeof (zfsvfs_t));
- } else {
- atomic_add_32(&zfs_active_fs_count, 1);
- }
-
- return (error);
-}
-
-void
-zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
-{
- objset_t *os = zfsvfs->z_os;
- struct dsl_dataset *ds;
-
- /*
- * Unregister properties.
- */
- if (!dmu_objset_is_snapshot(os)) {
- ds = dmu_objset_ds(os);
- VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "aclinherit",
- acl_inherit_changed_cb, zfsvfs) == 0);
- }
-}
-
-/*ARGSUSED*/
-static int
-zfs_mount(vfs_t *vfsp, kthread_t *td)
-{
- char *from;
- int error;
-
- /*
- * When doing a remount, we simply refresh our temporary properties
- * according to those options set in the current VFS options.
- */
- if (vfsp->vfs_flag & MS_REMOUNT)
- return (zfs_refresh_properties(vfsp));
-
- if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&from, NULL))
- return (EINVAL);
-
- DROP_GIANT();
- error = zfs_domount(vfsp, from, td);
- PICKUP_GIANT();
- return (error);
-}
-
-static int
-zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- uint64_t refdbytes, availbytes, usedobjs, availobjs;
-
- statp->f_version = STATFS_VERSION;
-
- ZFS_ENTER(zfsvfs);
-
- dmu_objset_space(zfsvfs->z_os,
- &refdbytes, &availbytes, &usedobjs, &availobjs);
-
- /*
- * The underlying storage pool actually uses multiple block sizes.
- * We report the fragsize as the smallest block size we support,
- * and we report our blocksize as the filesystem's maximum blocksize.
- */
- statp->f_bsize = zfsvfs->z_vfs->vfs_bsize;
- statp->f_iosize = zfsvfs->z_vfs->vfs_bsize;
-
- /*
-	 * The following fields report "total" blocks of various kinds in
-	 * the file system, in terms of f_frsize - the
- * "fragment" size.
- */
-
- statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize;
- statp->f_bfree = availbytes / statp->f_bsize;
- statp->f_bavail = statp->f_bfree; /* no root reservation */
-
- /*
- * statvfs() should really be called statufs(), because it assumes
- * static metadata. ZFS doesn't preallocate files, so the best
- * we can do is report the max that could possibly fit in f_files,
- * and that minus the number actually used in f_ffree.
-	 * For f_ffree, report the smaller of the number of objects available
- * and the number of blocks (each object will take at least a block).
- */
- statp->f_ffree = MIN(availobjs, statp->f_bfree);
- statp->f_files = statp->f_ffree + usedobjs;
-
- /*
- * We're a zfs filesystem.
- */
- (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
-
- strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
- sizeof(statp->f_mntfromname));
- strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
- sizeof(statp->f_mntonname));
-
- statp->f_namemax = ZFS_MAXNAMELEN;
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-static int
-zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- znode_t *rootzp;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
- error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
- if (error == 0) {
- *vpp = ZTOV(rootzp);
- error = vn_lock(*vpp, flags);
- (*vpp)->v_vflag |= VV_ROOT;
- }
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*ARGSUSED*/
-static int
-zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- cred_t *cr = td->td_ucred;
- int ret;
-
- if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0)
- return (ret);
-
- (void) dnlc_purge_vfsp(vfsp, 0);
-
- /*
- * Unmount any snapshots mounted under .zfs before unmounting the
- * dataset itself.
- */
- if (zfsvfs->z_ctldir != NULL) {
- if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
- return (ret);
- ret = vflush(vfsp, 0, 0, td);
- ASSERT(ret == EBUSY);
- if (!(fflag & MS_FORCE)) {
- if (zfsvfs->z_ctldir->v_count > 1)
- return (EBUSY);
- ASSERT(zfsvfs->z_ctldir->v_count == 1);
- }
- zfsctl_destroy(zfsvfs);
- ASSERT(zfsvfs->z_ctldir == NULL);
- }
-
- /*
- * Flush all the files.
- */
- ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
- if (ret != 0) {
- if (!zfsvfs->z_issnap) {
- zfsctl_create(zfsvfs);
- ASSERT(zfsvfs->z_ctldir != NULL);
- }
- return (ret);
- }
-
- if (fflag & MS_FORCE) {
- MNT_ILOCK(vfsp);
- vfsp->mnt_kern_flag |= MNTK_UNMOUNTF;
- MNT_IUNLOCK(vfsp);
- zfsvfs->z_unmounted1 = B_TRUE;
-
- /*
- * Wait for all zfs threads to leave zfs.
- * Grabbing a rwlock as reader in all vops and
-		 * as writer here doesn't work because it is too easy to get
- * multiple reader enters as zfs can re-enter itself.
- * This can lead to deadlock if there is an intervening
- * rw_enter as writer.
-		 * So a ref count of file system threads (z_op_cnt) is used.
- * A polling loop on z_op_cnt may seem inefficient, but
- * - this saves all threads on exit from having to grab a
- * mutex in order to cv_signal
- * - only occurs on forced unmount in the rare case when
- * there are outstanding threads within the file system.
- */
- while (zfsvfs->z_op_cnt) {
- delay(1);
- }
- }
-
- zfs_objset_close(zfsvfs);
- VFS_RELE(vfsp);
- zfs_freevfs(vfsp);
-
- return (0);
-}
-
-static int
-zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- znode_t *zp;
- int err;
-
- ZFS_ENTER(zfsvfs);
- err = zfs_zget(zfsvfs, ino, &zp);
- if (err == 0 && zp->z_unlinked) {
- VN_RELE(ZTOV(zp));
- err = EINVAL;
- }
- if (err != 0)
- *vpp = NULL;
- else {
- *vpp = ZTOV(zp);
- vn_lock(*vpp, flags);
- }
- ZFS_EXIT(zfsvfs);
- return (err);
-}
-
-static int
-zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
-{
- kthread_t *td = curthread;
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- znode_t *zp;
- uint64_t object = 0;
- uint64_t fid_gen = 0;
- uint64_t gen_mask;
- uint64_t zp_gen;
- int i, err;
-
- *vpp = NULL;
-
- ZFS_ENTER(zfsvfs);
-
- if (fidp->fid_len == LONG_FID_LEN) {
- zfid_long_t *zlfid = (zfid_long_t *)fidp;
- uint64_t objsetid = 0;
- uint64_t setgen = 0;
-
- for (i = 0; i < sizeof (zlfid->zf_setid); i++)
- objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
-
- for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
- setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
-
- ZFS_EXIT(zfsvfs);
-
- err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
- if (err)
- return (EINVAL);
- ZFS_ENTER(zfsvfs);
- }
-
- if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
- zfid_short_t *zfid = (zfid_short_t *)fidp;
-
- for (i = 0; i < sizeof (zfid->zf_object); i++)
- object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
-
- for (i = 0; i < sizeof (zfid->zf_gen); i++)
- fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
- } else {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /* A zero fid_gen means we are in the .zfs control directories */
- if (fid_gen == 0 &&
- (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
- *vpp = zfsvfs->z_ctldir;
- ASSERT(*vpp != NULL);
- if (object == ZFSCTL_INO_SNAPDIR) {
- VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
- 0, NULL, NULL) == 0);
- } else {
- VN_HOLD(*vpp);
- }
- ZFS_EXIT(zfsvfs);
- /* XXX: LK_RETRY? */
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- return (0);
- }
-
- gen_mask = -1ULL >> (64 - 8 * i);
-
- dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
- if (err = zfs_zget(zfsvfs, object, &zp)) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- zp_gen = zp->z_phys->zp_gen & gen_mask;
- if (zp_gen == 0)
- zp_gen = 1;
- if (zp->z_unlinked || zp_gen != fid_gen) {
- dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
- VN_RELE(ZTOV(zp));
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- *vpp = ZTOV(zp);
- /* XXX: LK_RETRY? */
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- vnode_create_vobject(*vpp, zp->z_phys->zp_size, td);
- ZFS_EXIT(zfsvfs);
- return (0);
-}
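-
-/*
- * Editorial sketch (not part of the original file): the encode side that
- * the byte-by-byte loops above reverse, as done (modulo details) by
- * zfs_fid() in zfs_znode.c.  Shown for the short-fid object field only.
- */
-#if 0
-	for (i = 0; i < sizeof (zfid->zf_object); i++)
-		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
-#endif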
-
-static void
-zfs_objset_close(zfsvfs_t *zfsvfs)
-{
- znode_t *zp, *nextzp;
- objset_t *os = zfsvfs->z_os;
-
- /*
-	 * For forced unmount, at this point all vops except zfs_inactive
-	 * are returning EIO. We now need to suspend zfs_inactive threads
-	 * while we are freeing dbufs, before switching zfs_inactive
-	 * to its behaviour without an objset.
- */
- rw_enter(&zfsvfs->z_um_lock, RW_WRITER);
-
- /*
-	 * Release all holds on dbufs.
-	 * Note, although we have stopped all other vop threads and
-	 * zfs_inactive(), the dmu can call back via znode_pageout_func(),
-	 * which can zfs_znode_free() the znode.
-	 * So we lock z_all_znodes; search the list for a held
-	 * dbuf; drop the lock (we know zp can't disappear while we hold
-	 * a dbuf lock); then regrab the lock and restart.
- */
- mutex_enter(&zfsvfs->z_znodes_lock);
- for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
- nextzp = list_next(&zfsvfs->z_all_znodes, zp);
- if (zp->z_dbuf_held) {
- /* dbufs should only be held when force unmounting */
- zp->z_dbuf_held = 0;
- mutex_exit(&zfsvfs->z_znodes_lock);
- dmu_buf_rele(zp->z_dbuf, NULL);
- /* Start again */
- mutex_enter(&zfsvfs->z_znodes_lock);
- nextzp = list_head(&zfsvfs->z_all_znodes);
- }
- }
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- /*
- * Unregister properties.
- */
- if (!dmu_objset_is_snapshot(os))
- zfs_unregister_callbacks(zfsvfs);
-
- /*
- * Switch zfs_inactive to behaviour without an objset.
- * It just tosses cached pages and frees the znode & vnode.
- * Then re-enable zfs_inactive threads in that new behaviour.
- */
- zfsvfs->z_unmounted2 = B_TRUE;
- rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */
-
- /*
- * Close the zil. Can't close the zil while zfs_inactive
- * threads are blocked as zil_close can call zfs_inactive.
- */
- if (zfsvfs->z_log) {
- zil_close(zfsvfs->z_log);
- zfsvfs->z_log = NULL;
- }
-
- /*
- * Evict all dbufs so that cached znodes will be freed
- */
- if (dmu_objset_evict_dbufs(os, 1)) {
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
- (void) dmu_objset_evict_dbufs(os, 0);
- }
-
- /*
- * Finally close the objset
- */
- dmu_objset_close(os);
-}
-
-static void
-zfs_freevfs(vfs_t *vfsp)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- int i;
-
- for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
- mutex_destroy(&zfsvfs->z_hold_mtx[i]);
- rw_destroy(&zfsvfs->z_um_lock);
- mutex_destroy(&zfsvfs->z_znodes_lock);
- kmem_free(zfsvfs, sizeof (zfsvfs_t));
-
- atomic_add_32(&zfs_active_fs_count, -1);
-}
-
-#ifdef __i386__
-static int desiredvnodes_backup;
-#endif
-
-static void
-zfs_vnodes_adjust(void)
-{
-#ifdef __i386__
- int val;
-
- desiredvnodes_backup = desiredvnodes;
-
- /*
- * We calculate newdesiredvnodes the same way it is done in
- * vntblinit(). If it is equal to desiredvnodes, it means that
- * it wasn't tuned by the administrator and we can tune it down.
- */
- val = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
- (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
- if (desiredvnodes == val)
- desiredvnodes = (3 * desiredvnodes) / 4;
-#endif
-}
-
-static void
-zfs_vnodes_adjust_back(void)
-{
-
-#ifdef __i386__
- desiredvnodes = desiredvnodes_backup;
-#endif
-}
-
-void
-zfs_init(void)
-{
-
- printf("ZFS filesystem version " ZFS_VERSION_STRING "\n");
-
- /*
- * Initialize .zfs directory structures
- */
- zfsctl_init();
-
- /*
- * Initialize znode cache, vnode ops, etc...
- */
- zfs_znode_init();
-
- /*
-	 * Reduce the number of vnodes. Originally the number of vnodes is
-	 * calculated with UFS inodes in mind. We reduce it here, because
-	 * it's too big for ZFS/i386.
- */
- zfs_vnodes_adjust();
-}
-
-void
-zfs_fini(void)
-{
- zfsctl_fini();
- zfs_znode_fini();
- zfs_vnodes_adjust_back();
-}
-
-int
-zfs_busy(void)
-{
- return (zfs_active_fs_count != 0);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
deleted file mode 100644
index 088103a..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ /dev/null
@@ -1,3623 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Portions Copyright 2007 Jeremy Teo */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/resource.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/stat.h>
-#include <sys/kmem.h>
-#include <sys/taskq.h>
-#include <sys/uio.h>
-#include <sys/atomic.h>
-#include <sys/namei.h>
-#include <sys/mman.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/unistd.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/fs/zfs.h>
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/dbuf.h>
-#include <sys/zap.h>
-#include <sys/dirent.h>
-#include <sys/policy.h>
-#include <sys/sunddi.h>
-#include <sys/filio.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/dnlc.h>
-#include <sys/zfs_rlock.h>
-#include <sys/bio.h>
-#include <sys/buf.h>
-#include <sys/sf_buf.h>
-#include <sys/sched.h>
-
-/*
- * Programming rules.
- *
- * Each vnode op performs some logical unit of work. To do this, the ZPL must
- * properly lock its in-core state, create a DMU transaction, do the work,
- * record this work in the intent log (ZIL), commit the DMU transaction,
- * and wait for the intent log to commit if it is a synchronous operation.
- * Moreover, the vnode ops must work in both normal and log replay context.
- * The ordering of events is important to avoid deadlocks and references
- * to freed memory. The example below illustrates the following Big Rules:
- *
- * (1) A check must be made in each zfs thread for a mounted file system.
- *	This is done, avoiding races, using ZFS_ENTER(zfsvfs).
- * A ZFS_EXIT(zfsvfs) is needed before all returns.
- *
- * (2) VN_RELE() should always be the last thing except for zil_commit()
- * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
- * First, if it's the last reference, the vnode/znode
- * can be freed, so the zp may point to freed memory. Second, the last
- * reference will call zfs_zinactive(), which may induce a lot of work --
- * pushing cached pages (which acquires range locks) and syncing out
- * cached atime changes. Third, zfs_zinactive() may require a new tx,
- * which could deadlock the system if you were already holding one.
- *
- * (3) All range locks must be grabbed before calling dmu_tx_assign(),
- * as they can span dmu_tx_assign() calls.
- *
- * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
- * In normal operation, this will be TXG_NOWAIT. During ZIL replay,
- * it will be a specific txg. Either way, dmu_tx_assign() never blocks.
- * This is critical because we don't want to block while holding locks.
- * Note, in particular, that if a lock is sometimes acquired before
- * the tx assigns, and sometimes after (e.g. z_lock), then failing to
- * use a non-blocking assign can deadlock the system. The scenario:
- *
- * Thread A has grabbed a lock before calling dmu_tx_assign().
- * Thread B is in an already-assigned tx, and blocks for this lock.
- * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
- * forever, because the previous txg can't quiesce until B's tx commits.
- *
- * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
- * then drop all locks, call dmu_tx_wait(), and try again.
- *
- * (5) If the operation succeeded, generate the intent log entry for it
- * before dropping locks. This ensures that the ordering of events
- * in the intent log matches the order in which they actually occurred.
- *
- * (6) At the end of each vnode op, the DMU tx must always commit,
- * regardless of whether there were any errors.
- *
- * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid)
- * to ensure that synchronous semantics are provided when necessary.
- *
- * In general, this is how things should be ordered in each vnode op:
- *
- * ZFS_ENTER(zfsvfs); // exit if unmounted
- * top:
- * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
- * rw_enter(...); // grab any other locks you need
- * tx = dmu_tx_create(...); // get DMU tx
- * dmu_tx_hold_*(); // hold each object you might modify
- * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign
- * if (error) {
- * rw_exit(...); // drop locks
- * zfs_dirent_unlock(dl); // unlock directory entry
- * VN_RELE(...); // release held vnodes
- * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- * dmu_tx_wait(tx);
- * dmu_tx_abort(tx);
- * goto top;
- * }
- * dmu_tx_abort(tx); // abort DMU tx
- * ZFS_EXIT(zfsvfs); // finished in zfs
- * return (error); // really out of space
- * }
- * error = do_real_work(); // do whatever this VOP does
- * if (error == 0)
- * zfs_log_*(...); // on success, make ZIL entry
- * dmu_tx_commit(tx); // commit DMU tx -- error or not
- * rw_exit(...); // drop locks
- * zfs_dirent_unlock(dl); // unlock directory entry
- * VN_RELE(...); // release held vnodes
- * zil_commit(zilog, seq, foid); // synchronous when necessary
- * ZFS_EXIT(zfsvfs); // finished in zfs
- * return (error); // done, report error
- */
-/* ARGSUSED */
-static int
-zfs_open(vnode_t **vpp, int flag, cred_t *cr)
-{
- znode_t *zp = VTOZ(*vpp);
-
- /* Keep a count of the synchronous opens in the znode */
- if (flag & (FSYNC | FDSYNC))
- atomic_inc_32(&zp->z_sync_cnt);
- return (0);
-}
-
-/* ARGSUSED */
-static int
-zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
-
- /* Decrement the synchronous opens in the znode */
- if (flag & (FSYNC | FDSYNC))
- atomic_dec_32(&zp->z_sync_cnt);
-
- /*
- * Clean up any locks held by this process on the vp.
- */
- cleanlocks(vp, ddi_get_pid(), 0);
- cleanshares(vp, ddi_get_pid());
-
- return (0);
-}
-
-/*
- * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
- * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
- */
-static int
-zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
-{
- znode_t *zp = VTOZ(vp);
- uint64_t noff = (uint64_t)*off; /* new offset */
- uint64_t file_sz;
- int error;
- boolean_t hole;
-
- file_sz = zp->z_phys->zp_size;
- if (noff >= file_sz) {
- return (ENXIO);
- }
-
- if (cmd == _FIO_SEEK_HOLE)
- hole = B_TRUE;
- else
- hole = B_FALSE;
-
- error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
-
- /* end of file? */
- if ((error == ESRCH) || (noff > file_sz)) {
- /*
- * Handle the virtual hole at the end of file.
- */
- if (hole) {
- *off = file_sz;
- return (0);
- }
- return (ENXIO);
- }
-
- if (noff < *off)
- return (error);
- *off = noff;
- return (error);
-}
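-
-/*
- * Editorial sketch (not part of the original file): enumerating the data
- * regions of a file by alternating the two seek modes above.  Real
- * callers arrive via the _FIO_SEEK_{DATA,HOLE} ioctls in zfs_ioctl()
- * below; error handling is elided and vp is assumed from the caller.
- */
-#if 0
-	offset_t off = 0, hole;
-
-	while (zfs_holey(vp, _FIO_SEEK_DATA, &off) == 0) {
-		hole = off;
-		if (zfs_holey(vp, _FIO_SEEK_HOLE, &hole) != 0)
-			break;
-		/* [off, hole) holds data */
-		off = hole;
-	}
-#endif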
-
-/* ARGSUSED */
-static int
-zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
- int *rvalp)
-{
- offset_t off;
- int error;
- zfsvfs_t *zfsvfs;
-
- switch (com) {
- case _FIOFFS:
- return (0);
-
- /*
-	 * The following two ioctls are used by bfu.  Faking them out
-	 * is necessary to avoid bfu errors.
- */
- case _FIOGDIO:
- case _FIOSDIO:
- return (0);
-
- case _FIO_SEEK_DATA:
- case _FIO_SEEK_HOLE:
- if (ddi_copyin((void *)data, &off, sizeof (off), flag))
- return (EFAULT);
-
- zfsvfs = VTOZ(vp)->z_zfsvfs;
- ZFS_ENTER(zfsvfs);
-
- /* offset parameter is in/out */
- error = zfs_holey(vp, com, &off);
- ZFS_EXIT(zfsvfs);
- if (error)
- return (error);
- if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
- return (EFAULT);
- return (0);
- }
- return (ENOTTY);
-}
-
-/*
- * When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages. What this means:
- *
- * On Write: If we find a memory mapped page, we write to *both*
- * the page and the dmu buffer.
- *
- * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
- * the file is memory mapped.
- */
-static int
-mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
-{
- znode_t *zp = VTOZ(vp);
- objset_t *os = zp->z_zfsvfs->z_os;
- vm_object_t obj;
- vm_page_t m;
- struct sf_buf *sf;
- int64_t start, off;
- int len = nbytes;
- int error = 0;
- uint64_t dirbytes;
-
- ASSERT(vp->v_mount != NULL);
- obj = vp->v_object;
- ASSERT(obj != NULL);
-
- start = uio->uio_loffset;
- off = start & PAGEOFFSET;
- dirbytes = 0;
- VM_OBJECT_LOCK(obj);
- for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
- uint64_t bytes = MIN(PAGESIZE - off, len);
- uint64_t fsize;
-
-again:
- if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
- vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
- uint64_t woff;
- caddr_t va;
-
- if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
- goto again;
- fsize = obj->un_pager.vnp.vnp_size;
- vm_page_busy(m);
- vm_page_lock_queues();
- vm_page_undirty(m);
- vm_page_unlock_queues();
- VM_OBJECT_UNLOCK(obj);
- if (dirbytes > 0) {
- error = dmu_write_uio(os, zp->z_id, uio,
- dirbytes, tx);
- dirbytes = 0;
- }
- if (error == 0) {
- sched_pin();
- sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
- va = (caddr_t)sf_buf_kva(sf);
- woff = uio->uio_loffset - off;
- error = uiomove(va + off, bytes, UIO_WRITE, uio);
- /*
- * The uiomove() above could have been partially
-				 * successful, which is why we call dmu_write()
- * below unconditionally. The page was marked
- * non-dirty above and we would lose the changes
- * without doing so. If the uiomove() failed
- * entirely, well, we just write what we got
- * before one more time.
- */
- dmu_write(os, zp->z_id, woff,
- MIN(PAGESIZE, fsize - woff), va, tx);
- sf_buf_free(sf);
- sched_unpin();
- }
- VM_OBJECT_LOCK(obj);
- vm_page_wakeup(m);
- } else {
- if (__predict_false(obj->cache != NULL)) {
- vm_page_cache_free(obj, OFF_TO_IDX(start),
- OFF_TO_IDX(start) + 1);
- }
- dirbytes += bytes;
- }
- len -= bytes;
- off = 0;
- if (error)
- break;
- }
- VM_OBJECT_UNLOCK(obj);
- if (error == 0 && dirbytes > 0)
- error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
- return (error);
-}
-
-/*
- * When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages. What this means:
- *
- * On Read: We "read" preferentially from memory mapped pages,
- *		else we fall back to the dmu buffer.
- *
- * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
- * the file is memory mapped.
- */
-static int
-mappedread(vnode_t *vp, int nbytes, uio_t *uio)
-{
- znode_t *zp = VTOZ(vp);
- objset_t *os = zp->z_zfsvfs->z_os;
- vm_object_t obj;
- vm_page_t m;
- struct sf_buf *sf;
- int64_t start, off;
- caddr_t va;
- int len = nbytes;
- int error = 0;
- uint64_t dirbytes;
-
- ASSERT(vp->v_mount != NULL);
- obj = vp->v_object;
- ASSERT(obj != NULL);
-
- start = uio->uio_loffset;
- off = start & PAGEOFFSET;
- dirbytes = 0;
- VM_OBJECT_LOCK(obj);
- for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
- uint64_t bytes = MIN(PAGESIZE - off, len);
-
-again:
- if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
- vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
- if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
- goto again;
- vm_page_busy(m);
- VM_OBJECT_UNLOCK(obj);
- if (dirbytes > 0) {
- error = dmu_read_uio(os, zp->z_id, uio,
- dirbytes);
- dirbytes = 0;
- }
- if (error == 0) {
- sched_pin();
- sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
- va = (caddr_t)sf_buf_kva(sf);
- error = uiomove(va + off, bytes, UIO_READ, uio);
- sf_buf_free(sf);
- sched_unpin();
- }
- VM_OBJECT_LOCK(obj);
- vm_page_wakeup(m);
- } else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
- /*
- * The code below is here to make sendfile(2) work
- * correctly with ZFS. As pointed out by ups@
- * sendfile(2) should be changed to use VOP_GETPAGES(),
-			 * but that would pessimize performance of sendfile/UFS,
-			 * which is why this special case is handled in ZFS code.
- */
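-			/*
-			 * Annotation: with UIO_NOCOPY, sendfile(2) only
-			 * wants the page made valid, so we dmu_read()
-			 * straight into the page's sf_buf mapping and
-			 * adjust uio_resid ourselves; uiomove() never runs
-			 * on this path.
-			 */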
- if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
- goto again;
- vm_page_busy(m);
- VM_OBJECT_UNLOCK(obj);
- if (dirbytes > 0) {
- error = dmu_read_uio(os, zp->z_id, uio,
- dirbytes);
- dirbytes = 0;
- }
- if (error == 0) {
- sched_pin();
- sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
- va = (caddr_t)sf_buf_kva(sf);
- error = dmu_read(os, zp->z_id, start + off,
- bytes, (void *)(va + off));
- sf_buf_free(sf);
- sched_unpin();
- }
- VM_OBJECT_LOCK(obj);
- vm_page_wakeup(m);
- if (error == 0)
- uio->uio_resid -= bytes;
- } else {
- dirbytes += bytes;
- }
- len -= bytes;
- off = 0;
- if (error)
- break;
- }
- VM_OBJECT_UNLOCK(obj);
- if (error == 0 && dirbytes > 0)
- error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
- return (error);
-}
-
-offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
-
-/*
- * Read bytes from specified file into supplied buffer.
- *
- * IN: vp - vnode of file to be read from.
- * uio - structure supplying read location, range info,
- * and return buffer.
- * ioflag - SYNC flags; used to provide FRSYNC semantics.
- * cr - credentials of caller.
- *
- * OUT: uio - updated offset and range, buffer filled.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Side Effects:
- * vp - atime updated if byte count > 0
- */
-/* ARGSUSED */
-static int
-zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- objset_t *os = zfsvfs->z_os;
- ssize_t n, nbytes;
- int error;
- rl_t *rl;
-
- ZFS_ENTER(zfsvfs);
-
- /*
- * Validate file offset
- */
- if (uio->uio_loffset < (offset_t)0) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * Fasttrack empty reads
- */
- if (uio->uio_resid == 0) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- /*
- * Check for mandatory locks
- */
- if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
- if (error = chklock(vp, FREAD,
- uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
- /*
- * If we're in FRSYNC mode, sync out this znode before reading it.
- */
- if (ioflag & FRSYNC)
- zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
-
- /*
- * Lock the range against changes.
- */
- rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
-
- /*
- * If we are reading past end-of-file we can skip
- * to the end; but we might still need to set atime.
- */
- if (uio->uio_loffset >= zp->z_phys->zp_size) {
- error = 0;
- goto out;
- }
-
- ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
- n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
-
- while (n > 0) {
- nbytes = MIN(n, zfs_read_chunk_size -
- P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
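-		/*
-		 * Annotation, worked example: with the default 1MB chunk
-		 * size and uio_loffset at 1536KB, P2PHASE() yields 512KB,
-		 * so nbytes is capped at 512KB and the next iteration
-		 * begins exactly on a chunk boundary.
-		 */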
-
- if (vn_has_cached_data(vp))
- error = mappedread(vp, nbytes, uio);
- else
- error = dmu_read_uio(os, zp->z_id, uio, nbytes);
- if (error)
- break;
-
- n -= nbytes;
- }
-
-out:
- zfs_range_unlock(rl);
-
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Fault in the pages of the first n bytes specified by the uio structure.
- * 1 byte in each page is touched and the uio struct is unmodified.
- * Any error will exit this routine, as this is only a best-effort
- * attempt to get the pages resident.  This is a copy of ufs_trans_touch().
- */
-static void
-zfs_prefault_write(ssize_t n, struct uio *uio)
-{
- struct iovec *iov;
- ulong_t cnt, incr;
- caddr_t p;
-
- if (uio->uio_segflg != UIO_USERSPACE)
- return;
-
- iov = uio->uio_iov;
-
- while (n) {
- cnt = MIN(iov->iov_len, n);
- if (cnt == 0) {
- /* empty iov entry */
- iov++;
- continue;
- }
- n -= cnt;
- /*
- * touch each page in this segment.
- */
- p = iov->iov_base;
- while (cnt) {
- if (fubyte(p) == -1)
- return;
- incr = MIN(cnt, PAGESIZE);
- p += incr;
- cnt -= incr;
- }
- /*
- * touch the last byte in case it straddles a page.
- */
- p--;
- if (fubyte(p) == -1)
- return;
- iov++;
- }
-}
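-
-/*
- * Annotation: fubyte() is a safe user-space byte fetch that returns -1
- * on fault rather than trapping, so reading one byte per PAGESIZE
- * stride is enough to fault each page in; the final p-- re-touches the
- * last byte in case it lies on a page the stride skipped.
- */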
-
-/*
- * Write the bytes to a file.
- *
- * IN: vp - vnode of file to be written to.
- * uio - structure supplying write location, range info,
- * and data buffer.
- * ioflag - IO_APPEND flag set if in append mode.
- * cr - credentials of caller.
- *
- * OUT: uio - updated offset and range.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * vp - ctime|mtime updated if byte count > 0
- */
-/* ARGSUSED */
-static int
-zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- rlim64_t limit = MAXOFFSET_T;
- ssize_t start_resid = uio->uio_resid;
- ssize_t tx_bytes;
- uint64_t end_size;
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- offset_t woff;
- ssize_t n, nbytes;
- rl_t *rl;
- int max_blksz = zfsvfs->z_max_blksz;
- int error;
-
- /*
- * Fasttrack empty write
- */
- n = start_resid;
- if (n == 0)
- return (0);
-
- if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
- limit = MAXOFFSET_T;
-
- ZFS_ENTER(zfsvfs);
-
- /*
-	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
- * don't hold up txg.
- */
- zfs_prefault_write(n, uio);
-
- /*
- * If in append mode, set the io offset pointer to eof.
- */
- if (ioflag & IO_APPEND) {
- /*
- * Range lock for a file append:
- * The value for the start of range will be determined by
- * zfs_range_lock() (to guarantee append semantics).
- * If this write will cause the block size to increase,
- * zfs_range_lock() will lock the entire file, so we must
- * later reduce the range after we grow the block size.
- */
- rl = zfs_range_lock(zp, 0, n, RL_APPEND);
- if (rl->r_len == UINT64_MAX) {
- /* overlocked, zp_size can't change */
- woff = uio->uio_loffset = zp->z_phys->zp_size;
- } else {
- woff = uio->uio_loffset = rl->r_off;
- }
- } else {
- woff = uio->uio_loffset;
- /*
- * Validate file offset
- */
- if (woff < 0) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * If we need to grow the block size then zfs_range_lock()
- * will lock a wider range than we request here.
- * Later after growing the block size we reduce the range.
- */
- rl = zfs_range_lock(zp, woff, n, RL_WRITER);
- }
-
- if (woff >= limit) {
- zfs_range_unlock(rl);
- ZFS_EXIT(zfsvfs);
- return (EFBIG);
- }
-
- if ((woff + n) > limit || woff > (limit - n))
- n = limit - woff;
-
- /*
- * Check for mandatory locks
- */
- if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
- (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
- zfs_range_unlock(rl);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- end_size = MAX(zp->z_phys->zp_size, woff + n);
-
- /*
- * Write the file in reasonable size chunks. Each chunk is written
- * in a separate transaction; this keeps the intent log records small
- * and allows us to do more fine-grained space accounting.
- */
- while (n > 0) {
- /*
- * Start a transaction.
- */
- woff = uio->uio_loffset;
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
- dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- if (error == ERESTART &&
- zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- continue;
- }
- dmu_tx_abort(tx);
- break;
- }
-
- /*
- * If zfs_range_lock() over-locked we grow the blocksize
- * and then reduce the lock range. This will only happen
- * on the first iteration since zfs_range_reduce() will
- * shrink down r_len to the appropriate size.
- */
- if (rl->r_len == UINT64_MAX) {
- uint64_t new_blksz;
-
- if (zp->z_blksz > max_blksz) {
- ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
- } else {
- new_blksz = MIN(end_size, max_blksz);
- }
- zfs_grow_blocksize(zp, new_blksz, tx);
- zfs_range_reduce(rl, woff, n);
- }
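-		/*
-		 * Annotation, worked example: a first 5KB write at offset
-		 * 0 (max_blksz 128KB) grows the single-block file's
-		 * blocksize to MIN(end_size, max_blksz) and then shrinks
-		 * the file-wide lock to just [woff, woff + n).
-		 */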
-
- /*
- * XXX - should we really limit each write to z_max_blksz?
- * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
- */
- nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
-
- if (woff + nbytes > zp->z_phys->zp_size)
- vnode_pager_setsize(vp, woff + nbytes);
-
- rw_enter(&zp->z_map_lock, RW_READER);
-
- tx_bytes = uio->uio_resid;
- if (vn_has_cached_data(vp)) {
- rw_exit(&zp->z_map_lock);
- error = mappedwrite(vp, nbytes, uio, tx);
- } else {
- error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
- uio, nbytes, tx);
- rw_exit(&zp->z_map_lock);
- }
- tx_bytes -= uio->uio_resid;
-
- /*
- * If we made no progress, we're done. If we made even
- * partial progress, update the znode and ZIL accordingly.
- */
- if (tx_bytes == 0) {
- dmu_tx_commit(tx);
- ASSERT(error != 0);
- break;
- }
-
- /*
- * Clear Set-UID/Set-GID bits on successful write if not
-		 * privileged and at least one of the execute bits is set.
-		 *
-		 * It would be nice to do this after all writes have
- * been done, but that would still expose the ISUID/ISGID
- * to another app after the partial write is committed.
- */
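-		/*
-		 * Annotation: (S_IXUSR | S_IXUSR >> 3 | S_IXUSR >> 6) is
-		 * simply 0111, i.e. any of the user, group, or other
-		 * execute bits.
-		 */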
- mutex_enter(&zp->z_acl_lock);
- if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
- (S_IXUSR >> 6))) != 0 &&
- (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
- secpolicy_vnode_setid_retain(cr,
- (zp->z_phys->zp_mode & S_ISUID) != 0 &&
- zp->z_phys->zp_uid == 0) != 0) {
- zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
- }
- mutex_exit(&zp->z_acl_lock);
-
- /*
- * Update time stamp. NOTE: This marks the bonus buffer as
- * dirty, so we don't have to do it again for zp_size.
- */
- zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
-
- /*
- * Update the file size (zp_size) if it has changed;
- * account for possible concurrent updates.
- */
- while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
- (void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
- uio->uio_loffset);
- zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
- dmu_tx_commit(tx);
-
- if (error != 0)
- break;
- ASSERT(tx_bytes == nbytes);
- n -= nbytes;
- }
-
- zfs_range_unlock(rl);
-
- /*
- * If we're in replay mode, or we made no progress, return error.
- * Otherwise, it's at least a partial write, so it's successful.
- */
- if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (ioflag & (FSYNC | FDSYNC))
- zil_commit(zilog, zp->z_last_itx, zp->z_id);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-void
-zfs_get_done(dmu_buf_t *db, void *vzgd)
-{
- zgd_t *zgd = (zgd_t *)vzgd;
- rl_t *rl = zgd->zgd_rl;
- vnode_t *vp = ZTOV(rl->r_zp);
- int vfslocked;
-
- vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
- dmu_buf_rele(db, vzgd);
- zfs_range_unlock(rl);
- VN_RELE(vp);
- zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
- kmem_free(zgd, sizeof (zgd_t));
- VFS_UNLOCK_GIANT(vfslocked);
-}
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
-{
- zfsvfs_t *zfsvfs = arg;
- objset_t *os = zfsvfs->z_os;
- znode_t *zp;
- uint64_t off = lr->lr_offset;
- dmu_buf_t *db;
- rl_t *rl;
- zgd_t *zgd;
- int dlen = lr->lr_length; /* length of user data */
- int error = 0;
-
- ASSERT(zio);
- ASSERT(dlen != 0);
-
- /*
- * Nothing to do if the file has been removed
- */
- if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
- return (ENOENT);
- if (zp->z_unlinked) {
- VN_RELE(ZTOV(zp));
- return (ENOENT);
- }
-
- /*
- * Write records come in two flavors: immediate and indirect.
- * For small writes it's cheaper to store the data with the
- * log record (immediate); for large writes it's cheaper to
- * sync the data and get a pointer to it (indirect) so that
- * we don't have to write the data twice.
- */
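-	/*
-	 * Annotation: buf != NULL means the ZIL handed us room to copy
-	 * the data into the log record itself (immediate); otherwise we
-	 * dmu_sync() the block and record only its block pointer
-	 * (indirect), at the cost of range-locking the whole block below.
-	 */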
- if (buf != NULL) { /* immediate write */
- rl = zfs_range_lock(zp, off, dlen, RL_READER);
- /* test for truncation needs to be done while range locked */
- if (off >= zp->z_phys->zp_size) {
- error = ENOENT;
- goto out;
- }
- VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
- } else { /* indirect write */
- uint64_t boff; /* block starting offset */
-
- /*
- * Have to lock the whole block to ensure when it's
-		 * written out and its checksum is being calculated
- * that no one can change the data. We need to re-check
- * blocksize after we get the lock in case it's changed!
- */
- for (;;) {
- if (ISP2(zp->z_blksz)) {
- boff = P2ALIGN_TYPED(off, zp->z_blksz,
- uint64_t);
- } else {
- boff = 0;
- }
- dlen = zp->z_blksz;
- rl = zfs_range_lock(zp, boff, dlen, RL_READER);
- if (zp->z_blksz == dlen)
- break;
- zfs_range_unlock(rl);
- }
- /* test for truncation needs to be done while range locked */
- if (off >= zp->z_phys->zp_size) {
- error = ENOENT;
- goto out;
- }
- zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_rl = rl;
- zgd->zgd_zilog = zfsvfs->z_log;
- zgd->zgd_bp = &lr->lr_blkptr;
- VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
- ASSERT(boff == db->db_offset);
- lr->lr_blkoff = off - boff;
- error = dmu_sync(zio, db, &lr->lr_blkptr,
- lr->lr_common.lrc_txg, zfs_get_done, zgd);
- ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz);
- if (error == 0) {
- zil_add_vdev(zfsvfs->z_log,
- DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
- }
- /*
- * If we get EINPROGRESS, then we need to wait for a
- * write IO initiated by dmu_sync() to complete before
- * we can release this dbuf. We will finish everything
- * up in the zfs_get_done() callback.
- */
- if (error == EINPROGRESS)
- return (0);
- dmu_buf_rele(db, zgd);
- kmem_free(zgd, sizeof (zgd_t));
- }
-out:
- zfs_range_unlock(rl);
- VN_RELE(ZTOV(zp));
- return (error);
-}
-
-/*ARGSUSED*/
-static int
-zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
-
- ZFS_ENTER(zfsvfs);
- error = zfs_zaccess_rwx(zp, mode, cr);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Lookup an entry in a directory, or an extended attribute directory.
- * If it exists, return a held vnode reference for it.
- *
- * IN: dvp - vnode of directory to search.
- * nm - name of entry to lookup.
- * pnp - full pathname to lookup [UNUSED].
- * flags - LOOKUP_XATTR set if looking for an attribute.
- * rdir - root directory vnode [UNUSED].
- * cr - credentials of caller.
- *
- * OUT: vpp - vnode of located entry, NULL if not found.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * NA
- */
-/* ARGSUSED */
-static int
-zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
- int nameiop, cred_t *cr, kthread_t *td)
-{
-
- znode_t *zdp = VTOZ(dvp);
- zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
- *vpp = NULL;
-
-#ifdef TODO
- if (flags & LOOKUP_XATTR) {
- /*
- * If the xattr property is off, refuse the lookup request.
- */
- if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
-		 * We don't allow recursive attributes.
- * Maybe someday we will.
- */
- if (zdp->z_phys->zp_flags & ZFS_XATTR) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Do we have permission to get into attribute directory?
- */
-
- if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) {
- VN_RELE(*vpp);
- }
-
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-#endif /* TODO */
-
- if (dvp->v_type != VDIR) {
- ZFS_EXIT(zfsvfs);
- return (ENOTDIR);
- }
-
- /*
- * Check accessibility of directory.
- */
-
- if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) {
-
- /*
- * Convert device special files
- */
- if (IS_DEVVP(*vpp)) {
- vnode_t *svp;
-
- svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
- VN_RELE(*vpp);
- if (svp == NULL)
- error = ENOSYS;
- else
- *vpp = svp;
- }
- }
-
- ZFS_EXIT(zfsvfs);
-
- /* Translate errors and add SAVENAME when needed. */
- if (cnp->cn_flags & ISLASTCN) {
- switch (nameiop) {
- case CREATE:
- case RENAME:
- if (error == ENOENT) {
- error = EJUSTRETURN;
- cnp->cn_flags |= SAVENAME;
- break;
- }
- /* FALLTHROUGH */
- case DELETE:
- if (error == 0)
- cnp->cn_flags |= SAVENAME;
- break;
- }
- }
- if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
- int ltype = 0;
-
- if (cnp->cn_flags & ISDOTDOT) {
- ltype = VOP_ISLOCKED(dvp);
- VOP_UNLOCK(dvp, 0);
- }
- error = vn_lock(*vpp, cnp->cn_lkflags);
- if (cnp->cn_flags & ISDOTDOT)
- vn_lock(dvp, ltype | LK_RETRY);
- if (error != 0) {
- VN_RELE(*vpp);
- *vpp = NULL;
- return (error);
- }
- }
-
-#ifdef FREEBSD_NAMECACHE
- /*
- * Insert name into cache (as non-existent) if appropriate.
- */
- if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
- cache_enter(dvp, *vpp, cnp);
- /*
- * Insert name into cache if appropriate.
- */
- if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
- if (!(cnp->cn_flags & ISLASTCN) ||
- (nameiop != DELETE && nameiop != RENAME)) {
- cache_enter(dvp, *vpp, cnp);
- }
- }
-#endif
-
- return (error);
-}
-
-/*
- * Attempt to create a new entry in a directory. If the entry
- * already exists, truncate the file if permissible, else return
- * an error. Return the vp of the created or trunc'd file.
- *
- * IN: dvp - vnode of directory to put new file entry in.
- * name - name of new file entry.
- * vap - attributes of new file.
- * excl - flag indicating exclusive or non-exclusive mode.
- * mode - mode to open file with.
- * cr - credentials of caller.
- * flag - large file flag [UNUSED].
- *
- * OUT: vpp - vnode of created or trunc'd entry.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * dvp - ctime|mtime updated if new entry created
- * vp - ctime|mtime always, atime if new
- */
-/* ARGSUSED */
-static int
-zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
- vnode_t **vpp, cred_t *cr)
-{
- znode_t *zp, *dzp = VTOZ(dvp);
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- objset_t *os = zfsvfs->z_os;
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- int error;
- uint64_t zoid;
-
- ZFS_ENTER(zfsvfs);
-
-top:
- *vpp = NULL;
-
- if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
- vap->va_mode &= ~VSVTX;
-
- if (*name == '\0') {
- /*
- * Null component name refers to the directory itself.
- */
- VN_HOLD(dvp);
- zp = dzp;
- dl = NULL;
- error = 0;
- } else {
- /* possible VN_HOLD(zp) */
- if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) {
- if (strcmp(name, "..") == 0)
- error = EISDIR;
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
- zoid = zp ? zp->z_id : -1ULL;
-
- if (zp == NULL) {
- /*
- * Create a new file object and update the directory
- * to reference it.
- */
- if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
- goto out;
- }
-
- /*
- * We only support the creation of regular files in
- * extended attribute directories.
- */
- if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
- (vap->va_type != VREG)) {
- error = EINVAL;
- goto out;
- }
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_bonus(tx, dzp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, SPA_MAXBLOCKSIZE);
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART &&
- zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
- ASSERT(zp->z_id == zoid);
- (void) zfs_link_create(dl, zp, tx, ZNEW);
- zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name);
- dmu_tx_commit(tx);
- } else {
- /*
- * A directory entry already exists for this name.
-		 *
- * Can't truncate an existing file if in exclusive mode.
- */
- if (excl == EXCL) {
- error = EEXIST;
- goto out;
- }
- /*
- * Can't open a directory for writing.
- */
- if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
- error = EISDIR;
- goto out;
- }
- /*
- * Verify requested access to file.
- */
- if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) {
- goto out;
- }
-
- mutex_enter(&dzp->z_lock);
- dzp->z_seq++;
- mutex_exit(&dzp->z_lock);
-
- /*
- * Truncate regular files if requested.
- */
- if ((ZTOV(zp)->v_type == VREG) &&
- (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
- error = zfs_freesp(zp, 0, 0, mode, TRUE);
- if (error == ERESTART &&
- zfsvfs->z_assign == TXG_NOWAIT) {
- /* NB: we already did dmu_tx_wait() */
- zfs_dirent_unlock(dl);
- VN_RELE(ZTOV(zp));
- goto top;
- }
- }
- }
-out:
-
- if (error == 0) {
- *vpp = ZTOV(zp);
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- }
-
- if (dl)
- zfs_dirent_unlock(dl);
-
- if (error) {
- if (zp)
- VN_RELE(ZTOV(zp));
- } else {
- *vpp = ZTOV(zp);
- /*
- * If vnode is for a device return a specfs vnode instead.
- */
- if (IS_DEVVP(*vpp)) {
- struct vnode *svp;
-
- svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
- VN_RELE(*vpp);
- if (svp == NULL) {
- error = ENOSYS;
- }
- *vpp = svp;
- }
- }
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Remove an entry from a directory.
- *
- * IN: dvp - vnode of directory to remove entry from.
- * name - name of entry to remove.
- * cr - credentials of caller.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * dvp - ctime|mtime
- * vp - ctime (if nlink > 0)
- */
-static int
-zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
-{
- znode_t *zp, *dzp = VTOZ(dvp);
- znode_t *xzp = NULL;
- vnode_t *vp;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- uint64_t acl_obj, xattr_obj;
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- boolean_t may_delete_now, delete_now = FALSE;
- boolean_t unlinked;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
-top:
- /*
- * Attempt to lock directory; fail if entry doesn't exist.
- */
- if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- vp = ZTOV(zp);
-
- if (error = zfs_zaccess_delete(dzp, zp, cr)) {
- goto out;
- }
-
- /*
- * Need to use rmdir for removing directories.
- */
- if (vp->v_type == VDIR) {
- error = EPERM;
- goto out;
- }
-
- vnevent_remove(vp);
-
- dnlc_remove(dvp, name);
-
- may_delete_now = FALSE;
-
- /*
- * We may delete the znode now, or we may put it in the unlinked set;
- * it depends on whether we're the last link, and on whether there are
- * other holds on the vnode. So we dmu_tx_hold() the right things to
- * allow for either case.
- */
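-	/*
-	 * Annotation: may_delete_now is forced to FALSE above, so in this
-	 * port the immediate-delete path below is effectively disabled
-	 * and removed znodes always pass through the unlinked set.
-	 */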
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
- dmu_tx_hold_bonus(tx, zp->z_id);
- if (may_delete_now)
- dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
-
- /* are there any extended attributes? */
- if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
- /* XXX - do we need this if we are deleting? */
- dmu_tx_hold_bonus(tx, xattr_obj);
- }
-
- /* are there any additional acls */
- if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
- may_delete_now)
- dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
-
- /* charge as an update -- would be nice not to charge at all */
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Remove the directory entry.
- */
- error = zfs_link_destroy(dl, zp, tx, 0, &unlinked);
-
- if (error) {
- dmu_tx_commit(tx);
- goto out;
- }
-
- if (0 && unlinked) {
- VI_LOCK(vp);
- delete_now = may_delete_now &&
- vp->v_count == 1 && !vn_has_cached_data(vp) &&
- zp->z_phys->zp_xattr == xattr_obj &&
- zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
- VI_UNLOCK(vp);
- }
-
- if (delete_now) {
- if (zp->z_phys->zp_xattr) {
- error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
- ASSERT3U(error, ==, 0);
- ASSERT3U(xzp->z_phys->zp_links, ==, 2);
- dmu_buf_will_dirty(xzp->z_dbuf, tx);
- mutex_enter(&xzp->z_lock);
- xzp->z_unlinked = 1;
- xzp->z_phys->zp_links = 0;
- mutex_exit(&xzp->z_lock);
- zfs_unlinked_add(xzp, tx);
- zp->z_phys->zp_xattr = 0; /* probably unnecessary */
- }
- mutex_enter(&zp->z_lock);
- VI_LOCK(vp);
- vp->v_count--;
- ASSERT3U(vp->v_count, ==, 0);
- VI_UNLOCK(vp);
- mutex_exit(&zp->z_lock);
- zfs_znode_delete(zp, tx);
- VFS_RELE(zfsvfs->z_vfs);
- } else if (unlinked) {
- zfs_unlinked_add(zp, tx);
- }
-
- zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name);
-
- dmu_tx_commit(tx);
-out:
- zfs_dirent_unlock(dl);
-
- if (!delete_now) {
- VN_RELE(vp);
- } else if (xzp) {
- /* this rele delayed to prevent nesting transactions */
- VN_RELE(ZTOV(xzp));
- }
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Create a new directory and insert it into dvp using the name
- * provided. Return a pointer to the inserted directory.
- *
- * IN: dvp - vnode of directory to add subdir to.
- * dirname - name of new directory.
- * vap - attributes of new directory.
- * cr - credentials of caller.
- *
- * OUT: vpp - vnode of created directory.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * dvp - ctime|mtime updated
- * vp - ctime|mtime|atime updated
- */
-static int
-zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
-{
- znode_t *zp, *dzp = VTOZ(dvp);
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- zfs_dirlock_t *dl;
- uint64_t zoid = 0;
- dmu_tx_t *tx;
- int error;
-
- ASSERT(vap->va_type == VDIR);
-
- ZFS_ENTER(zfsvfs);
-
- if (dzp->z_phys->zp_flags & ZFS_XATTR) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-top:
- *vpp = NULL;
-
- /*
- * First make sure the new directory doesn't exist.
- */
- if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) {
- zfs_dirent_unlock(dl);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Add a new entry to the directory.
- */
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
- if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, SPA_MAXBLOCKSIZE);
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Create new node.
- */
- zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
-
- /*
- * Now put new name in parent dir.
- */
- (void) zfs_link_create(dl, zp, tx, ZNEW);
-
- *vpp = ZTOV(zp);
-
- zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname);
- dmu_tx_commit(tx);
-
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
-
- zfs_dirent_unlock(dl);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Remove a directory subdir entry. If the current working
- * directory is the same as the subdir to be removed, the
- * remove will fail.
- *
- * IN: dvp - vnode of directory to remove from.
- * name - name of directory to be removed.
- * cwd - vnode of current working directory.
- * cr - credentials of caller.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * dvp - ctime|mtime updated
- */
-static int
-zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
-{
- znode_t *dzp = VTOZ(dvp);
- znode_t *zp;
- vnode_t *vp;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
-top:
- zp = NULL;
-
- /*
- * Attempt to lock directory; fail if entry doesn't exist.
- */
- if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- vp = ZTOV(zp);
-
- if (error = zfs_zaccess_delete(dzp, zp, cr)) {
- goto out;
- }
-
- if (vp->v_type != VDIR) {
- error = ENOTDIR;
- goto out;
- }
-
- if (vp == cwd) {
- error = EINVAL;
- goto out;
- }
-
- vnevent_rmdir(vp);
-
- /*
-	 * Grab a lock on the directory to make sure that no one is
- * trying to add (or lookup) entries while we are removing it.
- */
- rw_enter(&zp->z_name_lock, RW_WRITER);
-
- /*
- * Grab a lock on the parent pointer to make sure we play well
- * with the treewalk and directory rename code.
- */
- rw_enter(&zp->z_parent_lock, RW_WRITER);
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
- dmu_tx_hold_bonus(tx, zp->z_id);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- rw_exit(&zp->z_parent_lock);
- rw_exit(&zp->z_name_lock);
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
-#ifdef FREEBSD_NAMECACHE
- cache_purge(dvp);
-#endif
-
- error = zfs_link_destroy(dl, zp, tx, 0, NULL);
-
- if (error == 0)
- zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name);
-
- dmu_tx_commit(tx);
-
- rw_exit(&zp->z_parent_lock);
- rw_exit(&zp->z_name_lock);
-#ifdef FREEBSD_NAMECACHE
- cache_purge(vp);
-#endif
-out:
- zfs_dirent_unlock(dl);
-
- VN_RELE(vp);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Read as many directory entries as will fit into the provided
- * buffer from the given directory cursor position (specified in
- * the uio structure).
- *
- * IN: vp - vnode of directory to read.
- * uio - structure supplying read location, range info,
- * and return buffer.
- * cr - credentials of caller.
- *
- * OUT: uio - updated offset and range, buffer filled.
- * eofp - set to true if end-of-file detected.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * vp - atime updated
- *
- * Note that the low 4 bits of the cookie returned by zap are always zero.
- * This allows us to use the low range for "special" directory entries:
- * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
- * we use the offset 2 for the '.zfs' directory.
- */
-/* ARGSUSED */
-static int
-zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies,
-    u_long **cookies)
-{
- znode_t *zp = VTOZ(vp);
- iovec_t *iovp;
- dirent64_t *odp;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- objset_t *os;
- caddr_t outbuf;
- size_t bufsize;
- zap_cursor_t zc;
- zap_attribute_t zap;
- uint_t bytes_wanted;
- uint64_t offset; /* must be unsigned; checks for < 1 */
- int local_eof;
- int outcount;
- int error;
- uint8_t prefetch;
- uint8_t type;
- int ncooks;
- u_long *cooks = NULL;
-
- ZFS_ENTER(zfsvfs);
-
- /*
- * If we are not given an eof variable,
- * use a local one.
- */
- if (eofp == NULL)
- eofp = &local_eof;
-
- /*
- * Check for valid iov_len.
- */
- if (uio->uio_iov->iov_len <= 0) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * Quit if directory has been removed (posix)
- */
- if ((*eofp = zp->z_unlinked) != 0) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- error = 0;
- os = zfsvfs->z_os;
- offset = uio->uio_loffset;
- prefetch = zp->z_zn_prefetch;
-
- /*
- * Initialize the iterator cursor.
- */
- if (offset <= 3) {
- /*
- * Start iteration from the beginning of the directory.
- */
- zap_cursor_init(&zc, os, zp->z_id);
- } else {
- /*
- * The offset is a serialized cursor.
- */
- zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
- }
-
- /*
- * Get space to change directory entries into fs independent format.
- */
- iovp = uio->uio_iov;
- bytes_wanted = iovp->iov_len;
- if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
- bufsize = bytes_wanted;
- outbuf = kmem_alloc(bufsize, KM_SLEEP);
- odp = (struct dirent64 *)outbuf;
- } else {
- bufsize = bytes_wanted;
- odp = (struct dirent64 *)iovp->iov_base;
- }
-
- if (ncookies != NULL) {
- /*
-		 * Minimum entry size is the dirent header plus 1 byte of name.
- */
-		ncooks = uio->uio_resid / (sizeof(struct dirent) -
-		    sizeof(((struct dirent *)NULL)->d_name) + 1);
- cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
- *cookies = cooks;
- *ncookies = ncooks;
- }
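-	/*
-	 * Annotation: the cookie array is how FreeBSD/NFS receives one
-	 * resume offset per emitted entry; ncooks is sized for the worst
-	 * case of minimum-length names, and the unused tail is subtracted
-	 * from *ncookies once the loop finishes.
-	 */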
-
- /*
- * Transform to file-system independent format
- */
- outcount = 0;
- while (outcount < bytes_wanted) {
- ino64_t objnum;
- ushort_t reclen;
-
- /*
- * Special case `.', `..', and `.zfs'.
- */
- if (offset == 0) {
- (void) strcpy(zap.za_name, ".");
- objnum = zp->z_id;
- type = DT_DIR;
- } else if (offset == 1) {
- (void) strcpy(zap.za_name, "..");
- objnum = zp->z_phys->zp_parent;
- type = DT_DIR;
- } else if (offset == 2 && zfs_show_ctldir(zp)) {
- (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
- objnum = ZFSCTL_INO_ROOT;
- type = DT_DIR;
- } else {
- /*
- * Grab next entry.
- */
- if (error = zap_cursor_retrieve(&zc, &zap)) {
- if ((*eofp = (error == ENOENT)) != 0)
- break;
- else
- goto update;
- }
-
- if (zap.za_integer_length != 8 ||
- zap.za_num_integers != 1) {
- cmn_err(CE_WARN, "zap_readdir: bad directory "
- "entry, obj = %lld, offset = %lld\n",
- (u_longlong_t)zp->z_id,
- (u_longlong_t)offset);
- error = ENXIO;
- goto update;
- }
-
- objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
- /*
-			 * Mac OS X can extract the object type here, e.g.:
- * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
- */
- type = ZFS_DIRENT_TYPE(zap.za_first_integer);
- }
- reclen = DIRENT64_RECLEN(strlen(zap.za_name));
-
- /*
- * Will this entry fit in the buffer?
- */
- if (outcount + reclen > bufsize) {
- /*
- * Did we manage to fit anything in the buffer?
- */
- if (!outcount) {
- error = EINVAL;
- goto update;
- }
- break;
- }
- /*
- * Add this entry:
- */
- odp->d_ino = objnum;
- odp->d_reclen = reclen;
- odp->d_namlen = strlen(zap.za_name);
- (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
- odp->d_type = type;
- outcount += reclen;
- odp = (dirent64_t *)((intptr_t)odp + reclen);
-
- ASSERT(outcount <= bufsize);
-
- /* Prefetch znode */
- if (prefetch)
- dmu_prefetch(os, objnum, 0, 0);
-
- /*
- * Move to the next entry, fill in the previous offset.
- */
- if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
- zap_cursor_advance(&zc);
- offset = zap_cursor_serialize(&zc);
- } else {
- offset += 1;
- }
-
- if (cooks != NULL) {
- *cooks++ = offset;
- ncooks--;
- KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
- }
- }
- zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
-
- /* Subtract unused cookies */
- if (ncookies != NULL)
- *ncookies -= ncooks;
-
- if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
- iovp->iov_base += outcount;
- iovp->iov_len -= outcount;
- uio->uio_resid -= outcount;
- } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
- /*
- * Reset the pointer.
- */
- offset = uio->uio_loffset;
- }
-
-update:
- zap_cursor_fini(&zc);
- if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
- kmem_free(outbuf, bufsize);
-
- if (error == ENOENT)
- error = 0;
-
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
-
- uio->uio_loffset = offset;
- ZFS_EXIT(zfsvfs);
- if (error != 0 && cookies != NULL) {
- free(*cookies, M_TEMP);
- *cookies = NULL;
- *ncookies = 0;
- }
- return (error);
-}
-
-static int
-zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- ZFS_ENTER(zfsvfs);
- zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Get the requested file attributes and place them in the provided
- * vattr structure.
- *
- * IN: vp - vnode of file.
- * vap - va_mask identifies requested attributes.
- * flags - [UNUSED]
- * cr - credentials of caller.
- *
- * OUT: vap - attribute values.
- *
- * RETURN: 0 (always succeeds)
- */
-/* ARGSUSED */
-static int
-zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- znode_phys_t *pzp = zp->z_phys;
- uint32_t blksize;
- u_longlong_t nblocks;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
- /*
- * Return all attributes. It's cheaper to provide the answer
- * than to determine whether we were asked the question.
- */
- mutex_enter(&zp->z_lock);
-
- vap->va_type = IFTOVT(pzp->zp_mode);
- vap->va_mode = pzp->zp_mode & ~S_IFMT;
- vap->va_uid = zp->z_phys->zp_uid;
- vap->va_gid = zp->z_phys->zp_gid;
- vap->va_nodeid = zp->z_id;
- vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX); /* nlink_t limit! */
- vap->va_size = pzp->zp_size;
- vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
- vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
- vap->va_seq = zp->z_seq;
- vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
-
- ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
- ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
- ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
- ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
-
- /*
- * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
- * Also, if we are the owner don't bother, since owner should
- * always be allowed to read basic attributes of file.
- */
- if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) &&
- (zp->z_phys->zp_uid != crgetuid(cr))) {
- if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) {
- mutex_exit(&zp->z_lock);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
- mutex_exit(&zp->z_lock);
-
- dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
- vap->va_blksize = blksize;
- vap->va_bytes = nblocks << 9; /* nblocks * 512 */
-
- if (zp->z_blksz == 0) {
- /*
- * Block size hasn't been set; suggest maximal I/O transfers.
- */
- vap->va_blksize = zfsvfs->z_max_blksz;
- }
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Set the file attributes to the values contained in the
- * vattr structure.
- *
- * IN: vp - vnode of file to be modified.
- * vap - new attribute values.
- * flags - ATTR_UTIME set if non-default time values provided.
- * cr - credentials of caller.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * vp - ctime updated, mtime updated if size changed.
- */
-/* ARGSUSED */
-static int
-zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
- caller_context_t *ct)
-{
- struct znode *zp = VTOZ(vp);
- znode_phys_t *pzp = zp->z_phys;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- dmu_tx_t *tx;
- vattr_t oldva;
- uint_t mask = vap->va_mask;
- uint_t saved_mask;
- int trim_mask = 0;
- uint64_t new_mode;
- znode_t *attrzp;
- int need_policy = FALSE;
- int err;
-
- if (mask == 0)
- return (0);
-
- if (mask & AT_NOSET)
- return (EINVAL);
-
- if (mask & AT_SIZE && vp->v_type == VDIR)
- return (EISDIR);
-
- if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO)
- return (EINVAL);
-
- ZFS_ENTER(zfsvfs);
-
-top:
- attrzp = NULL;
-
- if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
- ZFS_EXIT(zfsvfs);
- return (EROFS);
- }
-
- /*
- * First validate permissions
- */
-
- if (mask & AT_SIZE) {
- err = zfs_zaccess(zp, ACE_WRITE_DATA, cr);
- if (err) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- /*
- * XXX - Note, we are not providing any open
- * mode flags here (like FNDELAY), so we may
- * block if there are locks present... this
- * should be addressed in openat().
- */
- do {
- err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
- /* NB: we already did dmu_tx_wait() if necessary */
- } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
- if (err) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- }
-
- if (mask & (AT_ATIME|AT_MTIME))
- need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr);
-
- if (mask & (AT_UID|AT_GID)) {
- int idmask = (mask & (AT_UID|AT_GID));
- int take_owner;
- int take_group;
-
- /*
- * NOTE: even if a new mode is being set,
- * we may clear S_ISUID/S_ISGID bits.
- */
-
- if (!(mask & AT_MODE))
- vap->va_mode = pzp->zp_mode;
-
- /*
- * Take ownership or chgrp to group we are a member of
- */
-
- take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
- take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr);
-
- /*
- * If both AT_UID and AT_GID are set then take_owner and
- * take_group must both be set in order to allow taking
- * ownership.
- *
- * Otherwise, send the check through secpolicy_vnode_setattr()
- *
- */
-
- if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
- ((idmask == AT_UID) && take_owner) ||
- ((idmask == AT_GID) && take_group)) {
- if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) {
- /*
- * Remove setuid/setgid for non-privileged users
- */
- secpolicy_setid_clear(vap, cr);
- trim_mask = (mask & (AT_UID|AT_GID));
- } else {
- need_policy = TRUE;
- }
- } else {
- need_policy = TRUE;
- }
- }
-
- mutex_enter(&zp->z_lock);
- oldva.va_mode = pzp->zp_mode;
- oldva.va_uid = zp->z_phys->zp_uid;
- oldva.va_gid = zp->z_phys->zp_gid;
- mutex_exit(&zp->z_lock);
-
- if (mask & AT_MODE) {
- if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) {
- err = secpolicy_setid_setsticky_clear(vp, vap,
- &oldva, cr);
- if (err) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- trim_mask |= AT_MODE;
- } else {
- need_policy = TRUE;
- }
- }
-
- if (need_policy) {
- /*
- * If trim_mask is set then take ownership
- * has been granted or write_acl is present and user
- * has the ability to modify mode. In that case remove
-		 * UID|GID and/or MODE from mask so that
- * secpolicy_vnode_setattr() doesn't revoke it.
- */
-
- if (trim_mask) {
- saved_mask = vap->va_mask;
- vap->va_mask &= ~trim_mask;
-
- }
- err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
- (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp);
- if (err) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
-
- if (trim_mask)
- vap->va_mask |= saved_mask;
- }
-
- /*
-	 * secpolicy_vnode_setattr() or the take-ownership path may have
-	 * changed va_mask
- */
- mask = vap->va_mask;
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
-
- if (mask & AT_MODE) {
- uint64_t pmode = pzp->zp_mode;
-
- new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
-
- if (zp->z_phys->zp_acl.z_acl_extern_obj)
- dmu_tx_hold_write(tx,
- pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE);
- else
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, ZFS_ACL_SIZE(MAX_ACL_SIZE));
- }
-
- if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) {
- err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp);
- if (err) {
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- dmu_tx_hold_bonus(tx, attrzp->z_id);
- }
-
- err = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (err) {
- if (attrzp)
- VN_RELE(ZTOV(attrzp));
- if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (err);
- }
-
- dmu_buf_will_dirty(zp->z_dbuf, tx);
-
- /*
- * Set each attribute requested.
- * We group settings according to the locks they need to acquire.
- *
- * Note: you cannot set ctime directly, although it will be
- * updated as a side-effect of calling this function.
- */
-
- mutex_enter(&zp->z_lock);
-
- if (mask & AT_MODE) {
- err = zfs_acl_chmod_setattr(zp, new_mode, tx);
- ASSERT3U(err, ==, 0);
- }
-
- if (attrzp)
- mutex_enter(&attrzp->z_lock);
-
- if (mask & AT_UID) {
- zp->z_phys->zp_uid = (uint64_t)vap->va_uid;
- if (attrzp) {
- attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid;
- }
- }
-
- if (mask & AT_GID) {
- zp->z_phys->zp_gid = (uint64_t)vap->va_gid;
- if (attrzp)
- attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid;
- }
-
- if (attrzp)
- mutex_exit(&attrzp->z_lock);
-
- if (mask & AT_ATIME)
- ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
-
- if (mask & AT_MTIME)
- ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
-
- if (mask & AT_SIZE)
- zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
- else if (mask != 0)
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
-
- if (mask != 0)
- zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask);
-
- mutex_exit(&zp->z_lock);
-
- if (attrzp)
- VN_RELE(ZTOV(attrzp));
-
- dmu_tx_commit(tx);
-
- ZFS_EXIT(zfsvfs);
- return (err);
-}
-
-typedef struct zfs_zlock {
- krwlock_t *zl_rwlock; /* lock we acquired */
- znode_t *zl_znode; /* znode we held */
- struct zfs_zlock *zl_next; /* next in list */
-} zfs_zlock_t;
-
-/*
- * Drop locks and release vnodes that were held by zfs_rename_lock().
- */
-static void
-zfs_rename_unlock(zfs_zlock_t **zlpp)
-{
- zfs_zlock_t *zl;
-
- while ((zl = *zlpp) != NULL) {
- if (zl->zl_znode != NULL)
- VN_RELE(ZTOV(zl->zl_znode));
- rw_exit(zl->zl_rwlock);
- *zlpp = zl->zl_next;
- kmem_free(zl, sizeof (*zl));
- }
-}
-
-/*
- * Search back through the directory tree, using the ".." entries.
- * Lock each directory in the chain to prevent concurrent renames.
- * Fail any attempt to move a directory into one of its own descendants.
- * XXX - z_parent_lock can overlap with map or grow locks
- */
-static int
-zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
-{
- zfs_zlock_t *zl;
- znode_t *zp = tdzp;
- uint64_t rootid = zp->z_zfsvfs->z_root;
- uint64_t *oidp = &zp->z_id;
- krwlock_t *rwlp = &szp->z_parent_lock;
- krw_t rw = RW_WRITER;
-
- /*
- * First pass write-locks szp and compares to zp->z_id.
- * Later passes read-lock zp and compare to zp->z_parent.
- */
- do {
- if (!rw_tryenter(rwlp, rw)) {
- /*
- * Another thread is renaming in this path.
- * Note that if we are a WRITER, we don't have any
- * parent_locks held yet.
- */
- if (rw == RW_READER && zp->z_id > szp->z_id) {
- /*
- * Drop our locks and restart
- */
- zfs_rename_unlock(&zl);
- *zlpp = NULL;
- zp = tdzp;
- oidp = &zp->z_id;
- rwlp = &szp->z_parent_lock;
- rw = RW_WRITER;
- continue;
- } else {
- /*
- * Wait for other thread to drop its locks
- */
- rw_enter(rwlp, rw);
- }
- }
-
- zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
- zl->zl_rwlock = rwlp;
- zl->zl_znode = NULL;
- zl->zl_next = *zlpp;
- *zlpp = zl;
-
- if (*oidp == szp->z_id) /* We're a descendant of szp */
- return (EINVAL);
-
- if (*oidp == rootid) /* We've hit the top */
- return (0);
-
- if (rw == RW_READER) { /* i.e. not the first pass */
- int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
- if (error)
- return (error);
- zl->zl_znode = zp;
- }
- oidp = &zp->z_phys->zp_parent;
- rwlp = &zp->z_parent_lock;
- rw = RW_READER;
-
- } while (zp->z_id != sdzp->z_id);
-
- return (0);
-}
-
-/*
- * Move an entry from the provided source directory to the target
- * directory. Change the entry name as indicated.
- *
- * IN: sdvp - Source directory containing the "old entry".
- * snm - Old entry name.
- * tdvp - Target directory to contain the "new entry".
- * tnm - New entry name.
- * cr - credentials of caller.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * sdvp,tdvp - ctime|mtime updated
- */
-static int
-zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
-{
- znode_t *tdzp, *szp, *tzp;
- znode_t *sdzp = VTOZ(sdvp);
- zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- vnode_t *realvp;
- zfs_dirlock_t *sdl, *tdl;
- dmu_tx_t *tx;
- zfs_zlock_t *zl;
- int cmp, serr, terr, error;
-
- ZFS_ENTER(zfsvfs);
-
- /*
- * Make sure we have the real vp for the target directory.
- */
- if (VOP_REALVP(tdvp, &realvp) == 0)
- tdvp = realvp;
-
- if (tdvp->v_vfsp != sdvp->v_vfsp) {
- ZFS_EXIT(zfsvfs);
- return (EXDEV);
- }
-
- tdzp = VTOZ(tdvp);
-top:
- szp = NULL;
- tzp = NULL;
- zl = NULL;
-
- /*
- * This is to prevent the creation of links into attribute space
-	 * by renaming a linked file into/out of an attribute directory.
- * See the comment in zfs_link() for why this is considered bad.
- */
- if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
- (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * Lock source and target directory entries. To prevent deadlock,
- * a lock ordering must be defined. We lock the directory with
- * the smallest object id first, or if it's a tie, the one with
- * the lexically first name.
- */
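-	/*
-	 * Annotation: imposing a total order on (object id, then name)
-	 * means two concurrent renames over the same pair of directories
-	 * take the dirent locks in the same order, so they cannot
-	 * deadlock against each other.
-	 */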
- if (sdzp->z_id < tdzp->z_id) {
- cmp = -1;
- } else if (sdzp->z_id > tdzp->z_id) {
- cmp = 1;
- } else {
- cmp = strcmp(snm, tnm);
- if (cmp == 0) {
- /*
- * POSIX: "If the old argument and the new argument
- * both refer to links to the same existing file,
- * the rename() function shall return successfully
- * and perform no other action."
- */
- ZFS_EXIT(zfsvfs);
- return (0);
- }
- }
- if (cmp < 0) {
- serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
- terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
- } else {
- terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
- serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
- }
-
- if (serr) {
- /*
- * Source entry invalid or not there.
- */
- if (!terr) {
- zfs_dirent_unlock(tdl);
- if (tzp)
- VN_RELE(ZTOV(tzp));
- }
- if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
- serr = EINVAL;
- ZFS_EXIT(zfsvfs);
- return (serr);
- }
- if (terr) {
- zfs_dirent_unlock(sdl);
- VN_RELE(ZTOV(szp));
- if (strcmp(tnm, "..") == 0)
- terr = EINVAL;
- ZFS_EXIT(zfsvfs);
- return (terr);
- }
-
- /*
- * Must have write access at the source to remove the old entry
- * and write access at the target to create the new entry.
- * Note that if target and source are the same, this can be
- * done in a single check.
- */
-
- if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
- goto out;
-
- if (ZTOV(szp)->v_type == VDIR) {
- /*
- * Check to make sure rename is valid.
- * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
- */
- if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
- goto out;
- }
-
- /*
- * Does target exist?
- */
- if (tzp) {
- /*
- * Source and target must be the same type.
- */
- if (ZTOV(szp)->v_type == VDIR) {
- if (ZTOV(tzp)->v_type != VDIR) {
- error = ENOTDIR;
- goto out;
- }
- } else {
- if (ZTOV(tzp)->v_type == VDIR) {
- error = EISDIR;
- goto out;
- }
- }
- /*
- * POSIX dictates that when the source and target
- * entries refer to the same file object, rename
- * must do nothing and exit without error.
- */
- if (szp->z_id == tzp->z_id) {
- error = 0;
- goto out;
- }
- }
-
- vnevent_rename_src(ZTOV(szp));
- if (tzp)
- vnevent_rename_dest(ZTOV(tzp));
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
- dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
- dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
- dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
- if (sdzp != tdzp)
- dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
- if (tzp)
- dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- if (zl != NULL)
- zfs_rename_unlock(&zl);
- zfs_dirent_unlock(sdl);
- zfs_dirent_unlock(tdl);
- VN_RELE(ZTOV(szp));
- if (tzp)
- VN_RELE(ZTOV(tzp));
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (tzp) /* Attempt to remove the existing target */
- error = zfs_link_destroy(tdl, tzp, tx, 0, NULL);
-
- if (error == 0) {
- error = zfs_link_create(tdl, szp, tx, ZRENAMING);
- if (error == 0) {
- error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
- ASSERT(error == 0);
- zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
- sdl->dl_name, tdzp, tdl->dl_name, szp);
- }
-#ifdef FREEBSD_NAMECACHE
- if (error == 0) {
- cache_purge(sdvp);
- cache_purge(tdvp);
- }
-#endif
- }
-
- dmu_tx_commit(tx);
-out:
- if (zl != NULL)
- zfs_rename_unlock(&zl);
-
- zfs_dirent_unlock(sdl);
- zfs_dirent_unlock(tdl);
-
- VN_RELE(ZTOV(szp));
- if (tzp)
- VN_RELE(ZTOV(tzp));
-
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/*
- * Insert the indicated symbolic reference entry into the directory.
- *
- * IN: dvp - Directory to contain new symbolic link.
- * link - Name for new symlink entry.
- * vap - Attributes of new entry.
- * target - Target path of new symlink.
- * cr - credentials of caller.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * dvp - ctime|mtime updated
- */
-static int
-zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
-    cred_t *cr, kthread_t *td)
-{
- znode_t *zp, *dzp = VTOZ(dvp);
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- uint64_t zoid;
- int len = strlen(link);
- int error;
-
- ASSERT(vap->va_type == VLNK);
-
- ZFS_ENTER(zfsvfs);
-top:
- if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (len > MAXPATHLEN) {
- ZFS_EXIT(zfsvfs);
- return (ENAMETOOLONG);
- }
-
- /*
- * Attempt to lock directory; fail if entry already exists.
- */
- if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
- dmu_tx_hold_bonus(tx, dzp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
-
- /*
- * Create a new object for the symlink.
- * Put the link content into bonus buffer if it will fit;
- * otherwise, store it just like any other file data.
- */
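-	/*
-	 * Annotation: the target fits in the bonus buffer when
-	 * sizeof (znode_phys_t) + len <= dmu_bonus_max(); such short
-	 * symlinks cost no extra data block, and zfs_readlink() applies
-	 * the same size test to decide where to fetch the target from.
-	 */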
- zoid = 0;
- if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
- zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len);
- if (len != 0)
- bcopy(link, zp->z_phys + 1, len);
- } else {
- dmu_buf_t *dbp;
-
- zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
-
- /*
- * Nothing can access the znode yet so no locking needed
- * for growing the znode's blocksize.
- */
- zfs_grow_blocksize(zp, len, tx);
-
- VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp));
- dmu_buf_will_dirty(dbp, tx);
-
- ASSERT3U(len, <=, dbp->db_size);
- bcopy(link, dbp->db_data, len);
- dmu_buf_rele(dbp, FTAG);
- }
- zp->z_phys->zp_size = len;
-
- /*
- * Insert the new object into the directory.
- */
- (void) zfs_link_create(dl, zp, tx, ZNEW);
-out:
- if (error == 0) {
- zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link);
- *vpp = ZTOV(zp);
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- }
-
- dmu_tx_commit(tx);
-
- zfs_dirent_unlock(dl);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Return, in the buffer contained in the provided uio structure,
- * the symbolic path referred to by vp.
- *
- * IN: vp - vnode of symbolic link.
- *	uio - structure to contain the link path.
- * cr - credentials of caller.
- *
- * OUT: uio - structure to contain the link path.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * vp - atime updated
- */
-/* ARGSUSED */
-static int
-zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- size_t bufsz;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
- bufsz = (size_t)zp->z_phys->zp_size;
- if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
- error = uiomove(zp->z_phys + 1,
- MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
- } else {
- dmu_buf_t *dbp;
- error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
- if (error) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- error = uiomove(dbp->db_data,
- MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
- dmu_buf_rele(dbp, FTAG);
- }
-
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Insert a new entry into directory tdvp referencing svp.
- *
- * IN: tdvp - Directory to contain new entry.
- * svp - vnode of new entry.
- * name - name of new entry.
- * cr - credentials of caller.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * tdvp - ctime|mtime updated
- * svp - ctime updated
- */
-/* ARGSUSED */
-static int
-zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
-{
- znode_t *dzp = VTOZ(tdvp);
- znode_t *tzp, *szp;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- vnode_t *realvp;
- int error;
-
- ASSERT(tdvp->v_type == VDIR);
-
- ZFS_ENTER(zfsvfs);
-
- if (VOP_REALVP(svp, &realvp) == 0)
- svp = realvp;
-
- if (svp->v_vfsp != tdvp->v_vfsp) {
- ZFS_EXIT(zfsvfs);
- return (EXDEV);
- }
-
- szp = VTOZ(svp);
-top:
- /*
- * We do not support links between attributes and non-attributes
- * because of the potential security risk of creating links
- * into "normal" file space in order to circumvent restrictions
- * imposed in attribute space.
- */
- if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
- (dzp->z_phys->zp_flags & ZFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * POSIX dictates that we return EPERM here.
- * Better choices include ENOTSUP or EISDIR.
- */
- if (svp->v_type == VDIR) {
- ZFS_EXIT(zfsvfs);
- return (EPERM);
- }
-
- if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) &&
- secpolicy_basic_link(cr) != 0) {
- ZFS_EXIT(zfsvfs);
- return (EPERM);
- }
-
- if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Attempt to lock directory; fail if entry already exists.
- */
- if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, szp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- error = zfs_link_create(dl, szp, tx, 0);
-
- if (error == 0)
- zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name);
-
- dmu_tx_commit(tx);
-
- zfs_dirent_unlock(dl);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-void
-zfs_inactive(vnode_t *vp, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
-
- rw_enter(&zfsvfs->z_um_lock, RW_READER);
- if (zfsvfs->z_unmounted2) {
- ASSERT(zp->z_dbuf_held == 0);
-
- mutex_enter(&zp->z_lock);
- VI_LOCK(vp);
- vp->v_count = 0; /* count arrives as 1 */
- VI_UNLOCK(vp);
- if (zp->z_dbuf == NULL) {
- mutex_exit(&zp->z_lock);
- zfs_znode_free(zp);
- } else {
- mutex_exit(&zp->z_lock);
- }
- rw_exit(&zfsvfs->z_um_lock);
- VFS_RELE(zfsvfs->z_vfs);
- return;
- }
-
- if (zp->z_atime_dirty && zp->z_unlinked == 0) {
- dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
-
- dmu_tx_hold_bonus(tx, zp->z_id);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
- mutex_enter(&zp->z_lock);
- zp->z_atime_dirty = 0;
- mutex_exit(&zp->z_lock);
- dmu_tx_commit(tx);
- }
- }
-
- zfs_zinactive(zp);
- rw_exit(&zfsvfs->z_um_lock);
-}
-
-CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
-CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
-
-static int
-zfs_fid(vnode_t *vp, fid_t *fidp)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint32_t gen = (uint32_t)zp->z_phys->zp_gen;
- uint64_t object = zp->z_id;
- zfid_short_t *zfid;
- int size, i;
-
- ZFS_ENTER(zfsvfs);
-
- size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
- fidp->fid_len = size;
-
- zfid = (zfid_short_t *)fidp;
-
- zfid->zf_len = size;
-
- for (i = 0; i < sizeof (zfid->zf_object); i++)
- zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
-
- /* Must have a non-zero generation number to distinguish from .zfs */
- if (gen == 0)
- gen = 1;
- for (i = 0; i < sizeof (zfid->zf_gen); i++)
- zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
-
- if (size == LONG_FID_LEN) {
- uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
- zfid_long_t *zlfid;
-
- zlfid = (zfid_long_t *)fidp;
-
- for (i = 0; i < sizeof (zlfid->zf_setid); i++)
- zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
-
- /* XXX - this should be the generation number for the objset */
- for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
- zlfid->zf_setgen[i] = 0;
- }
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
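
The byte loops in zfs_fid() above serialize the object number and generation least-significant byte first, so the fid is endian-independent. A standalone sketch of the same encoding, with the array width assumed from zfid_short_t:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t object = 0x1234;	/* example object number */
	uint8_t zf_object[6];		/* width assumed from zfid_short_t */
	int i;

	/* Least-significant byte first, exactly as in zfs_fid(). */
	for (i = 0; i < (int)sizeof (zf_object); i++)
		zf_object[i] = (uint8_t)(object >> (8 * i));

	for (i = 0; i < (int)sizeof (zf_object); i++)
		printf("zf_object[%d] = 0x%02x\n", i, zf_object[i]);
	return (0);
}
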
-
-static int
-zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
-{
- znode_t *zp, *xzp;
- zfsvfs_t *zfsvfs;
- zfs_dirlock_t *dl;
- int error;
-
- switch (cmd) {
- case _PC_LINK_MAX:
- *valp = INT_MAX;
- return (0);
-
- case _PC_FILESIZEBITS:
- *valp = 64;
- return (0);
-
-#if 0
- case _PC_XATTR_EXISTS:
- zp = VTOZ(vp);
- zfsvfs = zp->z_zfsvfs;
- ZFS_ENTER(zfsvfs);
- *valp = 0;
- error = zfs_dirent_lock(&dl, zp, "", &xzp,
- ZXATTR | ZEXISTS | ZSHARED);
- if (error == 0) {
- zfs_dirent_unlock(dl);
- if (!zfs_dirempty(xzp))
- *valp = 1;
- VN_RELE(ZTOV(xzp));
- } else if (error == ENOENT) {
- /*
- * If there aren't extended attributes, it's the
- * same as having zero of them.
- */
- error = 0;
- }
- ZFS_EXIT(zfsvfs);
- return (error);
-#endif
-
- case _PC_ACL_EXTENDED:
- *valp = 0; /* TODO */
- return (0);
-
- case _PC_MIN_HOLE_SIZE:
- *valp = (int)SPA_MINBLOCKSIZE;
- return (0);
-
- default:
- return (EOPNOTSUPP);
- }
-}
-
-#ifdef TODO
-/*ARGSUSED*/
-static int
-zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
-
- ZFS_ENTER(zfsvfs);
- error = zfs_getacl(zp, vsecp, cr);
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-#endif /* TODO */
-
-#ifdef TODO
-/*ARGSUSED*/
-static int
-zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
-
- ZFS_ENTER(zfsvfs);
- error = zfs_setacl(zp, vsecp, cr);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-#endif /* TODO */
-
-static int
-zfs_freebsd_open(ap)
- struct vop_open_args /* {
- struct vnode *a_vp;
- int a_mode;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- znode_t *zp = VTOZ(vp);
- int error;
-
- error = zfs_open(&vp, ap->a_mode, ap->a_cred);
- if (error == 0)
- vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
- return (error);
-}
-
-static int
-zfs_freebsd_close(ap)
- struct vop_close_args /* {
- struct vnode *a_vp;
- int a_fflag;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
-
- return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred));
-}
-
-static int
-zfs_freebsd_ioctl(ap)
- struct vop_ioctl_args /* {
- struct vnode *a_vp;
- u_long a_command;
- caddr_t a_data;
- int a_fflag;
- struct ucred *cred;
- struct thread *td;
- } */ *ap;
-{
-
- return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
- ap->a_fflag, ap->a_cred, NULL));
-}
-
-static int
-zfs_freebsd_read(ap)
- struct vop_read_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- int a_ioflag;
- struct ucred *a_cred;
- } */ *ap;
-{
-
- return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
-}
-
-static int
-zfs_freebsd_write(ap)
- struct vop_write_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- int a_ioflag;
- struct ucred *a_cred;
- } */ *ap;
-{
-
- return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
-}
-
-static int
-zfs_freebsd_access(ap)
- struct vop_access_args /* {
- struct vnode *a_vp;
- int a_mode;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
-
- return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred));
-}
-
-static int
-zfs_freebsd_lookup(ap)
- struct vop_lookup_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- struct componentname *cnp = ap->a_cnp;
- char nm[NAME_MAX + 1];
-
- ASSERT(cnp->cn_namelen < sizeof(nm));
- strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
-
- return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
- cnp->cn_cred, cnp->cn_thread));
-}
-
-static int
-zfs_freebsd_create(ap)
- struct vop_create_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- struct vattr *a_vap;
- } */ *ap;
-{
- struct componentname *cnp = ap->a_cnp;
- vattr_t *vap = ap->a_vap;
- int mode;
-
- ASSERT(cnp->cn_flags & SAVENAME);
-
- vattr_init_mask(vap);
- mode = vap->va_mode & ALLPERMS;
-
- return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
- ap->a_vpp, cnp->cn_cred));
-}
-
-static int
-zfs_freebsd_remove(ap)
- struct vop_remove_args /* {
- struct vnode *a_dvp;
- struct vnode *a_vp;
- struct componentname *a_cnp;
- } */ *ap;
-{
-
- ASSERT(ap->a_cnp->cn_flags & SAVENAME);
-
- return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
- ap->a_cnp->cn_cred));
-}
-
-static int
-zfs_freebsd_mkdir(ap)
- struct vop_mkdir_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- struct vattr *a_vap;
- } */ *ap;
-{
- vattr_t *vap = ap->a_vap;
-
- ASSERT(ap->a_cnp->cn_flags & SAVENAME);
-
- vattr_init_mask(vap);
-
- return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
- ap->a_cnp->cn_cred));
-}
-
-static int
-zfs_freebsd_rmdir(ap)
- struct vop_rmdir_args /* {
- struct vnode *a_dvp;
- struct vnode *a_vp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- struct componentname *cnp = ap->a_cnp;
-
- ASSERT(cnp->cn_flags & SAVENAME);
-
- return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred));
-}
-
-static int
-zfs_freebsd_readdir(ap)
- struct vop_readdir_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- struct ucred *a_cred;
- int *a_eofflag;
- int *a_ncookies;
- u_long **a_cookies;
- } */ *ap;
-{
-
- return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
- ap->a_ncookies, ap->a_cookies));
-}
-
-static int
-zfs_freebsd_fsync(ap)
- struct vop_fsync_args /* {
- struct vnode *a_vp;
- int a_waitfor;
- struct thread *a_td;
- } */ *ap;
-{
-
- vop_stdfsync(ap);
- return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred));
-}
-
-static int
-zfs_freebsd_getattr(ap)
- struct vop_getattr_args /* {
- struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
-
- return (zfs_getattr(ap->a_vp, ap->a_vap, 0, ap->a_cred));
-}
-
-static int
-zfs_freebsd_setattr(ap)
- struct vop_setattr_args /* {
- struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- vattr_t *vap = ap->a_vap;
-
- /* No support for FreeBSD's chflags(2). */
- if (vap->va_flags != VNOVAL)
- return (EOPNOTSUPP);
-
- vattr_init_mask(vap);
- vap->va_mask &= ~AT_NOSET;
-
- return (zfs_setattr(ap->a_vp, vap, 0, ap->a_cred, NULL));
-}
-
-static int
-zfs_freebsd_rename(ap)
- struct vop_rename_args /* {
- struct vnode *a_fdvp;
- struct vnode *a_fvp;
- struct componentname *a_fcnp;
- struct vnode *a_tdvp;
- struct vnode *a_tvp;
- struct componentname *a_tcnp;
- } */ *ap;
-{
- vnode_t *fdvp = ap->a_fdvp;
- vnode_t *fvp = ap->a_fvp;
- vnode_t *tdvp = ap->a_tdvp;
- vnode_t *tvp = ap->a_tvp;
- int error;
-
- ASSERT(ap->a_fcnp->cn_flags & SAVENAME);
- ASSERT(ap->a_tcnp->cn_flags & SAVENAME);
-
- error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
- ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred);
-
- if (tdvp == tvp)
- VN_RELE(tdvp);
- else
- VN_URELE(tdvp);
- if (tvp)
- VN_URELE(tvp);
- VN_RELE(fdvp);
- VN_RELE(fvp);
-
- return (error);
-}
-
-static int
-zfs_freebsd_symlink(ap)
- struct vop_symlink_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- struct vattr *a_vap;
- char *a_target;
- } */ *ap;
-{
- struct componentname *cnp = ap->a_cnp;
- vattr_t *vap = ap->a_vap;
-
- ASSERT(cnp->cn_flags & SAVENAME);
-
- vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */
- vattr_init_mask(vap);
-
- return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
- ap->a_target, cnp->cn_cred, cnp->cn_thread));
-}
-
-static int
-zfs_freebsd_readlink(ap)
- struct vop_readlink_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- struct ucred *a_cred;
- } */ *ap;
-{
-
- return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred));
-}
-
-static int
-zfs_freebsd_link(ap)
- struct vop_link_args /* {
- struct vnode *a_tdvp;
- struct vnode *a_vp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- struct componentname *cnp = ap->a_cnp;
-
- ASSERT(cnp->cn_flags & SAVENAME);
-
- return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
-}
-
-static int
-zfs_freebsd_inactive(ap)
- struct vop_inactive_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
-
- zfs_inactive(vp, ap->a_td->td_ucred);
- return (0);
-}
-
-static int
-zfs_freebsd_reclaim(ap)
- struct vop_reclaim_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs;
- int rele = 1;
-
- ASSERT(zp != NULL);
-
- /*
- * Destroy the vm object and flush associated pages.
- */
- vnode_destroy_vobject(vp);
-
- mutex_enter(&zp->z_lock);
- ASSERT(zp->z_phys);
- ASSERT(zp->z_dbuf_held);
- zfsvfs = zp->z_zfsvfs;
- if (!zp->z_unlinked) {
- zp->z_dbuf_held = 0;
- ZTOV(zp) = NULL;
- mutex_exit(&zp->z_lock);
- dmu_buf_rele(zp->z_dbuf, NULL);
- } else {
- mutex_exit(&zp->z_lock);
- }
- VI_LOCK(vp);
- if (vp->v_count > 0)
- rele = 0;
- vp->v_data = NULL;
- ASSERT(vp->v_holdcnt >= 1);
- VI_UNLOCK(vp);
- if (!zp->z_unlinked && rele)
- VFS_RELE(zfsvfs->z_vfs);
- return (0);
-}
-
-static int
-zfs_freebsd_fid(ap)
- struct vop_fid_args /* {
- struct vnode *a_vp;
- struct fid *a_fid;
- } */ *ap;
-{
-
- return (zfs_fid(ap->a_vp, (void *)ap->a_fid));
-}
-
-static int
-zfs_freebsd_pathconf(ap)
- struct vop_pathconf_args /* {
- struct vnode *a_vp;
- int a_name;
- register_t *a_retval;
- } */ *ap;
-{
- ulong_t val;
- int error;
-
- error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred);
- if (error == 0)
- *ap->a_retval = val;
- else if (error == EOPNOTSUPP)
- error = vop_stdpathconf(ap);
- return (error);
-}
-
-/*
- * Advisory record locking support
- */
-static int
-zfs_freebsd_advlock(ap)
- struct vop_advlock_args /* {
- struct vnode *a_vp;
- caddr_t a_id;
- int a_op;
- struct flock *a_fl;
- int a_flags;
- } */ *ap;
-{
- znode_t *zp = VTOZ(ap->a_vp);
-
- return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size));
-}
-
-/*
- * Advisory record locking support
- */
-static int
-zfs_freebsd_advlockasync(ap)
- struct vop_advlockasync_args /* {
- struct vnode *a_vp;
- caddr_t a_id;
- int a_op;
- struct flock *a_fl;
- int a_flags;
- struct task *a_task;
- } */ *ap;
-{
- znode_t *zp = VTOZ(ap->a_vp);
-
- return (lf_advlockasync(ap, &(zp->z_lockf), zp->z_phys->zp_size));
-}
-
-struct vop_vector zfs_vnodeops;
-struct vop_vector zfs_fifoops;
-
-struct vop_vector zfs_vnodeops = {
- .vop_default = &default_vnodeops,
- .vop_inactive = zfs_freebsd_inactive,
- .vop_reclaim = zfs_freebsd_reclaim,
- .vop_access = zfs_freebsd_access,
-#ifdef FREEBSD_NAMECACHE
- .vop_lookup = vfs_cache_lookup,
- .vop_cachedlookup = zfs_freebsd_lookup,
-#else
- .vop_lookup = zfs_freebsd_lookup,
-#endif
- .vop_getattr = zfs_freebsd_getattr,
- .vop_setattr = zfs_freebsd_setattr,
- .vop_create = zfs_freebsd_create,
- .vop_mknod = zfs_freebsd_create,
- .vop_mkdir = zfs_freebsd_mkdir,
- .vop_readdir = zfs_freebsd_readdir,
- .vop_fsync = zfs_freebsd_fsync,
- .vop_open = zfs_freebsd_open,
- .vop_close = zfs_freebsd_close,
- .vop_rmdir = zfs_freebsd_rmdir,
- .vop_ioctl = zfs_freebsd_ioctl,
- .vop_link = zfs_freebsd_link,
- .vop_symlink = zfs_freebsd_symlink,
- .vop_readlink = zfs_freebsd_readlink,
- .vop_read = zfs_freebsd_read,
- .vop_write = zfs_freebsd_write,
- .vop_remove = zfs_freebsd_remove,
- .vop_rename = zfs_freebsd_rename,
- .vop_advlock = zfs_freebsd_advlock,
- .vop_advlockasync = zfs_freebsd_advlockasync,
- .vop_pathconf = zfs_freebsd_pathconf,
- .vop_bmap = VOP_EOPNOTSUPP,
- .vop_fid = zfs_freebsd_fid,
-};
-
-struct vop_vector zfs_fifoops = {
- .vop_default = &fifo_specops,
- .vop_fsync = VOP_PANIC,
- .vop_access = zfs_freebsd_access,
- .vop_getattr = zfs_freebsd_getattr,
- .vop_inactive = zfs_freebsd_inactive,
- .vop_read = VOP_PANIC,
- .vop_reclaim = zfs_freebsd_reclaim,
- .vop_setattr = zfs_freebsd_setattr,
- .vop_write = VOP_PANIC,
- .vop_fid = zfs_freebsd_fid,
-};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
deleted file mode 100644
index 46e501c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
+++ /dev/null
@@ -1,1072 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Portions Copyright 2007 Jeremy Teo */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef _KERNEL
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/resource.h>
-#include <sys/mntent.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/kmem.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/unistd.h>
-#include <sys/atomic.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_rlock.h>
-#include <sys/fs/zfs.h>
-#endif /* _KERNEL */
-
-#include <sys/dmu.h>
-#include <sys/refcount.h>
-#include <sys/stat.h>
-#include <sys/zap.h>
-#include <sys/zfs_znode.h>
-#include <sys/refcount.h>
-
-/* Used by fstat(1). */
-SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
- "sizeof(znode_t)");
-
-/*
- * Functions needed for userland (i.e., libzpool) are not put under
- * #ifdef _KERNEL; the rest of the functions have dependencies
- * (such as VFS logic) that will not compile easily in userland.
- */
-#ifdef _KERNEL
-struct kmem_cache *znode_cache = NULL;
-
-/*ARGSUSED*/
-static void
-znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
-{
- znode_t *zp = user_ptr;
- vnode_t *vp;
-
- mutex_enter(&zp->z_lock);
- vp = ZTOV(zp);
- if (vp == NULL) {
- mutex_exit(&zp->z_lock);
- zfs_znode_free(zp);
- } else if (vp->v_count == 0) {
- ZTOV(zp) = NULL;
- vhold(vp);
- mutex_exit(&zp->z_lock);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- vrecycle(vp, curthread);
- VOP_UNLOCK(vp, 0);
- vdrop(vp);
- zfs_znode_free(zp);
- } else {
-		/* signal forced unmount that this znode can be freed */
- zp->z_dbuf = NULL;
- mutex_exit(&zp->z_lock);
- }
-}
-
-extern struct vop_vector zfs_vnodeops;
-extern struct vop_vector zfs_fifoops;
-
-/*
- * XXX: We cannot use this function as a cache constructor, because
- * there is one global cache for all file systems and we would need
- * to pass vfsp here, which is not possible because the 'cdrarg'
- * argument is fixed at kmem_cache_create() time.
- */
-static int
-zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
-{
- znode_t *zp = buf;
- vnode_t *vp;
- vfs_t *vfsp = cdrarg;
- int error;
-
- if (cdrarg != NULL) {
- error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp);
- ASSERT(error == 0);
- zp->z_vnode = vp;
- vp->v_data = (caddr_t)zp;
- VN_LOCK_AREC(vp);
- VN_LOCK_ASHARE(vp);
- } else {
- zp->z_vnode = NULL;
- }
- mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
- rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
- rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
- mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
-
- mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&zp->z_range_avl, zfs_range_compare,
- sizeof (rl_t), offsetof(rl_t, r_node));
-
- zp->z_dbuf_held = 0;
- zp->z_dirlocks = 0;
- zp->z_lockf = NULL;
- return (0);
-}
-
-/*ARGSUSED*/
-static void
-zfs_znode_cache_destructor(void *buf, void *cdarg)
-{
- znode_t *zp = buf;
-
- ASSERT(zp->z_dirlocks == 0);
- mutex_destroy(&zp->z_lock);
- rw_destroy(&zp->z_map_lock);
- rw_destroy(&zp->z_parent_lock);
- rw_destroy(&zp->z_name_lock);
- mutex_destroy(&zp->z_acl_lock);
- mutex_destroy(&zp->z_range_lock);
- avl_destroy(&zp->z_range_avl);
-
- ASSERT(zp->z_dbuf_held == 0);
-}
-
-void
-zfs_znode_init(void)
-{
- /*
- * Initialize zcache
- */
- ASSERT(znode_cache == NULL);
- znode_cache = kmem_cache_create("zfs_znode_cache",
- sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
- zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
-}
-
-void
-zfs_znode_fini(void)
-{
- /*
- * Cleanup zcache
- */
- if (znode_cache)
- kmem_cache_destroy(znode_cache);
- znode_cache = NULL;
-}
-
-/*
- * zfs_init_fs - Initialize the zfsvfs struct and the file system's
- * in-core "master" object. Verify version compatibility.
- */
-int
-zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
-{
- objset_t *os = zfsvfs->z_os;
- uint64_t version = ZPL_VERSION;
- int i, error;
- dmu_object_info_t doi;
- uint64_t fsid_guid;
-
- *zpp = NULL;
-
- /*
- * XXX - hack to auto-create the pool root filesystem at
- * the first attempted mount.
- */
- if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
- dmu_tx_t *tx = dmu_tx_create(os);
-
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
- error = dmu_tx_assign(tx, TXG_WAIT);
- ASSERT3U(error, ==, 0);
- zfs_create_fs(os, cr, tx);
- dmu_tx_commit(tx);
- }
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_OBJ, 8, 1,
- &version);
- if (error) {
- return (error);
- } else if (version != ZPL_VERSION) {
- (void) printf("Mismatched versions: File system "
- "is version %lld on-disk format, which is "
- "incompatible with this software version %lld!",
- (u_longlong_t)version, ZPL_VERSION);
- return (ENOTSUP);
- }
-
- /*
- * The fsid is 64 bits, composed of an 8-bit fs type, which
- * separates our fsid from any other filesystem types, and a
- * 56-bit objset unique ID. The objset unique ID is unique to
- * all objsets open on this system, provided by unique_create().
- * The 8-bit fs type must be put in the low bits of fsid[1]
- * because that's where other Solaris filesystems put it.
- */
- fsid_guid = dmu_objset_fsid_guid(os);
- ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
- zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
- zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
- zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF;
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
- &zfsvfs->z_root);
- if (error)
- return (error);
- ASSERT(zfsvfs->z_root != 0);
-
- /*
- * Create the per mount vop tables.
- */
-
- /*
- *	 * Initialize zget mutexes
- */
- for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
- mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
-
- error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
- if (error)
- return (error);
- ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
- &zfsvfs->z_unlinkedobj);
- if (error)
- return (error);
-
- return (0);
-}
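
A worked example of the fsid packing performed near the end of zfs_init_fs() above, with the guid and vfc_typenum values assumed for illustration: the low 32 bits of the 56-bit guid land in val[0], and val[1] carries the high 24 bits shifted up with the 8-bit filesystem type in the low byte.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t fsid_guid = 0x00123456789abcdeULL;	/* assumed 56-bit guid */
	uint32_t vfc_typenum = 0x2a;			/* assumed fs type */
	uint32_t val0, val1;

	val0 = (uint32_t)fsid_guid;			/* 0x789abcde */
	val1 = (uint32_t)((fsid_guid >> 32) << 8) | (vfc_typenum & 0xFF);
	printf("val[0] = %#x, val[1] = %#x\n", val0, val1); /* 0x1234562a */
	return (0);
}
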
-
-/*
- * Define a couple of values we need available
- * for both 64-bit and 32-bit environments.
- */
-#ifndef NBITSMINOR64
-#define NBITSMINOR64 32
-#endif
-#ifndef MAXMAJ64
-#define MAXMAJ64 0xffffffffUL
-#endif
-#ifndef MAXMIN64
-#define MAXMIN64 0xffffffffUL
-#endif
-#ifndef major
-#define major(x) ((int)(((u_int)(x) >> 8)&0xff)) /* major number */
-#endif
-#ifndef minor
-#define minor(x) ((int)((x)&0xffff00ff)) /* minor number */
-#endif
-
-/*
- * Create special expldev for ZFS private use.
- * Can't use standard expldev since it doesn't do
- * what we want. The standard expldev() takes a
- * dev32_t in LP64 and expands it to a long dev_t.
- * We need an interface that takes a dev32_t in ILP32
- * and expands it to a long dev_t.
- */
-static uint64_t
-zfs_expldev(dev_t dev)
-{
- return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
-}
-/*
- * Special cmpldev for ZFS private use.
- * Can't use standard cmpldev since it takes
- * a long dev_t and compresses it to dev32_t in
- * LP64. We need to do a compaction of a long dev_t
- * to a dev32_t in ILP32.
- */
-dev_t
-zfs_cmpldev(uint64_t dev)
-{
- return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
-}
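
A round-trip sketch of the packing these two helpers perform, using the NBITSMINOR64 layout defined above (standalone; the kernel dev_t types are replaced with stdint equivalents):

#include <stdint.h>
#include <stdio.h>

#define	NBITSMINOR64	32
#define	MAXMIN64	0xffffffffUL

int
main(void)
{
	uint32_t maj = 13, min = 7;	/* example device numbers */

	/* Expand as zfs_expldev() does: major in the high 32 bits. */
	uint64_t expanded = ((uint64_t)maj << NBITSMINOR64) | min;

	/* Recover the halves as zfs_cmpldev()/makedev() would. */
	printf("major = %u, minor = %u\n",
	    (uint32_t)(expanded >> NBITSMINOR64),
	    (uint32_t)(expanded & MAXMIN64));
	return (0);
}
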
-
-/*
- * Construct a new znode/vnode and initialize it.
- *
- * This does not call dmu_set_user(); that is up to the
- * caller, in case you don't want to return the znode.
- */
-static znode_t *
-zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
-{
- znode_t *zp;
- vnode_t *vp;
- int error;
-
- zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
- zfs_znode_cache_constructor(zp, zfsvfs->z_vfs, 0);
-
- ASSERT(zp->z_dirlocks == NULL);
-
- zp->z_phys = db->db_data;
- zp->z_zfsvfs = zfsvfs;
- zp->z_unlinked = 0;
- zp->z_atime_dirty = 0;
- zp->z_dbuf_held = 0;
- zp->z_mapcnt = 0;
- zp->z_last_itx = 0;
- zp->z_dbuf = db;
- zp->z_id = obj_num;
- zp->z_blksz = blksz;
- zp->z_seq = 0x7A4653;
- zp->z_sync_cnt = 0;
-
- mutex_enter(&zfsvfs->z_znodes_lock);
- list_insert_tail(&zfsvfs->z_all_znodes, zp);
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- vp = ZTOV(zp);
- if (vp == NULL)
- return (zp);
-
- error = insmntque(vp, zfsvfs->z_vfs);
- KASSERT(error == 0, ("insmntque() failed: error %d", error));
-
- vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
- switch (vp->v_type) {
- case VDIR:
- zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
- break;
- case VFIFO:
- vp->v_op = &zfs_fifoops;
- break;
- }
-
- return (zp);
-}
-
-static void
-zfs_znode_dmu_init(znode_t *zp)
-{
- znode_t *nzp;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- dmu_buf_t *db = zp->z_dbuf;
-
- mutex_enter(&zp->z_lock);
-
- nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_pageout_func);
-
- /*
-	 * There should be no concurrent zgets on this object.
- */
- ASSERT3P(nzp, ==, NULL);
-
- /*
- * Slap on VROOT if we are the root znode
- */
- if (zp->z_id == zfsvfs->z_root) {
- ZTOV(zp)->v_flag |= VROOT;
- }
-
- ASSERT(zp->z_dbuf_held == 0);
- zp->z_dbuf_held = 1;
- VFS_HOLD(zfsvfs->z_vfs);
- mutex_exit(&zp->z_lock);
-}
-
-/*
- * Create a new DMU object to hold a zfs znode.
- *
- * IN: dzp - parent directory for new znode
- * vap - file attributes for new znode
- * tx - dmu transaction id for zap operations
- * cr - credentials of caller
- * flag - flags:
- * IS_ROOT_NODE - new object will be root
- * IS_XATTR - new object is an attribute
- * IS_REPLAY - intent log replay
- *
- * OUT: oid - ID of created object
- *
- */
-void
-zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
- uint_t flag, znode_t **zpp, int bonuslen)
-{
- dmu_buf_t *dbp;
- znode_phys_t *pzp;
- znode_t *zp;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- timestruc_t now;
- uint64_t gen;
- int err;
-
- ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
-
- if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */
- *oid = vap->va_nodeid;
- flag |= IS_REPLAY;
- now = vap->va_ctime; /* see zfs_replay_create() */
- gen = vap->va_nblocks; /* ditto */
- } else {
- *oid = 0;
- gethrestime(&now);
- gen = dmu_tx_get_txg(tx);
- }
-
- /*
- * Create a new DMU object.
- */
- /*
-	 * There's currently no mechanism for pre-reading the blocks that will
-	 * be needed to allocate a new object, so we accept the small chance
- * that there will be an i/o error and we will fail one of the
- * assertions below.
- */
- if (vap->va_type == VDIR) {
- if (flag & IS_REPLAY) {
- err = zap_create_claim(zfsvfs->z_os, *oid,
- DMU_OT_DIRECTORY_CONTENTS,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
- ASSERT3U(err, ==, 0);
- } else {
- *oid = zap_create(zfsvfs->z_os,
- DMU_OT_DIRECTORY_CONTENTS,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
- }
- } else {
- if (flag & IS_REPLAY) {
- err = dmu_object_claim(zfsvfs->z_os, *oid,
- DMU_OT_PLAIN_FILE_CONTENTS, 0,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
- ASSERT3U(err, ==, 0);
- } else {
- *oid = dmu_object_alloc(zfsvfs->z_os,
- DMU_OT_PLAIN_FILE_CONTENTS, 0,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
- }
- }
- VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp));
- dmu_buf_will_dirty(dbp, tx);
-
- /*
- * Initialize the znode physical data to zero.
- */
- ASSERT(dbp->db_size >= sizeof (znode_phys_t));
- bzero(dbp->db_data, dbp->db_size);
- pzp = dbp->db_data;
-
- /*
- * If this is the root, fix up the half-initialized parent pointer
- * to reference the just-allocated physical data area.
- */
- if (flag & IS_ROOT_NODE) {
- dzp->z_phys = pzp;
- dzp->z_id = *oid;
- }
-
- /*
- * If parent is an xattr, so am I.
- */
- if (dzp->z_phys->zp_flags & ZFS_XATTR)
- flag |= IS_XATTR;
-
- if (vap->va_type == VBLK || vap->va_type == VCHR) {
- pzp->zp_rdev = zfs_expldev(vap->va_rdev);
- }
-
- if (vap->va_type == VDIR) {
- pzp->zp_size = 2; /* contents ("." and "..") */
- pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
- }
-
- pzp->zp_parent = dzp->z_id;
- if (flag & IS_XATTR)
- pzp->zp_flags |= ZFS_XATTR;
-
- pzp->zp_gen = gen;
-
- ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
- ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
-
- if (vap->va_mask & AT_ATIME) {
- ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
- } else {
- ZFS_TIME_ENCODE(&now, pzp->zp_atime);
- }
-
- if (vap->va_mask & AT_MTIME) {
- ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
- } else {
- ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
- }
-
- pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
- zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);
-
- zfs_perm_init(zp, dzp, flag, vap, tx, cr);
-
- if (zpp) {
- kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);
-
- mutex_enter(hash_mtx);
- zfs_znode_dmu_init(zp);
- mutex_exit(hash_mtx);
-
- *zpp = zp;
- } else {
- if (ZTOV(zp) != NULL)
- ZTOV(zp)->v_count = 0;
- dmu_buf_rele(dbp, NULL);
- zfs_znode_free(zp);
- }
-}
-
-int
-zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
-{
- dmu_object_info_t doi;
- dmu_buf_t *db;
- znode_t *zp;
- vnode_t *vp;
- int err;
-
- *zpp = NULL;
-
- ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
-
- err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
- if (err) {
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (err);
- }
-
- dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_ZNODE ||
- doi.doi_bonus_size < sizeof (znode_phys_t)) {
- dmu_buf_rele(db, NULL);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (EINVAL);
- }
-
- ASSERT(db->db_object == obj_num);
- ASSERT(db->db_offset == -1);
- ASSERT(db->db_data != NULL);
-
- zp = dmu_buf_get_user(db);
-
- if (zp != NULL) {
- mutex_enter(&zp->z_lock);
-
- ASSERT3U(zp->z_id, ==, obj_num);
- if (zp->z_unlinked) {
- dmu_buf_rele(db, NULL);
- mutex_exit(&zp->z_lock);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (ENOENT);
- } else if (zp->z_dbuf_held) {
- dmu_buf_rele(db, NULL);
- } else {
- zp->z_dbuf_held = 1;
- VFS_HOLD(zfsvfs->z_vfs);
- }
-
- if (ZTOV(zp) != NULL)
- VN_HOLD(ZTOV(zp));
- else {
- err = getnewvnode("zfs", zfsvfs->z_vfs, &zfs_vnodeops,
- &zp->z_vnode);
- ASSERT(err == 0);
- vp = ZTOV(zp);
- vp->v_data = (caddr_t)zp;
- VN_LOCK_AREC(vp);
- VN_LOCK_ASHARE(vp);
- vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
- if (vp->v_type == VDIR)
- zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
- err = insmntque(vp, zfsvfs->z_vfs);
- KASSERT(err == 0, ("insmntque() failed: error %d", err));
- }
- mutex_exit(&zp->z_lock);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- *zpp = zp;
- return (0);
- }
-
- /*
-	 * Not found; create a new znode/vnode.
- */
- zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
- ASSERT3U(zp->z_id, ==, obj_num);
- zfs_znode_dmu_init(zp);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- *zpp = zp;
- return (0);
-}
-
-void
-zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
-
- ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
- if (zp->z_phys->zp_acl.z_acl_extern_obj) {
- error = dmu_object_free(zfsvfs->z_os,
- zp->z_phys->zp_acl.z_acl_extern_obj, tx);
- ASSERT3U(error, ==, 0);
- }
- error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
- ASSERT3U(error, ==, 0);
- zp->z_dbuf_held = 0;
- ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
- dmu_buf_rele(zp->z_dbuf, NULL);
-}
-
-void
-zfs_zinactive(znode_t *zp)
-{
- vnode_t *vp = ZTOV(zp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint64_t z_id = zp->z_id;
-
- ASSERT(zp->z_dbuf_held && zp->z_phys);
-
- /*
-	 * Don't allow a zfs_zget() while we're trying to release this znode.
- */
- ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
-
- mutex_enter(&zp->z_lock);
- VI_LOCK(vp);
- if (vp->v_count > 0) {
- /*
- * If the hold count is greater than zero, somebody has
- * obtained a new reference on this znode while we were
- * processing it here, so we are done.
- */
- VI_UNLOCK(vp);
- mutex_exit(&zp->z_lock);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
- return;
- }
- VI_UNLOCK(vp);
-
- /*
- * If this was the last reference to a file with no links,
- * remove the file from the file system.
- */
- if (zp->z_unlinked) {
- ZTOV(zp) = NULL;
- mutex_exit(&zp->z_lock);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
- ASSERT(vp->v_count == 0);
- vrecycle(vp, curthread);
- zfs_rmnode(zp);
- VFS_RELE(zfsvfs->z_vfs);
- return;
- }
- ASSERT(zp->z_phys);
- ASSERT(zp->z_dbuf_held);
- mutex_exit(&zp->z_lock);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
-}
-
-void
-zfs_znode_free(znode_t *zp)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- mutex_enter(&zfsvfs->z_znodes_lock);
- list_remove(&zfsvfs->z_all_znodes, zp);
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- kmem_cache_free(znode_cache, zp);
-}
-
-void
-zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
-{
- timestruc_t now;
-
- ASSERT(MUTEX_HELD(&zp->z_lock));
-
- gethrestime(&now);
-
- if (tx) {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
- zp->z_atime_dirty = 0;
- zp->z_seq++;
- } else {
- zp->z_atime_dirty = 1;
- }
-
- if (flag & AT_ATIME)
- ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
-
- if (flag & AT_MTIME)
- ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
-
- if (flag & AT_CTIME)
- ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
-}
-
-/*
- * Update the requested znode timestamps with the current time.
- * If we are in a transaction, then go ahead and mark the znode
- * dirty in the transaction so the timestamps will go to disk.
- * Otherwise, we will get pushed next time the znode is updated
- * in a transaction, or when this znode eventually goes inactive.
- *
- * Why is this OK?
- * 1 - Only the ACCESS time is ever updated outside of a transaction.
- * 2 - Multiple consecutive updates will be collapsed into a single
- * znode update by the transaction grouping semantics of the DMU.
- */
-void
-zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
-{
- mutex_enter(&zp->z_lock);
- zfs_time_stamper_locked(zp, flag, tx);
- mutex_exit(&zp->z_lock);
-}
-
-/*
- * Grow the block size for a file.
- *
- * IN: zp - znode of file to free data in.
- * size - requested block size
- * tx - open transaction.
- *
- * NOTE: this function assumes that the znode is write locked.
- */
-void
-zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
-{
- int error;
- u_longlong_t dummy;
-
- if (size <= zp->z_blksz)
- return;
- /*
- * If the file size is already greater than the current blocksize,
- * we will not grow. If there is more than one block in a file,
- * the blocksize cannot change.
- */
- if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
- return;
-
- error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
- size, 0, tx);
- if (error == ENOTSUP)
- return;
- ASSERT3U(error, ==, 0);
-
- /* What blocksize did we actually get? */
- dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
-}
-
-/*
- * Free space in a file.
- *
- * IN: zp - znode of file to free data in.
- * off - start of section to free.
- * len - length of section to free (0 => to EOF).
- * flag - current file open mode flags.
- *
- * RETURN: 0 if success
- * error code if failure
- */
-int
-zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
-{
- vnode_t *vp = ZTOV(zp);
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- rl_t *rl;
- uint64_t end = off + len;
- uint64_t size, new_blksz;
- int error;
-
- if (ZTOV(zp)->v_type == VFIFO)
- return (0);
-
- /*
- * If we will change zp_size then lock the whole file,
- * otherwise just lock the range being freed.
- */
- if (len == 0 || off + len > zp->z_phys->zp_size) {
- rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
- } else {
- rl = zfs_range_lock(zp, off, len, RL_WRITER);
- /* recheck, in case zp_size changed */
- if (off + len > zp->z_phys->zp_size) {
- /* lost race: file size changed, lock whole file */
- zfs_range_unlock(rl);
- rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
- }
- }
-
- /*
- * Nothing to do if file already at desired length.
- */
- size = zp->z_phys->zp_size;
- if (len == 0 && size == off && off != 0) {
- zfs_range_unlock(rl);
- return (0);
- }
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
- new_blksz = 0;
- if (end > size &&
- (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
- /*
- * We are growing the file past the current block size.
- */
- if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
- ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
- } else {
- new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
- }
- dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz));
- } else if (off < size) {
- /*
- * If len == 0, we are truncating the file.
- */
- dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
- }
-
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- zfs_range_unlock(rl);
- return (error);
- }
-
- if (new_blksz)
- zfs_grow_blocksize(zp, new_blksz, tx);
-
- if (end > size || len == 0)
- zp->z_phys->zp_size = end;
-
- if (off < size) {
- objset_t *os = zfsvfs->z_os;
- uint64_t rlen = len;
-
- if (len == 0)
- rlen = -1;
- else if (end > size)
- rlen = size - off;
- VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx));
- }
-
- if (log) {
- zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
- zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
- }
-
- zfs_range_unlock(rl);
-
- dmu_tx_commit(tx);
-
- /*
- * Clear any mapped pages in the truncated region. This has to
- * happen outside of the transaction to avoid the possibility of
- * a deadlock with someone trying to push a page that we are
- * about to invalidate.
- */
- rw_enter(&zp->z_map_lock, RW_WRITER);
- if (end > size)
- vnode_pager_setsize(vp, end);
- else if (len == 0) {
-#if 0
- error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE);
-#else
- error = vinvalbuf(vp, V_SAVE, curthread, 0, 0);
- vnode_pager_setsize(vp, end);
-#endif
- }
- rw_exit(&zp->z_map_lock);
-
- return (0);
-}
-
-void
-zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
-{
- zfsvfs_t zfsvfs;
- uint64_t moid, doid, roid = 0;
- uint64_t version = ZPL_VERSION;
- int error;
- znode_t *rootzp = NULL;
- vattr_t vattr;
-
- /*
-	 * First, attempt to create the master node.
- */
- /*
- * In an empty objset, there are no blocks to read and thus
- * there can be no i/o errors (which we assert below).
- */
- moid = MASTER_NODE_OBJ;
- error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
- DMU_OT_NONE, 0, tx);
- ASSERT(error == 0);
-
- /*
- * Set starting attributes.
- */
-
- error = zap_update(os, moid, ZPL_VERSION_OBJ, 8, 1, &version, tx);
- ASSERT(error == 0);
-
- /*
- * Create a delete queue.
- */
- doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
-
- error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
- ASSERT(error == 0);
-
- /*
- * Create root znode. Create minimal znode/vnode/zfsvfs
- * to allow zfs_mknode to work.
- */
- vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
- vattr.va_type = VDIR;
- vattr.va_mode = S_IFDIR|0755;
- vattr.va_uid = UID_ROOT;
- vattr.va_gid = GID_WHEEL;
-
- rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
- zfs_znode_cache_constructor(rootzp, NULL, 0);
- rootzp->z_zfsvfs = &zfsvfs;
- rootzp->z_unlinked = 0;
- rootzp->z_atime_dirty = 0;
- rootzp->z_dbuf_held = 0;
-
- bzero(&zfsvfs, sizeof (zfsvfs_t));
-
- zfsvfs.z_os = os;
- zfsvfs.z_assign = TXG_NOWAIT;
- zfsvfs.z_parent = &zfsvfs;
-
- mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
- offsetof(znode_t, z_link_node));
-
- zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
- ASSERT3U(rootzp->z_id, ==, roid);
- error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
- ASSERT(error == 0);
-
- mutex_destroy(&zfsvfs.z_znodes_lock);
- kmem_cache_free(znode_cache, rootzp);
-}
-#endif /* _KERNEL */
-
-/*
- * Given an object number, return its parent object number and whether
- * or not the object is an extended attribute directory.
- */
-static int
-zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
-{
- dmu_buf_t *db;
- dmu_object_info_t doi;
- znode_phys_t *zp;
- int error;
-
- if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
- return (error);
-
- dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_ZNODE ||
- doi.doi_bonus_size < sizeof (znode_phys_t)) {
- dmu_buf_rele(db, FTAG);
- return (EINVAL);
- }
-
- zp = db->db_data;
- *pobjp = zp->zp_parent;
- *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
- S_ISDIR(zp->zp_mode);
- dmu_buf_rele(db, FTAG);
-
- return (0);
-}
-
-int
-zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
-{
- char *path = buf + len - 1;
- int error;
-
- *path = '\0';
-
- for (;;) {
- uint64_t pobj;
- char component[MAXNAMELEN + 2];
- size_t complen;
- int is_xattrdir;
-
- if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
- &is_xattrdir)) != 0)
- break;
-
- if (pobj == obj) {
- if (path[0] != '/')
- *--path = '/';
- break;
- }
-
- component[0] = '/';
- if (is_xattrdir) {
- (void) sprintf(component + 1, "<xattrdir>");
- } else {
- error = zap_value_search(osp, pobj, obj, component + 1);
- if (error != 0)
- break;
- }
-
- complen = strlen(component);
- path -= complen;
- ASSERT(path >= buf);
- bcopy(component, path, complen);
- obj = pobj;
- }
-
- if (error == 0)
- (void) memmove(buf, path, buf + len - path);
- return (error);
-}
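
A hypothetical caller of zfs_obj_to_path(), to show the contract: the path is built right-to-left from the end of the supplied buffer and memmove()d to the front on success, so the caller only provides a buffer and its length. print_obj_path() and its use of printf() are illustrative assumptions, not part of the original file.

/*
 * Hypothetical caller; osp and obj are assumed valid, and printf()
 * is used the same way as in zfs_init_fs() above.
 */
static void
print_obj_path(objset_t *osp, uint64_t obj)
{
	char buf[MAXPATHLEN];

	if (zfs_obj_to_path(osp, obj, buf, sizeof (buf)) == 0)
		printf("object %llu is at %s\n", (u_longlong_t)obj, buf);
}
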
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zil.c
deleted file mode 100644
index 69ee509..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zil.c
+++ /dev/null
@@ -1,1607 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/zap.h>
-#include <sys/arc.h>
-#include <sys/stat.h>
-#include <sys/resource.h>
-#include <sys/zil.h>
-#include <sys/zil_impl.h>
-#include <sys/dsl_dataset.h>
-#include <sys/vdev.h>
-#include <sys/dmu_tx.h>
-
-/*
- * The zfs intent log (ZIL) saves transaction records of system calls
- * that change the file system in memory with enough information
- * to be able to replay them. These are stored in memory until
- * either the DMU transaction group (txg) commits them to the stable pool
- * and they can be discarded, or they are flushed to the stable log
- * (also in the pool) due to a fsync, O_DSYNC or other synchronous
- * requirement. In the event of a panic or power fail then those log
- * records (transactions) are replayed.
- *
- * There is one ZIL per file system. Its on-disk (pool) format consists
- * of 3 parts:
- *
- * - ZIL header
- * - ZIL blocks
- * - ZIL records
- *
- * A log record holds a system call transaction. Log blocks can
- * hold many log records and the blocks are chained together.
- * Each ZIL block contains a block pointer (blkptr_t) to the next
- * ZIL block in the chain. The ZIL header points to the first
- * block in the chain. Note there is not a fixed place in the pool
- * to hold blocks. They are dynamically allocated and freed as
- * needed from the blocks available. The sketch below shows the
- * ZIL structure:
- */
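
A reconstruction of that structure from the description above: the header's zh_log points at the first block, and each block ends in a trailer whose zit_next_blk is the blkptr of the next block. zil_trailer_t and the BP_* macros are the real ones used later in this file, while read_log_block() here is a hypothetical helper standing in for the arc_read() path below.

/*
 * zil_walk_chain_sketch() is illustrative only; read_log_block() is a
 * hypothetical helper standing in for the arc_read() logic below.
 */
static void
zil_walk_chain_sketch(const zil_header_t *zh)
{
	blkptr_t bp = zh->zh_log;	/* header points at the first block */

	while (!BP_IS_HOLE(&bp)) {
		char *buf = read_log_block(&bp);	/* hypothetical */
		zil_trailer_t *ztp =
		    (zil_trailer_t *)(buf + BP_GET_LSIZE(&bp)) - 1;

		/* Log records occupy buf[0 .. ztp->zit_nused). */
		bp = ztp->zit_next_blk;	/* trailer chains to the next block */
	}
}
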
-
-/*
- * This global ZIL switch affects all pools
- */
-int zil_disable = 0; /* disable intent logging */
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.zil_disable", &zil_disable);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_disable, CTLFLAG_RW, &zil_disable, 0,
- "Disable ZFS Intent Log (ZIL)");
-
-/*
- * Tunable parameter for debugging or performance analysis. Setting
- * zfs_nocacheflush will cause corruption on power loss if a volatile
- * out-of-order write cache is enabled.
- */
-boolean_t zfs_nocacheflush = B_FALSE;
-TUNABLE_INT("vfs.zfs.cache_flush_disable", &zfs_nocacheflush);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN,
- &zfs_nocacheflush, 0, "Disable cache flush");
-
-static kmem_cache_t *zil_lwb_cache;
-
-static int
-zil_dva_compare(const void *x1, const void *x2)
-{
- const dva_t *dva1 = x1;
- const dva_t *dva2 = x2;
-
- if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
- return (-1);
- if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
- return (1);
-
- if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
- return (-1);
- if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
- return (1);
-
- return (0);
-}
-
-static void
-zil_dva_tree_init(avl_tree_t *t)
-{
- avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
- offsetof(zil_dva_node_t, zn_node));
-}
-
-static void
-zil_dva_tree_fini(avl_tree_t *t)
-{
- zil_dva_node_t *zn;
- void *cookie = NULL;
-
- while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
- kmem_free(zn, sizeof (zil_dva_node_t));
-
- avl_destroy(t);
-}
-
-static int
-zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
-{
- zil_dva_node_t *zn;
- avl_index_t where;
-
- if (avl_find(t, dva, &where) != NULL)
- return (EEXIST);
-
- zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
- zn->zn_dva = *dva;
- avl_insert(t, zn, where);
-
- return (0);
-}
-
-static zil_header_t *
-zil_header_in_syncing_context(zilog_t *zilog)
-{
- return ((zil_header_t *)zilog->zl_header);
-}
-
-static void
-zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
-{
- zio_cksum_t *zc = &bp->blk_cksum;
-
- zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
- zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
- zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
- zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
-}
-
-/*
- * Read a log block, make sure it's valid, and byteswap it if necessary.
- */
-static int
-zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
-{
- blkptr_t blk = *bp;
- zbookmark_t zb;
- uint32_t aflags = ARC_WAIT;
- int error;
-
- zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
-
- *abufpp = NULL;
-
- error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array,
- arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
- ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);
-
- if (error == 0) {
- char *data = (*abufpp)->b_data;
- uint64_t blksz = BP_GET_LSIZE(bp);
- zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
- zio_cksum_t cksum = bp->blk_cksum;
-
- /*
- * Sequence numbers should be... sequential. The checksum
- * verifier for the next block should be bp's checksum plus 1.
- */
- cksum.zc_word[ZIL_ZC_SEQ]++;
-
- if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)))
- error = ESTALE;
- else if (BP_IS_HOLE(&ztp->zit_next_blk))
- error = ENOENT;
- else if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))
- error = EOVERFLOW;
-
- if (error) {
- VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
- *abufpp = NULL;
- }
- }
-
- dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);
-
- return (error);
-}
-
-/*
- * Parse the intent log, and call parse_func for each valid record within.
- * Return the highest sequence number.
- */
-uint64_t
-zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
- zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
-{
- const zil_header_t *zh = zilog->zl_header;
- uint64_t claim_seq = zh->zh_claim_seq;
- uint64_t seq = 0;
- uint64_t max_seq = 0;
- blkptr_t blk = zh->zh_log;
- arc_buf_t *abuf;
- char *lrbuf, *lrp;
- zil_trailer_t *ztp;
- int reclen, error;
-
- if (BP_IS_HOLE(&blk))
- return (max_seq);
-
- /*
- * Starting at the block pointed to by zh_log we read the log chain.
- * For each block in the chain we rigorously check that block to
- * ensure its validity. We stop when an invalid block is found.
- * For each block pointer in the chain we call parse_blk_func().
- * For each record in each valid block we call parse_lr_func().
- * If the log has been claimed, stop if we encounter a sequence
- * number greater than the highest claimed sequence number.
- */
- zil_dva_tree_init(&zilog->zl_dva_tree);
- for (;;) {
- seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
-
- if (claim_seq != 0 && seq > claim_seq)
- break;
-
- ASSERT(max_seq < seq);
- max_seq = seq;
-
- error = zil_read_log_block(zilog, &blk, &abuf);
-
- if (parse_blk_func != NULL)
- parse_blk_func(zilog, &blk, arg, txg);
-
- if (error)
- break;
-
- lrbuf = abuf->b_data;
- ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
- blk = ztp->zit_next_blk;
-
- if (parse_lr_func == NULL) {
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
- continue;
- }
-
- for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
- lr_t *lr = (lr_t *)lrp;
- reclen = lr->lrc_reclen;
- ASSERT3U(reclen, >=, sizeof (lr_t));
- parse_lr_func(zilog, lr, arg, txg);
- }
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
- }
- zil_dva_tree_fini(&zilog->zl_dva_tree);
-
- return (max_seq);
-}
-
-/* ARGSUSED */
-static void
-zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
-{
- spa_t *spa = zilog->zl_spa;
- int err;
-
- /*
- * Claim log block if not already committed and not already claimed.
- */
- if (bp->blk_birth >= first_txg &&
- zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
- err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL));
- ASSERT(err == 0);
- }
-}
-
-static void
-zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
-{
- if (lrc->lrc_txtype == TX_WRITE) {
- lr_write_t *lr = (lr_write_t *)lrc;
- zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg);
- }
-}
-
-/* ARGSUSED */
-static void
-zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
-{
- zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
-}
-
-static void
-zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
-{
- /*
- * If we previously claimed it, we need to free it.
- */
- if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) {
- lr_write_t *lr = (lr_write_t *)lrc;
- blkptr_t *bp = &lr->lr_blkptr;
- if (bp->blk_birth >= claim_txg &&
- !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) {
- (void) arc_free(NULL, zilog->zl_spa,
- dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT);
- }
- }
-}
-
-/*
- * Create an on-disk intent log.
- */
-static void
-zil_create(zilog_t *zilog)
-{
- const zil_header_t *zh = zilog->zl_header;
- lwb_t *lwb;
- uint64_t txg = 0;
- dmu_tx_t *tx = NULL;
- blkptr_t blk;
- int error = 0;
-
- /*
- * Wait for any previous destroy to complete.
- */
- txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
-
- ASSERT(zh->zh_claim_txg == 0);
- ASSERT(zh->zh_replay_seq == 0);
-
- blk = zh->zh_log;
-
- /*
- * If we don't already have an initial log block, allocate one now.
- */
- if (BP_IS_HOLE(&blk)) {
- tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- txg = dmu_tx_get_txg(tx);
-
- error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
- NULL, txg);
-
- if (error == 0)
- zil_init_log_chain(zilog, &blk);
- }
-
- /*
- * Allocate a log write buffer (lwb) for the first log block.
- */
- if (error == 0) {
- lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
- lwb->lwb_zilog = zilog;
- lwb->lwb_blk = blk;
- lwb->lwb_nused = 0;
- lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
- lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
- lwb->lwb_max_txg = txg;
- lwb->lwb_zio = NULL;
-
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_lwb_list, lwb);
- mutex_exit(&zilog->zl_lock);
- }
-
- /*
- * If we just allocated the first log block, commit our transaction
-	 * and wait for zil_sync() to stuff the block pointer into zh_log.
- * (zh is part of the MOS, so we cannot modify it in open context.)
- */
- if (tx != NULL) {
- dmu_tx_commit(tx);
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- }
-
- ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
-}
-
-/*
- * In one tx, free all log blocks and clear the log header.
- * If keep_first is set, then we're replaying a log with no content.
- * We want to keep the first block, however, so that the first
- * synchronous transaction doesn't require a txg_wait_synced()
- * in zil_create(). We don't need to txg_wait_synced() here either
- * when keep_first is set, because both zil_create() and zil_destroy()
- * will wait for any in-progress destroys to complete.
- */
-void
-zil_destroy(zilog_t *zilog, boolean_t keep_first)
-{
- const zil_header_t *zh = zilog->zl_header;
- lwb_t *lwb;
- dmu_tx_t *tx;
- uint64_t txg;
-
- /*
- * Wait for any previous destroy to complete.
- */
- txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
-
- if (BP_IS_HOLE(&zh->zh_log))
- return;
-
- tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- txg = dmu_tx_get_txg(tx);
-
- mutex_enter(&zilog->zl_lock);
-
- ASSERT3U(zilog->zl_destroy_txg, <, txg);
- zilog->zl_destroy_txg = txg;
- zilog->zl_keep_first = keep_first;
-
- if (!list_is_empty(&zilog->zl_lwb_list)) {
- ASSERT(zh->zh_claim_txg == 0);
- ASSERT(!keep_first);
- while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
- list_remove(&zilog->zl_lwb_list, lwb);
- if (lwb->lwb_buf != NULL)
- zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
- zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
- kmem_cache_free(zil_lwb_cache, lwb);
- }
- } else {
- if (!keep_first) {
- (void) zil_parse(zilog, zil_free_log_block,
- zil_free_log_record, tx, zh->zh_claim_txg);
- }
- }
- mutex_exit(&zilog->zl_lock);
-
- dmu_tx_commit(tx);
-
- if (keep_first) /* no need to wait in this case */
- return;
-
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- ASSERT(BP_IS_HOLE(&zh->zh_log));
-}
-
-int
-zil_claim(char *osname, void *txarg)
-{
- dmu_tx_t *tx = txarg;
- uint64_t first_txg = dmu_tx_get_txg(tx);
- zilog_t *zilog;
- zil_header_t *zh;
- objset_t *os;
- int error;
-
- error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os);
- if (error) {
- cmn_err(CE_WARN, "can't process intent log for %s", osname);
- return (0);
- }
-
- zilog = dmu_objset_zil(os);
- zh = zil_header_in_syncing_context(zilog);
-
- /*
- * Claim all log blocks if we haven't already done so, and remember
- * the highest claimed sequence number. This ensures that if we can
- * read only part of the log now (e.g. due to a missing device),
- * but we can read the entire log later, we will not try to replay
- * or destroy beyond the last block we successfully claimed.
- */
- ASSERT3U(zh->zh_claim_txg, <=, first_txg);
- if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
- zh->zh_claim_txg = first_txg;
- zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
- zil_claim_log_record, tx, first_txg);
- dsl_dataset_dirty(dmu_objset_ds(os), tx);
- }
-
- ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
- dmu_objset_close(os);
- return (0);
-}
-
-void
-zil_add_vdev(zilog_t *zilog, uint64_t vdev)
-{
- zil_vdev_t *zv, *new;
- uint64_t bmap_sz = sizeof (zilog->zl_vdev_bmap) << 3;
- uchar_t *cp;
-
- if (zfs_nocacheflush)
- return;
-
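-	/*
-	 * Fast path: small vdev ids are tracked in a fixed per-zilog
-	 * bitmap.  bmap_sz is the bitmap size in bits (the << 3 turns
-	 * the byte count into a bit count).  Illustrative example,
-	 * assuming (not verified here) a 16-byte zl_vdev_bmap:
-	 * bmap_sz = 128, and vdev 10 sets bit 10 % 8 = 2 of byte
-	 * 10 / 8 = 1.
-	 */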
- if (vdev < bmap_sz) {
- cp = zilog->zl_vdev_bmap + (vdev / 8);
- atomic_or_8(cp, 1 << (vdev % 8));
- } else {
- /*
- * insert into ordered list
- */
- mutex_enter(&zilog->zl_lock);
- for (zv = list_head(&zilog->zl_vdev_list); zv != NULL;
- zv = list_next(&zilog->zl_vdev_list, zv)) {
- if (zv->vdev == vdev) {
- /* duplicate found - just return */
- mutex_exit(&zilog->zl_lock);
- return;
- }
- if (zv->vdev > vdev) {
- /* insert before this entry */
- new = kmem_alloc(sizeof (zil_vdev_t),
- KM_SLEEP);
- new->vdev = vdev;
- list_insert_before(&zilog->zl_vdev_list,
- zv, new);
- mutex_exit(&zilog->zl_lock);
- return;
- }
- }
- /* ran off end of list, insert at the end */
- ASSERT(zv == NULL);
- new = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP);
- new->vdev = vdev;
- list_insert_tail(&zilog->zl_vdev_list, new);
- mutex_exit(&zilog->zl_lock);
- }
-}
-
-/* start an async flush of the write cache for this vdev */
-void
-zil_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio)
-{
- vdev_t *vd;
-
- if (*zio == NULL)
- *zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-
- vd = vdev_lookup_top(spa, vdev);
- ASSERT(vd);
-
- (void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE,
- NULL, NULL, ZIO_PRIORITY_NOW,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
-}
-
-void
-zil_flush_vdevs(zilog_t *zilog)
-{
- zil_vdev_t *zv;
- zio_t *zio = NULL;
- spa_t *spa = zilog->zl_spa;
- uint64_t vdev;
- uint8_t b;
- int i, j;
-
- ASSERT(zilog->zl_writer);
-
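-	/*
-	 * Scan the bitmap for vdevs recorded via the fast path in
-	 * zil_add_vdev(); each vdev id is rebuilt from its byte and
-	 * bit position as (i << 3) + j.
-	 */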
- for (i = 0; i < sizeof (zilog->zl_vdev_bmap); i++) {
- b = zilog->zl_vdev_bmap[i];
- if (b == 0)
- continue;
- for (j = 0; j < 8; j++) {
- if (b & (1 << j)) {
- vdev = (i << 3) + j;
- zil_flush_vdev(spa, vdev, &zio);
- }
- }
- zilog->zl_vdev_bmap[i] = 0;
- }
-
- while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) {
- zil_flush_vdev(spa, zv->vdev, &zio);
- list_remove(&zilog->zl_vdev_list, zv);
- kmem_free(zv, sizeof (zil_vdev_t));
- }
- /*
- * Wait for all the flushes to complete. Not all devices actually
- * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
- */
- if (zio)
- (void) zio_wait(zio);
-}
-
-/*
- * Function called when a log block write completes
- */
-static void
-zil_lwb_write_done(zio_t *zio)
-{
- lwb_t *lwb = zio->io_private;
- zilog_t *zilog = lwb->lwb_zilog;
-
- /*
- * Now that we've written this log block, we have a stable pointer
- * to the next block in the chain, so it's OK to let the txg in
- * which we allocated the next block sync.
- */
- txg_rele_to_sync(&lwb->lwb_txgh);
-
- zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
- mutex_enter(&zilog->zl_lock);
- lwb->lwb_buf = NULL;
- if (zio->io_error) {
- zilog->zl_log_error = B_TRUE;
- mutex_exit(&zilog->zl_lock);
- return;
- }
- mutex_exit(&zilog->zl_lock);
-}
-
-/*
- * Initialize the I/O for a log block.
- *
- * Note that we should not initialize the I/O until we are about
- * to use it, since zio_rewrite() does a spa_config_enter().
- */
-static void
-zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
-{
- zbookmark_t zb;
-
- zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
-
- if (zilog->zl_root_zio == NULL) {
- zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
- ZIO_FLAG_CANFAIL);
- }
- if (lwb->lwb_zio == NULL) {
- lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
- ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf,
- lwb->lwb_sz, zil_lwb_write_done, lwb,
- ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
- }
-}
-
-/*
- * Start a log block write and advance to the next log block.
- * Calls are serialized.
- */
-static lwb_t *
-zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
-{
- lwb_t *nlwb;
- zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
- spa_t *spa = zilog->zl_spa;
- blkptr_t *bp = &ztp->zit_next_blk;
- uint64_t txg;
- uint64_t zil_blksz;
- int error;
-
- ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
-
- /*
- * Allocate the next block and save its address in this block
- * before writing it in order to establish the log chain.
- * Note that if the allocation of nlwb synced before we wrote
- * the block that points at it (lwb), we'd leak it if we crashed.
- * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
- */
- txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
- txg_rele_to_quiesce(&lwb->lwb_txgh);
-
- /*
- * Pick a ZIL blocksize. We request a size that is the
- * maximum of the previous used size, the current used size and
- * the amount waiting in the queue.
- */
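-	/*
-	 * Illustrative example (values assumed, not from any real
-	 * pool): with zl_prev_used = 20K, zl_cur_used + sizeof (*ztp)
-	 * = 36K and zl_itx_list_sz + sizeof (*ztp) = 50K, we would
-	 * round 50K up to the next ZIL_MIN_BLKSZ multiple, then clamp
-	 * the result at ZIL_MAX_BLKSZ.
-	 */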
- zil_blksz = MAX(zilog->zl_prev_used,
- zilog->zl_cur_used + sizeof (*ztp));
- zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
- zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
- if (zil_blksz > ZIL_MAX_BLKSZ)
- zil_blksz = ZIL_MAX_BLKSZ;
-
- BP_ZERO(bp);
- /* pass the old blkptr in order to spread log blocks across devs */
- error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
- if (error) {
- dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);
-
- /*
- * We dirty the dataset to ensure that zil_sync() will
- * be called to remove this lwb from our zl_lwb_list.
-		 * Failing to do so may leave an lwb with a NULL lwb_buf
- * hanging around on the zl_lwb_list.
- */
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- dmu_tx_commit(tx);
-
- /*
-		 * We've just experienced an allocation failure, so we
-		 * terminate the current lwb and send it on its way.
- */
- ztp->zit_pad = 0;
- ztp->zit_nused = lwb->lwb_nused;
- ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
- zio_nowait(lwb->lwb_zio);
-
-		/*
-		 * Returning NULL makes the caller fall back to
-		 * txg_wait_synced().
-		 */
- return (NULL);
- }
-
- ASSERT3U(bp->blk_birth, ==, txg);
- ztp->zit_pad = 0;
- ztp->zit_nused = lwb->lwb_nused;
- ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
- bp->blk_cksum = lwb->lwb_blk.blk_cksum;
- bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
-
- /*
- * Allocate a new log write buffer (lwb).
- */
- nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
-
- nlwb->lwb_zilog = zilog;
- nlwb->lwb_blk = *bp;
- nlwb->lwb_nused = 0;
- nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
- nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
- nlwb->lwb_max_txg = txg;
- nlwb->lwb_zio = NULL;
-
- /*
- * Put new lwb at the end of the log chain
- */
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_lwb_list, nlwb);
- mutex_exit(&zilog->zl_lock);
-
- /* Record the vdev for later flushing */
- zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk))));
-
- /*
- * kick off the write for the old log block
- */
- dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
- ASSERT(lwb->lwb_zio);
- zio_nowait(lwb->lwb_zio);
-
- return (nlwb);
-}
-
-static lwb_t *
-zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
-{
- lr_t *lrc = &itx->itx_lr; /* common log record */
- lr_write_t *lr = (lr_write_t *)lrc;
- uint64_t txg = lrc->lrc_txg;
- uint64_t reclen = lrc->lrc_reclen;
- uint64_t dlen;
-
- if (lwb == NULL)
- return (NULL);
- ASSERT(lwb->lwb_buf != NULL);
-
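-	/*
-	 * For a WR_NEED_COPY write, the record is followed in the log
-	 * block by the write payload, padded to an 8-byte boundary;
-	 * for all other records no extra data is staged here.
-	 */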
- if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
- dlen = P2ROUNDUP_TYPED(
- lr->lr_length, sizeof (uint64_t), uint64_t);
- else
- dlen = 0;
-
- zilog->zl_cur_used += (reclen + dlen);
-
- zil_lwb_write_init(zilog, lwb);
-
- /*
- * If this record won't fit in the current log block, start a new one.
- */
- if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
- lwb = zil_lwb_write_start(zilog, lwb);
- if (lwb == NULL)
- return (NULL);
- zil_lwb_write_init(zilog, lwb);
- ASSERT(lwb->lwb_nused == 0);
- if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- return (lwb);
- }
- }
-
- /*
-	 * Update lrc_seq to be the log record sequence number (see zil.h),
-	 * then copy the record to the log buffer.
- */
- lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
- bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
-
- /*
- * If it's a write, fetch the data or get its blkptr as appropriate.
- */
- if (lrc->lrc_txtype == TX_WRITE) {
- if (txg > spa_freeze_txg(zilog->zl_spa))
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- if (itx->itx_wr_state != WR_COPIED) {
- char *dbuf;
- int error;
-
- /* alignment is guaranteed */
- lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused);
- if (dlen) {
- ASSERT(itx->itx_wr_state == WR_NEED_COPY);
- dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen;
- lr->lr_common.lrc_reclen += dlen;
- } else {
- ASSERT(itx->itx_wr_state == WR_INDIRECT);
- dbuf = NULL;
- }
- error = zilog->zl_get_data(
- itx->itx_private, lr, dbuf, lwb->lwb_zio);
- if (error) {
- ASSERT(error == ENOENT || error == EEXIST ||
- error == EALREADY);
- return (lwb);
- }
- }
- }
-
- lwb->lwb_nused += reclen + dlen;
- lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
- ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
- ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);
-
- return (lwb);
-}
-
-itx_t *
-zil_itx_create(int txtype, size_t lrsize)
-{
- itx_t *itx;
-
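-	/*
-	 * Round the record size up to a multiple of 8 bytes so that
-	 * records stay uint64_t-aligned in the log buffer; e.g. a
-	 * (hypothetical) 52-byte record becomes 56 bytes.
-	 */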
- lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
-
- itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
- itx->itx_lr.lrc_txtype = txtype;
- itx->itx_lr.lrc_reclen = lrsize;
- itx->itx_lr.lrc_seq = 0; /* defensive */
-
- return (itx);
-}
-
-uint64_t
-zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
-{
- uint64_t seq;
-
- ASSERT(itx->itx_lr.lrc_seq == 0);
-
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_itx_list, itx);
- zilog->zl_itx_list_sz += itx->itx_lr.lrc_reclen;
- itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
- itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
- mutex_exit(&zilog->zl_lock);
-
- return (seq);
-}
-
-/*
- * Free up all in-memory intent log transactions that have now been synced.
- */
-static void
-zil_itx_clean(zilog_t *zilog)
-{
- uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
- uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
- list_t clean_list;
- itx_t *itx;
-
- list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
-
- mutex_enter(&zilog->zl_lock);
- /* wait for a log writer to finish walking list */
- while (zilog->zl_writer) {
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- }
-
- /*
- * Move the sync'd log transactions to a separate list so we can call
- * kmem_free without holding the zl_lock.
- *
- * There is no need to set zl_writer as we don't drop zl_lock here
- */
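-	/*
-	 * An itx is clean once its txg has reached stable storage.
-	 * Taking the MIN with the freeze txg keeps records from
-	 * frozen-but-unsynced txgs in memory; a pool can be frozen
-	 * for testing, in which case those txgs never sync.
-	 */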
- while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
- itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
- list_remove(&zilog->zl_itx_list, itx);
- zilog->zl_itx_list_sz -= itx->itx_lr.lrc_reclen;
- list_insert_tail(&clean_list, itx);
- }
- cv_broadcast(&zilog->zl_cv_writer);
- mutex_exit(&zilog->zl_lock);
-
- /* destroy sync'd log transactions */
- while ((itx = list_head(&clean_list)) != NULL) {
- list_remove(&clean_list, itx);
- kmem_free(itx, offsetof(itx_t, itx_lr)
- + itx->itx_lr.lrc_reclen);
- }
- list_destroy(&clean_list);
-}
-
-/*
- * If there are any in-memory intent log transactions that have now been
- * synced, dispatch a task to free them.
- */
-void
-zil_clean(zilog_t *zilog)
-{
- itx_t *itx;
-
- mutex_enter(&zilog->zl_lock);
- itx = list_head(&zilog->zl_itx_list);
- if ((itx != NULL) &&
- (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
- (void) taskq_dispatch(zilog->zl_clean_taskq,
- (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
- }
- mutex_exit(&zilog->zl_lock);
-}
-
-void
-zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
-{
- uint64_t txg;
- uint64_t reclen;
- uint64_t commit_seq = 0;
- itx_t *itx, *itx_next = (itx_t *)-1;
- lwb_t *lwb;
- spa_t *spa;
-
- zilog->zl_writer = B_TRUE;
- zilog->zl_root_zio = NULL;
- spa = zilog->zl_spa;
-
- if (zilog->zl_suspend) {
- lwb = NULL;
- } else {
- lwb = list_tail(&zilog->zl_lwb_list);
- if (lwb == NULL) {
- /*
- * Return if there's nothing to flush before we
- * dirty the fs by calling zil_create()
- */
- if (list_is_empty(&zilog->zl_itx_list)) {
- zilog->zl_writer = B_FALSE;
- return;
- }
- mutex_exit(&zilog->zl_lock);
- zil_create(zilog);
- mutex_enter(&zilog->zl_lock);
- lwb = list_tail(&zilog->zl_lwb_list);
- }
- }
-
- /* Loop through in-memory log transactions filling log blocks. */
- DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
- for (;;) {
- /*
- * Find the next itx to push:
- * Push all transactions related to specified foid and all
- * other transactions except TX_WRITE, TX_TRUNCATE,
- * TX_SETATTR and TX_ACL for all other files.
- */
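-		/*
-		 * Illustrative example (foid value assumed): with
-		 * foid = 5, a TX_WRITE whose lr_foid is 9 is skipped,
-		 * while any itx with itx_sync set, and any record type
-		 * not listed in the switch below, is always pushed.
-		 */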
- if (itx_next != (itx_t *)-1)
- itx = itx_next;
- else
- itx = list_head(&zilog->zl_itx_list);
- for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) {
- if (foid == 0) /* push all foids? */
- break;
- if (itx->itx_sync) /* push all O_[D]SYNC */
- break;
- switch (itx->itx_lr.lrc_txtype) {
- case TX_SETATTR:
- case TX_WRITE:
- case TX_TRUNCATE:
- case TX_ACL:
- /* lr_foid is same offset for these records */
- if (((lr_write_t *)&itx->itx_lr)->lr_foid
- != foid) {
- continue; /* skip this record */
- }
- }
- break;
- }
- if (itx == NULL)
- break;
-
- reclen = itx->itx_lr.lrc_reclen;
- if ((itx->itx_lr.lrc_seq > seq) &&
- ((lwb == NULL) || (lwb->lwb_nused == 0) ||
- (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)))) {
- break;
- }
-
- /*
- * Save the next pointer. Even though we soon drop
-		 * zl_lock, all threads that may change the list
- * (another writer or zil_itx_clean) can't do so until
- * they have zl_writer.
- */
- itx_next = list_next(&zilog->zl_itx_list, itx);
- list_remove(&zilog->zl_itx_list, itx);
- mutex_exit(&zilog->zl_lock);
- txg = itx->itx_lr.lrc_txg;
- ASSERT(txg);
-
- if (txg > spa_last_synced_txg(spa) ||
- txg > spa_freeze_txg(spa))
- lwb = zil_lwb_commit(zilog, itx, lwb);
- kmem_free(itx, offsetof(itx_t, itx_lr)
- + itx->itx_lr.lrc_reclen);
- mutex_enter(&zilog->zl_lock);
- zilog->zl_itx_list_sz -= reclen;
- }
- DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
- /* determine commit sequence number */
- itx = list_head(&zilog->zl_itx_list);
- if (itx)
- commit_seq = itx->itx_lr.lrc_seq;
- else
- commit_seq = zilog->zl_itx_seq;
- mutex_exit(&zilog->zl_lock);
-
- /* write the last block out */
- if (lwb != NULL && lwb->lwb_zio != NULL)
- lwb = zil_lwb_write_start(zilog, lwb);
-
- zilog->zl_prev_used = zilog->zl_cur_used;
- zilog->zl_cur_used = 0;
-
- /*
- * Wait if necessary for the log blocks to be on stable storage.
- */
- if (zilog->zl_root_zio) {
- DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
- (void) zio_wait(zilog->zl_root_zio);
- DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
- if (!zfs_nocacheflush)
- zil_flush_vdevs(zilog);
- }
-
- if (zilog->zl_log_error || lwb == NULL) {
- zilog->zl_log_error = 0;
- txg_wait_synced(zilog->zl_dmu_pool, 0);
- }
-
- mutex_enter(&zilog->zl_lock);
- zilog->zl_writer = B_FALSE;
-
- ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
- zilog->zl_commit_seq = commit_seq;
-}
-
-/*
- * Push zfs transactions to stable storage up to the supplied sequence number.
- * If foid is 0, push out all transactions; otherwise push only those
- * for that file or those that might have been used to create that file.
- */
-void
-zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
-{
- if (zilog == NULL || seq == 0)
- return;
-
- mutex_enter(&zilog->zl_lock);
-
- seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */
-
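-	/*
-	 * If another thread is already writing, wait for it to finish;
-	 * if the writer that completes has already committed past our
-	 * sequence number, there is nothing left for us to do.
-	 */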
- while (zilog->zl_writer) {
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- if (seq < zilog->zl_commit_seq) {
- mutex_exit(&zilog->zl_lock);
- return;
- }
- }
- zil_commit_writer(zilog, seq, foid); /* drops zl_lock */
- /* wake up others waiting on the commit */
- cv_broadcast(&zilog->zl_cv_writer);
- mutex_exit(&zilog->zl_lock);
-}
-
-/*
- * Called in syncing context to free committed log blocks and update log header.
- */
-void
-zil_sync(zilog_t *zilog, dmu_tx_t *tx)
-{
- zil_header_t *zh = zil_header_in_syncing_context(zilog);
- uint64_t txg = dmu_tx_get_txg(tx);
- spa_t *spa = zilog->zl_spa;
- lwb_t *lwb;
-
- mutex_enter(&zilog->zl_lock);
-
- ASSERT(zilog->zl_stop_sync == 0);
-
- zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
-
- if (zilog->zl_destroy_txg == txg) {
- blkptr_t blk = zh->zh_log;
-
- ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
- ASSERT(spa_sync_pass(spa) == 1);
-
- bzero(zh, sizeof (zil_header_t));
- bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
-
- if (zilog->zl_keep_first) {
- /*
-			 * If this block was part of a log chain that couldn't
- * be claimed because a device was missing during
- * zil_claim(), but that device later returns,
- * then this block could erroneously appear valid.
- * To guard against this, assign a new GUID to the new
- * log chain so it doesn't matter what blk points to.
- */
- zil_init_log_chain(zilog, &blk);
- zh->zh_log = blk;
- }
- }
-
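-	/*
-	 * Free lwbs whose write has completed (lwb_buf == NULL) and
-	 * whose last txg has synced, advancing zh_log to the earliest
-	 * block still in use.
-	 */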
- for (;;) {
- lwb = list_head(&zilog->zl_lwb_list);
- if (lwb == NULL) {
- mutex_exit(&zilog->zl_lock);
- return;
- }
- zh->zh_log = lwb->lwb_blk;
- if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
- break;
- list_remove(&zilog->zl_lwb_list, lwb);
- zio_free_blk(spa, &lwb->lwb_blk, txg);
- kmem_cache_free(zil_lwb_cache, lwb);
-
- /*
- * If we don't have anything left in the lwb list then
- * we've had an allocation failure and we need to zero
- * out the zil_header blkptr so that we don't end
- * up freeing the same block twice.
- */
- if (list_head(&zilog->zl_lwb_list) == NULL)
- BP_ZERO(&zh->zh_log);
- }
- mutex_exit(&zilog->zl_lock);
-}
-
-void
-zil_init(void)
-{
- zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
- sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0);
-}
-
-void
-zil_fini(void)
-{
- kmem_cache_destroy(zil_lwb_cache);
-}
-
-zilog_t *
-zil_alloc(objset_t *os, zil_header_t *zh_phys)
-{
- zilog_t *zilog;
-
- zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
-
- zilog->zl_header = zh_phys;
- zilog->zl_os = os;
- zilog->zl_spa = dmu_objset_spa(os);
- zilog->zl_dmu_pool = dmu_objset_pool(os);
- zilog->zl_destroy_txg = TXG_INITIAL - 1;
-
- mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
- cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
-
- list_create(&zilog->zl_itx_list, sizeof (itx_t),
- offsetof(itx_t, itx_node));
-
- list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
- offsetof(lwb_t, lwb_node));
-
- list_create(&zilog->zl_vdev_list, sizeof (zil_vdev_t),
- offsetof(zil_vdev_t, vdev_seq_node));
-
- return (zilog);
-}
-
-void
-zil_free(zilog_t *zilog)
-{
- lwb_t *lwb;
- zil_vdev_t *zv;
-
- zilog->zl_stop_sync = 1;
-
- while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
- list_remove(&zilog->zl_lwb_list, lwb);
- if (lwb->lwb_buf != NULL)
- zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
- kmem_cache_free(zil_lwb_cache, lwb);
- }
- list_destroy(&zilog->zl_lwb_list);
-
- while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) {
- list_remove(&zilog->zl_vdev_list, zv);
- kmem_free(zv, sizeof (zil_vdev_t));
- }
- list_destroy(&zilog->zl_vdev_list);
-
- ASSERT(list_head(&zilog->zl_itx_list) == NULL);
- list_destroy(&zilog->zl_itx_list);
- cv_destroy(&zilog->zl_cv_suspend);
- cv_destroy(&zilog->zl_cv_writer);
- mutex_destroy(&zilog->zl_lock);
-
- kmem_free(zilog, sizeof (zilog_t));
-}
-
-/*
- * return true if the initial log block is not valid
- */
-static int
-zil_empty(zilog_t *zilog)
-{
- const zil_header_t *zh = zilog->zl_header;
- arc_buf_t *abuf = NULL;
-
- if (BP_IS_HOLE(&zh->zh_log))
- return (1);
-
- if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
- return (1);
-
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
- return (0);
-}
-
-/*
- * Open an intent log.
- */
-zilog_t *
-zil_open(objset_t *os, zil_get_data_t *get_data)
-{
- zilog_t *zilog = dmu_objset_zil(os);
-
- zilog->zl_get_data = get_data;
- zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
- 2, 2, TASKQ_PREPOPULATE);
-
- return (zilog);
-}
-
-/*
- * Close an intent log.
- */
-void
-zil_close(zilog_t *zilog)
-{
- /*
- * If the log isn't already committed, mark the objset dirty
- * (so zil_sync() will be called) and wait for that txg to sync.
- */
- if (!zil_is_committed(zilog)) {
- uint64_t txg;
- dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- txg = dmu_tx_get_txg(tx);
- dmu_tx_commit(tx);
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- }
-
- taskq_destroy(zilog->zl_clean_taskq);
- zilog->zl_clean_taskq = NULL;
- zilog->zl_get_data = NULL;
-
- zil_itx_clean(zilog);
- ASSERT(list_head(&zilog->zl_itx_list) == NULL);
-}
-
-/*
- * Suspend an intent log. While in suspended mode, we still honor
- * synchronous semantics, but we rely on txg_wait_synced() to do it.
- * We suspend the log briefly when taking a snapshot so that the snapshot
- * contains all the data it's supposed to, and has an empty intent log.
- */
-int
-zil_suspend(zilog_t *zilog)
-{
- const zil_header_t *zh = zilog->zl_header;
-
- mutex_enter(&zilog->zl_lock);
- if (zh->zh_claim_txg != 0) { /* unplayed log */
- mutex_exit(&zilog->zl_lock);
- return (EBUSY);
- }
- if (zilog->zl_suspend++ != 0) {
- /*
- * Someone else already began a suspend.
- * Just wait for them to finish.
- */
- while (zilog->zl_suspending)
- cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
- ASSERT(BP_IS_HOLE(&zh->zh_log));
- mutex_exit(&zilog->zl_lock);
- return (0);
- }
- zilog->zl_suspending = B_TRUE;
- mutex_exit(&zilog->zl_lock);
-
- zil_commit(zilog, UINT64_MAX, 0);
-
- /*
- * Wait for any in-flight log writes to complete.
- */
- mutex_enter(&zilog->zl_lock);
- while (zilog->zl_writer)
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- mutex_exit(&zilog->zl_lock);
-
- zil_destroy(zilog, B_FALSE);
-
- mutex_enter(&zilog->zl_lock);
- ASSERT(BP_IS_HOLE(&zh->zh_log));
- zilog->zl_suspending = B_FALSE;
- cv_broadcast(&zilog->zl_cv_suspend);
- mutex_exit(&zilog->zl_lock);
-
- return (0);
-}
-
-void
-zil_resume(zilog_t *zilog)
-{
- mutex_enter(&zilog->zl_lock);
- ASSERT(zilog->zl_suspend != 0);
- zilog->zl_suspend--;
- mutex_exit(&zilog->zl_lock);
-}
-
-typedef struct zil_replay_arg {
- objset_t *zr_os;
- zil_replay_func_t **zr_replay;
- void *zr_arg;
- uint64_t *zr_txgp;
- boolean_t zr_byteswap;
- char *zr_lrbuf;
-} zil_replay_arg_t;
-
-static void
-zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
-{
- zil_replay_arg_t *zr = zra;
- const zil_header_t *zh = zilog->zl_header;
- uint64_t reclen = lr->lrc_reclen;
- uint64_t txtype = lr->lrc_txtype;
- char *name;
- int pass, error, sunk;
-
- if (zilog->zl_stop_replay)
- return;
-
- if (lr->lrc_txg < claim_txg) /* already committed */
- return;
-
- if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
- return;
-
- /*
- * Make a copy of the data so we can revise and extend it.
- */
- bcopy(lr, zr->zr_lrbuf, reclen);
-
- /*
- * The log block containing this lr may have been byteswapped
- * so that we can easily examine common fields like lrc_txtype.
- * However, the log is a mix of different data types, and only the
- * replay vectors know how to byteswap their records. Therefore, if
- * the lr was byteswapped, undo it before invoking the replay vector.
- */
- if (zr->zr_byteswap)
- byteswap_uint64_array(zr->zr_lrbuf, reclen);
-
- /*
- * If this is a TX_WRITE with a blkptr, suck in the data.
- */
- if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
- lr_write_t *lrw = (lr_write_t *)lr;
- blkptr_t *wbp = &lrw->lr_blkptr;
- uint64_t wlen = lrw->lr_length;
- char *wbuf = zr->zr_lrbuf + reclen;
-
- if (BP_IS_HOLE(wbp)) { /* compressed to a hole */
- bzero(wbuf, wlen);
- } else {
- /*
- * A subsequent write may have overwritten this block,
-			 * in which case wbp may have been freed and
- * reallocated, and our read of wbp may fail with a
- * checksum error. We can safely ignore this because
- * the later write will provide the correct data.
- */
- zbookmark_t zb;
-
- zb.zb_objset = dmu_objset_id(zilog->zl_os);
- zb.zb_object = lrw->lr_foid;
- zb.zb_level = -1;
- zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp);
-
- (void) zio_wait(zio_read(NULL, zilog->zl_spa,
- wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
- ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
- (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
- }
- }
-
- /*
- * We must now do two things atomically: replay this log record,
- * and update the log header to reflect the fact that we did so.
- * We use the DMU's ability to assign into a specific txg to do this.
- */
- for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) {
- uint64_t replay_txg;
- dmu_tx_t *replay_tx;
-
- replay_tx = dmu_tx_create(zr->zr_os);
- error = dmu_tx_assign(replay_tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(replay_tx);
- break;
- }
-
- replay_txg = dmu_tx_get_txg(replay_tx);
-
- if (txtype == 0 || txtype >= TX_MAX_TYPE) {
- error = EINVAL;
- } else {
- /*
- * On the first pass, arrange for the replay vector
- * to fail its dmu_tx_assign(). That's the only way
- * to ensure that those code paths remain well tested.
- */
- *zr->zr_txgp = replay_txg - (pass == 1);
- error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
- zr->zr_byteswap);
- *zr->zr_txgp = TXG_NOWAIT;
- }
-
- if (error == 0) {
- dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
- zilog->zl_replay_seq[replay_txg & TXG_MASK] =
- lr->lrc_seq;
- }
-
- dmu_tx_commit(replay_tx);
-
- if (!error)
- return;
-
- /*
- * The DMU's dnode layer doesn't see removes until the txg
- * commits, so a subsequent claim can spuriously fail with
-		 * EEXIST. So if we receive any error other than ERESTART,
-		 * we first sync out any removes and then retry the
-		 * transaction.
- */
- if (error != ERESTART && !sunk) {
- txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
- sunk = B_TRUE;
- continue; /* retry */
- }
-
- if (error != ERESTART)
- break;
-
- if (pass != 1)
- txg_wait_open(spa_get_dsl(zilog->zl_spa),
- replay_txg + 1);
-
- dprintf("pass %d, retrying\n", pass);
- }
-
- ASSERT(error && error != ERESTART);
- name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
- dmu_objset_name(zr->zr_os, name);
- cmn_err(CE_WARN, "ZFS replay transaction error %d, "
- "dataset %s, seq 0x%llx, txtype %llu\n",
- error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype);
- zilog->zl_stop_replay = 1;
- kmem_free(name, MAXNAMELEN);
-}
-
-/* ARGSUSED */
-static void
-zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
-{
- zilog->zl_replay_blks++;
-}
-
-/*
- * If this dataset has a non-empty intent log, replay it and destroy it.
- */
-void
-zil_replay(objset_t *os, void *arg, uint64_t *txgp,
- zil_replay_func_t *replay_func[TX_MAX_TYPE])
-{
- zilog_t *zilog = dmu_objset_zil(os);
- const zil_header_t *zh = zilog->zl_header;
- zil_replay_arg_t zr;
-
- if (zil_empty(zilog)) {
- zil_destroy(zilog, B_TRUE);
- return;
- }
- //printf("ZFS: Replaying ZIL on %s...\n", os->os->os_spa->spa_name);
-
- zr.zr_os = os;
- zr.zr_replay = replay_func;
- zr.zr_arg = arg;
- zr.zr_txgp = txgp;
- zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
- zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
-
- /*
- * Wait for in-progress removes to sync before starting replay.
- */
- txg_wait_synced(zilog->zl_dmu_pool, 0);
-
- zilog->zl_stop_replay = 0;
- zilog->zl_replay_time = LBOLT;
- ASSERT(zilog->zl_replay_blks == 0);
- (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
- zh->zh_claim_txg);
- kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
-
- zil_destroy(zilog, B_FALSE);
- //printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name);
-}
-
-/*
- * Report whether all transactions are committed
- */
-int
-zil_is_committed(zilog_t *zilog)
-{
- lwb_t *lwb;
- int ret;
-
- mutex_enter(&zilog->zl_lock);
- while (zilog->zl_writer)
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
-
- /* recent unpushed intent log transactions? */
- if (!list_is_empty(&zilog->zl_itx_list)) {
- ret = B_FALSE;
- goto out;
- }
-
- /* intent log never used? */
- lwb = list_head(&zilog->zl_lwb_list);
- if (lwb == NULL) {
- ret = B_TRUE;
- goto out;
- }
-
- /*
- * more than 1 log buffer means zil_sync() hasn't yet freed
- * entries after a txg has committed
- */
- if (list_next(&zilog->zl_lwb_list, lwb)) {
- ret = B_FALSE;
- goto out;
- }
-
- ASSERT(zil_empty(zilog));
- ret = B_TRUE;
-out:
- cv_broadcast(&zilog->zl_cv_writer);
- mutex_exit(&zilog->zl_lock);
- return (ret);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio.c
deleted file mode 100644
index b5dd35f..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ /dev/null
@@ -1,1861 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/fm/fs/zfs.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio_impl.h>
-#include <sys/zio_compress.h>
-#include <sys/zio_checksum.h>
-
-/*
- * ==========================================================================
- * I/O priority table
- * ==========================================================================
- */
-uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
- 0, /* ZIO_PRIORITY_NOW */
- 0, /* ZIO_PRIORITY_SYNC_READ */
- 0, /* ZIO_PRIORITY_SYNC_WRITE */
- 6, /* ZIO_PRIORITY_ASYNC_READ */
- 4, /* ZIO_PRIORITY_ASYNC_WRITE */
- 4, /* ZIO_PRIORITY_FREE */
- 0, /* ZIO_PRIORITY_CACHE_FILL */
- 0, /* ZIO_PRIORITY_LOG_WRITE */
- 10, /* ZIO_PRIORITY_RESILVER */
- 20, /* ZIO_PRIORITY_SCRUB */
-};
-
-/*
- * ==========================================================================
- * I/O type descriptions
- * ==========================================================================
- */
-char *zio_type_name[ZIO_TYPES] = {
- "null", "read", "write", "free", "claim", "ioctl" };
-
-/* At or above this size, force gang blocking - for testing */
-uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
-
-/* Force an allocation failure when non-zero */
-uint16_t zio_zil_fail_shift = 0;
-
-typedef struct zio_sync_pass {
- int zp_defer_free; /* defer frees after this pass */
- int zp_dontcompress; /* don't compress after this pass */
- int zp_rewrite; /* rewrite new bps after this pass */
-} zio_sync_pass_t;
-
-zio_sync_pass_t zio_sync_pass = {
- 1, /* zp_defer_free */
- 4, /* zp_dontcompress */
- 1, /* zp_rewrite */
-};
-
-/*
- * ==========================================================================
- * I/O kmem caches
- * ==========================================================================
- */
-kmem_cache_t *zio_cache;
-#ifdef ZIO_USE_UMA
-kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
-kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
-#endif
-
-#ifdef _KERNEL
-extern vmem_t *zio_alloc_arena;
-#endif
-
-void
-zio_init(void)
-{
-#ifdef ZIO_USE_UMA
- size_t c;
-#endif
-#if 0
- vmem_t *data_alloc_arena = NULL;
-
-#ifdef _KERNEL
- data_alloc_arena = zio_alloc_arena;
-#endif
-#endif
-
- zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
- NULL, NULL, NULL, NULL, NULL, 0);
-
-#ifdef ZIO_USE_UMA
- /*
- * For small buffers, we want a cache for each multiple of
- * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
- * for each quarter-power of 2. For large buffers, we want
- * a cache for each multiple of PAGESIZE.
- */
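-	/*
-	 * Illustrative alignments, assuming SPA_MINBLOCKSIZE = 512
-	 * and PAGESIZE = 4K (both assumptions): a 1.5K buffer gets
-	 * 512-byte alignment, an 8K buffer gets page alignment, and
-	 * a 5K buffer gets p2 >> 2 = 1K alignment.
-	 */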
- for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
- size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
- size_t p2 = size;
- size_t align = 0;
-
- while (p2 & (p2 - 1))
- p2 &= p2 - 1;
-
- if (size <= 4 * SPA_MINBLOCKSIZE) {
- align = SPA_MINBLOCKSIZE;
- } else if (P2PHASE(size, PAGESIZE) == 0) {
- align = PAGESIZE;
- } else if (P2PHASE(size, p2 >> 2) == 0) {
- align = p2 >> 2;
- }
-
- if (align != 0) {
- char name[36];
- (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
- zio_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
-
- (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
- zio_data_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, data_alloc_arena,
- KMC_NODEBUG);
-
- dprintf("creating cache for size %5lx align %5lx\n",
- size, align);
- }
- }
-
- while (--c != 0) {
- ASSERT(zio_buf_cache[c] != NULL);
- if (zio_buf_cache[c - 1] == NULL)
- zio_buf_cache[c - 1] = zio_buf_cache[c];
-
- ASSERT(zio_data_buf_cache[c] != NULL);
- if (zio_data_buf_cache[c - 1] == NULL)
- zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
- }
-#endif
-
- zio_inject_init();
-}
-
-void
-zio_fini(void)
-{
-#ifdef ZIO_USE_UMA
- size_t c;
- kmem_cache_t *last_cache = NULL;
- kmem_cache_t *last_data_cache = NULL;
-
- for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
- if (zio_buf_cache[c] != last_cache) {
- last_cache = zio_buf_cache[c];
- kmem_cache_destroy(zio_buf_cache[c]);
- }
- zio_buf_cache[c] = NULL;
-
- if (zio_data_buf_cache[c] != last_data_cache) {
- last_data_cache = zio_data_buf_cache[c];
- kmem_cache_destroy(zio_data_buf_cache[c]);
- }
- zio_data_buf_cache[c] = NULL;
- }
-#endif
-
- kmem_cache_destroy(zio_cache);
-
- zio_inject_fini();
-}
-
-/*
- * ==========================================================================
- * Allocate and free I/O buffers
- * ==========================================================================
- */
-
-/*
- * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
- * crashdump if the kernel panics, so use it judiciously. Obviously, it's
- * useful to inspect ZFS metadata, but if possible, we should avoid keeping
- * excess / transient data in-core during a crashdump.
- */
-void *
-zio_buf_alloc(size_t size)
-{
-#ifdef ZIO_USE_UMA
- size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
-
- ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
-
- return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
-#else
- return (kmem_alloc(size, KM_SLEEP));
-#endif
-}
-
-/*
- * Use zio_data_buf_alloc to allocate data. The data will not appear in a
- * crashdump if the kernel panics. This exists to limit the amount of
- * ZFS data that shows up in a kernel crashdump, thus reducing the
- * amount of kernel heap dumped to disk when the kernel panics.
- */
-void *
-zio_data_buf_alloc(size_t size)
-{
-#ifdef ZIO_USE_UMA
- size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
-
- ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
-
- return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP));
-#else
- return (kmem_alloc(size, KM_SLEEP));
-#endif
-}
-
-void
-zio_buf_free(void *buf, size_t size)
-{
-#ifdef ZIO_USE_UMA
- size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
-
- ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
-
- kmem_cache_free(zio_buf_cache[c], buf);
-#else
- kmem_free(buf, size);
-#endif
-}
-
-void
-zio_data_buf_free(void *buf, size_t size)
-{
-#ifdef ZIO_USE_UMA
- size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
-
- ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
-
- kmem_cache_free(zio_data_buf_cache[c], buf);
-#else
- kmem_free(buf, size);
-#endif
-}
-
-/*
- * ==========================================================================
- * Push and pop I/O transform buffers
- * ==========================================================================
- */
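-
-/*
- * The transform stack is a LIFO list of (buffer, size) frames.  Each
- * push makes its buffer the zio's current io_data; each pop restores
- * the previous frame.  For example, reading a compressed block pushes
- * a buffer sized for the physical data, and the decompress stage pops
- * it to recover the logical data into the original buffer.
- */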
-static void
-zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
-{
- zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
-
- zt->zt_data = data;
- zt->zt_size = size;
- zt->zt_bufsize = bufsize;
-
- zt->zt_next = zio->io_transform_stack;
- zio->io_transform_stack = zt;
-
- zio->io_data = data;
- zio->io_size = size;
-}
-
-static void
-zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
-{
- zio_transform_t *zt = zio->io_transform_stack;
-
- *data = zt->zt_data;
- *size = zt->zt_size;
- *bufsize = zt->zt_bufsize;
-
- zio->io_transform_stack = zt->zt_next;
- kmem_free(zt, sizeof (zio_transform_t));
-
- if ((zt = zio->io_transform_stack) != NULL) {
- zio->io_data = zt->zt_data;
- zio->io_size = zt->zt_size;
- }
-}
-
-static void
-zio_clear_transform_stack(zio_t *zio)
-{
- void *data;
- uint64_t size, bufsize;
-
- ASSERT(zio->io_transform_stack != NULL);
-
- zio_pop_transform(zio, &data, &size, &bufsize);
- while (zio->io_transform_stack != NULL) {
- zio_buf_free(data, bufsize);
- zio_pop_transform(zio, &data, &size, &bufsize);
- }
-}
-
-/*
- * ==========================================================================
- * Create the various types of I/O (read, write, free)
- * ==========================================================================
- */
-static zio_t *
-zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, zio_done_func_t *done, void *private,
- zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
-{
- zio_t *zio;
-
- ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
- ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
-
- zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
- bzero(zio, sizeof (zio_t));
- zio->io_parent = pio;
- zio->io_spa = spa;
- zio->io_txg = txg;
- if (bp != NULL) {
- zio->io_bp = bp;
- zio->io_bp_copy = *bp;
- zio->io_bp_orig = *bp;
- }
- zio->io_done = done;
- zio->io_private = private;
- zio->io_type = type;
- zio->io_priority = priority;
- zio->io_stage = stage;
- zio->io_pipeline = pipeline;
- zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
- zio->io_timestamp = lbolt64;
- zio->io_flags = flags;
- mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
- zio_push_transform(zio, data, size, size);
-
- /*
- * Note on config lock:
- *
- * If CONFIG_HELD is set, then the caller already has the config
- * lock, so we don't need it for this io.
- *
- * We set CONFIG_GRABBED to indicate that we have grabbed the
- * config lock on behalf of this io, so it should be released
- * in zio_done.
- *
- * Unless CONFIG_HELD is set, we will grab the config lock for
- * any top-level (parent-less) io, *except* NULL top-level ios.
- * The NULL top-level ios rarely have any children, so we delay
- * grabbing the lock until the first child is added (but it is
- * still grabbed on behalf of the top-level i/o, so additional
- * children don't need to also grab it). This greatly reduces
- * contention on the config lock.
- */
- if (pio == NULL) {
- if (type != ZIO_TYPE_NULL &&
- !(flags & ZIO_FLAG_CONFIG_HELD)) {
- spa_config_enter(zio->io_spa, RW_READER, zio);
- zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
- }
- zio->io_root = zio;
- } else {
- zio->io_root = pio->io_root;
- if (!(flags & ZIO_FLAG_NOBOOKMARK))
- zio->io_logical = pio->io_logical;
- mutex_enter(&pio->io_lock);
- if (pio->io_parent == NULL &&
- pio->io_type == ZIO_TYPE_NULL &&
- !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
- !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
- pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
- spa_config_enter(zio->io_spa, RW_READER, pio);
- }
- if (stage < ZIO_STAGE_READY)
- pio->io_children_notready++;
- pio->io_children_notdone++;
- zio->io_sibling_next = pio->io_child;
- zio->io_sibling_prev = NULL;
- if (pio->io_child != NULL)
- pio->io_child->io_sibling_prev = zio;
- pio->io_child = zio;
- zio->io_ndvas = pio->io_ndvas;
- mutex_exit(&pio->io_lock);
- }
-
- return (zio);
-}
-
-zio_t *
-zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
- int flags)
-{
- zio_t *zio;
-
- zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
- ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
- ZIO_WAIT_FOR_CHILDREN_PIPELINE);
-
- return (zio);
-}
-
-zio_t *
-zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
-{
- return (zio_null(NULL, spa, done, private, flags));
-}
-
-zio_t *
-zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
- uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags, zbookmark_t *zb)
-{
- zio_t *zio;
-
- ASSERT3U(size, ==, BP_GET_LSIZE(bp));
-
- zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
- ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
- ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
- zio->io_bookmark = *zb;
-
- zio->io_logical = zio;
-
- /*
- * Work off our copy of the bp so the caller can free it.
- */
- zio->io_bp = &zio->io_bp_copy;
-
- if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
- uint64_t csize = BP_GET_PSIZE(bp);
- void *cbuf = zio_buf_alloc(csize);
-
- zio_push_transform(zio, cbuf, csize, csize);
- zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
- }
-
- if (BP_IS_GANG(bp)) {
- uint64_t gsize = SPA_GANGBLOCKSIZE;
- void *gbuf = zio_buf_alloc(gsize);
-
- zio_push_transform(zio, gbuf, gsize, gsize);
- zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
- }
-
- return (zio);
-}
-
-zio_t *
-zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
- uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
- int flags, zbookmark_t *zb)
-{
- zio_t *zio;
-
- ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
- checksum < ZIO_CHECKSUM_FUNCTIONS);
-
- ASSERT(compress >= ZIO_COMPRESS_OFF &&
- compress < ZIO_COMPRESS_FUNCTIONS);
-
- zio = zio_create(pio, spa, txg, bp, data, size, done, private,
- ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
- ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
-
- zio->io_ready = ready;
-
- zio->io_bookmark = *zb;
-
- zio->io_logical = zio;
-
- zio->io_checksum = checksum;
- zio->io_compress = compress;
- zio->io_ndvas = ncopies;
-
- if (compress != ZIO_COMPRESS_OFF)
- zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
-
- if (bp->blk_birth != txg) {
- /* XXX the bp usually (always?) gets re-zeroed later */
- BP_ZERO(bp);
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
- } else {
- /* Make sure someone doesn't change their mind on overwrites */
- ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
- spa_max_replication(spa)) == BP_GET_NDVAS(bp));
- }
-
- return (zio);
-}
-
-zio_t *
-zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
- uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags,
- zbookmark_t *zb)
-{
- zio_t *zio;
-
- zio = zio_create(pio, spa, txg, bp, data, size, done, private,
- ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
- ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
-
- zio->io_bookmark = *zb;
- zio->io_checksum = checksum;
- zio->io_compress = ZIO_COMPRESS_OFF;
-
- if (pio != NULL)
- ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
-
- return (zio);
-}
-
-static zio_t *
-zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
- uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags)
-{
- zio_t *zio;
-
- BP_ZERO(bp);
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
-
- zio = zio_create(pio, spa, txg, bp, data, size, done, private,
- ZIO_TYPE_WRITE, priority, flags,
- ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
-
- zio->io_checksum = checksum;
- zio->io_compress = ZIO_COMPRESS_OFF;
-
- return (zio);
-}
-
-zio_t *
-zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private)
-{
- zio_t *zio;
-
- ASSERT(!BP_IS_HOLE(bp));
-
- if (txg == spa->spa_syncing_txg &&
- spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
- bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
- return (zio_null(pio, spa, NULL, NULL, 0));
- }
-
- zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
- ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
- ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
-
- zio->io_bp = &zio->io_bp_copy;
-
- return (zio);
-}
-
-zio_t *
-zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private)
-{
- zio_t *zio;
-
- /*
- * A claim is an allocation of a specific block. Claims are needed
- * to support immediate writes in the intent log. The issue is that
- * immediate writes contain committed data, but in a txg that was
- * *not* committed. Upon opening the pool after an unclean shutdown,
- * the intent log claims all blocks that contain immediate write data
- * so that the SPA knows they're in use.
- *
- * All claims *must* be resolved in the first txg -- before the SPA
- * starts allocating blocks -- so that nothing is allocated twice.
- */
- ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
- ASSERT3U(spa_first_txg(spa), <=, txg);
-
- zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
- ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
- ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
-
- zio->io_bp = &zio->io_bp_copy;
-
- return (zio);
-}
-
-zio_t *
-zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *private, int priority, int flags)
-{
- zio_t *zio;
- int c;
-
- if (vd->vdev_children == 0) {
- zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
- ZIO_TYPE_IOCTL, priority, flags,
- ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
-
- zio->io_vd = vd;
- zio->io_cmd = cmd;
- } else {
- zio = zio_null(pio, spa, NULL, NULL, flags);
-
- for (c = 0; c < vd->vdev_children; c++)
- zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
- done, private, priority, flags));
- }
-
- return (zio);
-}
-
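-/*
- * Build a temporary blkptr describing a raw device region (the
- * asserts below restrict it to the vdev label areas) so that
- * physical reads and writes can flow through the normal checksum
- * pipeline.
- */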
-static void
-zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
- int checksum)
-{
- ASSERT(vd->vdev_children == 0);
-
- ASSERT(size <= SPA_MAXBLOCKSIZE);
- ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
- ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
-
- ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
- offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
- ASSERT3U(offset + size, <=, vd->vdev_psize);
-
- BP_ZERO(bp);
-
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
-
- BP_SET_CHECKSUM(bp, checksum);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- if (checksum != ZIO_CHECKSUM_OFF)
- ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
-}
-
-zio_t *
-zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
- void *data, int checksum, zio_done_func_t *done, void *private,
- int priority, int flags)
-{
- zio_t *zio;
- blkptr_t blk;
-
- zio_phys_bp_init(vd, &blk, offset, size, checksum);
-
- zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
- ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
- ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
-
- zio->io_vd = vd;
- zio->io_offset = offset;
-
- /*
- * Work off our copy of the bp so the caller can free it.
- */
- zio->io_bp = &zio->io_bp_copy;
-
- return (zio);
-}
-
-zio_t *
-zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
- void *data, int checksum, zio_done_func_t *done, void *private,
- int priority, int flags)
-{
- zio_block_tail_t *zbt;
- void *wbuf;
- zio_t *zio;
- blkptr_t blk;
-
- zio_phys_bp_init(vd, &blk, offset, size, checksum);
-
- zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
- ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
- ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
-
- zio->io_vd = vd;
- zio->io_offset = offset;
-
- zio->io_bp = &zio->io_bp_copy;
- zio->io_checksum = checksum;
-
- if (zio_checksum_table[checksum].ci_zbt) {
- /*
- * zbt checksums are necessarily destructive -- they modify
- * one word of the write buffer to hold the verifier/checksum.
- * Therefore, we must make a local copy in case the data is
- * being written to multiple places.
- */
- wbuf = zio_buf_alloc(size);
- bcopy(data, wbuf, size);
- zio_push_transform(zio, wbuf, size, size);
-
- zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
- zbt->zbt_cksum = blk.blk_cksum;
- }
-
- return (zio);
-}
-
-/*
- * Create a child I/O to do some work for us. It has no associated bp.
- */
-zio_t *
-zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
- void *data, uint64_t size, int type, int priority, int flags,
- zio_done_func_t *done, void *private)
-{
- uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
- zio_t *cio;
-
- if (type == ZIO_TYPE_READ && bp != NULL) {
- /*
- * If we have the bp, then the child should perform the
- * checksum and the parent need not. This pushes error
- * detection as close to the leaves as possible and
- * eliminates redundant checksums in the interior nodes.
- */
- pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
- zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
- }
-
- cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
- done, private, type, priority,
- (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
- ZIO_STAGE_VDEV_IO_START - 1, pipeline);
-
- cio->io_vd = vd;
- cio->io_offset = offset;
-
- return (cio);
-}
-
-/*
- * ==========================================================================
- * Initiate I/O, either sync or async
- * ==========================================================================
- */
-int
-zio_wait(zio_t *zio)
-{
- int error;
-
- ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
-
- zio->io_waiter = curthread;
-
- zio_next_stage_async(zio);
-
- mutex_enter(&zio->io_lock);
- while (zio->io_stalled != ZIO_STAGE_DONE)
- cv_wait(&zio->io_cv, &zio->io_lock);
- mutex_exit(&zio->io_lock);
-
- error = zio->io_error;
- cv_destroy(&zio->io_cv);
- mutex_destroy(&zio->io_lock);
- kmem_cache_free(zio_cache, zio);
-
- return (error);
-}
-
-void
-zio_nowait(zio_t *zio)
-{
- zio_next_stage_async(zio);
-}
-
-/*
- * ==========================================================================
- * I/O pipeline interlocks: parent/child dependency scoreboarding
- * ==========================================================================
- */
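-
-/*
- * Each parent zio counts children that are not yet ready or not yet
- * done.  A stage that must wait records itself in io_stalled; the
- * last child to decrement the relevant counter restarts the parent's
- * pipeline.
- */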
-static void
-zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
-{
- mutex_enter(&zio->io_lock);
- if (*countp == 0) {
- ASSERT(zio->io_stalled == 0);
- mutex_exit(&zio->io_lock);
- zio_next_stage(zio);
- } else {
- zio->io_stalled = stage;
- mutex_exit(&zio->io_lock);
- }
-}
-
-static void
-zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
-{
- zio_t *pio = zio->io_parent;
-
- mutex_enter(&pio->io_lock);
- if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
- pio->io_error = zio->io_error;
- if (--*countp == 0 && pio->io_stalled == stage) {
- pio->io_stalled = 0;
- mutex_exit(&pio->io_lock);
- zio_next_stage_async(pio);
- } else {
- mutex_exit(&pio->io_lock);
- }
-}
-
-static void
-zio_wait_children_ready(zio_t *zio)
-{
- zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
- &zio->io_children_notready);
-}
-
-void
-zio_wait_children_done(zio_t *zio)
-{
- zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
- &zio->io_children_notdone);
-}
-
-static void
-zio_ready(zio_t *zio)
-{
- zio_t *pio = zio->io_parent;
-
- if (zio->io_ready)
- zio->io_ready(zio);
-
- if (pio != NULL)
- zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
- &pio->io_children_notready);
-
- if (zio->io_bp)
- zio->io_bp_copy = *zio->io_bp;
-
- zio_next_stage(zio);
-}
-
-static void
-zio_done(zio_t *zio)
-{
- zio_t *pio = zio->io_parent;
- spa_t *spa = zio->io_spa;
- blkptr_t *bp = zio->io_bp;
- vdev_t *vd = zio->io_vd;
-
- ASSERT(zio->io_children_notready == 0);
- ASSERT(zio->io_children_notdone == 0);
-
- if (bp != NULL) {
- ASSERT(bp->blk_pad[0] == 0);
- ASSERT(bp->blk_pad[1] == 0);
- ASSERT(bp->blk_pad[2] == 0);
- ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
- if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
- !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
- ASSERT(!BP_SHOULD_BYTESWAP(bp));
- if (zio->io_ndvas != 0)
- ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
- ASSERT(BP_COUNT_GANG(bp) == 0 ||
- (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
- }
- }
-
- if (vd != NULL)
- vdev_stat_update(zio);
-
- if (zio->io_error) {
- /*
- * If this I/O is attached to a particular vdev,
- * generate an error message describing the I/O failure
- * at the block level. We ignore these errors if the
- * device is currently unavailable.
- */
- if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
- zfs_ereport_post(FM_EREPORT_ZFS_IO,
- zio->io_spa, vd, zio, 0, 0);
-
- if ((zio->io_error == EIO ||
- !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
- zio->io_logical == zio) {
- /*
- * For root I/O requests, tell the SPA to log the error
- * appropriately. Also, generate a logical data
- * ereport.
- */
- spa_log_error(zio->io_spa, zio);
-
- zfs_ereport_post(FM_EREPORT_ZFS_DATA,
- zio->io_spa, NULL, zio, 0, 0);
- }
-
- /*
- * For I/O requests that cannot fail, panic appropriately.
- */
- if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
- char *blkbuf;
-
- blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
- if (blkbuf) {
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
- bp ? bp : &zio->io_bp_copy);
- }
- panic("ZFS: %s (%s on %s off %llx: zio %p %s): error "
- "%d", zio->io_error == ECKSUM ?
- "bad checksum" : "I/O failure",
- zio_type_name[zio->io_type],
- vdev_description(vd),
- (u_longlong_t)zio->io_offset,
- zio, blkbuf ? blkbuf : "", zio->io_error);
- }
- }
- zio_clear_transform_stack(zio);
-
- if (zio->io_done)
- zio->io_done(zio);
-
- ASSERT(zio->io_delegate_list == NULL);
- ASSERT(zio->io_delegate_next == NULL);
-
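-	/*
-	 * Unlink this zio from its parent's doubly-linked sibling list
-	 * and notify the parent that one more child is done.
-	 */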
- if (pio != NULL) {
- zio_t *next, *prev;
-
- mutex_enter(&pio->io_lock);
- next = zio->io_sibling_next;
- prev = zio->io_sibling_prev;
- if (next != NULL)
- next->io_sibling_prev = prev;
- if (prev != NULL)
- prev->io_sibling_next = next;
- if (pio->io_child == zio)
- pio->io_child = next;
- mutex_exit(&pio->io_lock);
-
- zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
- &pio->io_children_notdone);
- }
-
- /*
- * Note: this I/O is now done, and will shortly be freed, so there is no
- * need to clear this (or any other) flag.
- */
- if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)
- spa_config_exit(spa, zio);
-
- if (zio->io_waiter != NULL) {
- mutex_enter(&zio->io_lock);
- ASSERT(zio->io_stage == ZIO_STAGE_DONE);
- zio->io_stalled = zio->io_stage;
- cv_broadcast(&zio->io_cv);
- mutex_exit(&zio->io_lock);
- } else {
- cv_destroy(&zio->io_cv);
- mutex_destroy(&zio->io_lock);
- kmem_cache_free(zio_cache, zio);
- }
-}
-
-/*
- * ==========================================================================
- * Compression support
- * ==========================================================================
- */
-static void
-zio_write_compress(zio_t *zio)
-{
- int compress = zio->io_compress;
- blkptr_t *bp = zio->io_bp;
- void *cbuf;
- uint64_t lsize = zio->io_size;
- uint64_t csize = lsize;
- uint64_t cbufsize = 0;
- int pass;
-
- if (bp->blk_birth == zio->io_txg) {
- /*
- * We're rewriting an existing block, which means we're
- * working on behalf of spa_sync(). For spa_sync() to
- * converge, it must eventually be the case that we don't
- * have to allocate new blocks. But compression changes
- * the blocksize, which forces a reallocate, and makes
- * convergence take longer. Therefore, after the first
- * few passes, stop compressing to ensure convergence.
- */
- pass = spa_sync_pass(zio->io_spa);
- if (pass > zio_sync_pass.zp_dontcompress)
- compress = ZIO_COMPRESS_OFF;
- } else {
- ASSERT(BP_IS_HOLE(bp));
- pass = 1;
- }
-
- if (compress != ZIO_COMPRESS_OFF)
- if (!zio_compress_data(compress, zio->io_data, zio->io_size,
- &cbuf, &csize, &cbufsize))
- compress = ZIO_COMPRESS_OFF;
-
- if (compress != ZIO_COMPRESS_OFF && csize != 0)
- zio_push_transform(zio, cbuf, csize, cbufsize);
-
- /*
- * The final pass of spa_sync() must be all rewrites, but the first
- * few passes offer a trade-off: allocating blocks defers convergence,
- * but newly allocated blocks are sequential, so they can be written
- * to disk faster. Therefore, we allow the first few passes of
- * spa_sync() to reallocate new blocks, but force rewrites after that.
- * There should only be a handful of blocks after pass 1 in any case.
- */
- if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
- pass > zio_sync_pass.zp_rewrite) {
- ASSERT(csize != 0);
- BP_SET_LSIZE(bp, lsize);
- BP_SET_COMPRESS(bp, compress);
- zio->io_pipeline = ZIO_REWRITE_PIPELINE;
- } else {
- if (bp->blk_birth == zio->io_txg)
- BP_ZERO(bp);
- if (csize == 0) {
- BP_ZERO(bp);
- zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
- } else {
- ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
- BP_SET_LSIZE(bp, lsize);
- BP_SET_PSIZE(bp, csize);
- BP_SET_COMPRESS(bp, compress);
- zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
- }
- }
-
- zio_next_stage(zio);
-}
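-
-/*
- * Editorial sketch (not in the original source): the pass-based cutoff
- * above reduces to a small pure function.  Only spa_sync_pass() and the
- * zp_dontcompress threshold come from the code; the name is hypothetical.
- */
-static int
-sketch_effective_compress(int requested, int sync_pass, int dontcompress_pass)
-{
-	/* Late sync passes stop compressing so spa_sync() can converge. */
-	if (sync_pass > dontcompress_pass)
-		return (ZIO_COMPRESS_OFF);
-	return (requested);
-}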
-
-static void
-zio_read_decompress(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- void *data;
- uint64_t size;
- uint64_t bufsize;
- int compress = BP_GET_COMPRESS(bp);
-
- ASSERT(compress != ZIO_COMPRESS_OFF);
-
- zio_pop_transform(zio, &data, &size, &bufsize);
-
- if (zio_decompress_data(compress, data, size,
- zio->io_data, zio->io_size))
- zio->io_error = EIO;
-
- zio_buf_free(data, bufsize);
-
- zio_next_stage(zio);
-}
-
-/*
- * ==========================================================================
- * Gang block support
- * ==========================================================================
- */
-static void
-zio_gang_pipeline(zio_t *zio)
-{
- /*
- * By default, the pipeline assumes that we're dealing with a gang
- * block. If we're not, strip out any gang-specific stages.
- */
- if (!BP_IS_GANG(zio->io_bp))
- zio->io_pipeline &= ~ZIO_GANG_STAGES;
-
- zio_next_stage(zio);
-}
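-
-/*
- * Editorial note: stripping stages is plain bit arithmetic.  With
- * hypothetical masks, a non-gang pipeline simply loses the gang bits:
- *
- *	pipeline    = 0xb4 (stages 2, 4, 5, 7)
- *	GANG_STAGES = 0x30 (stages 4, 5)
- *	pipeline & ~GANG_STAGES = 0x84 (stages 2, 7)
- */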
-
-static void
-zio_gang_byteswap(zio_t *zio)
-{
- ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
-
- if (BP_SHOULD_BYTESWAP(zio->io_bp))
- byteswap_uint64_array(zio->io_data, zio->io_size);
-}
-
-static void
-zio_get_gang_header(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- uint64_t gsize = SPA_GANGBLOCKSIZE;
- void *gbuf = zio_buf_alloc(gsize);
-
- ASSERT(BP_IS_GANG(bp));
-
- zio_push_transform(zio, gbuf, gsize, gsize);
-
- zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
- NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
- zio->io_flags & ZIO_FLAG_GANG_INHERIT,
- ZIO_STAGE_OPEN, ZIO_READ_PIPELINE));
-
- zio_wait_children_done(zio);
-}
-
-static void
-zio_read_gang_members(zio_t *zio)
-{
- zio_gbh_phys_t *gbh;
- uint64_t gsize, gbufsize, loff, lsize;
- int i;
-
- ASSERT(BP_IS_GANG(zio->io_bp));
-
- zio_gang_byteswap(zio);
- zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
-
- for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
- blkptr_t *gbp = &gbh->zg_blkptr[i];
- lsize = BP_GET_PSIZE(gbp);
-
- ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
- ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
- ASSERT3U(loff + lsize, <=, zio->io_size);
- ASSERT(i < SPA_GBH_NBLKPTRS);
- ASSERT(!BP_IS_HOLE(gbp));
-
- zio_nowait(zio_read(zio, zio->io_spa, gbp,
- (char *)zio->io_data + loff, lsize, NULL, NULL,
- zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
- &zio->io_bookmark));
- }
-
- zio_buf_free(gbh, gbufsize);
- zio_wait_children_done(zio);
-}
-
-static void
-zio_rewrite_gang_members(zio_t *zio)
-{
- zio_gbh_phys_t *gbh;
- uint64_t gsize, gbufsize, loff, lsize;
- int i;
-
- ASSERT(BP_IS_GANG(zio->io_bp));
- ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
-
- zio_gang_byteswap(zio);
- zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
-
- ASSERT(gsize == gbufsize);
-
- for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
- blkptr_t *gbp = &gbh->zg_blkptr[i];
- lsize = BP_GET_PSIZE(gbp);
-
- ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
- ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
- ASSERT3U(loff + lsize, <=, zio->io_size);
- ASSERT(i < SPA_GBH_NBLKPTRS);
- ASSERT(!BP_IS_HOLE(gbp));
-
- zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
- zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
- NULL, NULL, zio->io_priority, zio->io_flags,
- &zio->io_bookmark));
- }
-
- zio_push_transform(zio, gbh, gsize, gbufsize);
- zio_wait_children_ready(zio);
-}
-
-static void
-zio_free_gang_members(zio_t *zio)
-{
- zio_gbh_phys_t *gbh;
- uint64_t gsize, gbufsize;
- int i;
-
- ASSERT(BP_IS_GANG(zio->io_bp));
-
- zio_gang_byteswap(zio);
- zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
-
- for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
- blkptr_t *gbp = &gbh->zg_blkptr[i];
-
- if (BP_IS_HOLE(gbp))
- continue;
- zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
- gbp, NULL, NULL));
- }
-
- zio_buf_free(gbh, gbufsize);
- zio_next_stage(zio);
-}
-
-static void
-zio_claim_gang_members(zio_t *zio)
-{
- zio_gbh_phys_t *gbh;
- uint64_t gsize, gbufsize;
- int i;
-
- ASSERT(BP_IS_GANG(zio->io_bp));
-
- zio_gang_byteswap(zio);
- zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
-
- for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
- blkptr_t *gbp = &gbh->zg_blkptr[i];
- if (BP_IS_HOLE(gbp))
- continue;
- zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
- gbp, NULL, NULL));
- }
-
- zio_buf_free(gbh, gbufsize);
- zio_next_stage(zio);
-}
-
-static void
-zio_write_allocate_gang_member_done(zio_t *zio)
-{
- zio_t *pio = zio->io_parent;
- dva_t *cdva = zio->io_bp->blk_dva;
- dva_t *pdva = pio->io_bp->blk_dva;
- uint64_t asize;
- int d;
-
- ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
- ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
- ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
- ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
-
- mutex_enter(&pio->io_lock);
- for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
- ASSERT(DVA_GET_GANG(&pdva[d]));
- asize = DVA_GET_ASIZE(&pdva[d]);
- asize += DVA_GET_ASIZE(&cdva[d]);
- DVA_SET_ASIZE(&pdva[d], asize);
- }
- mutex_exit(&pio->io_lock);
-}
-
-static void
-zio_write_allocate_gang_members(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- dva_t *dva = bp->blk_dva;
- spa_t *spa = zio->io_spa;
- zio_gbh_phys_t *gbh;
- uint64_t txg = zio->io_txg;
- uint64_t resid = zio->io_size;
- uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
- uint64_t gsize, loff, lsize;
- uint32_t gbps_left;
- int ndvas = zio->io_ndvas;
- int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
- int error;
- int i, d;
-
- gsize = SPA_GANGBLOCKSIZE;
- gbps_left = SPA_GBH_NBLKPTRS;
-
- error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE);
- if (error == ENOSPC)
- panic("can't allocate gang block header");
- ASSERT(error == 0);
-
- for (d = 0; d < gbh_ndvas; d++)
- DVA_SET_GANG(&dva[d], 1);
-
- bp->blk_birth = txg;
-
- gbh = zio_buf_alloc(gsize);
- bzero(gbh, gsize);
-
- /* We need to test multi-level gang blocks */
- if (maxalloc >= zio_gang_bang && (LBOLT & 0x1) == 0)
- maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);
-
- for (loff = 0, i = 0; loff != zio->io_size;
- loff += lsize, resid -= lsize, gbps_left--, i++) {
- blkptr_t *gbp = &gbh->zg_blkptr[i];
- dva = gbp->blk_dva;
-
- ASSERT(gbps_left != 0);
- maxalloc = MIN(maxalloc, resid);
-
- while (resid <= maxalloc * gbps_left) {
- error = metaslab_alloc(spa, maxalloc, gbp, ndvas,
- txg, bp, B_FALSE);
- if (error == 0)
- break;
- ASSERT3U(error, ==, ENOSPC);
- if (maxalloc == SPA_MINBLOCKSIZE)
- panic("really out of space");
- maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
- }
-
- if (resid <= maxalloc * gbps_left) {
- lsize = maxalloc;
- BP_SET_LSIZE(gbp, lsize);
- BP_SET_PSIZE(gbp, lsize);
- BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
- gbp->blk_birth = txg;
- zio_nowait(zio_rewrite(zio, spa,
- zio->io_checksum, txg, gbp,
- (char *)zio->io_data + loff, lsize,
- zio_write_allocate_gang_member_done, NULL,
- zio->io_priority, zio->io_flags,
- &zio->io_bookmark));
- } else {
- lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
- ASSERT(lsize != SPA_MINBLOCKSIZE);
- zio_nowait(zio_write_allocate(zio, spa,
- zio->io_checksum, txg, gbp,
- (char *)zio->io_data + loff, lsize,
- zio_write_allocate_gang_member_done, NULL,
- zio->io_priority, zio->io_flags));
- }
- }
-
- ASSERT(resid == 0 && loff == zio->io_size);
-
- zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
-
- zio_push_transform(zio, gbh, gsize, gsize);
- /*
- * As much as we'd like this to be zio_wait_children_ready(),
- * updating our ASIZE doesn't happen until the io_done callback,
- * so we have to wait for that to finish in order for our BP
- * to be stable.
- */
- zio_wait_children_done(zio);
-}
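-
-/*
- * Editorial worked example (assumed sizes) for the split above, using
- * P2ROUNDUP(x, align) == -(-(x) & -(align)):
- *
- *	resid = 102400, gbps_left = 3, SPA_MINBLOCKSIZE = 512
- *	lsize = P2ROUNDUP(102400 / 3, 512) = P2ROUNDUP(34133, 512) = 34304
- *
- * and on ENOSPC the retry loop halves the target instead:
- *
- *	maxalloc = P2ROUNDUP(34304 >> 1, 512) = 17408
- */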
-
-/*
- * ==========================================================================
- * Allocate and free blocks
- * ==========================================================================
- */
-static void
-zio_dva_allocate(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- int error;
-
- ASSERT(BP_IS_HOLE(bp));
- ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
- ASSERT3U(zio->io_ndvas, >, 0);
- ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa));
-
- /* For testing, make some blocks above a certain size be gang blocks */
- if (zio->io_size >= zio_gang_bang && (LBOLT & 0x3) == 0) {
- zio_write_allocate_gang_members(zio);
- return;
- }
-
- ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
-
- error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas,
- zio->io_txg, NULL, B_FALSE);
-
- if (error == 0) {
- bp->blk_birth = zio->io_txg;
- } else if (error == ENOSPC) {
- if (zio->io_size == SPA_MINBLOCKSIZE)
- panic("really, truly out of space");
- zio_write_allocate_gang_members(zio);
- return;
- } else {
- zio->io_error = error;
- }
- zio_next_stage(zio);
-}
-
-static void
-zio_dva_free(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
-
- metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);
-
- BP_ZERO(bp);
-
- zio_next_stage(zio);
-}
-
-static void
-zio_dva_claim(zio_t *zio)
-{
- zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
-
- zio_next_stage(zio);
-}
-
-/*
- * ==========================================================================
- * Read and write to physical devices
- * ==========================================================================
- */
-
-static void
-zio_vdev_io_start(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_t *tvd = vd ? vd->vdev_top : NULL;
- blkptr_t *bp = zio->io_bp;
- uint64_t align;
-
- if (vd == NULL) {
- /* The mirror_ops handle multiple DVAs in a single BP */
- vdev_mirror_ops.vdev_op_io_start(zio);
- return;
- }
-
- align = 1ULL << tvd->vdev_ashift;
-
- if (zio->io_retries == 0 && vd == tvd)
- zio->io_flags |= ZIO_FLAG_FAILFAST;
-
- if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
- vd->vdev_children == 0) {
- zio->io_flags |= ZIO_FLAG_PHYSICAL;
- zio->io_offset += VDEV_LABEL_START_SIZE;
- }
-
- if (P2PHASE(zio->io_size, align) != 0) {
- uint64_t asize = P2ROUNDUP(zio->io_size, align);
- char *abuf = zio_buf_alloc(asize);
- ASSERT(vd == tvd);
- if (zio->io_type == ZIO_TYPE_WRITE) {
- bcopy(zio->io_data, abuf, zio->io_size);
- bzero(abuf + zio->io_size, asize - zio->io_size);
- }
- zio_push_transform(zio, abuf, asize, asize);
- ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
- zio->io_flags |= ZIO_FLAG_SUBBLOCK;
- }
-
- ASSERT(P2PHASE(zio->io_offset, align) == 0);
- ASSERT(P2PHASE(zio->io_size, align) == 0);
- ASSERT(bp == NULL ||
- P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
- ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
-
- vdev_io_start(zio);
-
- /* zio_next_stage_async() gets called from io completion interrupt */
-}
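-
-/*
- * Editorial worked example for the SUBBLOCK padding above, assuming a
- * top-level vdev with ashift 9 (512-byte sectors):
- *
- *	align = 1ULL << 9 = 512
- *	io_size = 1000: P2PHASE(1000, 512) = 488 != 0, so pad
- *	asize = P2ROUNDUP(1000, 512) = 1024; on a write, bytes
- *	1000..1023 of the bounce buffer are zeroed
- */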
-
-static void
-zio_vdev_io_done(zio_t *zio)
-{
- if (zio->io_vd == NULL)
- /* The mirror_ops handle multiple DVAs in a single BP */
- vdev_mirror_ops.vdev_op_io_done(zio);
- else
- vdev_io_done(zio);
-}
-
-/* XXPOLICY */
-boolean_t
-zio_should_retry(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
-
- if (zio->io_error == 0)
- return (B_FALSE);
- if (zio->io_delegate_list != NULL)
- return (B_FALSE);
- if (vd && vd != vd->vdev_top)
- return (B_FALSE);
- if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
- return (B_FALSE);
- if (zio->io_retries > 0)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-static void
-zio_vdev_io_assess(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_t *tvd = vd ? vd->vdev_top : NULL;
-
- ASSERT(zio->io_vsd == NULL);
-
- if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
- void *abuf;
- uint64_t asize;
- ASSERT(vd == tvd);
- zio_pop_transform(zio, &abuf, &asize, &asize);
- if (zio->io_type == ZIO_TYPE_READ)
- bcopy(abuf, zio->io_data, zio->io_size);
- zio_buf_free(abuf, asize);
- zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
- }
-
- if (zio_injection_enabled && !zio->io_error)
- zio->io_error = zio_handle_fault_injection(zio, EIO);
-
- /*
- * If the I/O failed, determine whether we should attempt to retry it.
- */
- /* XXPOLICY */
- if (zio_should_retry(zio)) {
- ASSERT(tvd == vd);
-
- zio->io_retries++;
- zio->io_error = 0;
- zio->io_flags &= ZIO_FLAG_VDEV_INHERIT |
- ZIO_FLAG_CONFIG_GRABBED;
- /* XXPOLICY */
- zio->io_flags &= ~ZIO_FLAG_FAILFAST;
- zio->io_flags |= ZIO_FLAG_DONT_CACHE;
- zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
-
- dprintf("retry #%d for %s to %s offset %llx\n",
- zio->io_retries, zio_type_name[zio->io_type],
- vdev_description(vd), zio->io_offset);
-
- zio_next_stage_async(zio);
- return;
- }
-
- if (zio->io_error != 0 && zio->io_error != ECKSUM &&
- !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) {
- /*
- * Poor man's hotplug support. Even if we're done retrying this
- * I/O, try to reopen the vdev to see if it's still attached.
- * To avoid excessive thrashing, we only try it once a minute.
- * This also has the effect of detecting when missing devices
- * have come back, by polling the device once a minute.
- *
- * We need to do this asynchronously because we can't grab
- * all the necessary locks way down here.
- */
- if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) {
- vd->vdev_last_try = gethrtime();
- tvd->vdev_reopen_wanted = 1;
- spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN);
- }
- }
-
- zio_next_stage(zio);
-}
-
-void
-zio_vdev_io_reissue(zio_t *zio)
-{
- ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
- ASSERT(zio->io_error == 0);
-
- zio->io_stage--;
-}
-
-void
-zio_vdev_io_redone(zio_t *zio)
-{
- ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
-
- zio->io_stage--;
-}
-
-void
-zio_vdev_io_bypass(zio_t *zio)
-{
- ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
- ASSERT(zio->io_error == 0);
-
- zio->io_flags |= ZIO_FLAG_IO_BYPASS;
- zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
-}
-
-/*
- * ==========================================================================
- * Generate and verify checksums
- * ==========================================================================
- */
-static void
-zio_checksum_generate(zio_t *zio)
-{
- int checksum = zio->io_checksum;
- blkptr_t *bp = zio->io_bp;
-
- ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
-
- BP_SET_CHECKSUM(bp, checksum);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
-
- zio_next_stage(zio);
-}
-
-static void
-zio_gang_checksum_generate(zio_t *zio)
-{
- zio_cksum_t zc;
- zio_gbh_phys_t *gbh = zio->io_data;
-
- ASSERT(BP_IS_GANG(zio->io_bp));
- ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
-
- zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
-
- zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
-
- zio_next_stage(zio);
-}
-
-static void
-zio_checksum_verify(zio_t *zio)
-{
- if (zio->io_bp != NULL) {
- zio->io_error = zio_checksum_error(zio);
- if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
- zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
- zio->io_spa, zio->io_vd, zio, 0, 0);
- }
-
- zio_next_stage(zio);
-}
-
-/*
- * Called by RAID-Z to ensure we don't compute the checksum twice.
- */
-void
-zio_checksum_verified(zio_t *zio)
-{
- zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
-}
-
-/*
- * Set the external verifier for a gang block from the bp's identity:
- * the first DVA's vdev and offset, plus the block's birth txg.
- */
-void
-zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
-{
- blkptr_t *bp = zio->io_bp;
-
- zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
- zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
- zcp->zc_word[2] = bp->blk_birth;
- zcp->zc_word[3] = 0;
-}
-
-/*
- * ==========================================================================
- * Define the pipeline
- * ==========================================================================
- */
-typedef void zio_pipe_stage_t(zio_t *zio);
-
-static void
-zio_badop(zio_t *zio)
-{
- panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
-}
-
-zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
- zio_badop,
- zio_wait_children_ready,
- zio_write_compress,
- zio_checksum_generate,
- zio_gang_pipeline,
- zio_get_gang_header,
- zio_rewrite_gang_members,
- zio_free_gang_members,
- zio_claim_gang_members,
- zio_dva_allocate,
- zio_dva_free,
- zio_dva_claim,
- zio_gang_checksum_generate,
- zio_ready,
- zio_vdev_io_start,
- zio_vdev_io_done,
- zio_vdev_io_assess,
- zio_wait_children_done,
- zio_checksum_verify,
- zio_read_gang_members,
- zio_read_decompress,
- zio_done,
- zio_badop
-};
-
-/*
- * Move an I/O to the next stage of the pipeline and execute that stage.
- * There's no locking on io_stage because there's no legitimate way for
- * multiple threads to be attempting to process the same I/O.
- */
-void
-zio_next_stage(zio_t *zio)
-{
- uint32_t pipeline = zio->io_pipeline;
-
- ASSERT(!MUTEX_HELD(&zio->io_lock));
-
- if (zio->io_error) {
- dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
- zio, vdev_description(zio->io_vd),
- zio->io_offset, zio->io_stage, zio->io_error);
- if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
- pipeline &= ZIO_ERROR_PIPELINE_MASK;
- }
-
- while (((1U << ++zio->io_stage) & pipeline) == 0)
- continue;
-
- ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
- ASSERT(zio->io_stalled == 0);
-
- /*
- * See the comment in zio_next_stage_async() about per-CPU taskqs.
- */
- if (((1U << zio->io_stage) & zio->io_async_stages) &&
- (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) &&
- !(zio->io_flags & ZIO_FLAG_METADATA)) {
- taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
- (void) taskq_dispatch(tq,
- (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
- } else {
- zio_pipeline[zio->io_stage](zio);
- }
-}
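-
-/*
- * Editorial note: the advance loop above is "find the next set bit
- * strictly above io_stage".  With assumed values:
- *
- *	pipeline = 0xa4 (stages 2, 5, 7), io_stage = 2
- *	-> bits 3 and 4 are clear, so the zio jumps to stage 5
- */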
-
-void
-zio_next_stage_async(zio_t *zio)
-{
- taskq_t *tq;
- uint32_t pipeline = zio->io_pipeline;
-
- ASSERT(!MUTEX_HELD(&zio->io_lock));
-
- if (zio->io_error) {
- dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
- zio, vdev_description(zio->io_vd),
- zio->io_offset, zio->io_stage, zio->io_error);
- if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
- pipeline &= ZIO_ERROR_PIPELINE_MASK;
- }
-
- while (((1U << ++zio->io_stage) & pipeline) == 0)
- continue;
-
- ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
- ASSERT(zio->io_stalled == 0);
-
- /*
- * For performance, we'll probably want two sets of task queues:
- * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU
- * part is for read performance: since we have to make a pass over
- * the data to checksum it anyway, we want to do this on the same CPU
- * that issued the read, because (assuming CPU scheduling affinity)
- * that thread is probably still there. Getting this optimization
- * right avoids performance-hostile cache-to-cache transfers.
- *
- * Note that having two sets of task queues is also necessary for
- * correctness: if all of the issue threads get bogged down waiting
- * for dependent reads (e.g. metaslab freelist) to complete, then
- * there won't be any threads available to service I/O completion
- * interrupts.
- */
- if ((1U << zio->io_stage) & zio->io_async_stages) {
- if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
- tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
- else
- tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
- (void) taskq_dispatch(tq,
- (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
- } else {
- zio_pipeline[zio->io_stage](zio);
- }
-}
-
-static boolean_t
-zio_alloc_should_fail(void)
-{
- static uint16_t allocs = 0;
-
- return (P2PHASE(allocs++, 1U<<zio_zil_fail_shift) == 0);
-}
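-
-/*
- * Editorial note: with zio_zil_fail_shift = 3 the predicate above is
- * (allocs & 7) == 0, i.e. calls 0, 8, 16, ... fail -- one in every
- * 2^3 = 8 intent log allocations.
- */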
-
-/*
- * Try to allocate an intent log block. Return 0 on success, errno on failure.
- */
-int
-zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
- uint64_t txg)
-{
- int error;
-
- spa_config_enter(spa, RW_READER, FTAG);
-
- if (zio_zil_fail_shift && zio_alloc_should_fail()) {
- spa_config_exit(spa, FTAG);
- return (ENOSPC);
- }
-
- /*
-	 * We were passed the previous log block's dva_t in old_bp->blk_dva[0],
-	 * which metaslab_alloc() uses as an allocation hint.
- */
- error = metaslab_alloc(spa, size, new_bp, 1, txg, old_bp, B_TRUE);
-
- if (error == 0) {
- BP_SET_LSIZE(new_bp, size);
- BP_SET_PSIZE(new_bp, size);
- BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
- BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
- BP_SET_LEVEL(new_bp, 0);
- BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
- new_bp->blk_birth = txg;
- }
-
- spa_config_exit(spa, FTAG);
-
- return (error);
-}
-
-/*
- * Free an intent log block. We know it can't be a gang block, so there's
- * nothing to do except metaslab_free() it.
- */
-void
-zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
-{
- ASSERT(!BP_IS_GANG(bp));
-
- spa_config_enter(spa, RW_READER, FTAG);
-
- metaslab_free(spa, bp, txg, B_FALSE);
-
- spa_config_exit(spa, FTAG);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
deleted file mode 100644
index f0d9a14..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-
-/*
- * Checksum vectors.
- *
- * In the SPA, everything is checksummed. We support checksum vectors
- * for three distinct reasons:
- *
- * 1. Different kinds of data need different levels of protection.
- * For SPA metadata, we always want a very strong checksum.
- * For user data, we let users make the trade-off between speed
- * and checksum strength.
- *
- * 2. Cryptographic hash and MAC algorithms are an area of active research.
- * It is likely that in future hash functions will be at least as strong
- *    It is likely that future hash functions will be at least as strong
- * We want the ability to take advantage of these new hashes as soon as
- * they become available.
- *
- * 3. If someone develops hardware that can compute a strong hash quickly,
- * we want the ability to take advantage of that hardware.
- *
- * Of course, we don't want a checksum upgrade to invalidate existing
- * data, so we store the checksum *function* in five bits of the DVA.
- * This gives us room for up to 32 different checksum functions.
- *
- * When writing a block, we always checksum it with the latest-and-greatest
- * checksum function of the appropriate strength. When reading a block,
- * we compare the expected checksum against the actual checksum, which we
- * compute via the checksum function specified in the DVA encoding.
- */
-
-/*ARGSUSED*/
-static void
-zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
- ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
-}
-
-zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
- {{NULL, NULL}, 0, 0, "inherit"},
- {{NULL, NULL}, 0, 0, "on"},
- {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"},
- {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, "zilog"},
- {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"},
- {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"},
-};
-
-uint8_t
-zio_checksum_select(uint8_t child, uint8_t parent)
-{
- ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
- ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
- ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
-
- if (child == ZIO_CHECKSUM_INHERIT)
- return (parent);
-
- if (child == ZIO_CHECKSUM_ON)
- return (ZIO_CHECKSUM_ON_VALUE);
-
- return (child);
-}
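-
-/*
- * Editorial usage sketch: resolving a dataset's effective checksum
- * against its parent.  "inherit" takes the parent's resolved value;
- * "on" maps to the pool-wide default:
- *
- *	zio_checksum_select(ZIO_CHECKSUM_INHERIT, ZIO_CHECKSUM_FLETCHER_4)
- *	    == ZIO_CHECKSUM_FLETCHER_4
- *	zio_checksum_select(ZIO_CHECKSUM_ON, ZIO_CHECKSUM_FLETCHER_4)
- *	    == ZIO_CHECKSUM_ON_VALUE
- */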
-
-/*
- * Generate the checksum.
- */
-void
-zio_checksum(uint_t checksum, zio_cksum_t *zcp, void *data, uint64_t size)
-{
- zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
- zio_checksum_info_t *ci = &zio_checksum_table[checksum];
- zio_cksum_t zbt_cksum;
-
- ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
- ASSERT(ci->ci_func[0] != NULL);
-
- if (ci->ci_zbt) {
- *zcp = zbt->zbt_cksum;
- zbt->zbt_magic = ZBT_MAGIC;
- ci->ci_func[0](data, size, &zbt_cksum);
- zbt->zbt_cksum = zbt_cksum;
- } else {
- ci->ci_func[0](data, size, zcp);
- }
-}
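-
-/*
- * Editorial note on the ci_zbt path above: the checksum is embedded in
- * a zio_block_tail_t at the end of the buffer, so it is computed with
- * the tail included and patched in afterwards:
- *
- *	[ payload ................. | zbt_magic | zbt_cksum ]
- *	  <------------- size bytes, tail included -------->
- */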
-
-int
-zio_checksum_error(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- zio_cksum_t zc = bp->blk_cksum;
- uint_t checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER :
- BP_GET_CHECKSUM(bp);
- int byteswap = BP_SHOULD_BYTESWAP(bp);
- void *data = zio->io_data;
- uint64_t size = ZIO_GET_IOSIZE(zio);
- zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
- zio_checksum_info_t *ci = &zio_checksum_table[checksum];
- zio_cksum_t actual_cksum, expected_cksum;
-
- if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
- return (EINVAL);
-
- if (ci->ci_zbt) {
- if (checksum == ZIO_CHECKSUM_GANG_HEADER)
- zio_set_gang_verifier(zio, &zc);
-
- if (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)) {
- expected_cksum = zbt->zbt_cksum;
- byteswap_uint64_array(&expected_cksum,
- sizeof (zio_cksum_t));
- zbt->zbt_cksum = zc;
- byteswap_uint64_array(&zbt->zbt_cksum,
- sizeof (zio_cksum_t));
- ci->ci_func[1](data, size, &actual_cksum);
- zbt->zbt_cksum = expected_cksum;
- byteswap_uint64_array(&zbt->zbt_cksum,
- sizeof (zio_cksum_t));
- } else {
- expected_cksum = zbt->zbt_cksum;
- zbt->zbt_cksum = zc;
- ci->ci_func[0](data, size, &actual_cksum);
- zbt->zbt_cksum = expected_cksum;
- }
- zc = expected_cksum;
- } else {
- ASSERT(!BP_IS_GANG(bp));
- ci->ci_func[byteswap](data, size, &actual_cksum);
- }
-
- if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc))
- return (ECKSUM);
-
- if (zio_injection_enabled && !zio->io_error)
- return (zio_handle_fault_injection(zio, ECKSUM));
-
- return (0);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
deleted file mode 100644
index c563be4..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/compress.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/zio_compress.h>
-
-/*
- * Compression vectors.
- */
-
-zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
- {NULL, NULL, 0, "inherit"},
- {NULL, NULL, 0, "on"},
- {NULL, NULL, 0, "uncompressed"},
- {lzjb_compress, lzjb_decompress, 0, "lzjb"},
- {NULL, NULL, 0, "empty"},
- {gzip_compress, gzip_decompress, 1, "gzip-1"},
- {gzip_compress, gzip_decompress, 2, "gzip-2"},
- {gzip_compress, gzip_decompress, 3, "gzip-3"},
- {gzip_compress, gzip_decompress, 4, "gzip-4"},
- {gzip_compress, gzip_decompress, 5, "gzip-5"},
- {gzip_compress, gzip_decompress, 6, "gzip-6"},
- {gzip_compress, gzip_decompress, 7, "gzip-7"},
- {gzip_compress, gzip_decompress, 8, "gzip-8"},
- {gzip_compress, gzip_decompress, 9, "gzip-9"},
-};
-
-uint8_t
-zio_compress_select(uint8_t child, uint8_t parent)
-{
- ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
- ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
- ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON);
-
- if (child == ZIO_COMPRESS_INHERIT)
- return (parent);
-
- if (child == ZIO_COMPRESS_ON)
- return (ZIO_COMPRESS_ON_VALUE);
-
- return (child);
-}
-
-int
-zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
- uint64_t *destsizep, uint64_t *destbufsizep)
-{
- uint64_t *word, *word_end;
- uint64_t ciosize, gapsize, destbufsize;
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
- char *dest;
- uint_t allzero;
-
- ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
- ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
-
- /*
- * If the data is all zeroes, we don't even need to allocate
- * a block for it. We indicate this by setting *destsizep = 0.
- */
- allzero = 1;
- word = src;
- word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize);
- while (word < word_end) {
- if (*word++ != 0) {
- allzero = 0;
- break;
- }
- }
- if (allzero) {
- *destp = NULL;
- *destsizep = 0;
- *destbufsizep = 0;
- return (1);
- }
-
- if (cpfunc == ZIO_COMPRESS_EMPTY)
- return (0);
-
- /* Compress at least 12.5% */
- destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
- if (destbufsize == 0)
- return (0);
- dest = zio_buf_alloc(destbufsize);
-
- ciosize = ci->ci_compress(src, dest, (size_t)srcsize,
- (size_t)destbufsize, ci->ci_level);
- if (ciosize > destbufsize) {
- zio_buf_free(dest, destbufsize);
- return (0);
- }
-
- /* Cool. We compressed at least as much as we were hoping to. */
-
- /* For security, make sure we don't write random heap crap to disk */
- gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;
- if (gapsize != 0) {
- bzero(dest + ciosize, gapsize);
- ciosize += gapsize;
- }
-
- ASSERT3U(ciosize, <=, destbufsize);
- ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
- *destp = dest;
- *destsizep = ciosize;
- *destbufsizep = destbufsize;
-
- return (1);
-}
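-
-/*
- * Editorial worked example for the "compress at least 12.5%" rule,
- * assuming a 128K source block and SPA_MINBLOCKSIZE = 512:
- *
- *	destbufsize = P2ALIGN(131072 - (131072 >> 3), 512) = 114688 (112K)
- *
- * Unless the compressor fits the block into 112K or less, the attempt
- * is discarded and the block is written uncompressed.
- */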
-
-int
-zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
- void *dest, uint64_t destsize)
-{
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
-
- ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
-
- return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
deleted file mode 100644
index 4cada09..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * ZFS fault injection
- *
- * To handle fault injection, we keep track of a series of zinject_record_t
- * structures which describe which logical block(s) should be injected with a
- * fault. These are kept in a global list. Each record corresponds to a given
- * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
- * or exported while the injection record exists.
- *
- * Device level injection is done using the 'zi_guid' field. If this is set, it
- * means that the error is destined for a particular device, not a piece of
- * data.
- *
- * This is a rather poor data structure and algorithm, but we don't expect more
- * than a few faults at any one time, so it should be sufficient for our needs.
- */
-
-#include <sys/arc.h>
-#include <sys/zio_impl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-
-uint32_t zio_injection_enabled;
-
-typedef struct inject_handler {
- int zi_id;
- spa_t *zi_spa;
- zinject_record_t zi_record;
- list_node_t zi_link;
-} inject_handler_t;
-
-static list_t inject_handlers;
-static krwlock_t inject_lock;
-static int inject_next_id = 1;
-
-/*
- * Returns true if the given record matches the I/O in progress.
- */
-static boolean_t
-zio_match_handler(zbookmark_t *zb, uint64_t type,
- zinject_record_t *record, int error)
-{
- /*
- * Check for a match against the MOS, which is based on type
- */
- if (zb->zb_objset == 0 && record->zi_objset == 0 &&
- record->zi_object == 0) {
- if (record->zi_type == DMU_OT_NONE ||
- type == record->zi_type)
- return (record->zi_freq == 0 ||
- spa_get_random(100) < record->zi_freq);
- else
- return (B_FALSE);
- }
-
- /*
- * Check for an exact match.
- */
- if (zb->zb_objset == record->zi_objset &&
- zb->zb_object == record->zi_object &&
- zb->zb_level == record->zi_level &&
- zb->zb_blkid >= record->zi_start &&
- zb->zb_blkid <= record->zi_end &&
- error == record->zi_error)
- return (record->zi_freq == 0 ||
- spa_get_random(100) < record->zi_freq);
-
- return (B_FALSE);
-}
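-
-/*
- * Editorial sketch (hypothetical values): a record matching level-0
- * blocks 100..199 of object 42 in objset 5, injecting EIO on every
- * hit (zi_freq == 0 means "always"):
- */
-static void
-sketch_fill_record(zinject_record_t *r)
-{
-	r->zi_objset = 5;
-	r->zi_object = 42;
-	r->zi_level = 0;
-	r->zi_start = 100;
-	r->zi_end = 199;
-	r->zi_error = EIO;
-	r->zi_freq = 0;
-}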
-
-/*
- * Determine if the I/O in question should return failure. Returns the errno
- * to be returned to the caller.
- */
-int
-zio_handle_fault_injection(zio_t *zio, int error)
-{
- int ret = 0;
- inject_handler_t *handler;
-
- /*
- * Ignore I/O not associated with any logical data.
- */
- if (zio->io_logical == NULL)
- return (0);
-
- /*
- * Currently, we only support fault injection on reads.
- */
- if (zio->io_type != ZIO_TYPE_READ)
- return (0);
-
- rw_enter(&inject_lock, RW_READER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler)) {
-
- /* Ignore errors not destined for this pool */
- if (zio->io_spa != handler->zi_spa)
- continue;
-
- /* Ignore device errors */
- if (handler->zi_record.zi_guid != 0)
- continue;
-
- /* If this handler matches, return EIO */
- if (zio_match_handler(&zio->io_logical->io_bookmark,
- zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
- &handler->zi_record, error)) {
- ret = error;
- break;
- }
- }
-
- rw_exit(&inject_lock);
-
- return (ret);
-}
-
-int
-zio_handle_device_injection(vdev_t *vd, int error)
-{
- inject_handler_t *handler;
- int ret = 0;
-
- rw_enter(&inject_lock, RW_READER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler)) {
-
- if (vd->vdev_guid == handler->zi_record.zi_guid) {
- if (handler->zi_record.zi_error == error) {
- /*
- * For a failed open, pretend like the device
- * has gone away.
- */
- if (error == ENXIO)
- vd->vdev_stat.vs_aux =
- VDEV_AUX_OPEN_FAILED;
- ret = error;
- break;
- }
- if (handler->zi_record.zi_error == ENXIO) {
- ret = EIO;
- break;
- }
- }
- }
-
- rw_exit(&inject_lock);
-
- return (ret);
-}
-
-/*
- * Create a new handler for the given record. We add it to the list, adding
- * a reference to the spa_t in the process. We increment zio_injection_enabled,
- * which is the switch to trigger all fault injection.
- */
-int
-zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
-{
- inject_handler_t *handler;
- int error;
- spa_t *spa;
-
- /*
- * If this is pool-wide metadata, make sure we unload the corresponding
- * spa_t, so that the next attempt to load it will trigger the fault.
- * We call spa_reset() to unload the pool appropriately.
- */
- if (flags & ZINJECT_UNLOAD_SPA)
- if ((error = spa_reset(name)) != 0)
- return (error);
-
- if (!(flags & ZINJECT_NULL)) {
- /*
-		 * spa_inject_addref() will add an injection reference, which
-		 * will prevent the pool from being removed from the namespace
- * prevent the pool from being removed from the namespace while
- * still allowing it to be unloaded.
- */
- if ((spa = spa_inject_addref(name)) == NULL)
- return (ENOENT);
-
- handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
-
- rw_enter(&inject_lock, RW_WRITER);
-
- *id = handler->zi_id = inject_next_id++;
- handler->zi_spa = spa;
- handler->zi_record = *record;
- list_insert_tail(&inject_handlers, handler);
- atomic_add_32(&zio_injection_enabled, 1);
-
- rw_exit(&inject_lock);
- }
-
- /*
- * Flush the ARC, so that any attempts to read this data will end up
- * going to the ZIO layer. Note that this is a little overkill, but
- * we don't have the necessary ARC interfaces to do anything else, and
- * fault injection isn't a performance critical path.
- */
- if (flags & ZINJECT_FLUSH_ARC)
- arc_flush();
-
- return (0);
-}
-
-/*
- * Returns the next record with an ID greater than that supplied to the
- * function. Used to iterate over all handlers in the system.
- */
-int
-zio_inject_list_next(int *id, char *name, size_t buflen,
- zinject_record_t *record)
-{
- inject_handler_t *handler;
- int ret;
-
- mutex_enter(&spa_namespace_lock);
- rw_enter(&inject_lock, RW_READER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler))
- if (handler->zi_id > *id)
- break;
-
- if (handler) {
- *record = handler->zi_record;
- *id = handler->zi_id;
- (void) strncpy(name, spa_name(handler->zi_spa), buflen);
- ret = 0;
- } else {
- ret = ENOENT;
- }
-
- rw_exit(&inject_lock);
- mutex_exit(&spa_namespace_lock);
-
- return (ret);
-}
-
-/*
- * Clear the fault handler with the given identifier, or return ENOENT if none
- * exists.
- */
-int
-zio_clear_fault(int id)
-{
- inject_handler_t *handler;
- int ret;
-
- rw_enter(&inject_lock, RW_WRITER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler))
- if (handler->zi_id == id)
- break;
-
- if (handler == NULL) {
- ret = ENOENT;
- } else {
- list_remove(&inject_handlers, handler);
- spa_inject_delref(handler->zi_spa);
- kmem_free(handler, sizeof (inject_handler_t));
- atomic_add_32(&zio_injection_enabled, -1);
- ret = 0;
- }
-
- rw_exit(&inject_lock);
-
- return (ret);
-}
-
-void
-zio_inject_init(void)
-{
- list_create(&inject_handlers, sizeof (inject_handler_t),
- offsetof(inject_handler_t, zi_link));
-}
-
-void
-zio_inject_fini(void)
-{
- list_destroy(&inject_handlers);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c
deleted file mode 100644
index fedae03..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c
+++ /dev/null
@@ -1,801 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * ZFS volume emulation driver.
- *
- * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
- * Volumes are accessed through the symbolic links named:
- *
- * /dev/zvol/dsk/<pool_name>/<dataset_name>
- * /dev/zvol/rdsk/<pool_name>/<dataset_name>
- *
- * These links are created by the ZFS-specific devfsadm link generator.
- * Volumes are persistent through reboot. No user command needs to be
- * run before opening and using a device.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/errno.h>
-#include <sys/uio.h>
-#include <sys/bio.h>
-#include <sys/buf.h>
-#include <sys/kmem.h>
-#include <sys/conf.h>
-#include <sys/cmn_err.h>
-#include <sys/stat.h>
-#include <sys/zap.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dsl_prop.h>
-#include <sys/dkio.h>
-#include <sys/byteorder.h>
-#include <sys/sunddi.h>
-#include <sys/dirent.h>
-#include <sys/policy.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zil.h>
-#include <sys/refcount.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_rlock.h>
-#include <geom/geom.h>
-
-#include "zfs_namecheck.h"
-
-struct g_class zfs_zvol_class = {
- .name = "ZFS::ZVOL",
- .version = G_VERSION,
-};
-
-DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
-
-#define ZVOL_OBJ 1ULL
-#define ZVOL_ZAP_OBJ 2ULL
-
-static uint32_t zvol_minors;
-
-/*
- * The in-core state of each volume.
- */
-typedef struct zvol_state {
- char zv_name[MAXPATHLEN]; /* pool/dd name */
- uint64_t zv_volsize; /* amount of space we advertise */
- uint64_t zv_volblocksize; /* volume block size */
- struct g_provider *zv_provider; /* GEOM provider */
- uint8_t zv_min_bs; /* minimum addressable block shift */
- uint8_t zv_readonly; /* hard readonly; like write-protect */
- objset_t *zv_objset; /* objset handle */
- uint32_t zv_mode; /* DS_MODE_* flags at open time */
- uint32_t zv_total_opens; /* total open count */
- zilog_t *zv_zilog; /* ZIL handle */
- uint64_t zv_txg_assign; /* txg to assign during ZIL replay */
- znode_t zv_znode; /* for range locking */
- int zv_state;
- struct bio_queue_head zv_queue;
- struct mtx zv_queue_mtx; /* zv_queue mutex */
-} zvol_state_t;
-
-/*
- * zvol maximum transfer in one DMU tx.
- */
-int zvol_maxphys = DMU_MAX_ACCESS/2;
-
-static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
-
-int
-zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
-{
- if (volsize == 0)
- return (EINVAL);
-
- if (volsize % blocksize != 0)
- return (EINVAL);
-
-#ifdef _ILP32
- if (volsize - 1 > SPEC_MAXOFFSET_T)
- return (EOVERFLOW);
-#endif
- return (0);
-}
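-
-/*
- * Editorial usage note: the size must be a whole number of blocks, so
- * with an 8K block size:
- *
- *	zvol_check_volsize(1ULL << 30, 8192)          == 0
- *	zvol_check_volsize((1ULL << 30) + 4096, 8192) == EINVAL
- */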
-
-int
-zvol_check_volblocksize(uint64_t volblocksize)
-{
- if (volblocksize < SPA_MINBLOCKSIZE ||
- volblocksize > SPA_MAXBLOCKSIZE ||
- !ISP2(volblocksize))
- return (EDOM);
-
- return (0);
-}
-
-static void
-zvol_readonly_changed_cb(void *arg, uint64_t newval)
-{
- zvol_state_t *zv = arg;
-
- zv->zv_readonly = (uint8_t)newval;
-}
-
-int
-zvol_get_stats(objset_t *os, nvlist_t *nv)
-{
- int error;
- dmu_object_info_t doi;
- uint64_t val;
-
- error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
- if (error)
- return (error);
-
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
-
- error = dmu_object_info(os, ZVOL_OBJ, &doi);
-
- if (error == 0) {
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
- doi.doi_data_block_size);
- }
-
- return (error);
-}
-
-static zvol_state_t *
-zvol_minor_lookup(const char *name)
-{
- struct g_provider *pp;
- struct g_geom *gp;
-
- g_topology_assert();
-
- LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
- LIST_FOREACH(pp, &gp->provider, provider) {
- if (strcmp(pp->name + sizeof(ZVOL_DEV_DIR), name) == 0)
- return (pp->private);
- }
- }
-
- return (NULL);
-}
-
-static int
-zvol_access(struct g_provider *pp, int acr, int acw, int ace)
-{
- zvol_state_t *zv;
-
- g_topology_assert();
-
- zv = pp->private;
- if (zv == NULL) {
- if (acr <= 0 && acw <= 0 && ace <= 0)
- return (0);
- return (pp->error);
- }
-
- ASSERT(zv->zv_objset != NULL);
-
- if (acw > 0 && (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)))
- return (EROFS);
-
- zv->zv_total_opens += acr + acw + ace;
-
- return (0);
-}
-
-/*
- * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
- *
- * We store data in the log buffers if it's small enough.
- * Otherwise we will later flush the data out via dmu_sync().
- */
-ssize_t zvol_immediate_write_sz = 32768;
-
-static void
-zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
-{
- uint32_t blocksize = zv->zv_volblocksize;
- lr_write_t *lr;
-
- while (len) {
- ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
- itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
-
- itx->itx_wr_state =
- len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY;
- itx->itx_private = zv;
- lr = (lr_write_t *)&itx->itx_lr;
- lr->lr_foid = ZVOL_OBJ;
- lr->lr_offset = off;
- lr->lr_length = nbytes;
- lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
- BP_ZERO(&lr->lr_blkptr);
-
- (void) zil_itx_assign(zv->zv_zilog, itx, tx);
- len -= nbytes;
- off += nbytes;
- }
-}
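-
-/*
- * Editorial worked example for the chunking above, with an 8K
- * volblocksize, off = 6K and len = 10K.  Two itxs are created, split
- * at the block boundary:
- *
- *	nbytes = MIN(10240, 8192 - P2PHASE(6144, 8192)) = 2048
- *	nbytes = MIN(8192,  8192 - P2PHASE(8192, 8192)) = 8192
- */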
-
-static void
-zvol_start(struct bio *bp)
-{
- zvol_state_t *zv;
-
- switch (bp->bio_cmd) {
- case BIO_READ:
- case BIO_WRITE:
- case BIO_FLUSH:
- zv = bp->bio_to->private;
- ASSERT(zv != NULL);
- mtx_lock(&zv->zv_queue_mtx);
- bioq_insert_tail(&zv->zv_queue, bp);
- wakeup_one(&zv->zv_queue);
- mtx_unlock(&zv->zv_queue_mtx);
- break;
- case BIO_DELETE:
- case BIO_GETATTR:
- default:
- g_io_deliver(bp, EOPNOTSUPP);
- break;
- }
-}
-
-static void
-zvol_serve_one(zvol_state_t *zv, struct bio *bp)
-{
- uint64_t off, volsize;
- size_t size, resid;
- char *addr;
- objset_t *os;
- rl_t *rl;
- int error = 0;
- boolean_t reading;
-
- off = bp->bio_offset;
- volsize = zv->zv_volsize;
-
- os = zv->zv_objset;
- ASSERT(os != NULL);
-
- addr = bp->bio_data;
- resid = bp->bio_length;
-
- error = 0;
-
- /*
- * There must be no buffer changes when doing a dmu_sync() because
- * we can't change the data whilst calculating the checksum.
- * A better approach than a per zvol rwlock would be to lock ranges.
- */
- reading = (bp->bio_cmd == BIO_READ);
- rl = zfs_range_lock(&zv->zv_znode, off, resid,
- reading ? RL_READER : RL_WRITER);
-
- while (resid != 0 && off < volsize) {
-
- size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */
-
- if (size > volsize - off) /* don't write past the end */
- size = volsize - off;
-
- if (reading) {
- error = dmu_read(os, ZVOL_OBJ, off, size, addr);
- } else {
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
- zvol_log_write(zv, tx, off, size);
- dmu_tx_commit(tx);
- }
- }
- if (error)
- break;
- off += size;
- addr += size;
- resid -= size;
- }
- zfs_range_unlock(rl);
-
- bp->bio_completed = bp->bio_length - resid;
- if (bp->bio_completed < bp->bio_length)
- bp->bio_error = (off > volsize ? EINVAL : error);
-}
-
-static void
-zvol_worker(void *arg)
-{
- zvol_state_t *zv;
- struct bio *bp;
-
- zv = arg;
- for (;;) {
- mtx_lock(&zv->zv_queue_mtx);
- bp = bioq_takefirst(&zv->zv_queue);
- if (bp == NULL) {
- if (zv->zv_state == 1) {
- zv->zv_state = 2;
- wakeup(&zv->zv_state);
- mtx_unlock(&zv->zv_queue_mtx);
- kproc_exit(0);
- }
- msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
- "zvol:io", 0);
- continue;
- }
- mtx_unlock(&zv->zv_queue_mtx);
- switch (bp->bio_cmd) {
- case BIO_FLUSH:
- break;
- case BIO_READ:
- case BIO_WRITE:
- zvol_serve_one(zv, bp);
- break;
- }
-
- if (bp->bio_cmd != BIO_READ && !zil_disable)
- zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
-
- g_io_deliver(bp, bp->bio_error);
- }
-}
-
-void
-zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
-{
- zfs_create_data_t *zc = arg;
- int error;
- uint64_t volblocksize, volsize;
-
- VERIFY(nvlist_lookup_uint64(zc->zc_props,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
- if (nvlist_lookup_uint64(zc->zc_props,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
- volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
-
- /*
-	 * These properties must be removed from the list so the generic
- * property setting step won't apply to them.
- */
- VERIFY(nvlist_remove_all(zc->zc_props,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
- (void) nvlist_remove_all(zc->zc_props,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
-
- error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
- DMU_OT_NONE, 0, tx);
- ASSERT(error == 0);
-
- error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
- DMU_OT_NONE, 0, tx);
- ASSERT(error == 0);
-
- error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
- ASSERT(error == 0);
-}
-
-/*
- * Replay a TX_WRITE ZIL transaction whose data had not yet been
- * committed to the main pool when the system failed.
- */
-static int
-zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
-{
- objset_t *os = zv->zv_objset;
- char *data = (char *)(lr + 1); /* data follows lr_write_t */
- uint64_t off = lr->lr_offset;
- uint64_t len = lr->lr_length;
- dmu_tx_t *tx;
- int error;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
- error = dmu_tx_assign(tx, zv->zv_txg_assign);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- dmu_write(os, ZVOL_OBJ, off, len, data, tx);
- dmu_tx_commit(tx);
- }
-
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
-{
- return (ENOTSUP);
-}
-
-/*
- * Callback vectors for replaying records.
- * Only TX_WRITE is needed for zvol.
- */
-zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
- zvol_replay_err, /* 0 no such transaction type */
- zvol_replay_err, /* TX_CREATE */
- zvol_replay_err, /* TX_MKDIR */
- zvol_replay_err, /* TX_MKXATTR */
- zvol_replay_err, /* TX_SYMLINK */
- zvol_replay_err, /* TX_REMOVE */
- zvol_replay_err, /* TX_RMDIR */
- zvol_replay_err, /* TX_LINK */
- zvol_replay_err, /* TX_RENAME */
- zvol_replay_write, /* TX_WRITE */
- zvol_replay_err, /* TX_TRUNCATE */
- zvol_replay_err, /* TX_SETATTR */
- zvol_replay_err, /* TX_ACL */
-};
-
-/*
- * Create a minor node for the specified volume.
- */
-int
-zvol_create_minor(const char *name, dev_t dev)
-{
- struct g_provider *pp;
- struct g_geom *gp;
- zvol_state_t *zv;
- objset_t *os;
- dmu_object_info_t doi;
- uint64_t volsize;
- int ds_mode = DS_MODE_PRIMARY;
- int error;
-
- DROP_GIANT();
- g_topology_lock();
-
- if ((zv = zvol_minor_lookup(name)) != NULL) {
- error = EEXIST;
- goto end;
- }
-
- if (strchr(name, '@') != 0)
- ds_mode |= DS_MODE_READONLY;
-
- error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);
- if (error)
- goto end;
-
- g_topology_unlock();
- PICKUP_GIANT();
- error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
- DROP_GIANT();
- g_topology_lock();
- if (error) {
- dmu_objset_close(os);
- goto end;
- }
-
- gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
- gp->start = zvol_start;
- gp->access = zvol_access;
- pp = g_new_providerf(gp, "%s/%s", ZVOL_DEV_DIR, name);
- pp->mediasize = volsize;
- pp->sectorsize = DEV_BSIZE;
-
- zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
- (void) strcpy(zv->zv_name, name);
- zv->zv_min_bs = DEV_BSHIFT;
- zv->zv_provider = pp;
- zv->zv_volsize = pp->mediasize;
- zv->zv_objset = os;
- zv->zv_mode = ds_mode;
- zv->zv_zilog = zil_open(os, zvol_get_data);
- mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
- sizeof (rl_t), offsetof(rl_t, r_node));
-
- /* get and cache the blocksize */
- error = dmu_object_info(os, ZVOL_OBJ, &doi);
- ASSERT(error == 0);
- zv->zv_volblocksize = doi.doi_data_block_size;
-
- zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector);
-
- /* XXX this should handle the possible i/o error */
- VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
- "readonly", zvol_readonly_changed_cb, zv) == 0);
-
- pp->private = zv;
- g_error_provider(pp, 0);
-
- bioq_init(&zv->zv_queue);
- mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
- zv->zv_state = 0;
- kproc_create(zvol_worker, zv, NULL, 0, 0, "zvol:worker %s", pp->name);
-
- zvol_minors++;
-end:
- g_topology_unlock();
- PICKUP_GIANT();
-
- return (error);
-}
-
-/*
- * Remove minor node for the specified volume.
- */
-int
-zvol_remove_minor(const char *name)
-{
- struct g_provider *pp;
- zvol_state_t *zv;
- int error = 0;
-
- DROP_GIANT();
- g_topology_lock();
-
- if ((zv = zvol_minor_lookup(name)) == NULL) {
- error = ENXIO;
- goto end;
- }
-
- if (zv->zv_total_opens != 0) {
- error = EBUSY;
- goto end;
- }
-
- VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
- "readonly", zvol_readonly_changed_cb, zv) == 0);
-
- mtx_lock(&zv->zv_queue_mtx);
- zv->zv_state = 1;
- wakeup_one(&zv->zv_queue);
- while (zv->zv_state != 2)
- msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
- mtx_unlock(&zv->zv_queue_mtx);
- mtx_destroy(&zv->zv_queue_mtx);
-
- pp = zv->zv_provider;
- pp->private = NULL;
- g_wither_geom(pp->geom, ENXIO);
-
- zil_close(zv->zv_zilog);
- zv->zv_zilog = NULL;
- dmu_objset_close(zv->zv_objset);
- zv->zv_objset = NULL;
- avl_destroy(&zv->zv_znode.z_range_avl);
- mutex_destroy(&zv->zv_znode.z_range_lock);
-
- kmem_free(zv, sizeof(*zv));
-
- zvol_minors--;
-end:
- g_topology_unlock();
- PICKUP_GIANT();
-
- return (error);
-}
-
-int
-zvol_set_volsize(const char *name, dev_t dev, uint64_t volsize)
-{
- zvol_state_t *zv;
- dmu_tx_t *tx;
- int error;
- dmu_object_info_t doi;
-
- DROP_GIANT();
- g_topology_lock();
-
- if ((zv = zvol_minor_lookup(name)) == NULL) {
- error = ENXIO;
- goto end;
- }
-
- if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
- (error = zvol_check_volsize(volsize,
- doi.doi_data_block_size)) != 0) {
- goto end;
- }
-
- if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
- error = EROFS;
- goto end;
- }
-
- tx = dmu_tx_create(zv->zv_objset);
- dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
- dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- goto end;
- }
-
- error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
- &volsize, tx);
- if (error == 0) {
- error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, volsize,
- DMU_OBJECT_END, tx);
- }
-
- dmu_tx_commit(tx);
-
- if (error == 0) {
- zv->zv_volsize = volsize;
- zv->zv_provider->mediasize = volsize; /* XXX: Not supported. */
- }
-end:
- g_topology_unlock();
- PICKUP_GIANT();
-
- return (error);
-}
-
-int
-zvol_set_volblocksize(const char *name, uint64_t volblocksize)
-{
- zvol_state_t *zv;
- dmu_tx_t *tx;
- int error;
-
- DROP_GIANT();
- g_topology_lock();
-
- if ((zv = zvol_minor_lookup(name)) == NULL) {
- error = ENXIO;
- goto end;
- }
-
- if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
- error = EROFS;
- goto end;
- }
-
- tx = dmu_tx_create(zv->zv_objset);
- dmu_tx_hold_bonus(tx, ZVOL_OBJ);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
- volblocksize, 0, tx);
- if (error == ENOTSUP)
- error = EBUSY;
- dmu_tx_commit(tx);
- /* XXX: Not supported. */
-#if 0
- if (error == 0)
- zv->zv_provider->sectorsize = zc->zc_volblocksize;
-#endif
- }
-end:
- g_topology_unlock();
- PICKUP_GIANT();
-
- return (error);
-}
-
-void
-zvol_get_done(dmu_buf_t *db, void *vzgd)
-{
- zgd_t *zgd = (zgd_t *)vzgd;
- rl_t *rl = zgd->zgd_rl;
-
- dmu_buf_rele(db, vzgd);
- zfs_range_unlock(rl);
- zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
- kmem_free(zgd, sizeof (zgd_t));
-}
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-static int
-zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
-{
- zvol_state_t *zv = arg;
- objset_t *os = zv->zv_objset;
- dmu_buf_t *db;
- rl_t *rl;
- zgd_t *zgd;
- uint64_t boff; /* block starting offset */
- int dlen = lr->lr_length; /* length of user data */
- int error;
-
- ASSERT(zio);
- ASSERT(dlen != 0);
-
- /*
- * Write records come in two flavors: immediate and indirect.
- * For small writes it's cheaper to store the data with the
- * log record (immediate); for large writes it's cheaper to
- * sync the data and get a pointer to it (indirect) so that
- * we don't have to write the data twice.
- */
- if (buf != NULL) /* immediate write */
- return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf));
-
- zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_zilog = zv->zv_zilog;
- zgd->zgd_bp = &lr->lr_blkptr;
-
- /*
-	 * Lock the range of the block so that no other thread can change
-	 * it while the data is being written out and its checksum is
-	 * being calculated.
- */
- boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t);
- rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize,
- RL_READER);
- zgd->zgd_rl = rl;
-
- VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
- error = dmu_sync(zio, db, &lr->lr_blkptr,
- lr->lr_common.lrc_txg, zvol_get_done, zgd);
- if (error == 0)
- zil_add_vdev(zv->zv_zilog,
- DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
- /*
- * If we get EINPROGRESS, then we need to wait for a
- * write IO initiated by dmu_sync() to complete before
- * we can release this dbuf. We will finish everything
- * up in the zvol_get_done() callback.
- */
- if (error == EINPROGRESS)
- return (0);
- dmu_buf_rele(db, zgd);
- zfs_range_unlock(rl);
- kmem_free(zgd, sizeof (zgd_t));
- return (error);
-}
-
-int
-zvol_busy(void)
-{
- return (zvol_minors != 0);
-}
-
-void
-zvol_init(void)
-{
- ZFS_LOG(1, "ZVOL Initialized.");
-}
-
-void
-zvol_fini(void)
-{
- ZFS_LOG(1, "ZVOL Deinitialized.");
-}