author		pjd <pjd@FreeBSD.org>	2007-04-06 01:09:06 +0000
committer	pjd <pjd@FreeBSD.org>	2007-04-06 01:09:06 +0000
commit		3b005d330261f33318ca1ee3fef1940237fd788b (patch)
tree		3061c8734d9ce560165e672836837a0f411a83c9 /sys/cddl/contrib/opensolaris/uts/common/fs/zfs
parent		3be454b8211f48e634e6587f53807d3b5013e973 (diff)
Please welcome ZFS - The last word in file systems.
The ZFS file system was ported from the OpenSolaris operating system. The code is under the CDDL license. I'd like to thank all Sun developers that created this great piece of software.
Supported by:	Wheel LTD (http://www.wheel.pl/)
Supported by:	The FreeBSD Foundation (http://www.freebsdfoundation.org/)
Supported by:	Sentex (http://www.sentex.net/)
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs')
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	2829
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c	312
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	2240
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	1029
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c	160
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	1034
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c	1009
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c	888
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	992
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c	655
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	1370
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c	621
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c	1889
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c	1192
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	255
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c	501
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c	196
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/fletcher.c	145
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c	69
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c	129
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	1023
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c	194
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c	131
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	3265
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c	361
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c	440
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c	354
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	1126
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c	501
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h	109
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h	89
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h	334
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	586
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h	237
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h	125
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h	120
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h	134
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h	75
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h	267
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h	185
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h	142
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h	82
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h	77
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h	77
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h	69
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h	81
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h	103
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h	491
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h	168
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h	162
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h	120
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h	77
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h	50
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h	63
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h	56
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h	132
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h	52
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h	46
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h	298
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h	359
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h	204
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h	234
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h	115
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h	122
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h	71
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h	75
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h	71
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h	162
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h	89
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h	100
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h	298
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h	276
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h	111
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	366
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h	75
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h	82
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h	205
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h	68
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c	611
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c	63
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c	107
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	1905
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c	394
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c	363
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c	225
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c	432
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c	1011
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c	495
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c	89
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	323
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c	1223
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c	118
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c	1070
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c	741
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c	855
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf	28
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c	1607
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c	99
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c	1120
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c	796
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c	336
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c	1811
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c	348
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c	424
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c	594
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	986
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	3227
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c	1061
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c	1607
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	1853
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c	172
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c	148
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c	315
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c	796
114 files changed, 59953 insertions, 0 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
new file mode 100644
index 0000000..59a376a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -0,0 +1,2829 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * DVA-based Adjustable Replacement Cache
+ *
+ * While much of the theory of operation used here is
+ * based on the self-tuning, low overhead replacement cache
+ * presented by Megiddo and Modha at FAST 2003, there are some
+ * significant differences:
+ *
+ * 1. The Megiddo and Modha model assumes any page is evictable.
+ * Pages in its cache cannot be "locked" into memory. This makes
+ * the eviction algorithm simple: evict the last page in the list.
+ * This also makes the performance characteristics easy to reason
+ * about. Our cache is not so simple. At any given moment, some
+ * subset of the blocks in the cache are un-evictable because we
+ * have handed out a reference to them. Blocks are only evictable
+ * when there are no external references active. This makes
+ * eviction far more problematic: we choose to evict the evictable
+ * blocks that are the "lowest" in the list.
+ *
+ * There are times when it is not possible to evict the requested
+ * space. In these circumstances we are unable to adjust the cache
+ * size. To prevent the cache growing unbounded at these times we
+ * implement a "cache throttle" that slowes the flow of new data
+ * into the cache until we can make space avaiable.
+ *
+ * 2. The Megiddo and Modha model assumes a fixed cache size.
+ * Pages are evicted when the cache is full and there is a cache
+ * miss. Our model has a variable sized cache. It grows with
+ * high use, but also tries to react to memory pressure from the
+ * operating system: decreasing its size when system memory is
+ * tight.
+ *
+ * 3. The Megiddo and Modha model assumes a fixed page size. All
+ * elements of the cache are therefore exactly the same size. So
+ * when adjusting the cache size following a cache miss, it's simply
+ * a matter of choosing a single page to evict. In our model, we
+ * have variable sized cache blocks (ranging from 512 bytes to
+ * 128K bytes). We therefore choose a set of blocks to evict to make
+ * space for a cache miss that approximates as closely as possible
+ * the space used by the new block.
+ *
+ * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
+ * by N. Megiddo & D. Modha, FAST 2003
+ */
+
+/*
+ * The locking model:
+ *
+ * A new reference to a cache buffer can be obtained in two
+ * ways: 1) via a hash table lookup using the DVA as a key,
+ * or 2) via one of the ARC lists. The arc_read() interface
+ * uses method 1, while the internal arc algorithms for
+ * adjusting the cache use method 2. We therefore provide two
+ * types of locks: 1) the hash table lock array, and 2) the
+ * arc list locks.
+ *
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
+ *
+ * buf_hash_find() returns the appropriate mutex (held) when it
+ * locates the requested buffer in the hash table. It returns
+ * NULL for the mutex if the buffer was not in the table.
+ *
+ * buf_hash_remove() expects the appropriate hash mutex to be
+ * already held before it is invoked.
+ *
+ * Each arc state also has a mutex which is used to protect the
+ * buffer list associated with the state. When attempting to
+ * obtain a hash table lock while holding an arc list lock you
+ * must use: mutex_tryenter() to avoid deadlock. Also note that
+ * the active state mutex must be held before the ghost state mutex.
+ *
+ * Arc buffers may have an associated eviction callback function.
+ * This function will be invoked prior to removing the buffer (e.g.
+ * in arc_do_user_evicts()). Note however that the data associated
+ * with the buffer may be evicted prior to the callback. The callback
+ * must be made with *no locks held* (to prevent deadlock). Additionally,
+ * the users of callbacks must ensure that their private data is
+ * protected from simultaneous callbacks from arc_buf_evict()
+ * and arc_do_user_evicts().
+ *
+ * Note that the majority of the performance stats are manipulated
+ * with atomic operations.
+ */
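+
+/*
+ * For example, the canonical lookup pattern under this locking model
+ * (see buf_hash_find() below) is:
+ *
+ *	kmutex_t *hash_lock;
+ *	arc_buf_hdr_t *hdr = buf_hash_find(spa, dva, birth, &hash_lock);
+ *
+ *	if (hdr != NULL) {
+ *		... hdr fields may be used here, since buf_hash_find()
+ *		... returned with hash_lock held ...
+ *		mutex_exit(hash_lock);
+ *	}
+ */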
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/refcount.h>
+#ifdef _KERNEL
+#include <sys/dnlc.h>
+#endif
+#include <sys/callb.h>
+#include <sys/kstat.h>
+#include <sys/sdt.h>
+
+#define ARC_FREE_AT_ONCE 4194304
+
+static kmutex_t arc_reclaim_thr_lock;
+static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
+static uint8_t arc_thread_exit;
+
+#define ARC_REDUCE_DNLC_PERCENT 3
+uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+
+typedef enum arc_reclaim_strategy {
+ ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
+ ARC_RECLAIM_CONS /* Conservative reclaim strategy */
+} arc_reclaim_strategy_t;
+
+/* number of seconds before growing cache again */
+static int arc_grow_retry = 60;
+
+/*
+ * minimum lifespan of a prefetch block in clock ticks
+ * (initialized in arc_init())
+ */
+static int arc_min_prefetch_lifespan;
+
+static int arc_dead;
+
+/*
+ * These tunables are for performance analysis.
+ */
+uint64_t zfs_arc_max;
+uint64_t zfs_arc_min;
+
+/*
+ * Note that buffers can be in one of 5 states:
+ * ARC_anon - anonymous (discussed below)
+ * ARC_mru - recently used, currently cached
+ * ARC_mru_ghost - recently used, no longer in cache
+ * ARC_mfu - frequently used, currently cached
+ * ARC_mfu_ghost - frequently used, no longer in cache
+ * When there are no active references to a buffer, it is
+ * linked onto one of the lists in the arc. These are the
+ * only buffers that can be evicted or deleted.
+ *
+ * Anonymous buffers are buffers that are not associated with
+ * a DVA. These are buffers that hold dirty block copies
+ * before they are written to stable storage. By definition,
+ * they are "ref'd" and are considered part of arc_mru
+ * that cannot be freed. Generally, they will acquire a DVA
+ * as they are written and migrate onto the arc_mru list.
+ */
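+
+/*
+ * A typical buffer lifecycle through these states (see arc_access()
+ * and arc_evict() below):
+ *
+ *	arc_buf_alloc()		buffer starts out in ARC_anon
+ *	written, DVA assigned	migrates onto the ARC_mru list
+ *	accessed again		promoted to ARC_mfu
+ *	data evicted		header moves to the matching ghost state
+ */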
+
+typedef struct arc_state {
+ list_t arcs_list; /* linked list of evictable buffers in state */
+ uint64_t arcs_lsize; /* total size of buffers in the linked list */
+ uint64_t arcs_size; /* total size of all buffers in this state */
+ kmutex_t arcs_mtx;
+} arc_state_t;
+
+/* The 5 states: */
+static arc_state_t ARC_anon;
+static arc_state_t ARC_mru;
+static arc_state_t ARC_mru_ghost;
+static arc_state_t ARC_mfu;
+static arc_state_t ARC_mfu_ghost;
+
+typedef struct arc_stats {
+ kstat_named_t arcstat_hits;
+ kstat_named_t arcstat_misses;
+ kstat_named_t arcstat_demand_data_hits;
+ kstat_named_t arcstat_demand_data_misses;
+ kstat_named_t arcstat_demand_metadata_hits;
+ kstat_named_t arcstat_demand_metadata_misses;
+ kstat_named_t arcstat_prefetch_data_hits;
+ kstat_named_t arcstat_prefetch_data_misses;
+ kstat_named_t arcstat_prefetch_metadata_hits;
+ kstat_named_t arcstat_prefetch_metadata_misses;
+ kstat_named_t arcstat_mru_hits;
+ kstat_named_t arcstat_mru_ghost_hits;
+ kstat_named_t arcstat_mfu_hits;
+ kstat_named_t arcstat_mfu_ghost_hits;
+ kstat_named_t arcstat_deleted;
+ kstat_named_t arcstat_recycle_miss;
+ kstat_named_t arcstat_mutex_miss;
+ kstat_named_t arcstat_evict_skip;
+ kstat_named_t arcstat_hash_elements;
+ kstat_named_t arcstat_hash_elements_max;
+ kstat_named_t arcstat_hash_collisions;
+ kstat_named_t arcstat_hash_chains;
+ kstat_named_t arcstat_hash_chain_max;
+ kstat_named_t arcstat_p;
+ kstat_named_t arcstat_c;
+ kstat_named_t arcstat_c_min;
+ kstat_named_t arcstat_c_max;
+ kstat_named_t arcstat_size;
+} arc_stats_t;
+
+static arc_stats_t arc_stats = {
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 },
+ { "demand_data_hits", KSTAT_DATA_UINT64 },
+ { "demand_data_misses", KSTAT_DATA_UINT64 },
+ { "demand_metadata_hits", KSTAT_DATA_UINT64 },
+ { "demand_metadata_misses", KSTAT_DATA_UINT64 },
+ { "prefetch_data_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_data_misses", KSTAT_DATA_UINT64 },
+ { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
+ { "mru_hits", KSTAT_DATA_UINT64 },
+ { "mru_ghost_hits", KSTAT_DATA_UINT64 },
+ { "mfu_hits", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
+ { "deleted", KSTAT_DATA_UINT64 },
+ { "recycle_miss", KSTAT_DATA_UINT64 },
+ { "mutex_miss", KSTAT_DATA_UINT64 },
+ { "evict_skip", KSTAT_DATA_UINT64 },
+ { "hash_elements", KSTAT_DATA_UINT64 },
+ { "hash_elements_max", KSTAT_DATA_UINT64 },
+ { "hash_collisions", KSTAT_DATA_UINT64 },
+ { "hash_chains", KSTAT_DATA_UINT64 },
+ { "hash_chain_max", KSTAT_DATA_UINT64 },
+ { "p", KSTAT_DATA_UINT64 },
+ { "c", KSTAT_DATA_UINT64 },
+ { "c_min", KSTAT_DATA_UINT64 },
+ { "c_max", KSTAT_DATA_UINT64 },
+ { "size", KSTAT_DATA_UINT64 }
+};
+
+#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
+
+#define ARCSTAT_INCR(stat, val) \
+ atomic_add_64(&arc_stats.stat.value.ui64, (val));
+
+#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
+#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
+
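+/*
+ * Atomically raise a maximum-value statistic to `val': re-read the
+ * current maximum and retry the compare-and-swap until either the
+ * stored value is already >= `val' or the CAS succeeds.
+ */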
+#define ARCSTAT_MAX(stat, val) { \
+ uint64_t m; \
+ while ((val) > (m = arc_stats.stat.value.ui64) && \
+ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
+ continue; \
+}
+
+#define ARCSTAT_MAXSTAT(stat) \
+ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
+
+/*
+ * We define a macro to allow ARC hits/misses to be easily broken down by
+ * two separate conditions, giving a total of four different subtypes for
+ * each of hits and misses (so eight statistics total).
+ */
+#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
+ if (cond1) { \
+ if (cond2) { \
+ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
+ } else { \
+ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
+ } \
+ } else { \
+ if (cond2) { \
+ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
+ } else { \
+ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
+ } \
+ }
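+
+/*
+ * For example, the hit path in arc_buf_add_ref() below uses
+ *
+ *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
+ *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ *	    data, metadata, hits);
+ *
+ * to bump exactly one of the four arcstat_*_hits counters
+ * (demand/prefetch crossed with data/metadata).
+ */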
+
+kstat_t *arc_ksp;
+static arc_state_t *arc_anon;
+static arc_state_t *arc_mru;
+static arc_state_t *arc_mru_ghost;
+static arc_state_t *arc_mfu;
+static arc_state_t *arc_mfu_ghost;
+
+/*
+ * There are several ARC variables that are critical to export as kstats --
+ * but we don't want to have to grovel around in the kstat whenever we wish to
+ * manipulate them. For these variables, we therefore define them to be in
+ * terms of the statistic variable. This assures that we are not introducing
+ * the possibility of inconsistency by having shadow copies of the variables,
+ * while still allowing the code to be readable.
+ */
+#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
+#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
+#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
+#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
+#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
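+
+/*
+ * For example, arc_get_data_buf() below does
+ *	atomic_add_64(&arc_size, size);
+ * which updates the exported kstat in place, with no shadow copy to
+ * keep consistent.
+ */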
+
+static int arc_no_grow; /* Don't try to grow cache size */
+static uint64_t arc_tempreserve;
+
+typedef struct arc_callback arc_callback_t;
+
+struct arc_callback {
+ void *acb_private;
+ arc_done_func_t *acb_done;
+ arc_byteswap_func_t *acb_byteswap;
+ arc_buf_t *acb_buf;
+ zio_t *acb_zio_dummy;
+ arc_callback_t *acb_next;
+};
+
+typedef struct arc_write_callback arc_write_callback_t;
+
+struct arc_write_callback {
+ void *awcb_private;
+ arc_done_func_t *awcb_ready;
+ arc_done_func_t *awcb_done;
+ arc_buf_t *awcb_buf;
+};
+
+struct arc_buf_hdr {
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+ uint64_t b_cksum0;
+
+ kmutex_t b_freeze_lock;
+ zio_cksum_t *b_freeze_cksum;
+
+ arc_buf_hdr_t *b_hash_next;
+ arc_buf_t *b_buf;
+ uint32_t b_flags;
+ uint32_t b_datacnt;
+
+ arc_callback_t *b_acb;
+ kcondvar_t b_cv;
+
+ /* immutable */
+ arc_buf_contents_t b_type;
+ uint64_t b_size;
+ spa_t *b_spa;
+
+ /* protected by arc state mutex */
+ arc_state_t *b_state;
+ list_node_t b_arc_node;
+
+ /* updated atomically */
+ clock_t b_arc_access;
+
+ /* self protecting */
+ refcount_t b_refcnt;
+};
+
+static arc_buf_t *arc_eviction_list;
+static kmutex_t arc_eviction_mtx;
+static arc_buf_hdr_t arc_eviction_hdr;
+static void arc_get_data_buf(arc_buf_t *buf);
+static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
+
+#define GHOST_STATE(state) \
+ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost)
+
+/*
+ * Private ARC flags. These flags are private ARC only flags that will show up
+ * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
+ * be passed in as arc_flags in things like arc_read. However, these flags
+ * should never be passed and should only be set by ARC code. When adding new
+ * public flags, make sure not to smash the private ones.
+ */
+
+#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
+#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
+#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
+#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
+#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
+#define ARC_INDIRECT (1 << 14) /* this is an indirect block */
+
+#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
+#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
+#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
+#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
+#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
+
+/*
+ * Hash table routines
+ */
+
+#define HT_LOCK_PAD 128
+
+struct ht_lock {
+ kmutex_t ht_lock;
+#ifdef _KERNEL
+ unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+#endif
+};
+
+#define BUF_LOCKS 256
+typedef struct buf_hash_table {
+ uint64_t ht_mask;
+ arc_buf_hdr_t **ht_table;
+ struct ht_lock ht_locks[BUF_LOCKS];
+} buf_hash_table_t;
+
+static buf_hash_table_t buf_hash_table;
+
+#define BUF_HASH_INDEX(spa, dva, birth) \
+ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
+#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
+#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
+#define HDR_LOCK(buf) \
+ (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
+
+uint64_t zfs_crc64_table[256];
+
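+/*
+ * Hash a (spa, dva, birth) triple by folding the DVA through the
+ * CRC64 table and mixing in the spa pointer and the birth txg.
+ */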
+static uint64_t
+buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
+{
+ uintptr_t spav = (uintptr_t)spa;
+ uint8_t *vdva = (uint8_t *)dva;
+ uint64_t crc = -1ULL;
+ int i;
+
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+ for (i = 0; i < sizeof (dva_t); i++)
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
+
+ crc ^= (spav>>8) ^ birth;
+
+ return (crc);
+}
+
+#define BUF_EMPTY(buf) \
+ ((buf)->b_dva.dva_word[0] == 0 && \
+ (buf)->b_dva.dva_word[1] == 0 && \
+ (buf)->b_birth == 0)
+
+#define BUF_EQUAL(spa, dva, birth, buf) \
+ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
+ ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
+ ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
+
+static arc_buf_hdr_t *
+buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
+{
+ uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *buf;
+
+ mutex_enter(hash_lock);
+ for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
+ buf = buf->b_hash_next) {
+ if (BUF_EQUAL(spa, dva, birth, buf)) {
+ *lockp = hash_lock;
+ return (buf);
+ }
+ }
+ mutex_exit(hash_lock);
+ *lockp = NULL;
+ return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table. If there is already an element
+ * equal to elem in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ */
+static arc_buf_hdr_t *
+buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
+{
+ uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *fbuf;
+ uint32_t i;
+
+ ASSERT(!HDR_IN_HASH_TABLE(buf));
+ *lockp = hash_lock;
+ mutex_enter(hash_lock);
+ for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
+ fbuf = fbuf->b_hash_next, i++) {
+ if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
+ return (fbuf);
+ }
+
+ buf->b_hash_next = buf_hash_table.ht_table[idx];
+ buf_hash_table.ht_table[idx] = buf;
+ buf->b_flags |= ARC_IN_HASH_TABLE;
+
+ /* collect some hash table performance data */
+ if (i > 0) {
+ ARCSTAT_BUMP(arcstat_hash_collisions);
+ if (i == 1)
+ ARCSTAT_BUMP(arcstat_hash_chains);
+
+ ARCSTAT_MAX(arcstat_hash_chain_max, i);
+ }
+
+ ARCSTAT_BUMP(arcstat_hash_elements);
+ ARCSTAT_MAXSTAT(arcstat_hash_elements);
+
+ return (NULL);
+}
+
+static void
+buf_hash_remove(arc_buf_hdr_t *buf)
+{
+ arc_buf_hdr_t *fbuf, **bufp;
+ uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+
+ ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
+ ASSERT(HDR_IN_HASH_TABLE(buf));
+
+ bufp = &buf_hash_table.ht_table[idx];
+ while ((fbuf = *bufp) != buf) {
+ ASSERT(fbuf != NULL);
+ bufp = &fbuf->b_hash_next;
+ }
+ *bufp = buf->b_hash_next;
+ buf->b_hash_next = NULL;
+ buf->b_flags &= ~ARC_IN_HASH_TABLE;
+
+ /* collect some hash table performance data */
+ ARCSTAT_BUMPDOWN(arcstat_hash_elements);
+
+ if (buf_hash_table.ht_table[idx] &&
+ buf_hash_table.ht_table[idx]->b_hash_next == NULL)
+ ARCSTAT_BUMPDOWN(arcstat_hash_chains);
+}
+
+/*
+ * Global data structures and functions for the buf kmem cache.
+ */
+static kmem_cache_t *hdr_cache;
+static kmem_cache_t *buf_cache;
+
+static void
+buf_fini(void)
+{
+ int i;
+
+ kmem_free(buf_hash_table.ht_table,
+ (buf_hash_table.ht_mask + 1) * sizeof (void *));
+ for (i = 0; i < BUF_LOCKS; i++)
+ mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+ kmem_cache_destroy(hdr_cache);
+ kmem_cache_destroy(buf_cache);
+}
+
+/*
+ * Constructor callback - called when the cache is empty
+ * and a new buf is requested.
+ */
+/* ARGSUSED */
+static int
+hdr_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *buf = vbuf;
+
+ bzero(buf, sizeof (arc_buf_hdr_t));
+ refcount_create(&buf->b_refcnt);
+ cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
+ return (0);
+}
+
+/*
+ * Destructor callback - called when a cached buf is
+ * no longer required.
+ */
+/* ARGSUSED */
+static void
+hdr_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *buf = vbuf;
+
+ refcount_destroy(&buf->b_refcnt);
+ cv_destroy(&buf->b_cv);
+}
+
+/*
+ * Reclaim callback -- invoked when memory is low.
+ */
+/* ARGSUSED */
+static void
+hdr_recl(void *unused)
+{
+ dprintf("hdr_recl called\n");
+ /*
+ * umem calls the reclaim func when we destroy the buf cache,
+ * which is after we do arc_fini().
+ */
+ if (!arc_dead)
+ cv_signal(&arc_reclaim_thr_cv);
+}
+
+static void
+buf_init(void)
+{
+ uint64_t *ct;
+ uint64_t hsize = 1ULL << 12;
+ int i, j;
+
+ /*
+ * The hash table is big enough to fill all of physical memory
+ * with an average 64K block size. The table will take up
+ * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
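+ * For example (illustrative numbers): with 4GB of physical memory and
+ * 8-byte pointers, the loop below settles on 4GB/64K = 65536 buckets,
+ * i.e. a 512KB hash table.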
+ */
+ while (hsize * 65536 < physmem * PAGESIZE)
+ hsize <<= 1;
+retry:
+ buf_hash_table.ht_mask = hsize - 1;
+ buf_hash_table.ht_table =
+ kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+ if (buf_hash_table.ht_table == NULL) {
+ ASSERT(hsize > (1ULL << 8));
+ hsize >>= 1;
+ goto retry;
+ }
+
+ hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
+ 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+ buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ for (i = 0; i < 256; i++)
+ for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
+ *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
+
+ for (i = 0; i < BUF_LOCKS; i++) {
+ mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ }
+}
+
+#define ARC_MINTIME (hz>>4) /* 62 ms */
+
+static void
+arc_cksum_verify(arc_buf_t *buf)
+{
+ zio_cksum_t zc;
+
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ mutex_enter(&buf->b_hdr->b_freeze_lock);
+ if (buf->b_hdr->b_freeze_cksum == NULL ||
+ (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+ return;
+ }
+ fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
+ if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
+ panic("buffer modified while frozen!");
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+}
+
+static void
+arc_cksum_compute(arc_buf_t *buf)
+{
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ mutex_enter(&buf->b_hdr->b_freeze_lock);
+ if (buf->b_hdr->b_freeze_cksum != NULL) {
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+ return;
+ }
+ buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
+ fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
+ buf->b_hdr->b_freeze_cksum);
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+}
+
+void
+arc_buf_thaw(arc_buf_t *buf)
+{
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ if (buf->b_hdr->b_state != arc_anon)
+ panic("modifying non-anon buffer!");
+ if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
+ panic("modifying buffer while i/o in progress!");
+ arc_cksum_verify(buf);
+ mutex_enter(&buf->b_hdr->b_freeze_lock);
+ if (buf->b_hdr->b_freeze_cksum != NULL) {
+ kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+ buf->b_hdr->b_freeze_cksum = NULL;
+ }
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+}
+
+void
+arc_buf_freeze(arc_buf_t *buf)
+{
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
+ buf->b_hdr->b_state == arc_anon);
+ arc_cksum_compute(buf);
+}
+
+static void
+add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+{
+ ASSERT(MUTEX_HELD(hash_lock));
+
+ if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
+ (ab->b_state != arc_anon)) {
+ uint64_t delta = ab->b_size * ab->b_datacnt;
+
+ ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
+ mutex_enter(&ab->b_state->arcs_mtx);
+ ASSERT(list_link_active(&ab->b_arc_node));
+ list_remove(&ab->b_state->arcs_list, ab);
+ if (GHOST_STATE(ab->b_state)) {
+ ASSERT3U(ab->b_datacnt, ==, 0);
+ ASSERT3P(ab->b_buf, ==, NULL);
+ delta = ab->b_size;
+ }
+ ASSERT(delta > 0);
+ ASSERT3U(ab->b_state->arcs_lsize, >=, delta);
+ atomic_add_64(&ab->b_state->arcs_lsize, -delta);
+ mutex_exit(&ab->b_state->arcs_mtx);
+ /* remove the prefetch flag if we get a reference */
+ if (ab->b_flags & ARC_PREFETCH)
+ ab->b_flags &= ~ARC_PREFETCH;
+ }
+}
+
+static int
+remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+{
+ int cnt;
+ arc_state_t *state = ab->b_state;
+
+ ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
+ ASSERT(!GHOST_STATE(state));
+
+ if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
+ (state != arc_anon)) {
+ ASSERT(!MUTEX_HELD(&state->arcs_mtx));
+ mutex_enter(&state->arcs_mtx);
+ ASSERT(!list_link_active(&ab->b_arc_node));
+ list_insert_head(&state->arcs_list, ab);
+ ASSERT(ab->b_datacnt > 0);
+ atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt);
+ ASSERT3U(state->arcs_size, >=, state->arcs_lsize);
+ mutex_exit(&state->arcs_mtx);
+ }
+ return (cnt);
+}
+
+/*
+ * Move the supplied buffer to the indicated state. The mutex
+ * for the buffer must be held by the caller.
+ */
+static void
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
+{
+ arc_state_t *old_state = ab->b_state;
+ int64_t refcnt = refcount_count(&ab->b_refcnt);
+ uint64_t from_delta, to_delta;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(new_state != old_state);
+ ASSERT(refcnt == 0 || ab->b_datacnt > 0);
+ ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+
+ from_delta = to_delta = ab->b_datacnt * ab->b_size;
+
+ /*
+ * If this buffer is evictable, transfer it from the
+ * old state list to the new state list.
+ */
+ if (refcnt == 0) {
+ if (old_state != arc_anon) {
+ int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
+
+ if (use_mutex)
+ mutex_enter(&old_state->arcs_mtx);
+
+ ASSERT(list_link_active(&ab->b_arc_node));
+ list_remove(&old_state->arcs_list, ab);
+
+ /*
+ * If prefetching out of the ghost cache,
+ * we will have a non-null datacnt.
+ */
+ if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
+ /* ghost elements have a ghost size */
+ ASSERT(ab->b_buf == NULL);
+ from_delta = ab->b_size;
+ }
+ ASSERT3U(old_state->arcs_lsize, >=, from_delta);
+ atomic_add_64(&old_state->arcs_lsize, -from_delta);
+
+ if (use_mutex)
+ mutex_exit(&old_state->arcs_mtx);
+ }
+ if (new_state != arc_anon) {
+ int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
+
+ if (use_mutex)
+ mutex_enter(&new_state->arcs_mtx);
+
+ list_insert_head(&new_state->arcs_list, ab);
+
+ /* ghost elements have a ghost size */
+ if (GHOST_STATE(new_state)) {
+ ASSERT(ab->b_datacnt == 0);
+ ASSERT(ab->b_buf == NULL);
+ to_delta = ab->b_size;
+ }
+ atomic_add_64(&new_state->arcs_lsize, to_delta);
+ ASSERT3U(new_state->arcs_size + to_delta, >=,
+ new_state->arcs_lsize);
+
+ if (use_mutex)
+ mutex_exit(&new_state->arcs_mtx);
+ }
+ }
+
+ ASSERT(!BUF_EMPTY(ab));
+ if (new_state == arc_anon && old_state != arc_anon) {
+ buf_hash_remove(ab);
+ }
+
+ /* adjust state sizes */
+ if (to_delta)
+ atomic_add_64(&new_state->arcs_size, to_delta);
+ if (from_delta) {
+ ASSERT3U(old_state->arcs_size, >=, from_delta);
+ atomic_add_64(&old_state->arcs_size, -from_delta);
+ }
+ ab->b_state = new_state;
+}
+
+arc_buf_t *
+arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+
+ ASSERT3U(size, >, 0);
+ hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ ASSERT(BUF_EMPTY(hdr));
+ hdr->b_size = size;
+ hdr->b_type = type;
+ hdr->b_spa = spa;
+ hdr->b_state = arc_anon;
+ hdr->b_arc_access = 0;
+ mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf->b_hdr = hdr;
+ buf->b_data = NULL;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_next = NULL;
+ hdr->b_buf = buf;
+ arc_get_data_buf(buf);
+ hdr->b_datacnt = 1;
+ hdr->b_flags = 0;
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ (void) refcount_add(&hdr->b_refcnt, tag);
+
+ return (buf);
+}
+
+static arc_buf_t *
+arc_buf_clone(arc_buf_t *from)
+{
+ arc_buf_t *buf;
+ arc_buf_hdr_t *hdr = from->b_hdr;
+ uint64_t size = hdr->b_size;
+
+ buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf->b_hdr = hdr;
+ buf->b_data = NULL;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_next = hdr->b_buf;
+ hdr->b_buf = buf;
+ arc_get_data_buf(buf);
+ bcopy(from->b_data, buf->b_data, size);
+ hdr->b_datacnt += 1;
+ return (buf);
+}
+
+void
+arc_buf_add_ref(arc_buf_t *buf, void* tag)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+
+ /*
+ * Check to see if this buffer is currently being evicted via
+ * arc_do_user_evicts().
+ */
+ mutex_enter(&arc_eviction_mtx);
+ hdr = buf->b_hdr;
+ if (hdr == NULL) {
+ mutex_exit(&arc_eviction_mtx);
+ return;
+ }
+ hash_lock = HDR_LOCK(hdr);
+ mutex_exit(&arc_eviction_mtx);
+
+ mutex_enter(hash_lock);
+ if (buf->b_data == NULL) {
+ /*
+ * This buffer has been evicted.
+ */
+ mutex_exit(hash_lock);
+ return;
+ }
+
+ ASSERT(buf->b_hdr == hdr);
+ ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+ add_reference(hdr, hash_lock, tag);
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_hits);
+ ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
+ demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ data, metadata, hits);
+}
+
+static void
+arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
+{
+ arc_buf_t **bufp;
+
+ /* free up data associated with the buf */
+ if (buf->b_data) {
+ arc_state_t *state = buf->b_hdr->b_state;
+ uint64_t size = buf->b_hdr->b_size;
+ arc_buf_contents_t type = buf->b_hdr->b_type;
+
+ arc_cksum_verify(buf);
+ if (!recycle) {
+ if (type == ARC_BUFC_METADATA) {
+ zio_buf_free(buf->b_data, size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ zio_data_buf_free(buf->b_data, size);
+ }
+ atomic_add_64(&arc_size, -size);
+ }
+ if (list_link_active(&buf->b_hdr->b_arc_node)) {
+ ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
+ ASSERT(state != arc_anon);
+ ASSERT3U(state->arcs_lsize, >=, size);
+ atomic_add_64(&state->arcs_lsize, -size);
+ }
+ ASSERT3U(state->arcs_size, >=, size);
+ atomic_add_64(&state->arcs_size, -size);
+ buf->b_data = NULL;
+ ASSERT(buf->b_hdr->b_datacnt > 0);
+ buf->b_hdr->b_datacnt -= 1;
+ }
+
+ /* only remove the buf if requested */
+ if (!all)
+ return;
+
+ /* remove the buf from the hdr list */
+ for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
+ continue;
+ *bufp = buf->b_next;
+
+ ASSERT(buf->b_efunc == NULL);
+
+ /* clean up the buf */
+ buf->b_hdr = NULL;
+ kmem_cache_free(buf_cache, buf);
+}
+
+static void
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
+{
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ ASSERT3P(hdr->b_state, ==, arc_anon);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+
+ if (!BUF_EMPTY(hdr)) {
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+ }
+ while (hdr->b_buf) {
+ arc_buf_t *buf = hdr->b_buf;
+
+ if (buf->b_efunc) {
+ mutex_enter(&arc_eviction_mtx);
+ ASSERT(buf->b_hdr != NULL);
+ arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
+ hdr->b_buf = buf->b_next;
+ buf->b_hdr = &arc_eviction_hdr;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ mutex_exit(&arc_eviction_mtx);
+ } else {
+ arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
+ }
+ }
+ if (hdr->b_freeze_cksum != NULL) {
+ kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
+ hdr->b_freeze_cksum = NULL;
+ }
+ mutex_destroy(&hdr->b_freeze_lock);
+
+ ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT3P(hdr->b_hash_next, ==, NULL);
+ ASSERT3P(hdr->b_acb, ==, NULL);
+ kmem_cache_free(hdr_cache, hdr);
+}
+
+void
+arc_buf_free(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ int hashed = hdr->b_state != arc_anon;
+
+ ASSERT(buf->b_efunc == NULL);
+ ASSERT(buf->b_data != NULL);
+
+ if (hashed) {
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+ mutex_enter(hash_lock);
+ (void) remove_reference(hdr, hash_lock, tag);
+ if (hdr->b_datacnt > 1)
+ arc_buf_destroy(buf, FALSE, TRUE);
+ else
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
+ mutex_exit(hash_lock);
+ } else if (HDR_IO_IN_PROGRESS(hdr)) {
+ int destroy_hdr;
+ /*
+ * We are in the middle of an async write. Don't destroy
+ * this buffer unless the write completes before we finish
+ * decrementing the reference count.
+ */
+ mutex_enter(&arc_eviction_mtx);
+ (void) remove_reference(hdr, NULL, tag);
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
+ mutex_exit(&arc_eviction_mtx);
+ if (destroy_hdr)
+ arc_hdr_destroy(hdr);
+ } else {
+ if (remove_reference(hdr, NULL, tag) > 0) {
+ ASSERT(HDR_IO_ERROR(hdr));
+ arc_buf_destroy(buf, FALSE, TRUE);
+ } else {
+ arc_hdr_destroy(hdr);
+ }
+ }
+}
+
+int
+arc_buf_remove_ref(arc_buf_t *buf, void* tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ int no_callback = (buf->b_efunc == NULL);
+
+ if (hdr->b_state == arc_anon) {
+ arc_buf_free(buf, tag);
+ return (no_callback);
+ }
+
+ mutex_enter(hash_lock);
+ ASSERT(hdr->b_state != arc_anon);
+ ASSERT(buf->b_data != NULL);
+
+ (void) remove_reference(hdr, hash_lock, tag);
+ if (hdr->b_datacnt > 1) {
+ if (no_callback)
+ arc_buf_destroy(buf, FALSE, TRUE);
+ } else if (no_callback) {
+ ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
+ }
+ ASSERT(no_callback || hdr->b_datacnt > 1 ||
+ refcount_is_zero(&hdr->b_refcnt));
+ mutex_exit(hash_lock);
+ return (no_callback);
+}
+
+int
+arc_buf_size(arc_buf_t *buf)
+{
+ return (buf->b_hdr->b_size);
+}
+
+/*
+ * Evict buffers from list until we've removed the specified number of
+ * bytes. Move the removed buffers to the appropriate evict state.
+ * If the recycle flag is set, then attempt to "recycle" a buffer:
+ * - look for a buffer to evict that is `bytes' long.
+ * - return the data block from this buffer rather than freeing it.
+ * This flag is used by callers that are trying to make space for a
+ * new buffer in a full arc cache.
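+ *
+ * For example, arc_get_data_buf() below tries the recycle path first:
+ *
+ *	buf->b_data = arc_evict(state, size, TRUE, type);
+ *
+ * and only falls back to a fresh zio_buf_alloc()/zio_data_buf_alloc()
+ * when no buffer of exactly `size' bytes could be recycled.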
+ */
+static void *
+arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
+ arc_buf_contents_t type)
+{
+ arc_state_t *evicted_state;
+ uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
+ arc_buf_hdr_t *ab, *ab_prev = NULL;
+ kmutex_t *hash_lock;
+ boolean_t have_lock;
+ void *stolen = NULL;
+
+ ASSERT(state == arc_mru || state == arc_mfu);
+
+ evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+ mutex_enter(&state->arcs_mtx);
+ mutex_enter(&evicted_state->arcs_mtx);
+
+ for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
+ ab_prev = list_prev(&state->arcs_list, ab);
+ /* prefetch buffers have a minimum lifespan */
+ if (HDR_IO_IN_PROGRESS(ab) ||
+ (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
+ lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
+ skipped++;
+ continue;
+ }
+ /* "lookahead" for better eviction candidate */
+ if (recycle && ab->b_size != bytes &&
+ ab_prev && ab_prev->b_size == bytes)
+ continue;
+ hash_lock = HDR_LOCK(ab);
+ have_lock = MUTEX_HELD(hash_lock);
+ if (have_lock || mutex_tryenter(hash_lock)) {
+ ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+ ASSERT(ab->b_datacnt > 0);
+ while (ab->b_buf) {
+ arc_buf_t *buf = ab->b_buf;
+ if (buf->b_data) {
+ bytes_evicted += ab->b_size;
+ if (recycle && ab->b_type == type &&
+ ab->b_size == bytes) {
+ stolen = buf->b_data;
+ recycle = FALSE;
+ }
+ }
+ if (buf->b_efunc) {
+ mutex_enter(&arc_eviction_mtx);
+ arc_buf_destroy(buf,
+ buf->b_data == stolen, FALSE);
+ ab->b_buf = buf->b_next;
+ buf->b_hdr = &arc_eviction_hdr;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ mutex_exit(&arc_eviction_mtx);
+ } else {
+ arc_buf_destroy(buf,
+ buf->b_data == stolen, TRUE);
+ }
+ }
+ ASSERT(ab->b_datacnt == 0);
+ arc_change_state(evicted_state, ab, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(ab));
+ ab->b_flags = ARC_IN_HASH_TABLE;
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
+ if (!have_lock)
+ mutex_exit(hash_lock);
+ if (bytes >= 0 && bytes_evicted >= bytes)
+ break;
+ } else {
+ missed += 1;
+ }
+ }
+
+ mutex_exit(&evicted_state->arcs_mtx);
+ mutex_exit(&state->arcs_mtx);
+
+ if (bytes_evicted < bytes)
+ dprintf("only evicted %lld bytes from %x",
+ (longlong_t)bytes_evicted, state);
+
+ if (skipped)
+ ARCSTAT_INCR(arcstat_evict_skip, skipped);
+
+ if (missed)
+ ARCSTAT_INCR(arcstat_mutex_miss, missed);
+
+ return (stolen);
+}
+
+/*
+ * Remove buffers from list until we've removed the specified number of
+ * bytes. Destroy the buffers that are removed.
+ */
+static void
+arc_evict_ghost(arc_state_t *state, int64_t bytes)
+{
+ arc_buf_hdr_t *ab, *ab_prev;
+ kmutex_t *hash_lock;
+ uint64_t bytes_deleted = 0;
+ uint64_t bufs_skipped = 0;
+
+ ASSERT(GHOST_STATE(state));
+top:
+ mutex_enter(&state->arcs_mtx);
+ for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
+ ab_prev = list_prev(&state->arcs_list, ab);
+ hash_lock = HDR_LOCK(ab);
+ if (mutex_tryenter(hash_lock)) {
+ ASSERT(!HDR_IO_IN_PROGRESS(ab));
+ ASSERT(ab->b_buf == NULL);
+ arc_change_state(arc_anon, ab, hash_lock);
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_deleted);
+ bytes_deleted += ab->b_size;
+ arc_hdr_destroy(ab);
+ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
+ if (bytes >= 0 && bytes_deleted >= bytes)
+ break;
+ } else {
+ if (bytes < 0) {
+ mutex_exit(&state->arcs_mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+ bufs_skipped += 1;
+ }
+ }
+ mutex_exit(&state->arcs_mtx);
+
+ if (bufs_skipped) {
+ ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
+ ASSERT(bytes >= 0);
+ }
+
+ if (bytes_deleted < bytes)
+ dprintf("only deleted %lld bytes from %p",
+ (longlong_t)bytes_deleted, state);
+}
+
+static void
+arc_adjust(void)
+{
+ int64_t top_sz, mru_over, arc_over, todelete;
+
+ top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
+
+ if (top_sz > arc_p && arc_mru->arcs_lsize > 0) {
+ int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p);
+ (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF);
+ top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
+ }
+
+ mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
+
+ if (mru_over > 0) {
+ if (arc_mru_ghost->arcs_lsize > 0) {
+ todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over);
+ arc_evict_ghost(arc_mru_ghost, todelete);
+ }
+ }
+
+ if ((arc_over = arc_size - arc_c) > 0) {
+ int64_t tbl_over;
+
+ if (arc_mfu->arcs_lsize > 0) {
+ int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over);
+ (void) arc_evict(arc_mfu, toevict, FALSE,
+ ARC_BUFC_UNDEF);
+ }
+
+ tbl_over = arc_size + arc_mru_ghost->arcs_lsize +
+ arc_mfu_ghost->arcs_lsize - arc_c*2;
+
+ if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) {
+ todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over);
+ arc_evict_ghost(arc_mfu_ghost, todelete);
+ }
+ }
+}
+
+static void
+arc_do_user_evicts(void)
+{
+ mutex_enter(&arc_eviction_mtx);
+ while (arc_eviction_list != NULL) {
+ arc_buf_t *buf = arc_eviction_list;
+ arc_eviction_list = buf->b_next;
+ buf->b_hdr = NULL;
+ mutex_exit(&arc_eviction_mtx);
+
+ if (buf->b_efunc != NULL)
+ VERIFY(buf->b_efunc(buf) == 0);
+
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ kmem_cache_free(buf_cache, buf);
+ mutex_enter(&arc_eviction_mtx);
+ }
+ mutex_exit(&arc_eviction_mtx);
+}
+
+/*
+ * Flush all *evictable* data from the cache.
+ * NOTE: this will not touch "active" (i.e. referenced) data.
+ */
+void
+arc_flush(void)
+{
+ while (list_head(&arc_mru->arcs_list))
+ (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF);
+ while (list_head(&arc_mfu->arcs_list))
+ (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF);
+
+ arc_evict_ghost(arc_mru_ghost, -1);
+ arc_evict_ghost(arc_mfu_ghost, -1);
+
+ mutex_enter(&arc_reclaim_thr_lock);
+ arc_do_user_evicts();
+ mutex_exit(&arc_reclaim_thr_lock);
+ ASSERT(arc_eviction_list == NULL);
+}
+
+int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */
+
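+/*
+ * Shrink the target cache size arc_c (and arc_p) by
+ * 1/2^arc_shrink_shift of their current values (arc_c is bounded
+ * below by arc_c_min), then evict via arc_adjust() if we are still
+ * over target.
+ */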
+void
+arc_shrink(void)
+{
+ if (arc_c > arc_c_min) {
+ uint64_t to_free;
+
+#ifdef _KERNEL
+ to_free = arc_c >> arc_shrink_shift;
+#else
+ to_free = arc_c >> arc_shrink_shift;
+#endif
+ if (arc_c > arc_c_min + to_free)
+ atomic_add_64(&arc_c, -to_free);
+ else
+ arc_c = arc_c_min;
+
+ atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
+ if (arc_c > arc_size)
+ arc_c = MAX(arc_size, arc_c_min);
+ if (arc_p > arc_c)
+ arc_p = (arc_c >> 1);
+ ASSERT(arc_c >= arc_c_min);
+ ASSERT((int64_t)arc_p >= 0);
+ }
+
+ if (arc_size > arc_c)
+ arc_adjust();
+}
+
+static int zfs_needfree = 0;
+
+static int
+arc_reclaim_needed(void)
+{
+#if 0
+ uint64_t extra;
+#endif
+
+#ifdef _KERNEL
+
+ if (zfs_needfree)
+ return (1);
+
+#if 0
+ /*
+ * check to make sure that swapfs has enough space so that anon
+ * reservations can still succeed. anon_resvmem() checks that the
+ * availrmem is greater than swapfs_minfree, and the number of reserved
+ * swap pages. We also add a bit of extra here just to prevent
+ * circumstances from getting really dire.
+ */
+ if (availrmem < swapfs_minfree + swapfs_reserve + extra)
+ return (1);
+
+ /*
+ * If zio data pages are being allocated out of a separate heap segment,
+ * then check that the size of available vmem for this area remains
+ * above 1/4th free. This needs to be done when the size of the
+ * non-default segment is smaller than physical memory, so we could
+ * conceivably run out of VA in that segment before running out of
+ * physical memory.
+ */
+ if (zio_arena != NULL) {
+ size_t arc_ziosize =
+ btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC));
+
+ if ((physmem > arc_ziosize) &&
+ (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2))
+ return (1);
+ }
+
+#if defined(__i386)
+ /*
+ * If we're on an i386 platform, it's possible that we'll exhaust the
+ * kernel heap space before we ever run out of available physical
+ * memory. Most checks of the size of the heap_area compare against
+ * tune.t_minarmem, which is the minimum available real memory that we
+ * can have in the system. However, this is generally fixed at 25 pages
+ * which is so low that it's useless. In this comparison, we seek to
+ * calculate the total heap-size, and reclaim if more than 3/4ths of the
+ * heap is allocated. (Or, in the calculation, if less than 1/4th is
+ * free)
+ */
+ if (btop(vmem_size(heap_arena, VMEM_FREE)) <
+ (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
+ return (1);
+#endif
+#else
+ if (kmem_map->size > (vm_kmem_size * 3) / 4)
+ return (1);
+#endif
+
+#else
+ if (spa_get_random(100) == 0)
+ return (1);
+#endif
+ return (0);
+}
+
+static void
+arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+{
+#ifdef ZIO_USE_UMA
+ size_t i;
+ kmem_cache_t *prev_cache = NULL;
+ kmem_cache_t *prev_data_cache = NULL;
+ extern kmem_cache_t *zio_buf_cache[];
+ extern kmem_cache_t *zio_data_buf_cache[];
+#endif
+
+#ifdef _KERNEL
+ /*
+ * First purge some DNLC entries, in case the DNLC is using
+ * up too much memory.
+ */
+ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
+
+#if defined(__i386)
+ /*
+ * Reclaim unused memory from all kmem caches.
+ */
+ kmem_reap();
+#endif
+#endif
+
+ /*
+ * An aggressive reclamation will shrink the cache size as well as
+ * reap free buffers from the arc kmem caches.
+ */
+ if (strat == ARC_RECLAIM_AGGR)
+ arc_shrink();
+
+#ifdef ZIO_USE_UMA
+ for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+ if (zio_buf_cache[i] != prev_cache) {
+ prev_cache = zio_buf_cache[i];
+ kmem_cache_reap_now(zio_buf_cache[i]);
+ }
+ if (zio_data_buf_cache[i] != prev_data_cache) {
+ prev_data_cache = zio_data_buf_cache[i];
+ kmem_cache_reap_now(zio_data_buf_cache[i]);
+ }
+ }
+#endif
+ kmem_cache_reap_now(buf_cache);
+ kmem_cache_reap_now(hdr_cache);
+}
+
+static void
+arc_reclaim_thread(void *dummy __unused)
+{
+ clock_t growtime = 0;
+ arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
+ callb_cpr_t cpr;
+
+ CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&arc_reclaim_thr_lock);
+ while (arc_thread_exit == 0) {
+ if (arc_reclaim_needed()) {
+
+ if (arc_no_grow) {
+ if (last_reclaim == ARC_RECLAIM_CONS) {
+ last_reclaim = ARC_RECLAIM_AGGR;
+ } else {
+ last_reclaim = ARC_RECLAIM_CONS;
+ }
+ } else {
+ arc_no_grow = TRUE;
+ last_reclaim = ARC_RECLAIM_AGGR;
+ membar_producer();
+ }
+
+ /* reset the growth delay for every reclaim */
+ growtime = lbolt + (arc_grow_retry * hz);
+ ASSERT(growtime > 0);
+
+ if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) {
+ /*
+ * If zfs_needfree is TRUE, our vm_lowmem hook
+ * was called; in that case we must free some
+ * memory, so switch to aggressive mode.
+ */
+ arc_no_grow = TRUE;
+ last_reclaim = ARC_RECLAIM_AGGR;
+ }
+ arc_kmem_reap_now(last_reclaim);
+ } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
+ arc_no_grow = FALSE;
+ }
+
+ if (zfs_needfree ||
+ (2 * arc_c < arc_size +
+ arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size))
+ arc_adjust();
+
+ if (arc_eviction_list != NULL)
+ arc_do_user_evicts();
+
+ if (arc_reclaim_needed()) {
+ zfs_needfree = 0;
+#ifdef _KERNEL
+ wakeup(&zfs_needfree);
+#endif
+ }
+
+ /* block until needed, or one second, whichever is shorter */
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait(&arc_reclaim_thr_cv,
+ &arc_reclaim_thr_lock, hz);
+ CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+ }
+
+ arc_thread_exit = 0;
+ cv_broadcast(&arc_reclaim_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
+ thread_exit();
+}
+
+/*
+ * Adapt arc info given the number of bytes we are trying to add and
+ * the state that we are coming from. This function is only called
+ * when we are adding new content to the cache.
+ */
+static void
+arc_adapt(int bytes, arc_state_t *state)
+{
+ int mult;
+
+ ASSERT(bytes > 0);
+ /*
+ * Adapt the target size of the MRU list:
+ * - if we just hit in the MRU ghost list, then increase
+ * the target size of the MRU list.
+ * - if we just hit in the MFU ghost list, then increase
+ * the target size of the MFU list by decreasing the
+ * target size of the MRU list.
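+ *
+ * For example (illustrative sizes): on an MRU ghost hit where
+ * arc_mfu_ghost is three times the size of arc_mru_ghost, "mult"
+ * below is 3 and arc_p grows by 3 * bytes (clamped to arc_c).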
+ */
+ if (state == arc_mru_ghost) {
+ mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
+ 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
+
+ arc_p = MIN(arc_c, arc_p + bytes * mult);
+ } else if (state == arc_mfu_ghost) {
+ mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
+ 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
+
+ arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
+ }
+ ASSERT((int64_t)arc_p >= 0);
+
+ if (arc_reclaim_needed()) {
+ cv_signal(&arc_reclaim_thr_cv);
+ return;
+ }
+
+ if (arc_no_grow)
+ return;
+
+ if (arc_c >= arc_c_max)
+ return;
+
+ /*
+ * If we're within (2 * maxblocksize) bytes of the target
+ * cache size, increment the target cache size
+ */
+ if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+ atomic_add_64(&arc_c, (int64_t)bytes);
+ if (arc_c > arc_c_max)
+ arc_c = arc_c_max;
+ else if (state == arc_anon)
+ atomic_add_64(&arc_p, (int64_t)bytes);
+ if (arc_p > arc_c)
+ arc_p = arc_c;
+ }
+ ASSERT((int64_t)arc_p >= 0);
+}
+
+/*
+ * Check if the cache has reached its limits and eviction is required
+ * prior to insert.
+ */
+static int
+arc_evict_needed()
+{
+ if (arc_reclaim_needed())
+ return (1);
+
+ return (arc_size > arc_c);
+}
+
+/*
+ * The buffer, supplied as the first argument, needs a data block.
+ * So, if we are at cache max, determine which cache should be victimized.
+ * We have the following cases:
+ *
+ * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
+ * In this situation if we're out of space, but the resident size of the MFU is
+ * under the limit, victimize the MFU cache to satisfy this insertion request.
+ *
+ * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
+ * Here, we've used up all of the available space for the MRU, so we need to
+ * evict from our own cache instead. Evict from the set of resident MRU
+ * entries.
+ *
+ * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
+ * c minus p represents the MFU space in the cache, since p is the size of the
+ * cache that is dedicated to the MRU. In this situation there's still space on
+ * the MFU side, so the MRU side needs to be victimized.
+ *
+ * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
+ * MFU's resident set is consuming more space than it has been allotted. In
+ * this situation, we must victimize our own cache, the MFU, for this insertion.
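+ *
+ * Worked example (illustrative numbers): with c = 100MB and p = 60MB,
+ * an insert for the MRU while sizeof(arc_anon + arc_mru) = 70MB falls
+ * under case 2 above (p <= 70MB), so we evict from the resident MRU
+ * list itself.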
+ */
+static void
+arc_get_data_buf(arc_buf_t *buf)
+{
+ arc_state_t *state = buf->b_hdr->b_state;
+ uint64_t size = buf->b_hdr->b_size;
+ arc_buf_contents_t type = buf->b_hdr->b_type;
+
+ arc_adapt(size, state);
+
+ /*
+ * We have not yet reached cache maximum size,
+ * just allocate a new buffer.
+ */
+ if (!arc_evict_needed()) {
+ if (type == ARC_BUFC_METADATA) {
+ buf->b_data = zio_buf_alloc(size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ buf->b_data = zio_data_buf_alloc(size);
+ }
+ atomic_add_64(&arc_size, size);
+ goto out;
+ }
+
+ /*
+ * If we are prefetching from the mfu ghost list, this buffer
+ * will end up on the mru list; so steal space from there.
+ */
+ if (state == arc_mfu_ghost)
+ state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
+ else if (state == arc_mru_ghost)
+ state = arc_mru;
+
+ if (state == arc_mru || state == arc_anon) {
+ uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
+ state = (arc_p > mru_used) ? arc_mfu : arc_mru;
+ } else {
+ /* MFU cases */
+ uint64_t mfu_space = arc_c - arc_p;
+ state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
+ }
+ if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) {
+ if (type == ARC_BUFC_METADATA) {
+ buf->b_data = zio_buf_alloc(size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ buf->b_data = zio_data_buf_alloc(size);
+ }
+ atomic_add_64(&arc_size, size);
+ ARCSTAT_BUMP(arcstat_recycle_miss);
+ }
+ ASSERT(buf->b_data != NULL);
+out:
+ /*
+ * Update the state size. Note that ghost states have a
+ * "ghost size" and so don't need to be updated.
+ */
+ if (!GHOST_STATE(buf->b_hdr->b_state)) {
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ atomic_add_64(&hdr->b_state->arcs_size, size);
+ if (list_link_active(&hdr->b_arc_node)) {
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ atomic_add_64(&hdr->b_state->arcs_lsize, size);
+ }
+ /*
+ * If we are growing the cache, and we are adding anonymous
+ * data, and we have outgrown arc_p, update arc_p
+ */
+ if (arc_size < arc_c && hdr->b_state == arc_anon &&
+ arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
+ arc_p = MIN(arc_c, arc_p + size);
+ }
+}
+
+/*
+ * This routine is called whenever a buffer is accessed.
+ * NOTE: the hash lock must be held by the caller and is not dropped here.
+ */
+static void
+arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
+{
+ ASSERT(MUTEX_HELD(hash_lock));
+
+ if (buf->b_state == arc_anon) {
+ /*
+ * This buffer is not in the cache, and does not
+ * appear in our "ghost" list. Add the new buffer
+ * to the MRU state.
+ */
+
+ ASSERT(buf->b_arc_access == 0);
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+ arc_change_state(arc_mru, buf, hash_lock);
+
+ } else if (buf->b_state == arc_mru) {
+ /*
+ * If this buffer is here because of a prefetch, then either:
+ * - clear the flag if this is a "referencing" read
+ * (any subsequent access will bump this into the MFU state).
+ * or
+ * - move the buffer to the head of the list if this is
+ * another prefetch (to make it less likely to be evicted).
+ */
+ if ((buf->b_flags & ARC_PREFETCH) != 0) {
+ if (refcount_count(&buf->b_refcnt) == 0) {
+ ASSERT(list_link_active(&buf->b_arc_node));
+ mutex_enter(&arc_mru->arcs_mtx);
+ list_remove(&arc_mru->arcs_list, buf);
+ list_insert_head(&arc_mru->arcs_list, buf);
+ mutex_exit(&arc_mru->arcs_mtx);
+ } else {
+ buf->b_flags &= ~ARC_PREFETCH;
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ }
+ buf->b_arc_access = lbolt;
+ return;
+ }
+
+ /*
+ * This buffer has been "accessed" only once so far,
+ * but it is still in the cache. Move it to the MFU
+ * state.
+ */
+ if (lbolt > buf->b_arc_access + ARC_MINTIME) {
+ /*
+ * More than 125ms have passed since we
+ * instantiated this buffer. Move it to the
+ * most frequently used state.
+ */
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(arc_mfu, buf, hash_lock);
+ }
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ } else if (buf->b_state == arc_mru_ghost) {
+ arc_state_t *new_state;
+ /*
+ * This buffer has been "accessed" recently, but
+ * was evicted from the cache. Move it to the
+ * MFU state.
+ */
+
+ if (buf->b_flags & ARC_PREFETCH) {
+ new_state = arc_mru;
+ if (refcount_count(&buf->b_refcnt) > 0)
+ buf->b_flags &= ~ARC_PREFETCH;
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+ } else {
+ new_state = arc_mfu;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ }
+
+ buf->b_arc_access = lbolt;
+ arc_change_state(new_state, buf, hash_lock);
+
+ ARCSTAT_BUMP(arcstat_mru_ghost_hits);
+ } else if (buf->b_state == arc_mfu) {
+ /*
+ * This buffer has been accessed more than once and is
+ * still in the cache. Keep it in the MFU state.
+ *
+ * NOTE: an add_reference() that occurred when we did
+ * the arc_read() will have kicked this off the list.
+ * If it was a prefetch, we will explicitly move it to
+ * the head of the list now.
+ */
+ if ((buf->b_flags & ARC_PREFETCH) != 0) {
+ ASSERT(refcount_count(&buf->b_refcnt) == 0);
+ ASSERT(list_link_active(&buf->b_arc_node));
+ mutex_enter(&arc_mfu->arcs_mtx);
+ list_remove(&arc_mfu->arcs_list, buf);
+ list_insert_head(&arc_mfu->arcs_list, buf);
+ mutex_exit(&arc_mfu->arcs_mtx);
+ }
+ ARCSTAT_BUMP(arcstat_mfu_hits);
+ buf->b_arc_access = lbolt;
+ } else if (buf->b_state == arc_mfu_ghost) {
+ arc_state_t *new_state = arc_mfu;
+ /*
+ * This buffer has been accessed more than once but has
+ * been evicted from the cache. Move it back to the
+ * MFU state.
+ */
+
+ if (buf->b_flags & ARC_PREFETCH) {
+ /*
+ * This is a prefetch access...
+ * move this block back to the MRU state.
+ */
+ ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
+ new_state = arc_mru;
+ }
+
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(new_state, buf, hash_lock);
+
+ ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
+ } else {
+ ASSERT(!"invalid arc state");
+ }
+}
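+
+/*
+ * Transition summary for arc_access() (ARC_MINTIME is ~125ms):
+ *
+ *	current state	demand access		prefetch access
+ *	anon		-> mru			-> mru
+ *	mru		-> mfu if older than	stays mru (re-queued, or
+ *			   ARC_MINTIME		the flag cleared if held)
+ *	mru_ghost	-> mfu			-> mru
+ *	mfu		stays mfu		stays mfu (re-queued)
+ *	mfu_ghost	-> mfu			-> mru
+ */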
+
+/* a generic arc_done_func_t which you can use */
+/* ARGSUSED */
+void
+arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+ bcopy(buf->b_data, arg, buf->b_hdr->b_size);
+ VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+}
+
+/* a generic arc_done_func_t which you can use */
+void
+arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+ arc_buf_t **bufp = arg;
+ if (zio && zio->io_error) {
+ VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+ *bufp = NULL;
+ } else {
+ *bufp = buf;
+ }
+}
+
+static void
+arc_read_done(zio_t *zio)
+{
+ arc_buf_hdr_t *hdr, *found;
+ arc_buf_t *buf;
+ arc_buf_t *abuf; /* buffer we're assigning to callback */
+ kmutex_t *hash_lock;
+ arc_callback_t *callback_list, *acb;
+ int freeable = FALSE;
+
+ buf = zio->io_private;
+ hdr = buf->b_hdr;
+
+ /*
+ * The hdr was inserted into hash-table and removed from lists
+ * prior to starting I/O. We should find this header, since
+ * it's in the hash table, and it should be legit since it's
+ * not possible to evict it during the I/O. The only possible
+ * reason for it not to be found is if we were freed during the
+ * read.
+ */
+ found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
+ &hash_lock);
+
+ ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
+ (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
+
+ /* byteswap if necessary */
+ callback_list = hdr->b_acb;
+ ASSERT(callback_list != NULL);
+ if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
+ callback_list->acb_byteswap(buf->b_data, hdr->b_size);
+
+ arc_cksum_compute(buf);
+
+ /* create copies of the data buffer for the callers */
+ abuf = buf;
+ for (acb = callback_list; acb; acb = acb->acb_next) {
+ if (acb->acb_done) {
+ if (abuf == NULL)
+ abuf = arc_buf_clone(buf);
+ acb->acb_buf = abuf;
+ abuf = NULL;
+ }
+ }
+ hdr->b_acb = NULL;
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ ASSERT(!HDR_BUF_AVAILABLE(hdr));
+ if (abuf == buf)
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
+
+ ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
+
+ if (zio->io_error != 0) {
+ hdr->b_flags |= ARC_IO_ERROR;
+ if (hdr->b_state != arc_anon)
+ arc_change_state(arc_anon, hdr, hash_lock);
+ if (HDR_IN_HASH_TABLE(hdr))
+ buf_hash_remove(hdr);
+ freeable = refcount_is_zero(&hdr->b_refcnt);
+ /* convert checksum errors into IO errors */
+ if (zio->io_error == ECKSUM)
+ zio->io_error = EIO;
+ }
+
+ /*
+ * Broadcast before we drop the hash_lock to avoid the possibility
+ * that the hdr (and hence the cv) might be freed before we get to
+ * the cv_broadcast().
+ */
+ cv_broadcast(&hdr->b_cv);
+
+ if (hash_lock) {
+ /*
+ * Only call arc_access on anonymous buffers. This is because
+ * if we've issued an I/O for an evicted buffer, we've already
+ * called arc_access (to prevent any simultaneous readers from
+ * getting confused).
+ */
+ if (zio->io_error == 0 && hdr->b_state == arc_anon)
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ } else {
+ /*
+ * This block was freed while we waited for the read to
+ * complete. It has been removed from the hash table and
+ * moved to the anonymous state (so that it won't show up
+ * in the cache).
+ */
+ ASSERT3P(hdr->b_state, ==, arc_anon);
+ freeable = refcount_is_zero(&hdr->b_refcnt);
+ }
+
+ /* execute each callback and free its structure */
+ while ((acb = callback_list) != NULL) {
+ if (acb->acb_done)
+ acb->acb_done(zio, acb->acb_buf, acb->acb_private);
+
+ if (acb->acb_zio_dummy != NULL) {
+ acb->acb_zio_dummy->io_error = zio->io_error;
+ zio_nowait(acb->acb_zio_dummy);
+ }
+
+ callback_list = acb->acb_next;
+ kmem_free(acb, sizeof (arc_callback_t));
+ }
+
+ if (freeable)
+ arc_hdr_destroy(hdr);
+}
+
+/*
+ * "Read" the block block at the specified DVA (in bp) via the
+ * cache. If the block is found in the cache, invoke the provided
+ * callback immediately and return. Note that the `zio' parameter
+ * in the callback will be NULL in this case, since no IO was
+ * required. If the block is not in the cache pass the read request
+ * on to the spa with a substitute callback function, so that the
+ * requested block will be added to the cache.
+ *
+ * If a read request arrives for a block that has a read in-progress,
+ * either wait for the in-progress read to complete (and return the
+ * results); or, if this is a read with a "done" func, add a record
+ * to the read to invoke the "done" func when the read completes,
+ * and return; or just return.
+ *
+ * arc_read_done() will invoke all the requested "done" functions
+ * for readers of this block.
+ */
+int
+arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
+ arc_done_func_t *done, void *private, int priority, int flags,
+ uint32_t *arc_flags, zbookmark_t *zb)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ kmutex_t *hash_lock;
+ zio_t *rzio;
+
+top:
+ hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+ if (hdr && hdr->b_datacnt > 0) {
+
+ *arc_flags |= ARC_CACHED;
+
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+
+ if (*arc_flags & ARC_WAIT) {
+ cv_wait(&hdr->b_cv, hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+ ASSERT(*arc_flags & ARC_NOWAIT);
+
+ if (done) {
+ arc_callback_t *acb = NULL;
+
+ acb = kmem_zalloc(sizeof (arc_callback_t),
+ KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_byteswap = swap;
+ if (pio != NULL)
+ acb->acb_zio_dummy = zio_null(pio,
+ spa, NULL, NULL, flags);
+
+ ASSERT(acb->acb_done != NULL);
+ acb->acb_next = hdr->b_acb;
+ hdr->b_acb = acb;
+ add_reference(hdr, hash_lock, private);
+ mutex_exit(hash_lock);
+ return (0);
+ }
+ mutex_exit(hash_lock);
+ return (0);
+ }
+
+ ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+
+ if (done) {
+ add_reference(hdr, hash_lock, private);
+ /*
+ * If this block is already in use, create a new
+ * copy of the data so that we will be guaranteed
+ * that arc_release() will always succeed.
+ */
+ buf = hdr->b_buf;
+ ASSERT(buf);
+ ASSERT(buf->b_data);
+ if (HDR_BUF_AVAILABLE(hdr)) {
+ ASSERT(buf->b_efunc == NULL);
+ hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+ } else {
+ buf = arc_buf_clone(buf);
+ }
+ } else if (*arc_flags & ARC_PREFETCH &&
+ refcount_count(&hdr->b_refcnt) == 0) {
+ hdr->b_flags |= ARC_PREFETCH;
+ }
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_hits);
+ ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
+ demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ data, metadata, hits);
+
+ if (done)
+ done(NULL, buf, private);
+ } else {
+ uint64_t size = BP_GET_LSIZE(bp);
+ arc_callback_t *acb;
+
+ if (hdr == NULL) {
+ /* this block is not in the cache */
+ arc_buf_hdr_t *exists;
+ arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
+ buf = arc_buf_alloc(spa, size, private, type);
+ hdr = buf->b_hdr;
+ hdr->b_dva = *BP_IDENTITY(bp);
+ hdr->b_birth = bp->blk_birth;
+ hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /* somebody beat us to the hash insert */
+ mutex_exit(hash_lock);
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+ (void) arc_buf_remove_ref(buf, private);
+ goto top; /* restart the IO request */
+ }
+ /* if this is a prefetch, we don't have a reference */
+ if (*arc_flags & ARC_PREFETCH) {
+ (void) remove_reference(hdr, hash_lock,
+ private);
+ hdr->b_flags |= ARC_PREFETCH;
+ }
+ if (BP_GET_LEVEL(bp) > 0)
+ hdr->b_flags |= ARC_INDIRECT;
+ } else {
+ /* this block is in the ghost cache */
+ ASSERT(GHOST_STATE(hdr->b_state));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
+ ASSERT(hdr->b_buf == NULL);
+
+ /* if this is a prefetch, we don't have a reference */
+ if (*arc_flags & ARC_PREFETCH)
+ hdr->b_flags |= ARC_PREFETCH;
+ else
+ add_reference(hdr, hash_lock, private);
+ buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf->b_hdr = hdr;
+ buf->b_data = NULL;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_next = NULL;
+ hdr->b_buf = buf;
+ arc_get_data_buf(buf);
+ ASSERT(hdr->b_datacnt == 0);
+ hdr->b_datacnt = 1;
+
+ }
+
+ acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_byteswap = swap;
+
+ ASSERT(hdr->b_acb == NULL);
+ hdr->b_acb = acb;
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
+
+ /*
+ * If the buffer has been evicted, migrate it to a present state
+ * before issuing the I/O. Once we drop the hash-table lock,
+ * the header will be marked as I/O in progress and have an
+ * attached buffer. At this point, anybody who finds this
+ * buffer ought to notice that it's legit but has a pending I/O.
+ */
+
+ if (GHOST_STATE(hdr->b_state))
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+
+ ASSERT3U(hdr->b_size, ==, size);
+ DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
+ zbookmark_t *, zb);
+ ARCSTAT_BUMP(arcstat_misses);
+ ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
+ demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ data, metadata, misses);
+
+ rzio = zio_read(pio, spa, bp, buf->b_data, size,
+ arc_read_done, buf, priority, flags, zb);
+
+ if (*arc_flags & ARC_WAIT)
+ return (zio_wait(rzio));
+
+ ASSERT(*arc_flags & ARC_NOWAIT);
+ zio_nowait(rzio);
+ }
+ return (0);
+}
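+
+/*
+ * A minimal synchronous caller sketch (hypothetical; error handling
+ * elided). arc_getbuf_func() above stores the resulting buffer in
+ * *private, and the same tag must be used to drop the hold:
+ *
+ *	uint32_t aflags = ARC_WAIT;
+ *	zbookmark_t zb = { 0 };
+ *	arc_buf_t *abuf = NULL;
+ *	int error;
+ *
+ *	error = arc_read(NULL, spa, bp, byteswap_uint64_array,
+ *	    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
+ *	    ZIO_FLAG_CANFAIL, &aflags, &zb);
+ *	if (error == 0 && abuf != NULL) {
+ *		... use abuf->b_data ...
+ *		(void) arc_buf_remove_ref(abuf, &abuf);
+ *	}
+ */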
+
+/*
+ * arc_read() variant to support pool traversal. If the block is already
+ * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
+ * The idea is that we don't want pool traversal filling up memory, but
+ * if the ARC already has the data anyway, we shouldn't pay for the I/O.
+ */
+int
+arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_mtx;
+ int rc = 0;
+
+ hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
+
+ if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
+ arc_buf_t *buf = hdr->b_buf;
+
+ ASSERT(buf);
+ while (buf->b_data == NULL) {
+ buf = buf->b_next;
+ ASSERT(buf);
+ }
+ bcopy(buf->b_data, data, hdr->b_size);
+ } else {
+ rc = ENOENT;
+ }
+
+ if (hash_mtx)
+ mutex_exit(hash_mtx);
+
+ return (rc);
+}
+
+void
+arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
+{
+ ASSERT(buf->b_hdr != NULL);
+ ASSERT(buf->b_hdr->b_state != arc_anon);
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+ buf->b_efunc = func;
+ buf->b_private = private;
+}
+
+/*
+ * This is used by the DMU to let the ARC know that a buffer is
+ * being evicted, so the ARC should clean up. If this arc buf
+ * is not yet in the evicted state, it will be put there.
+ */
+int
+arc_buf_evict(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ arc_buf_t **bufp;
+
+ mutex_enter(&arc_eviction_mtx);
+ hdr = buf->b_hdr;
+ if (hdr == NULL) {
+ /*
+ * We are in arc_do_user_evicts().
+ */
+ ASSERT(buf->b_data == NULL);
+ mutex_exit(&arc_eviction_mtx);
+ return (0);
+ }
+ hash_lock = HDR_LOCK(hdr);
+ mutex_exit(&arc_eviction_mtx);
+
+ mutex_enter(hash_lock);
+
+ if (buf->b_data == NULL) {
+ /*
+ * We are on the eviction list.
+ */
+ mutex_exit(hash_lock);
+ mutex_enter(&arc_eviction_mtx);
+ if (buf->b_hdr == NULL) {
+ /*
+ * We are already in arc_do_user_evicts().
+ */
+ mutex_exit(&arc_eviction_mtx);
+ return (0);
+ } else {
+ arc_buf_t copy = *buf; /* structure assignment */
+ /*
+ * Process this buffer now
+ * but let arc_do_user_evicts() do the reaping.
+ */
+ buf->b_efunc = NULL;
+ mutex_exit(&arc_eviction_mtx);
+ VERIFY(copy.b_efunc(&copy) == 0);
+ return (1);
+ }
+ }
+
+ ASSERT(buf->b_hdr == hdr);
+ ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
+ ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+
+ /*
+ * Pull this buffer off of the hdr
+ */
+ bufp = &hdr->b_buf;
+ while (*bufp != buf)
+ bufp = &(*bufp)->b_next;
+ *bufp = buf->b_next;
+
+ ASSERT(buf->b_data != NULL);
+ arc_buf_destroy(buf, FALSE, FALSE);
+
+ if (hdr->b_datacnt == 0) {
+ arc_state_t *old_state = hdr->b_state;
+ arc_state_t *evicted_state;
+
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+
+ evicted_state =
+ (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+ mutex_enter(&old_state->arcs_mtx);
+ mutex_enter(&evicted_state->arcs_mtx);
+
+ arc_change_state(evicted_state, hdr, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+ hdr->b_flags = ARC_IN_HASH_TABLE;
+
+ mutex_exit(&evicted_state->arcs_mtx);
+ mutex_exit(&old_state->arcs_mtx);
+ }
+ mutex_exit(hash_lock);
+
+ VERIFY(buf->b_efunc(buf) == 0);
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_hdr = NULL;
+ kmem_cache_free(buf_cache, buf);
+ return (1);
+}
+
+/*
+ * Release this buffer from the cache. This must be done
+ * after a read and prior to modifying the buffer contents.
+ * If the buffer has more than one reference, we must make
+ * a new hdr for the buffer.
+ */
+void
+arc_release(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+ /* this buffer is not on any list */
+ ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+
+ if (hdr->b_state == arc_anon) {
+ /* this buffer is already released */
+ ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
+ ASSERT(BUF_EMPTY(hdr));
+ ASSERT(buf->b_efunc == NULL);
+ arc_buf_thaw(buf);
+ return;
+ }
+
+ mutex_enter(hash_lock);
+
+ /*
+ * Do we have more than one buf?
+ */
+ if (hdr->b_buf != buf || buf->b_next != NULL) {
+ arc_buf_hdr_t *nhdr;
+ arc_buf_t **bufp;
+ uint64_t blksz = hdr->b_size;
+ spa_t *spa = hdr->b_spa;
+ arc_buf_contents_t type = hdr->b_type;
+
+ ASSERT(hdr->b_datacnt > 1);
+ /*
+ * Pull the data off of this buf and attach it to
+ * a new anonymous buf.
+ */
+ (void) remove_reference(hdr, hash_lock, tag);
+ bufp = &hdr->b_buf;
+ while (*bufp != buf)
+ bufp = &(*bufp)->b_next;
+ *bufp = (*bufp)->b_next;
+ buf->b_next = NULL;
+
+ ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
+ atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
+ if (refcount_is_zero(&hdr->b_refcnt)) {
+ ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size);
+ atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size);
+ }
+ hdr->b_datacnt -= 1;
+ arc_cksum_verify(buf);
+
+ mutex_exit(hash_lock);
+
+ nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ nhdr->b_size = blksz;
+ nhdr->b_spa = spa;
+ nhdr->b_type = type;
+ nhdr->b_buf = buf;
+ nhdr->b_state = arc_anon;
+ nhdr->b_arc_access = 0;
+ nhdr->b_flags = 0;
+ nhdr->b_datacnt = 1;
+ nhdr->b_freeze_cksum = NULL;
+ (void) refcount_add(&nhdr->b_refcnt, tag);
+ buf->b_hdr = nhdr;
+ atomic_add_64(&arc_anon->arcs_size, blksz);
+
+ hdr = nhdr;
+ } else {
+ ASSERT(refcount_count(&hdr->b_refcnt) == 1);
+ ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ arc_change_state(arc_anon, hdr, hash_lock);
+ hdr->b_arc_access = 0;
+ mutex_exit(hash_lock);
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_cksum0 = 0;
+ arc_buf_thaw(buf);
+ }
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+}
+
+int
+arc_released(arc_buf_t *buf)
+{
+ return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
+}
+
+int
+arc_has_callback(arc_buf_t *buf)
+{
+ return (buf->b_efunc != NULL);
+}
+
+#ifdef ZFS_DEBUG
+int
+arc_referenced(arc_buf_t *buf)
+{
+ return (refcount_count(&buf->b_hdr->b_refcnt));
+}
+#endif
+
+static void
+arc_write_ready(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+
+ if (callback->awcb_ready) {
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
+ callback->awcb_ready(zio, buf, callback->awcb_private);
+ }
+ arc_cksum_compute(buf);
+}
+
+static void
+arc_write_done(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ hdr->b_acb = NULL;
+
+ /* this buffer is on no lists and is not in the hash table */
+ ASSERT3P(hdr->b_state, ==, arc_anon);
+
+ hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+ hdr->b_birth = zio->io_bp->blk_birth;
+ hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+ /*
+ * If the block to be written was all-zero, we may have
+ * compressed it away. In this case no write was performed
+ * so there will be no dva/birth-date/checksum. The buffer
+	 * must therefore remain anonymous (and uncached).
+ */
+ if (!BUF_EMPTY(hdr)) {
+ arc_buf_hdr_t *exists;
+ kmutex_t *hash_lock;
+
+ arc_cksum_verify(buf);
+
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /*
+ * This can only happen if we overwrite for
+ * sync-to-convergence, because we remove
+ * buffers from the hash table when we arc_free().
+ */
+ ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
+ BP_IDENTITY(zio->io_bp)));
+ ASSERT3U(zio->io_bp_orig.blk_birth, ==,
+ zio->io_bp->blk_birth);
+
+ ASSERT(refcount_is_zero(&exists->b_refcnt));
+ arc_change_state(arc_anon, exists, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(exists);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ ASSERT3P(exists, ==, NULL);
+ }
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ } else if (callback->awcb_done == NULL) {
+ int destroy_hdr;
+ /*
+ * This is an anonymous buffer with no user callback,
+ * destroy it if there are no active references.
+ */
+ mutex_enter(&arc_eviction_mtx);
+ destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ mutex_exit(&arc_eviction_mtx);
+ if (destroy_hdr)
+ arc_hdr_destroy(hdr);
+ } else {
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ }
+
+ if (callback->awcb_done) {
+ ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+ callback->awcb_done(zio, buf, callback->awcb_private);
+ }
+
+ kmem_free(callback, sizeof (arc_write_callback_t));
+}
+
+zio_t *
+arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
+ uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+ arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
+ int flags, zbookmark_t *zb)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ arc_write_callback_t *callback;
+ zio_t *zio;
+
+ /* this is a private buffer - no locking required */
+ ASSERT3P(hdr->b_state, ==, arc_anon);
+ ASSERT(BUF_EMPTY(hdr));
+ ASSERT(!HDR_IO_ERROR(hdr));
+ ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
+ ASSERT(hdr->b_acb == 0);
+ callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
+ callback->awcb_ready = ready;
+ callback->awcb_done = done;
+ callback->awcb_private = private;
+ callback->awcb_buf = buf;
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
+ zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
+ buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
+ priority, flags, zb);
+
+ return (zio);
+}
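+
+/*
+ * A hypothetical caller sketch (dbuf_write() in the DMU is the real
+ * consumer); buf must be anonymous, i.e. arc_release()d, and the
+ * checksum/compress/priority values shown are illustrative:
+ *
+ *	zio = arc_write(pio, spa, ZIO_CHECKSUM_FLETCHER_4,
+ *	    ZIO_COMPRESS_LZJB, ncopies, txg, bp, buf, ready_cb,
+ *	    done_cb, cb_arg, ZIO_PRIORITY_ASYNC_WRITE,
+ *	    ZIO_FLAG_MUSTSUCCEED, &zb);
+ *	zio_nowait(zio);
+ */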
+
+int
+arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private, uint32_t arc_flags)
+{
+ arc_buf_hdr_t *ab;
+ kmutex_t *hash_lock;
+ zio_t *zio;
+
+ /*
+ * If this buffer is in the cache, release it, so it
+ * can be re-used.
+ */
+ ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+ if (ab != NULL) {
+ /*
+ * The checksum of blocks to free is not always
+		 * preserved (e.g. on the deadlist). However, if it is
+ * nonzero, it should match what we have in the cache.
+ */
+ ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
+ ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
+ if (ab->b_state != arc_anon)
+ arc_change_state(arc_anon, ab, hash_lock);
+ if (HDR_IO_IN_PROGRESS(ab)) {
+ /*
+ * This should only happen when we prefetch.
+ */
+ ASSERT(ab->b_flags & ARC_PREFETCH);
+ ASSERT3U(ab->b_datacnt, ==, 1);
+ ab->b_flags |= ARC_FREED_IN_READ;
+ if (HDR_IN_HASH_TABLE(ab))
+ buf_hash_remove(ab);
+ ab->b_arc_access = 0;
+ bzero(&ab->b_dva, sizeof (dva_t));
+ ab->b_birth = 0;
+ ab->b_cksum0 = 0;
+ ab->b_buf->b_efunc = NULL;
+ ab->b_buf->b_private = NULL;
+ mutex_exit(hash_lock);
+ } else if (refcount_is_zero(&ab->b_refcnt)) {
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(ab);
+ ARCSTAT_BUMP(arcstat_deleted);
+ } else {
+ /*
+ * We still have an active reference on this
+ * buffer. This can happen, e.g., from
+ * dbuf_unoverride().
+ */
+ ASSERT(!HDR_IN_HASH_TABLE(ab));
+ ab->b_arc_access = 0;
+ bzero(&ab->b_dva, sizeof (dva_t));
+ ab->b_birth = 0;
+ ab->b_cksum0 = 0;
+ ab->b_buf->b_efunc = NULL;
+ ab->b_buf->b_private = NULL;
+ mutex_exit(hash_lock);
+ }
+ }
+
+ zio = zio_free(pio, spa, txg, bp, done, private);
+
+ if (arc_flags & ARC_WAIT)
+ return (zio_wait(zio));
+
+ ASSERT(arc_flags & ARC_NOWAIT);
+ zio_nowait(zio);
+
+ return (0);
+}
+
+void
+arc_tempreserve_clear(uint64_t tempreserve)
+{
+ atomic_add_64(&arc_tempreserve, -tempreserve);
+ ASSERT((int64_t)arc_tempreserve >= 0);
+}
+
+int
+arc_tempreserve_space(uint64_t tempreserve)
+{
+#ifdef ZFS_DEBUG
+ /*
+ * Once in a while, fail for no reason. Everything should cope.
+ */
+ if (spa_get_random(10000) == 0) {
+ dprintf("forcing random failure\n");
+ return (ERESTART);
+ }
+#endif
+ if (tempreserve > arc_c/4 && !arc_no_grow)
+ arc_c = MIN(arc_c_max, tempreserve * 4);
+ if (tempreserve > arc_c)
+ return (ENOMEM);
+
+ /*
+ * Throttle writes when the amount of dirty data in the cache
+ * gets too large. We try to keep the cache less than half full
+ * of dirty blocks so that our sync times don't grow too large.
+ * Note: if two requests come in concurrently, we might let them
+ * both succeed, when one of them should fail. Not a huge deal.
+ *
+ * XXX The limit should be adjusted dynamically to keep the time
+ * to sync a dataset fixed (around 1-5 seconds?).
+ */
+
+ if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
+ arc_tempreserve + arc_anon->arcs_size > arc_c / 4) {
+ dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
+ "tempreserve=%lluK arc_c=%lluK\n",
+ arc_tempreserve>>10, arc_anon->arcs_lsize>>10,
+ tempreserve>>10, arc_c>>10);
+ return (ERESTART);
+ }
+ atomic_add_64(&arc_tempreserve, tempreserve);
+ return (0);
+}
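+
+/*
+ * Expected caller pattern (sketch; the reservation is taken before
+ * dirtying buffers and released once the dirty data is accounted
+ * for, and ERESTART means back off and retry later):
+ *
+ *	if ((error = arc_tempreserve_space(resv)) != 0)
+ *		return (error);
+ *	... dirty the buffers ...
+ *	arc_tempreserve_clear(resv);
+ */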
+
+#ifdef _KERNEL
+static eventhandler_tag zfs_event_lowmem = NULL;
+
+static void
+zfs_lowmem(void *arg __unused, int howto __unused)
+{
+
+ zfs_needfree = 1;
+ cv_signal(&arc_reclaim_thr_cv);
+ while (zfs_needfree)
+ tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5);
+}
+#endif
+
+void
+arc_init(void)
+{
+ mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+
+ /* Convert seconds to clock ticks */
+ arc_min_prefetch_lifespan = 1 * hz;
+
+ /* Start out with 1/8 of all memory */
+ arc_c = physmem * PAGESIZE / 8;
+#if 0
+#ifdef _KERNEL
+ /*
+ * On architectures where the physical memory can be larger
+ * than the addressable space (intel in 32-bit mode), we may
+ * need to limit the cache to 1/8 of VM size.
+ */
+ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+#endif
+#endif
+ /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
+ arc_c_min = MAX(arc_c / 4, 64<<20);
+ /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
+ if (arc_c * 8 >= 1<<30)
+ arc_c_max = (arc_c * 8) - (1<<30);
+ else
+ arc_c_max = arc_c_min;
+ arc_c_max = MAX(arc_c * 6, arc_c_max);
+#ifdef notyet
+ /*
+ * Allow the tunables to override our calculations if they are
+ * reasonable (ie. over 64MB)
+ */
+ if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
+ arc_c_max = zfs_arc_max;
+ if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
+ arc_c_min = zfs_arc_min;
+#endif
+ arc_c = arc_c_max;
+ arc_p = (arc_c >> 1);
+
+	/* if kmem_flags are set, let's try to use less memory */
+ if (kmem_debugging())
+ arc_c = arc_c / 2;
+ if (arc_c < arc_c_min)
+ arc_c = arc_c_min;
+
+ arc_anon = &ARC_anon;
+ arc_mru = &ARC_mru;
+ arc_mru_ghost = &ARC_mru_ghost;
+ arc_mfu = &ARC_mfu;
+ arc_mfu_ghost = &ARC_mfu_ghost;
+ arc_size = 0;
+
+ mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_arc_node));
+
+ buf_init();
+
+ arc_thread_exit = 0;
+ arc_eviction_list = NULL;
+ mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
+ bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
+
+ arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+
+ if (arc_ksp != NULL) {
+ arc_ksp->ks_data = &arc_stats;
+ kstat_install(arc_ksp);
+ }
+
+ (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+
+#ifdef _KERNEL
+ zfs_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, zfs_lowmem, NULL,
+ EVENTHANDLER_PRI_FIRST);
+#endif
+
+ arc_dead = FALSE;
+}
+
+void
+arc_fini(void)
+{
+ mutex_enter(&arc_reclaim_thr_lock);
+ arc_thread_exit = 1;
+ cv_signal(&arc_reclaim_thr_cv);
+ while (arc_thread_exit != 0)
+ cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
+ mutex_exit(&arc_reclaim_thr_lock);
+
+ arc_flush();
+
+ arc_dead = TRUE;
+
+ if (arc_ksp != NULL) {
+ kstat_delete(arc_ksp);
+ arc_ksp = NULL;
+ }
+
+ mutex_destroy(&arc_eviction_mtx);
+ mutex_destroy(&arc_reclaim_thr_lock);
+ cv_destroy(&arc_reclaim_thr_cv);
+
+ list_destroy(&arc_mru->arcs_list);
+ list_destroy(&arc_mru_ghost->arcs_list);
+ list_destroy(&arc_mfu->arcs_list);
+ list_destroy(&arc_mfu_ghost->arcs_list);
+
+ mutex_destroy(&arc_anon->arcs_mtx);
+ mutex_destroy(&arc_mru->arcs_mtx);
+ mutex_destroy(&arc_mru_ghost->arcs_mtx);
+ mutex_destroy(&arc_mfu->arcs_mtx);
+ mutex_destroy(&arc_mfu_ghost->arcs_mtx);
+
+ buf_fini();
+
+#ifdef _KERNEL
+ if (zfs_event_lowmem != NULL)
+ EVENTHANDLER_DEREGISTER(vm_lowmem, zfs_event_lowmem);
+#endif
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
new file mode 100644
index 0000000..4442b1f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bplist.c
@@ -0,0 +1,312 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/bplist.h>
+#include <sys/zfs_context.h>
+
+static int
+bplist_hold(bplist_t *bpl)
+{
+ ASSERT(MUTEX_HELD(&bpl->bpl_lock));
+ if (bpl->bpl_dbuf == NULL) {
+ int err = dmu_bonus_hold(bpl->bpl_mos,
+ bpl->bpl_object, bpl, &bpl->bpl_dbuf);
+ if (err)
+ return (err);
+ bpl->bpl_phys = bpl->bpl_dbuf->db_data;
+ }
+ return (0);
+}
+
+uint64_t
+bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
+{
+ int size;
+
+ size = spa_version(dmu_objset_spa(mos)) < ZFS_VERSION_BPLIST_ACCOUNT ?
+ BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
+
+ return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
+ DMU_OT_BPLIST_HDR, size, tx));
+}
+
+void
+bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+{
+ VERIFY(dmu_object_free(mos, object, tx) == 0);
+}
+
+int
+bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
+{
+ dmu_object_info_t doi;
+ int err;
+
+ err = dmu_object_info(mos, object, &doi);
+ if (err)
+ return (err);
+
+ mutex_enter(&bpl->bpl_lock);
+
+ ASSERT(bpl->bpl_dbuf == NULL);
+ ASSERT(bpl->bpl_phys == NULL);
+ ASSERT(bpl->bpl_cached_dbuf == NULL);
+ ASSERT(bpl->bpl_queue == NULL);
+ ASSERT(object != 0);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);
+
+ bpl->bpl_mos = mos;
+ bpl->bpl_object = object;
+ bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
+ bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
+ bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));
+
+ mutex_exit(&bpl->bpl_lock);
+ return (0);
+}
+
+void
+bplist_close(bplist_t *bpl)
+{
+ mutex_enter(&bpl->bpl_lock);
+
+ ASSERT(bpl->bpl_queue == NULL);
+
+ if (bpl->bpl_cached_dbuf) {
+ dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
+ bpl->bpl_cached_dbuf = NULL;
+ }
+ if (bpl->bpl_dbuf) {
+ dmu_buf_rele(bpl->bpl_dbuf, bpl);
+ bpl->bpl_dbuf = NULL;
+ bpl->bpl_phys = NULL;
+ }
+
+ mutex_exit(&bpl->bpl_lock);
+}
+
+boolean_t
+bplist_empty(bplist_t *bpl)
+{
+ boolean_t rv;
+
+ if (bpl->bpl_object == 0)
+ return (B_TRUE);
+
+ mutex_enter(&bpl->bpl_lock);
+ VERIFY(0 == bplist_hold(bpl)); /* XXX */
+ rv = (bpl->bpl_phys->bpl_entries == 0);
+ mutex_exit(&bpl->bpl_lock);
+
+ return (rv);
+}
+
+static int
+bplist_cache(bplist_t *bpl, uint64_t blkid)
+{
+ int err = 0;
+
+ if (bpl->bpl_cached_dbuf == NULL ||
+ bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
+ if (bpl->bpl_cached_dbuf != NULL)
+ dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
+ err = dmu_buf_hold(bpl->bpl_mos,
+ bpl->bpl_object, blkid << bpl->bpl_blockshift,
+ bpl, &bpl->bpl_cached_dbuf);
+ ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
+ 1ULL << bpl->bpl_blockshift);
+ }
+ return (err);
+}
+
+int
+bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
+{
+ uint64_t blk, off;
+ blkptr_t *bparray;
+ int err;
+
+ mutex_enter(&bpl->bpl_lock);
+
+ err = bplist_hold(bpl);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
+
+ if (*itorp >= bpl->bpl_phys->bpl_entries) {
+ mutex_exit(&bpl->bpl_lock);
+ return (ENOENT);
+ }
+
+ blk = *itorp >> bpl->bpl_bpshift;
+ off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
+
+ err = bplist_cache(bpl, blk);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
+
+ bparray = bpl->bpl_cached_dbuf->db_data;
+ *bp = bparray[off];
+ (*itorp)++;
+ mutex_exit(&bpl->bpl_lock);
+ return (0);
+}
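+
+/*
+ * A minimal iteration sketch (bplist_space() below uses this same
+ * pattern when it has to walk the whole list):
+ *
+ *	uint64_t itor = 0;
+ *	blkptr_t bp;
+ *
+ *	while ((err = bplist_iterate(bpl, &itor, &bp)) == 0)
+ *		... process bp ...
+ *	if (err == ENOENT)
+ *		err = 0;
+ */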
+
+int
+bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
+{
+ uint64_t blk, off;
+ blkptr_t *bparray;
+ int err;
+
+ ASSERT(!BP_IS_HOLE(bp));
+ mutex_enter(&bpl->bpl_lock);
+	err = bplist_hold(bpl);
+	if (err) {
+		mutex_exit(&bpl->bpl_lock);
+		return (err);
+	}
+
+ blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
+ off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
+
+ err = bplist_cache(bpl, blk);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
+
+ dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
+ bparray = bpl->bpl_cached_dbuf->db_data;
+ bparray[off] = *bp;
+
+ /* We never need the fill count. */
+ bparray[off].blk_fill = 0;
+
+ /* The bplist will compress better if we can leave off the checksum */
+ bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
+
+ dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
+ bpl->bpl_phys->bpl_entries++;
+ bpl->bpl_phys->bpl_bytes +=
+ bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp);
+ if (bpl->bpl_havecomp) {
+ bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
+ bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
+ }
+ mutex_exit(&bpl->bpl_lock);
+
+ return (0);
+}
+
+/*
+ * Deferred entry; will be written later by bplist_sync().
+ */
+void
+bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp)
+{
+ bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
+
+ ASSERT(!BP_IS_HOLE(bp));
+ mutex_enter(&bpl->bpl_lock);
+ bpq->bpq_blk = *bp;
+ bpq->bpq_next = bpl->bpl_queue;
+ bpl->bpl_queue = bpq;
+ mutex_exit(&bpl->bpl_lock);
+}
+
+void
+bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
+{
+ bplist_q_t *bpq;
+
+ mutex_enter(&bpl->bpl_lock);
+ while ((bpq = bpl->bpl_queue) != NULL) {
+ bpl->bpl_queue = bpq->bpq_next;
+ mutex_exit(&bpl->bpl_lock);
+ VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
+ kmem_free(bpq, sizeof (*bpq));
+ mutex_enter(&bpl->bpl_lock);
+ }
+ mutex_exit(&bpl->bpl_lock);
+}
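+
+/*
+ * Sketch of the deferred-enqueue protocol: a producer that must not
+ * block on I/O queues entries in memory, and syncing context flushes
+ * them to the on-disk list:
+ *
+ *	bplist_enqueue_deferred(bpl, bp);	(no I/O, cannot fail)
+ *	...
+ *	bplist_sync(bpl, tx);			(in syncing context)
+ */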
+
+void
+bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
+{
+ mutex_enter(&bpl->bpl_lock);
+ ASSERT3P(bpl->bpl_queue, ==, NULL);
+ VERIFY(0 == bplist_hold(bpl));
+ dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
+ VERIFY(0 == dmu_free_range(bpl->bpl_mos,
+ bpl->bpl_object, 0, -1ULL, tx));
+ bpl->bpl_phys->bpl_entries = 0;
+ bpl->bpl_phys->bpl_bytes = 0;
+ if (bpl->bpl_havecomp) {
+ bpl->bpl_phys->bpl_comp = 0;
+ bpl->bpl_phys->bpl_uncomp = 0;
+ }
+ mutex_exit(&bpl->bpl_lock);
+}
+
+int
+bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ uint64_t itor = 0, comp = 0, uncomp = 0;
+ int err;
+ blkptr_t bp;
+
+ mutex_enter(&bpl->bpl_lock);
+
+ err = bplist_hold(bpl);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
+
+ *usedp = bpl->bpl_phys->bpl_bytes;
+ if (bpl->bpl_havecomp) {
+ *compp = bpl->bpl_phys->bpl_comp;
+ *uncompp = bpl->bpl_phys->bpl_uncomp;
+ }
+ mutex_exit(&bpl->bpl_lock);
+
+ if (!bpl->bpl_havecomp) {
+ while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
+ comp += BP_GET_PSIZE(&bp);
+ uncomp += BP_GET_UCSIZE(&bp);
+ }
+ if (err == ENOENT)
+ err = 0;
+ *compp = comp;
+ *uncompp = uncomp;
+ }
+
+ return (err);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
new file mode 100644
index 0000000..1fde66f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -0,0 +1,2240 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+
+static void dbuf_destroy(dmu_buf_impl_t *db);
+static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
+ int compress, dmu_tx_t *tx);
+static arc_done_func_t dbuf_write_ready;
+static arc_done_func_t dbuf_write_done;
+
+int zfs_mdcomp_disable = 0;
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
+ &zfs_mdcomp_disable, 0, "Disable metadata compression");
+
+/*
+ * Global data structures and functions for the dbuf cache.
+ */
+static kmem_cache_t *dbuf_cache;
+
+/* ARGSUSED */
+static int
+dbuf_cons(void *vdb, void *unused, int kmflag)
+{
+ dmu_buf_impl_t *db = vdb;
+ bzero(db, sizeof (dmu_buf_impl_t));
+
+ mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+ refcount_create(&db->db_holds);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dbuf_dest(void *vdb, void *unused)
+{
+ dmu_buf_impl_t *db = vdb;
+ mutex_destroy(&db->db_mtx);
+ cv_destroy(&db->db_changed);
+ refcount_destroy(&db->db_holds);
+}
+
+/*
+ * dbuf hash table routines
+ */
+static dbuf_hash_table_t dbuf_hash_table;
+
+static uint64_t dbuf_hash_count;
+
+static uint64_t
+dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
+{
+ uintptr_t osv = (uintptr_t)os;
+ uint64_t crc = -1ULL;
+
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
+
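+	/*
+	 * Fold in the high-order bits the byte-wide rounds above never
+	 * consumed (only osv >> 6, the low 16 bits of obj, and the low
+	 * 16 bits of blkid were fed through the CRC table).
+	 */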
+	crc ^= (osv >> 14) ^ (obj >> 16) ^ (blkid >> 16);
+
+ return (crc);
+}
+
+#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
+
+#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
+ ((dbuf)->db.db_object == (obj) && \
+ (dbuf)->db_objset == (os) && \
+ (dbuf)->db_level == (level) && \
+ (dbuf)->db_blkid == (blkid))
+
+dmu_buf_impl_t *
+dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ objset_impl_t *os = dn->dn_objset;
+ uint64_t obj = dn->dn_object;
+ uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+ uint64_t idx = hv & h->hash_table_mask;
+ dmu_buf_impl_t *db;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
+ if (DBUF_EQUAL(db, os, obj, level, blkid)) {
+ mutex_enter(&db->db_mtx);
+ if (db->db_state != DB_EVICTING) {
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (db);
+ }
+ mutex_exit(&db->db_mtx);
+ }
+ }
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table. If an entry equal to db is
+ * already present, return the existing entry without inserting the
+ * new one; otherwise insert db and return NULL.
+ */
+static dmu_buf_impl_t *
+dbuf_hash_insert(dmu_buf_impl_t *db)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ objset_impl_t *os = db->db_objset;
+ uint64_t obj = db->db.db_object;
+ int level = db->db_level;
+ uint64_t blkid = db->db_blkid;
+ uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+ uint64_t idx = hv & h->hash_table_mask;
+ dmu_buf_impl_t *dbf;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
+ if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
+ mutex_enter(&dbf->db_mtx);
+ if (dbf->db_state != DB_EVICTING) {
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (dbf);
+ }
+ mutex_exit(&dbf->db_mtx);
+ }
+ }
+
+ mutex_enter(&db->db_mtx);
+ db->db_hash_next = h->hash_table[idx];
+ h->hash_table[idx] = db;
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ atomic_add_64(&dbuf_hash_count, 1);
+
+ return (NULL);
+}
+
+/*
+ * Remove an entry from the hash table. This operation will
+ * fail if there are any existing holds on the db.
+ */
+static void
+dbuf_hash_remove(dmu_buf_impl_t *db)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
+ db->db_level, db->db_blkid);
+ uint64_t idx = hv & h->hash_table_mask;
+ dmu_buf_impl_t *dbf, **dbp;
+
+ /*
+	 * We mustn't hold db_mtx to maintain lock ordering:
+ * DBUF_HASH_MUTEX > db_mtx.
+ */
+ ASSERT(refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_state == DB_EVICTING);
+ ASSERT(!MUTEX_HELD(&db->db_mtx));
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ dbp = &h->hash_table[idx];
+ while ((dbf = *dbp) != db) {
+ dbp = &dbf->db_hash_next;
+ ASSERT(dbf != NULL);
+ }
+ *dbp = db->db_hash_next;
+ db->db_hash_next = NULL;
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ atomic_add_64(&dbuf_hash_count, -1);
+}
+
+static arc_evict_func_t dbuf_do_evict;
+
+static void
+dbuf_evict_user(dmu_buf_impl_t *db)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_level != 0 || db->db_evict_func == NULL)
+ return;
+
+ if (db->db_user_data_ptr_ptr)
+ *db->db_user_data_ptr_ptr = db->db.db_data;
+ db->db_evict_func(&db->db, db->db_user_ptr);
+ db->db_user_ptr = NULL;
+ db->db_user_data_ptr_ptr = NULL;
+ db->db_evict_func = NULL;
+}
+
+void
+dbuf_evict(dmu_buf_impl_t *db)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db_data_pending == NULL);
+
+ dbuf_clear(db);
+ dbuf_destroy(db);
+}
+
+void
+dbuf_init(void)
+{
+ uint64_t hsize = 1ULL << 16;
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ int i;
+
+ /*
+ * The hash table is big enough to fill all of physical memory
+ * with an average 4K block size. The table will take up
+ * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
+ */
+ while (hsize * 4096 < physmem * PAGESIZE)
+ hsize <<= 1;
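+	/* e.g. 4GB of physical memory yields 1M buckets (an 8MB table) */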
+
+retry:
+ h->hash_table_mask = hsize - 1;
+ h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+ if (h->hash_table == NULL) {
+ /* XXX - we should really return an error instead of assert */
+ ASSERT(hsize > (1ULL << 10));
+ hsize >>= 1;
+ goto retry;
+ }
+
+ dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
+ sizeof (dmu_buf_impl_t),
+ 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
+
+ for (i = 0; i < DBUF_MUTEXES; i++)
+ mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+dbuf_fini(void)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ int i;
+
+ for (i = 0; i < DBUF_MUTEXES; i++)
+ mutex_destroy(&h->hash_mutexes[i]);
+ kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+ kmem_cache_destroy(dbuf_cache);
+}
+
+/*
+ * Other stuff.
+ */
+
+#ifdef ZFS_DEBUG
+static void
+dbuf_verify(dmu_buf_impl_t *db)
+{
+ dnode_t *dn = db->db_dnode;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
+ return;
+
+ ASSERT(db->db_objset != NULL);
+ if (dn == NULL) {
+ ASSERT(db->db_parent == NULL);
+ ASSERT(db->db_blkptr == NULL);
+ } else {
+ ASSERT3U(db->db.db_object, ==, dn->dn_object);
+ ASSERT3P(db->db_objset, ==, dn->dn_objset);
+ ASSERT3U(db->db_level, <, dn->dn_nlevels);
+ ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+ list_head(&dn->dn_dbufs));
+ }
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ ASSERT(dn != NULL);
+ ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
+ } else {
+ ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
+ }
+
+ if (db->db_level == 0) {
+ /* we can be momentarily larger in dnode_set_blksz() */
+ if (db->db_blkid != DB_BONUS_BLKID && dn) {
+ ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
+ }
+ if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+ /*
+ * it should only be modified in syncing
+ * context, so make sure we only have
+ * one copy of the data.
+ */
+ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
+ }
+ }
+
+ /* verify db->db_blkptr */
+ if (db->db_blkptr) {
+ if (db->db_parent == dn->dn_dbuf) {
+ /* db is pointed to by the dnode */
+ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
+ if (db->db.db_object == DMU_META_DNODE_OBJECT)
+ ASSERT(db->db_parent == NULL);
+ else
+ ASSERT(db->db_parent != NULL);
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ } else {
+ /* db is pointed to by an indirect block */
+ int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
+ ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
+ ASSERT3U(db->db_parent->db.db_object, ==,
+ db->db.db_object);
+ /*
+ * dnode_grow_indblksz() can make this fail if we don't
+ * have the struct_rwlock. XXX indblksz no longer
+ * grows. safe to do this now?
+ */
+ if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
+ ASSERT3P(db->db_blkptr, ==,
+ ((blkptr_t *)db->db_parent->db.db_data +
+ db->db_blkid % epb));
+ }
+ }
+ }
+ if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
+ db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
+ db->db_state != DB_FILL && !dn->dn_free_txg) {
+ /*
+		 * If the blkptr isn't set but the buffer has nonzero data,
+ * it had better be dirty, otherwise we'll lose that
+ * data when we evict this buffer.
+ */
+ if (db->db_dirtycnt == 0) {
+ uint64_t *buf = db->db.db_data;
+ int i;
+
+ for (i = 0; i < db->db.db_size >> 3; i++) {
+ ASSERT(buf[i] == 0);
+ }
+ }
+ }
+}
+#endif
+
+static void
+dbuf_update_data(dmu_buf_impl_t *db)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ *db->db_user_data_ptr_ptr = db->db.db_data;
+ }
+}
+
+static void
+dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
+ db->db_buf = buf;
+ if (buf != NULL) {
+ ASSERT(buf->b_data != NULL);
+ db->db.db_data = buf->b_data;
+ if (!arc_released(buf))
+ arc_set_callback(buf, dbuf_do_evict, db);
+ dbuf_update_data(db);
+ } else {
+ dbuf_evict_user(db);
+ db->db.db_data = NULL;
+ db->db_state = DB_UNCACHED;
+ }
+}
+
+uint64_t
+dbuf_whichblock(dnode_t *dn, uint64_t offset)
+{
+ if (dn->dn_datablkshift) {
+ return (offset >> dn->dn_datablkshift);
+ } else {
+ ASSERT3U(offset, <, dn->dn_datablksz);
+ return (0);
+ }
+}
+
+static void
+dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+
+ mutex_enter(&db->db_mtx);
+ ASSERT3U(db->db_state, ==, DB_READ);
+ /*
+ * All reads are synchronous, so we must have a hold on the dbuf
+ */
+ ASSERT(refcount_count(&db->db_holds) > 0);
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ if (db->db_level == 0 && db->db_freed_in_flight) {
+ /* we were freed in flight; disregard any error */
+ arc_release(buf, db);
+ bzero(buf->b_data, db->db.db_size);
+ arc_buf_freeze(buf);
+ db->db_freed_in_flight = FALSE;
+ dbuf_set_data(db, buf);
+ db->db_state = DB_CACHED;
+ } else if (zio == NULL || zio->io_error == 0) {
+ dbuf_set_data(db, buf);
+ db->db_state = DB_CACHED;
+ } else {
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT3P(db->db_buf, ==, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ db->db_state = DB_UNCACHED;
+ }
+ cv_broadcast(&db->db_changed);
+ mutex_exit(&db->db_mtx);
+ dbuf_rele(db, NULL);
+}
+
+static void
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
+{
+ blkptr_t *bp;
+ zbookmark_t zb;
+ uint32_t aflags = ARC_NOWAIT;
+
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ /* We need the struct_rwlock to prevent db_blkptr from changing. */
+ ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_state == DB_UNCACHED);
+ ASSERT(db->db_buf == NULL);
+
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
+ db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+ if (db->db.db_size < DN_MAX_BONUSLEN)
+ bzero(db->db.db_data, DN_MAX_BONUSLEN);
+ bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
+ db->db.db_size);
+ dbuf_update_data(db);
+ db->db_state = DB_CACHED;
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+
+ if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
+ bp = NULL;
+ else
+ bp = db->db_blkptr;
+
+ if (bp == NULL)
+ dprintf_dbuf(db, "blkptr: %s\n", "NULL");
+ else
+ dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
+
+ if (bp == NULL || BP_IS_HOLE(bp)) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+ ASSERT(bp == NULL || BP_IS_HOLE(bp));
+ dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ db->db.db_size, db, type));
+ bzero(db->db.db_data, db->db.db_size);
+ db->db_state = DB_CACHED;
+ *flags |= DB_RF_CACHED;
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+
+ db->db_state = DB_READ;
+ mutex_exit(&db->db_mtx);
+
+ zb.zb_objset = db->db_objset->os_dsl_dataset ?
+ db->db_objset->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+
+ dbuf_add_ref(db, NULL);
+ /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
+ ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES);
+ (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
+ db->db_level > 0 ? byteswap_uint64_array :
+ dmu_ot[db->db_dnode->dn_type].ot_byteswap,
+ dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
+ (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+ &aflags, &zb);
+ if (aflags & ARC_CACHED)
+ *flags |= DB_RF_CACHED;
+}
+
+int
+dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+{
+ int err = 0;
+ int havepzio = (zio != NULL);
+ int prefetch;
+
+ /*
+ * We don't have to hold the mutex to check db_state because it
+ * can't be freed while we have a hold on the buffer.
+ */
+ ASSERT(!refcount_is_zero(&db->db_holds));
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+
+ prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+ (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_CACHED) {
+ mutex_exit(&db->db_mtx);
+ if (prefetch)
+ dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ db->db.db_size, TRUE);
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+ } else if (db->db_state == DB_UNCACHED) {
+ if (zio == NULL) {
+ zio = zio_root(db->db_dnode->dn_objset->os_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+ }
+ dbuf_read_impl(db, zio, &flags);
+
+ /* dbuf_read_impl has dropped db_mtx for us */
+
+ if (prefetch)
+ dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ db->db.db_size, flags & DB_RF_CACHED);
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+
+ if (!havepzio)
+ err = zio_wait(zio);
+ } else {
+ mutex_exit(&db->db_mtx);
+ if (prefetch)
+ dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ db->db.db_size, TRUE);
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+
+ mutex_enter(&db->db_mtx);
+ if ((flags & DB_RF_NEVERWAIT) == 0) {
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL) {
+ ASSERT(db->db_state == DB_READ ||
+ (flags & DB_RF_HAVESTRUCT) == 0);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ }
+ if (db->db_state == DB_UNCACHED)
+ err = EIO;
+ }
+ mutex_exit(&db->db_mtx);
+ }
+
+ ASSERT(err || havepzio || db->db_state == DB_CACHED);
+ return (err);
+}
+
+static void
+dbuf_noread(dmu_buf_impl_t *db)
+{
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ || db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (db->db_state == DB_UNCACHED) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ db->db.db_size, db, type));
+ db->db_state = DB_FILL;
+ } else {
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ }
+ mutex_exit(&db->db_mtx);
+}
+
+/*
+ * This is our just-in-time copy function. It makes a copy of
+ * buffers that have been modified in a previous transaction
+ * group, before we modify them in the current active group.
+ *
+ * This function is used in two places: when we are dirtying a
+ * buffer for the first time in a txg, and when we are freeing
+ * a range in a dnode that includes this buffer.
+ *
+ * Note that when we are called from dbuf_free_range() we do
+ * not put a hold on the buffer; we just traverse the active
+ * dbuf list for the dnode.
+ */
+static void
+dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_level == 0);
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
+
+ if (dr == NULL ||
+ (dr->dt.dl.dr_data !=
+ ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+ return;
+
+ /*
+ * If the last dirty record for this dbuf has not yet synced
+ * and it's referencing the dbuf data, either:
+ * reset the reference to point to a new copy,
+ * or (if there are no active holders)
+ * just null out the current db_data pointer.
+ */
+ ASSERT(dr->dr_txg >= txg - 2);
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ /* Note that the data bufs here are zio_bufs */
+ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
+ } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ int size = db->db.db_size;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ dr->dt.dl.dr_data = arc_buf_alloc(
+ db->db_dnode->dn_objset->os_spa, size, db, type);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
+ } else {
+ dbuf_set_data(db, NULL);
+ }
+}
+
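+/*
+ * Undo an "override" (an immediate dmu_sync() write): free the block
+ * that the sync wrote, reset the override state, and release the ARC
+ * buffer so the dbuf is back in an ordinary dirty state.
+ */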
+void
+dbuf_unoverride(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ uint64_t txg = dr->dr_txg;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
+ ASSERT(db->db_level == 0);
+
+ if (db->db_blkid == DB_BONUS_BLKID ||
+ dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
+ return;
+
+ /* free this block */
+ if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
+ /* XXX can get silent EIO here */
+ (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
+ txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
+ }
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ /*
+ * Release the already-written buffer, so we leave it in
+ * a consistent dirty state. Note that all callers are
+ * modifying the buffer, so they will immediately do
+ * another (redundant) arc_release(). Therefore, leave
+ * the buf thawed to save the effort of freezing &
+ * immediately re-thawing it.
+ */
+ arc_release(dr->dt.dl.dr_data, db);
+}
+
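+/*
+ * Dispose of every level-0 dbuf of this dnode that falls inside
+ * [blkid, blkid + nblks): undirty it, evict it if unreferenced, flag
+ * in-flight reads and fills as freed, or zero it out if it is cached
+ * but still held.
+ */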
+void
+dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db, *db_next;
+ uint64_t txg = tx->tx_txg;
+
+ dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
+ mutex_enter(&dn->dn_dbufs_mtx);
+ for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+ db_next = list_next(&dn->dn_dbufs, db);
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ if (db->db_level != 0)
+ continue;
+ dprintf_dbuf(db, "found buf %s\n", "");
+ if (db->db_blkid < blkid ||
+ db->db_blkid >= blkid+nblks)
+ continue;
+
+ /* found a level 0 buffer in the range */
+ if (dbuf_undirty(db, tx))
+ continue;
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_UNCACHED ||
+ db->db_state == DB_EVICTING) {
+ ASSERT(db->db.db_data == NULL);
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+ if (db->db_state == DB_READ || db->db_state == DB_FILL) {
+ /* will be handled in dbuf_read_done or dbuf_rele */
+ db->db_freed_in_flight = TRUE;
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+ if (refcount_count(&db->db_holds) == 0) {
+ ASSERT(db->db_buf);
+ dbuf_clear(db);
+ continue;
+ }
+ /* The dbuf is referenced */
+
+ if (db->db_last_dirty != NULL) {
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+ if (dr->dr_txg == txg) {
+ /*
+ * This buffer is "in-use", re-adjust the file
+ * size to reflect that this buffer may
+ * contain new data when we sync.
+ */
+ if (db->db_blkid > dn->dn_maxblkid)
+ dn->dn_maxblkid = db->db_blkid;
+ dbuf_unoverride(dr);
+ } else {
+ /*
+ * This dbuf is not dirty in the open context.
+ * Either uncache it (if it's not referenced in
+ * the open context) or reset its contents to
+ * empty.
+ */
+ dbuf_fix_old_data(db, txg);
+ }
+ }
+ /* clear the contents if it's cached */
+ if (db->db_state == DB_CACHED) {
+ ASSERT(db->db.db_data != NULL);
+ arc_release(db->db_buf, db);
+ bzero(db->db.db_data, db->db.db_size);
+ arc_buf_freeze(db->db_buf);
+ }
+
+ mutex_exit(&db->db_mtx);
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+}
+
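+/*
+ * Used by the space accounting in dbuf_dirty(): returns TRUE when
+ * dirtying this dbuf consumes net-new space, i.e. (roughly) when there
+ * is no existing on-disk block, or when freeing the existing block
+ * would reclaim nothing because an earlier snapshot still references
+ * it.  Returns FALSE when the overwrite will free the old block.
+ */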
+static int
+dbuf_new_block(dmu_buf_impl_t *db)
+{
+ dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
+ uint64_t birth_txg = 0;
+
+ /* Don't count meta-objects */
+ if (ds == NULL)
+ return (FALSE);
+
+ /*
+ * We don't need any locking to protect db_blkptr:
+ * If it's syncing, then db_last_dirty will be set
+ * so we'll ignore db_blkptr.
+ */
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ /* If we have been dirtied since the last snapshot, it's not new */
+ if (db->db_last_dirty)
+ birth_txg = db->db_last_dirty->dr_txg;
+ else if (db->db_blkptr)
+ birth_txg = db->db_blkptr->blk_birth;
+
+ if (birth_txg)
+ return (!dsl_dataset_block_freeable(ds, birth_txg));
+ else
+ return (TRUE);
+}
+
+void
+dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
+{
+ arc_buf_t *buf, *obuf;
+ int osize = db->db.db_size;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+ /* XXX does *this* func really need the lock? */
+ ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
+
+ /*
+ * This call to dbuf_will_dirty() with the dn_struct_rwlock held
+ * is OK, because there can be no other references to the db
+ * when we are changing its size, so no concurrent DB_FILL can
+ * be happening.
+ */
+ /*
+ * XXX we should be doing a dbuf_read, checking the return
+ * value and returning that up to our callers
+ */
+ dbuf_will_dirty(db, tx);
+
+ /* create the data buffer for the new block */
+ buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
+
+ /* copy old block data to the new block */
+ obuf = db->db_buf;
+ bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
+ /* zero the remainder */
+ if (size > osize)
+ bzero((uint8_t *)buf->b_data + osize, size - osize);
+
+ mutex_enter(&db->db_mtx);
+ dbuf_set_data(db, buf);
+ VERIFY(arc_buf_remove_ref(obuf, db) == 1);
+ db->db.db_size = size;
+
+ if (db->db_level == 0) {
+ ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+ db->db_last_dirty->dt.dl.dr_data = buf;
+ }
+ mutex_exit(&db->db_mtx);
+
+ dnode_willuse_space(db->db_dnode, size-osize, tx);
+}
+
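+/*
+ * Mark this dbuf dirty in the given transaction: create (or return the
+ * existing) dirty record for tx's txg, make a just-in-time copy of any
+ * data still referenced by an older txg's dirty record, charge the
+ * space accounting, and recursively dirty the parent indirect (or hang
+ * the record off the dnode) so the change is found at sync time.
+ */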
+dbuf_dirty_record_t *
+dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dnode_t *dn = db->db_dnode;
+ objset_impl_t *os = dn->dn_objset;
+ dbuf_dirty_record_t **drp, *dr;
+ int drop_struct_lock = FALSE;
+ int txgoff = tx->tx_txg & TXG_MASK;
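+ /*
+ * Per-txg state is kept in small ring buffers of TXG_SIZE (4)
+ * slots, so (txg & TXG_MASK) selects this txg's slot; e.g.
+ * txg 27 maps to slot 3.  See the dn_next_nlevels[] and
+ * dn_dirty_records[] indexing below.
+ */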
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!refcount_is_zero(&db->db_holds));
+ DMU_TX_DIRTY_BUF(tx, db);
+
+ /*
+ * Shouldn't dirty a regular buffer in syncing context. Private
+ * objects may be dirtied in syncing context, but only if they
+ * were already pre-dirtied in open context.
+ * XXX We may want to prohibit dirtying in syncing context even
+ * if they did pre-dirty.
+ */
+ ASSERT(!dmu_tx_is_syncing(tx) ||
+ BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
+ dn->dn_object == DMU_META_DNODE_OBJECT ||
+ dn->dn_objset->os_dsl_dataset == NULL ||
+ dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));
+
+ /*
+ * We make this assert for private objects as well, but after we
+ * check if we're already dirty. They are allowed to re-dirty
+ * in syncing context.
+ */
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * XXX make this true for indirects too? The problem is that
+ * transactions created with dmu_tx_create_assigned() from
+ * syncing context don't bother holding ahead.
+ */
+ ASSERT(db->db_level != 0 ||
+ db->db_state == DB_CACHED || db->db_state == DB_FILL);
+
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * Don't set dirtyctx to SYNC if we're just modifying this as we
+ * initialize the objset.
+ */
+ if (dn->dn_dirtyctx == DN_UNDIRTIED &&
+ !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
+ dn->dn_dirtyctx =
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
+ ASSERT(dn->dn_dirtyctx_firstset == NULL);
+ dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ /*
+ * If this buffer is already dirty, we're done.
+ */
+ drp = &db->db_last_dirty;
+ ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
+ db->db.db_object == DMU_META_DNODE_OBJECT);
+ while (*drp && (*drp)->dr_txg > tx->tx_txg)
+ drp = &(*drp)->dr_next;
+ if (*drp && (*drp)->dr_txg == tx->tx_txg) {
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ /*
+ * If this buffer has already been written out,
+ * we now need to reset its state.
+ */
+ dbuf_unoverride(*drp);
+ if (db->db.db_object != DMU_META_DNODE_OBJECT)
+ arc_buf_thaw(db->db_buf);
+ }
+ mutex_exit(&db->db_mtx);
+ return (*drp);
+ }
+
+ /*
+ * Only valid if not already dirty.
+ */
+ ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ ASSERT3U(dn->dn_nlevels, >, db->db_level);
+ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
+ dn->dn_phys->dn_nlevels > db->db_level ||
+ dn->dn_next_nlevels[txgoff] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
+
+ /*
+ * We should only be dirtying in syncing context if it's the
+ * mos, a spa os, or we're initializing the os. However, we are
+ * allowed to dirty in syncing context provided we already
+ * dirtied it in open context. Hence we must make this
+ * assertion only if we're not already dirty.
+ */
+ ASSERT(!dmu_tx_is_syncing(tx) ||
+ os->os_dsl_dataset == NULL ||
+ !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
+ !BP_IS_HOLE(os->os_rootbp));
+ ASSERT(db->db.db_size != 0);
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ /*
+ * If this buffer is dirty in an old transaction group we need
+ * to make a copy of it so that the changes we make in this
+ * transaction group won't leak out when we sync the older txg.
+ */
+ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
+ if (db->db_level == 0) {
+ void *data_old = db->db_buf;
+
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db.db_data;
+ } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+ /*
+ * Release the data buffer from the cache so that we
+ * can modify it without impacting possible other users
+ * of this cached data block. Note that indirect
+ * blocks and private objects are not released until the
+ * syncing state (since they are only modified then).
+ */
+ arc_release(db->db_buf, db);
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db_buf;
+ }
+ ASSERT(data_old != NULL);
+ dr->dt.dl.dr_data = data_old;
+ } else {
+ mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&dr->dt.di.dr_children,
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ }
+ dr->dr_dbuf = db;
+ dr->dr_txg = tx->tx_txg;
+ dr->dr_next = *drp;
+ *drp = dr;
+
+ /*
+ * We could have been freed_in_flight between the dbuf_noread
+ * and dbuf_dirty. We win, as though the dbuf_noread() had
+ * happened after the free.
+ */
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ dnode_clear_range(dn, db->db_blkid, 1, tx);
+ mutex_exit(&dn->dn_mtx);
+ db->db_freed_in_flight = FALSE;
+ }
+
+ if (db->db_blkid != DB_BONUS_BLKID) {
+ /*
+ * Update the accounting.
+ */
+ if (!dbuf_new_block(db) && db->db_blkptr) {
+ /*
+ * This is only a guess -- if the dbuf is dirty
+ * in a previous txg, we don't know how much
+ * space it will use on disk yet. We should
+ * really have the struct_rwlock to access
+ * db_blkptr, but since this is just a guess,
+ * it's OK if we get an odd answer.
+ */
+ dnode_willuse_space(dn,
+ -bp_get_dasize(os->os_spa, db->db_blkptr), tx);
+ }
+ dnode_willuse_space(dn, db->db.db_size, tx);
+ }
+
+ /*
+ * This buffer is now part of this txg
+ */
+ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
+ db->db_dirtycnt += 1;
+ ASSERT3U(db->db_dirtycnt, <=, 3);
+
+ mutex_exit(&db->db_mtx);
+
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ dnode_setdirty(dn, tx);
+ return (dr);
+ }
+
+ if (db->db_level == 0) {
+ dnode_new_blkid(dn, db->db_blkid, tx);
+ ASSERT(dn->dn_maxblkid >= db->db_blkid);
+ }
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+
+ if (db->db_level+1 < dn->dn_nlevels) {
+ dmu_buf_impl_t *parent = db->db_parent;
+ dbuf_dirty_record_t *di;
+ int parent_held = FALSE;
+
+ if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
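+ /*
+ * epbs is log2(block pointers per indirect block); e.g. with
+ * 16K indirect blocks and 128-byte blkptrs (SPA_BLKPTRSHIFT
+ * is 7), epbs is 14 - 7 = 7, so blkid >> epbs is the blkid
+ * of the parent indirect that covers this dbuf.
+ */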
+
+ parent = dbuf_hold_level(dn, db->db_level+1,
+ db->db_blkid >> epbs, FTAG);
+ parent_held = TRUE;
+ }
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+ ASSERT3U(db->db_level+1, ==, parent->db_level);
+ di = dbuf_dirty(parent, tx);
+ if (parent_held)
+ dbuf_rele(parent, FTAG);
+
+ mutex_enter(&db->db_mtx);
+ /* possible race with dbuf_undirty() */
+ if (db->db_last_dirty == dr ||
+ dn->dn_object == DMU_META_DNODE_OBJECT) {
+ mutex_enter(&di->dt.di.dr_mtx);
+ ASSERT3U(di->dr_txg, ==, tx->tx_txg);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&di->dt.di.dr_children, dr);
+ mutex_exit(&di->dt.di.dr_mtx);
+ dr->dr_parent = di;
+ }
+ mutex_exit(&db->db_mtx);
+ } else {
+ ASSERT(db->db_level+1 == dn->dn_nlevels);
+ ASSERT(db->db_blkid < dn->dn_nblkptr);
+ ASSERT(db->db_parent == NULL ||
+ db->db_parent == db->db_dnode->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+ dnode_setdirty(dn, tx);
+ return (dr);
+}
+
+static int
+dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dnode_t *dn = db->db_dnode;
+ uint64_t txg = tx->tx_txg;
+ dbuf_dirty_record_t *dr;
+
+ ASSERT(txg != 0);
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+ mutex_enter(&db->db_mtx);
+
+ /*
+ * If this buffer is not dirty, we're done.
+ */
+ for (dr = db->db_last_dirty; dr; dr = dr->dr_next)
+ if (dr->dr_txg <= txg)
+ break;
+ if (dr == NULL || dr->dr_txg < txg) {
+ mutex_exit(&db->db_mtx);
+ return (0);
+ }
+ ASSERT(dr->dr_txg == txg);
+
+ /*
+ * If this buffer is currently held, we cannot undirty
+ * it, since one of the current holders may be in the
+ * middle of an update. Note that users of dbuf_undirty()
+ * should not place a hold on the dbuf before the call.
+ */
+ if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ mutex_exit(&db->db_mtx);
+ /* Make sure we don't toss this buffer at sync phase */
+ mutex_enter(&dn->dn_mtx);
+ dnode_clear_range(dn, db->db_blkid, 1, tx);
+ mutex_exit(&dn->dn_mtx);
+ return (0);
+ }
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ ASSERT(db->db.db_size != 0);
+
+ /* XXX would be nice to fix up dn_towrite_space[] */
+
+ db->db_last_dirty = dr->dr_next;
+
+ if (dr->dr_parent) {
+ mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
+ list_remove(&dr->dr_parent->dt.di.dr_children, dr);
+ mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
+ } else if (db->db_level+1 == dn->dn_nlevels) {
+ ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ if (db->db_level == 0) {
+ dbuf_unoverride(dr);
+
+ ASSERT(db->db_buf != NULL);
+ ASSERT(dr->dt.dl.dr_data != NULL);
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
+ } else {
+ ASSERT(db->db_buf != NULL);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ /* XXX - mutex and list destroy? */
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+
+ if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
+ arc_buf_t *buf = db->db_buf;
+
+ ASSERT(arc_released(buf));
+ dbuf_set_data(db, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ dbuf_evict(db);
+ return (1);
+ }
+
+ mutex_exit(&db->db_mtx);
+ return (0);
+}
+
+#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
+void
+dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ int rf = DB_RF_MUST_SUCCEED;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!refcount_is_zero(&db->db_holds));
+
+ if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
+ rf |= DB_RF_HAVESTRUCT;
+ (void) dbuf_read(db, NULL, rf);
+ (void) dbuf_dirty(db, tx);
+}
+
+void
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(db->db_level == 0);
+ ASSERT(!refcount_is_zero(&db->db_holds));
+
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
+ dmu_tx_private_ok(tx));
+
+ dbuf_noread(db);
+ (void) dbuf_dirty(db, tx);
+}
+
+#pragma weak dmu_buf_fill_done = dbuf_fill_done
+/* ARGSUSED */
+void
+dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ mutex_enter(&db->db_mtx);
+ DBUF_VERIFY(db);
+
+ if (db->db_state == DB_FILL) {
+ if (db->db_level == 0 && db->db_freed_in_flight) {
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ /* we were freed while filling */
+ /* XXX dbuf_undirty? */
+ bzero(db->db.db_data, db->db.db_size);
+ db->db_freed_in_flight = FALSE;
+ }
+ db->db_state = DB_CACHED;
+ cv_broadcast(&db->db_changed);
+ }
+ mutex_exit(&db->db_mtx);
+}
+
+/*
+ * "Clear" the contents of this dbuf. This will mark the dbuf
+ * EVICTING and clear *most* of its references. Unfortunately,
+ * when we are not holding the dn_dbufs_mtx, we can't clear the
+ * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
+ * in this case. For callers from the DMU we will usually see:
+ * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
+ * For the arc callback, we will usually see:
+ * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
+ * Sometimes, though, we will get a mix of these two:
+ * DMU: dbuf_clear()->arc_buf_evict()
+ * ARC: dbuf_do_evict()->dbuf_destroy()
+ */
+void
+dbuf_clear(dmu_buf_impl_t *db)
+{
+ dnode_t *dn = db->db_dnode;
+ dmu_buf_impl_t *parent = db->db_parent;
+ dmu_buf_impl_t *dndb = dn->dn_dbuf;
+ int dbuf_gone = FALSE;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(refcount_is_zero(&db->db_holds));
+
+ dbuf_evict_user(db);
+
+ if (db->db_state == DB_CACHED) {
+ ASSERT(db->db.db_data != NULL);
+ if (db->db_blkid == DB_BONUS_BLKID)
+ zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
+ db->db.db_data = NULL;
+ db->db_state = DB_UNCACHED;
+ }
+
+ ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ ASSERT(db->db_data_pending == NULL);
+
+ db->db_state = DB_EVICTING;
+ db->db_blkptr = NULL;
+
+ if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
+ list_remove(&dn->dn_dbufs, db);
+ dnode_rele(dn, db);
+ }
+
+ if (db->db_buf)
+ dbuf_gone = arc_buf_evict(db->db_buf);
+
+ if (!dbuf_gone)
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * If this dbuf is referenced from an indirect dbuf,
+ * decrement the ref count on the indirect dbuf.
+ */
+ if (parent && parent != dndb)
+ dbuf_rele(parent, db);
+}
+
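+/*
+ * Find the block pointer that references block 'blkid' at 'level',
+ * along with the dbuf (or dnode buffer) that holds it.  Returns ENOENT
+ * if the buffer has no parent yet; on success *bpp points into the
+ * parent's data and *parentp, if set, carries a hold that the caller
+ * must release.
+ */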
+static int
+dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
+ dmu_buf_impl_t **parentp, blkptr_t **bpp)
+{
+ int nlevels, epbs;
+
+ *parentp = NULL;
+ *bpp = NULL;
+
+ ASSERT(blkid != DB_BONUS_BLKID);
+
+ if (dn->dn_phys->dn_nlevels == 0)
+ nlevels = 1;
+ else
+ nlevels = dn->dn_phys->dn_nlevels;
+
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ ASSERT3U(level * epbs, <, 64);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ if (level >= nlevels ||
+ (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
+ /* the buffer has no parent yet */
+ return (ENOENT);
+ } else if (level < nlevels-1) {
+ /* this block is referenced from an indirect block */
+ int err = dbuf_hold_impl(dn, level+1,
+ blkid >> epbs, fail_sparse, NULL, parentp);
+ if (err)
+ return (err);
+ err = dbuf_read(*parentp, NULL,
+ (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+ if (err) {
+ dbuf_rele(*parentp, NULL);
+ *parentp = NULL;
+ return (err);
+ }
+ *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+ (blkid & ((1ULL << epbs) - 1));
+ return (0);
+ } else {
+ /* the block is referenced from the dnode */
+ ASSERT3U(level, ==, nlevels-1);
+ ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
+ blkid < dn->dn_phys->dn_nblkptr);
+ if (dn->dn_dbuf) {
+ dbuf_add_ref(dn->dn_dbuf, NULL);
+ *parentp = dn->dn_dbuf;
+ }
+ *bpp = &dn->dn_phys->dn_blkptr[blkid];
+ return (0);
+ }
+}
+
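+/*
+ * Allocate and initialize a new dbuf.  The bonus dbuf is special: it is
+ * never hashed.  All other dbufs are inserted into the hash table and
+ * onto the dnode's dbuf list under dn_dbufs_mtx; if another thread won
+ * the race to insert, we free ours and return the winner.
+ */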
+static dmu_buf_impl_t *
+dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
+ dmu_buf_impl_t *parent, blkptr_t *blkptr)
+{
+ objset_impl_t *os = dn->dn_objset;
+ dmu_buf_impl_t *db, *odb;
+
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+
+ db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
+
+ db->db_objset = os;
+ db->db.db_object = dn->dn_object;
+ db->db_level = level;
+ db->db_blkid = blkid;
+ db->db_last_dirty = NULL;
+ db->db_dirtycnt = 0;
+ db->db_dnode = dn;
+ db->db_parent = parent;
+ db->db_blkptr = blkptr;
+
+ db->db_user_ptr = NULL;
+ db->db_user_data_ptr_ptr = NULL;
+ db->db_evict_func = NULL;
+ db->db_immediate_evict = 0;
+ db->db_freed_in_flight = 0;
+
+ if (blkid == DB_BONUS_BLKID) {
+ ASSERT3P(parent, ==, dn->dn_dbuf);
+ db->db.db_size = dn->dn_bonuslen;
+ db->db.db_offset = DB_BONUS_BLKID;
+ db->db_state = DB_UNCACHED;
+ /* the bonus dbuf is not placed in the hash table */
+ return (db);
+ } else {
+ int blocksize =
+ db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
+ db->db.db_size = blocksize;
+ db->db.db_offset = db->db_blkid * blocksize;
+ }
+
+ /*
+ * Hold the dn_dbufs_mtx while we insert the new dbuf into
+ * the hash table *and* add it to the dn_dbufs list.
+ * This prevents a possible deadlock with someone
+ * trying to look up this dbuf before it's added to the
+ * dn_dbufs list.
+ */
+ mutex_enter(&dn->dn_dbufs_mtx);
+ db->db_state = DB_EVICTING;
+ if ((odb = dbuf_hash_insert(db)) != NULL) {
+ /* someone else inserted it first */
+ kmem_cache_free(dbuf_cache, db);
+ mutex_exit(&dn->dn_dbufs_mtx);
+ return (odb);
+ }
+ list_insert_head(&dn->dn_dbufs, db);
+ db->db_state = DB_UNCACHED;
+ mutex_exit(&dn->dn_dbufs_mtx);
+
+ if (parent && parent != dn->dn_dbuf)
+ dbuf_add_ref(parent, db);
+
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ refcount_count(&dn->dn_holds) > 0);
+ (void) refcount_add(&dn->dn_holds, db);
+
+ dprintf_dbuf(db, "db=%p\n", db);
+
+ return (db);
+}
+
+static int
+dbuf_do_evict(void *private)
+{
+ arc_buf_t *buf = private;
+ dmu_buf_impl_t *db = buf->b_private;
+
+ if (!MUTEX_HELD(&db->db_mtx))
+ mutex_enter(&db->db_mtx);
+
+ ASSERT(refcount_is_zero(&db->db_holds));
+
+ if (db->db_state != DB_EVICTING) {
+ ASSERT(db->db_state == DB_CACHED);
+ DBUF_VERIFY(db);
+ db->db_buf = NULL;
+ dbuf_evict(db);
+ } else {
+ mutex_exit(&db->db_mtx);
+ dbuf_destroy(db);
+ }
+ return (0);
+}
+
+static void
+dbuf_destroy(dmu_buf_impl_t *db)
+{
+ ASSERT(refcount_is_zero(&db->db_holds));
+
+ if (db->db_blkid != DB_BONUS_BLKID) {
+ dnode_t *dn = db->db_dnode;
+
+ /*
+ * If this dbuf is still on the dn_dbufs list,
+ * remove it from that list.
+ */
+ if (list_link_active(&db->db_link)) {
+ mutex_enter(&dn->dn_dbufs_mtx);
+ list_remove(&dn->dn_dbufs, db);
+ mutex_exit(&dn->dn_dbufs_mtx);
+
+ dnode_rele(dn, db);
+ }
+ dbuf_hash_remove(db);
+ }
+ db->db_parent = NULL;
+ db->db_dnode = NULL;
+ db->db_buf = NULL;
+
+ ASSERT(db->db.db_data == NULL);
+ ASSERT(db->db_hash_next == NULL);
+ ASSERT(db->db_blkptr == NULL);
+ ASSERT(db->db_data_pending == NULL);
+
+ kmem_cache_free(dbuf_cache, db);
+}
+
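+/*
+ * Issue a speculative, asynchronous ARC read for level-0 block 'blkid',
+ * unless the block was freed in this txg, an active dbuf already covers
+ * it, or its block pointer is a hole.
+ */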
+void
+dbuf_prefetch(dnode_t *dn, uint64_t blkid)
+{
+ dmu_buf_impl_t *db = NULL;
+ blkptr_t *bp = NULL;
+
+ ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+ if (dnode_block_freed(dn, blkid))
+ return;
+
+ /* dbuf_find() returns with db_mtx held */
+ if ((db = dbuf_find(dn, 0, blkid)) != NULL) {
+ if (refcount_count(&db->db_holds) > 0) {
+ /*
+ * This dbuf is active. We assume that it is
+ * already CACHED, or else about to be either
+ * read or filled.
+ */
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+ mutex_exit(&db->db_mtx);
+ db = NULL;
+ }
+
+ if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
+ if (bp && !BP_IS_HOLE(bp)) {
+ uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
+ zbookmark_t zb;
+ zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
+ dn->dn_objset->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = dn->dn_object;
+ zb.zb_level = 0;
+ zb.zb_blkid = blkid;
+
+ (void) arc_read(NULL, dn->dn_objset->os_spa, bp,
+ dmu_ot[dn->dn_type].ot_byteswap,
+ NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &aflags, &zb);
+ }
+ if (db)
+ dbuf_rele(db, NULL);
+ }
+}
+
+/*
+ * Returns with db_holds incremented, and db_mtx not held.
+ * Note: dn_struct_rwlock must be held.
+ */
+int
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+ void *tag, dmu_buf_impl_t **dbp)
+{
+ dmu_buf_impl_t *db, *parent = NULL;
+
+ ASSERT(blkid != DB_BONUS_BLKID);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ ASSERT3U(dn->dn_nlevels, >, level);
+
+ *dbp = NULL;
+top:
+ /* dbuf_find() returns with db_mtx held */
+ db = dbuf_find(dn, level, blkid);
+
+ if (db == NULL) {
+ blkptr_t *bp = NULL;
+ int err;
+
+ ASSERT3P(parent, ==, NULL);
+ err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
+ if (fail_sparse) {
+ if (err == 0 && bp && BP_IS_HOLE(bp))
+ err = ENOENT;
+ if (err) {
+ if (parent)
+ dbuf_rele(parent, NULL);
+ return (err);
+ }
+ }
+ if (err && err != ENOENT)
+ return (err);
+ db = dbuf_create(dn, level, blkid, parent, bp);
+ }
+
+ if (db->db_buf && refcount_is_zero(&db->db_holds)) {
+ arc_buf_add_ref(db->db_buf, db);
+ if (db->db_buf->b_data == NULL) {
+ dbuf_clear(db);
+ if (parent) {
+ dbuf_rele(parent, NULL);
+ parent = NULL;
+ }
+ goto top;
+ }
+ ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+ }
+
+ ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
+
+ /*
+ * If this buffer is currently syncing out, and we are
+ * still referencing it from db_data, we need to make a copy
+ * of it in case we decide we want to dirty it again in this txg.
+ */
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
+ db->db_state == DB_CACHED && db->db_data_pending) {
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+
+ if (dr->dt.dl.dr_data == db->db_buf) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+ dbuf_set_data(db,
+ arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ db->db.db_size, db, type));
+ bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
+ db->db.db_size);
+ }
+ }
+
+ (void) refcount_add(&db->db_holds, tag);
+ dbuf_update_data(db);
+ DBUF_VERIFY(db);
+ mutex_exit(&db->db_mtx);
+
+ /* NOTE: we can't rele the parent until after we drop the db_mtx */
+ if (parent)
+ dbuf_rele(parent, NULL);
+
+ ASSERT3P(db->db_dnode, ==, dn);
+ ASSERT3U(db->db_blkid, ==, blkid);
+ ASSERT3U(db->db_level, ==, level);
+ *dbp = db;
+
+ return (0);
+}
+
+dmu_buf_impl_t *
+dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
+{
+ dmu_buf_impl_t *db;
+ int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
+ return (err ? NULL : db);
+}
+
+dmu_buf_impl_t *
+dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
+{
+ dmu_buf_impl_t *db;
+ int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+ return (err ? NULL : db);
+}
+
+dmu_buf_impl_t *
+dbuf_create_bonus(dnode_t *dn)
+{
+ dmu_buf_impl_t *db;
+
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ ASSERT(dn->dn_bonus == NULL);
+ db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
+ return (db);
+}
+
+#pragma weak dmu_buf_add_ref = dbuf_add_ref
+void
+dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
+{
+ int64_t holds = refcount_add(&db->db_holds, tag);
+ ASSERT(holds > 1);
+}
+
+#pragma weak dmu_buf_rele = dbuf_rele
+void
+dbuf_rele(dmu_buf_impl_t *db, void *tag)
+{
+ int64_t holds;
+
+ mutex_enter(&db->db_mtx);
+ DBUF_VERIFY(db);
+
+ holds = refcount_remove(&db->db_holds, tag);
+ ASSERT(holds >= 0);
+
+ /*
+ * We can't freeze indirects if there is a possibility that they
+ * may be modified in the current syncing context.
+ */
+ if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
+ arc_buf_freeze(db->db_buf);
+
+ if (holds == db->db_dirtycnt &&
+ db->db_level == 0 && db->db_immediate_evict)
+ dbuf_evict_user(db);
+
+ if (holds == 0) {
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ mutex_exit(&db->db_mtx);
+ dnode_rele(db->db_dnode, db);
+ } else if (db->db_buf == NULL) {
+ /*
+ * This is a special case: we never associated this
+ * dbuf with any data allocated from the ARC.
+ */
+ ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ dbuf_evict(db);
+ } else if (arc_released(db->db_buf)) {
+ arc_buf_t *buf = db->db_buf;
+ /*
+ * This dbuf has anonymous data associated with it.
+ */
+ dbuf_set_data(db, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ dbuf_evict(db);
+ } else {
+ VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
+ mutex_exit(&db->db_mtx);
+ }
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+}
+
+#pragma weak dmu_buf_refcount = dbuf_refcount
+uint64_t
+dbuf_refcount(dmu_buf_impl_t *db)
+{
+ return (refcount_count(&db->db_holds));
+}
+
+void *
+dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *evict_func)
+{
+ return (dmu_buf_update_user(db_fake, NULL, user_ptr,
+ user_data_ptr_ptr, evict_func));
+}
+
+void *
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *evict_func)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ db->db_immediate_evict = TRUE;
+ return (dmu_buf_update_user(db_fake, NULL, user_ptr,
+ user_data_ptr_ptr, evict_func));
+}
+
+void *
+dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
+ void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ ASSERT(db->db_level == 0);
+
+ ASSERT((user_ptr == NULL) == (evict_func == NULL));
+
+ mutex_enter(&db->db_mtx);
+
+ if (db->db_user_ptr == old_user_ptr) {
+ db->db_user_ptr = user_ptr;
+ db->db_user_data_ptr_ptr = user_data_ptr_ptr;
+ db->db_evict_func = evict_func;
+
+ dbuf_update_data(db);
+ } else {
+ old_user_ptr = db->db_user_ptr;
+ }
+
+ mutex_exit(&db->db_mtx);
+ return (old_user_ptr);
+}
+
+void *
+dmu_buf_get_user(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ ASSERT(!refcount_is_zero(&db->db_holds));
+
+ return (db->db_user_ptr);
+}
+
+static void
+dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
+{
+ /* ASSERT(dmu_tx_is_syncing(tx)) */
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_blkptr != NULL)
+ return;
+
+ if (db->db_level == dn->dn_phys->dn_nlevels-1) {
+ /*
+ * This buffer was allocated at a time when there were
+ * no available blkptrs from the dnode, or it was
+ * inappropriate to hook it in (i.e., nlevels mismatch).
+ */
+ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
+ ASSERT(db->db_parent == NULL);
+ db->db_parent = dn->dn_dbuf;
+ db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
+ DBUF_VERIFY(db);
+ } else {
+ dmu_buf_impl_t *parent = db->db_parent;
+ int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ ASSERT(dn->dn_phys->dn_nlevels > 1);
+ if (parent == NULL) {
+ mutex_exit(&db->db_mtx);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ (void) dbuf_hold_impl(dn, db->db_level+1,
+ db->db_blkid >> epbs, FALSE, db, &parent);
+ rw_exit(&dn->dn_struct_rwlock);
+ mutex_enter(&db->db_mtx);
+ db->db_parent = parent;
+ }
+ db->db_blkptr = (blkptr_t *)parent->db.db_data +
+ (db->db_blkid & ((1ULL << epbs) - 1));
+ DBUF_VERIFY(db);
+ }
+}
+
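+/*
+ * Sync an indirect dbuf: make sure it is cached and hooked into its
+ * parent's blkptr array, issue its write, then sync all of its dirty
+ * children into that write's zio before letting the write proceed.
+ */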
+static void
+dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = db->db_dnode;
+ zio_t *zio;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+ mutex_enter(&db->db_mtx);
+
+ ASSERT(db->db_level > 0);
+ DBUF_VERIFY(db);
+
+ if (db->db_buf == NULL) {
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+ mutex_enter(&db->db_mtx);
+ }
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ ASSERT(db->db_buf != NULL);
+
+ dbuf_check_blkptr(dn, db);
+
+ db->db_data_pending = dr;
+
+ arc_release(db->db_buf, db);
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4,
+ zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx);
+
+ zio = dr->dr_zio;
+ mutex_enter(&dr->dt.di.dr_mtx);
+ dbuf_sync_list(&dr->dt.di.dr_children, tx);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ mutex_exit(&dr->dt.di.dr_mtx);
+ zio_nowait(zio);
+}
+
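+/*
+ * Sync a level-0 dbuf.  Bonus buffers are copied straight into the
+ * dnode; buffers already written via dmu_sync() just have their block
+ * pointers patched in; everything else goes through dbuf_write(), with
+ * a copy made first if the open txg still references the data.
+ */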
+static void
+dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ arc_buf_t **datap = &dr->dt.dl.dr_data;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = db->db_dnode;
+ objset_impl_t *os = dn->dn_objset;
+ uint64_t txg = tx->tx_txg;
+ int checksum, compress;
+ int blksz;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * To be synced, we must be dirtied. But we
+ * might have been freed after the dirty.
+ */
+ if (db->db_state == DB_UNCACHED) {
+ /* This buffer has been freed since it was dirtied */
+ ASSERT(db->db.db_data == NULL);
+ } else if (db->db_state == DB_FILL) {
+ /* This buffer was freed and is now being re-filled */
+ ASSERT(db->db.db_data != dr->dt.dl.dr_data);
+ } else {
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ }
+ DBUF_VERIFY(db);
+
+ /*
+ * If this is a bonus buffer, simply copy the bonus data into the
+ * dnode. It will be written out when the dnode is synced (and it
+ * will be synced, since it must have been dirty for dbuf_sync to
+ * be called).
+ */
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ dbuf_dirty_record_t **drp;
+ /*
+ * Use dn_phys->dn_bonuslen since db.db_size is the length
+ * of the bonus buffer in the open transaction rather than
+ * the syncing transaction.
+ */
+ ASSERT(*datap != NULL);
+ ASSERT3U(db->db_level, ==, 0);
+ ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+ if (*datap != db->db.db_data)
+ zio_buf_free(*datap, DN_MAX_BONUSLEN);
+ db->db_data_pending = NULL;
+ drp = &db->db_last_dirty;
+ while (*drp != dr)
+ drp = &(*drp)->dr_next;
+ ASSERT((*drp)->dr_next == NULL);
+ *drp = NULL;
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+ mutex_exit(&db->db_mtx);
+ dbuf_rele(db, (void *)(uintptr_t)txg);
+ return;
+ }
+
+ /*
+ * If this buffer is in the middle of an immediate write,
+ * wait for the synchronous IO to complete.
+ */
+ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
+ }
+
+ dbuf_check_blkptr(dn, db);
+
+ /*
+ * If this dbuf has already been written out via an immediate write,
+ * just complete the write by copying over the new block pointer and
+ * updating the accounting via the write-completion functions.
+ */
+ if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ zio_t zio_fake;
+
+ zio_fake.io_private = &db;
+ zio_fake.io_error = 0;
+ zio_fake.io_bp = db->db_blkptr;
+ zio_fake.io_bp_orig = *db->db_blkptr;
+ zio_fake.io_txg = txg;
+
+ *db->db_blkptr = dr->dt.dl.dr_overridden_by;
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ db->db_data_pending = dr;
+ dr->dr_zio = &zio_fake;
+ mutex_exit(&db->db_mtx);
+
+ if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
+ dsl_dataset_block_kill(os->os_dsl_dataset,
+ &zio_fake.io_bp_orig, dn->dn_zio, tx);
+
+ dbuf_write_ready(&zio_fake, db->db_buf, db);
+ dbuf_write_done(&zio_fake, db->db_buf, db);
+
+ return;
+ }
+
+ blksz = arc_buf_size(*datap);
+
+ if (dn->dn_object != DMU_META_DNODE_OBJECT) {
+ /*
+ * If this buffer is currently "in use" (i.e., there are
+ * active holds and db_data still references it), then make
+ * a copy before we start the write so that any modifications
+ * from the open txg will not leak into this write.
+ *
+ * NOTE: this copy does not need to be made for objects only
+ * modified in the syncing context (e.g. DMU_OT_DNODE blocks).
+ */
+ if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
+ bcopy(db->db.db_data, (*datap)->b_data, blksz);
+ }
+ } else {
+ /*
+ * Private object buffers are released here rather
+ * than in dbuf_dirty() since they are only modified
+ * in the syncing context and we don't want the
+ * overhead of making multiple copies of the data.
+ */
+ arc_release(db->db_buf, db);
+ }
+
+ ASSERT(*datap != NULL);
+ db->db_data_pending = dr;
+
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * Allow dnode settings to override objset settings,
+ * except for metadata checksums.
+ */
+ if (dmu_ot[dn->dn_type].ot_metadata) {
+ checksum = os->os_md_checksum;
+ compress = zio_compress_select(dn->dn_compress,
+ os->os_md_compress);
+ } else {
+ checksum = zio_checksum_select(dn->dn_checksum,
+ os->os_checksum);
+ compress = zio_compress_select(dn->dn_compress,
+ os->os_compress);
+ }
+
+ dbuf_write(dr, *datap, checksum, compress, tx);
+
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ if (dn->dn_object == DMU_META_DNODE_OBJECT)
+ list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
+ else
+ zio_nowait(dr->dr_zio);
+}
+
+void
+dbuf_sync_list(list_t *list, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr;
+
+ while ((dr = list_head(list)) != NULL) {
+ if (dr->dr_zio != NULL) {
+ /*
+ * If we find an already initialized zio then we
+ * are processing the meta-dnode, and we have finished.
+ * The dbufs for all dnodes are put back on the list
+ * during processing, so that we can zio_wait()
+ * these IOs after initiating all child IOs.
+ */
+ ASSERT3U(dr->dr_dbuf->db.db_object, ==,
+ DMU_META_DNODE_OBJECT);
+ break;
+ }
+ list_remove(list, dr);
+ if (dr->dr_dbuf->db_level > 0)
+ dbuf_sync_indirect(dr, tx);
+ else
+ dbuf_sync_leaf(dr, tx);
+ }
+}
+
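+/*
+ * Issue the write for a dirty record: find the parent zio (the parent
+ * dbuf's in-flight write, or else the dnode's zio), kill the block
+ * being overwritten if it was born in an earlier txg, and start an
+ * arc_write() whose zio is recorded in dr->dr_zio.
+ */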
+static void
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
+ int compress, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = db->db_dnode;
+ objset_impl_t *os = dn->dn_objset;
+ dmu_buf_impl_t *parent = db->db_parent;
+ uint64_t txg = tx->tx_txg;
+ zbookmark_t zb;
+ zio_t *zio;
+ int zio_flags;
+
+ if (parent != dn->dn_dbuf) {
+ ASSERT(parent && parent->db_data_pending);
+ ASSERT(db->db_level == parent->db_level-1);
+ ASSERT(arc_released(parent->db_buf));
+ zio = parent->db_data_pending->dr_zio;
+ } else {
+ ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ zio = dn->dn_zio;
+ }
+
+ ASSERT(db->db_level == 0 || data == db->db_buf);
+ ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+ ASSERT(zio);
+
+ zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+
+ zio_flags = ZIO_FLAG_MUSTSUCCEED;
+ if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
+ zio_flags |= ZIO_FLAG_METADATA;
+ if (BP_IS_OLDER(db->db_blkptr, txg))
+ dsl_dataset_block_kill(
+ os->os_dsl_dataset, db->db_blkptr, zio, tx);
+
+ dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
+ dmu_get_replication_level(os, &zb, dn->dn_type), txg,
+ db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
+}
+
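+/*
+ * Write "ready" callback: update the space accounting and compute the
+ * block's fill count -- the number of allocated dnodes for a dnode
+ * block, 1 for any other leaf block, or the sum of the children's fill
+ * counts for an indirect block.
+ */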
+/* ARGSUSED */
+static void
+dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ dnode_t *dn = db->db_dnode;
+ objset_impl_t *os = dn->dn_objset;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ uint64_t fill = 0;
+ int old_size, new_size, i;
+
+ dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
+
+ old_size = bp_get_dasize(os->os_spa, bp_orig);
+ new_size = bp_get_dasize(os->os_spa, zio->io_bp);
+
+ dnode_diduse_space(dn, new_size-old_size);
+
+ if (BP_IS_HOLE(zio->io_bp)) {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ if (bp_orig->blk_birth == tx->tx_txg)
+ dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
+ ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
+ return;
+ }
+
+ mutex_enter(&db->db_mtx);
+
+ if (db->db_level == 0) {
+ mutex_enter(&dn->dn_mtx);
+ if (db->db_blkid > dn->dn_phys->dn_maxblkid)
+ dn->dn_phys->dn_maxblkid = db->db_blkid;
+ mutex_exit(&dn->dn_mtx);
+
+ if (dn->dn_type == DMU_OT_DNODE) {
+ dnode_phys_t *dnp = db->db.db_data;
+ for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
+ i--, dnp++) {
+ if (dnp->dn_type != DMU_OT_NONE)
+ fill++;
+ }
+ } else {
+ fill = 1;
+ }
+ } else {
+ blkptr_t *bp = db->db.db_data;
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ ASSERT3U(BP_GET_LSIZE(bp), ==,
+ db->db_level == 1 ? dn->dn_datablksz :
+ (1<<dn->dn_phys->dn_indblkshift));
+ fill += bp->blk_fill;
+ }
+ }
+
+ db->db_blkptr->blk_fill = fill;
+ BP_SET_TYPE(db->db_blkptr, dn->dn_type);
+ BP_SET_LEVEL(db->db_blkptr, db->db_level);
+
+ mutex_exit(&db->db_mtx);
+
+ /* We must do this after we've set the bp's type and level */
+ if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ if (bp_orig->blk_birth == tx->tx_txg)
+ dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
+ dsl_dataset_block_born(ds, zio->io_bp, tx);
+ }
+}
+
+/* ARGSUSED */
+static void
+dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ uint64_t txg = zio->io_txg;
+ dbuf_dirty_record_t **drp, *dr;
+
+ ASSERT3U(zio->io_error, ==, 0);
+
+ mutex_enter(&db->db_mtx);
+
+ drp = &db->db_last_dirty;
+ while (*drp != db->db_data_pending)
+ drp = &(*drp)->dr_next;
+ ASSERT(!list_link_active(&(*drp)->dr_dirty_node));
+ ASSERT((*drp)->dr_txg == txg);
+ ASSERT((*drp)->dr_next == NULL);
+ dr = *drp;
+ *drp = NULL;
+
+ if (db->db_level == 0) {
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
+ else if (!BP_IS_HOLE(db->db_blkptr))
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ else
+ ASSERT(arc_released(db->db_buf));
+ } else {
+ dnode_t *dn = db->db_dnode;
+
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ if (!BP_IS_HOLE(db->db_blkptr)) {
+ int epbs =
+ dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+ db->db.db_size);
+ ASSERT3U(dn->dn_phys->dn_maxblkid
+ >> (db->db_level * epbs), >=, db->db_blkid);
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ }
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+
+ cv_broadcast(&db->db_changed);
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+ db->db_data_pending = NULL;
+ mutex_exit(&db->db_mtx);
+
+ dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
+
+ dbuf_rele(db, (void *)(uintptr_t)txg);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
new file mode 100644
index 0000000..d3be6b4
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -0,0 +1,1029 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_prop.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+
+const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
+ { byteswap_uint8_array, TRUE, "unallocated" },
+ { zap_byteswap, TRUE, "object directory" },
+ { byteswap_uint64_array, TRUE, "object array" },
+ { byteswap_uint8_array, TRUE, "packed nvlist" },
+ { byteswap_uint64_array, TRUE, "packed nvlist size" },
+ { byteswap_uint64_array, TRUE, "bplist" },
+ { byteswap_uint64_array, TRUE, "bplist header" },
+ { byteswap_uint64_array, TRUE, "SPA space map header" },
+ { byteswap_uint64_array, TRUE, "SPA space map" },
+ { byteswap_uint64_array, TRUE, "ZIL intent log" },
+ { dnode_buf_byteswap, TRUE, "DMU dnode" },
+ { dmu_objset_byteswap, TRUE, "DMU objset" },
+ { byteswap_uint64_array, TRUE, "DSL directory" },
+ { zap_byteswap, TRUE, "DSL directory child map"},
+ { zap_byteswap, TRUE, "DSL dataset snap map" },
+ { zap_byteswap, TRUE, "DSL props" },
+ { byteswap_uint64_array, TRUE, "DSL dataset" },
+ { zfs_znode_byteswap, TRUE, "ZFS znode" },
+ { zfs_acl_byteswap, TRUE, "ZFS ACL" },
+ { byteswap_uint8_array, FALSE, "ZFS plain file" },
+ { zap_byteswap, TRUE, "ZFS directory" },
+ { zap_byteswap, TRUE, "ZFS master node" },
+ { zap_byteswap, TRUE, "ZFS delete queue" },
+ { byteswap_uint8_array, FALSE, "zvol object" },
+ { zap_byteswap, TRUE, "zvol prop" },
+ { byteswap_uint8_array, FALSE, "other uint8[]" },
+ { byteswap_uint64_array, FALSE, "other uint64[]" },
+ { zap_byteswap, TRUE, "other ZAP" },
+ { zap_byteswap, TRUE, "persistent error log" },
+ { byteswap_uint8_array, TRUE, "SPA history" },
+ { byteswap_uint64_array, TRUE, "SPA history offsets" },
+ { zap_byteswap, TRUE, "Pool properties" },
+};
+
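+/*
+ * Hold (and read in) the single buffer covering 'offset'.  On success
+ * the caller owns a hold tagged with 'tag' that must eventually be
+ * dropped with dmu_buf_rele().
+ */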
+int
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp)
+{
+ dnode_t *dn;
+ uint64_t blkid;
+ dmu_buf_impl_t *db;
+ int err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ blkid = dbuf_whichblock(dn, offset);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ db = dbuf_hold(dn, blkid, tag);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (db == NULL) {
+ err = EIO;
+ } else {
+ err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+ if (err) {
+ dbuf_rele(db, tag);
+ db = NULL;
+ }
+ }
+
+ dnode_rele(dn, FTAG);
+ *dbp = &db->db;
+ return (err);
+}
+
+int
+dmu_bonus_max(void)
+{
+ return (DN_MAX_BONUSLEN);
+}
+
+/*
+ * returns ENOENT, EIO, or 0.
+ */
+int
+dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
+{
+ dnode_t *dn;
+ int err, count;
+ dmu_buf_impl_t *db;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_bonus == NULL) {
+ rw_exit(&dn->dn_struct_rwlock);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus == NULL)
+ dn->dn_bonus = dbuf_create_bonus(dn);
+ }
+ db = dn->dn_bonus;
+ rw_exit(&dn->dn_struct_rwlock);
+ mutex_enter(&db->db_mtx);
+ count = refcount_add(&db->db_holds, tag);
+ mutex_exit(&db->db_mtx);
+ if (count == 1)
+ dnode_add_ref(dn, db);
+ dnode_rele(dn, FTAG);
+
+ VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
+
+ *dbp = &db->db;
+ return (0);
+}
+
+/*
+ * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
+ * to take a held dnode rather than <os, object> -- the lookup is wasteful,
+ * and can induce severe lock contention when writing to several files
+ * whose dnodes are in the same block.
+ */
+static int
+dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dmu_buf_t **dbp;
+ uint64_t blkid, nblks, i;
+ uint32_t flags;
+ int err;
+ zio_t *zio;
+
+ ASSERT(length <= DMU_MAX_ACCESS);
+
+ flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
+ if (length > zfetch_array_rd_sz)
+ flags |= DB_RF_NOPREFETCH;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_datablkshift) {
+ int blkshift = dn->dn_datablkshift;
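+ /*
+ * Count the data blocks the request touches; e.g. with 128K
+ * blocks (blkshift 17), offset 100K and length 200K touch
+ * blocks 0 through 2, so nblks is 3.
+ */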
+ nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
+ P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
+ } else {
+ if (offset + length > dn->dn_datablksz) {
+ zfs_panic_recover("zfs: accessing past end of object "
+ "%llx/%llx (size=%u access=%llu+%llu)",
+ (longlong_t)dn->dn_objset->
+ os_dsl_dataset->ds_object,
+ (longlong_t)dn->dn_object, dn->dn_datablksz,
+ (longlong_t)offset, (longlong_t)length);
+ return (EIO);
+ }
+ nblks = 1;
+ }
+ dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+
+ zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE);
+ blkid = dbuf_whichblock(dn, offset);
+ for (i = 0; i < nblks; i++) {
+ dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
+ if (db == NULL) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dmu_buf_rele_array(dbp, nblks, tag);
+ zio_nowait(zio);
+ return (EIO);
+ }
+ /* initiate async i/o */
+ if (read) {
+ rw_exit(&dn->dn_struct_rwlock);
+ (void) dbuf_read(db, zio, flags);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ }
+ dbp[i] = &db->db;
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+
+ /* wait for async i/o */
+ err = zio_wait(zio);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
+
+ /* wait for other io to complete */
+ if (read) {
+ for (i = 0; i < nblks; i++) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (db->db_state == DB_UNCACHED)
+ err = EIO;
+ mutex_exit(&db->db_mtx);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
+ }
+ }
+
+ *numbufsp = nblks;
+ *dbpp = dbp;
+ return (0);
+}
+
+static int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+int
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+ int err;
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp);
+
+ return (err);
+}
+
+void
+dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
+{
+ int i;
+ dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
+
+ if (numbufs == 0)
+ return;
+
+ for (i = 0; i < numbufs; i++) {
+ if (dbp[i])
+ dbuf_rele(dbp[i], tag);
+ }
+
+ kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
+}
+
+void
+dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+{
+ dnode_t *dn;
+ uint64_t blkid;
+ int nblks, i, err;
+
+ if (zfs_prefetch_disable)
+ return;
+
+ if (len == 0) { /* they're interested in the bonus buffer */
+ dn = os->os->os_meta_dnode;
+
+ if (object == 0 || object >= DN_MAX_OBJECT)
+ return;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
+ dbuf_prefetch(dn, blkid);
+ rw_exit(&dn->dn_struct_rwlock);
+ return;
+ }
+
+ /*
+ * XXX - Note, if the dnode for the requested object is not
+ * already cached, we will do a *synchronous* read in the
+ * dnode_hold() call. The same is true for any indirects.
+ */
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err != 0)
+ return;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_datablkshift) {
+ int blkshift = dn->dn_datablkshift;
+ nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
+ P2ALIGN(offset, 1<<blkshift)) >> blkshift;
+ } else {
+ nblks = (offset < dn->dn_datablksz);
+ }
+
+ if (nblks != 0) {
+ blkid = dbuf_whichblock(dn, offset);
+ for (i = 0; i < nblks; i++)
+ dbuf_prefetch(dn, blkid+i);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+
+ dnode_rele(dn, FTAG);
+}
+
+int
+dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ ASSERT(offset < UINT64_MAX);
+ ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
+ dnode_free_range(dn, offset, size, tx);
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
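+/*
+ * Copy 'size' bytes starting at 'offset' out of the object into 'buf',
+ * holding and releasing the covering dbufs in chunks of at most
+ * DMU_MAX_ACCESS / 2 bytes so that large reads stay within the dbuf
+ * hold limit.
+ */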
+int
+dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ void *buf)
+{
+ dnode_t *dn;
+ dmu_buf_t **dbp;
+ int numbufs, i, err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ /*
+ * Deal with odd block sizes, where there can't be data past the first
+ * block. If we ever do the tail block optimization, we will need to
+ * handle that here as well.
+ */
+ if (dn->dn_datablkshift == 0) {
+ int newsz = offset > dn->dn_datablksz ? 0 :
+ MIN(size, dn->dn_datablksz - offset);
+ bzero((char *)buf + newsz, size - newsz);
+ size = newsz;
+ }
+
+ while (size > 0) {
+ uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
+ int err;
+
+ /*
+ * NB: we could do this block-at-a-time, but it's nice
+ * to be reading in parallel.
+ */
+ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
+ TRUE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ bcopy((char *)db->db_data + bufoff, buf, tocpy);
+
+ offset += tocpy;
+ size -= tocpy;
+ buf = (char *)buf + tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ }
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
+void
+dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+
+ if (size == 0)
+ return;
+
+ VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp));
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ bcopy(buf, (char *)db->db_data + bufoff, tocpy);
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ offset += tocpy;
+ size -= tocpy;
+ buf = (char *)buf + tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+#ifdef _KERNEL
+int
+dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i, err;
+
+ /*
+ * NB: we could do this block-at-a-time, but it's nice
+ * to be reading in parallel.
+ */
+ err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
+ &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = uio->uio_loffset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ err = uiomove((char *)db->db_data + bufoff, tocpy,
+ UIO_READ, uio);
+ if (err)
+ break;
+
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ return (err);
+}
+
+int
+dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+ int err = 0;
+
+ if (size == 0)
+ return (0);
+
+ err = dmu_buf_hold_array(os, object, uio->uio_loffset, size,
+ FALSE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = uio->uio_loffset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ /*
+ * XXX uiomove could block forever (eg. nfs-backed
+ * pages). There needs to be a uiolockdown() function
+ * to lock the pages in memory, so that uiomove won't
+ * block.
+ */
+ err = uiomove((char *)db->db_data + bufoff, tocpy,
+ UIO_WRITE, uio);
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ if (err)
+ break;
+
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (err);
+}
+
+#ifndef __FreeBSD__
+int
+dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ page_t *pp, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy, copied, thiscpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+ caddr_t va;
+
+ ASSERT(size > 0);
+ ASSERT3U(db->db_size, >=, PAGESIZE);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ for (copied = 0; copied < tocpy; copied += PAGESIZE) {
+ ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
+ thiscpy = MIN(PAGESIZE, tocpy - copied);
+ va = ppmapin(pp, PROT_READ, (caddr_t)-1);
+ bcopy(va, (char *)db->db_data + bufoff, thiscpy);
+ ppmapout(va);
+ pp = pp->p_next;
+ bufoff += PAGESIZE;
+ }
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ if (err)
+ break;
+
+ offset += tocpy;
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (err);
+}
+#endif /* !__FreeBSD__ */
+#endif /* _KERNEL */
+
+typedef struct {
+ dbuf_dirty_record_t *dr;
+ dmu_sync_cb_t *done;
+ void *arg;
+} dmu_sync_arg_t;
+
+/* ARGSUSED */
+static void
+dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+ dmu_sync_arg_t *in = varg;
+ dbuf_dirty_record_t *dr = in->dr;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dmu_sync_cb_t *done = in->done;
+
+ if (!BP_IS_HOLE(zio->io_bp)) {
+ zio->io_bp->blk_fill = 1;
+ BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type);
+ BP_SET_LEVEL(zio->io_bp, 0);
+ }
+
+ mutex_enter(&db->db_mtx);
+ ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
+ dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
+ dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+ cv_broadcast(&db->db_changed);
+ mutex_exit(&db->db_mtx);
+
+ if (done)
+ done(&(db->db), in->arg);
+
+ kmem_free(in, sizeof (dmu_sync_arg_t));
+}
+
+/*
+ * Intent log support: sync the block associated with db to disk.
+ * N.B. and XXX: the caller is responsible for making sure that the
+ * data isn't changing while dmu_sync() is writing it.
+ *
+ * Return values:
+ *
+ * EEXIST: this txg has already been synced, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * EALREADY: this block is already in the process of being synced.
+ * The caller should track its progress (somehow).
+ *
+ * EINPROGRESS: the IO has been initiated.
+ * The caller should log this blkptr in the callback.
+ *
+ * 0: completed. Sets *bp to the blkptr just written.
+ * The caller should log this blkptr immediately.
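+ *
+ * A caller-side sketch follows the function body below.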
+ */
+int
+dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
+ blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ objset_impl_t *os = db->db_objset;
+ dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
+ tx_state_t *tx = &dp->dp_tx;
+ dbuf_dirty_record_t *dr;
+ dmu_sync_arg_t *in;
+ zbookmark_t zb;
+ zio_t *zio;
+ int zio_flags;
+ int err;
+
+ ASSERT(BP_IS_HOLE(bp));
+ ASSERT(txg != 0);
+
+ dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
+ txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
+
+ /*
+ * XXX - would be nice if we could do this without suspending...
+ */
+ txg_suspend(dp);
+
+ /*
+ * If this txg already synced, there's nothing to do.
+ */
+ if (txg <= tx->tx_synced_txg) {
+ txg_resume(dp);
+ /*
+ * If we're running ziltest, we need the blkptr regardless.
+ */
+ if (txg > spa_freeze_txg(dp->dp_spa)) {
+ /* if db_blkptr == NULL, this was an empty write */
+ if (db->db_blkptr)
+ *bp = *db->db_blkptr; /* structure assignment */
+ return (0);
+ }
+ return (EEXIST);
+ }
+
+ mutex_enter(&db->db_mtx);
+
+ if (txg == tx->tx_syncing_txg) {
+ while (db->db_data_pending) {
+ /*
+ * IO is in-progress. Wait for it to finish.
+ * XXX - would be nice to be able to somehow "attach"
+ * this zio to the parent zio passed in.
+ */
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (!db->db_data_pending &&
+ db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) {
+ /*
+ * IO was compressed away
+ */
+ *bp = *db->db_blkptr; /* structure assignment */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (0);
+ }
+ ASSERT(db->db_data_pending ||
+ (db->db_blkptr && db->db_blkptr->blk_birth == txg));
+ }
+
+ if (db->db_blkptr && db->db_blkptr->blk_birth == txg) {
+ /*
+ * IO is already completed.
+ */
+ *bp = *db->db_blkptr; /* structure assignment */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (0);
+ }
+ }
+
+ dr = db->db_last_dirty;
+ while (dr && dr->dr_txg > txg)
+ dr = dr->dr_next;
+ if (dr == NULL || dr->dr_txg < txg) {
+ /*
+ * This dbuf isn't dirty, must have been free_range'd.
+ * There's no need to log writes to freed blocks, so we're done.
+ */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (ENOENT);
+ }
+
+ ASSERT(dr->dr_txg == txg);
+ if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+ /*
+ * We have already issued a sync write for this buffer.
+ */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (EALREADY);
+ } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ /*
+ * This buffer has already been synced. It could not
+ * have been dirtied since, or we would have cleared the state.
+ */
+ *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (0);
+ }
+
+ dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
+ in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ in->dr = dr;
+ in->done = done;
+ in->arg = arg;
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+
+ zb.zb_objset = os->os_dsl_dataset->ds_object;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+ zio_flags = ZIO_FLAG_MUSTSUCCEED;
+ if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0)
+ zio_flags |= ZIO_FLAG_METADATA;
+ zio = arc_write(pio, os->os_spa,
+ zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
+ zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
+ dmu_get_replication_level(os, &zb, db->db_dnode->dn_type),
+ txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in,
+ ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb);
+
+ if (pio) {
+ zio_nowait(zio);
+ err = EINPROGRESS;
+ } else {
+ err = zio_wait(zio);
+ ASSERT(err == 0);
+ }
+ return (err);
+}
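+
+/*
+ * Caller-side sketch of the contract above (illustrative only;
+ * "my_done_cb" and "my_arg" are hypothetical, and each case would
+ * normally break after handling):
+ *
+ *	error = dmu_sync(pio, db, &bp, txg, my_done_cb, my_arg);
+ *	switch (error) {
+ *	case 0:			// bp valid; log the blkptr immediately
+ *	case EINPROGRESS:	// log bp from my_done_cb on completion
+ *	case EEXIST:		// txg already synced; do not log
+ *	case ENOENT:		// block was freed; do not log
+ *	case EALREADY:		// a sync write is already in flight
+ *	}
+ */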
+
+int
+dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ err = dnode_set_blksz(dn, size, ibs, tx);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+void
+dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ /* XXX assumes dnode_hold will not get an i/o error */
+ (void) dnode_hold(os->os, object, FTAG, &dn);
+ ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+ dn->dn_checksum = checksum;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+}
+
+void
+dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ /* XXX assumes dnode_hold will not get an i/o error */
+ (void) dnode_hold(os->os, object, FTAG, &dn);
+ ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
+ dn->dn_compress = compress;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+}
+
+int
+dmu_get_replication_level(objset_impl_t *os,
+ zbookmark_t *zb, dmu_object_type_t ot)
+{
+ int ncopies = os->os_copies;
+
+ /* If it's the mos, it should have max copies set. */
+ ASSERT(zb->zb_objset != 0 ||
+ ncopies == spa_max_replication(os->os_spa));
+
+ if (dmu_ot[ot].ot_metadata || zb->zb_level != 0)
+ ncopies++;
+ return (MIN(ncopies, spa_max_replication(os->os_spa)));
+}
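+
+/*
+ * Worked instance of the routine above: with os_copies == 1, a
+ * metadata object type (or any zb_level != 0 block) bumps ncopies
+ * to 2; the result is always capped at spa_max_replication().
+ */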
+
+int
+dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
+{
+ dnode_t *dn;
+ int i, err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ /*
+ * Sync any current changes before
+ * we go trundling through the block pointers.
+ */
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (list_link_active(&dn->dn_dirty_link[i]))
+ break;
+ }
+ if (i != TXG_SIZE) {
+ dnode_rele(dn, FTAG);
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ }
+
+ err = dnode_next_offset(dn, hole, off, 1, 1, 0);
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+void
+dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ mutex_enter(&dn->dn_mtx);
+
+ doi->doi_data_block_size = dn->dn_datablksz;
+ doi->doi_metadata_block_size = dn->dn_indblkshift ?
+ 1ULL << dn->dn_indblkshift : 0;
+ doi->doi_indirection = dn->dn_nlevels;
+ doi->doi_checksum = dn->dn_checksum;
+ doi->doi_compress = dn->dn_compress;
+ doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
+ SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
+ doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
+ doi->doi_type = dn->dn_type;
+ doi->doi_bonus_size = dn->dn_bonuslen;
+ doi->doi_bonus_type = dn->dn_bonustype;
+
+ mutex_exit(&dn->dn_mtx);
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Get information on a DMU object.
+ * If doi is NULL, just indicates whether the object exists.
+ */
+int
+dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
+{
+ dnode_t *dn;
+ int err = dnode_hold(os->os, object, FTAG, &dn);
+
+ if (err)
+ return (err);
+
+ if (doi != NULL)
+ dmu_object_info_from_dnode(dn, doi);
+
+ dnode_rele(dn, FTAG);
+ return (0);
+}
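+
+/*
+ * Existence-check sketch using the NULL-doi form documented above
+ * ("object" is a hypothetical object number):
+ *
+ *	if (dmu_object_info(os, object, NULL) == 0) {
+ *		// the object exists
+ *	}
+ */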
+
+/*
+ * As above, but faster; can be used when you have a held dbuf in hand.
+ */
+void
+dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
+{
+ dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
+}
+
+/*
+ * Faster still when you only care about the size.
+ * This is specifically optimized for zfs_getattr().
+ */
+void
+dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+
+ *blksize = dn->dn_datablksz;
+ /* add 1 for dnode space */
+ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
+ SPA_MINBLOCKSHIFT) + 1;
+}
+
+void
+byteswap_uint64_array(void *vbuf, size_t size)
+{
+ uint64_t *buf = vbuf;
+ size_t count = size >> 3;
+ int i;
+
+ ASSERT((size & 7) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_64(buf[i]);
+}
+
+void
+byteswap_uint32_array(void *vbuf, size_t size)
+{
+ uint32_t *buf = vbuf;
+ size_t count = size >> 2;
+ int i;
+
+ ASSERT((size & 3) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_32(buf[i]);
+}
+
+void
+byteswap_uint16_array(void *vbuf, size_t size)
+{
+ uint16_t *buf = vbuf;
+ size_t count = size >> 1;
+ int i;
+
+ ASSERT((size & 1) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_16(buf[i]);
+}
+
+/* ARGSUSED */
+void
+byteswap_uint8_array(void *vbuf, size_t size)
+{
+}
+
+void
+dmu_init(void)
+{
+ dbuf_init();
+ dnode_init();
+ arc_init();
+}
+
+void
+dmu_fini(void)
+{
+ arc_fini();
+ dnode_fini();
+ dbuf_fini();
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
new file mode 100644
index 0000000..93168cc
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
@@ -0,0 +1,160 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+
+uint64_t
+dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ objset_impl_t *osi = os->os;
+ uint64_t object;
+ uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
+ (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
+ dnode_t *dn = NULL;
+ int restarted = B_FALSE;
+
+ mutex_enter(&osi->os_obj_lock);
+ for (;;) {
+ object = osi->os_obj_next;
+ /*
+ * Each time we polish off an L2 bp worth of dnodes
+ * (2^13 objects), move to another L2 bp that's still
+ * reasonably sparse (at most 1/4 full). Look from the
+ * beginning once, but after that keep looking from here.
+ * If we can't find one, just keep going from here.
+ */
+ if (P2PHASE(object, L2_dnode_count) == 0) {
+ uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
+ int error = dnode_next_offset(osi->os_meta_dnode,
+ B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ restarted = B_TRUE;
+ if (error == 0)
+ object = offset >> DNODE_SHIFT;
+ }
+ osi->os_obj_next = ++object;
+
+ /*
+ * XXX We should check for an i/o error here and return
+ * up to our caller. Actually we should pre-read it in
+ * dmu_tx_assign(), but there is currently no mechanism
+ * to do so.
+ */
+ (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE,
+ FTAG, &dn);
+ if (dn)
+ break;
+
+ if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
+ osi->os_obj_next = object - 1;
+ }
+
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+ dnode_rele(dn, FTAG);
+
+ mutex_exit(&osi->os_obj_lock);
+
+ dmu_tx_add_new_object(tx, os, object);
+ return (object);
+}
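+
+/*
+ * Allocation sketch (illustrative; mirrors the hold-assign-alloc
+ * pattern used by callers elsewhere in this import):
+ *
+ *	dmu_tx_t *tx = dmu_tx_create(os);
+ *	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ *	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
+ *		uint64_t obj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER,
+ *		    0, DMU_OT_NONE, 0, tx);
+ *		dmu_tx_commit(tx);
+ *	} else {
+ *		dmu_tx_abort(tx);
+ *	}
+ */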
+
+int
+dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
+ return (EBADF);
+
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
+ if (err)
+ return (err);
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+ dnode_rele(dn, FTAG);
+
+ dmu_tx_add_new_object(tx, os, object);
+ return (0);
+}
+
+int
+dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
+ return (EBADF);
+
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ FTAG, &dn);
+ if (err)
+ return (err);
+ dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+int
+dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
+
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ FTAG, &dn);
+ if (err)
+ return (err);
+
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+ dnode_free(dn, tx);
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+int
+dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
+{
+ uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
+ int error;
+
+ error = dnode_next_offset(os->os->os_meta_dnode,
+ hole, &offset, 0, DNODES_PER_BLOCK, txg);
+
+ *objectp = offset >> DNODE_SHIFT;
+
+ return (error);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
new file mode 100644
index 0000000..07f8c86
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
@@ -0,0 +1,1034 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dbuf.h>
+#include <sys/zvol.h>
+#include <sys/dmu_tx.h>
+#include <sys/zio_checksum.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/dmu_impl.h>
+
+
+spa_t *
+dmu_objset_spa(objset_t *os)
+{
+ return (os->os->os_spa);
+}
+
+zilog_t *
+dmu_objset_zil(objset_t *os)
+{
+ return (os->os->os_zil);
+}
+
+dsl_pool_t *
+dmu_objset_pool(objset_t *os)
+{
+ dsl_dataset_t *ds;
+
+ if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir)
+ return (ds->ds_dir->dd_pool);
+ else
+ return (spa_get_dsl(os->os->os_spa));
+}
+
+dsl_dataset_t *
+dmu_objset_ds(objset_t *os)
+{
+ return (os->os->os_dsl_dataset);
+}
+
+dmu_objset_type_t
+dmu_objset_type(objset_t *os)
+{
+ return (os->os->os_phys->os_type);
+}
+
+void
+dmu_objset_name(objset_t *os, char *buf)
+{
+ dsl_dataset_name(os->os->os_dsl_dataset, buf);
+}
+
+uint64_t
+dmu_objset_id(objset_t *os)
+{
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+ return (ds ? ds->ds_object : 0);
+}
+
+static void
+checksum_changed_cb(void *arg, uint64_t newval)
+{
+ objset_impl_t *osi = arg;
+
+ /*
+ * Inheritance should have been done by now.
+ */
+ ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+ osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
+}
+
+static void
+compression_changed_cb(void *arg, uint64_t newval)
+{
+ objset_impl_t *osi = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval != ZIO_COMPRESS_INHERIT);
+
+ osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
+}
+
+static void
+copies_changed_cb(void *arg, uint64_t newval)
+{
+ objset_impl_t *osi = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval > 0);
+ ASSERT(newval <= spa_max_replication(osi->os_spa));
+
+ osi->os_copies = newval;
+}
+
+void
+dmu_objset_byteswap(void *buf, size_t size)
+{
+ objset_phys_t *osp = buf;
+
+ ASSERT(size == sizeof (objset_phys_t));
+ dnode_byteswap(&osp->os_meta_dnode);
+ byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
+ osp->os_type = BSWAP_64(osp->os_type);
+}
+
+int
+dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ objset_impl_t **osip)
+{
+ objset_impl_t *winner, *osi;
+ int i, err, checksum;
+
+ osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP);
+ osi->os.os = osi;
+ osi->os_dsl_dataset = ds;
+ osi->os_spa = spa;
+ osi->os_rootbp = bp;
+ if (!BP_IS_HOLE(osi->os_rootbp)) {
+ uint32_t aflags = ARC_WAIT;
+ zbookmark_t zb;
+ zb.zb_objset = ds ? ds->ds_object : 0;
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = 0;
+
+ dprintf_bp(osi->os_rootbp, "reading %s", "");
+ err = arc_read(NULL, spa, osi->os_rootbp,
+ dmu_ot[DMU_OT_OBJSET].ot_byteswap,
+ arc_getbuf_func, &osi->os_phys_buf,
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
+ if (err) {
+ kmem_free(osi, sizeof (objset_impl_t));
+ return (err);
+ }
+ osi->os_phys = osi->os_phys_buf->b_data;
+ arc_release(osi->os_phys_buf, &osi->os_phys_buf);
+ } else {
+ osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t),
+ &osi->os_phys_buf, ARC_BUFC_METADATA);
+ osi->os_phys = osi->os_phys_buf->b_data;
+ bzero(osi->os_phys, sizeof (objset_phys_t));
+ }
+
+ /*
+ * Note: the changed_cb will be called once before the register
+ * func returns, thus changing the checksum/compression from the
+ * default (fletcher2/off). Snapshots don't need to know, and
+ * registering would complicate clone promotion.
+ */
+ if (ds && ds->ds_phys->ds_num_children == 0) {
+ err = dsl_prop_register(ds, "checksum",
+ checksum_changed_cb, osi);
+ if (err == 0)
+ err = dsl_prop_register(ds, "compression",
+ compression_changed_cb, osi);
+ if (err == 0)
+ err = dsl_prop_register(ds, "copies",
+ copies_changed_cb, osi);
+ if (err) {
+ VERIFY(arc_buf_remove_ref(osi->os_phys_buf,
+ &osi->os_phys_buf) == 1);
+ kmem_free(osi, sizeof (objset_impl_t));
+ return (err);
+ }
+ } else if (ds == NULL) {
+ /* It's the meta-objset. */
+ osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ osi->os_compress = ZIO_COMPRESS_LZJB;
+ osi->os_copies = spa_max_replication(spa);
+ }
+
+ osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
+
+ /*
+ * Metadata always gets compressed and checksummed.
+ * If the data checksum is multi-bit correctable, and it's not
+ * a ZBT-style checksum, then it's suitable for metadata as well.
+ * Otherwise, the metadata checksum defaults to fletcher4.
+ */
+ checksum = osi->os_checksum;
+
+ if (zio_checksum_table[checksum].ci_correctable &&
+ !zio_checksum_table[checksum].ci_zbt)
+ osi->os_md_checksum = checksum;
+ else
+ osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ osi->os_md_compress = ZIO_COMPRESS_LZJB;
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[i]));
+ list_create(&osi->os_free_dnodes[i], sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[i]));
+ }
+ list_create(&osi->os_dnodes, sizeof (dnode_t),
+ offsetof(dnode_t, dn_link));
+ list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+
+ mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ osi->os_meta_dnode = dnode_special_open(osi,
+ &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
+
+ if (ds != NULL) {
+ winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict);
+ if (winner) {
+ dmu_objset_evict(ds, osi);
+ osi = winner;
+ }
+ }
+
+ *osip = osi;
+ return (0);
+}
+
+/* called from zpl */
+int
+dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+ objset_t **osp)
+{
+ dsl_dataset_t *ds;
+ int err;
+ objset_t *os;
+ objset_impl_t *osi;
+
+ os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
+ err = dsl_dataset_open(name, mode, os, &ds);
+ if (err) {
+ kmem_free(os, sizeof (objset_t));
+ return (err);
+ }
+
+ osi = dsl_dataset_get_user_ptr(ds);
+ if (osi == NULL) {
+ err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
+ ds, &ds->ds_phys->ds_bp, &osi);
+ if (err) {
+ dsl_dataset_close(ds, mode, os);
+ kmem_free(os, sizeof (objset_t));
+ return (err);
+ }
+ }
+
+ os->os = osi;
+ os->os_mode = mode;
+
+ if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) {
+ dmu_objset_close(os);
+ return (EINVAL);
+ }
+ *osp = os;
+ return (0);
+}
+
+void
+dmu_objset_close(objset_t *os)
+{
+ dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os);
+ kmem_free(os, sizeof (objset_t));
+}
+
+int
+dmu_objset_evict_dbufs(objset_t *os, int try)
+{
+ objset_impl_t *osi = os->os;
+ dnode_t *dn;
+
+ mutex_enter(&osi->os_lock);
+
+ /* process the mdn last, since the other dnodes have holds on it */
+ list_remove(&osi->os_dnodes, osi->os_meta_dnode);
+ list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode);
+
+ /*
+ * Find the first dnode with holds. We have to do this dance
+ * because dnode_add_ref() only works if you already have a
+ * hold. If a dnode has no holds, it has no dbufs, so it is
+ * safe to skip.
+ */
+ for (dn = list_head(&osi->os_dnodes);
+ dn && refcount_is_zero(&dn->dn_holds);
+ dn = list_next(&osi->os_dnodes, dn))
+ continue;
+ if (dn)
+ dnode_add_ref(dn, FTAG);
+
+ while (dn) {
+ dnode_t *next_dn = dn;
+
+ do {
+ next_dn = list_next(&osi->os_dnodes, next_dn);
+ } while (next_dn && refcount_is_zero(&next_dn->dn_holds));
+ if (next_dn)
+ dnode_add_ref(next_dn, FTAG);
+
+ mutex_exit(&osi->os_lock);
+ if (dnode_evict_dbufs(dn, try)) {
+ dnode_rele(dn, FTAG);
+ if (next_dn)
+ dnode_rele(next_dn, FTAG);
+ return (1);
+ }
+ dnode_rele(dn, FTAG);
+ mutex_enter(&osi->os_lock);
+ dn = next_dn;
+ }
+ mutex_exit(&osi->os_lock);
+ return (0);
+}
+
+void
+dmu_objset_evict(dsl_dataset_t *ds, void *arg)
+{
+ objset_impl_t *osi = arg;
+ objset_t os;
+ int i;
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL);
+ ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
+ }
+
+ if (ds && ds->ds_phys->ds_num_children == 0) {
+ VERIFY(0 == dsl_prop_unregister(ds, "checksum",
+ checksum_changed_cb, osi));
+ VERIFY(0 == dsl_prop_unregister(ds, "compression",
+ compression_changed_cb, osi));
+ VERIFY(0 == dsl_prop_unregister(ds, "copies",
+ copies_changed_cb, osi));
+ }
+
+ /*
+ * We should need only a single pass over the dnode list, since
+ * nothing can be added to the list at this point.
+ */
+ os.os = osi;
+ (void) dmu_objset_evict_dbufs(&os, 0);
+
+ ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
+ ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
+ ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL);
+
+ dnode_special_close(osi->os_meta_dnode);
+ zil_free(osi->os_zil);
+
+ VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
+ mutex_destroy(&osi->os_lock);
+ mutex_destroy(&osi->os_obj_lock);
+ kmem_free(osi, sizeof (objset_impl_t));
+}
+
+/* called from dsl for meta-objset */
+objset_impl_t *
+dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_objset_type_t type, dmu_tx_t *tx)
+{
+ objset_impl_t *osi;
+ dnode_t *mdn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi));
+ mdn = osi->os_meta_dnode;
+
+ dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
+ DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
+
+ /*
+ * We don't want to have to increase the meta-dnode's nlevels
+ * later, because then we could do it in quiescing context while
+ * we are also accessing it in open context.
+ *
+ * This precaution is not necessary for the MOS (ds == NULL),
+ * because the MOS is only updated in syncing context.
+ * This is most fortunate: the MOS is the only objset that
+ * needs to be synced multiple times as spa_sync() iterates
+ * to convergence, so minimizing its dn_nlevels matters.
+ */
+ if (ds != NULL) {
+ int levels = 1;
+
+ /*
+ * Determine the number of levels necessary for the meta-dnode
+ * to contain DN_MAX_OBJECT dnodes.
+ */
+ while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
+ (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
+ DN_MAX_OBJECT * sizeof (dnode_phys_t))
+ levels++;
+
+ mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
+ mdn->dn_nlevels = levels;
+ }
+
+ ASSERT(type != DMU_OST_NONE);
+ ASSERT(type != DMU_OST_ANY);
+ ASSERT(type < DMU_OST_NUMTYPES);
+ osi->os_phys->os_type = type;
+
+ dsl_dataset_dirty(ds, tx);
+
+ return (osi);
+}
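+
+/*
+ * Worked instance of the nlevels loop above, under assumed constants
+ * (16K meta-dnode data blocks, dn_indblkshift 14, SPA_BLKPTRSHIFT 7,
+ * dn_nblkptr 3, 512-byte dnode_phys_t, DN_MAX_OBJECT 2^48): capacity
+ * at L levels is 3 << (14 + 7 * (L - 1)) bytes against a target of
+ * 2^48 * 2^9 = 2^57 bytes, so the loop settles at L = 7
+ * (3 * 2^56 >= 2^57, while 3 * 2^49 falls short).
+ */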
+
+struct oscarg {
+ void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
+ void *userarg;
+ dsl_dataset_t *clone_parent;
+ const char *lastname;
+ dmu_objset_type_t type;
+};
+
+/* ARGSUSED */
+static int
+dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct oscarg *oa = arg2;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ int err;
+ uint64_t ddobj;
+
+ err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
+ oa->lastname, sizeof (uint64_t), 1, &ddobj);
+ if (err != ENOENT)
+ return (err ? err : EEXIST);
+
+ if (oa->clone_parent != NULL) {
+ /*
+ * You can't clone across pools.
+ */
+ if (oa->clone_parent->ds_dir->dd_pool != dd->dd_pool)
+ return (EXDEV);
+
+ /*
+ * You can only clone snapshots, not the head datasets.
+ */
+ if (oa->clone_parent->ds_phys->ds_num_children == 0)
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static void
+dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct oscarg *oa = arg2;
+ dsl_dataset_t *ds;
+ blkptr_t *bp;
+ uint64_t dsobj;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dsobj = dsl_dataset_create_sync(dd, oa->lastname,
+ oa->clone_parent, tx);
+
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
+ DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds));
+ bp = dsl_dataset_get_blkptr(ds);
+ if (BP_IS_HOLE(bp)) {
+ objset_impl_t *osi;
+
+ /* This is an empty dmu_objset; not a clone. */
+ osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
+ ds, bp, oa->type, tx);
+
+ if (oa->userfunc)
+ oa->userfunc(&osi->os, oa->userarg, tx);
+ }
+ dsl_dataset_close(ds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
+}
+
+int
+dmu_objset_create(const char *name, dmu_objset_type_t type,
+ objset_t *clone_parent,
+ void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg)
+{
+ dsl_dir_t *pdd;
+ const char *tail;
+ int err = 0;
+ struct oscarg oa = { 0 };
+
+ ASSERT(strchr(name, '@') == NULL);
+ err = dsl_dir_open(name, FTAG, &pdd, &tail);
+ if (err)
+ return (err);
+ if (tail == NULL) {
+ dsl_dir_close(pdd, FTAG);
+ return (EEXIST);
+ }
+
+ dprintf("name=%s\n", name);
+
+ oa.userfunc = func;
+ oa.userarg = arg;
+ oa.lastname = tail;
+ oa.type = type;
+ if (clone_parent != NULL) {
+ /*
+ * You can't clone to a different type.
+ */
+ if (clone_parent->os->os_phys->os_type != type) {
+ dsl_dir_close(pdd, FTAG);
+ return (EINVAL);
+ }
+ oa.clone_parent = clone_parent->os->os_dsl_dataset;
+ }
+ err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
+ dmu_objset_create_sync, pdd, &oa, 5);
+ dsl_dir_close(pdd, FTAG);
+ return (err);
+}
+
+int
+dmu_objset_destroy(const char *name)
+{
+ objset_t *os;
+ int error;
+
+ /*
+ * If it looks like we'll be able to destroy it, and there's
+ * an unplayed replay log sitting around, destroy the log.
+ * It would be nicer to do this in dsl_dataset_destroy_sync(),
+ * but the replay log objset is modified in open context.
+ */
+ error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
+ if (error == 0) {
+ zil_destroy(dmu_objset_zil(os), B_FALSE);
+ dmu_objset_close(os);
+ }
+
+ return (dsl_dataset_destroy(name));
+}
+
+int
+dmu_objset_rollback(const char *name)
+{
+ int err;
+ objset_t *os;
+
+ err = dmu_objset_open(name, DMU_OST_ANY,
+ DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
+ if (err == 0) {
+ err = zil_suspend(dmu_objset_zil(os));
+ if (err == 0)
+ zil_resume(dmu_objset_zil(os));
+ if (err == 0) {
+ /* XXX uncache everything? */
+ err = dsl_dataset_rollback(os->os->os_dsl_dataset);
+ }
+ dmu_objset_close(os);
+ }
+ return (err);
+}
+
+struct snaparg {
+ dsl_sync_task_group_t *dstg;
+ char *snapname;
+ char failed[MAXPATHLEN];
+};
+
+static int
+dmu_objset_snapshot_one(char *name, void *arg)
+{
+ struct snaparg *sn = arg;
+ objset_t *os;
+ dmu_objset_stats_t stat;
+ int err;
+
+ (void) strcpy(sn->failed, name);
+
+ err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_STANDARD, &os);
+ if (err != 0)
+ return (err);
+
+ /*
+ * If the objset is in an inconsistent state, return busy.
+ */
+ dmu_objset_fast_stat(os, &stat);
+ if (stat.dds_inconsistent) {
+ dmu_objset_close(os);
+ return (EBUSY);
+ }
+
+ /*
+ * NB: we need to wait for all in-flight changes to get to disk,
+ * so that we snapshot those changes. zil_suspend does this as
+ * a side effect.
+ */
+ err = zil_suspend(dmu_objset_zil(os));
+ if (err == 0) {
+ dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check,
+ dsl_dataset_snapshot_sync, os, sn->snapname, 3);
+ } else {
+ dmu_objset_close(os);
+ }
+
+ return (err);
+}
+
+int
+dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
+{
+ dsl_sync_task_t *dst;
+ struct snaparg sn = { 0 };
+ char *cp;
+ spa_t *spa;
+ int err;
+
+ (void) strcpy(sn.failed, fsname);
+
+ cp = strchr(fsname, '/');
+ if (cp) {
+ *cp = '\0';
+ err = spa_open(fsname, &spa, FTAG);
+ *cp = '/';
+ } else {
+ err = spa_open(fsname, &spa, FTAG);
+ }
+ if (err)
+ return (err);
+
+ sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+ sn.snapname = snapname;
+
+ if (recursive) {
+ err = dmu_objset_find(fsname,
+ dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN);
+ } else {
+ err = dmu_objset_snapshot_one(fsname, &sn);
+ }
+
+ if (err)
+ goto out;
+
+ err = dsl_sync_task_group_wait(sn.dstg);
+
+ for (dst = list_head(&sn.dstg->dstg_tasks); dst;
+ dst = list_next(&sn.dstg->dstg_tasks, dst)) {
+ objset_t *os = dst->dst_arg1;
+ if (dst->dst_err)
+ dmu_objset_name(os, sn.failed);
+ zil_resume(dmu_objset_zil(os));
+ dmu_objset_close(os);
+ }
+out:
+ if (err)
+ (void) strcpy(fsname, sn.failed);
+ dsl_sync_task_group_destroy(sn.dstg);
+ spa_close(spa, FTAG);
+ return (err);
+}
+
+static void
+dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ while (dn = list_head(list)) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ ASSERT(dn->dn_dbuf->db_data_pending);
+ /*
+ * Initialize dn_zio outside dnode_sync()
+ * to accommodate the meta-dnode
+ */
+ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
+ ASSERT(dn->dn_zio);
+
+ ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
+ list_remove(list, dn);
+ dnode_sync(dn, tx);
+ }
+}
+
+/* ARGSUSED */
+static void
+ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ objset_impl_t *os = arg;
+ blkptr_t *bp = os->os_rootbp;
+ dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
+ int i;
+
+ /*
+ * Update rootbp fill count.
+ */
+ bp->blk_fill = 1; /* count the meta-dnode */
+ for (i = 0; i < dnp->dn_nblkptr; i++)
+ bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+}
+
+/* ARGSUSED */
+static void
+killer(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ objset_impl_t *os = arg;
+
+ ASSERT3U(zio->io_error, ==, 0);
+
+ BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET);
+ BP_SET_LEVEL(zio->io_bp, 0);
+
+ if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
+ BP_IDENTITY(&zio->io_bp_orig))) {
+ if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
+ dsl_dataset_block_kill(os->os_dsl_dataset,
+ &zio->io_bp_orig, NULL, os->os_synctx);
+ dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp,
+ os->os_synctx);
+ }
+ arc_release(os->os_phys_buf, &os->os_phys_buf);
+}
+
+/* called from dsl */
+void
+dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
+{
+ int txgoff;
+ zbookmark_t zb;
+ zio_t *zio;
+ list_t *list;
+ dbuf_dirty_record_t *dr;
+ int zio_flags;
+
+ dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* XXX the write_done callback should really give us the tx... */
+ os->os_synctx = tx;
+
+ if (os->os_dsl_dataset == NULL) {
+ /*
+ * This is the MOS. If we have upgraded,
+ * spa_max_replication() could change, so reset
+ * os_copies here.
+ */
+ os->os_copies = spa_max_replication(os->os_spa);
+ }
+
+ /*
+ * Create the root block IO
+ */
+ zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = 0;
+ zio_flags = ZIO_FLAG_MUSTSUCCEED;
+ if (dmu_ot[DMU_OT_OBJSET].ot_metadata || zb.zb_level != 0)
+ zio_flags |= ZIO_FLAG_METADATA;
+ if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg))
+ dsl_dataset_block_kill(os->os_dsl_dataset,
+ os->os_rootbp, pio, tx);
+ zio = arc_write(pio, os->os_spa, os->os_md_checksum,
+ os->os_md_compress,
+ dmu_get_replication_level(os, &zb, DMU_OT_OBJSET),
+ tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os,
+ ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
+
+ /*
+ * Sync meta-dnode - the parent IO for the sync is the root block
+ */
+ os->os_meta_dnode->dn_zio = zio;
+ dnode_sync(os->os_meta_dnode, tx);
+
+ txgoff = tx->tx_txg & TXG_MASK;
+
+ dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx);
+ dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx);
+
+ list = &os->os_meta_dnode->dn_dirty_records[txgoff];
+ while (dr = list_head(list)) {
+ ASSERT(dr->dr_dbuf->db_level == 0);
+ list_remove(list, dr);
+ if (dr->dr_zio)
+ zio_nowait(dr->dr_zio);
+ }
+ /*
+ * Free intent log blocks up to this tx.
+ */
+ zil_sync(os->os_zil, tx);
+ zio_nowait(zio);
+}
+
+void
+dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp)
+{
+ dsl_dataset_space(os->os->os_dsl_dataset, refdbytesp, availbytesp,
+ usedobjsp, availobjsp);
+}
+
+uint64_t
+dmu_objset_fsid_guid(objset_t *os)
+{
+ return (dsl_dataset_fsid_guid(os->os->os_dsl_dataset));
+}
+
+void
+dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
+{
+ stat->dds_type = os->os->os_phys->os_type;
+ if (os->os->os_dsl_dataset)
+ dsl_dataset_fast_stat(os->os->os_dsl_dataset, stat);
+}
+
+void
+dmu_objset_stats(objset_t *os, nvlist_t *nv)
+{
+ ASSERT(os->os->os_dsl_dataset ||
+ os->os->os_phys->os_type == DMU_OST_META);
+
+ if (os->os->os_dsl_dataset != NULL)
+ dsl_dataset_stats(os->os->os_dsl_dataset, nv);
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
+ os->os->os_phys->os_type);
+}
+
+int
+dmu_objset_is_snapshot(objset_t *os)
+{
+ if (os->os->os_dsl_dataset != NULL)
+ return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset));
+ else
+ return (B_FALSE);
+}
+
+int
+dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *idp, uint64_t *offp)
+{
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+
+ if (ds->ds_phys->ds_snapnames_zapobj == 0)
+ return (ENOENT);
+
+ zap_cursor_init_serialized(&cursor,
+ ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, *offp);
+
+ if (zap_cursor_retrieve(&cursor, &attr) != 0) {
+ zap_cursor_fini(&cursor);
+ return (ENOENT);
+ }
+
+ if (strlen(attr.za_name) + 1 > namelen) {
+ zap_cursor_fini(&cursor);
+ return (ENAMETOOLONG);
+ }
+
+ (void) strcpy(name, attr.za_name);
+ if (idp)
+ *idp = attr.za_first_integer;
+ zap_cursor_advance(&cursor);
+ *offp = zap_cursor_serialize(&cursor);
+ zap_cursor_fini(&cursor);
+
+ return (0);
+}
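+
+/*
+ * Iteration sketch for the routine above (hypothetical buffer; the
+ * serialized cursor offset makes the walk resumable across calls):
+ *
+ *	char name[MAXNAMELEN];
+ *	uint64_t id, off = 0;
+ *
+ *	while (dmu_snapshot_list_next(os, sizeof (name), name,
+ *	    &id, &off) == 0) {
+ *		// one snapshot name per iteration, in ZAP order
+ *	}
+ */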
+
+int
+dmu_dir_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *idp, uint64_t *offp)
+{
+ dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir;
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+
+ /* there is no next dir on a snapshot! */
+ if (os->os->os_dsl_dataset->ds_object !=
+ dd->dd_phys->dd_head_dataset_obj)
+ return (ENOENT);
+
+ zap_cursor_init_serialized(&cursor,
+ dd->dd_pool->dp_meta_objset,
+ dd->dd_phys->dd_child_dir_zapobj, *offp);
+
+ if (zap_cursor_retrieve(&cursor, &attr) != 0) {
+ zap_cursor_fini(&cursor);
+ return (ENOENT);
+ }
+
+ if (strlen(attr.za_name) + 1 > namelen) {
+ zap_cursor_fini(&cursor);
+ return (ENAMETOOLONG);
+ }
+
+ (void) strcpy(name, attr.za_name);
+ if (idp)
+ *idp = attr.za_first_integer;
+ zap_cursor_advance(&cursor);
+ *offp = zap_cursor_serialize(&cursor);
+ zap_cursor_fini(&cursor);
+
+ return (0);
+}
+
+/*
+ * Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ */
+int
+dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
+{
+ dsl_dir_t *dd;
+ objset_t *os;
+ uint64_t snapobj;
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+ char *child;
+ int do_self, err;
+
+ err = dsl_dir_open(name, FTAG, &dd, NULL);
+ if (err)
+ return (err);
+
+ /* NB: the $MOS dir doesn't have a head dataset */
+ do_self = (dd->dd_phys->dd_head_dataset_obj != 0);
+
+ /*
+ * Iterate over all children.
+ */
+ if (flags & DS_FIND_CHILDREN) {
+ for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset,
+ dd->dd_phys->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT(attr.za_integer_length == sizeof (uint64_t));
+ ASSERT(attr.za_num_integers == 1);
+
+ /*
+ * Build the child's full name: parent name, '/', entry name.
+ */
+ child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ /* XXX could probably just use name here */
+ dsl_dir_name(dd, child);
+ (void) strcat(child, "/");
+ (void) strcat(child, attr.za_name);
+ err = dmu_objset_find(child, func, arg, flags);
+ kmem_free(child, MAXPATHLEN);
+ if (err)
+ break;
+ }
+ zap_cursor_fini(&zc);
+
+ if (err) {
+ dsl_dir_close(dd, FTAG);
+ return (err);
+ }
+ }
+
+ /*
+ * Iterate over all snapshots.
+ */
+ if ((flags & DS_FIND_SNAPSHOTS) &&
+ dmu_objset_open(name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) {
+
+ snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj;
+ dmu_objset_close(os);
+
+ for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT(attr.za_integer_length == sizeof (uint64_t));
+ ASSERT(attr.za_num_integers == 1);
+
+ child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ /* XXX could probably just use name here */
+ dsl_dir_name(dd, child);
+ (void) strcat(child, "@");
+ (void) strcat(child, attr.za_name);
+ err = func(child, arg);
+ kmem_free(child, MAXPATHLEN);
+ if (err)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ dsl_dir_close(dd, FTAG);
+
+ if (err)
+ return (err);
+
+ /*
+ * Apply to self if appropriate.
+ */
+ if (do_self)
+ err = func(name, arg);
+ return (err);
+}
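+
+/*
+ * Callback sketch for dmu_objset_find() (hypothetical counter; the
+ * walk visits children and/or snapshots per 'flags', then the named
+ * dataset itself):
+ *
+ *	static int
+ *	count_cb(char *name, void *arg)
+ *	{
+ *		(*(uint64_t *)arg)++;
+ *		return (0);		// nonzero aborts the walk
+ *	}
+ *
+ *	uint64_t n = 0;
+ *	(void) dmu_objset_find(fsname, count_cb, &n,
+ *	    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+ */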
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
new file mode 100644
index 0000000..46facc3
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -0,0 +1,1009 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+
+struct backuparg {
+ dmu_replay_record_t *drr;
+ kthread_t *td;
+ struct file *fp;
+ objset_t *os;
+ zio_cksum_t zc;
+ int err;
+};
+
+static int
+dump_bytes(struct backuparg *ba, void *buf, int len)
+{
+ struct uio auio;
+ struct iovec aiov;
+
+ ASSERT3U(len % 8, ==, 0);
+
+ fletcher_4_incremental_native(buf, len, &ba->zc);
+
+ aiov.iov_base = buf;
+ aiov.iov_len = len;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = len;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_offset = (off_t)-1;
+ auio.uio_td = ba->td;
+#ifdef _KERNEL
+ if (ba->fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+ ba->err = fo_write(ba->fp, &auio, ba->td->td_ucred, 0, ba->td);
+#else
+ fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
+ ba->err = EOPNOTSUPP;
+#endif
+
+ return (ba->err);
+}
+
+static int
+dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
+ uint64_t length)
+{
+ /* write a FREE record */
+ bzero(ba->drr, sizeof (dmu_replay_record_t));
+ ba->drr->drr_type = DRR_FREE;
+ ba->drr->drr_u.drr_free.drr_object = object;
+ ba->drr->drr_u.drr_free.drr_offset = offset;
+ ba->drr->drr_u.drr_free.drr_length = length;
+
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
+ return (EINTR);
+ return (0);
+}
+
+static int
+dump_data(struct backuparg *ba, dmu_object_type_t type,
+ uint64_t object, uint64_t offset, int blksz, void *data)
+{
+ /* write a DATA record */
+ bzero(ba->drr, sizeof (dmu_replay_record_t));
+ ba->drr->drr_type = DRR_WRITE;
+ ba->drr->drr_u.drr_write.drr_object = object;
+ ba->drr->drr_u.drr_write.drr_type = type;
+ ba->drr->drr_u.drr_write.drr_offset = offset;
+ ba->drr->drr_u.drr_write.drr_length = blksz;
+
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
+ return (EINTR);
+ if (dump_bytes(ba, data, blksz))
+ return (EINTR);
+ return (0);
+}
+
+static int
+dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
+{
+ /* write a FREEOBJECTS record */
+ bzero(ba->drr, sizeof (dmu_replay_record_t));
+ ba->drr->drr_type = DRR_FREEOBJECTS;
+ ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
+ ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;
+
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
+ return (EINTR);
+ return (0);
+}
+
+static int
+dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
+{
+ if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
+ return (dump_freeobjects(ba, object, 1));
+
+ /* write an OBJECT record */
+ bzero(ba->drr, sizeof (dmu_replay_record_t));
+ ba->drr->drr_type = DRR_OBJECT;
+ ba->drr->drr_u.drr_object.drr_object = object;
+ ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
+ ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
+ ba->drr->drr_u.drr_object.drr_blksz =
+ dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
+ ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
+ ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;
+
+ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
+ return (EINTR);
+
+ if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
+ return (EINTR);
+
+ /* free anything past the end of the file */
+ if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
+ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
+ return (EINTR);
+ if (ba->err)
+ return (EINTR);
+ return (0);
+}
+
+#define BP_SPAN(dnp, level) \
+ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
+ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
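+
+/*
+ * Worked example for BP_SPAN under assumed geometry: with 128K data
+ * blocks (dn_datablkszsec == 256) and dn_indblkshift == 14, a level-1
+ * block pointer spans 256 << (9 + 1 * (14 - 7)) == 16M of file data,
+ * while level 0 degenerates to the 128K block size itself.
+ */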
+
+static int
+backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+ struct backuparg *ba = arg;
+ uint64_t object = bc->bc_bookmark.zb_object;
+ int level = bc->bc_bookmark.zb_level;
+ uint64_t blkid = bc->bc_bookmark.zb_blkid;
+ blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
+ dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
+ void *data = bc->bc_data;
+ int err = 0;
+
+ if (SIGPENDING(curthread))
+ return (EINTR);
+
+ ASSERT(data || bp == NULL);
+
+ if (bp == NULL && object == 0) {
+ uint64_t span = BP_SPAN(bc->bc_dnode, level);
+ uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
+ err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
+ } else if (bp == NULL) {
+ uint64_t span = BP_SPAN(bc->bc_dnode, level);
+ err = dump_free(ba, object, blkid * span, span);
+ } else if (data && level == 0 && type == DMU_OT_DNODE) {
+ dnode_phys_t *blk = data;
+ int i;
+ int blksz = BP_GET_LSIZE(bp);
+
+ for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
+ uint64_t dnobj =
+ (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
+ err = dump_dnode(ba, dnobj, blk+i);
+ if (err)
+ break;
+ }
+ } else if (level == 0 &&
+ type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
+ int blksz = BP_GET_LSIZE(bp);
+ if (data == NULL) {
+ uint32_t aflags = ARC_WAIT;
+ arc_buf_t *abuf;
+ zbookmark_t zb;
+
+ zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object;
+ zb.zb_object = object;
+ zb.zb_level = level;
+ zb.zb_blkid = blkid;
+ (void) arc_read(NULL, spa, bp,
+ dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
+ &aflags, &zb);
+
+ if (abuf) {
+ err = dump_data(ba, type, object, blkid * blksz,
+ blksz, abuf->b_data);
+ (void) arc_buf_remove_ref(abuf, &abuf);
+ }
+ } else {
+ err = dump_data(ba, type, object, blkid * blksz,
+ blksz, data);
+ }
+ }
+
+ ASSERT(err == 0 || err == EINTR);
+ return (err);
+}
+
+int
+dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp)
+{
+ dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
+ dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
+ dmu_replay_record_t *drr;
+ struct backuparg ba;
+ int err;
+
+ /* tosnap must be a snapshot */
+ if (ds->ds_phys->ds_next_snap_obj == 0)
+ return (EINVAL);
+
+ /* fromsnap must be an earlier snapshot from the same fs as tosnap */
+ if (fromds && (ds->ds_dir != fromds->ds_dir ||
+ fromds->ds_phys->ds_creation_txg >=
+ ds->ds_phys->ds_creation_txg))
+ return (EXDEV);
+
+ drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
+ drr->drr_type = DRR_BEGIN;
+ drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
+ drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION;
+ drr->drr_u.drr_begin.drr_creation_time =
+ ds->ds_phys->ds_creation_time;
+ drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
+ drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
+ if (fromds)
+ drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
+ dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
+
+ ba.drr = drr;
+ ba.td = curthread;
+ ba.fp = fp;
+ ba.os = tosnap;
+ ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
+
+ if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
+ kmem_free(drr, sizeof (dmu_replay_record_t));
+ return (ba.err);
+ }
+
+ err = traverse_dsl_dataset(ds,
+ fromds ? fromds->ds_phys->ds_creation_txg : 0,
+ ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
+ backup_cb, &ba);
+
+ if (err) {
+ if (err == EINTR && ba.err)
+ err = ba.err;
+ kmem_free(drr, sizeof (dmu_replay_record_t));
+ return (err);
+ }
+
+ bzero(drr, sizeof (dmu_replay_record_t));
+ drr->drr_type = DRR_END;
+ drr->drr_u.drr_end.drr_checksum = ba.zc;
+
+ if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
+ kmem_free(drr, sizeof (dmu_replay_record_t));
+ return (ba.err);
+ }
+
+ kmem_free(drr, sizeof (dmu_replay_record_t));
+
+ return (0);
+}
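+
+/*
+ * Resulting stream shape, as emitted above: one DRR_BEGIN record,
+ * then a traversal-ordered mix of DRR_OBJECT, DRR_FREEOBJECTS,
+ * DRR_WRITE and DRR_FREE records, and finally a DRR_END record
+ * carrying the running fletcher-4 checksum of everything before it.
+ */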
+
+struct restorearg {
+ int err;
+ int byteswap;
+ kthread_t *td;
+ struct file *fp;
+ char *buf;
+ uint64_t voff;
+ int buflen; /* number of valid bytes in buf */
+ int bufoff; /* next offset to read */
+ int bufsize; /* amount of memory allocated for buf */
+ zio_cksum_t zc;
+};
+
+/* ARGSUSED */
+static int
+replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ struct drr_begin *drrb = arg2;
+ const char *snapname;
+ int err;
+ uint64_t val;
+
+ /* must already be a snapshot of this fs */
+ if (ds->ds_phys->ds_prev_snap_obj == 0)
+ return (ENODEV);
+
+ /* most recent snapshot must match fromguid */
+ if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid)
+ return (ENODEV);
+ /* must not have any changes since most recent snapshot */
+ if (ds->ds_phys->ds_bp.blk_birth >
+ ds->ds_prev->ds_phys->ds_creation_txg)
+ return (ETXTBSY);
+
+ /* new snapshot name must not exist */
+ snapname = strrchr(drrb->drr_toname, '@');
+ if (snapname == NULL)
+ return (EEXIST);
+
+ snapname++;
+ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val);
+ if (err == 0)
+ return (EEXIST);
+ if (err != ENOENT)
+ return (err);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+replay_incremental_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+}
+
+/* ARGSUSED */
+static int
+replay_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct drr_begin *drrb = arg2;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ char *cp;
+ uint64_t val;
+ int err;
+
+ cp = strchr(drrb->drr_toname, '@');
+ *cp = '\0';
+ err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
+ strrchr(drrb->drr_toname, '/') + 1,
+ sizeof (uint64_t), 1, &val);
+ *cp = '@';
+
+ if (err != ENOENT)
+ return (err ? err : EEXIST);
+
+ return (0);
+}
+
+static void
+replay_full_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct drr_begin *drrb = arg2;
+ char *cp;
+ dsl_dataset_t *ds;
+ uint64_t dsobj;
+
+ cp = strchr(drrb->drr_toname, '@');
+ *cp = '\0';
+ dsobj = dsl_dataset_create_sync(dd, strrchr(drrb->drr_toname, '/') + 1,
+ NULL, tx);
+ *cp = '@';
+
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
+ DS_MODE_EXCLUSIVE, FTAG, &ds));
+
+ (void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
+ ds, &ds->ds_phys->ds_bp, drrb->drr_type, tx);
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+}
+
+static int
+replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ objset_t *os = arg1;
+ struct drr_begin *drrb = arg2;
+ char *snapname;
+
+ /* XXX verify that drr_toname is in dd */
+
+ snapname = strchr(drrb->drr_toname, '@');
+ if (snapname == NULL)
+ return (EINVAL);
+ snapname++;
+
+ return (dsl_dataset_snapshot_check(os, snapname, tx));
+}
+
+static void
+replay_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ objset_t *os = arg1;
+ struct drr_begin *drrb = arg2;
+ char *snapname;
+ dsl_dataset_t *ds, *hds;
+
+ snapname = strchr(drrb->drr_toname, '@') + 1;
+
+ dsl_dataset_snapshot_sync(os, snapname, tx);
+
+ /* set snapshot's creation time and guid */
+ hds = os->os->os_dsl_dataset;
+ VERIFY(0 == dsl_dataset_open_obj(hds->ds_dir->dd_pool,
+ hds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
+ FTAG, &ds));
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_creation_time = drrb->drr_creation_time;
+ ds->ds_phys->ds_guid = drrb->drr_toguid;
+ ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+
+ dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG);
+
+ dmu_buf_will_dirty(hds->ds_dbuf, tx);
+ hds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+}
+
+static int
+restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, int *resid)
+{
+ struct uio auio;
+ struct iovec aiov;
+ int error;
+
+ aiov.iov_base = buf;
+ aiov.iov_len = len;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = len;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_offset = off;
+ auio.uio_td = ra->td;
+#ifdef _KERNEL
+ error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td);
+#else
+ fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
+ error = EOPNOTSUPP;
+#endif
+ *resid = auio.uio_resid;
+ return (error);
+}
+
+static void *
+restore_read(struct restorearg *ra, int len)
+{
+ void *rv;
+
+ /* some records require 8-byte alignment, so require it of everything */
+ ASSERT3U(len % 8, ==, 0);
+
+ while (ra->buflen - ra->bufoff < len) {
+ int resid;
+ int leftover = ra->buflen - ra->bufoff;
+
+ (void) memmove(ra->buf, ra->buf + ra->bufoff, leftover);
+
+ ra->err = restore_bytes(ra, (caddr_t)ra->buf + leftover,
+ ra->bufsize - leftover, ra->voff, &resid);
+
+ ra->voff += ra->bufsize - leftover - resid;
+ ra->buflen = ra->bufsize - resid;
+ ra->bufoff = 0;
+ if (resid == ra->bufsize - leftover)
+ ra->err = EINVAL;
+ if (ra->err)
+ return (NULL);
+ /* Could compute checksum here? */
+ }
+
+ ASSERT3U(ra->bufoff % 8, ==, 0);
+ ASSERT3U(ra->buflen - ra->bufoff, >=, len);
+ rv = ra->buf + ra->bufoff;
+ ra->bufoff += len;
+ if (ra->byteswap)
+ fletcher_4_incremental_byteswap(rv, len, &ra->zc);
+ else
+ fletcher_4_incremental_native(rv, len, &ra->zc);
+ return (rv);
+}
+
+static void
+backup_byteswap(dmu_replay_record_t *drr)
+{
+#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
+#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
+ drr->drr_type = BSWAP_32(drr->drr_type);
+ switch (drr->drr_type) {
+ case DRR_BEGIN:
+ DO64(drr_begin.drr_magic);
+ DO64(drr_begin.drr_version);
+ DO64(drr_begin.drr_creation_time);
+ DO32(drr_begin.drr_type);
+ DO64(drr_begin.drr_toguid);
+ DO64(drr_begin.drr_fromguid);
+ break;
+ case DRR_OBJECT:
+ DO64(drr_object.drr_object);
+ /* DO64(drr_object.drr_allocation_txg); */
+ DO32(drr_object.drr_type);
+ DO32(drr_object.drr_bonustype);
+ DO32(drr_object.drr_blksz);
+ DO32(drr_object.drr_bonuslen);
+ break;
+ case DRR_FREEOBJECTS:
+ DO64(drr_freeobjects.drr_firstobj);
+ DO64(drr_freeobjects.drr_numobjs);
+ break;
+ case DRR_WRITE:
+ DO64(drr_write.drr_object);
+ DO32(drr_write.drr_type);
+ DO64(drr_write.drr_offset);
+ DO64(drr_write.drr_length);
+ break;
+ case DRR_FREE:
+ DO64(drr_free.drr_object);
+ DO64(drr_free.drr_offset);
+ DO64(drr_free.drr_length);
+ break;
+ case DRR_END:
+ DO64(drr_end.drr_checksum.zc_word[0]);
+ DO64(drr_end.drr_checksum.zc_word[1]);
+ DO64(drr_end.drr_checksum.zc_word[2]);
+ DO64(drr_end.drr_checksum.zc_word[3]);
+ break;
+ }
+#undef DO64
+#undef DO32
+}
+
+static int
+restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
+{
+ int err;
+ dmu_tx_t *tx;
+
+ err = dmu_object_info(os, drro->drr_object, NULL);
+
+ if (err != 0 && err != ENOENT)
+ return (EINVAL);
+
+ if (drro->drr_type == DMU_OT_NONE ||
+ drro->drr_type >= DMU_OT_NUMTYPES ||
+ drro->drr_bonustype >= DMU_OT_NUMTYPES ||
+ drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
+ drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
+ P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
+ drro->drr_blksz < SPA_MINBLOCKSIZE ||
+ drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+ drro->drr_bonuslen > DN_MAX_BONUSLEN) {
+ return (EINVAL);
+ }
+
+ tx = dmu_tx_create(os);
+
+ if (err == ENOENT) {
+ /* currently free, want to be allocated */
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ err = dmu_object_claim(os, drro->drr_object,
+ drro->drr_type, drro->drr_blksz,
+ drro->drr_bonustype, drro->drr_bonuslen, tx);
+ } else {
+ /* currently allocated, want to be allocated */
+ dmu_tx_hold_bonus(tx, drro->drr_object);
+ /*
+ * We may change blocksize, so need to
+ * hold_write
+ */
+ dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ err = dmu_object_reclaim(os, drro->drr_object,
+ drro->drr_type, drro->drr_blksz,
+ drro->drr_bonustype, drro->drr_bonuslen, tx);
+ }
+ if (err) {
+ dmu_tx_commit(tx);
+ return (EINVAL);
+ }
+
+ dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
+ dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
+
+ if (drro->drr_bonuslen) {
+ dmu_buf_t *db;
+ void *data;
+ VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
+ data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
+ if (data == NULL) {
+ dmu_tx_commit(tx);
+ return (ra->err);
+ }
+ bcopy(data, db->db_data, db->db_size);
+ if (ra->byteswap) {
+ dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
+ drro->drr_bonuslen);
+ }
+ dmu_buf_rele(db, FTAG);
+ }
+ dmu_tx_commit(tx);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+restore_freeobjects(struct restorearg *ra, objset_t *os,
+ struct drr_freeobjects *drrfo)
+{
+ uint64_t obj;
+
+ if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
+ return (EINVAL);
+
+ for (obj = drrfo->drr_firstobj;
+ obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
+ (void) dmu_object_next(os, &obj, FALSE, 0)) {
+ dmu_tx_t *tx;
+ int err;
+
+ if (dmu_object_info(os, obj, NULL) != 0)
+ continue;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, obj);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ err = dmu_object_free(os, obj, tx);
+ dmu_tx_commit(tx);
+ if (err && err != ENOENT)
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static int
+restore_write(struct restorearg *ra, objset_t *os,
+ struct drr_write *drrw)
+{
+ dmu_tx_t *tx;
+ void *data;
+ int err;
+
+ if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
+ drrw->drr_type >= DMU_OT_NUMTYPES)
+ return (EINVAL);
+
+ data = restore_read(ra, drrw->drr_length);
+ if (data == NULL)
+ return (ra->err);
+
+ if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
+ return (EINVAL);
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_write(tx, drrw->drr_object,
+ drrw->drr_offset, drrw->drr_length);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ if (ra->byteswap)
+ dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
+ dmu_write(os, drrw->drr_object,
+ drrw->drr_offset, drrw->drr_length, data, tx);
+ dmu_tx_commit(tx);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+restore_free(struct restorearg *ra, objset_t *os,
+ struct drr_free *drrf)
+{
+ dmu_tx_t *tx;
+ int err;
+
+ if (drrf->drr_length != -1ULL &&
+ drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
+ return (EINVAL);
+
+ if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
+ return (EINVAL);
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_free(tx, drrf->drr_object,
+ drrf->drr_offset, drrf->drr_length);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+ err = dmu_free_range(os, drrf->drr_object,
+ drrf->drr_offset, drrf->drr_length, tx);
+ dmu_tx_commit(tx);
+ return (err);
+}
+
+int
+dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
+ boolean_t force, struct file *fp, uint64_t voffset)
+{
+ kthread_t *td = curthread;
+ struct restorearg ra;
+ dmu_replay_record_t *drr;
+ char *cp;
+ objset_t *os = NULL;
+ zio_cksum_t pzc;
+
+ bzero(&ra, sizeof (ra));
+ ra.td = td;
+ ra.fp = fp;
+ ra.voff = voffset;
+ ra.bufsize = 1<<20;
+ ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
+
+ if (drrb->drr_magic == DMU_BACKUP_MAGIC) {
+ ra.byteswap = FALSE;
+ } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
+ ra.byteswap = TRUE;
+ } else {
+ ra.err = EINVAL;
+ goto out;
+ }
+
+ /*
+ * NB: this assumes that struct drr_begin will be the largest in
+ * dmu_replay_record_t's drr_u, and thus we don't need to pad it
+ * with zeros to make it the same length as we wrote out.
+ */
+ ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN;
+ ((dmu_replay_record_t *)ra.buf)->drr_pad = 0;
+ ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb;
+ if (ra.byteswap) {
+ fletcher_4_incremental_byteswap(ra.buf,
+ sizeof (dmu_replay_record_t), &ra.zc);
+ } else {
+ fletcher_4_incremental_native(ra.buf,
+ sizeof (dmu_replay_record_t), &ra.zc);
+ }
+ (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */
+
+ if (ra.byteswap) {
+ drrb->drr_magic = BSWAP_64(drrb->drr_magic);
+ drrb->drr_version = BSWAP_64(drrb->drr_version);
+ drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
+ drrb->drr_type = BSWAP_32(drrb->drr_type);
+ drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
+ drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
+ }
+
+ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+
+ if (drrb->drr_version != DMU_BACKUP_VERSION ||
+ drrb->drr_type >= DMU_OST_NUMTYPES ||
+ strchr(drrb->drr_toname, '@') == NULL) {
+ ra.err = EINVAL;
+ goto out;
+ }
+
+ /*
+ * Process the begin in syncing context.
+ */
+ if (drrb->drr_fromguid) {
+ /* incremental backup */
+ dsl_dataset_t *ds = NULL;
+
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ ra.err = dsl_dataset_open(tosnap, DS_MODE_EXCLUSIVE, FTAG, &ds);
+ *cp = '@';
+ if (ra.err)
+ goto out;
+
+ /*
+ * Only do the rollback if the most recent snapshot
+ * matches the incremental source
+ */
+ if (force) {
+ if (ds->ds_prev == NULL ||
+ ds->ds_prev->ds_phys->ds_guid !=
+ drrb->drr_fromguid) {
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ kmem_free(ra.buf, ra.bufsize);
+ return (ENODEV);
+ }
+ (void) dsl_dataset_rollback(ds);
+ }
+ ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ replay_incremental_check, replay_incremental_sync,
+ ds, drrb, 1);
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ } else {
+ /* full backup */
+ dsl_dir_t *dd = NULL;
+ const char *tail;
+
+ /* can't restore full backup into topmost fs, for now */
+ if (strrchr(drrb->drr_toname, '/') == NULL) {
+ ra.err = EINVAL;
+ goto out;
+ }
+
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail);
+ *cp = '@';
+ if (ra.err)
+ goto out;
+ if (tail == NULL) {
+ ra.err = EEXIST;
+ goto out;
+ }
+
+ ra.err = dsl_sync_task_do(dd->dd_pool, replay_full_check,
+ replay_full_sync, dd, drrb, 5);
+ dsl_dir_close(dd, FTAG);
+ }
+ if (ra.err)
+ goto out;
+
+ /*
+ * Open the objset we are modifying.
+ */
+
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ ra.err = dmu_objset_open(tosnap, DMU_OST_ANY,
+ DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os);
+ *cp = '@';
+ ASSERT3U(ra.err, ==, 0);
+
+ /*
+ * Read records and process them.
+ */
+ pzc = ra.zc;
+ while (ra.err == 0 &&
+ NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
+ if (SIGPENDING(td)) {
+ ra.err = EINTR;
+ goto out;
+ }
+
+ if (ra.byteswap)
+ backup_byteswap(drr);
+
+ switch (drr->drr_type) {
+ case DRR_OBJECT:
+ {
+ /*
+ * We need to make a copy of the record header,
+ * because restore_{object,write} may need to
+ * restore_read(), which will invalidate drr.
+ */
+ struct drr_object drro = drr->drr_u.drr_object;
+ ra.err = restore_object(&ra, os, &drro);
+ break;
+ }
+ case DRR_FREEOBJECTS:
+ {
+ struct drr_freeobjects drrfo =
+ drr->drr_u.drr_freeobjects;
+ ra.err = restore_freeobjects(&ra, os, &drrfo);
+ break;
+ }
+ case DRR_WRITE:
+ {
+ struct drr_write drrw = drr->drr_u.drr_write;
+ ra.err = restore_write(&ra, os, &drrw);
+ break;
+ }
+ case DRR_FREE:
+ {
+ struct drr_free drrf = drr->drr_u.drr_free;
+ ra.err = restore_free(&ra, os, &drrf);
+ break;
+ }
+ case DRR_END:
+ {
+ struct drr_end drre = drr->drr_u.drr_end;
+ /*
+ * We compare against the *previous* checksum
+ * value, because the stored checksum is of
+ * everything before the DRR_END record.
+ */
+ if (drre.drr_checksum.zc_word[0] != 0 &&
+ !ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pzc)) {
+ ra.err = ECKSUM;
+ goto out;
+ }
+
+ ra.err = dsl_sync_task_do(dmu_objset_ds(os)->
+ ds_dir->dd_pool, replay_end_check, replay_end_sync,
+ os, drrb, 3);
+ goto out;
+ }
+ default:
+ ra.err = EINVAL;
+ goto out;
+ }
+ pzc = ra.zc;
+ }
+
+out:
+ if (os)
+ dmu_objset_close(os);
+
+ /*
+ * Make sure we don't rollback/destroy unless we actually
+ * processed the begin properly. 'os' will only be set if this
+ * is the case.
+ */
+ if (ra.err && os && tosnap && strchr(tosnap, '@')) {
+ /*
+ * rollback or destroy what we created, so we don't
+ * leave it in the restoring state.
+ */
+ dsl_dataset_t *ds;
+ int err;
+
+ cp = strchr(tosnap, '@');
+ *cp = '\0';
+ err = dsl_dataset_open(tosnap,
+ DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT,
+ FTAG, &ds);
+ if (err == 0) {
+ txg_wait_synced(ds->ds_dir->dd_pool, 0);
+ if (drrb->drr_fromguid) {
+ /* incremental: rollback to most recent snap */
+ (void) dsl_dataset_rollback(ds);
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ } else {
+ /* full: destroy whole fs */
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ (void) dsl_dataset_destroy(tosnap);
+ }
+ }
+ *cp = '@';
+ }
+
+ kmem_free(ra.buf, ra.bufsize);
+ if (sizep)
+ *sizep = ra.voff;
+ return (ra.err);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
new file mode 100644
index 0000000..3d2bc3e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
@@ -0,0 +1,888 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_impl.h>
+
+#define BP_SPAN_SHIFT(level, width) ((level) * (width))
+
+#define BP_EQUAL(b1, b2) \
+ (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \
+ (b1)->blk_birth == (b2)->blk_birth)
+
+/*
+ * Compare two bookmarks.
+ *
+ * For ADVANCE_PRE, the visitation order is:
+ *
+ * objset 0, 1, 2, ..., ZB_MAXOBJSET.
+ * object 0, 1, 2, ..., ZB_MAXOBJECT.
+ * blkoff 0, 1, 2, ...
+ * level ZB_MAXLEVEL, ..., 2, 1, 0.
+ *
+ * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
+ * ordering vector is:
+ *
+ * < objset, object, blkoff, -level >
+ *
+ * For ADVANCE_POST, the starting offsets aren't sequential but ending
+ * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
+ * The visitation order is:
+ *
+ * objset 1, 2, ..., ZB_MAXOBJSET, 0.
+ * object 1, 2, ..., ZB_MAXOBJECT, 0.
+ * blkoff 1, 2, ...
+ * level 0, 1, 2, ..., ZB_MAXLEVEL.
+ *
+ * and thus a valid ordering vector is:
+ *
+ * < objset - 1, object - 1, blkoff, level >
+ *
+ * Both orderings can be expressed as:
+ *
+ * < objset + bias, object + bias, blkoff, level ^ bias >
+ *
+ * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
+ * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
+ *
+ * Special case: an objset's osphys is represented as level -1 of object 0.
+ * It is always either the very first or very last block we visit in an objset.
+ * Therefore, if either bookmark's level is -1, level alone determines order.
+ */
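+/*
+ * Worked example (ADVANCE_PRE, bias = 0): with wshift = 7, the
+ * bookmark <objset 1, object 5, level 1, blkid 2> has blkoff
+ * 2 << 7 = 256, and <objset 1, object 5, level 0, blkid 256> has
+ * blkoff 256 << 0 = 256.  The offsets tie, so the -level term breaks
+ * the tie and the level-1 (indirect) block sorts first, matching the
+ * top-down visitation order described above.
+ */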
+static int
+compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
+ int advance)
+{
+ int bias = (advance & ADVANCE_PRE) ? 0 : -1;
+ uint64_t sblkoff, eblkoff;
+ int slevel, elevel, wshift;
+
+ if (szb->zb_objset + bias < ezb->zb_objset + bias)
+ return (-1);
+
+ if (szb->zb_objset + bias > ezb->zb_objset + bias)
+ return (1);
+
+ slevel = szb->zb_level;
+ elevel = ezb->zb_level;
+
+ if ((slevel | elevel) < 0)
+ return ((slevel ^ bias) - (elevel ^ bias));
+
+ if (szb->zb_object + bias < ezb->zb_object + bias)
+ return (-1);
+
+ if (szb->zb_object + bias > ezb->zb_object + bias)
+ return (1);
+
+ if (dnp == NULL)
+ return (0);
+
+ wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
+ eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
+
+ if (sblkoff < eblkoff)
+ return (-1);
+
+ if (sblkoff > eblkoff)
+ return (1);
+
+ return ((elevel ^ bias) - (slevel ^ bias));
+}
+
+#define SET_BOOKMARK(zb, objset, object, level, blkid) \
+{ \
+ (zb)->zb_objset = objset; \
+ (zb)->zb_object = object; \
+ (zb)->zb_level = level; \
+ (zb)->zb_blkid = blkid; \
+}
+
+#define SET_BOOKMARK_LB(zb, level, blkid) \
+{ \
+ (zb)->zb_level = level; \
+ (zb)->zb_blkid = blkid; \
+}
+
+static int
+advance_objset(zseg_t *zseg, uint64_t objset, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+
+ if (advance & ADVANCE_PRE) {
+ if (objset >= ZB_MAXOBJSET)
+ return (ERANGE);
+ SET_BOOKMARK(zb, objset, 0, -1, 0);
+ } else {
+ if (objset >= ZB_MAXOBJSET)
+ objset = 0;
+ SET_BOOKMARK(zb, objset, 1, 0, 0);
+ }
+
+ if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
+static int
+advance_object(zseg_t *zseg, uint64_t object, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+
+ if (advance & ADVANCE_PRE) {
+ if (object >= ZB_MAXOBJECT) {
+ SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
+ } else {
+ SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
+ }
+ } else {
+ if (zb->zb_object == 0) {
+ SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
+ } else {
+ if (object >= ZB_MAXOBJECT)
+ object = 0;
+ SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
+ }
+ }
+
+ if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
+static int
+advance_from_osphys(zseg_t *zseg, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+
+ ASSERT(zb->zb_object == 0);
+ ASSERT(zb->zb_level == -1);
+ ASSERT(zb->zb_blkid == 0);
+
+ if (advance & ADVANCE_PRE) {
+ SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
+ } else {
+ if (zb->zb_objset == 0)
+ return (ERANGE);
+ SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
+ }
+
+ if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
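+/*
+ * Step the segment's start bookmark to the next block in visitation
+ * order.  For ADVANCE_PRE, a successful read (rc == 0) of an indirect
+ * block descends one level; otherwise we step to the next sibling,
+ * popping back up whenever the new blkid is aligned on a 2^wshift
+ * boundary (i.e. we've walked off the end of the parent).  Post-order
+ * does the mirror image.  Returns ERANGE once the bookmark passes
+ * seg_end or dn_maxblkid, EAGAIN otherwise.
+ */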
+static int
+advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+ int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int maxlevel = dnp->dn_nlevels - 1;
+ int level = zb->zb_level;
+ uint64_t blkid = zb->zb_blkid;
+
+ if (advance & ADVANCE_PRE) {
+ if (level > 0 && rc == 0) {
+ level--;
+ blkid <<= wshift;
+ } else {
+ blkid++;
+
+ if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
+ dnp->dn_maxblkid)
+ return (ERANGE);
+
+ while (level < maxlevel) {
+ if (P2PHASE(blkid, 1ULL << wshift))
+ break;
+ blkid >>= wshift;
+ level++;
+ }
+ }
+ } else {
+ if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
+ blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
+ level = 0;
+ } else {
+ blkid >>= wshift;
+ level++;
+ }
+
+ while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
+ dnp->dn_maxblkid) {
+ if (level == maxlevel)
+ return (ERANGE);
+ blkid >>= wshift;
+ level++;
+ }
+ }
+ SET_BOOKMARK_LB(zb, level, blkid);
+
+ if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
+ return (ERANGE);
+
+ return (EAGAIN);
+}
+
+static int
+traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
+{
+ /*
+ * Before we issue the callback, prune against maxtxg.
+ *
+ * We prune against mintxg before we get here because it's a big win.
+ * If a given block was born in txg 37, then we know that the entire
+ * subtree below that block must have been born in txg 37 or earlier.
+ * We can therefore lop off huge branches of the tree as we go.
+ *
+ * There's no corresponding optimization for maxtxg because knowing
+ * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
+ * children. In fact, the copy-on-write design of ZFS ensures that
+ * top-level blocks will pretty much always be new.
+ *
+ * Therefore, in the name of simplicity we don't prune against
+ * maxtxg until the last possible moment -- that being right now.
+ */
+ if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
+ return (0);
+
+ /*
+ * Debugging: verify that the order we visit things agrees with the
+ * order defined by compare_bookmark(). We don't check this for
+ * log blocks because there's no defined ordering for them; they're
+ * always visited (or not) as part of visiting the objset_phys_t.
+ */
+ if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
+ zbookmark_t *zb = &bc->bc_bookmark;
+ zbookmark_t *szb = &zseg->seg_start;
+ zbookmark_t *ezb = &zseg->seg_end;
+ zbookmark_t *lzb = &th->th_lastcb;
+ dnode_phys_t *dnp = bc->bc_dnode;
+
+ ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
+ ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
+ ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
+ lzb->zb_level == ZB_NO_LEVEL);
+ *lzb = *zb;
+ }
+
+ th->th_callbacks++;
+ return (th->th_func(bc, th->th_spa, th->th_arg));
+}
+
+static int
+traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
+ dnode_phys_t *dnp)
+{
+ zbookmark_t *zb = &bc->bc_bookmark;
+ int error;
+
+ th->th_hits++;
+
+ bc->bc_dnode = dnp;
+ bc->bc_errno = 0;
+
+ if (BP_EQUAL(&bc->bc_blkptr, bp))
+ return (0);
+
+ bc->bc_blkptr = *bp;
+
+ if (bc->bc_data == NULL)
+ return (0);
+
+ if (BP_IS_HOLE(bp)) {
+ ASSERT(th->th_advance & ADVANCE_HOLES);
+ return (0);
+ }
+
+ if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
+ error = EIO;
+ } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
+ error = 0;
+ th->th_arc_hits++;
+ } else {
+ error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
+ BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+ th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));
+
+ if (BP_SHOULD_BYTESWAP(bp) && error == 0)
+ (zb->zb_level > 0 ? byteswap_uint64_array :
+ dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
+ BP_GET_LSIZE(bp));
+ th->th_reads++;
+ }
+
+ if (error) {
+ bc->bc_errno = error;
+ error = traverse_callback(th, NULL, bc);
+ ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
+ bc->bc_blkptr.blk_birth = -1ULL;
+ }
+
+ dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
+ bc - &th->th_cache[0][0], error,
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
+
+ return (error);
+}
+
+static int
+find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+ traverse_blk_cache_t *bc;
+ blkptr_t *bp = dnp->dn_blkptr;
+ int i, first, level;
+ int nbp = dnp->dn_nblkptr;
+ int minlevel = zb->zb_level;
+ int maxlevel = dnp->dn_nlevels - 1;
+ int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
+ uint64_t blkid = zb->zb_blkid >> bp_shift;
+ int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
+ int rc;
+
+ if (minlevel > maxlevel || blkid >= nbp)
+ return (ERANGE);
+
+ for (level = maxlevel; level >= minlevel; level--) {
+ first = P2PHASE(blkid, 1ULL << wshift);
+
+ for (i = first; i < nbp; i++)
+ if (bp[i].blk_birth > zseg->seg_mintxg ||
+ (BP_IS_HOLE(&bp[i]) && do_holes))
+ break;
+
+ if (i != first) {
+ i--;
+ SET_BOOKMARK_LB(zb, level, blkid + (i - first));
+ return (ENOTBLK);
+ }
+
+ bc = &th->th_cache[depth][level];
+
+ SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
+ level, blkid);
+
+ if ((rc = traverse_read(th, bc, bp + i, dnp)) != 0) {
+ if (rc != EAGAIN) {
+ SET_BOOKMARK_LB(zb, level, blkid);
+ }
+ return (rc);
+ }
+
+ if (BP_IS_HOLE(&bp[i])) {
+ SET_BOOKMARK_LB(zb, level, blkid);
+ th->th_lastcb.zb_level = ZB_NO_LEVEL;
+ return (0);
+ }
+
+ nbp = 1 << wshift;
+ bp = bc->bc_data;
+ bp_shift -= wshift;
+ blkid = zb->zb_blkid >> bp_shift;
+ }
+
+ return (0);
+}
+
+static int
+get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
+ uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
+{
+ zseg_t zseg;
+ zbookmark_t *zb = &zseg.seg_start;
+ uint64_t object = *objectp;
+ int i, rc;
+
+ SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
+ SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
+
+ zseg.seg_mintxg = txg;
+ zseg.seg_maxtxg = -1ULL;
+
+ for (;;) {
+ rc = find_block(th, &zseg, mdn, depth);
+
+ if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
+ break;
+
+ if (rc == 0 && zb->zb_level == 0) {
+ dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
+ for (i = 0; i < DNODES_PER_BLOCK; i++) {
+ object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
+ if (object >= *objectp &&
+ dnp[i].dn_type != DMU_OT_NONE &&
+ (type == -1 || dnp[i].dn_type == type)) {
+ *objectp = object;
+ *dnpp = &dnp[i];
+ return (0);
+ }
+ }
+ }
+
+ rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
+
+ if (rc == ERANGE)
+ break;
+ }
+
+ if (rc == ERANGE)
+ *objectp = ZB_MAXOBJECT;
+
+ return (rc);
+}
+
+/* ARGSUSED */
+static void
+traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ traverse_handle_t *th = arg;
+ traverse_blk_cache_t *bc = &th->th_zil_cache;
+ zbookmark_t *zb = &bc->bc_bookmark;
+ zseg_t *zseg = list_head(&th->th_seglist);
+
+ if (bp->blk_birth <= zseg->seg_mintxg)
+ return;
+
+ if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
+ zb->zb_object = 0;
+ zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+ bc->bc_blkptr = *bp;
+ (void) traverse_callback(th, zseg, bc);
+ }
+}
+
+/* ARGSUSED */
+static void
+traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+ traverse_handle_t *th = arg;
+ traverse_blk_cache_t *bc = &th->th_zil_cache;
+ zbookmark_t *zb = &bc->bc_bookmark;
+ zseg_t *zseg = list_head(&th->th_seglist);
+
+ if (lrc->lrc_txtype == TX_WRITE) {
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ if (bp->blk_birth <= zseg->seg_mintxg)
+ return;
+
+ if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
+ zb->zb_object = lr->lr_foid;
+ zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
+ bc->bc_blkptr = *bp;
+ (void) traverse_callback(th, zseg, bc);
+ }
+ }
+}
+
+static void
+traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
+{
+ spa_t *spa = th->th_spa;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ objset_phys_t *osphys = bc->bc_data;
+ zil_header_t *zh = &osphys->os_zil_header;
+ uint64_t claim_txg = zh->zh_claim_txg;
+ zilog_t *zilog;
+
+ ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
+ ASSERT(bc->bc_bookmark.zb_level == -1);
+
+ /*
+ * We only want to visit blocks that have been claimed but not yet
+ * replayed (or, in read-only mode, blocks that *would* be claimed).
+ */
+ if (claim_txg == 0 && (spa_mode & FWRITE))
+ return;
+
+ th->th_zil_cache.bc_bookmark = bc->bc_bookmark;
+
+ zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+ (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
+ claim_txg);
+
+ zil_free(zilog);
+}
+
+static int
+traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
+{
+ zbookmark_t *zb = &zseg->seg_start;
+ traverse_blk_cache_t *bc;
+ dnode_phys_t *dn, *dn_tmp;
+ int worklimit = 100;
+ int rc;
+
+ dprintf("<%llu, %llu, %d, %llx>\n",
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
+
+ bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
+ dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
+
+ SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
+
+ rc = traverse_read(th, bc, mosbp, dn);
+
+ if (rc) /* If we get ERESTART, we've got nowhere left to go */
+ return (rc == ERESTART ? EINTR : rc);
+
+ ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
+
+ if (zb->zb_objset != 0) {
+ uint64_t objset = zb->zb_objset;
+ dsl_dataset_phys_t *dsp;
+
+ rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
+ DMU_OT_DSL_DATASET, ZB_MOS_CACHE);
+
+ if (objset != zb->zb_objset)
+ rc = advance_objset(zseg, objset, th->th_advance);
+
+ if (rc != 0)
+ return (rc);
+
+ dsp = DN_BONUS(dn_tmp);
+
+ bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
+ dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
+
+ SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
+
+ /*
+ * If we're traversing an open snapshot, we know that it
+ * can't be deleted (because it's open) and it can't change
+ * (because it's a snapshot). Therefore, once we've gotten
+ * from the uberblock down to the snapshot's objset_phys_t,
+ * we no longer need to synchronize with spa_sync(); we're
+ * traversing a completely static block tree from here on.
+ */
+ if (th->th_advance & ADVANCE_NOLOCK) {
+ ASSERT(th->th_locked);
+ rw_exit(spa_traverse_rwlock(th->th_spa));
+ th->th_locked = 0;
+ }
+
+ rc = traverse_read(th, bc, &dsp->ds_bp, dn);
+
+ if (rc != 0) {
+ if (rc == ERESTART)
+ rc = advance_objset(zseg, zb->zb_objset + 1,
+ th->th_advance);
+ return (rc);
+ }
+
+ if (th->th_advance & ADVANCE_PRUNE)
+ zseg->seg_mintxg =
+ MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
+ }
+
+ if (zb->zb_level == -1) {
+ ASSERT(zb->zb_object == 0);
+ ASSERT(zb->zb_blkid == 0);
+ ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);
+
+ if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
+ rc = traverse_callback(th, zseg, bc);
+ if (rc) {
+ ASSERT(rc == EINTR);
+ return (rc);
+ }
+ if ((th->th_advance & ADVANCE_ZIL) &&
+ zb->zb_objset != 0)
+ traverse_zil(th, bc);
+ }
+
+ return (advance_from_osphys(zseg, th->th_advance));
+ }
+
+ if (zb->zb_object != 0) {
+ uint64_t object = zb->zb_object;
+
+ rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
+ zseg->seg_mintxg, -1, ZB_MDN_CACHE);
+
+ if (object != zb->zb_object)
+ rc = advance_object(zseg, object, th->th_advance);
+
+ if (rc != 0)
+ return (rc);
+
+ dn = dn_tmp;
+ }
+
+ if (zb->zb_level == ZB_MAXLEVEL)
+ zb->zb_level = dn->dn_nlevels - 1;
+
+ for (;;) {
+ rc = find_block(th, zseg, dn, ZB_DN_CACHE);
+
+ if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
+ break;
+
+ if (rc == 0) {
+ bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
+ ASSERT(bc->bc_dnode == dn);
+ ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
+ rc = traverse_callback(th, zseg, bc);
+ if (rc) {
+ ASSERT(rc == EINTR);
+ return (rc);
+ }
+ if (BP_IS_HOLE(&bc->bc_blkptr)) {
+ ASSERT(th->th_advance & ADVANCE_HOLES);
+ rc = ENOTBLK;
+ }
+ }
+
+ rc = advance_block(zseg, dn, rc, th->th_advance);
+
+ if (rc == ERANGE)
+ break;
+
+ /*
+ * Give spa_sync() a chance to run.
+ */
+ if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
+ th->th_syncs++;
+ return (EAGAIN);
+ }
+
+ if (--worklimit == 0)
+ return (EAGAIN);
+ }
+
+ if (rc == ERANGE)
+ rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
+
+ return (rc);
+}
+
+/*
+ * It is the caller's responsibility to ensure that the dsl_dataset_t
+ * doesn't go away during traversal.
+ */
+int
+traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
+ blkptr_cb_t func, void *arg)
+{
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+ traverse_handle_t *th;
+ int err;
+
+ th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);
+
+ traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
+
+ while ((err = traverse_more(th)) == EAGAIN)
+ continue;
+
+ traverse_fini(th);
+ return (err);
+}
+
+int
+traverse_more(traverse_handle_t *th)
+{
+ zseg_t *zseg = list_head(&th->th_seglist);
+ uint64_t save_txg; /* XXX won't be necessary with real itinerary */
+ krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
+ blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
+ int rc;
+
+ if (zseg == NULL)
+ return (0);
+
+ th->th_restarts++;
+
+ save_txg = zseg->seg_mintxg;
+
+ rw_enter(rw, RW_READER);
+ th->th_locked = 1;
+
+ rc = traverse_segment(th, zseg, mosbp);
+ ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
+
+ if (th->th_locked)
+ rw_exit(rw);
+ th->th_locked = 0;
+
+ zseg->seg_mintxg = save_txg;
+
+ if (rc == ERANGE) {
+ list_remove(&th->th_seglist, zseg);
+ kmem_free(zseg, sizeof (*zseg));
+ return (EAGAIN);
+ }
+
+ return (rc);
+}
+
+/*
+ * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
+ * are not included. The blocks covered by this segment will all have
+ * mintxg < birth < maxtxg.
+ */
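+/*
+ * For example, adding a segment with mintxg = 5 and maxtxg = 10
+ * covers exactly the blocks born in txgs 6 through 9.
+ */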
+static void
+traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
+ uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
+{
+ zseg_t *zseg;
+
+ zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
+
+ zseg->seg_mintxg = mintxg;
+ zseg->seg_maxtxg = maxtxg;
+
+ zseg->seg_start.zb_objset = sobjset;
+ zseg->seg_start.zb_object = sobject;
+ zseg->seg_start.zb_level = slevel;
+ zseg->seg_start.zb_blkid = sblkid;
+
+ zseg->seg_end.zb_objset = eobjset;
+ zseg->seg_end.zb_object = eobject;
+ zseg->seg_end.zb_level = elevel;
+ zseg->seg_end.zb_blkid = eblkid;
+
+ list_insert_tail(&th->th_seglist, zseg);
+}
+
+void
+traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t objset, uint64_t object)
+{
+ if (th->th_advance & ADVANCE_PRE)
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, object, ZB_MAXLEVEL, 0,
+ objset, object, 0, ZB_MAXBLKID);
+ else
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, object, 0, 0,
+ objset, object, 0, ZB_MAXBLKID);
+}
+
+void
+traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t objset)
+{
+ if (th->th_advance & ADVANCE_PRE)
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, 0, -1, 0,
+ objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
+ else
+ traverse_add_segment(th, mintxg, maxtxg,
+ objset, 1, 0, 0,
+ objset, 0, -1, 0);
+}
+
+void
+traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
+{
+ if (th->th_advance & ADVANCE_PRE)
+ traverse_add_segment(th, mintxg, maxtxg,
+ 0, 0, -1, 0,
+ ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
+ else
+ traverse_add_segment(th, mintxg, maxtxg,
+ 1, 1, 0, 0,
+ 0, 0, -1, 0);
+}
+
+traverse_handle_t *
+traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
+ int zio_flags)
+{
+ traverse_handle_t *th;
+ int d, l;
+
+ th = kmem_zalloc(sizeof (*th), KM_SLEEP);
+
+ th->th_spa = spa;
+ th->th_func = func;
+ th->th_arg = arg;
+ th->th_advance = advance;
+ th->th_lastcb.zb_level = ZB_NO_LEVEL;
+ th->th_noread.zb_level = ZB_NO_LEVEL;
+ th->th_zio_flags = zio_flags;
+
+ list_create(&th->th_seglist, sizeof (zseg_t),
+ offsetof(zseg_t, seg_node));
+
+ for (d = 0; d < ZB_DEPTH; d++) {
+ for (l = 0; l < ZB_MAXLEVEL; l++) {
+ if ((advance & ADVANCE_DATA) ||
+ l != 0 || d != ZB_DN_CACHE)
+ th->th_cache[d][l].bc_data =
+ zio_buf_alloc(SPA_MAXBLOCKSIZE);
+ }
+ }
+
+ return (th);
+}
+
+void
+traverse_fini(traverse_handle_t *th)
+{
+ int d, l;
+ zseg_t *zseg;
+
+ for (d = 0; d < ZB_DEPTH; d++)
+ for (l = 0; l < ZB_MAXLEVEL; l++)
+ if (th->th_cache[d][l].bc_data != NULL)
+ zio_buf_free(th->th_cache[d][l].bc_data,
+ SPA_MAXBLOCKSIZE);
+
+ while ((zseg = list_head(&th->th_seglist)) != NULL) {
+ list_remove(&th->th_seglist, zseg);
+ kmem_free(zseg, sizeof (*zseg));
+ }
+
+ list_destroy(&th->th_seglist);
+
+ dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
+ th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
+ th->th_syncs, th->th_restarts);
+
+ kmem_free(th, sizeof (*th));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
new file mode 100644
index 0000000..13fd8d4
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -0,0 +1,992 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
+#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
+#include <sys/dsl_pool.h>
+#include <sys/zap_impl.h> /* for fzap_default_block_shift */
+#include <sys/spa.h>
+#include <sys/zfs_context.h>
+
+typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
+ uint64_t arg1, uint64_t arg2);
+
+
+dmu_tx_t *
+dmu_tx_create_dd(dsl_dir_t *dd)
+{
+ dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
+ tx->tx_dir = dd;
+ if (dd)
+ tx->tx_pool = dd->dd_pool;
+ list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
+ offsetof(dmu_tx_hold_t, txh_node));
+#ifdef ZFS_DEBUG
+ refcount_create(&tx->tx_space_written);
+ refcount_create(&tx->tx_space_freed);
+#endif
+ return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create(objset_t *os)
+{
+ dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir);
+ tx->tx_objset = os;
+ tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
+ return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
+{
+ dmu_tx_t *tx = dmu_tx_create_dd(NULL);
+
+ ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
+ tx->tx_pool = dp;
+ tx->tx_txg = txg;
+ tx->tx_anyobj = TRUE;
+
+ return (tx);
+}
+
+int
+dmu_tx_is_syncing(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj);
+}
+
+int
+dmu_tx_private_ok(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj);
+}
+
+static dmu_tx_hold_t *
+dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
+ enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
+{
+ dmu_tx_hold_t *txh;
+ dnode_t *dn = NULL;
+ int err;
+
+ if (object != DMU_NEW_OBJECT) {
+ err = dnode_hold(os->os, object, tx, &dn);
+ if (err) {
+ tx->tx_err = err;
+ return (NULL);
+ }
+
+ if (err == 0 && tx->tx_txg != 0) {
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
+ * problem, but there's no way for it to happen (for
+ * now, at least).
+ */
+ ASSERT(dn->dn_assigned_txg == 0);
+ dn->dn_assigned_txg = tx->tx_txg;
+ (void) refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ }
+
+ txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
+ txh->txh_tx = tx;
+ txh->txh_dnode = dn;
+#ifdef ZFS_DEBUG
+ txh->txh_type = type;
+ txh->txh_arg1 = arg1;
+ txh->txh_arg2 = arg2;
+#endif
+ list_insert_tail(&tx->tx_holds, txh);
+
+ return (txh);
+}
+
+void
+dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
+{
+ /*
+ * If we're syncing, they can manipulate any object anyhow, and
+ * the hold on the dnode_t can cause problems.
+ */
+ if (!dmu_tx_is_syncing(tx)) {
+ (void) dmu_tx_hold_object_impl(tx, os,
+ object, THT_NEWOBJECT, 0, 0);
+ }
+}
+
+static int
+dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
+{
+ int err;
+ dmu_buf_impl_t *db;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ db = dbuf_hold_level(dn, level, blkid, FTAG);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (db == NULL)
+ return (EIO);
+ err = dbuf_read(db, zio, DB_RF_CANFAIL);
+ dbuf_rele(db, FTAG);
+ return (err);
+}
+
+/* ARGSUSED */
+static void
+dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+ dnode_t *dn = txh->txh_dnode;
+ uint64_t start, end, i;
+ int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
+ int err = 0;
+
+ if (len == 0)
+ return;
+
+ min_bs = SPA_MINBLOCKSHIFT;
+ max_bs = SPA_MAXBLOCKSHIFT;
+ min_ibs = DN_MIN_INDBLKSHIFT;
+ max_ibs = DN_MAX_INDBLKSHIFT;
+
+ /*
+ * For i/o error checking, read the first and last level-0
+ * blocks (if they are not aligned), and all the level-1 blocks.
+ */
+
+ if (dn) {
+ if (dn->dn_maxblkid == 0) {
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err)
+ goto out;
+ } else {
+ zio_t *zio = zio_root(dn->dn_objset->os_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ /* first level-0 block */
+ start = off >> dn->dn_datablkshift;
+ if (P2PHASE(off, dn->dn_datablksz) ||
+ len < dn->dn_datablksz) {
+ err = dmu_tx_check_ioerr(zio, dn, 0, start);
+ if (err)
+ goto out;
+ }
+
+ /* last level-0 block */
+ end = (off+len-1) >> dn->dn_datablkshift;
+ if (end != start &&
+ P2PHASE(off+len, dn->dn_datablksz)) {
+ err = dmu_tx_check_ioerr(zio, dn, 0, end);
+ if (err)
+ goto out;
+ }
+
+ /* level-1 blocks */
+ if (dn->dn_nlevels > 1) {
+ start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ for (i = start+1; i < end; i++) {
+ err = dmu_tx_check_ioerr(zio, dn, 1, i);
+ if (err)
+ goto out;
+ }
+ }
+
+ err = zio_wait(zio);
+ if (err)
+ goto out;
+ }
+ }
+
+ /*
+ * If there's more than one block, the blocksize can't change,
+ * so we can make a more precise estimate. Alternatively,
+ * if the dnode's ibs is larger than max_ibs, always use that.
+ * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
+ * the code will still work correctly on existing pools.
+ */
+ if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
+ min_ibs = max_ibs = dn->dn_indblkshift;
+ if (dn->dn_datablkshift != 0)
+ min_bs = max_bs = dn->dn_datablkshift;
+ }
+
+ /*
+ * 'end' is the last thing we will access, not one past.
+ * This way we won't overflow when accessing the last byte.
+ */
+ start = P2ALIGN(off, 1ULL << max_bs);
+ end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
+ txh->txh_space_towrite += end - start + 1;
+
+ start >>= min_bs;
+ end >>= min_bs;
+
+ epbs = min_ibs - SPA_BLKPTRSHIFT;
+
+ /*
+ * The object contains at most 2^(64 - min_bs) blocks,
+ * and each indirect level maps 2^epbs.
+ */
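+ /*
+ * Worked example: with 128K data blocks (min_bs = max_bs = 17)
+ * and 16K indirects (min_ibs = max_ibs = 14, so epbs = 7), a
+ * write confined to the first 128K block (off = 0) keeps
+ * start == end == 0 at every level, charging one 16K indirect
+ * per iteration -- seven iterations, for bits = 47, 40, ..., 5 --
+ * on top of the 128K of level-0 data counted above.
+ */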
+ for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
+ start >>= epbs;
+ end >>= epbs;
+ /*
+ * If we increase the number of levels of indirection,
+ * we'll need new blkid=0 indirect blocks. If start == 0,
+ * we're already accounting for those blocks; and if end == 0,
+ * we can't increase the number of levels beyond that.
+ */
+ if (start != 0 && end != 0)
+ txh->txh_space_towrite += 1ULL << max_ibs;
+ txh->txh_space_towrite += (end - start + 1) << max_ibs;
+ }
+
+ ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS);
+
+out:
+ if (err)
+ txh->txh_tx->tx_err = err;
+}
+
+static void
+dmu_tx_count_dnode(dmu_tx_hold_t *txh)
+{
+ dnode_t *dn = txh->txh_dnode;
+ dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode;
+ uint64_t space = mdn->dn_datablksz +
+ ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
+
+ if (dn && dn->dn_dbuf->db_blkptr &&
+ dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+ dn->dn_dbuf->db_blkptr->blk_birth)) {
+ txh->txh_space_tooverwrite += space;
+ } else {
+ txh->txh_space_towrite += space;
+ }
+}
+
+void
+dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg == 0);
+ ASSERT(len < DMU_MAX_ACCESS);
+ ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_WRITE, off, len);
+ if (txh == NULL)
+ return;
+
+ dmu_tx_count_write(txh, off, len);
+ dmu_tx_count_dnode(txh);
+}
+
+static void
+dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+ uint64_t blkid, nblks;
+ uint64_t space = 0;
+ dnode_t *dn = txh->txh_dnode;
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
+ int dirty;
+
+ /*
+ * We don't need to use any locking to check for dirtiness
+ * because it's OK if we get stale data -- the dnode may become
+ * dirty immediately after our check anyway. This is just a
+ * means to avoid the expensive count when we aren't sure we
+ * need it. We need to be able to deal with a dirty dnode.
+ */
+ dirty = list_link_active(&dn->dn_dirty_link[0]) |
+ list_link_active(&dn->dn_dirty_link[1]) |
+ list_link_active(&dn->dn_dirty_link[2]) |
+ list_link_active(&dn->dn_dirty_link[3]);
+ if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0)
+ return;
+
+ /*
+ * the struct_rwlock protects us against dn_phys->dn_nlevels
+ * changing, in case (against all odds) we manage to dirty &
+ * sync out the changes after we check for being dirty.
+ * also, dbuf_hold_impl() wants us to have the struct_rwlock.
+ *
+ * It's fine to use dn_datablkshift rather than the dn_phys
+ * equivalent because if it is changing, maxblkid==0 and we will
+ * bail.
+ */
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_phys->dn_maxblkid == 0) {
+ if (off == 0 && len >= dn->dn_datablksz) {
+ blkid = 0;
+ nblks = 1;
+ } else {
+ rw_exit(&dn->dn_struct_rwlock);
+ return;
+ }
+ } else {
+ blkid = off >> dn->dn_datablkshift;
+ nblks = (off + len) >> dn->dn_datablkshift;
+
+ if (blkid >= dn->dn_phys->dn_maxblkid) {
+ rw_exit(&dn->dn_struct_rwlock);
+ return;
+ }
+ if (blkid + nblks > dn->dn_phys->dn_maxblkid)
+ nblks = dn->dn_phys->dn_maxblkid - blkid;
+
+ /* don't bother after 128K (131,072) blocks */
+ nblks = MIN(nblks, 128*1024);
+ }
+
+ if (dn->dn_phys->dn_nlevels == 1) {
+ int i;
+ for (i = 0; i < nblks; i++) {
+ blkptr_t *bp = dn->dn_phys->dn_blkptr;
+ ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
+ bp += blkid + i;
+ if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
+ dprintf_bp(bp, "can free old%s", "");
+ space += bp_get_dasize(spa, bp);
+ }
+ }
+ nblks = 0;
+ }
+
+ while (nblks) {
+ dmu_buf_impl_t *dbuf;
+ int err, epbs, blkoff, tochk;
+
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ blkoff = P2PHASE(blkid, 1<<epbs);
+ tochk = MIN((1<<epbs) - blkoff, nblks);
+
+ err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf);
+ if (err == 0) {
+ int i;
+ blkptr_t *bp;
+
+ err = dbuf_read(dbuf, NULL,
+ DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ dbuf_rele(dbuf, FTAG);
+ break;
+ }
+
+ bp = dbuf->db.db_data;
+ bp += blkoff;
+
+ for (i = 0; i < tochk; i++) {
+ if (dsl_dataset_block_freeable(ds,
+ bp[i].blk_birth)) {
+ dprintf_bp(&bp[i],
+ "can free old%s", "");
+ space += bp_get_dasize(spa, &bp[i]);
+ }
+ }
+ dbuf_rele(dbuf, FTAG);
+ }
+ if (err && err != ENOENT) {
+ txh->txh_tx->tx_err = err;
+ break;
+ }
+
+ blkid += tochk;
+ nblks -= tochk;
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+
+ txh->txh_space_tofree += space;
+}
+
+void
+dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
+{
+ dmu_tx_hold_t *txh;
+ dnode_t *dn;
+ uint64_t start, end, i;
+ int err, shift;
+ zio_t *zio;
+
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_FREE, off, len);
+ if (txh == NULL)
+ return;
+ dn = txh->txh_dnode;
+
+ /* first block */
+ if (off != 0)
+ dmu_tx_count_write(txh, off, 1);
+ /* last block */
+ if (len != DMU_OBJECT_END)
+ dmu_tx_count_write(txh, off+len, 1);
+
+ if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
+ return;
+ if (len == DMU_OBJECT_END)
+ len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
+
+ /*
+ * For i/o error checking, read the first and last level-0
+ * blocks, and all the level-1 blocks. The above count_write's
+ * will take care of the level-0 blocks.
+ */
+ if (dn->dn_nlevels > 1) {
+ shift = dn->dn_datablkshift + dn->dn_indblkshift -
+ SPA_BLKPTRSHIFT;
+ start = off >> shift;
+ end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
+
+ zio = zio_root(tx->tx_pool->dp_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+ for (i = start; i <= end; i++) {
+ uint64_t ibyte = i << shift;
+ err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0);
+ i = ibyte >> shift;
+ if (err == ESRCH)
+ break;
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+
+ err = dmu_tx_check_ioerr(zio, dn, 1, i);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+ err = zio_wait(zio);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+
+ dmu_tx_count_dnode(txh);
+ dmu_tx_count_free(txh, off, len);
+}
+
+void
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
+{
+ dmu_tx_hold_t *txh;
+ dnode_t *dn;
+ uint64_t nblocks;
+ int epbs, err;
+
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_ZAP, add, (uintptr_t)name);
+ if (txh == NULL)
+ return;
+ dn = txh->txh_dnode;
+
+ dmu_tx_count_dnode(txh);
+
+ if (dn == NULL) {
+ /*
+ * We will be able to fit a new object's entries into one leaf
+ * block. So there will be at most 2 blocks total,
+ * including the header block.
+ */
+ dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
+ return;
+ }
+
+ ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
+
+ if (dn->dn_maxblkid == 0 && !add) {
+ /*
+ * If there is only one block (i.e. this is a micro-zap)
+ * and we are not adding anything, the accounting is simple.
+ */
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+
+ /*
+ * Use max block size here, since we don't know how much
+ * the size will change between now and the dbuf dirty call.
+ */
+ if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+ dn->dn_phys->dn_blkptr[0].blk_birth))
+ txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+ else
+ txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ return;
+ }
+
+ if (dn->dn_maxblkid > 0 && name) {
+ /*
+ * access the name in this fat-zap so that we'll check
+ * for i/o errors to the leaf blocks, etc.
+ */
+ err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
+ 8, 0, NULL);
+ if (err == EIO) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+
+ /*
+ * 3 blocks overwritten: target leaf, ptrtbl block, header block
+ * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks
+ */
+ dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz,
+ (3 + (add ? 3 : 0)) << dn->dn_datablkshift);
+
+ /*
+ * If the modified blocks are scattered to the four winds,
+ * we'll have to modify an indirect twig for each.
+ */
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
+ txh->txh_space_towrite += 3 << dn->dn_indblkshift;
+}
+
+void
+dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_BONUS, 0, 0);
+ if (txh)
+ dmu_tx_count_dnode(txh);
+}
+
+void
+dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
+{
+ dmu_tx_hold_t *txh;
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ DMU_NEW_OBJECT, THT_SPACE, space, 0);
+
+ txh->txh_space_towrite += space;
+}
+
+int
+dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
+{
+ dmu_tx_hold_t *txh;
+ int holds = 0;
+
+ /*
+ * By asserting that the tx is assigned, we're counting the
+ * number of dn_tx_holds, which is the same as the number of
+ * dn_holds. Otherwise, we'd be counting dn_holds, but
+ * dn_tx_holds could be 0.
+ */
+ ASSERT(tx->tx_txg != 0);
+
+ /* if (tx->tx_anyobj == TRUE) */
+ /* return (0); */
+
+ for (txh = list_head(&tx->tx_holds); txh;
+ txh = list_next(&tx->tx_holds, txh)) {
+ if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
+ holds++;
+ }
+
+ return (holds);
+}
+
+#ifdef ZFS_DEBUG
+void
+dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
+{
+ dmu_tx_hold_t *txh;
+ int match_object = FALSE, match_offset = FALSE;
+ dnode_t *dn = db->db_dnode;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
+ ASSERT3U(dn->dn_object, ==, db->db.db_object);
+
+ if (tx->tx_anyobj)
+ return;
+
+ /* XXX No checking on the meta dnode for now */
+ if (db->db.db_object == DMU_META_DNODE_OBJECT)
+ return;
+
+ for (txh = list_head(&tx->tx_holds); txh;
+ txh = list_next(&tx->tx_holds, txh)) {
+ ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
+ if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
+ match_object = TRUE;
+ if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
+ int datablkshift = dn->dn_datablkshift ?
+ dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int shift = datablkshift + epbs * db->db_level;
+ uint64_t beginblk = shift >= 64 ? 0 :
+ (txh->txh_arg1 >> shift);
+ uint64_t endblk = shift >= 64 ? 0 :
+ ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
+ uint64_t blkid = db->db_blkid;
+
+ /* XXX txh_arg2 better not be zero... */
+
+ dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
+ txh->txh_type, beginblk, endblk);
+
+ switch (txh->txh_type) {
+ case THT_WRITE:
+ if (blkid >= beginblk && blkid <= endblk)
+ match_offset = TRUE;
+ /*
+ * We will let this hold work for the bonus
+ * buffer so that we don't need to hold it
+ * when creating a new object.
+ */
+ if (blkid == DB_BONUS_BLKID)
+ match_offset = TRUE;
+ /*
+ * They might have to increase nlevels,
+ * thus dirtying the new TLIBs. Or they
+ * might have to change the block size,
+ * thus dirtying the new lvl=0 blk=0.
+ */
+ if (blkid == 0)
+ match_offset = TRUE;
+ break;
+ case THT_FREE:
+ if (blkid == beginblk &&
+ (txh->txh_arg1 != 0 ||
+ dn->dn_maxblkid == 0))
+ match_offset = TRUE;
+ if (blkid == endblk &&
+ txh->txh_arg2 != DMU_OBJECT_END)
+ match_offset = TRUE;
+ break;
+ case THT_BONUS:
+ if (blkid == DB_BONUS_BLKID)
+ match_offset = TRUE;
+ break;
+ case THT_ZAP:
+ match_offset = TRUE;
+ break;
+ case THT_NEWOBJECT:
+ match_object = TRUE;
+ break;
+ default:
+ ASSERT(!"bad txh_type");
+ }
+ }
+ if (match_object && match_offset)
+ return;
+ }
+ panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
+ (u_longlong_t)db->db.db_object, db->db_level,
+ (u_longlong_t)db->db_blkid);
+}
+#endif
+
+static int
+dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+ dmu_tx_hold_t *txh;
+ uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite;
+
+ ASSERT3U(tx->tx_txg, ==, 0);
+ if (tx->tx_err)
+ return (tx->tx_err);
+
+ tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
+ tx->tx_needassign_txh = NULL;
+
+ /*
+ * NB: No error returns are allowed after txg_hold_open, but
+ * before processing the dnode holds, due to the
+ * dmu_tx_unassign() logic.
+ */
+
+ towrite = tofree = tooverwrite = 0;
+ for (txh = list_head(&tx->tx_holds); txh;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+ if (dn != NULL) {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_assigned_txg == tx->tx_txg - 1) {
+ mutex_exit(&dn->dn_mtx);
+ tx->tx_needassign_txh = txh;
+ return (ERESTART);
+ }
+ if (dn->dn_assigned_txg == 0)
+ dn->dn_assigned_txg = tx->tx_txg;
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+ (void) refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ towrite += txh->txh_space_towrite;
+ tofree += txh->txh_space_tofree;
+ tooverwrite += txh->txh_space_tooverwrite;
+ }
+
+ /*
+ * NB: This check must be after we've held the dnodes, so that
+ * the dmu_tx_unassign() logic will work properly
+ */
+ if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
+ return (ERESTART);
+
+ /*
+ * If a snapshot has been taken since we made our estimates,
+ * assume that we won't be able to free or overwrite anything.
+ */
+ if (tx->tx_objset &&
+ dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
+ tx->tx_lastsnap_txg) {
+ towrite += tooverwrite;
+ tooverwrite = tofree = 0;
+ }
+
+ /*
+ * Convert logical size to worst-case allocated size.
+ */
+ fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
+ lsize = towrite + tooverwrite;
+ asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
+
+#ifdef ZFS_DEBUG
+ tx->tx_space_towrite = asize;
+ tx->tx_space_tofree = tofree;
+ tx->tx_space_tooverwrite = tooverwrite;
+#endif
+
+ if (tx->tx_dir && asize != 0) {
+ int err = dsl_dir_tempreserve_space(tx->tx_dir,
+ lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx);
+ if (err)
+ return (err);
+ }
+
+ return (0);
+}
+
+static void
+dmu_tx_unassign(dmu_tx_t *tx)
+{
+ dmu_tx_hold_t *txh;
+
+ if (tx->tx_txg == 0)
+ return;
+
+ txg_rele_to_quiesce(&tx->tx_txgh);
+
+ for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+
+ if (dn == NULL)
+ continue;
+ mutex_enter(&dn->dn_mtx);
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+ if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+ dn->dn_assigned_txg = 0;
+ cv_broadcast(&dn->dn_notxholds);
+ }
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ txg_rele_to_sync(&tx->tx_txgh);
+
+ tx->tx_lasttried_txg = tx->tx_txg;
+ tx->tx_txg = 0;
+}
+
+/*
+ * Assign tx to a transaction group. txg_how can be one of:
+ *
+ * (1) TXG_WAIT. If the current open txg is full, waits until there's
+ * a new one. This should be used when you're not holding locks.
+ * It will only fail if we're truly out of space (or over quota).
+ *
+ * (2) TXG_NOWAIT. If we can't assign into the current open txg without
+ * blocking, returns immediately with ERESTART. This should be used
+ * whenever you're holding locks. On an ERESTART error, the caller
+ * should drop locks, do a dmu_tx_wait(tx), and try again.
+ *
+ * (3) A specific txg. Use this if you need to ensure that multiple
+ * transactions all sync in the same txg. Like TXG_NOWAIT, it
+ * returns ERESTART if it can't assign you into the requested txg.
+ */
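+/*
+ * Illustrative TXG_NOWAIT caller pattern (a sketch; 'top' and the
+ * lock handling are the caller's own):
+ *
+ *	top:
+ *		tx = dmu_tx_create(os);
+ *		dmu_tx_hold_write(tx, object, off, len);
+ *		err = dmu_tx_assign(tx, TXG_NOWAIT);
+ *		if (err == ERESTART) {
+ *			(drop locks)
+ *			dmu_tx_wait(tx);
+ *			dmu_tx_abort(tx);
+ *			goto top;
+ *		}
+ */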
+int
+dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+ int err;
+
+ ASSERT(tx->tx_txg == 0);
+ ASSERT(txg_how != 0);
+ ASSERT(!dsl_pool_sync_context(tx->tx_pool));
+
+ while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
+ dmu_tx_unassign(tx);
+
+ if (err != ERESTART || txg_how != TXG_WAIT)
+ return (err);
+
+ dmu_tx_wait(tx);
+ }
+
+ txg_rele_to_quiesce(&tx->tx_txgh);
+
+ return (0);
+}
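+
+/*
+ * A minimal caller sketch of the TXG_NOWAIT discipline described above.
+ * The objset "os", object number "object", and range "off"/"len" are
+ * placeholders; a real caller would also drop its own locks before
+ * calling dmu_tx_wait():
+ *
+ *	top:
+ *		tx = dmu_tx_create(os);
+ *		dmu_tx_hold_write(tx, object, off, len);
+ *		err = dmu_tx_assign(tx, TXG_NOWAIT);
+ *		if (err == ERESTART) {
+ *			dmu_tx_wait(tx);
+ *			dmu_tx_abort(tx);
+ *			goto top;
+ *		} else if (err) {
+ *			dmu_tx_abort(tx);
+ *			return (err);
+ *		}
+ *		... modify the held object ...
+ *		dmu_tx_commit(tx);
+ */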
+
+void
+dmu_tx_wait(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg == 0);
+ ASSERT(tx->tx_lasttried_txg != 0);
+
+ if (tx->tx_needassign_txh) {
+ dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
+
+ mutex_enter(&dn->dn_mtx);
+ while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
+ cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
+ mutex_exit(&dn->dn_mtx);
+ tx->tx_needassign_txh = NULL;
+ } else {
+ txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
+ }
+}
+
+void
+dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
+{
+#ifdef ZFS_DEBUG
+ if (tx->tx_dir == NULL || delta == 0)
+ return;
+
+ if (delta > 0) {
+ ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
+ tx->tx_space_towrite);
+ (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
+ } else {
+ (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
+ }
+#endif
+}
+
+void
+dmu_tx_commit(dmu_tx_t *tx)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg != 0);
+
+ while (txh = list_head(&tx->tx_holds)) {
+ dnode_t *dn = txh->txh_dnode;
+
+ list_remove(&tx->tx_holds, txh);
+ kmem_free(txh, sizeof (dmu_tx_hold_t));
+ if (dn == NULL)
+ continue;
+ mutex_enter(&dn->dn_mtx);
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+ if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+ dn->dn_assigned_txg = 0;
+ cv_broadcast(&dn->dn_notxholds);
+ }
+ mutex_exit(&dn->dn_mtx);
+ dnode_rele(dn, tx);
+ }
+
+ if (tx->tx_tempreserve_cookie)
+ dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
+
+ if (tx->tx_anyobj == FALSE)
+ txg_rele_to_sync(&tx->tx_txgh);
+#ifdef ZFS_DEBUG
+ dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
+ tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
+ tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
+ refcount_destroy_many(&tx->tx_space_written,
+ refcount_count(&tx->tx_space_written));
+ refcount_destroy_many(&tx->tx_space_freed,
+ refcount_count(&tx->tx_space_freed));
+#endif
+ kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+void
+dmu_tx_abort(dmu_tx_t *tx)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg == 0);
+
+ while (txh = list_head(&tx->tx_holds)) {
+ dnode_t *dn = txh->txh_dnode;
+
+ list_remove(&tx->tx_holds, txh);
+ kmem_free(txh, sizeof (dmu_tx_hold_t));
+ if (dn != NULL)
+ dnode_rele(dn, tx);
+ }
+#ifdef ZFS_DEBUG
+ refcount_destroy_many(&tx->tx_space_written,
+ refcount_count(&tx->tx_space_written));
+ refcount_destroy_many(&tx->tx_space_freed,
+ refcount_count(&tx->tx_space_freed));
+#endif
+ kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+uint64_t
+dmu_tx_get_txg(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg != 0);
+ return (tx->tx_txg);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
new file mode 100644
index 0000000..78d625c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
@@ -0,0 +1,655 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/dmu.h>
+#include <sys/dbuf.h>
+
+/*
+ * I'm against tune-ables, but these should probably exist as tweakable globals
+ * until we can get this working the way we want it to.
+ */
+
+int zfs_prefetch_disable = 0;
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN,
+ &zfs_prefetch_disable, 0, "Disable prefetch");
+
+/* max # of streams per zfetch */
+uint32_t zfetch_max_streams = 8;
+/* min time before stream reclaim */
+uint32_t zfetch_min_sec_reap = 2;
+/* max number of blocks to fetch at a time */
+uint32_t zfetch_block_cap = 256;
+/* number of bytes in an array_read at which we stop prefetching (1MB) */
+uint64_t zfetch_array_rd_sz = 1024 * 1024;
+
+/* forward decls for static routines */
+static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
+static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
+static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
+static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
+static int dmu_zfetch_find(zfetch_t *, zstream_t *, int);
+static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
+static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
+static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
+static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
+
+/*
+ * Given a zfetch structure and a zstream structure, determine whether the
+ * blocks to be read are part of a co-linear pair of existing prefetch
+ * streams. If a set is found, coalesce the streams, removing one, and
+ * configure the prefetch so it looks for a strided access pattern.
+ *
+ * In other words: if we find two sequential access streams that are
+ * the same length and distance N apart, and this read is N from the
+ * last stream, then we are probably in a strided access pattern. So
+ * combine the two sequential streams into a single strided stream.
+ *
+ * If no co-linear streams are found, return NULL.
+ */
+static int
+dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
+{
+ zstream_t *z_walk;
+ zstream_t *z_comp;
+
+ if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
+ return (0);
+
+ if (zh == NULL) {
+ rw_exit(&zf->zf_rwlock);
+ return (0);
+ }
+
+ for (z_walk = list_head(&zf->zf_stream); z_walk;
+ z_walk = list_next(&zf->zf_stream, z_walk)) {
+ for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
+ z_comp = list_next(&zf->zf_stream, z_comp)) {
+ int64_t diff;
+
+ if (z_walk->zst_len != z_walk->zst_stride ||
+ z_comp->zst_len != z_comp->zst_stride) {
+ continue;
+ }
+
+ diff = z_comp->zst_offset - z_walk->zst_offset;
+ if (z_comp->zst_offset + diff == zh->zst_offset) {
+ z_walk->zst_offset = zh->zst_offset;
+ z_walk->zst_direction = diff < 0 ? -1 : 1;
+ z_walk->zst_stride =
+ diff * z_walk->zst_direction;
+ z_walk->zst_ph_offset =
+ zh->zst_offset + z_walk->zst_stride;
+ dmu_zfetch_stream_remove(zf, z_comp);
+ mutex_destroy(&z_comp->zst_lock);
+ kmem_free(z_comp, sizeof (zstream_t));
+
+ dmu_zfetch_dofetch(zf, z_walk);
+
+ rw_exit(&zf->zf_rwlock);
+ return (1);
+ }
+
+ diff = z_walk->zst_offset - z_comp->zst_offset;
+ if (z_walk->zst_offset + diff == zh->zst_offset) {
+ z_walk->zst_offset = zh->zst_offset;
+ z_walk->zst_direction = diff < 0 ? -1 : 1;
+ z_walk->zst_stride =
+ diff * z_walk->zst_direction;
+ z_walk->zst_ph_offset =
+ zh->zst_offset + z_walk->zst_stride;
+ dmu_zfetch_stream_remove(zf, z_comp);
+ mutex_destroy(&z_comp->zst_lock);
+ kmem_free(z_comp, sizeof (zstream_t));
+
+ dmu_zfetch_dofetch(zf, z_walk);
+
+ rw_exit(&zf->zf_rwlock);
+ return (1);
+ }
+ }
+ }
+
+ rw_exit(&zf->zf_rwlock);
+ return (0);
+}
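+
+/*
+ * Worked example of the coalescing above, with made-up block numbers:
+ * given two sequential streams (zst_len == zst_stride) S1 at offset 0
+ * and S2 at offset 100, each of length 10, diff is 100.  A new read at
+ * block 200 (S2's offset plus diff) merges them into one strided
+ * stream with zst_offset = 200, zst_direction = 1, zst_stride = 100,
+ * and zst_ph_offset = 300, so prefetch continues 100 blocks ahead.
+ */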
+
+/*
+ * Given a zstream_t, determine the bounds of the prefetch. Then call the
+ * routine that actually prefetches the individual blocks.
+ */
+static void
+dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
+{
+ uint64_t prefetch_tail;
+ uint64_t prefetch_limit;
+ uint64_t prefetch_ofst;
+ uint64_t prefetch_len;
+ uint64_t blocks_fetched;
+
+ zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
+ zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
+
+ prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
+ (int64_t)(zs->zst_offset + zs->zst_stride));
+ /*
+ * XXX: use a faster division method?
+ */
+ prefetch_limit = zs->zst_offset + zs->zst_len +
+ (zs->zst_cap * zs->zst_stride) / zs->zst_len;
+
+ while (prefetch_tail < prefetch_limit) {
+ prefetch_ofst = zs->zst_offset + zs->zst_direction *
+ (prefetch_tail - zs->zst_offset);
+
+ prefetch_len = zs->zst_len;
+
+ /*
+ * Don't prefetch before the start of the file when working
+ * backwards (the unsigned offset would have wrapped past zero).
+ */
+ if ((zs->zst_direction == ZFETCH_BACKWARD) &&
+ (prefetch_ofst > prefetch_tail)) {
+ prefetch_len += prefetch_ofst;
+ prefetch_ofst = 0;
+ }
+
+ /* don't prefetch more than we're supposed to */
+ if (prefetch_len > zs->zst_len)
+ break;
+
+ blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
+ prefetch_ofst, zs->zst_len);
+
+ prefetch_tail += zs->zst_stride;
+ /* stop if we've run out of stuff to prefetch */
+ if (blocks_fetched < zs->zst_len)
+ break;
+ }
+ zs->zst_ph_offset = prefetch_tail;
+ zs->zst_last = lbolt;
+}
+
+/*
+ * This takes a pointer to a zfetch structure and a dnode. It performs the
+ * necessary setup for the zfetch structure, grokking data from the
+ * associated dnode.
+ */
+void
+dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
+{
+ if (zf == NULL) {
+ return;
+ }
+
+ zf->zf_dnode = dno;
+ zf->zf_stream_cnt = 0;
+ zf->zf_alloc_fail = 0;
+
+ list_create(&zf->zf_stream, sizeof (zstream_t),
+ offsetof(zstream_t, zst_node));
+
+ rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
+}
+
+/*
+ * This function computes the actual size, in blocks, that can be prefetched,
+ * and fetches it.
+ */
+static uint64_t
+dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+{
+ uint64_t fetchsz;
+ uint64_t i;
+
+ fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
+
+ for (i = 0; i < fetchsz; i++) {
+ dbuf_prefetch(dn, blkid + i);
+ }
+
+ return (fetchsz);
+}
+
+/*
+ * This function returns the number of blocks that would be prefetched, based
+ * upon the supplied dnode, blockid, and nblks. This is used so that we can
+ * update streams in place, and then prefetch with their old value after the
+ * fact. This way, we can delay the prefetch, but subsequent accesses to the
+ * stream won't result in the same data being prefetched multiple times.
+ */
+static uint64_t
+dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+{
+ uint64_t fetchsz;
+
+ if (blkid > dn->dn_maxblkid) {
+ return (0);
+ }
+
+ /* compute fetch size */
+ if (blkid + nblks + 1 > dn->dn_maxblkid) {
+ fetchsz = (dn->dn_maxblkid - blkid) + 1;
+ ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
+ } else {
+ fetchsz = nblks;
+ }
+
+
+ return (fetchsz);
+}
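+
+/*
+ * For example (made-up numbers): with dn_maxblkid == 100, a request
+ * for blkid == 98, nblks == 8 clamps to fetchsz == (100 - 98) + 1 == 3,
+ * so only blocks 98..100 are prefetched.
+ */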
+
+/*
+ * Given a zfetch and a zsearch structure, see if there is an associated zstream
+ * for this block read. If so, start a prefetch for the stream it
+ * located and return true; otherwise return false.
+ */
+static int
+dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
+{
+ zstream_t *zs;
+ int64_t diff;
+ int reset = !prefetched;
+ int rc = 0;
+
+ if (zh == NULL)
+ return (0);
+
+ /*
+ * XXX: This locking strategy is a bit coarse; however, its impact has
+ * yet to be tested. If this turns out to be an issue, it can be
+ * modified in a number of different ways.
+ */
+
+ rw_enter(&zf->zf_rwlock, RW_READER);
+top:
+
+ for (zs = list_head(&zf->zf_stream); zs;
+ zs = list_next(&zf->zf_stream, zs)) {
+
+ /*
+ * XXX - should this be an assert?
+ */
+ if (zs->zst_len == 0) {
+ /* bogus stream */
+ continue;
+ }
+
+ /*
+ * We hit this case when we are in a strided prefetch stream:
+ * we will read "len" blocks before "striding".
+ */
+ if (zh->zst_offset >= zs->zst_offset &&
+ zh->zst_offset < zs->zst_offset + zs->zst_len) {
+ /* already fetched */
+ rc = 1;
+ goto out;
+ }
+
+ /*
+ * This is the forward sequential read case: we increment
+ * len by one each time we hit here, so we will enter this
+ * case on every read.
+ */
+ if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
+
+ reset = !prefetched && zs->zst_len > 1;
+
+ mutex_enter(&zs->zst_lock);
+
+ if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+ zs->zst_len += zh->zst_len;
+ diff = zs->zst_len - zfetch_block_cap;
+ if (diff > 0) {
+ zs->zst_offset += diff;
+ zs->zst_len = zs->zst_len > diff ?
+ zs->zst_len - diff : 0;
+ }
+ zs->zst_direction = ZFETCH_FORWARD;
+
+ break;
+
+ /*
+ * Same as above, but reading backwards through the file.
+ */
+ } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
+ /* backwards sequential access */
+
+ reset = !prefetched && zs->zst_len > 1;
+
+ mutex_enter(&zs->zst_lock);
+
+ if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_offset = zs->zst_offset > zh->zst_len ?
+ zs->zst_offset - zh->zst_len : 0;
+ zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
+ zs->zst_ph_offset - zh->zst_len : 0;
+ zs->zst_len += zh->zst_len;
+
+ diff = zs->zst_len - zfetch_block_cap;
+ if (diff > 0) {
+ zs->zst_ph_offset = zs->zst_ph_offset > diff ?
+ zs->zst_ph_offset - diff : 0;
+ zs->zst_len = zs->zst_len > diff ?
+ zs->zst_len - diff : zs->zst_len;
+ }
+ zs->zst_direction = ZFETCH_BACKWARD;
+
+ break;
+
+ } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
+ zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
+ /* strided forward access */
+
+ mutex_enter(&zs->zst_lock);
+
+ if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
+ zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_offset += zs->zst_stride;
+ zs->zst_direction = ZFETCH_FORWARD;
+
+ break;
+
+ } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
+ zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
+ /* strided reverse access */
+
+ mutex_enter(&zs->zst_lock);
+
+ if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
+ zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
+ mutex_exit(&zs->zst_lock);
+ goto top;
+ }
+
+ zs->zst_offset = zs->zst_offset > zs->zst_stride ?
+ zs->zst_offset - zs->zst_stride : 0;
+ zs->zst_ph_offset = (zs->zst_ph_offset >
+ (2 * zs->zst_stride)) ?
+ (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
+ zs->zst_direction = ZFETCH_BACKWARD;
+
+ break;
+ }
+ }
+
+ if (zs) {
+ if (reset) {
+ zstream_t *remove = zs;
+
+ rc = 0;
+ mutex_exit(&zs->zst_lock);
+ rw_exit(&zf->zf_rwlock);
+ rw_enter(&zf->zf_rwlock, RW_WRITER);
+ /*
+ * Re-locate the stream, in case someone removed
+ * it while we were acquiring the WRITER lock.
+ */
+ for (zs = list_head(&zf->zf_stream); zs;
+ zs = list_next(&zf->zf_stream, zs)) {
+ if (zs == remove) {
+ dmu_zfetch_stream_remove(zf, zs);
+ mutex_destroy(&zs->zst_lock);
+ kmem_free(zs, sizeof (zstream_t));
+ break;
+ }
+ }
+ } else {
+ rc = 1;
+ dmu_zfetch_dofetch(zf, zs);
+ mutex_exit(&zs->zst_lock);
+ }
+ }
+out:
+ rw_exit(&zf->zf_rwlock);
+ return (rc);
+}
+
+/*
+ * Clean up state associated with a zfetch structure. This frees allocated
+ * structure members, empties the zf_stream list, and generally makes things
+ * nice. This doesn't free the zfetch_t itself; that's left to the caller.
+ */
+void
+dmu_zfetch_rele(zfetch_t *zf)
+{
+ zstream_t *zs;
+ zstream_t *zs_next;
+
+ ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
+
+ for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
+ zs_next = list_next(&zf->zf_stream, zs);
+
+ list_remove(&zf->zf_stream, zs);
+ mutex_destroy(&zs->zst_lock);
+ kmem_free(zs, sizeof (zstream_t));
+ }
+ list_destroy(&zf->zf_stream);
+ rw_destroy(&zf->zf_rwlock);
+
+ zf->zf_dnode = NULL;
+}
+
+/*
+ * Given a zfetch and zstream structure, insert the zstream structure into the
+ * stream list contained within the zfetch structure. Perform the appropriate
+ * book-keeping. It is possible that another thread has inserted a stream which
+ * matches one that we are about to insert, so we must be sure to check for this
+ * case. If one is found, return failure, and let the caller clean up the
+ * duplicates.
+ */
+static int
+dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
+{
+ zstream_t *zs_walk;
+ zstream_t *zs_next;
+
+ ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+
+ for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
+ zs_next = list_next(&zf->zf_stream, zs_walk);
+
+ if (dmu_zfetch_streams_equal(zs_walk, zs)) {
+ return (0);
+ }
+ }
+
+ list_insert_head(&zf->zf_stream, zs);
+ zf->zf_stream_cnt++;
+
+ return (1);
+}
+
+
+/*
+ * Walk the list of zstreams in the given zfetch, find an old one (by time), and
+ * reclaim it for use by the caller.
+ */
+static zstream_t *
+dmu_zfetch_stream_reclaim(zfetch_t *zf)
+{
+ zstream_t *zs;
+
+ if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
+ return (0);
+
+ for (zs = list_head(&zf->zf_stream); zs;
+ zs = list_next(&zf->zf_stream, zs)) {
+
+ if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap)
+ break;
+ }
+
+ if (zs) {
+ dmu_zfetch_stream_remove(zf, zs);
+ mutex_destroy(&zs->zst_lock);
+ bzero(zs, sizeof (zstream_t));
+ } else {
+ zf->zf_alloc_fail++;
+ }
+ rw_exit(&zf->zf_rwlock);
+
+ return (zs);
+}
+
+/*
+ * Given a zfetch and zstream structure, remove the zstream structure from its
+ * container in the zfetch structure. Perform the appropriate book-keeping.
+ */
+static void
+dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
+{
+ ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+
+ list_remove(&zf->zf_stream, zs);
+ zf->zf_stream_cnt--;
+}
+
+static int
+dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
+{
+ if (zs1->zst_offset != zs2->zst_offset)
+ return (0);
+
+ if (zs1->zst_len != zs2->zst_len)
+ return (0);
+
+ if (zs1->zst_stride != zs2->zst_stride)
+ return (0);
+
+ if (zs1->zst_ph_offset != zs2->zst_ph_offset)
+ return (0);
+
+ if (zs1->zst_cap != zs2->zst_cap)
+ return (0);
+
+ if (zs1->zst_direction != zs2->zst_direction)
+ return (0);
+
+ return (1);
+}
+
+/*
+ * This is the prefetch entry point. It calls all of the other dmu_zfetch
+ * routines to create, delete, find, or operate upon prefetch streams.
+ */
+void
+dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
+{
+ zstream_t zst;
+ zstream_t *newstream;
+ int fetched;
+ int inserted;
+ unsigned int blkshft;
+ uint64_t blksz;
+
+ if (zfs_prefetch_disable)
+ return;
+
+ /* files without a power-of-2 blocksize are only one block -- nothing to do */
+ if (!zf->zf_dnode->dn_datablkshift)
+ return;
+
+ /* convert offset and size, into blockid and nblocks */
+ blkshft = zf->zf_dnode->dn_datablkshift;
+ blksz = (1 << blkshft);
+
+ bzero(&zst, sizeof (zstream_t));
+ zst.zst_offset = offset >> blkshft;
+ zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
+ P2ALIGN(offset, blksz)) >> blkshft;
+
+ fetched = dmu_zfetch_find(zf, &zst, prefetched);
+ if (!fetched) {
+ fetched = dmu_zfetch_colinear(zf, &zst);
+ }
+
+ if (!fetched) {
+ newstream = dmu_zfetch_stream_reclaim(zf);
+
+ /*
+ * We still couldn't find a stream; drop the lock and allocate
+ * one if possible. Otherwise, give up and go home.
+ */
+ if (newstream == NULL) {
+ uint64_t maxblocks;
+ uint32_t max_streams;
+ uint32_t cur_streams;
+
+ cur_streams = zf->zf_stream_cnt;
+ maxblocks = zf->zf_dnode->dn_maxblkid;
+
+ max_streams = MIN(zfetch_max_streams,
+ (maxblocks / zfetch_block_cap));
+ if (max_streams == 0) {
+ max_streams++;
+ }
+
+ if (cur_streams >= max_streams) {
+ return;
+ }
+
+ newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
+ }
+
+ newstream->zst_offset = zst.zst_offset;
+ newstream->zst_len = zst.zst_len;
+ newstream->zst_stride = zst.zst_len;
+ newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
+ newstream->zst_cap = zst.zst_len;
+ newstream->zst_direction = ZFETCH_FORWARD;
+ newstream->zst_last = lbolt;
+
+ mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ rw_enter(&zf->zf_rwlock, RW_WRITER);
+ inserted = dmu_zfetch_stream_insert(zf, newstream);
+ rw_exit(&zf->zf_rwlock);
+
+ if (!inserted) {
+ mutex_destroy(&newstream->zst_lock);
+ kmem_free(newstream, sizeof (zstream_t));
+ }
+ }
+}
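+
+/*
+ * Worked example of the offset/size conversion above (made-up numbers):
+ * with dn_datablkshift == 13 (8K blocks), a read of 8192 bytes at byte
+ * offset 12288 gives zst_offset = 12288 >> 13 = 1 and zst_len =
+ * (P2ROUNDUP(20480, 8192) - P2ALIGN(12288, 8192)) >> 13 = 2; that is,
+ * the request touches blocks 1 and 2.
+ */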
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
new file mode 100644
index 0000000..65bb518
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
@@ -0,0 +1,1370 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+
+static int free_range_compar(const void *node1, const void *node2);
+
+static kmem_cache_t *dnode_cache;
+
+static dnode_phys_t dnode_phys_zero;
+
+int zfs_default_bs = SPA_MINBLOCKSHIFT;
+int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+
+/* ARGSUSED */
+static int
+dnode_cons(void *arg, void *unused, int kmflag)
+{
+ int i;
+ dnode_t *dn = arg;
+ bzero(dn, sizeof (dnode_t));
+
+ cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
+ rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ refcount_create(&dn->dn_holds);
+ refcount_create(&dn->dn_tx_holds);
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ avl_create(&dn->dn_ranges[i], free_range_compar,
+ sizeof (free_range_t),
+ offsetof(struct free_range, fr_node));
+ list_create(&dn->dn_dirty_records[i],
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ }
+
+ list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dnode_dest(void *arg, void *unused)
+{
+ int i;
+ dnode_t *dn = arg;
+
+ cv_destroy(&dn->dn_notxholds);
+ rw_destroy(&dn->dn_struct_rwlock);
+ mutex_destroy(&dn->dn_mtx);
+ mutex_destroy(&dn->dn_dbufs_mtx);
+ refcount_destroy(&dn->dn_holds);
+ refcount_destroy(&dn->dn_tx_holds);
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ avl_destroy(&dn->dn_ranges[i]);
+ list_destroy(&dn->dn_dirty_records[i]);
+ }
+
+ list_destroy(&dn->dn_dbufs);
+}
+
+void
+dnode_init(void)
+{
+ dnode_cache = kmem_cache_create("dnode_t",
+ sizeof (dnode_t),
+ 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+}
+
+void
+dnode_fini(void)
+{
+ kmem_cache_destroy(dnode_cache);
+}
+
+
+#ifdef ZFS_DEBUG
+void
+dnode_verify(dnode_t *dn)
+{
+ int drop_struct_lock = FALSE;
+
+ ASSERT(dn->dn_phys);
+ ASSERT(dn->dn_objset);
+
+ ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+
+ if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
+ return;
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+ if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
+ int i;
+ ASSERT3U(dn->dn_indblkshift, >=, 0);
+ ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
+ if (dn->dn_datablkshift) {
+ ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
+ ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
+ ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
+ }
+ ASSERT3U(dn->dn_nlevels, <=, 30);
+ ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES);
+ ASSERT3U(dn->dn_nblkptr, >=, 1);
+ ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+ ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(dn->dn_datablksz, ==,
+ dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
+ ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
+ dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
+ }
+ }
+ if (dn->dn_phys->dn_type != DMU_OT_NONE)
+ ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL);
+ if (dn->dn_dbuf != NULL) {
+ ASSERT3P(dn->dn_phys, ==,
+ (dnode_phys_t *)dn->dn_dbuf->db.db_data +
+ (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
+ }
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+}
+#endif
+
+void
+dnode_byteswap(dnode_phys_t *dnp)
+{
+ uint64_t *buf64 = (void*)&dnp->dn_blkptr;
+ int i;
+
+ if (dnp->dn_type == DMU_OT_NONE) {
+ bzero(dnp, sizeof (dnode_phys_t));
+ return;
+ }
+
+ dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
+ dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+ dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
+ dnp->dn_used = BSWAP_64(dnp->dn_used);
+
+ /*
+ * dn_nblkptr is only one byte, so it's OK to read it in either
+ * byte order. We can't read dn_bonuslen.
+ */
+ ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
+ ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
+ for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
+ buf64[i] = BSWAP_64(buf64[i]);
+
+ /*
+ * OK to check dn_bonuslen for zero, because it won't matter if
+ * we have the wrong byte order. This is necessary because the
+ * dnode dnode is smaller than a regular dnode.
+ */
+ if (dnp->dn_bonuslen != 0) {
+ /*
+ * Note that the bonus length calculated here may be
+ * longer than the actual bonus buffer. This is because
+ * we always put the bonus buffer after the last block
+ * pointer (instead of packing it against the end of the
+ * dnode buffer).
+ */
+ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
+ size_t len = DN_MAX_BONUSLEN - off;
+ ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES);
+ dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
+ }
+}
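+
+/*
+ * Worked example of the bonus layout above: sizeof (blkptr_t) is 128
+ * and DN_MAX_BONUSLEN is 320 here, so a dnode with dn_nblkptr == 3 has
+ * off == (3 - 1) * 128 == 256 and byteswaps len == 320 - 256 == 64
+ * bytes of bonus, even if dn_bonuslen itself is smaller.
+ */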
+
+void
+dnode_buf_byteswap(void *vbuf, size_t size)
+{
+ dnode_phys_t *buf = vbuf;
+ int i;
+
+ ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
+ ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
+
+ size >>= DNODE_SHIFT;
+ for (i = 0; i < size; i++) {
+ dnode_byteswap(buf);
+ buf++;
+ }
+}
+
+static int
+free_range_compar(const void *node1, const void *node2)
+{
+ const free_range_t *rp1 = node1;
+ const free_range_t *rp2 = node2;
+
+ if (rp1->fr_blkid < rp2->fr_blkid)
+ return (-1);
+ else if (rp1->fr_blkid > rp2->fr_blkid)
+ return (1);
+ else return (0);
+}
+
+static void
+dnode_setdblksz(dnode_t *dn, int size)
+{
+ ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0);
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
+ ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
+ 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
+ dn->dn_datablksz = size;
+ dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
+ dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
+}
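+
+/*
+ * For example (made-up sizes): size == 8192 gives dn_datablkszsec == 16
+ * and dn_datablkshift == highbit(8191) == 13, while a non-power-of-2
+ * size such as 6144 leaves dn_datablkshift == 0, marking a
+ * single-block object.
+ */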
+
+static dnode_t *
+dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+ uint64_t object)
+{
+ dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+ (void) dnode_cons(dn, NULL, 0); /* XXX */
+
+ dn->dn_objset = os;
+ dn->dn_object = object;
+ dn->dn_dbuf = db;
+ dn->dn_phys = dnp;
+
+ if (dnp->dn_datablkszsec)
+ dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ dn->dn_indblkshift = dnp->dn_indblkshift;
+ dn->dn_nlevels = dnp->dn_nlevels;
+ dn->dn_type = dnp->dn_type;
+ dn->dn_nblkptr = dnp->dn_nblkptr;
+ dn->dn_checksum = dnp->dn_checksum;
+ dn->dn_compress = dnp->dn_compress;
+ dn->dn_bonustype = dnp->dn_bonustype;
+ dn->dn_bonuslen = dnp->dn_bonuslen;
+ dn->dn_maxblkid = dnp->dn_maxblkid;
+
+ dmu_zfetch_init(&dn->dn_zfetch, dn);
+
+ ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+ mutex_enter(&os->os_lock);
+ list_insert_head(&os->os_dnodes, dn);
+ mutex_exit(&os->os_lock);
+
+ return (dn);
+}
+
+static void
+dnode_destroy(dnode_t *dn)
+{
+ objset_impl_t *os = dn->dn_objset;
+
+#ifdef ZFS_DEBUG
+ int i;
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+ ASSERT(NULL == list_head(&dn->dn_dirty_records[i]));
+ ASSERT(0 == avl_numnodes(&dn->dn_ranges[i]));
+ }
+ ASSERT(NULL == list_head(&dn->dn_dbufs));
+#endif
+
+ mutex_enter(&os->os_lock);
+ list_remove(&os->os_dnodes, dn);
+ mutex_exit(&os->os_lock);
+
+ if (dn->dn_dirtyctx_firstset) {
+ kmem_free(dn->dn_dirtyctx_firstset, 1);
+ dn->dn_dirtyctx_firstset = NULL;
+ }
+ dmu_zfetch_rele(&dn->dn_zfetch);
+ if (dn->dn_bonus) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_evict(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ }
+ kmem_cache_free(dnode_cache, dn);
+}
+
+void
+dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ int i;
+
+ if (blocksize == 0)
+ blocksize = 1 << zfs_default_bs;
+ else if (blocksize > SPA_MAXBLOCKSIZE)
+ blocksize = SPA_MAXBLOCKSIZE;
+ else
+ blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
+
+ if (ibs == 0)
+ ibs = zfs_default_ibs;
+
+ ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
+
+ dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
+ dn->dn_object, tx->tx_txg, blocksize, ibs);
+
+ ASSERT(dn->dn_type == DMU_OT_NONE);
+ ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
+ ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
+ ASSERT(ot != DMU_OT_NONE);
+ ASSERT3U(ot, <, DMU_OT_NUMTYPES);
+ ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+ (bonustype != DMU_OT_NONE && bonuslen != 0));
+ ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+ ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT(dn->dn_type == DMU_OT_NONE);
+ ASSERT3U(dn->dn_maxblkid, ==, 0);
+ ASSERT3U(dn->dn_allocated_txg, ==, 0);
+ ASSERT3U(dn->dn_assigned_txg, ==, 0);
+ ASSERT(refcount_is_zero(&dn->dn_tx_holds));
+ ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
+ ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
+ ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+ ASSERT3U(dn->dn_next_blksz[i], ==, 0);
+ ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+ ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
+ ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
+ }
+
+ dn->dn_type = ot;
+ dnode_setdblksz(dn, blocksize);
+ dn->dn_indblkshift = ibs;
+ dn->dn_nlevels = 1;
+ dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ dn->dn_bonustype = bonustype;
+ dn->dn_bonuslen = bonuslen;
+ dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+ dn->dn_compress = ZIO_COMPRESS_INHERIT;
+ dn->dn_dirtyctx = 0;
+
+ dn->dn_free_txg = 0;
+ if (dn->dn_dirtyctx_firstset) {
+ kmem_free(dn->dn_dirtyctx_firstset, 1);
+ dn->dn_dirtyctx_firstset = NULL;
+ }
+
+ dn->dn_allocated_txg = tx->tx_txg;
+
+ dnode_setdirty(dn, tx);
+ dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
+ dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
+}
+
+void
+dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ int i;
+ dmu_buf_impl_t *db = NULL;
+
+ ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
+ ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
+ ASSERT(tx->tx_txg != 0);
+ ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+ (bonustype != DMU_OT_NONE && bonuslen != 0));
+ ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+ ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+
+ for (i = 0; i < TXG_SIZE; i++)
+ ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+
+ /* clean up any unreferenced dbufs */
+ (void) dnode_evict_dbufs(dn, 0);
+ ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+ /*
+ * XXX I should really have a generation number to tell if we
+ * need to do this...
+ */
+ if (blocksize != dn->dn_datablksz ||
+ dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) {
+ /* free all old data */
+ dnode_free_range(dn, 0, -1ULL, tx);
+ }
+
+ /* change blocksize */
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (blocksize != dn->dn_datablksz &&
+ (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
+ list_head(&dn->dn_dbufs) != NULL)) {
+ db = dbuf_hold(dn, 0, FTAG);
+ dbuf_new_size(db, blocksize, tx);
+ }
+ dnode_setdblksz(dn, blocksize);
+ dnode_setdirty(dn, tx);
+ dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
+ rw_exit(&dn->dn_struct_rwlock);
+ if (db) {
+ dbuf_rele(db, FTAG);
+ db = NULL;
+ }
+
+ /* change type */
+ dn->dn_type = ot;
+
+ if (dn->dn_bonuslen != bonuslen) {
+ /* change bonus size */
+ if (bonuslen == 0)
+ bonuslen = 1; /* XXX */
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus == NULL)
+ dn->dn_bonus = dbuf_create_bonus(dn);
+ db = dn->dn_bonus;
+ rw_exit(&dn->dn_struct_rwlock);
+ if (refcount_add(&db->db_holds, FTAG) == 1)
+ dnode_add_ref(dn, db);
+ VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
+ mutex_enter(&db->db_mtx);
+ ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+ ASSERT(db->db.db_data != NULL);
+ db->db.db_size = bonuslen;
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_dirty(db, tx);
+ }
+
+ /* change bonus size and type */
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_bonustype = bonustype;
+ dn->dn_bonuslen = bonuslen;
+ dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+ dn->dn_compress = ZIO_COMPRESS_INHERIT;
+ ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+
+ /*
+ * NB: we have to do the dbuf_rele after we've changed the
+ * dn_bonuslen, for the sake of dbuf_verify().
+ */
+ if (db)
+ dbuf_rele(db, FTAG);
+
+ dn->dn_allocated_txg = tx->tx_txg;
+ mutex_exit(&dn->dn_mtx);
+}
+
+void
+dnode_special_close(dnode_t *dn)
+{
+ /*
+ * Wait for final references to the dnode to clear. This can
+ * only happen if the arc is asynchronously evicting state that
+ * has a hold on this dnode while we are trying to evict this
+ * dnode.
+ */
+ while (refcount_count(&dn->dn_holds) > 0)
+ delay(1);
+ dnode_destroy(dn);
+}
+
+dnode_t *
+dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object)
+{
+ dnode_t *dn = dnode_create(os, dnp, NULL, object);
+ DNODE_VERIFY(dn);
+ return (dn);
+}
+
+static void
+dnode_buf_pageout(dmu_buf_t *db, void *arg)
+{
+ dnode_t **children_dnodes = arg;
+ int i;
+ int epb = db->db_size >> DNODE_SHIFT;
+
+ for (i = 0; i < epb; i++) {
+ dnode_t *dn = children_dnodes[i];
+ int n;
+
+ if (dn == NULL)
+ continue;
+#ifdef ZFS_DEBUG
+ /*
+ * If there are holds on this dnode, then there should
+ * be holds on the dnode's containing dbuf as well; thus
+ * it wouldn't be eligible for eviction and this function
+ * would not have been called.
+ */
+ ASSERT(refcount_is_zero(&dn->dn_holds));
+ ASSERT(list_head(&dn->dn_dbufs) == NULL);
+ ASSERT(refcount_is_zero(&dn->dn_tx_holds));
+
+ for (n = 0; n < TXG_SIZE; n++)
+ ASSERT(!list_link_active(&dn->dn_dirty_link[n]));
+#endif
+ children_dnodes[i] = NULL;
+ dnode_destroy(dn);
+ }
+ kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+}
+
+/*
+ * errors:
+ * EINVAL - invalid object number.
+ * EIO - i/o error.
+ * succeeds even for free dnodes.
+ */
+int
+dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
+ void *tag, dnode_t **dnp)
+{
+ int epb, idx, err;
+ int drop_struct_lock = FALSE;
+ int type;
+ uint64_t blk;
+ dnode_t *mdn, *dn;
+ dmu_buf_impl_t *db;
+ dnode_t **children_dnodes;
+
+ if (object == 0 || object >= DN_MAX_OBJECT)
+ return (EINVAL);
+
+ mdn = os->os_meta_dnode;
+
+ DNODE_VERIFY(mdn);
+
+ if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
+ rw_enter(&mdn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+
+ blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+
+ db = dbuf_hold(mdn, blk, FTAG);
+ if (drop_struct_lock)
+ rw_exit(&mdn->dn_struct_rwlock);
+ if (db == NULL)
+ return (EIO);
+ err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+ if (err) {
+ dbuf_rele(db, FTAG);
+ return (err);
+ }
+
+ ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
+ epb = db->db.db_size >> DNODE_SHIFT;
+
+ idx = object & (epb-1);
+
+ children_dnodes = dmu_buf_get_user(&db->db);
+ if (children_dnodes == NULL) {
+ dnode_t **winner;
+ children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *),
+ KM_SLEEP);
+ if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
+ dnode_buf_pageout)) {
+ kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+ children_dnodes = winner;
+ }
+ }
+
+ if ((dn = children_dnodes[idx]) == NULL) {
+ dnode_t *winner;
+ dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx,
+ db, object);
+ winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
+ if (winner != NULL) {
+ dnode_destroy(dn);
+ dn = winner;
+ }
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ type = dn->dn_type;
+ if (dn->dn_free_txg ||
+ ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
+ ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) {
+ mutex_exit(&dn->dn_mtx);
+ dbuf_rele(db, FTAG);
+ return (type == DMU_OT_NONE ? ENOENT : EEXIST);
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ if (refcount_add(&dn->dn_holds, tag) == 1)
+ dbuf_add_ref(db, dn);
+
+ DNODE_VERIFY(dn);
+ ASSERT3P(dn->dn_dbuf, ==, db);
+ ASSERT3U(dn->dn_object, ==, object);
+ dbuf_rele(db, FTAG);
+
+ *dnp = dn;
+ return (0);
+}
+
+/*
+ * Return held dnode if the object is allocated, NULL if not.
+ */
+int
+dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp)
+{
+ return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
+}
+
+void
+dnode_add_ref(dnode_t *dn, void *tag)
+{
+ ASSERT(refcount_count(&dn->dn_holds) > 0);
+ (void) refcount_add(&dn->dn_holds, tag);
+}
+
+void
+dnode_rele(dnode_t *dn, void *tag)
+{
+ uint64_t refs;
+
+ refs = refcount_remove(&dn->dn_holds, tag);
+ /* NOTE: the DNODE_DNODE does not have a dn_dbuf */
+ if (refs == 0 && dn->dn_dbuf)
+ dbuf_rele(dn->dn_dbuf, dn);
+}
+
+void
+dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
+{
+ objset_impl_t *os = dn->dn_objset;
+ uint64_t txg = tx->tx_txg;
+
+ if (dn->dn_object == DMU_META_DNODE_OBJECT)
+ return;
+
+ DNODE_VERIFY(dn);
+
+#ifdef ZFS_DEBUG
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
+ /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */
+ mutex_exit(&dn->dn_mtx);
+#endif
+
+ mutex_enter(&os->os_lock);
+
+ /*
+ * If we are already marked dirty, we're done.
+ */
+ if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
+ mutex_exit(&os->os_lock);
+ return;
+ }
+
+ ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+ ASSERT(dn->dn_datablksz != 0);
+ ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
+
+ dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
+ dn->dn_object, txg);
+
+ if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
+ list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
+ } else {
+ list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
+ }
+
+ mutex_exit(&os->os_lock);
+
+ /*
+ * The dnode maintains a hold on its containing dbuf as
+ * long as there are holds on it. Each instantiated child
+ * dbuf maintains a hold on the dnode. When the last child
+ * drops its hold, the dnode will drop its hold on the
+ * containing dbuf. We add a "dirty hold" here so that the
+ * dnode will hang around after we finish processing its
+ * children.
+ */
+ dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg);
+
+ (void) dbuf_dirty(dn->dn_dbuf, tx);
+
+ dsl_dataset_dirty(os->os_dsl_dataset, tx);
+}
+
+void
+dnode_free(dnode_t *dn, dmu_tx_t *tx)
+{
+ int txgoff = tx->tx_txg & TXG_MASK;
+
+ dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
+
+ /* we should be the only holder... hopefully */
+ /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
+
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
+ mutex_exit(&dn->dn_mtx);
+ return;
+ }
+ dn->dn_free_txg = tx->tx_txg;
+ mutex_exit(&dn->dn_mtx);
+
+ /*
+ * If the dnode is already dirty, it needs to be moved from
+ * the dirty list to the free list.
+ */
+ mutex_enter(&dn->dn_objset->os_lock);
+ if (list_link_active(&dn->dn_dirty_link[txgoff])) {
+ list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn);
+ list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn);
+ mutex_exit(&dn->dn_objset->os_lock);
+ } else {
+ mutex_exit(&dn->dn_objset->os_lock);
+ dnode_setdirty(dn, tx);
+ }
+}
+
+/*
+ * Try to change the block size for the indicated dnode. This can only
+ * succeed if there are no blocks allocated or dirty beyond the first block.
+ */
+int
+dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db, *db_next;
+ int have_db0 = FALSE;
+
+ if (size == 0)
+ size = SPA_MINBLOCKSIZE;
+ if (size > SPA_MAXBLOCKSIZE)
+ size = SPA_MAXBLOCKSIZE;
+ else
+ size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
+
+ if (ibs == dn->dn_indblkshift)
+ ibs = 0;
+
+ if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
+ return (0);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ /* Check for any allocated blocks beyond the first */
+ if (dn->dn_phys->dn_maxblkid != 0)
+ goto fail;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+ for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+ db_next = list_next(&dn->dn_dbufs, db);
+
+ if (db->db_blkid == 0) {
+ have_db0 = TRUE;
+ } else if (db->db_blkid != DB_BONUS_BLKID) {
+ mutex_exit(&dn->dn_dbufs_mtx);
+ goto fail;
+ }
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+
+ if (ibs && dn->dn_nlevels != 1)
+ goto fail;
+
+ db = NULL;
+ if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) {
+ /* obtain the old block */
+ db = dbuf_hold(dn, 0, FTAG);
+ dbuf_new_size(db, size, tx);
+ }
+
+ dnode_setdblksz(dn, size);
+ dnode_setdirty(dn, tx);
+ dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
+ if (ibs) {
+ dn->dn_indblkshift = ibs;
+ dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+ }
+
+ if (db)
+ dbuf_rele(db, FTAG);
+
+ rw_exit(&dn->dn_struct_rwlock);
+ return (0);
+
+fail:
+ rw_exit(&dn->dn_struct_rwlock);
+ return (ENOTSUP);
+}
+
+void
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
+{
+ uint64_t txgoff = tx->tx_txg & TXG_MASK;
+ int drop_struct_lock = FALSE;
+ int epbs, new_nlevels;
+ uint64_t sz;
+
+ ASSERT(blkid != DB_BONUS_BLKID);
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ drop_struct_lock = TRUE;
+ }
+
+ if (blkid <= dn->dn_maxblkid)
+ goto out;
+
+ dn->dn_maxblkid = blkid;
+
+ /*
+ * Compute the number of levels necessary to support the new maxblkid.
+ */
+ new_nlevels = 1;
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ for (sz = dn->dn_nblkptr;
+ sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
+ new_nlevels++;
+
+ if (new_nlevels > dn->dn_nlevels) {
+ int old_nlevels = dn->dn_nlevels;
+ dmu_buf_impl_t *db;
+ list_t *list;
+ dbuf_dirty_record_t *new, *dr, *dr_next;
+
+ dn->dn_nlevels = new_nlevels;
+
+ ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
+ dn->dn_next_nlevels[txgoff] = new_nlevels;
+
+ /* dirty the left indirects */
+ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+ new = dbuf_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+
+ /* transfer the dirty records to the new indirect */
+ mutex_enter(&dn->dn_mtx);
+ mutex_enter(&new->dt.di.dr_mtx);
+ list = &dn->dn_dirty_records[txgoff];
+ for (dr = list_head(list); dr; dr = dr_next) {
+ dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
+ if (dr->dr_dbuf->db_level != new_nlevels-1 &&
+ dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) {
+ ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
+ list_remove(&dn->dn_dirty_records[txgoff], dr);
+ list_insert_tail(&new->dt.di.dr_children, dr);
+ dr->dr_parent = new;
+ }
+ }
+ mutex_exit(&new->dt.di.dr_mtx);
+ mutex_exit(&dn->dn_mtx);
+ }
+
+out:
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+}
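+
+/*
+ * Worked example of the level computation above (made-up values): with
+ * dn_indblkshift == 14, epbs == 14 - SPA_BLKPTRSHIFT == 7 (128 block
+ * pointers per indirect) and dn_nblkptr == 3, a new maxblkid of 500
+ * yields new_nlevels == 3: the 3 dnode block pointers cover blkids
+ * 0..2, one indirect level covers 3 * 128 == 384 blkids, and two
+ * levels cover 3 * 128 * 128.
+ */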
+
+void
+dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+ avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
+ avl_index_t where;
+ free_range_t *rp;
+ free_range_t rp_tofind;
+ uint64_t endblk = blkid + nblks;
+
+ ASSERT(MUTEX_HELD(&dn->dn_mtx));
+ ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
+
+ dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+ blkid, nblks, tx->tx_txg);
+ rp_tofind.fr_blkid = blkid;
+ rp = avl_find(tree, &rp_tofind, &where);
+ if (rp == NULL)
+ rp = avl_nearest(tree, where, AVL_BEFORE);
+ if (rp == NULL)
+ rp = avl_nearest(tree, where, AVL_AFTER);
+
+ while (rp && (rp->fr_blkid <= blkid + nblks)) {
+ uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
+ free_range_t *nrp = AVL_NEXT(tree, rp);
+
+ if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
+ /* clear this entire range */
+ avl_remove(tree, rp);
+ kmem_free(rp, sizeof (free_range_t));
+ } else if (blkid <= rp->fr_blkid &&
+ endblk > rp->fr_blkid && endblk < fr_endblk) {
+ /* clear the beginning of this range */
+ rp->fr_blkid = endblk;
+ rp->fr_nblks = fr_endblk - endblk;
+ } else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
+ endblk >= fr_endblk) {
+ /* clear the end of this range */
+ rp->fr_nblks = blkid - rp->fr_blkid;
+ } else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
+ /* clear a chunk out of this range */
+ free_range_t *new_rp =
+ kmem_alloc(sizeof (free_range_t), KM_SLEEP);
+
+ new_rp->fr_blkid = endblk;
+ new_rp->fr_nblks = fr_endblk - endblk;
+ avl_insert_here(tree, new_rp, rp, AVL_AFTER);
+ rp->fr_nblks = blkid - rp->fr_blkid;
+ }
+ /* there may be no overlap */
+ rp = nrp;
+ }
+}
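+
+/*
+ * Worked example of the overlap cases above (made-up block numbers):
+ * with an existing range [10, 20) (fr_blkid == 10, fr_nblks == 10),
+ * clearing blkid == 12, nblks == 4 (endblk == 16) hits the "chunk"
+ * case: the range is split into [10, 12) (fr_nblks == 2) and a new
+ * entry [16, 20) (fr_nblks == 4).
+ */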
+
+void
+dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ uint64_t blkoff, blkid, nblks;
+ int blksz, head;
+ int trunc = FALSE;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ blksz = dn->dn_datablksz;
+
+ /* If the range is past the end of the file, this is a no-op */
+ if (off >= blksz * (dn->dn_maxblkid+1))
+ goto out;
+ if (len == -1ULL) {
+ len = UINT64_MAX - off;
+ trunc = TRUE;
+ }
+
+ /*
+ * First, block align the region to free:
+ */
+ if (ISP2(blksz)) {
+ head = P2NPHASE(off, blksz);
+ blkoff = P2PHASE(off, blksz);
+ } else {
+ ASSERT(dn->dn_maxblkid == 0);
+ if (off == 0 && len >= blksz) {
+ /* Freeing the whole block; don't do any head. */
+ head = 0;
+ } else {
+ /* Freeing part of the block. */
+ head = blksz - off;
+ ASSERT3U(head, >, 0);
+ }
+ blkoff = off;
+ }
+ /* zero out any partial block data at the start of the range */
+ if (head) {
+ ASSERT3U(blkoff + head, ==, blksz);
+ if (len < head)
+ head = len;
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
+ FTAG, &db) == 0) {
+ caddr_t data;
+
+ /* don't dirty if it isn't on disk and isn't dirty */
+ if (db->db_last_dirty ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dbuf_will_dirty(db, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ data = db->db.db_data;
+ bzero(data + blkoff, head);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ off += head;
+ len -= head;
+ }
+
+ /* If the range was less than one block, we're done */
+ if (len == 0 || off >= blksz * (dn->dn_maxblkid+1))
+ goto out;
+
+ if (!ISP2(blksz)) {
+ /*
+ * They are freeing the whole block of a
+ * non-power-of-two blocksize file. Skip all the messy
+ * math.
+ */
+ ASSERT3U(off, ==, 0);
+ ASSERT3U(len, >=, blksz);
+ blkid = 0;
+ nblks = 1;
+ } else {
+ int tail;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int blkshift = dn->dn_datablkshift;
+
+ /* If the remaining range is past end of file, we're done */
+ if (off > dn->dn_maxblkid << blkshift)
+ goto out;
+
+ if (off + len == UINT64_MAX)
+ tail = 0;
+ else
+ tail = P2PHASE(len, blksz);
+
+ ASSERT3U(P2PHASE(off, blksz), ==, 0);
+ /* zero out any partial block data at the end of the range */
+ if (tail) {
+ if (len < tail)
+ tail = len;
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
+ TRUE, FTAG, &db) == 0) {
+ /* don't dirty if not on disk and not dirty */
+ if (db->db_last_dirty ||
+ (db->db_blkptr &&
+ !BP_IS_HOLE(db->db_blkptr))) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dbuf_will_dirty(db, tx);
+ rw_enter(&dn->dn_struct_rwlock,
+ RW_WRITER);
+ bzero(db->db.db_data, tail);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ len -= tail;
+ }
+ /* If the range did not include a full block, we are done */
+ if (len == 0)
+ goto out;
+
+ /* dirty the left indirects */
+ if (dn->dn_nlevels > 1 && off != 0) {
+ db = dbuf_hold_level(dn, 1,
+ (off - head) >> (blkshift + epbs), FTAG);
+ dbuf_will_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+ }
+
+ /* dirty the right indirects */
+ if (dn->dn_nlevels > 1 && !trunc) {
+ db = dbuf_hold_level(dn, 1,
+ (off + len + tail - 1) >> (blkshift + epbs), FTAG);
+ dbuf_will_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+ }
+
+ /*
+ * Finally, add this range to the dnode range list; we
+ * will finish up this free operation in the syncing phase.
+ */
+ ASSERT(IS_P2ALIGNED(off, 1<<blkshift));
+ ASSERT(off + len == UINT64_MAX ||
+ IS_P2ALIGNED(len, 1<<blkshift));
+ blkid = off >> blkshift;
+ nblks = len >> blkshift;
+
+ if (trunc)
+ dn->dn_maxblkid = (blkid ? blkid - 1 : 0);
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ dnode_clear_range(dn, blkid, nblks, tx);
+ {
+ free_range_t *rp, *found;
+ avl_index_t where;
+ avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
+
+ /* Add new range to dn_ranges */
+ rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
+ rp->fr_blkid = blkid;
+ rp->fr_nblks = nblks;
+ found = avl_find(tree, rp, &where);
+ ASSERT(found == NULL);
+ avl_insert(tree, rp, where);
+ dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+ blkid, nblks, tx->tx_txg);
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ dbuf_free_range(dn, blkid, nblks, tx);
+ dnode_setdirty(dn, tx);
+out:
+ rw_exit(&dn->dn_struct_rwlock);
+}
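+
+/*
+ * Worked example of the head/tail alignment above (made-up numbers):
+ * with blksz == 8192, freeing off == 4096, len == 16384 gives
+ * head == P2NPHASE(4096, 8192) == 4096, so bytes 4096..8191 of block 0
+ * are zeroed in place; then tail == P2PHASE(12288, 8192) == 4096
+ * zeroes bytes 16384..20479 of block 2, and only block 1 (blkid == 1,
+ * nblks == 1) is added to the free-range list.
+ */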
+
+/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
+uint64_t
+dnode_block_freed(dnode_t *dn, uint64_t blkid)
+{
+ free_range_t range_tofind;
+ void *dp = spa_get_dsl(dn->dn_objset->os_spa);
+ int i;
+
+ if (blkid == DB_BONUS_BLKID)
+ return (FALSE);
+
+ /*
+ * If we're in the process of opening the pool, dp will not be
+ * set yet, but there shouldn't be anything dirty.
+ */
+ if (dp == NULL)
+ return (FALSE);
+
+ if (dn->dn_free_txg)
+ return (TRUE);
+
+ /*
+ * If dn_datablkshift is not set, then there's only a single
+ * block, in which case there will never be a free range so it
+ * won't matter.
+ */
+ range_tofind.fr_blkid = blkid;
+ mutex_enter(&dn->dn_mtx);
+ for (i = 0; i < TXG_SIZE; i++) {
+ free_range_t *range_found;
+ avl_index_t idx;
+
+ range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
+ if (range_found) {
+ ASSERT(range_found->fr_nblks > 0);
+ break;
+ }
+ range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
+ if (range_found &&
+ range_found->fr_blkid + range_found->fr_nblks > blkid)
+ break;
+ }
+ mutex_exit(&dn->dn_mtx);
+ return (i < TXG_SIZE);
+}
+
+/* call from syncing context when we actually write/free space for this dnode */
+void
+dnode_diduse_space(dnode_t *dn, int64_t delta)
+{
+ uint64_t space;
+ dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
+ dn, dn->dn_phys,
+ (u_longlong_t)dn->dn_phys->dn_used,
+ (longlong_t)delta);
+
+ mutex_enter(&dn->dn_mtx);
+ space = DN_USED_BYTES(dn->dn_phys);
+ if (delta > 0) {
+ ASSERT3U(space + delta, >=, space); /* no overflow */
+ } else {
+ ASSERT3U(space, >=, -delta); /* no underflow */
+ }
+ space += delta;
+ if (spa_version(dn->dn_objset->os_spa) < ZFS_VERSION_DNODE_BYTES) {
+ ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
+ ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0);
+ dn->dn_phys->dn_used = space >> DEV_BSHIFT;
+ } else {
+ dn->dn_phys->dn_used = space;
+ dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
+ }
+ mutex_exit(&dn->dn_mtx);
+}
+
+/*
+ * Call when we think we're going to write/free space in open context.
+ * Be conservative (ie. OK to write less than this or free more than
+ * this, but don't write more or free less).
+ */
+void
+dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
+{
+ objset_impl_t *os = dn->dn_objset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+
+ if (space > 0)
+ space = spa_get_asize(os->os_spa, space);
+
+ if (ds)
+ dsl_dir_willuse_space(ds->ds_dir, space, tx);
+
+ dmu_tx_willuse_space(tx, space);
+}
+
+static int
+dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
+ int lvl, uint64_t blkfill, uint64_t txg)
+{
+ dmu_buf_impl_t *db = NULL;
+ void *data = NULL;
+ uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ uint64_t epb = 1ULL << epbs;
+ uint64_t minfill, maxfill;
+ int i, error, span;
+
+ dprintf("probing object %llu offset %llx level %d of %u\n",
+ dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
+
+ if (lvl == dn->dn_phys->dn_nlevels) {
+ error = 0;
+ epb = dn->dn_phys->dn_nblkptr;
+ data = dn->dn_phys->dn_blkptr;
+ } else {
+ uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
+ error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+ if (error) {
+ if (error == ENOENT)
+ return (hole ? 0 : ESRCH);
+ return (error);
+ }
+ error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
+ if (error) {
+ dbuf_rele(db, FTAG);
+ return (error);
+ }
+ data = db->db.db_data;
+ }
+
+ if (db && txg &&
+ (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
+ error = ESRCH;
+ } else if (lvl == 0) {
+ dnode_phys_t *dnp = data;
+ span = DNODE_SHIFT;
+ ASSERT(dn->dn_type == DMU_OT_DNODE);
+
+ for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) {
+ boolean_t newcontents = B_TRUE;
+ if (txg) {
+ int j;
+ newcontents = B_FALSE;
+ for (j = 0; j < dnp[i].dn_nblkptr; j++) {
+ if (dnp[i].dn_blkptr[j].blk_birth > txg)
+ newcontents = B_TRUE;
+ }
+ }
+ if (!dnp[i].dn_type == hole && newcontents)
+ break;
+ *offset += 1ULL << span;
+ }
+ if (i == blkfill)
+ error = ESRCH;
+ } else {
+ blkptr_t *bp = data;
+ span = (lvl - 1) * epbs + dn->dn_datablkshift;
+ minfill = 0;
+ maxfill = blkfill << ((lvl - 1) * epbs);
+
+ if (hole)
+ maxfill--;
+ else
+ minfill++;
+
+ for (i = (*offset >> span) & ((1ULL << epbs) - 1);
+ i < epb; i++) {
+ if (bp[i].blk_fill >= minfill &&
+ bp[i].blk_fill <= maxfill &&
+ bp[i].blk_birth > txg)
+ break;
+ *offset += 1ULL << span;
+ }
+ if (i >= epb)
+ error = ESRCH;
+ }
+
+ if (db)
+ dbuf_rele(db, FTAG);
+
+ return (error);
+}
+
+/*
+ * Find the next hole, data, or sparse region at or after *offset.
+ * The value 'blkfill' tells us how many items we expect to find
+ * in an L0 data block; this value is 1 for normal objects,
+ * DNODES_PER_BLOCK for the meta dnode, and some fraction of
+ * DNODES_PER_BLOCK when searching for sparse regions thereof.
+ *
+ * Examples:
+ *
+ * dnode_next_offset(dn, hole, offset, 1, 1, 0);
+ * Finds the next hole/data in a file.
+ * Used in dmu_offset_next().
+ *
+ * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK, txg);
+ * Finds the next free/allocated dnode in an objset's meta-dnode.
+ * Only finds objects that have new contents since txg (ie.
+ * bonus buffer changes and content removal are ignored).
+ * Used in dmu_object_next().
+ *
+ * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ * Finds the next L2 meta-dnode bp that's at most 1/4 full.
+ * Used in dmu_object_alloc().
+ */
+int
+dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset,
+ int minlvl, uint64_t blkfill, uint64_t txg)
+{
+ int lvl, maxlvl;
+ int error = 0;
+ uint64_t initial_offset = *offset;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ if (dn->dn_phys->dn_nlevels == 0) {
+ rw_exit(&dn->dn_struct_rwlock);
+ return (ESRCH);
+ }
+
+ if (dn->dn_datablkshift == 0) {
+ if (*offset < dn->dn_datablksz) {
+ if (hole)
+ *offset = dn->dn_datablksz;
+ } else {
+ error = ESRCH;
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ return (error);
+ }
+
+ maxlvl = dn->dn_phys->dn_nlevels;
+
+ for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+ error = dnode_next_offset_level(dn,
+ hole, offset, lvl, blkfill, txg);
+ if (error != ESRCH)
+ break;
+ }
+
+ while (--lvl >= minlvl && error == 0) {
+ error = dnode_next_offset_level(dn,
+ hole, offset, lvl, blkfill, txg);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (error == 0 && initial_offset > *offset)
+ error = ESRCH;
+
+ return (error);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
new file mode 100644
index 0000000..08f60e8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
@@ -0,0 +1,621 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+
+static void
+dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ int nblkptr = dn->dn_phys->dn_nblkptr;
+ int old_toplvl = dn->dn_phys->dn_nlevels - 1;
+ int new_level = dn->dn_next_nlevels[txgoff];
+ int i;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ /* this dnode can't be paged out because it's dirty */
+ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+ ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
+
+ db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
+ ASSERT(db != NULL);
+
+ dn->dn_phys->dn_nlevels = new_level;
+ dprintf("os=%p obj=%llu, increase to %d\n",
+ dn->dn_objset, dn->dn_object,
+ dn->dn_phys->dn_nlevels);
+
+ /* check for existing blkptrs in the dnode */
+ for (i = 0; i < nblkptr; i++)
+ if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
+ break;
+ if (i != nblkptr) {
+ /* transfer dnode's block pointers to new indirect block */
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
+ ASSERT(db->db.db_data);
+ ASSERT(arc_released(db->db_buf));
+ ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
+ bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+ sizeof (blkptr_t) * nblkptr);
+ arc_buf_freeze(db->db_buf);
+ }
+
+ /* set dbuf's parent pointers to new indirect buf */
+ for (i = 0; i < nblkptr; i++) {
+ dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);
+
+ if (child == NULL)
+ continue;
+ ASSERT3P(child->db_dnode, ==, dn);
+ if (child->db_parent && child->db_parent != dn->dn_dbuf) {
+ ASSERT(child->db_parent->db_level == db->db_level);
+ ASSERT(child->db_blkptr !=
+ &dn->dn_phys->dn_blkptr[child->db_blkid]);
+ mutex_exit(&child->db_mtx);
+ continue;
+ }
+ ASSERT(child->db_parent == NULL ||
+ child->db_parent == dn->dn_dbuf);
+
+ child->db_parent = db;
+ dbuf_add_ref(db, child);
+ if (db->db.db_data)
+ child->db_blkptr = (blkptr_t *)db->db.db_data + i;
+ else
+ child->db_blkptr = NULL;
+ dprintf_dbuf_bp(child, child->db_blkptr,
+ "changed db_blkptr to new indirect %s", "");
+
+ mutex_exit(&child->db_mtx);
+ }
+
+ bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
+
+ dbuf_rele(db, FTAG);
+
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+static void
+free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
+{
+ objset_impl_t *os = dn->dn_objset;
+ uint64_t bytesfreed = 0;
+ int i;
+
+ dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num);
+
+ for (i = 0; i < num; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+
+ bytesfreed += bp_get_dasize(os->os_spa, bp);
+ ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
+ dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx);
+ bzero(bp, sizeof (blkptr_t));
+ }
+ dnode_diduse_space(dn, -bytesfreed);
+}
+
+#ifdef ZFS_DEBUG
+static void
+free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
+{
+ int off, num;
+ int i, err, epbs;
+ uint64_t txg = tx->tx_txg;
+
+ epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	off = start - (db->db_blkid << epbs);
+ num = end - start + 1;
+
+ ASSERT3U(off, >=, 0);
+ ASSERT3U(num, >=, 0);
+ ASSERT3U(db->db_level, >, 0);
+ ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift);
+ ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
+ ASSERT(db->db_blkptr != NULL);
+
+ for (i = off; i < off+num; i++) {
+ uint64_t *buf;
+ dmu_buf_impl_t *child;
+ dbuf_dirty_record_t *dr;
+ int j;
+
+ ASSERT(db->db_level == 1);
+
+ rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
+ (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+ if (err == ENOENT)
+ continue;
+ ASSERT(err == 0);
+ ASSERT(child->db_level == 0);
+ dr = child->db_last_dirty;
+ while (dr && dr->dr_txg > txg)
+ dr = dr->dr_next;
+ ASSERT(dr == NULL || dr->dr_txg == txg);
+
+ /* data_old better be zeroed */
+ if (dr) {
+ buf = dr->dt.dl.dr_data->b_data;
+ for (j = 0; j < child->db.db_size >> 3; j++) {
+ if (buf[j] != 0) {
+ panic("freed data not zero: "
+ "child=%p i=%d off=%d num=%d\n",
+ child, i, off, num);
+ }
+ }
+ }
+
+ /*
+ * db_data better be zeroed unless it's dirty in a
+ * future txg.
+ */
+ mutex_enter(&child->db_mtx);
+ buf = child->db.db_data;
+ if (buf != NULL && child->db_state != DB_FILL &&
+ child->db_last_dirty == NULL) {
+ for (j = 0; j < child->db.db_size >> 3; j++) {
+ if (buf[j] != 0) {
+ panic("freed data not zero: "
+ "child=%p i=%d off=%d num=%d\n",
+ child, i, off, num);
+ }
+ }
+ }
+ mutex_exit(&child->db_mtx);
+
+ dbuf_rele(child, FTAG);
+ }
+}
+#endif
+
+static int
+free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn = db->db_dnode;
+ blkptr_t *bp;
+ dmu_buf_impl_t *subdb;
+ uint64_t start, end, dbstart, dbend, i;
+ int epbs, shift, err;
+ int all = TRUE;
+
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+ arc_release(db->db_buf, db);
+ bp = (blkptr_t *)db->db.db_data;
+
+ epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ shift = (db->db_level - 1) * epbs;
+ dbstart = db->db_blkid << epbs;
+ start = blkid >> shift;
+ if (dbstart < start) {
+ bp += start - dbstart;
+ all = FALSE;
+ } else {
+ start = dbstart;
+ }
+ dbend = ((db->db_blkid + 1) << epbs) - 1;
+ end = (blkid + nblks - 1) >> shift;
+ if (dbend <= end)
+ end = dbend;
+ else if (all)
+ all = trunc;
+ ASSERT3U(start, <=, end);
+
+ if (db->db_level == 1) {
+ FREE_VERIFY(db, start, end, tx);
+ free_blocks(dn, bp, end-start+1, tx);
+ arc_buf_freeze(db->db_buf);
+ ASSERT(all || db->db_last_dirty);
+ return (all);
+ }
+
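+	/*
+	 * Recurse into each non-hole child; if a child frees everything
+	 * beneath it, its own block pointer can be freed here as well.
+	 */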
+ for (i = start; i <= end; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb);
+ ASSERT3U(err, ==, 0);
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (free_children(subdb, blkid, nblks, trunc, tx)) {
+ ASSERT3P(subdb->db_blkptr, ==, bp);
+ free_blocks(dn, bp, 1, tx);
+ } else {
+ all = FALSE;
+ }
+ dbuf_rele(subdb, FTAG);
+ }
+ arc_buf_freeze(db->db_buf);
+#ifdef ZFS_DEBUG
+ bp -= (end-start)+1;
+ for (i = start; i <= end; i++, bp++) {
+ if (i == start && blkid != 0)
+ continue;
+ else if (i == end && !trunc)
+ continue;
+ ASSERT3U(bp->blk_birth, ==, 0);
+ }
+#endif
+ ASSERT(all || db->db_last_dirty);
+ return (all);
+}
+
+/*
+ * free_range: Traverse the indicated range of the provided file
+ * and "free" all the blocks contained there.
+ */
+static void
+dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+ blkptr_t *bp = dn->dn_phys->dn_blkptr;
+ dmu_buf_impl_t *db;
+ int trunc, start, end, shift, i, err;
+ int dnlevel = dn->dn_phys->dn_nlevels;
+
+ if (blkid > dn->dn_phys->dn_maxblkid)
+ return;
+
+ ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
+ trunc = blkid + nblks > dn->dn_phys->dn_maxblkid;
+ if (trunc)
+ nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
+
+ /* There are no indirect blocks in the object */
+ if (dnlevel == 1) {
+ if (blkid >= dn->dn_phys->dn_nblkptr) {
+ /* this range was never made persistent */
+ return;
+ }
+ ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
+ free_blocks(dn, bp + blkid, nblks, tx);
+ if (trunc) {
+ uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+ (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
+ ASSERT(off < dn->dn_phys->dn_maxblkid ||
+ dn->dn_phys->dn_maxblkid == 0 ||
+ dnode_next_offset(dn, FALSE, &off,
+ 1, 1, 0) != 0);
+ }
+ return;
+ }
+
+ shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
+ start = blkid >> shift;
+ ASSERT(start < dn->dn_phys->dn_nblkptr);
+ end = (blkid + nblks - 1) >> shift;
+ bp += start;
+ for (i = start; i <= end; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db);
+ ASSERT3U(err, ==, 0);
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (free_children(db, blkid, nblks, trunc, tx)) {
+ ASSERT3P(db->db_blkptr, ==, bp);
+ free_blocks(dn, bp, 1, tx);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ if (trunc) {
+ uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+ (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
+ ASSERT(off < dn->dn_phys->dn_maxblkid ||
+ dn->dn_phys->dn_maxblkid == 0 ||
+ dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0);
+ }
+}
+
+/*
+ * Try to kick all the dnode's dbufs out of the cache...
+ */
+int
+dnode_evict_dbufs(dnode_t *dn, int try)
+{
+ int progress;
+ int pass = 0;
+
+ do {
+ dmu_buf_impl_t *db, marker;
+ int evicting = FALSE;
+
+ progress = FALSE;
+ mutex_enter(&dn->dn_dbufs_mtx);
+ list_insert_tail(&dn->dn_dbufs, &marker);
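+		/*
+		 * Walk with a marker: each visited dbuf is rotated to
+		 * the tail, so the pass is complete once the marker
+		 * reaches the head of the list.
+		 */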
+ db = list_head(&dn->dn_dbufs);
+ for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
+ list_remove(&dn->dn_dbufs, db);
+ list_insert_tail(&dn->dn_dbufs, db);
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_EVICTING) {
+ progress = TRUE;
+ evicting = TRUE;
+ mutex_exit(&db->db_mtx);
+ } else if (refcount_is_zero(&db->db_holds)) {
+ progress = TRUE;
+ ASSERT(!arc_released(db->db_buf));
+ dbuf_clear(db); /* exits db_mtx for us */
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+
+ }
+ list_remove(&dn->dn_dbufs, &marker);
+ /*
+ * NB: we need to drop dn_dbufs_mtx between passes so
+ * that any DB_EVICTING dbufs can make progress.
+ * Ideally, we would have some cv we could wait on, but
+ * since we don't, just wait a bit to give the other
+ * thread a chance to run.
+ */
+ mutex_exit(&dn->dn_dbufs_mtx);
+ if (evicting)
+ delay(1);
+ pass++;
+ ASSERT(pass < 100); /* sanity check */
+ } while (progress);
+
+ /*
+ * This function works fine even if it can't evict everything.
+	 * If we were only asked to try to evict everything, then
+	 * return an error if we can't. Otherwise panic, as the caller
+ * expects total eviction.
+ */
+ if (list_head(&dn->dn_dbufs) != NULL) {
+ if (try) {
+ return (1);
+ } else {
+ panic("dangling dbufs (dn=%p, dbuf=%p)\n",
+ dn, list_head(&dn->dn_dbufs));
+ }
+ }
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_evict(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ return (0);
+}
+
+static void
+dnode_undirty_dbufs(list_t *list)
+{
+ dbuf_dirty_record_t *dr;
+
+	while ((dr = list_head(list)) != NULL) {
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ uint64_t txg = dr->dr_txg;
+
+ mutex_enter(&db->db_mtx);
+ /* XXX - use dbuf_undirty()? */
+ list_remove(list, dr);
+ ASSERT(db->db_last_dirty == dr);
+ db->db_last_dirty = NULL;
+ db->db_dirtycnt -= 1;
+ if (db->db_level == 0) {
+ ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+ dr->dt.dl.dr_data == db->db_buf);
+ dbuf_unoverride(dr);
+ mutex_exit(&db->db_mtx);
+ } else {
+ mutex_exit(&db->db_mtx);
+ dnode_undirty_dbufs(&dr->dt.di.dr_children);
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+ dbuf_rele(db, (void *)(uintptr_t)txg);
+ }
+}
+
+static void
+dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
+{
+ int txgoff = tx->tx_txg & TXG_MASK;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
+ (void) dnode_evict_dbufs(dn, 0);
+ ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+ /*
+ * XXX - It would be nice to assert this, but we may still
+ * have residual holds from async evictions from the arc...
+ *
+ * zfs_obj_to_path() also depends on this being
+ * commented out.
+ *
+ * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
+ */
+
+ /* Undirty next bits */
+ dn->dn_next_nlevels[txgoff] = 0;
+ dn->dn_next_indblkshift[txgoff] = 0;
+ dn->dn_next_blksz[txgoff] = 0;
+
+ /* free up all the blocks in the file. */
+ dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
+ ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0);
+
+ /* ASSERT(blkptrs are zero); */
+ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+
+ ASSERT(dn->dn_free_txg > 0);
+ if (dn->dn_allocated_txg != dn->dn_free_txg)
+ dbuf_will_dirty(dn->dn_dbuf, tx);
+ bzero(dn->dn_phys, sizeof (dnode_phys_t));
+
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_type = DMU_OT_NONE;
+ dn->dn_maxblkid = 0;
+ dn->dn_allocated_txg = 0;
+ mutex_exit(&dn->dn_mtx);
+
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+
+ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+ /*
+ * Now that we've released our hold, the dnode may
+	 * be evicted, so we mustn't access it.
+ */
+}
+
+/*
+ * Write out the dnode's dirty buffers.
+ *
+ * NOTE: The dnode is kept in memory by being dirty. Once the
+ * dirty bit is cleared, it may be evicted. Beware of this!
+ */
+void
+dnode_sync(dnode_t *dn, dmu_tx_t *tx)
+{
+ free_range_t *rp;
+ dnode_phys_t *dnp = dn->dn_phys;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ list_t *list = &dn->dn_dirty_records[txgoff];
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
+ DNODE_VERIFY(dn);
+
+ ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
+
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_allocated_txg == tx->tx_txg) {
+ /* The dnode is newly allocated or reallocated */
+ if (dnp->dn_type == DMU_OT_NONE) {
+ /* this is a first alloc, not a realloc */
+ /* XXX shouldn't the phys already be zeroed? */
+ bzero(dnp, DNODE_CORE_SIZE);
+ dnp->dn_nlevels = 1;
+ }
+
+ if (dn->dn_nblkptr > dnp->dn_nblkptr) {
+ /* zero the new blkptrs we are gaining */
+ bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
+ sizeof (blkptr_t) *
+ (dn->dn_nblkptr - dnp->dn_nblkptr));
+ }
+ dnp->dn_type = dn->dn_type;
+ dnp->dn_bonustype = dn->dn_bonustype;
+ dnp->dn_bonuslen = dn->dn_bonuslen;
+ dnp->dn_nblkptr = dn->dn_nblkptr;
+ }
+
+ ASSERT(dnp->dn_nlevels > 1 ||
+ BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
+ dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+
+ if (dn->dn_next_blksz[txgoff]) {
+ ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
+ SPA_MINBLOCKSIZE) == 0);
+ ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ list_head(list) != NULL ||
+ dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
+ dnp->dn_datablkszsec);
+ dnp->dn_datablkszsec =
+ dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
+ dn->dn_next_blksz[txgoff] = 0;
+ }
+
+ if (dn->dn_next_indblkshift[txgoff]) {
+ ASSERT(dnp->dn_nlevels == 1);
+ dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
+ dn->dn_next_indblkshift[txgoff] = 0;
+ }
+
+ /*
+ * Just take the live (open-context) values for checksum and compress.
+ * Strictly speaking it's a future leak, but nothing bad happens if we
+ * start using the new checksum or compress algorithm a little early.
+ */
+ dnp->dn_checksum = dn->dn_checksum;
+ dnp->dn_compress = dn->dn_compress;
+
+ mutex_exit(&dn->dn_mtx);
+
+ /* process all the "freed" ranges in the file */
+ if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) {
+ for (rp = avl_last(&dn->dn_ranges[txgoff]); rp != NULL;
+ rp = AVL_PREV(&dn->dn_ranges[txgoff], rp))
+ dnode_sync_free_range(dn,
+ rp->fr_blkid, rp->fr_nblks, tx);
+ }
+ mutex_enter(&dn->dn_mtx);
+ for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) {
+ free_range_t *last = rp;
+ rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp);
+ avl_remove(&dn->dn_ranges[txgoff], last);
+ kmem_free(last, sizeof (free_range_t));
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
+ dnode_sync_free(dn, tx);
+ return;
+ }
+
+ if (dn->dn_next_nlevels[txgoff]) {
+ dnode_increase_indirection(dn, tx);
+ dn->dn_next_nlevels[txgoff] = 0;
+ }
+
+ dbuf_sync_list(list, tx);
+
+ if (dn->dn_object != DMU_META_DNODE_OBJECT) {
+ ASSERT3P(list_head(list), ==, NULL);
+ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+ }
+
+ /*
+ * Although we have dropped our reference to the dnode, it
+	 * can't be evicted until it's written, and we haven't yet
+ * initiated the IO for the dnode's dbuf.
+ */
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
new file mode 100644
index 0000000..a9707a0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
@@ -0,0 +1,1889 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/unique.h>
+#include <sys/zfs_context.h>
+
+static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
+static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
+static dsl_checkfunc_t dsl_dataset_rollback_check;
+static dsl_syncfunc_t dsl_dataset_rollback_sync;
+static dsl_checkfunc_t dsl_dataset_destroy_check;
+static dsl_syncfunc_t dsl_dataset_destroy_sync;
+
+#define DS_REF_MAX (1ULL << 62)
+
+#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
+
+/*
+ * We use weighted reference counts to express the various forms of exclusion
+ * between different open modes. A STANDARD open is 1 point, an EXCLUSIVE open
+ * is DS_REF_MAX, and a PRIMARY open is a little more than half of an
+ * EXCLUSIVE open.
+ * This makes the exclusion logic simple: the total refcnt for all opens cannot
+ * exceed DS_REF_MAX. For example, EXCLUSIVE opens are exclusive because their
+ * weight (DS_REF_MAX) consumes the entire refcnt space. PRIMARY opens consume
+ * just over half of the refcnt space, so there can't be more than one, but it
+ * can peacefully coexist with any number of STANDARD opens.
+ */
+static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
+ 0, /* DS_MODE_NONE - invalid */
+ 1, /* DS_MODE_STANDARD - unlimited number */
+ (DS_REF_MAX >> 1) + 1, /* DS_MODE_PRIMARY - only one of these */
+ DS_REF_MAX /* DS_MODE_EXCLUSIVE - no other opens */
+};
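+
+/*
+ * For example, with the weights above: a single EXCLUSIVE open consumes
+ * the entire 2^62 budget, so nothing can coexist with it; two PRIMARY
+ * opens would need 2 * ((2^61) + 1) > 2^62, so at most one can exist,
+ * but one PRIMARY plus up to (2^61) - 1 STANDARD opens still fits.
+ */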
+
+
+void
+dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+ int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
+
+ dprintf_bp(bp, "born, ds=%p\n", ds);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* It could have been compressed away to nothing */
+ if (BP_IS_HOLE(bp))
+ return;
+ ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
+ ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
+ if (ds == NULL) {
+ /*
+ * Account for the meta-objset space in its placeholder
+ * dsl_dir.
+ */
+ ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
+ dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+ used, compressed, uncompressed, tx);
+ dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+ return;
+ }
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ mutex_enter(&ds->ds_lock);
+ ds->ds_phys->ds_used_bytes += used;
+ ds->ds_phys->ds_compressed_bytes += compressed;
+ ds->ds_phys->ds_uncompressed_bytes += uncompressed;
+ ds->ds_phys->ds_unique_bytes += used;
+ mutex_exit(&ds->ds_lock);
+ dsl_dir_diduse_space(ds->ds_dir,
+ used, compressed, uncompressed, tx);
+}
+
+void
+dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+ dmu_tx_t *tx)
+{
+ int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* No block pointer => nothing to free */
+ if (BP_IS_HOLE(bp))
+ return;
+
+ ASSERT(used > 0);
+ if (ds == NULL) {
+ int err;
+ /*
+ * Account for the meta-objset space in its placeholder
+ * dataset.
+ */
+ err = arc_free(pio, tx->tx_pool->dp_spa,
+ tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
+ ASSERT(err == 0);
+
+ dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+ -used, -compressed, -uncompressed, tx);
+ dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+ return;
+ }
+ ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
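+	/*
+	 * A block born after the most recent snapshot is unique to this
+	 * dataset and can be freed now; an older block is still visible
+	 * to a snapshot, so it goes on the deadlist instead.
+	 */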
+ if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
+ int err;
+
+ dprintf_bp(bp, "freeing: %s", "");
+ err = arc_free(pio, tx->tx_pool->dp_spa,
+ tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
+ ASSERT(err == 0);
+
+ mutex_enter(&ds->ds_lock);
+ /* XXX unique_bytes is not accurate for head datasets */
+ /* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */
+ ds->ds_phys->ds_unique_bytes -= used;
+ mutex_exit(&ds->ds_lock);
+ dsl_dir_diduse_space(ds->ds_dir,
+ -used, -compressed, -uncompressed, tx);
+ } else {
+ dprintf_bp(bp, "putting on dead list: %s", "");
+ VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
+ /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
+ if (ds->ds_phys->ds_prev_snap_obj != 0) {
+ ASSERT3U(ds->ds_prev->ds_object, ==,
+ ds->ds_phys->ds_prev_snap_obj);
+ ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
+ if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
+ ds->ds_object && bp->blk_birth >
+ ds->ds_prev->ds_phys->ds_prev_snap_txg) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ mutex_enter(&ds->ds_prev->ds_lock);
+ ds->ds_prev->ds_phys->ds_unique_bytes +=
+ used;
+ mutex_exit(&ds->ds_prev->ds_lock);
+ }
+ }
+ }
+ mutex_enter(&ds->ds_lock);
+ ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
+ ds->ds_phys->ds_used_bytes -= used;
+ ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
+ ds->ds_phys->ds_compressed_bytes -= compressed;
+ ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
+ ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
+ mutex_exit(&ds->ds_lock);
+}
+
+uint64_t
+dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
+{
+ uint64_t trysnap = 0;
+
+ if (ds == NULL)
+ return (0);
+ /*
+ * The snapshot creation could fail, but that would cause an
+ * incorrect FALSE return, which would only result in an
+ * overestimation of the amount of space that an operation would
+ * consume, which is OK.
+ *
+ * There's also a small window where we could miss a pending
+ * snapshot, because we could set the sync task in the quiescing
+ * phase. So this should only be used as a guess.
+ */
+ if (ds->ds_trysnap_txg >
+ spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
+ trysnap = ds->ds_trysnap_txg;
+ return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
+}
+
+int
+dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
+{
+ return (blk_birth > dsl_dataset_prev_snap_txg(ds));
+}
+
+/* ARGSUSED */
+static void
+dsl_dataset_evict(dmu_buf_t *db, void *dsv)
+{
+ dsl_dataset_t *ds = dsv;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ /* open_refcount == DS_REF_MAX when deleting */
+ ASSERT(ds->ds_open_refcount == 0 ||
+ ds->ds_open_refcount == DS_REF_MAX);
+
+ dprintf_ds(ds, "evicting %s\n", "");
+
+ unique_remove(ds->ds_phys->ds_fsid_guid);
+
+ if (ds->ds_user_ptr != NULL)
+ ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+
+ if (ds->ds_prev) {
+ dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
+ ds->ds_prev = NULL;
+ }
+
+ bplist_close(&ds->ds_deadlist);
+ dsl_dir_close(ds->ds_dir, ds);
+
+ if (list_link_active(&ds->ds_synced_link))
+ list_remove(&dp->dp_synced_objsets, ds);
+
+ mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_deadlist.bpl_lock);
+
+ kmem_free(ds, sizeof (dsl_dataset_t));
+}
+
+static int
+dsl_dataset_get_snapname(dsl_dataset_t *ds)
+{
+ dsl_dataset_phys_t *headphys;
+ int err;
+ dmu_buf_t *headdbuf;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+
+ if (ds->ds_snapname[0])
+ return (0);
+ if (ds->ds_phys->ds_next_snap_obj == 0)
+ return (0);
+
+ err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
+ FTAG, &headdbuf);
+ if (err)
+ return (err);
+ headphys = headdbuf->db_data;
+ err = zap_value_search(dp->dp_meta_objset,
+ headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname);
+ dmu_buf_rele(headdbuf, FTAG);
+ return (err);
+}
+
+int
+dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
+ int mode, void *tag, dsl_dataset_t **dsp)
+{
+ uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
+ objset_t *mos = dp->dp_meta_objset;
+ dmu_buf_t *dbuf;
+ dsl_dataset_t *ds;
+ int err;
+
+ ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
+ dsl_pool_sync_context(dp));
+
+ err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
+ if (err)
+ return (err);
+ ds = dmu_buf_get_user(dbuf);
+ if (ds == NULL) {
+ dsl_dataset_t *winner;
+
+ ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
+ ds->ds_dbuf = dbuf;
+ ds->ds_object = dsobj;
+ ds->ds_phys = dbuf->db_data;
+
+ mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
+ NULL);
+
+ err = bplist_open(&ds->ds_deadlist,
+ mos, ds->ds_phys->ds_deadlist_obj);
+ if (err == 0) {
+ err = dsl_dir_open_obj(dp,
+ ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
+ }
+ if (err) {
+ /*
+			 * we don't really need to close the bplist if we
+ * just opened it.
+ */
+ mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_deadlist.bpl_lock);
+ kmem_free(ds, sizeof (dsl_dataset_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+
+ if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
+ ds->ds_snapname[0] = '\0';
+ if (ds->ds_phys->ds_prev_snap_obj) {
+ err = dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_NONE, ds, &ds->ds_prev);
+ }
+ } else {
+ if (snapname) {
+#ifdef ZFS_DEBUG
+ dsl_dataset_phys_t *headphys;
+ dmu_buf_t *headdbuf;
+ err = dmu_bonus_hold(mos,
+ ds->ds_dir->dd_phys->dd_head_dataset_obj,
+ FTAG, &headdbuf);
+ if (err == 0) {
+ headphys = headdbuf->db_data;
+ uint64_t foundobj;
+ err = zap_lookup(dp->dp_meta_objset,
+ headphys->ds_snapnames_zapobj,
+ snapname, sizeof (foundobj), 1,
+ &foundobj);
+ ASSERT3U(foundobj, ==, dsobj);
+ dmu_buf_rele(headdbuf, FTAG);
+ }
+#endif
+ (void) strcat(ds->ds_snapname, snapname);
+ } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
+ err = dsl_dataset_get_snapname(ds);
+ }
+ }
+
+ if (err == 0) {
+ winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
+ dsl_dataset_evict);
+ }
+ if (err || winner) {
+ bplist_close(&ds->ds_deadlist);
+ if (ds->ds_prev) {
+ dsl_dataset_close(ds->ds_prev,
+ DS_MODE_NONE, ds);
+ }
+ dsl_dir_close(ds->ds_dir, ds);
+ mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_deadlist.bpl_lock);
+ kmem_free(ds, sizeof (dsl_dataset_t));
+ if (err) {
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+ ds = winner;
+ } else {
+ uint64_t new =
+ unique_insert(ds->ds_phys->ds_fsid_guid);
+ if (new != ds->ds_phys->ds_fsid_guid) {
+ /* XXX it won't necessarily be synced... */
+ ds->ds_phys->ds_fsid_guid = new;
+ }
+ }
+ }
+ ASSERT3P(ds->ds_dbuf, ==, dbuf);
+ ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
+
+ mutex_enter(&ds->ds_lock);
+ if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
+ (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) &&
+ !DS_MODE_IS_INCONSISTENT(mode)) ||
+ (ds->ds_open_refcount + weight > DS_REF_MAX)) {
+ mutex_exit(&ds->ds_lock);
+ dsl_dataset_close(ds, DS_MODE_NONE, tag);
+ return (EBUSY);
+ }
+ ds->ds_open_refcount += weight;
+ mutex_exit(&ds->ds_lock);
+
+ *dsp = ds;
+ return (0);
+}
+
+int
+dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
+ void *tag, dsl_dataset_t **dsp)
+{
+ dsl_dir_t *dd;
+ dsl_pool_t *dp;
+ const char *tail;
+ uint64_t obj;
+ dsl_dataset_t *ds = NULL;
+ int err = 0;
+
+ err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail);
+ if (err)
+ return (err);
+
+ dp = dd->dd_pool;
+ obj = dd->dd_phys->dd_head_dataset_obj;
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ if (obj == 0) {
+ /* A dataset with no associated objset */
+ err = ENOENT;
+ goto out;
+ }
+
+ if (tail != NULL) {
+ objset_t *mos = dp->dp_meta_objset;
+
+ err = dsl_dataset_open_obj(dp, obj, NULL,
+ DS_MODE_NONE, tag, &ds);
+ if (err)
+ goto out;
+ obj = ds->ds_phys->ds_snapnames_zapobj;
+ dsl_dataset_close(ds, DS_MODE_NONE, tag);
+ ds = NULL;
+
+ if (tail[0] != '@') {
+ err = ENOENT;
+ goto out;
+ }
+ tail++;
+
+ /* Look for a snapshot */
+ if (!DS_MODE_IS_READONLY(mode)) {
+ err = EROFS;
+ goto out;
+ }
+ dprintf("looking for snapshot '%s'\n", tail);
+ err = zap_lookup(mos, obj, tail, 8, 1, &obj);
+ if (err)
+ goto out;
+ }
+ err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds);
+
+out:
+ rw_exit(&dp->dp_config_rwlock);
+ dsl_dir_close(dd, FTAG);
+
+ ASSERT3U((err == 0), ==, (ds != NULL));
+ /* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */
+
+ *dsp = ds;
+ return (err);
+}
+
+int
+dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp));
+}
+
+void
+dsl_dataset_name(dsl_dataset_t *ds, char *name)
+{
+ if (ds == NULL) {
+ (void) strcpy(name, "mos");
+ } else {
+ dsl_dir_name(ds->ds_dir, name);
+ VERIFY(0 == dsl_dataset_get_snapname(ds));
+ if (ds->ds_snapname[0]) {
+ (void) strcat(name, "@");
+ if (!MUTEX_HELD(&ds->ds_lock)) {
+ /*
+ * We use a "recursive" mutex so that we
+ * can call dprintf_ds() with ds_lock held.
+ */
+ mutex_enter(&ds->ds_lock);
+ (void) strcat(name, ds->ds_snapname);
+ mutex_exit(&ds->ds_lock);
+ } else {
+ (void) strcat(name, ds->ds_snapname);
+ }
+ }
+ }
+}
+
+void
+dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
+{
+ uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
+ mutex_enter(&ds->ds_lock);
+ ASSERT3U(ds->ds_open_refcount, >=, weight);
+ ds->ds_open_refcount -= weight;
+ dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n",
+ mode, ds->ds_open_refcount);
+ mutex_exit(&ds->ds_lock);
+
+ dmu_buf_rele(ds->ds_dbuf, tag);
+}
+
+void
+dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
+{
+ objset_t *mos = dp->dp_meta_objset;
+ dmu_buf_t *dbuf;
+ dsl_dataset_phys_t *dsphys;
+ dsl_dataset_t *ds;
+ uint64_t dsobj;
+ dsl_dir_t *dd;
+
+ dsl_dir_create_root(mos, ddobjp, tx);
+ VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd));
+
+ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+ DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+ VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+ dsphys->ds_dir_obj = dd->dd_object;
+ dsphys->ds_fsid_guid = unique_create();
+ unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
+ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+ sizeof (dsphys->ds_guid));
+ dsphys->ds_snapnames_zapobj =
+ zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
+ dsphys->ds_creation_time = gethrestime_sec();
+ dsphys->ds_creation_txg = tx->tx_txg;
+ dsphys->ds_deadlist_obj =
+ bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+ dmu_buf_rele(dbuf, FTAG);
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dd->dd_phys->dd_head_dataset_obj = dsobj;
+ dsl_dir_close(dd, FTAG);
+
+ VERIFY(0 ==
+ dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds));
+ (void) dmu_objset_create_impl(dp->dp_spa, ds,
+ &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+}
+
+uint64_t
+dsl_dataset_create_sync(dsl_dir_t *pdd,
+ const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = pdd->dd_pool;
+ dmu_buf_t *dbuf;
+ dsl_dataset_phys_t *dsphys;
+ uint64_t dsobj, ddobj;
+ objset_t *mos = dp->dp_meta_objset;
+ dsl_dir_t *dd;
+
+ ASSERT(clone_parent == NULL || clone_parent->ds_dir->dd_pool == dp);
+ ASSERT(clone_parent == NULL ||
+ clone_parent->ds_phys->ds_num_children > 0);
+ ASSERT(lastname[0] != '@');
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ ddobj = dsl_dir_create_sync(pdd, lastname, tx);
+ VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
+
+ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+ DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+ VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+ dsphys->ds_dir_obj = dd->dd_object;
+ dsphys->ds_fsid_guid = unique_create();
+ unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
+ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+ sizeof (dsphys->ds_guid));
+ dsphys->ds_snapnames_zapobj =
+ zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
+ dsphys->ds_creation_time = gethrestime_sec();
+ dsphys->ds_creation_txg = tx->tx_txg;
+ dsphys->ds_deadlist_obj =
+ bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+ if (clone_parent) {
+ dsphys->ds_prev_snap_obj = clone_parent->ds_object;
+ dsphys->ds_prev_snap_txg =
+ clone_parent->ds_phys->ds_creation_txg;
+ dsphys->ds_used_bytes =
+ clone_parent->ds_phys->ds_used_bytes;
+ dsphys->ds_compressed_bytes =
+ clone_parent->ds_phys->ds_compressed_bytes;
+ dsphys->ds_uncompressed_bytes =
+ clone_parent->ds_phys->ds_uncompressed_bytes;
+ dsphys->ds_bp = clone_parent->ds_phys->ds_bp;
+
+ dmu_buf_will_dirty(clone_parent->ds_dbuf, tx);
+ clone_parent->ds_phys->ds_num_children++;
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
+ }
+ dmu_buf_rele(dbuf, FTAG);
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dd->dd_phys->dd_head_dataset_obj = dsobj;
+ dsl_dir_close(dd, FTAG);
+
+ return (dsobj);
+}
+
+struct destroyarg {
+ dsl_sync_task_group_t *dstg;
+ char *snapname;
+ void *tag;
+ char *failed;
+};
+
+static int
+dsl_snapshot_destroy_one(char *name, void *arg)
+{
+ struct destroyarg *da = arg;
+ dsl_dataset_t *ds;
+ char *cp;
+ int err;
+
+ (void) strcat(name, "@");
+ (void) strcat(name, da->snapname);
+ err = dsl_dataset_open(name,
+ DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
+ da->tag, &ds);
+ cp = strchr(name, '@');
+ *cp = '\0';
+ if (err == ENOENT)
+ return (0);
+ if (err) {
+ (void) strcpy(da->failed, name);
+ return (err);
+ }
+
+ dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
+ dsl_dataset_destroy_sync, ds, da->tag, 0);
+ return (0);
+}
+
+/*
+ * Destroy 'snapname' in all descendants of 'fsname'.
+ */
+#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
+int
+dsl_snapshots_destroy(char *fsname, char *snapname)
+{
+ int err;
+ struct destroyarg da;
+ dsl_sync_task_t *dst;
+ spa_t *spa;
+ char *cp;
+
+ cp = strchr(fsname, '/');
+ if (cp) {
+ *cp = '\0';
+ err = spa_open(fsname, &spa, FTAG);
+ *cp = '/';
+ } else {
+ err = spa_open(fsname, &spa, FTAG);
+ }
+ if (err)
+ return (err);
+ da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+ da.snapname = snapname;
+ da.tag = FTAG;
+ da.failed = fsname;
+
+ err = dmu_objset_find(fsname,
+ dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);
+
+ if (err == 0)
+ err = dsl_sync_task_group_wait(da.dstg);
+
+ for (dst = list_head(&da.dstg->dstg_tasks); dst;
+ dst = list_next(&da.dstg->dstg_tasks, dst)) {
+ dsl_dataset_t *ds = dst->dst_arg1;
+ if (dst->dst_err) {
+ dsl_dataset_name(ds, fsname);
+ cp = strchr(fsname, '@');
+ *cp = '\0';
+ }
+ /*
+ * If it was successful, destroy_sync would have
+ * closed the ds
+ */
+ if (err)
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ }
+
+ dsl_sync_task_group_destroy(da.dstg);
+ spa_close(spa, FTAG);
+ return (err);
+}
+
+int
+dsl_dataset_destroy(const char *name)
+{
+ int err;
+ dsl_sync_task_group_t *dstg;
+ objset_t *os;
+ dsl_dataset_t *ds;
+ dsl_dir_t *dd;
+ uint64_t obj;
+
+ if (strchr(name, '@')) {
+ /* Destroying a snapshot is simpler */
+ err = dsl_dataset_open(name,
+ DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
+ FTAG, &ds);
+ if (err)
+ return (err);
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
+ ds, FTAG, 0);
+ if (err)
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ return (err);
+ }
+
+ err = dmu_objset_open(name, DMU_OST_ANY,
+ DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
+ if (err)
+ return (err);
+ ds = os->os->os_dsl_dataset;
+ dd = ds->ds_dir;
+
+ /*
+ * Check for errors and mark this ds as inconsistent, in
+ * case we crash while freeing the objects.
+ */
+ err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
+ dsl_dataset_destroy_begin_sync, ds, NULL, 0);
+ if (err) {
+ dmu_objset_close(os);
+ return (err);
+ }
+
+ /*
+ * remove the objects in open context, so that we won't
+ * have too much to do in syncing context.
+ */
+ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
+ ds->ds_phys->ds_prev_snap_txg)) {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END);
+ dmu_tx_hold_bonus(tx, obj);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ /*
+ * Perhaps there is not enough disk
+ * space. Just deal with it from
+ * dsl_dataset_destroy_sync().
+ */
+ dmu_tx_abort(tx);
+ continue;
+ }
+ VERIFY(0 == dmu_object_free(os, obj, tx));
+ dmu_tx_commit(tx);
+ }
+ /* Make sure it's not dirty before we finish destroying it. */
+ txg_wait_synced(dd->dd_pool, 0);
+
+ dmu_objset_close(os);
+ if (err != ESRCH)
+ return (err);
+
+ err = dsl_dataset_open(name,
+ DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
+ FTAG, &ds);
+ if (err)
+ return (err);
+
+ err = dsl_dir_open(name, FTAG, &dd, NULL);
+ if (err) {
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ return (err);
+ }
+
+ /*
+ * Blow away the dsl_dir + head dataset.
+ */
+ dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
+ dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
+ dsl_dataset_destroy_sync, ds, FTAG, 0);
+ dsl_sync_task_create(dstg, dsl_dir_destroy_check,
+ dsl_dir_destroy_sync, dd, FTAG, 0);
+ err = dsl_sync_task_group_wait(dstg);
+ dsl_sync_task_group_destroy(dstg);
+ /* if it is successful, *destroy_sync will close the ds+dd */
+ if (err) {
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ dsl_dir_close(dd, FTAG);
+ }
+ return (err);
+}
+
+int
+dsl_dataset_rollback(dsl_dataset_t *ds)
+{
+ ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
+ return (dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
+ ds, NULL, 0));
+}
+
+void *
+dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
+ void *p, dsl_dataset_evict_func_t func)
+{
+ void *old;
+
+ mutex_enter(&ds->ds_lock);
+ old = ds->ds_user_ptr;
+ if (old == NULL) {
+ ds->ds_user_ptr = p;
+ ds->ds_user_evict_func = func;
+ }
+ mutex_exit(&ds->ds_lock);
+ return (old);
+}
+
+void *
+dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
+{
+ return (ds->ds_user_ptr);
+}
+
+
+blkptr_t *
+dsl_dataset_get_blkptr(dsl_dataset_t *ds)
+{
+ return (&ds->ds_phys->ds_bp);
+}
+
+void
+dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* If it's the meta-objset, set dp_meta_rootbp */
+ if (ds == NULL) {
+ tx->tx_pool->dp_meta_rootbp = *bp;
+ } else {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_bp = *bp;
+ }
+}
+
+spa_t *
+dsl_dataset_get_spa(dsl_dataset_t *ds)
+{
+ return (ds->ds_dir->dd_pool->dp_spa);
+}
+
+void
+dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp;
+
+ if (ds == NULL) /* this is the meta-objset */
+ return;
+
+ ASSERT(ds->ds_user_ptr != NULL);
+
+ if (ds->ds_phys->ds_next_snap_obj != 0)
+ panic("dirtying snapshot!");
+
+ dp = ds->ds_dir->dd_pool;
+
+ if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
+ /* up the hold count until we can be written out */
+ dmu_buf_add_ref(ds->ds_dbuf, ds);
+ }
+}
+
+struct killarg {
+ uint64_t *usedp;
+ uint64_t *compressedp;
+ uint64_t *uncompressedp;
+ zio_t *zio;
+ dmu_tx_t *tx;
+};
+
+static int
+kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+ struct killarg *ka = arg;
+ blkptr_t *bp = &bc->bc_blkptr;
+
+ ASSERT3U(bc->bc_errno, ==, 0);
+
+ /*
+ * Since this callback is not called concurrently, no lock is
+ * needed on the accounting values.
+ */
+ *ka->usedp += bp_get_dasize(spa, bp);
+ *ka->compressedp += BP_GET_PSIZE(bp);
+ *ka->uncompressedp += BP_GET_UCSIZE(bp);
+ /* XXX check for EIO? */
+ (void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL,
+ ARC_NOWAIT);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+
+ /*
+ * There must be a previous snapshot. I suppose we could roll
+ * it back to being empty (and re-initialize the upper (ZPL)
+ * layer). But for now there's no way to do this via the user
+ * interface.
+ */
+ if (ds->ds_phys->ds_prev_snap_txg == 0)
+ return (EINVAL);
+
+ /*
+ * This must not be a snapshot.
+ */
+ if (ds->ds_phys->ds_next_snap_obj != 0)
+ return (EINVAL);
+
+ /*
+ * If we made changes this txg, traverse_dsl_dataset won't find
+ * them. Try again.
+ */
+ if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
+ return (EAGAIN);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+ /* Zero out the deadlist. */
+ bplist_close(&ds->ds_deadlist);
+ bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
+ ds->ds_phys->ds_deadlist_obj =
+ bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+ VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
+ ds->ds_phys->ds_deadlist_obj));
+
+ {
+ /* Free blkptrs that we gave birth to */
+ zio_t *zio;
+ uint64_t used = 0, compressed = 0, uncompressed = 0;
+ struct killarg ka;
+
+ zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
+ ZIO_FLAG_MUSTSUCCEED);
+ ka.usedp = &used;
+ ka.compressedp = &compressed;
+ ka.uncompressedp = &uncompressed;
+ ka.zio = zio;
+ ka.tx = tx;
+ (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
+ ADVANCE_POST, kill_blkptr, &ka);
+ (void) zio_wait(zio);
+
+ dsl_dir_diduse_space(ds->ds_dir,
+ -used, -compressed, -uncompressed, tx);
+ }
+
+ /* Change our contents to that of the prev snapshot */
+ ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj);
+ ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
+ ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes;
+ ds->ds_phys->ds_compressed_bytes =
+ ds->ds_prev->ds_phys->ds_compressed_bytes;
+ ds->ds_phys->ds_uncompressed_bytes =
+ ds->ds_prev->ds_phys->ds_uncompressed_bytes;
+ ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
+ ds->ds_phys->ds_unique_bytes = 0;
+
+ if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ ds->ds_prev->ds_phys->ds_unique_bytes = 0;
+ }
+}
+
+/* ARGSUSED */
+static int
+dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+
+ /*
+ * Can't delete a head dataset if there are snapshots of it.
+ * (Except if the only snapshots are from the branch we cloned
+ * from.)
+ */
+ if (ds->ds_prev != NULL &&
+ ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
+ return (EINVAL);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+
+ /* Mark it as inconsistent on-disk, in case we crash */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+}
+
+/* ARGSUSED */
+static int
+dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+
+ /* Can't delete a branch point. */
+ if (ds->ds_phys->ds_num_children > 1)
+ return (EEXIST);
+
+ /*
+ * Can't delete a head dataset if there are snapshots of it.
+ * (Except if the only snapshots are from the branch we cloned
+ * from.)
+ */
+ if (ds->ds_prev != NULL &&
+ ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
+ return (EINVAL);
+
+ /*
+ * If we made changes this txg, traverse_dsl_dataset won't find
+ * them. Try again.
+ */
+ if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
+ return (EAGAIN);
+
+ /* XXX we should do some i/o error checking... */
+ return (0);
+}
+
+static void
+dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ uint64_t used = 0, compressed = 0, uncompressed = 0;
+ zio_t *zio;
+ int err;
+ int after_branch_point = FALSE;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ dsl_dataset_t *ds_prev = NULL;
+ uint64_t obj;
+
+ ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
+ ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+ ASSERT(ds->ds_prev == NULL ||
+ ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
+ ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+
+ ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ obj = ds->ds_object;
+
+ if (ds->ds_phys->ds_prev_snap_obj != 0) {
+ if (ds->ds_prev) {
+ ds_prev = ds->ds_prev;
+ } else {
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_prev));
+ }
+ after_branch_point =
+ (ds_prev->ds_phys->ds_next_snap_obj != obj);
+
+ dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
+ if (after_branch_point &&
+ ds->ds_phys->ds_next_snap_obj == 0) {
+ /* This clone is toast. */
+ ASSERT(ds_prev->ds_phys->ds_num_children > 1);
+ ds_prev->ds_phys->ds_num_children--;
+ } else if (!after_branch_point) {
+ ds_prev->ds_phys->ds_next_snap_obj =
+ ds->ds_phys->ds_next_snap_obj;
+ }
+ }
+
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+
+ if (ds->ds_phys->ds_next_snap_obj != 0) {
+ blkptr_t bp;
+ dsl_dataset_t *ds_next;
+ uint64_t itor = 0;
+
+ spa_scrub_restart(dp->dp_spa, tx->tx_txg);
+
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_next_snap_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_next));
+ ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
+
+ dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
+ ds_next->ds_phys->ds_prev_snap_obj =
+ ds->ds_phys->ds_prev_snap_obj;
+ ds_next->ds_phys->ds_prev_snap_txg =
+ ds->ds_phys->ds_prev_snap_txg;
+ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
+ ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
+
+ /*
+ * Transfer to our deadlist (which will become next's
+ * new deadlist) any entries from next's current
+ * deadlist which were born before prev, and free the
+ * other entries.
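+		 * For example, a block born before prev's txg is still
+		 * referenced by prev, so it stays dead-listed; a block
+		 * born between prev and this snapshot existed only here,
+		 * so it can be freed outright.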
+ *
+ * XXX we're doing this long task with the config lock held
+ */
+ while (bplist_iterate(&ds_next->ds_deadlist, &itor,
+ &bp) == 0) {
+ if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
+ VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
+ &bp, tx));
+ if (ds_prev && !after_branch_point &&
+ bp.blk_birth >
+ ds_prev->ds_phys->ds_prev_snap_txg) {
+ ds_prev->ds_phys->ds_unique_bytes +=
+ bp_get_dasize(dp->dp_spa, &bp);
+ }
+ } else {
+ used += bp_get_dasize(dp->dp_spa, &bp);
+ compressed += BP_GET_PSIZE(&bp);
+ uncompressed += BP_GET_UCSIZE(&bp);
+ /* XXX check return value? */
+ (void) arc_free(zio, dp->dp_spa, tx->tx_txg,
+ &bp, NULL, NULL, ARC_NOWAIT);
+ }
+ }
+
+ /* free next's deadlist */
+ bplist_close(&ds_next->ds_deadlist);
+ bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
+
+ /* set next's deadlist to our deadlist */
+ ds_next->ds_phys->ds_deadlist_obj =
+ ds->ds_phys->ds_deadlist_obj;
+ VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
+ ds_next->ds_phys->ds_deadlist_obj));
+ ds->ds_phys->ds_deadlist_obj = 0;
+
+ if (ds_next->ds_phys->ds_next_snap_obj != 0) {
+ /*
+ * Update next's unique to include blocks which
+ * were previously shared by only this snapshot
+ * and it. Those blocks will be born after the
+ * prev snap and before this snap, and will have
+ * died after the next snap and before the one
+			 * after that (i.e. be on the snap-after-next's
+			 * deadlist).
+ *
+ * XXX we're doing this long task with the
+ * config lock held
+ */
+ dsl_dataset_t *ds_after_next;
+
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds_next->ds_phys->ds_next_snap_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_after_next));
+ itor = 0;
+ while (bplist_iterate(&ds_after_next->ds_deadlist,
+ &itor, &bp) == 0) {
+ if (bp.blk_birth >
+ ds->ds_phys->ds_prev_snap_txg &&
+ bp.blk_birth <=
+ ds->ds_phys->ds_creation_txg) {
+ ds_next->ds_phys->ds_unique_bytes +=
+ bp_get_dasize(dp->dp_spa, &bp);
+ }
+ }
+
+ dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
+ ASSERT3P(ds_next->ds_prev, ==, NULL);
+ } else {
+ /*
+ * It would be nice to update the head dataset's
+ * unique. To do so we would have to traverse
+ * it for blocks born after ds_prev, which is
+ * pretty expensive just to maintain something
+ * for debugging purposes.
+ */
+ ASSERT3P(ds_next->ds_prev, ==, ds);
+ dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
+ ds_next);
+ if (ds_prev) {
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_NONE, ds_next, &ds_next->ds_prev));
+ } else {
+ ds_next->ds_prev = NULL;
+ }
+ }
+ dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
+
+ /*
+ * NB: unique_bytes is not accurate for head objsets
+ * because we don't update it when we delete the most
+ * recent snapshot -- see above comment.
+ */
+ ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
+ } else {
+ /*
+ * There's no next snapshot, so this is a head dataset.
+ * Destroy the deadlist. Unless it's a clone, the
+ * deadlist should be empty. (If it's a clone, it's
+ * safe to ignore the deadlist contents.)
+ */
+ struct killarg ka;
+
+ ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
+ bplist_close(&ds->ds_deadlist);
+ bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
+ ds->ds_phys->ds_deadlist_obj = 0;
+
+ /*
+ * Free everything that we point to (that's born after
+ * the previous snapshot, if we are a clone)
+ *
+ * XXX we're doing this long task with the config lock held
+ */
+ ka.usedp = &used;
+ ka.compressedp = &compressed;
+ ka.uncompressedp = &uncompressed;
+ ka.zio = zio;
+ ka.tx = tx;
+ err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
+ ADVANCE_POST, kill_blkptr, &ka);
+ ASSERT3U(err, ==, 0);
+ }
+
+ err = zio_wait(zio);
+ ASSERT3U(err, ==, 0);
+
+ dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx);
+
+ if (ds->ds_phys->ds_snapnames_zapobj) {
+ err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
+ ASSERT(err == 0);
+ }
+
+ if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
+ /* Erase the link in the dataset */
+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+ ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
+ /*
+		 * dsl_dir_sync_destroy() called us; it will destroy
+ * the dataset.
+ */
+ } else {
+ /* remove from snapshot namespace */
+ dsl_dataset_t *ds_head;
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_dir->dd_phys->dd_head_dataset_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_head));
+ VERIFY(0 == dsl_dataset_get_snapname(ds));
+#ifdef ZFS_DEBUG
+ {
+ uint64_t val;
+ err = zap_lookup(mos,
+ ds_head->ds_phys->ds_snapnames_zapobj,
+ ds->ds_snapname, 8, 1, &val);
+ ASSERT3U(err, ==, 0);
+ ASSERT3U(val, ==, obj);
+ }
+#endif
+ err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj,
+ ds->ds_snapname, tx);
+ ASSERT(err == 0);
+ dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG);
+ }
+
+ if (ds_prev && ds->ds_prev != ds_prev)
+ dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
+
+ spa_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag);
+ VERIFY(0 == dmu_object_free(mos, obj, tx));
+
+}
+
+/* ARGSUSED */
+int
+dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ objset_t *os = arg1;
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ const char *snapname = arg2;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ int err;
+ uint64_t value;
+
+ /*
+ * We don't allow multiple snapshots of the same txg. If there
+ * is already one, try again.
+ */
+ if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
+ return (EAGAIN);
+
+ /*
+	 * Check for a conflicting snapshot name.
+ */
+ err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
+ snapname, 8, 1, &value);
+ if (err == 0)
+ return (EEXIST);
+ if (err != ENOENT)
+ return (err);
+
+ ds->ds_trysnap_txg = tx->tx_txg;
+ return (0);
+}
+
+void
+dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ objset_t *os = arg1;
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ const char *snapname = arg2;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dmu_buf_t *dbuf;
+ dsl_dataset_phys_t *dsphys;
+ uint64_t dsobj;
+ objset_t *mos = dp->dp_meta_objset;
+ int err;
+
+ spa_scrub_restart(dp->dp_spa, tx->tx_txg);
+ ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+ DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+ VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+ dsphys->ds_dir_obj = ds->ds_dir->dd_object;
+ dsphys->ds_fsid_guid = unique_create();
+ unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
+ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+ sizeof (dsphys->ds_guid));
+ dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
+ dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
+ dsphys->ds_next_snap_obj = ds->ds_object;
+ dsphys->ds_num_children = 1;
+ dsphys->ds_creation_time = gethrestime_sec();
+ dsphys->ds_creation_txg = tx->tx_txg;
+ dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
+ dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
+ dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
+ dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
+ dsphys->ds_flags = ds->ds_phys->ds_flags;
+ dsphys->ds_bp = ds->ds_phys->ds_bp;
+ dmu_buf_rele(dbuf, FTAG);
+
+ ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
+ if (ds->ds_prev) {
+ ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
+ ds->ds_object ||
+ ds->ds_prev->ds_phys->ds_num_children > 1);
+ if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
+ ds->ds_prev->ds_phys->ds_creation_txg);
+ ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
+ }
+ }
+
+ bplist_close(&ds->ds_deadlist);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg);
+ ds->ds_phys->ds_prev_snap_obj = dsobj;
+ ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg;
+ ds->ds_phys->ds_unique_bytes = 0;
+ ds->ds_phys->ds_deadlist_obj =
+ bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+ VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
+ ds->ds_phys->ds_deadlist_obj));
+
+ dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
+ err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
+ snapname, 8, 1, &dsobj, tx);
+ ASSERT(err == 0);
+
+ if (ds->ds_prev)
+ dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, snapname,
+ DS_MODE_NONE, ds, &ds->ds_prev));
+}
+
+void
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(ds->ds_user_ptr != NULL);
+ ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
+
+ dsl_dir_dirty(ds->ds_dir, tx);
+ dmu_objset_sync(ds->ds_user_ptr, zio, tx);
+ /* Unneeded? bplist_close(&ds->ds_deadlist); */
+}
+
+void
+dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ dsl_dir_stats(ds->ds_dir, nv);
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
+ ds->ds_phys->ds_creation_time);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
+ ds->ds_phys->ds_creation_txg);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
+ ds->ds_phys->ds_used_bytes);
+
+ if (ds->ds_phys->ds_next_snap_obj) {
+ /*
+ * This is a snapshot; override the dd's space used with
+ * our unique space and compression ratio.
+ */
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
+ ds->ds_phys->ds_unique_bytes);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
+ ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
+ (ds->ds_phys->ds_uncompressed_bytes * 100 /
+ ds->ds_phys->ds_compressed_bytes));
+ }
+}
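+
+/*
+ * A worked example of the ratio above: compressratio is stored as a
+ * percentage, so a dataset whose 300MB of logical data compresses to
+ * 100MB on disk reports
+ *
+ *	ds_uncompressed_bytes * 100 / ds_compressed_bytes
+ *	    = (300 << 20) * 100 / (100 << 20) = 300
+ *
+ * i.e. a 3.00x ratio, while a dataset with no compressed bytes
+ * reports the neutral value 100.
+ */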
+
+void
+dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
+{
+ stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
+ stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
+ if (ds->ds_phys->ds_next_snap_obj) {
+ stat->dds_is_snapshot = B_TRUE;
+ stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
+ }
+
+ /* clone origin is really a dsl_dir thing... */
+ if (ds->ds_dir->dd_phys->dd_clone_parent_obj) {
+ dsl_dataset_t *ods;
+
+ rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
+ VERIFY(0 == dsl_dataset_open_obj(ds->ds_dir->dd_pool,
+ ds->ds_dir->dd_phys->dd_clone_parent_obj,
+ NULL, DS_MODE_NONE, FTAG, &ods));
+ dsl_dataset_name(ods, stat->dds_clone_of);
+ dsl_dataset_close(ods, DS_MODE_NONE, FTAG);
+ rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+ }
+}
+
+uint64_t
+dsl_dataset_fsid_guid(dsl_dataset_t *ds)
+{
+ return (ds->ds_phys->ds_fsid_guid);
+}
+
+void
+dsl_dataset_space(dsl_dataset_t *ds,
+ uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp)
+{
+ *refdbytesp = ds->ds_phys->ds_used_bytes;
+ *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
+ *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
+ *availobjsp = DN_MAX_OBJECT - *usedobjsp;
+}
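+
+/*
+ * Minimal usage sketch for the accessor above, assuming the caller
+ * already holds an open dataset (as e.g. a statvfs-style consumer
+ * would); all four out-parameters must be supplied:
+ *
+ *	uint64_t refd, avail, usedobjs, availobjs;
+ *
+ *	dsl_dataset_space(ds, &refd, &avail, &usedobjs, &availobjs);
+ *	dprintf("refd=%llu avail=%llu objs=%llu/%llu\n",
+ *	    refd, avail, usedobjs, usedobjs + availobjs);
+ */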
+
+/* ARGSUSED */
+static int
+dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ char *newsnapname = arg2;
+ dsl_dir_t *dd = ds->ds_dir;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ dsl_dataset_t *hds;
+ uint64_t val;
+ int err;
+
+ err = dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds);
+ if (err)
+ return (err);
+
+ /* new name better not be in use */
+ err = zap_lookup(mos, hds->ds_phys->ds_snapnames_zapobj,
+ newsnapname, 8, 1, &val);
+ dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
+
+ if (err == 0)
+ err = EEXIST;
+ else if (err == ENOENT)
+ err = 0;
+ return (err);
+}
+
+static void
+dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = arg1;
+ char *newsnapname = arg2;
+ dsl_dir_t *dd = ds->ds_dir;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ dsl_dataset_t *hds;
+ int err;
+
+ ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
+
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds));
+
+ VERIFY(0 == dsl_dataset_get_snapname(ds));
+ err = zap_remove(mos, hds->ds_phys->ds_snapnames_zapobj,
+ ds->ds_snapname, tx);
+ ASSERT3U(err, ==, 0);
+ mutex_enter(&ds->ds_lock);
+ (void) strcpy(ds->ds_snapname, newsnapname);
+ mutex_exit(&ds->ds_lock);
+ err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
+ ds->ds_snapname, 8, 1, &ds->ds_object, tx);
+ ASSERT3U(err, ==, 0);
+
+ dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
+}
+
+#pragma weak dmu_objset_rename = dsl_dataset_rename
+int
+dsl_dataset_rename(const char *oldname, const char *newname)
+{
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ const char *tail;
+ int err;
+
+ err = dsl_dir_open(oldname, FTAG, &dd, &tail);
+ if (err)
+ return (err);
+ if (tail == NULL) {
+ err = dsl_dir_rename(dd, newname);
+ dsl_dir_close(dd, FTAG);
+ return (err);
+ }
+ if (tail[0] != '@') {
+ /* the name ended in a nonexistent component */
+ dsl_dir_close(dd, FTAG);
+ return (ENOENT);
+ }
+
+ dsl_dir_close(dd, FTAG);
+
+ /* new name must be snapshot in same filesystem */
+ tail = strchr(newname, '@');
+ if (tail == NULL)
+ return (EINVAL);
+ tail++;
+ if (strncmp(oldname, newname, tail - newname) != 0)
+ return (EXDEV);
+
+ err = dsl_dataset_open(oldname,
+ DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds);
+ if (err)
+ return (err);
+
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_snapshot_rename_check,
+ dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
+
+ dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+
+ return (err);
+}
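+
+/*
+ * The checks above confine snapshot renames to a single filesystem's
+ * namespace.  Sketching the outcomes for a hypothetical pool "tank":
+ *
+ *	dsl_dataset_rename("tank/fs@monday", "tank/fs@tuesday")  -> 0
+ *	dsl_dataset_rename("tank/fs@monday", "tank/other@x")     -> EXDEV
+ *	dsl_dataset_rename("tank/fs@monday", "tank/fs/new")      -> EINVAL
+ *
+ * The EXDEV case fails the strncmp() prefix comparison, and the
+ * EINVAL case has no '@' in the new name at all.
+ */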
+
+struct promotearg {
+ uint64_t used, comp, uncomp, unique;
+ uint64_t newnext_obj, snapnames_obj;
+};
+
+static int
+dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *hds = arg1;
+ struct promotearg *pa = arg2;
+ dsl_dir_t *dd = hds->ds_dir;
+ dsl_pool_t *dp = hds->ds_dir->dd_pool;
+ dsl_dir_t *pdd = NULL;
+ dsl_dataset_t *ds = NULL;
+ dsl_dataset_t *pivot_ds = NULL;
+ dsl_dataset_t *newnext_ds = NULL;
+ int err;
+ char *name = NULL;
+ uint64_t itor = 0;
+ blkptr_t bp;
+
+ bzero(pa, sizeof (*pa));
+
+ /* Check that it is a clone */
+ if (dd->dd_phys->dd_clone_parent_obj == 0)
+ return (EINVAL);
+
+ /* Since this is so expensive, don't do the preliminary check */
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ if (err = dsl_dataset_open_obj(dp,
+ dd->dd_phys->dd_clone_parent_obj,
+ NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds))
+ goto out;
+ pdd = pivot_ds->ds_dir;
+
+ {
+ dsl_dataset_t *phds;
+ if (err = dsl_dataset_open_obj(dd->dd_pool,
+ pdd->dd_phys->dd_head_dataset_obj,
+ NULL, DS_MODE_NONE, FTAG, &phds))
+ goto out;
+ pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj;
+ dsl_dataset_close(phds, DS_MODE_NONE, FTAG);
+ }
+
+ if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
+ err = EXDEV;
+ goto out;
+ }
+
+ /* find pivot point's new next ds */
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object,
+ NULL, DS_MODE_NONE, FTAG, &newnext_ds));
+ while (newnext_ds->ds_phys->ds_prev_snap_obj != pivot_ds->ds_object) {
+ dsl_dataset_t *prev;
+
+ if (err = dsl_dataset_open_obj(dd->dd_pool,
+ newnext_ds->ds_phys->ds_prev_snap_obj,
+ NULL, DS_MODE_NONE, FTAG, &prev))
+ goto out;
+ dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
+ newnext_ds = prev;
+ }
+ pa->newnext_obj = newnext_ds->ds_object;
+
+ /* compute pivot point's new unique space */
+ while ((err = bplist_iterate(&newnext_ds->ds_deadlist,
+ &itor, &bp)) == 0) {
+ if (bp.blk_birth > pivot_ds->ds_phys->ds_prev_snap_txg)
+ pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp);
+ }
+ if (err != ENOENT)
+ goto out;
+
+ /* Walk the snapshots that we are moving */
+ name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ ds = pivot_ds;
+ /* CONSTCOND */
+ while (TRUE) {
+ uint64_t val, dlused, dlcomp, dluncomp;
+ dsl_dataset_t *prev;
+
+ /* Check that the snapshot name does not conflict */
+ dsl_dataset_name(ds, name);
+ err = zap_lookup(dd->dd_pool->dp_meta_objset,
+ hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
+ 8, 1, &val);
+ if (err != ENOENT) {
+ if (err == 0)
+ err = EEXIST;
+ goto out;
+ }
+
+ /*
+ * compute space to transfer. Each snapshot gave birth to:
+ * (my used) - (prev's used) + (deadlist's used)
+ */
+ pa->used += ds->ds_phys->ds_used_bytes;
+ pa->comp += ds->ds_phys->ds_compressed_bytes;
+ pa->uncomp += ds->ds_phys->ds_uncompressed_bytes;
+
+ /* If we reach the first snapshot, we're done. */
+ if (ds->ds_phys->ds_prev_snap_obj == 0)
+ break;
+
+ if (err = bplist_space(&ds->ds_deadlist,
+ &dlused, &dlcomp, &dluncomp))
+ goto out;
+ if (err = dsl_dataset_open_obj(dd->dd_pool,
+ ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
+ FTAG, &prev))
+ goto out;
+ pa->used += dlused - prev->ds_phys->ds_used_bytes;
+ pa->comp += dlcomp - prev->ds_phys->ds_compressed_bytes;
+ pa->uncomp += dluncomp - prev->ds_phys->ds_uncompressed_bytes;
+
+ /*
+ * We could be a clone of a clone. If we reach our
+ * parent's branch point, we're done.
+ */
+ if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+ dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
+ break;
+ }
+ if (ds != pivot_ds)
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ ds = prev;
+ }
+
+ /* Check that there is enough space here */
+ err = dsl_dir_transfer_possible(pdd, dd, pa->used);
+
+out:
+ if (ds && ds != pivot_ds)
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ if (pivot_ds)
+ dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
+ if (newnext_ds)
+ dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
+ if (name)
+ kmem_free(name, MAXPATHLEN);
+ return (err);
+}
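+
+/*
+ * A worked example of the "space to transfer" accumulation above:
+ * if a migrated snapshot references 10M, its predecessor references
+ * 8M, and its deadlist holds 3M of blocks, that snapshot gave birth to
+ *
+ *	10M - 8M + 3M = 5M
+ *
+ * which is added to pa->used and later checked against the promoted
+ * dir's headroom by dsl_dir_transfer_possible().
+ */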
+
+static void
+dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dataset_t *hds = arg1;
+ struct promotearg *pa = arg2;
+ dsl_dir_t *dd = hds->ds_dir;
+ dsl_pool_t *dp = hds->ds_dir->dd_pool;
+ dsl_dir_t *pdd = NULL;
+ dsl_dataset_t *ds, *pivot_ds;
+ char *name;
+
+ ASSERT(dd->dd_phys->dd_clone_parent_obj != 0);
+ ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
+
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ dd->dd_phys->dd_clone_parent_obj,
+ NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds));
+ /*
+ * We need to explicitly open pdd, since pivot_ds's pdd will be
+ * changing.
+ */
+ VERIFY(0 == dsl_dir_open_obj(dp, pivot_ds->ds_dir->dd_object,
+ NULL, FTAG, &pdd));
+
+ /* move snapshots to this dir */
+ name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ ds = pivot_ds;
+ /* CONSTCOND */
+ while (TRUE) {
+ dsl_dataset_t *prev;
+
+ /* move snap name entry */
+ dsl_dataset_name(ds, name);
+ VERIFY(0 == zap_remove(dp->dp_meta_objset,
+ pa->snapnames_obj, ds->ds_snapname, tx));
+ VERIFY(0 == zap_add(dp->dp_meta_objset,
+ hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
+ 8, 1, &ds->ds_object, tx));
+
+ /* change containing dsl_dir */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ASSERT3U(ds->ds_phys->ds_dir_obj, ==, pdd->dd_object);
+ ds->ds_phys->ds_dir_obj = dd->dd_object;
+ ASSERT3P(ds->ds_dir, ==, pdd);
+ dsl_dir_close(ds->ds_dir, ds);
+ VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
+ NULL, ds, &ds->ds_dir));
+
+ ASSERT3U(dsl_prop_numcb(ds), ==, 0);
+
+ if (ds->ds_phys->ds_prev_snap_obj == 0)
+ break;
+
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
+ FTAG, &prev));
+
+ if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+ dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
+ break;
+ }
+ if (ds != pivot_ds)
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+ ds = prev;
+ }
+ if (ds != pivot_ds)
+ dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+
+ /* change pivot point's next snap */
+ dmu_buf_will_dirty(pivot_ds->ds_dbuf, tx);
+ pivot_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj;
+
+ /* change clone parentage: dd takes pdd's origin, pdd's becomes pivot */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ ASSERT3U(dd->dd_phys->dd_clone_parent_obj, ==, pivot_ds->ds_object);
+ dd->dd_phys->dd_clone_parent_obj = pdd->dd_phys->dd_clone_parent_obj;
+ dmu_buf_will_dirty(pdd->dd_dbuf, tx);
+ pdd->dd_phys->dd_clone_parent_obj = pivot_ds->ds_object;
+
+ /* change space accounting */
+ dsl_dir_diduse_space(pdd, -pa->used, -pa->comp, -pa->uncomp, tx);
+ dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx);
+ pivot_ds->ds_phys->ds_unique_bytes = pa->unique;
+
+ dsl_dir_close(pdd, FTAG);
+ dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
+ kmem_free(name, MAXPATHLEN);
+}
+
+int
+dsl_dataset_promote(const char *name)
+{
+ dsl_dataset_t *ds;
+ int err;
+ dmu_object_info_t doi;
+ struct promotearg pa;
+
+ err = dsl_dataset_open(name, DS_MODE_NONE, FTAG, &ds);
+ if (err)
+ return (err);
+
+ err = dmu_object_info(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_phys->ds_snapnames_zapobj, &doi);
+ if (err) {
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ return (err);
+ }
+
+ /*
+ * Add in 128x the snapnames zapobj size, since we will be moving
+ * a bunch of snapnames to the promoted ds, and dirtying their
+ * bonus buffers.
+ */
+ err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+ dsl_dataset_promote_check,
+ dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ return (err);
+}
+
+/*
+ * Given a pool name and a dataset object number in that pool,
+ * return the name of that dataset.
+ */
+int
+dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
+{
+ spa_t *spa;
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds = NULL;
+ int error;
+
+ if ((error = spa_open(pname, &spa, FTAG)) != 0)
+ return (error);
+ dp = spa_get_dsl(spa);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ if ((error = dsl_dataset_open_obj(dp, obj,
+ NULL, DS_MODE_NONE, FTAG, &ds)) != 0) {
+ rw_exit(&dp->dp_config_rwlock);
+ spa_close(spa, FTAG);
+ return (error);
+ }
+ dsl_dataset_name(ds, buf);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ rw_exit(&dp->dp_config_rwlock);
+ spa_close(spa, FTAG);
+
+ return (0);
+}
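+
+/*
+ * Usage sketch for the helper above, assuming a pool named "tank";
+ * the caller owns buf, and MAXNAMELEN is the conventional size for a
+ * full dataset name elsewhere in this file:
+ *
+ *	char name[MAXNAMELEN];
+ *
+ *	if (dsl_dsobj_to_dsname("tank", dsobj, name) == 0)
+ *		dprintf("object %llu is %s\n", dsobj, name);
+ */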
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
new file mode 100644
index 0000000..97779a2
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
@@ -0,0 +1,1192 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include "zfs_namecheck.h"
+
+static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd);
+static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx);
+
+
+/* ARGSUSED */
+static void
+dsl_dir_evict(dmu_buf_t *db, void *arg)
+{
+ dsl_dir_t *dd = arg;
+ dsl_pool_t *dp = dd->dd_pool;
+ int t;
+
+ for (t = 0; t < TXG_SIZE; t++) {
+ ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
+ ASSERT(dd->dd_tempreserved[t] == 0);
+ ASSERT(dd->dd_space_towrite[t] == 0);
+ }
+
+ ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes);
+
+ if (dd->dd_parent)
+ dsl_dir_close(dd->dd_parent, dd);
+
+ spa_close(dd->dd_pool->dp_spa, dd);
+
+ /*
+ * The props callback list should be empty since they hold the
+ * dir open.
+ */
+ list_destroy(&dd->dd_prop_cbs);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+}
+
+int
+dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+ const char *tail, void *tag, dsl_dir_t **ddp)
+{
+ dmu_buf_t *dbuf;
+ dsl_dir_t *dd;
+ int err;
+
+ ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
+ dsl_pool_sync_context(dp));
+
+ err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
+ if (err)
+ return (err);
+ dd = dmu_buf_get_user(dbuf);
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbuf, &doi);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR);
+ }
+#endif
+ /* XXX assert bonus buffer size is correct */
+ if (dd == NULL) {
+ dsl_dir_t *winner;
+ int err;
+
+ dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
+ dd->dd_object = ddobj;
+ dd->dd_dbuf = dbuf;
+ dd->dd_pool = dp;
+ dd->dd_phys = dbuf->db_data;
+ dd->dd_used_bytes = dd->dd_phys->dd_used_bytes;
+ mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
+ offsetof(dsl_prop_cb_record_t, cbr_node));
+
+ if (dd->dd_phys->dd_parent_obj) {
+ err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
+ NULL, dd, &dd->dd_parent);
+ if (err) {
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+ if (tail) {
+#ifdef ZFS_DEBUG
+ uint64_t foundobj;
+
+ err = zap_lookup(dp->dp_meta_objset,
+ dd->dd_parent->dd_phys->
+ dd_child_dir_zapobj,
+ tail, sizeof (foundobj), 1, &foundobj);
+ ASSERT(err || foundobj == ddobj);
+#endif
+ (void) strcpy(dd->dd_myname, tail);
+ } else {
+ err = zap_value_search(dp->dp_meta_objset,
+ dd->dd_parent->dd_phys->
+ dd_child_dir_zapobj,
+ ddobj, dd->dd_myname);
+ }
+ if (err) {
+ dsl_dir_close(dd->dd_parent, dd);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+ } else {
+ (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
+ }
+
+ winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
+ dsl_dir_evict);
+ if (winner) {
+ if (dd->dd_parent)
+ dsl_dir_close(dd->dd_parent, dd);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dd = winner;
+ } else {
+ spa_open_ref(dp->dp_spa, dd);
+ }
+ }
+
+ /*
+ * The dsl_dir_t has both open-to-close and instantiate-to-evict
+ * holds on the spa. We need the open-to-close holds because
+ * otherwise the spa_refcnt wouldn't change when we open a
+ * dir which the spa also has open, so we could incorrectly
+ * think it was OK to unload/export/destroy the pool. We need
+ * the instantiate-to-evict hold because the dsl_dir_t has a
+ * pointer to the dd_pool, which has a pointer to the spa_t.
+ */
+ spa_open_ref(dp->dp_spa, tag);
+ ASSERT3P(dd->dd_pool, ==, dp);
+ ASSERT3U(dd->dd_object, ==, ddobj);
+ ASSERT3P(dd->dd_dbuf, ==, dbuf);
+ *ddp = dd;
+ return (0);
+}
+
+void
+dsl_dir_close(dsl_dir_t *dd, void *tag)
+{
+ dprintf_dd(dd, "%s\n", "");
+ spa_close(dd->dd_pool->dp_spa, tag);
+ dmu_buf_rele(dd->dd_dbuf, tag);
+}
+
+/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
+void
+dsl_dir_name(dsl_dir_t *dd, char *buf)
+{
+ if (dd->dd_parent) {
+ dsl_dir_name(dd->dd_parent, buf);
+ (void) strcat(buf, "/");
+ } else {
+ buf[0] = '\0';
+ }
+ if (!MUTEX_HELD(&dd->dd_lock)) {
+ /*
+ * recursive mutex so that we can use
+ * dprintf_dd() with dd_lock held
+ */
+ mutex_enter(&dd->dd_lock);
+ (void) strcat(buf, dd->dd_myname);
+ mutex_exit(&dd->dd_lock);
+ } else {
+ (void) strcat(buf, dd->dd_myname);
+ }
+}
+
+int
+dsl_dir_is_private(dsl_dir_t *dd)
+{
+ int rv = FALSE;
+
+ if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent))
+ rv = TRUE;
+ if (dataset_name_hidden(dd->dd_myname))
+ rv = TRUE;
+ return (rv);
+}
+
+
+static int
+getcomponent(const char *path, char *component, const char **nextp)
+{
+ char *p;
+ if (path == NULL)
+ return (ENOENT);
+ /* This would be a good place to reserve some namespace... */
+ p = strpbrk(path, "/@");
+ if (p && (p[1] == '/' || p[1] == '@')) {
+ /* two separators in a row */
+ return (EINVAL);
+ }
+ if (p == NULL || p == path) {
+ /*
+ * if the first thing is an @ or /, it had better be an
+ * @ and it had better not have any more ats or slashes,
+ * and it had better have something after the @.
+ */
+ if (p != NULL &&
+ (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
+ return (EINVAL);
+ if (strlen(path) >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strcpy(component, path);
+ p = NULL;
+ } else if (p[0] == '/') {
+ if (p-path >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strncpy(component, path, p - path);
+ component[p-path] = '\0';
+ p++;
+ } else if (p[0] == '@') {
+ /*
+ * if the next separator is an @, there had better not be
+ * any more slashes.
+ */
+ if (strchr(path, '/'))
+ return (EINVAL);
+ if (p-path >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strncpy(component, path, p - path);
+ component[p-path] = '\0';
+ } else {
+ ASSERT(!"invalid p");
+ }
+ *nextp = p;
+ return (0);
+}
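+
+/*
+ * A sketch of how getcomponent() peels one name component per call:
+ *
+ *	path		component	*nextp
+ *	"a/b@snap"	"a"		"b@snap"
+ *	"b@snap"	"b"		"@snap"
+ *	"@snap"		"@snap"		NULL
+ *
+ * while "a//b" and "a@@snap" fail with EINVAL (two separators in a
+ * row), and an over-long component fails with ENAMETOOLONG.
+ */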
+
+/*
+ * Same as dsl_dir_open, but ignore the first component of the name and
+ * use the spa instead.
+ */
+int
+dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
+ dsl_dir_t **ddp, const char **tailp)
+{
+ char buf[MAXNAMELEN];
+ const char *next, *nextnext = NULL;
+ int err;
+ dsl_dir_t *dd;
+ dsl_pool_t *dp;
+ uint64_t ddobj;
+ int openedspa = FALSE;
+
+ dprintf("%s\n", name);
+
+ err = getcomponent(name, buf, &next);
+ if (err)
+ return (err);
+ if (spa == NULL) {
+ err = spa_open(buf, &spa, FTAG);
+ if (err) {
+ dprintf("spa_open(%s) failed\n", buf);
+ return (err);
+ }
+ openedspa = TRUE;
+
+ /* XXX this assertion belongs in spa_open */
+ ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
+ }
+
+ dp = spa_get_dsl(spa);
+
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
+ if (err) {
+ rw_exit(&dp->dp_config_rwlock);
+ if (openedspa)
+ spa_close(spa, FTAG);
+ return (err);
+ }
+
+ while (next != NULL) {
+ dsl_dir_t *child_ds;
+ err = getcomponent(next, buf, &nextnext);
+ if (err)
+ break;
+ ASSERT(next[0] != '\0');
+ if (next[0] == '@')
+ break;
+ dprintf("looking up %s in obj%lld\n",
+ buf, dd->dd_phys->dd_child_dir_zapobj);
+
+ err = zap_lookup(dp->dp_meta_objset,
+ dd->dd_phys->dd_child_dir_zapobj,
+ buf, sizeof (ddobj), 1, &ddobj);
+ if (err) {
+ if (err == ENOENT)
+ err = 0;
+ break;
+ }
+
+ err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds);
+ if (err)
+ break;
+ dsl_dir_close(dd, tag);
+ dd = child_ds;
+ next = nextnext;
+ }
+ rw_exit(&dp->dp_config_rwlock);
+
+ if (err) {
+ dsl_dir_close(dd, tag);
+ if (openedspa)
+ spa_close(spa, FTAG);
+ return (err);
+ }
+
+ /*
+ * It's an error if there's more than one component left, or
+ * tailp==NULL and there's any component left.
+ */
+ if (next != NULL &&
+ (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
+ /* bad path name */
+ dsl_dir_close(dd, tag);
+ dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
+ err = ENOENT;
+ }
+ if (tailp)
+ *tailp = next;
+ if (openedspa)
+ spa_close(spa, FTAG);
+ *ddp = dd;
+ return (err);
+}
+
+/*
+ * Return the dsl_dir_t, and possibly the last component which couldn't
+ * be found in *tailp. Return an error if the path is bogus, or if
+ * tailp==NULL and we couldn't parse the whole name. (*tailp)[0] == '@'
+ * means that the last component is a snapshot.
+ */
+int
+dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
+{
+ return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
+}
+
+uint64_t
+dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx)
+{
+ objset_t *mos = pds->dd_pool->dp_meta_objset;
+ uint64_t ddobj;
+ dsl_dir_phys_t *dsphys;
+ dmu_buf_t *dbuf;
+
+ ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
+ DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
+ VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
+ name, sizeof (uint64_t), 1, &ddobj, tx));
+ VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+
+ dsphys->dd_creation_time = gethrestime_sec();
+ dsphys->dd_parent_obj = pds->dd_object;
+ dsphys->dd_props_zapobj = zap_create(mos,
+ DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+ dsphys->dd_child_dir_zapobj = zap_create(mos,
+ DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
+ dmu_buf_rele(dbuf, FTAG);
+
+ return (ddobj);
+}
+
+/* ARGSUSED */
+int
+dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ int err;
+ uint64_t count;
+
+ /*
+ * There should be exactly two holds, both from
+ * dsl_dataset_destroy: one on the dd directory, and one on its
+ * head ds. Otherwise, someone is trying to lookup something
+ * inside this dir while we want to destroy it. The
+ * config_rwlock ensures that nobody else opens it after we
+ * check.
+ */
+ if (dmu_buf_refcount(dd->dd_dbuf) > 2)
+ return (EBUSY);
+
+ err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count);
+ if (err)
+ return (err);
+ if (count != 0)
+ return (EEXIST);
+
+ return (0);
+}
+
+void
+dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ uint64_t val, obj;
+
+ ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
+ ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
+
+ /* Remove our reservation. */
+ val = 0;
+ dsl_dir_set_reservation_sync(dd, &val, tx);
+ ASSERT3U(dd->dd_used_bytes, ==, 0);
+ ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
+
+ VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
+ VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
+ VERIFY(0 == zap_remove(mos,
+ dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
+
+ obj = dd->dd_object;
+ dsl_dir_close(dd, tag);
+ VERIFY(0 == dmu_object_free(mos, obj, tx));
+}
+
+void
+dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx)
+{
+ dsl_dir_phys_t *dsp;
+ dmu_buf_t *dbuf;
+ int error;
+
+ *ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
+ DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
+
+ error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET,
+ sizeof (uint64_t), 1, ddobjp, tx);
+ ASSERT3U(error, ==, 0);
+
+ VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsp = dbuf->db_data;
+
+ dsp->dd_creation_time = gethrestime_sec();
+ dsp->dd_props_zapobj = zap_create(mos,
+ DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+ dsp->dd_child_dir_zapobj = zap_create(mos,
+ DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
+
+ dmu_buf_rele(dbuf, FTAG);
+}
+
+void
+dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
+{
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
+ dsl_dir_space_available(dd, NULL, 0, TRUE));
+
+ mutex_enter(&dd->dd_lock);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_used_bytes);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
+ dd->dd_phys->dd_quota);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
+ dd->dd_phys->dd_reserved);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
+ dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
+ (dd->dd_phys->dd_uncompressed_bytes * 100 /
+ dd->dd_phys->dd_compressed_bytes));
+ mutex_exit(&dd->dd_lock);
+
+ if (dd->dd_phys->dd_clone_parent_obj) {
+ dsl_dataset_t *ds;
+ char buf[MAXNAMELEN];
+
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_clone_parent_obj,
+ NULL, DS_MODE_NONE, FTAG, &ds));
+ dsl_dataset_name(ds, buf);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+
+ dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
+ }
+}
+
+void
+dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dd->dd_pool;
+
+ ASSERT(dd->dd_phys);
+
+ if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) {
+ /* up the hold count until we can be written out */
+ dmu_buf_add_ref(dd->dd_dbuf, dd);
+ }
+}
+
+static int64_t
+parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
+{
+ uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
+ uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
+ return (new_accounted - old_accounted);
+}
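+
+/*
+ * Worked examples for parent_delta() above: the parent accounts for
+ * at least dd_reserved on our behalf, so with dd_reserved = 10M,
+ *
+ *	used =  2M, delta = +3M:  MAX(5M, 10M)  - MAX(2M, 10M)  =  0
+ *	used =  8M, delta = +5M:  MAX(13M, 10M) - MAX(8M, 10M)  = +3M
+ *	used = 12M, delta = -5M:  MAX(7M, 10M)  - MAX(12M, 10M) = -2M
+ *
+ * i.e. usage changes are invisible to the parent until they cross
+ * the reservation boundary.
+ */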
+
+void
+dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ mutex_enter(&dd->dd_lock);
+ ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0);
+ dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
+ dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
+ dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
+ dd->dd_phys->dd_used_bytes = dd->dd_used_bytes;
+ mutex_exit(&dd->dd_lock);
+
+ /* release the hold from dsl_dir_dirty */
+ dmu_buf_rele(dd->dd_dbuf, dd);
+}
+
+static uint64_t
+dsl_dir_estimated_space(dsl_dir_t *dd)
+{
+ int64_t space;
+ int i;
+
+ ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+ space = dd->dd_phys->dd_used_bytes;
+ ASSERT(space >= 0);
+ for (i = 0; i < TXG_SIZE; i++) {
+ space += dd->dd_space_towrite[i&TXG_MASK];
+ ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
+ }
+ return (space);
+}
+
+/*
+ * How much space would dd have available if ancestor had delta applied
+ * to it? If ondiskonly is set, we're only interested in what's
+ * on-disk, not estimated pending changes.
+ */
+uint64_t
+dsl_dir_space_available(dsl_dir_t *dd,
+ dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
+{
+ uint64_t parentspace, myspace, quota, used;
+
+ /*
+ * If there are no restrictions otherwise, assume we have
+ * unlimited space available.
+ */
+ quota = UINT64_MAX;
+ parentspace = UINT64_MAX;
+
+ if (dd->dd_parent != NULL) {
+ parentspace = dsl_dir_space_available(dd->dd_parent,
+ ancestor, delta, ondiskonly);
+ }
+
+ mutex_enter(&dd->dd_lock);
+ if (dd->dd_phys->dd_quota != 0)
+ quota = dd->dd_phys->dd_quota;
+ if (ondiskonly) {
+ used = dd->dd_used_bytes;
+ } else {
+ used = dsl_dir_estimated_space(dd);
+ }
+ if (dd == ancestor)
+ used += delta;
+
+ if (dd->dd_parent == NULL) {
+ uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
+ quota = MIN(quota, poolsize);
+ }
+
+ if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
+ /*
+ * We have some space reserved, in addition to what our
+ * parent gave us.
+ */
+ parentspace += dd->dd_phys->dd_reserved - used;
+ }
+
+ if (used > quota) {
+ /* over quota */
+ myspace = 0;
+
+ /*
+ * While it's OK to be a little over quota, if
+ * we think we are using more space than there
+ * is in the pool (which is already 1.6% more than
+ * dsl_pool_adjustedsize()), something is very
+ * wrong.
+ */
+ ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa));
+ } else {
+ /*
+ * the lesser of the space provided by our parent and
+ * the space left in our quota
+ */
+ myspace = MIN(parentspace, quota - used);
+ }
+
+ mutex_exit(&dd->dd_lock);
+
+ return (myspace);
+}
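+
+/*
+ * A worked example of the walk above: with a 50M quota, 30M used,
+ * and 100M offered by the parent, myspace = MIN(100M, 50M - 30M) =
+ * 20M.  Once used exceeds the quota, myspace clamps to 0 even if the
+ * parent still has room.
+ */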
+
+struct tempreserve {
+ list_node_t tr_node;
+ dsl_dir_t *tr_ds;
+ uint64_t tr_size;
+};
+
+/*
+ * Reserve space in this dsl_dir, to be used in this tx's txg.
+ * After the space has been dirtied (and thus
+ * dsl_dir_willuse_space() has been called), the reservation should
+ * be canceled, using dsl_dir_tempreserve_clear().
+ */
+static int
+dsl_dir_tempreserve_impl(dsl_dir_t *dd,
+ uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx)
+{
+ uint64_t txg = tx->tx_txg;
+ uint64_t est_used, quota, parent_rsrv;
+ int edquot = EDQUOT;
+ int txgidx = txg & TXG_MASK;
+ int i;
+ struct tempreserve *tr;
+
+ ASSERT3U(txg, !=, 0);
+ ASSERT3S(asize, >=, 0);
+
+ mutex_enter(&dd->dd_lock);
+ /*
+ * Check against the dsl_dir's quota. We don't add in the delta
+ * when checking for over-quota because they get one free hit.
+ */
+ est_used = dsl_dir_estimated_space(dd);
+ for (i = 0; i < TXG_SIZE; i++)
+ est_used += dd->dd_tempreserved[i];
+
+ quota = UINT64_MAX;
+
+ if (dd->dd_phys->dd_quota)
+ quota = dd->dd_phys->dd_quota;
+
+ /*
+ * If this transaction will result in a net free of space, we want
+ * to let it through, but we have to be careful: the space that it
+ * frees won't become available until *after* this txg syncs.
+ * Therefore, to ensure that it's possible to remove files from
+ * a full pool without inducing transient overcommits, we throttle
+ * netfree transactions against a quota that is slightly larger,
+ * but still within the pool's allocation slop. In cases where
+ * we're very close to full, this will allow a steady trickle of
+ * removes to get through.
+ */
+ if (dd->dd_parent == NULL) {
+ uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
+ if (poolsize < quota) {
+ quota = poolsize;
+ edquot = ENOSPC;
+ }
+ } else if (netfree) {
+ quota = UINT64_MAX;
+ }
+
+ /*
+ * If they are requesting more space and our current estimate is
+ * over quota, fail.  Let them try again (ERESTART) unless the
+ * actual on-disk usage is also over quota and there are no pending
+ * changes (which might free up space for us).
+ */
+ if (asize > 0 && est_used > quota) {
+ if (dd->dd_space_towrite[txg & TXG_MASK] != 0 ||
+ dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 ||
+ dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 ||
+ dd->dd_used_bytes < quota)
+ edquot = ERESTART;
+ dprintf_dd(dd, "failing: used=%lluK est_used = %lluK "
+ "quota=%lluK tr=%lluK err=%d\n",
+ dd->dd_used_bytes>>10, est_used>>10,
+ quota>>10, asize>>10, edquot);
+ mutex_exit(&dd->dd_lock);
+ return (edquot);
+ }
+
+ /* We need to up our estimated delta before dropping dd_lock */
+ dd->dd_tempreserved[txgidx] += asize;
+
+ parent_rsrv = parent_delta(dd, est_used, asize);
+ mutex_exit(&dd->dd_lock);
+
+ tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
+ tr->tr_ds = dd;
+ tr->tr_size = asize;
+ list_insert_tail(tr_list, tr);
+
+ /* see if it's OK with our parent */
+ if (dd->dd_parent && parent_rsrv) {
+ return (dsl_dir_tempreserve_impl(dd->dd_parent,
+ parent_rsrv, netfree, tr_list, tx));
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * Reserve space in this dsl_dir, to be used in this tx's txg.
+ * After the space has been dirtied (and thus
+ * dsl_dir_willuse_space() has been called), the reservation should
+ * be canceled, using dsl_dir_tempreserve_clear().
+ */
+int
+dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize,
+ uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx)
+{
+ int err = 0;
+ list_t *tr_list;
+
+ tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
+ list_create(tr_list, sizeof (struct tempreserve),
+ offsetof(struct tempreserve, tr_node));
+ ASSERT3S(asize, >=, 0);
+ ASSERT3S(fsize, >=, 0);
+
+ err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
+ tr_list, tx);
+
+ if (err == 0) {
+ struct tempreserve *tr;
+
+ err = arc_tempreserve_space(lsize);
+ if (err == 0) {
+ tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
+ tr->tr_ds = NULL;
+ tr->tr_size = lsize;
+ list_insert_tail(tr_list, tr);
+ }
+ }
+
+ if (err)
+ dsl_dir_tempreserve_clear(tr_list, tx);
+ else
+ *tr_cookiep = tr_list;
+ return (err);
+}
+
+/*
+ * Clear a temporary reservation that we previously made with
+ * dsl_dir_tempreserve_space().
+ */
+void
+dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
+{
+ int txgidx = tx->tx_txg & TXG_MASK;
+ list_t *tr_list = tr_cookie;
+ struct tempreserve *tr;
+
+ ASSERT3U(tx->tx_txg, !=, 0);
+
+ while (tr = list_head(tr_list)) {
+ if (tr->tr_ds == NULL) {
+ arc_tempreserve_clear(tr->tr_size);
+ } else {
+ mutex_enter(&tr->tr_ds->dd_lock);
+ ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
+ tr->tr_size);
+ tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
+ mutex_exit(&tr->tr_ds->dd_lock);
+ }
+ list_remove(tr_list, tr);
+ kmem_free(tr, sizeof (struct tempreserve));
+ }
+
+ kmem_free(tr_list, sizeof (list_t));
+}
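+
+/*
+ * Sketch of the intended reserve/clear pairing (lsize/asize/fsize are
+ * whatever estimates the caller computed; the cookie is opaque and
+ * carries the per-dir reservations recorded up the ancestor chain):
+ *
+ *	void *cookie;
+ *
+ *	if (dsl_dir_tempreserve_space(dd, lsize, asize, fsize,
+ *	    &cookie, tx) == 0) {
+ *		... dirty buffers; dsl_dir_willuse_space() is called ...
+ *		dsl_dir_tempreserve_clear(cookie, tx);
+ *	}
+ */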
+
+/*
+ * Call in open context when we think we're going to write/free space,
+ * e.g. when dirtying data.  Be conservative (i.e. it's OK to write less
+ * than this or free more than this, but don't write more or free less).
+ */
+void
+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+{
+ int64_t parent_space;
+ uint64_t est_used;
+
+ mutex_enter(&dd->dd_lock);
+ if (space > 0)
+ dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
+
+ est_used = dsl_dir_estimated_space(dd);
+ parent_space = parent_delta(dd, est_used, space);
+ mutex_exit(&dd->dd_lock);
+
+ /* Make sure that we clean up dd_space_to* */
+ dsl_dir_dirty(dd, tx);
+
+ /* XXX this is potentially expensive and unnecessary... */
+ if (parent_space && dd->dd_parent)
+ dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
+}
+
+/* call from syncing context when we actually write/free space for this dd */
+void
+dsl_dir_diduse_space(dsl_dir_t *dd,
+ int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
+{
+ int64_t accounted_delta;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dsl_dir_dirty(dd, tx);
+
+ mutex_enter(&dd->dd_lock);
+ accounted_delta = parent_delta(dd, dd->dd_used_bytes, used);
+ ASSERT(used >= 0 || dd->dd_used_bytes >= -used);
+ ASSERT(compressed >= 0 ||
+ dd->dd_phys->dd_compressed_bytes >= -compressed);
+ ASSERT(uncompressed >= 0 ||
+ dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
+ dd->dd_used_bytes += used;
+ dd->dd_phys->dd_uncompressed_bytes += uncompressed;
+ dd->dd_phys->dd_compressed_bytes += compressed;
+ mutex_exit(&dd->dd_lock);
+
+ if (dd->dd_parent != NULL) {
+ dsl_dir_diduse_space(dd->dd_parent,
+ accounted_delta, compressed, uncompressed, tx);
+ }
+}
+
+/* ARGSUSED */
+static int
+dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ uint64_t *quotap = arg2;
+ uint64_t new_quota = *quotap;
+ int err = 0;
+ uint64_t towrite;
+
+ if (new_quota == 0)
+ return (0);
+
+ mutex_enter(&dd->dd_lock);
+ /*
+ * If we are doing the preliminary check in open context, and
+ * there are pending changes, then don't fail it, since our estimate
+ * may under-count the amount of space that those pending changes
+ * will free up.
+ */
+ towrite = dd->dd_space_towrite[0] + dd->dd_space_towrite[1] +
+ dd->dd_space_towrite[2] + dd->dd_space_towrite[3];
+ if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
+ (new_quota < dd->dd_phys->dd_reserved ||
+ new_quota < dsl_dir_estimated_space(dd))) {
+ err = ENOSPC;
+ }
+ mutex_exit(&dd->dd_lock);
+ return (err);
+}
+
+static void
+dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ uint64_t *quotap = arg2;
+ uint64_t new_quota = *quotap;
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ mutex_enter(&dd->dd_lock);
+ dd->dd_phys->dd_quota = new_quota;
+ mutex_exit(&dd->dd_lock);
+}
+
+int
+dsl_dir_set_quota(const char *ddname, uint64_t quota)
+{
+ dsl_dir_t *dd;
+ int err;
+
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err)
+ return (err);
+ /*
+ * If someone removes a file, then tries to set the quota, we
+ * want to make sure the file freeing takes effect.
+ */
+ txg_wait_open(dd->dd_pool, 0);
+
+ err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
+ dsl_dir_set_quota_sync, dd, &quota, 0);
+ dsl_dir_close(dd, FTAG);
+ return (err);
+}
+
+/* ARGSUSED */
+static int
+dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ uint64_t *reservationp = arg2;
+ uint64_t new_reservation = *reservationp;
+ uint64_t used, avail;
+ int64_t delta;
+
+ if (new_reservation > INT64_MAX)
+ return (EOVERFLOW);
+
+ /*
+ * If we are doing the preliminary check in open context, the
+ * space estimates may be inaccurate.
+ */
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ mutex_enter(&dd->dd_lock);
+ used = dd->dd_used_bytes;
+ delta = MAX(used, new_reservation) -
+ MAX(used, dd->dd_phys->dd_reserved);
+ mutex_exit(&dd->dd_lock);
+
+ if (dd->dd_parent) {
+ avail = dsl_dir_space_available(dd->dd_parent,
+ NULL, 0, FALSE);
+ } else {
+ avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
+ }
+
+ if (delta > 0 && delta > avail)
+ return (ENOSPC);
+ if (delta > 0 && dd->dd_phys->dd_quota > 0 &&
+ new_reservation > dd->dd_phys->dd_quota)
+ return (ENOSPC);
+ return (0);
+}
+
+static void
+dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ uint64_t *reservationp = arg2;
+ uint64_t new_reservation = *reservationp;
+ uint64_t used;
+ int64_t delta;
+
+ mutex_enter(&dd->dd_lock);
+ used = dd->dd_used_bytes;
+ delta = MAX(used, new_reservation) -
+ MAX(used, dd->dd_phys->dd_reserved);
+ mutex_exit(&dd->dd_lock);
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dd->dd_phys->dd_reserved = new_reservation;
+
+ if (dd->dd_parent != NULL) {
+ /* Roll up this additional usage into our ancestors */
+ dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx);
+ }
+}
+
+int
+dsl_dir_set_reservation(const char *ddname, uint64_t reservation)
+{
+ dsl_dir_t *dd;
+ int err;
+
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err)
+ return (err);
+ err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
+ dsl_dir_set_reservation_sync, dd, &reservation, 0);
+ dsl_dir_close(dd, FTAG);
+ return (err);
+}
+
+static dsl_dir_t *
+closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
+{
+ for (; ds1; ds1 = ds1->dd_parent) {
+ dsl_dir_t *dd;
+ for (dd = ds2; dd; dd = dd->dd_parent) {
+ if (ds1 == dd)
+ return (dd);
+ }
+ }
+ return (NULL);
+}
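+
+/*
+ * For example, for directories tank/a/b and tank/a/c, the nested walk
+ * above visits b's chain (b, a, tank) and returns the first entry
+ * that also appears on c's chain, namely tank/a; fully disjoint
+ * chains return NULL.
+ */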
+
+/*
+ * If delta is applied to dd, how much of that delta would be applied to
+ * ancestor? Syncing context only.
+ */
+static int64_t
+would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
+{
+ if (dd == ancestor)
+ return (delta);
+
+ mutex_enter(&dd->dd_lock);
+ delta = parent_delta(dd, dd->dd_used_bytes, delta);
+ mutex_exit(&dd->dd_lock);
+ return (would_change(dd->dd_parent, delta, ancestor));
+}
+
+struct renamearg {
+ dsl_dir_t *newparent;
+ const char *mynewname;
+};
+
+/* ARGSUSED */
+static int
+dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct renamearg *ra = arg2;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ int err;
+ uint64_t val;
+
+ /* There should be 2 references: the open and the dirty */
+ if (dmu_buf_refcount(dd->dd_dbuf) > 2)
+ return (EBUSY);
+
+ /* check for existing name */
+ err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
+ ra->mynewname, 8, 1, &val);
+ if (err == 0)
+ return (EEXIST);
+ if (err != ENOENT)
+ return (err);
+
+ if (ra->newparent != dd->dd_parent) {
+ /* is there enough space? */
+ uint64_t myspace =
+ MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
+
+ /* no rename into our descendant */
+ if (closest_common_ancestor(dd, ra->newparent) == dd)
+ return (EINVAL);
+
+ if (err = dsl_dir_transfer_possible(dd->dd_parent,
+ ra->newparent, myspace))
+ return (err);
+ }
+
+ return (0);
+}
+
+static void
+dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct renamearg *ra = arg2;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ int err;
+
+ ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);
+
+ if (ra->newparent != dd->dd_parent) {
+ uint64_t myspace =
+ MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
+
+ dsl_dir_diduse_space(dd->dd_parent, -myspace,
+ -dd->dd_phys->dd_compressed_bytes,
+ -dd->dd_phys->dd_uncompressed_bytes, tx);
+ dsl_dir_diduse_space(ra->newparent, myspace,
+ dd->dd_phys->dd_compressed_bytes,
+ dd->dd_phys->dd_uncompressed_bytes, tx);
+ }
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ /* remove from old parent zapobj */
+ err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+ dd->dd_myname, tx);
+ ASSERT3U(err, ==, 0);
+
+ (void) strcpy(dd->dd_myname, ra->mynewname);
+ dsl_dir_close(dd->dd_parent, dd);
+ dd->dd_phys->dd_parent_obj = ra->newparent->dd_object;
+ VERIFY(0 == dsl_dir_open_obj(dd->dd_pool,
+ ra->newparent->dd_object, NULL, dd, &dd->dd_parent));
+
+ /* add to new parent zapobj */
+ err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
+ dd->dd_myname, 8, 1, &dd->dd_object, tx);
+ ASSERT3U(err, ==, 0);
+}
+
+int
+dsl_dir_rename(dsl_dir_t *dd, const char *newname)
+{
+ struct renamearg ra;
+ int err;
+
+ /* new parent should exist */
+ err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname);
+ if (err)
+ return (err);
+
+ /* can't rename to different pool */
+ if (dd->dd_pool != ra.newparent->dd_pool) {
+ err = ENXIO;
+ goto out;
+ }
+
+ /* new name should not already exist */
+ if (ra.mynewname == NULL) {
+ err = EEXIST;
+ goto out;
+ }
+
+ err = dsl_sync_task_do(dd->dd_pool,
+ dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
+
+out:
+ dsl_dir_close(ra.newparent, FTAG);
+ return (err);
+}
+
+int
+dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
+{
+ dsl_dir_t *ancestor;
+ int64_t adelta;
+ uint64_t avail;
+
+ ancestor = closest_common_ancestor(sdd, tdd);
+ adelta = would_change(sdd, -space, ancestor);
+ avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
+ if (avail < space)
+ return (ENOSPC);
+
+ return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
new file mode 100644
index 0000000..7046254
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
@@ -0,0 +1,255 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+
+static int
+dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp)
+{
+ uint64_t obj;
+ int err;
+
+ err = zap_lookup(dp->dp_meta_objset,
+ dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
+ MOS_DIR_NAME, sizeof (obj), 1, &obj);
+ if (err)
+ return (err);
+
+ return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp));
+}
+
+static dsl_pool_t *
+dsl_pool_open_impl(spa_t *spa, uint64_t txg)
+{
+ dsl_pool_t *dp;
+ blkptr_t *bp = spa_get_rootblkptr(spa);
+
+ dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
+ dp->dp_spa = spa;
+ dp->dp_meta_rootbp = *bp;
+ rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
+ txg_init(dp, txg);
+
+ txg_list_create(&dp->dp_dirty_datasets,
+ offsetof(dsl_dataset_t, ds_dirty_link));
+ txg_list_create(&dp->dp_dirty_dirs,
+ offsetof(dsl_dir_t, dd_dirty_link));
+ txg_list_create(&dp->dp_sync_tasks,
+ offsetof(dsl_sync_task_group_t, dstg_node));
+ list_create(&dp->dp_synced_objsets, sizeof (dsl_dataset_t),
+ offsetof(dsl_dataset_t, ds_synced_link));
+
+ return (dp);
+}
+
+int
+dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
+{
+ int err;
+ dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+ objset_impl_t *osi;
+
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
+ if (err)
+ goto out;
+ dp->dp_meta_objset = &osi->os;
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
+ &dp->dp_root_dir_obj);
+ if (err)
+ goto out;
+
+ err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+ NULL, dp, &dp->dp_root_dir);
+ if (err)
+ goto out;
+
+ err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir);
+ if (err)
+ goto out;
+
+out:
+ rw_exit(&dp->dp_config_rwlock);
+ if (err)
+ dsl_pool_close(dp);
+ else
+ *dpp = dp;
+
+ return (err);
+}
+
+void
+dsl_pool_close(dsl_pool_t *dp)
+{
+ /* drop our reference from dsl_pool_open() */
+ if (dp->dp_mos_dir)
+ dsl_dir_close(dp->dp_mos_dir, dp);
+ if (dp->dp_root_dir)
+ dsl_dir_close(dp->dp_root_dir, dp);
+
+ /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
+ if (dp->dp_meta_objset)
+ dmu_objset_evict(NULL, dp->dp_meta_objset->os);
+
+ txg_list_destroy(&dp->dp_dirty_datasets);
+ txg_list_destroy(&dp->dp_dirty_dirs);
+ list_destroy(&dp->dp_synced_objsets);
+
+ arc_flush();
+ txg_fini(dp);
+ rw_destroy(&dp->dp_config_rwlock);
+ kmem_free(dp, sizeof (dsl_pool_t));
+}
+
+dsl_pool_t *
+dsl_pool_create(spa_t *spa, uint64_t txg)
+{
+ int err;
+ dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+ dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+ dp->dp_meta_objset = &dmu_objset_create_impl(spa,
+ NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;
+
+ /* create the pool directory */
+ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
+ ASSERT3U(err, ==, 0);
+
+ /* create and open the root dir */
+ dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
+ VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+ NULL, dp, &dp->dp_root_dir));
+
+ /* create and open the meta-objset dir */
+ (void) dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx);
+ VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir));
+
+ dmu_tx_commit(tx);
+
+ return (dp);
+}
+
+void
+dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
+{
+ zio_t *zio;
+ dmu_tx_t *tx;
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ dsl_sync_task_group_t *dstg;
+ objset_impl_t *mosi = dp->dp_meta_objset->os;
+ int err;
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+ if (!list_link_active(&ds->ds_synced_link))
+ list_insert_tail(&dp->dp_synced_objsets, ds);
+ else
+ dmu_buf_rele(ds->ds_dbuf, ds);
+ dsl_dataset_sync(ds, zio, tx);
+ }
+ err = zio_wait(zio);
+ ASSERT(err == 0);
+
+ while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
+ dsl_sync_task_group_sync(dstg, tx);
+ while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
+ dsl_dir_sync(dd, tx);
+
+ if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
+ list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ dmu_objset_sync(mosi, zio, tx);
+ err = zio_wait(zio);
+ ASSERT(err == 0);
+ dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+ }
+
+ dmu_tx_commit(tx);
+}
+
+void
+dsl_pool_zil_clean(dsl_pool_t *dp)
+{
+ dsl_dataset_t *ds;
+
+ while (ds = list_head(&dp->dp_synced_objsets)) {
+ list_remove(&dp->dp_synced_objsets, ds);
+ ASSERT(ds->ds_user_ptr != NULL);
+ zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
+ dmu_buf_rele(ds->ds_dbuf, ds);
+ }
+}
+
+/*
+ * TRUE if the current thread is the tx_sync_thread or if we
+ * are being called from SPA context during pool initialization.
+ */
+int
+dsl_pool_sync_context(dsl_pool_t *dp)
+{
+ return (curthread == dp->dp_tx.tx_sync_thread ||
+ spa_get_dsl(dp->dp_spa) == NULL);
+}
+
+uint64_t
+dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
+{
+ uint64_t space, resv;
+
+ /*
+ * Reserve about 1.6% (1/64), or at least 32MB, for allocation
+ * efficiency.
+ * XXX The intent log is not accounted for, so it must fit
+ * within this slop.
+ *
+ * If we're trying to assess whether it's OK to do a free,
+ * cut the reservation in half to allow forward progress
+ * (e.g. make it possible to rm(1) files from a full pool).
+ */
+ space = spa_get_dspace(dp->dp_spa);
+ resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
+ if (netfree)
+ resv >>= 1;
+
+ return (space - resv);
+}
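+
+/*
+ * A worked example of the slop above: a pool with 64GB of dspace
+ * reserves MAX(64GB >> 6, 32MB) = 1GB, so ordinary writers see 63GB;
+ * a netfree caller sees 63.5GB, which is what lets frees make forward
+ * progress on a "full" pool.
+ */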
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
new file mode 100644
index 0000000..2fff66d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
@@ -0,0 +1,501 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/spa.h>
+#include <sys/zio_checksum.h> /* for the default checksum value */
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+
+static int
+dodefault(const char *propname, int intsz, int numint, void *buf)
+{
+ zfs_prop_t prop;
+
+ if ((prop = zfs_name_to_prop(propname)) == ZFS_PROP_INVAL ||
+ zfs_prop_readonly(prop))
+ return (ENOENT);
+
+ if (zfs_prop_get_type(prop) == prop_type_string) {
+ if (intsz != 1)
+ return (EOVERFLOW);
+ (void) strncpy(buf, zfs_prop_default_string(prop), numint);
+ } else {
+ if (intsz != 8 || numint < 1)
+ return (EOVERFLOW);
+
+ *(uint64_t *)buf = zfs_prop_default_numeric(prop);
+ }
+
+ return (0);
+}
+
+static int
+dsl_prop_get_impl(dsl_dir_t *dd, const char *propname,
+ int intsz, int numint, void *buf, char *setpoint)
+{
+ int err = ENOENT;
+ zfs_prop_t prop;
+
+ if (setpoint)
+ setpoint[0] = '\0';
+
+ prop = zfs_name_to_prop(propname);
+
+ /*
+ * Note: dd may be NULL, therefore we shouldn't dereference it
+ * outside this loop.
+ */
+ for (; dd != NULL; dd = dd->dd_parent) {
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+ err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
+ propname, intsz, numint, buf);
+ if (err != ENOENT) {
+ if (setpoint)
+ dsl_dir_name(dd, setpoint);
+ break;
+ }
+
+ /*
+ * Break out of this loop for non-inheritable properties.
+ */
+ if (prop != ZFS_PROP_INVAL &&
+ !zfs_prop_inheritable(prop))
+ break;
+ }
+ if (err == ENOENT)
+ err = dodefault(propname, intsz, numint, buf);
+
+ return (err);
+}
+
+/*
+ * Register interest in the named property. We'll call the callback
+ * once to notify it of the current property value, and again each time
+ * the property changes, until this callback is unregistered.
+ *
+ * Return 0 on success, errno if the prop is not an integer value.
+ */
+int
+dsl_prop_register(dsl_dataset_t *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ uint64_t value;
+ dsl_prop_cb_record_t *cbr;
+ int err;
+ int need_rwlock;
+
+ need_rwlock = !RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock);
+ if (need_rwlock)
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+
+ err = dsl_prop_get_impl(dd, propname, 8, 1, &value, NULL);
+ if (err != 0) {
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+ return (err);
+ }
+
+ cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
+ cbr->cbr_ds = ds;
+ cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP);
+ (void) strcpy((char *)cbr->cbr_propname, propname);
+ cbr->cbr_func = callback;
+ cbr->cbr_arg = cbarg;
+ mutex_enter(&dd->dd_lock);
+ list_insert_head(&dd->dd_prop_cbs, cbr);
+ mutex_exit(&dd->dd_lock);
+
+ cbr->cbr_func(cbr->cbr_arg, value);
+
+ VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, dd->dd_object,
+ NULL, cbr, &dd));
+ if (need_rwlock)
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+ /* Leave dataset open until this callback is unregistered */
+ return (0);
+}
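+
+/*
+ * Sketch of a hypothetical consumer caching the "atime" setting; the
+ * callback fires once immediately and again on each change until
+ * dsl_prop_unregister() is called with identical arguments:
+ *
+ *	static void
+ *	my_atime_changed_cb(void *arg, uint64_t newval)
+ *	{
+ *		*(uint64_t *)arg = newval;
+ *	}
+ *
+ *	err = dsl_prop_register(ds, "atime", my_atime_changed_cb,
+ *	    &cached_atime);
+ */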
+
+int
+dsl_prop_get_ds(dsl_dir_t *dd, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint)
+{
+ int err;
+
+ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+ err = dsl_prop_get_impl(dd, propname, intsz, numints, buf, setpoint);
+ rw_exit(&dd->dd_pool->dp_config_rwlock);
+
+ return (err);
+}
+
+int
+dsl_prop_get(const char *ddname, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint)
+{
+ dsl_dir_t *dd;
+ const char *tail;
+ int err;
+
+ err = dsl_dir_open(ddname, FTAG, &dd, &tail);
+ if (err)
+ return (err);
+ if (tail && tail[0] != '@') {
+ dsl_dir_close(dd, FTAG);
+ return (ENOENT);
+ }
+
+ err = dsl_prop_get_ds(dd, propname, intsz, numints, buf, setpoint);
+
+ dsl_dir_close(dd, FTAG);
+ return (err);
+}
+
+/*
+ * Get the current property value. It may have changed by the time this
+ * function returns, so it is NOT safe to follow up with
+ * dsl_prop_register() and assume that the value has not changed in
+ * between.
+ *
+ * Return 0 on success, ENOENT if ddname is invalid.
+ */
+int
+dsl_prop_get_integer(const char *ddname, const char *propname,
+ uint64_t *valuep, char *setpoint)
+{
+ return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
+}
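+
+/*
+ * Usage sketch, with a hypothetical dataset "tank/home"; setpoint
+ * receives the name of the dir the value was inherited from (or ""
+ * for a default), so size it like a dataset name:
+ *
+ *	uint64_t compress;
+ *	char setpoint[MAXNAMELEN];
+ *
+ *	if (dsl_prop_get_integer("tank/home", "compression",
+ *	    &compress, setpoint) == 0)
+ *		dprintf("compression=%llu from '%s'\n", compress, setpoint);
+ */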
+
+/*
+ * Unregister this callback. Return 0 on success, ENOENT if ddname is
+ * invalid, ENOMSG if no matching callback registered.
+ */
+int
+dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_cb_record_t *cbr;
+
+ mutex_enter(&dd->dd_lock);
+ for (cbr = list_head(&dd->dd_prop_cbs);
+ cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+ if (cbr->cbr_ds == ds &&
+ cbr->cbr_func == callback &&
+ cbr->cbr_arg == cbarg &&
+ strcmp(cbr->cbr_propname, propname) == 0)
+ break;
+ }
+
+ if (cbr == NULL) {
+ mutex_exit(&dd->dd_lock);
+ return (ENOMSG);
+ }
+
+ list_remove(&dd->dd_prop_cbs, cbr);
+ mutex_exit(&dd->dd_lock);
+
+ /* Clean up from dsl_prop_register; drop the hold before freeing cbr */
+ dsl_dir_close(dd, cbr);
+
+ kmem_free((void *)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
+ kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
+ return (0);
+}
+
+/*
+ * Return the number of callbacks that are registered for this dataset.
+ */
+int
+dsl_prop_numcb(dsl_dataset_t *ds)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_cb_record_t *cbr;
+ int num = 0;
+
+ mutex_enter(&dd->dd_lock);
+ for (cbr = list_head(&dd->dd_prop_cbs);
+ cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+ if (cbr->cbr_ds == ds)
+ num++;
+ }
+ mutex_exit(&dd->dd_lock);
+
+ return (num);
+}
+
+static void
+dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
+ const char *propname, uint64_t value, int first)
+{
+ dsl_dir_t *dd;
+ dsl_prop_cb_record_t *cbr;
+ objset_t *mos = dp->dp_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int err;
+
+ ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+ err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
+ if (err)
+ return;
+
+ if (!first) {
+ /*
+ * If the prop is set here, then this change is not
+ * being inherited here or below; stop the recursion.
+ */
+ err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
+ 8, 1, &value);
+ if (err == 0) {
+ dsl_dir_close(dd, FTAG);
+ return;
+ }
+ ASSERT3U(err, ==, ENOENT);
+ }
+
+ mutex_enter(&dd->dd_lock);
+ for (cbr = list_head(&dd->dd_prop_cbs);
+ cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+ if (strcmp(cbr->cbr_propname, propname) == 0) {
+ cbr->cbr_func(cbr->cbr_arg, value);
+ }
+ }
+ mutex_exit(&dd->dd_lock);
+
+ for (zap_cursor_init(&zc, mos,
+ dd->dd_phys->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ /* XXX recursion could blow stack; esp. za! */
+ dsl_prop_changed_notify(dp, za.za_first_integer,
+ propname, value, FALSE);
+ }
+ zap_cursor_fini(&zc);
+ dsl_dir_close(dd, FTAG);
+}
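
dsl_prop_changed_notify() fires every registered callback whose property name matches, then recurses into child directories, stopping wherever a local setting shadows the inherited value. A standalone model of just the dispatch step (hypothetical names; the real records live on each dsl_dir's dd_prop_cbs list):

    #include <stdio.h>
    #include <string.h>

    typedef void prop_cb_t(void *arg, unsigned long long newval);

    /* One registration, as in dsl_prop_cb_record_t. */
    struct cb_record {
            const char *propname;
            prop_cb_t *func;
            void *arg;
    };

    static void
    print_cb(void *arg, unsigned long long newval)
    {
            printf("%s -> %llu\n", (const char *)arg, newval);
    }

    static struct cb_record cbs[] = {
            { "recordsize", print_cb, "dataset A" },
            { "compression", print_cb, "dataset B" },
            { "recordsize", print_cb, "dataset C" },
    };

    /* The dispatch loop: every matching registration fires. */
    static void
    notify(const char *propname, unsigned long long value)
    {
            size_t i;

            for (i = 0; i < sizeof (cbs) / sizeof (cbs[0]); i++)
                    if (strcmp(cbs[i].propname, propname) == 0)
                            cbs[i].func(cbs[i].arg, value);
    }

    int
    main(void)
    {
            notify("recordsize", 131072);   /* fires for A and C only */
            return (0);
    }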
+
+struct prop_set_arg {
+ const char *name;
+ int intsz;
+ int numints;
+ const void *buf;
+};
+
+static void
+dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = arg1;
+ struct prop_set_arg *psa = arg2;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
+ uint64_t intval;
+ int isint;
+
+ isint = (dodefault(psa->name, 8, 1, &intval) == 0);
+
+ if (psa->numints == 0) {
+ int err = zap_remove(mos, zapobj, psa->name, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ if (isint) {
+ VERIFY(0 == dsl_prop_get_impl(dd->dd_parent,
+ psa->name, 8, 1, &intval, NULL));
+ }
+ } else {
+ VERIFY(0 == zap_update(mos, zapobj, psa->name,
+ psa->intsz, psa->numints, psa->buf, tx));
+ if (isint)
+ intval = *(uint64_t *)psa->buf;
+ }
+
+ if (isint) {
+ dsl_prop_changed_notify(dd->dd_pool,
+ dd->dd_object, psa->name, intval, TRUE);
+ }
+}
+
+int
+dsl_prop_set_dd(dsl_dir_t *dd, const char *propname,
+ int intsz, int numints, const void *buf)
+{
+ struct prop_set_arg psa;
+
+ psa.name = propname;
+ psa.intsz = intsz;
+ psa.numints = numints;
+ psa.buf = buf;
+
+ return (dsl_sync_task_do(dd->dd_pool,
+ NULL, dsl_prop_set_sync, dd, &psa, 2));
+}
+
+int
+dsl_prop_set(const char *ddname, const char *propname,
+ int intsz, int numints, const void *buf)
+{
+ dsl_dir_t *dd;
+ int err;
+
+ /*
+ * We must do these checks before we get to the syncfunc, since
+ * it can't fail.
+ */
+ if (strlen(propname) >= ZAP_MAXNAMELEN)
+ return (ENAMETOOLONG);
+ if (intsz * numints >= ZAP_MAXVALUELEN)
+ return (E2BIG);
+
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err)
+ return (err);
+ err = dsl_prop_set_dd(dd, propname, intsz, numints, buf);
+ dsl_dir_close(dd, FTAG);
+ return (err);
+}
+
+/*
+ * Iterate over all properties for this dataset and return them in an nvlist.
+ */
+int
+dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
+{
+ dsl_dataset_t *ds = os->os->os_dsl_dataset;
+ dsl_dir_t *dd = ds->ds_dir;
+ int err = 0;
+ dsl_pool_t *dp;
+ objset_t *mos;
+
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ if (dsl_dataset_is_snapshot(ds))
+ return (0);
+
+ dp = dd->dd_pool;
+ mos = dp->dp_meta_objset;
+
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ for (; dd != NULL; dd = dd->dd_parent) {
+ char setpoint[MAXNAMELEN];
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ dsl_dir_name(dd, setpoint);
+
+ for (zap_cursor_init(&zc, mos, dd->dd_phys->dd_props_zapobj);
+ (err = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ nvlist_t *propval;
+ zfs_prop_t prop;
+ /*
+ * Skip non-inheritable properties that are only set
+ * on an ancestor; a local setting always applies.
+ */
+ if ((prop = zfs_name_to_prop(za.za_name)) !=
+ ZFS_PROP_INVAL && !zfs_prop_inheritable(prop) &&
+ dd != ds->ds_dir)
+ continue;
+
+ if (nvlist_lookup_nvlist(*nvp, za.za_name,
+ &propval) == 0)
+ continue;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ if (za.za_integer_length == 1) {
+ /*
+ * String property
+ */
+ char *tmp = kmem_alloc(za.za_num_integers,
+ KM_SLEEP);
+ err = zap_lookup(mos,
+ dd->dd_phys->dd_props_zapobj,
+ za.za_name, 1, za.za_num_integers,
+ tmp);
+ if (err != 0) {
+ kmem_free(tmp, za.za_num_integers);
+ break;
+ }
+ VERIFY(nvlist_add_string(propval,
+ ZFS_PROP_VALUE, tmp) == 0);
+ kmem_free(tmp, za.za_num_integers);
+ } else {
+ /*
+ * Integer property
+ */
+ ASSERT(za.za_integer_length == 8);
+ (void) nvlist_add_uint64(propval,
+ ZFS_PROP_VALUE, za.za_first_integer);
+ }
+
+ VERIFY(nvlist_add_string(propval,
+ ZFS_PROP_SOURCE, setpoint) == 0);
+ VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
+ propval) == 0);
+ nvlist_free(propval);
+ }
+ zap_cursor_fini(&zc);
+
+ if (err != ENOENT)
+ break;
+ err = 0;
+ }
+ rw_exit(&dp->dp_config_rwlock);
+
+ return (err);
+}
+
+void
+dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value)
+{
+ nvlist_t *propval;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(propval, ZFS_PROP_VALUE, value) == 0);
+ VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
+ nvlist_free(propval);
+}
+
+void
+dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
+{
+ nvlist_t *propval;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(propval, ZFS_PROP_VALUE, value) == 0);
+ VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0);
+ nvlist_free(propval);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
new file mode 100644
index 0000000..17deb56
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
@@ -0,0 +1,196 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+
+#define DST_AVG_BLKSHIFT 14
+
+/* ARGSUSED */
+static int
+dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ return (0);
+}
+
+dsl_sync_task_group_t *
+dsl_sync_task_group_create(dsl_pool_t *dp)
+{
+ dsl_sync_task_group_t *dstg;
+
+ dstg = kmem_zalloc(sizeof (dsl_sync_task_group_t), KM_SLEEP);
+ list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t),
+ offsetof(dsl_sync_task_t, dst_node));
+ dstg->dstg_pool = dp;
+
+ return (dstg);
+}
+
+void
+dsl_sync_task_create(dsl_sync_task_group_t *dstg,
+ dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
+ void *arg1, void *arg2, int blocks_modified)
+{
+ dsl_sync_task_t *dst;
+
+ if (checkfunc == NULL)
+ checkfunc = dsl_null_checkfunc;
+ dst = kmem_zalloc(sizeof (dsl_sync_task_t), KM_SLEEP);
+ dst->dst_checkfunc = checkfunc;
+ dst->dst_syncfunc = syncfunc;
+ dst->dst_arg1 = arg1;
+ dst->dst_arg2 = arg2;
+ list_insert_tail(&dstg->dstg_tasks, dst);
+
+ dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT;
+}
+
+int
+dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg)
+{
+ dmu_tx_t *tx;
+ uint64_t txg;
+ dsl_sync_task_t *dst;
+
+top:
+ tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir);
+ VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+
+ txg = dmu_tx_get_txg(tx);
+
+ /* Do a preliminary error check. */
+ dstg->dstg_err = 0;
+ rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER);
+ for (dst = list_head(&dstg->dstg_tasks); dst;
+ dst = list_next(&dstg->dstg_tasks, dst)) {
+#ifdef ZFS_DEBUG
+ /*
+ * Only check half the time; otherwise, the sync-context
+ * check would almost never fail.
+ */
+ if (spa_get_random(2) == 0)
+ continue;
+#endif
+ dst->dst_err =
+ dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
+ if (dst->dst_err)
+ dstg->dstg_err = dst->dst_err;
+ }
+ rw_exit(&dstg->dstg_pool->dp_config_rwlock);
+
+ if (dstg->dstg_err) {
+ dmu_tx_commit(tx);
+ return (dstg->dstg_err);
+ }
+
+ VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
+
+ dmu_tx_commit(tx);
+
+ txg_wait_synced(dstg->dstg_pool, txg);
+
+ if (dstg->dstg_err == EAGAIN)
+ goto top;
+
+ return (dstg->dstg_err);
+}
+
+void
+dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg)
+{
+ dsl_sync_task_t *dst;
+
+ while ((dst = list_head(&dstg->dstg_tasks)) != NULL) {
+ list_remove(&dstg->dstg_tasks, dst);
+ kmem_free(dst, sizeof (dsl_sync_task_t));
+ }
+ kmem_free(dstg, sizeof (dsl_sync_task_group_t));
+}
+
+void
+dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
+{
+ dsl_sync_task_t *dst;
+ void *tr_cookie;
+
+ ASSERT3U(dstg->dstg_err, ==, 0);
+
+ /*
+ * Check for sufficient space.
+ */
+ dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir,
+ dstg->dstg_space, dstg->dstg_space * 3, 0, &tr_cookie, tx);
+ /* don't bother trying again */
+ if (dstg->dstg_err == ERESTART)
+ dstg->dstg_err = EAGAIN;
+ if (dstg->dstg_err)
+ return;
+
+ /*
+ * Check for errors by calling checkfuncs.
+ */
+ rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_WRITER);
+ for (dst = list_head(&dstg->dstg_tasks); dst;
+ dst = list_next(&dstg->dstg_tasks, dst)) {
+ dst->dst_err =
+ dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
+ if (dst->dst_err)
+ dstg->dstg_err = dst->dst_err;
+ }
+
+ if (dstg->dstg_err == 0) {
+ /*
+ * Execute sync tasks.
+ */
+ for (dst = list_head(&dstg->dstg_tasks); dst;
+ dst = list_next(&dstg->dstg_tasks, dst)) {
+ dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx);
+ }
+ }
+ rw_exit(&dstg->dstg_pool->dp_config_rwlock);
+
+ dsl_dir_tempreserve_clear(tr_cookie, tx);
+}
+
+int
+dsl_sync_task_do(dsl_pool_t *dp,
+ dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
+ void *arg1, void *arg2, int blocks_modified)
+{
+ dsl_sync_task_group_t *dstg;
+ int err;
+
+ dstg = dsl_sync_task_group_create(dp);
+ dsl_sync_task_create(dstg, checkfunc, syncfunc,
+ arg1, arg2, blocks_modified);
+ err = dsl_sync_task_group_wait(dstg);
+ dsl_sync_task_group_destroy(dstg);
+ return (err);
+}
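
dsl_sync_task_do() is the one-shot wrapper around this machinery: the check function runs first and may fail with no side effects; only when every check in the group passes do the sync functions run, and those must not fail (EAGAIN from syncing context restarts the whole group). A standalone sketch of that two-phase contract, not the kernel implementation:

    #include <stdio.h>
    #include <errno.h>

    typedef int checkfunc_t(void *arg);
    typedef void syncfunc_t(void *arg);

    /*
     * Two-phase contract: run the check first; only if it passes,
     * run the sync step, which is not allowed to fail.
     */
    static int
    sync_task_do(checkfunc_t *check, syncfunc_t *sync, void *arg)
    {
            int err = check(arg);

            if (err != 0)
                    return (err);   /* nothing has been modified */
            sync(arg);              /* committed; may not fail */
            return (0);
    }

    static int
    check_positive(void *arg)
    {
            return (*(int *)arg > 0 ? 0 : EINVAL);
    }

    static void
    apply(void *arg)
    {
            printf("applied %d\n", *(int *)arg);
    }

    int
    main(void)
    {
            int v = 7;

            printf("err = %d\n", sync_task_do(check_positive, apply, &v));
            return (0);
    }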
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/fletcher.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/fletcher.c
new file mode 100644
index 0000000..edda3c9
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/fletcher.c
@@ -0,0 +1,145 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/byteorder.h>
+#include <sys/spa.h>
+
+void
+fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = ip + (size / sizeof (uint64_t));
+ uint64_t a0, b0, a1, b1;
+
+ for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
+ a0 += ip[0];
+ a1 += ip[1];
+ b0 += a0;
+ b1 += a1;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+}
+
+void
+fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = ip + (size / sizeof (uint64_t));
+ uint64_t a0, b0, a1, b1;
+
+ for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
+ a0 += BSWAP_64(ip[0]);
+ a1 += BSWAP_64(ip[1]);
+ b0 += a0;
+ b1 += a1;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+}
+
+void
+fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ for (a = b = c = d = 0; ip < ipend; ip++) {
+ a += ip[0];
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
+
+void
+fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ for (a = b = c = d = 0; ip < ipend; ip++) {
+ a += BSWAP_32(ip[0]);
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
+
+void
+fletcher_4_incremental_native(const void *buf, uint64_t size,
+ zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ a = zcp->zc_word[0];
+ b = zcp->zc_word[1];
+ c = zcp->zc_word[2];
+ d = zcp->zc_word[3];
+
+ for (; ip < ipend; ip++) {
+ a += ip[0];
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
+
+void
+fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
+ zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ a = zcp->zc_word[0];
+ b = zcp->zc_word[1];
+ c = zcp->zc_word[2];
+ d = zcp->zc_word[3];
+
+ for (; ip < ipend; ip++) {
+ a += BSWAP_32(ip[0]);
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
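
Because all four checksum words are running sums seeded from zcp, the incremental variants compose: checksumming a buffer in chunks gives the same result as a single pass. A self-contained demonstration using a local userland copy of the fletcher-4 update:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* Local reimplementation of the fletcher-4 update, for demonstration. */
    static void
    fletcher4(const void *buf, uint64_t size, uint64_t s[4])
    {
            const uint32_t *ip = buf;
            const uint32_t *ipend = ip + (size / sizeof (uint32_t));

            for (; ip < ipend; ip++) {
                    s[0] += ip[0];
                    s[1] += s[0];
                    s[2] += s[1];
                    s[3] += s[2];
            }
    }

    int
    main(void)
    {
            uint32_t data[16];
            uint64_t whole[4] = { 0 }, parts[4] = { 0 };
            int i;

            for (i = 0; i < 16; i++)
                    data[i] = i * 2654435761u;

            fletcher4(data, sizeof (data), whole);          /* one pass */
            fletcher4(data, 32, parts);                     /* first half */
            fletcher4((char *)data + 32, 32, parts);        /* second half */

            printf("%s\n", memcmp(whole, parts, sizeof (whole)) == 0 ?
                "incremental == one-shot" : "mismatch");
            return (0);
    }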
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c
new file mode 100644
index 0000000..b257d4a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/gzip.c
@@ -0,0 +1,69 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/debug.h>
+#include <sys/types.h>
+#include <sys/zmod.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <strings.h>
+#endif
+
+size_t
+gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ size_t dstlen = d_len;
+
+ ASSERT(d_len <= s_len);
+
+ if (z_compress_level(d_start, &dstlen, s_start, s_len, n) != Z_OK) {
+ if (d_len != s_len)
+ return (s_len);
+
+ bcopy(s_start, d_start, s_len);
+ return (s_len);
+ }
+
+ return (dstlen);
+}
+
+/*ARGSUSED*/
+int
+gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ size_t dstlen = d_len;
+
+ ASSERT(d_len >= s_len);
+
+ if (z_uncompress(d_start, &dstlen, s_start, s_len) != Z_OK)
+ return (-1);
+
+ return (0);
+}
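
The contract here is that a return of s_len means "did not fit; store the block uncompressed", with the copy-back only valid when the destination is the same size as the source; z_compress_level() is the kernel's shim over zlib. A userland analogue of the same fallback, built on zlib's compress2() (assumes zlib headers are installed; link with -lz):

    #include <stdio.h>
    #include <string.h>
    #include <zlib.h>

    /* Userland analogue of gzip_compress(): s_len means "didn't fit". */
    static size_t
    try_compress(const void *src, void *dst, size_t s_len, size_t d_len)
    {
            uLongf dstlen = d_len;

            if (compress2(dst, &dstlen, src, s_len, 6) != Z_OK) {
                    if (d_len == s_len)
                            memcpy(dst, src, s_len);  /* store raw */
                    return (s_len);
            }
            return (dstlen);
    }

    int
    main(void)
    {
            char src[256], dst[256];

            memset(src, 'a', sizeof (src));  /* highly compressible */
            printf("256 -> %zu bytes\n",
                try_compress(src, dst, sizeof (src), sizeof (dst)));
            return (0);
    }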
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
new file mode 100644
index 0000000..a88b85c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
@@ -0,0 +1,129 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * We keep our own copy of this algorithm for 2 main reasons:
+ * 1. If we didn't, anyone modifying common/os/compress.c would
+ * directly break our on disk format
+ * 2. Our version of lzjb does not have a number of checks that the
+ * common/os version needs and uses
+ * In particular, we are adding the "feature" that compress() can
+ * take a destination buffer size and return s_len if the data will
+ * not compress to d_len or less.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/types.h>
+
+#define MATCH_BITS 6
+#define MATCH_MIN 3
+#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
+#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
+#define LEMPEL_SIZE 256
+
+/*ARGSUSED*/
+size_t
+lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *cpy, *copymap;
+ int copymask = 1 << (NBBY - 1);
+ int mlen, offset;
+ uint16_t *hp;
+ uint16_t lempel[LEMPEL_SIZE]; /* uninitialized; see above */
+
+ while (src < (uchar_t *)s_start + s_len) {
+ if ((copymask <<= 1) == (1 << NBBY)) {
+ if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
+ if (d_len != s_len)
+ return (s_len);
+ mlen = s_len;
+ for (src = s_start, dst = d_start; mlen; mlen--)
+ *dst++ = *src++;
+ return (s_len);
+ }
+ copymask = 1;
+ copymap = dst;
+ *dst++ = 0;
+ }
+ if (src > (uchar_t *)s_start + s_len - MATCH_MAX) {
+ *dst++ = *src++;
+ continue;
+ }
+ hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) &
+ (LEMPEL_SIZE - 1)];
+ offset = (intptr_t)(src - *hp) & OFFSET_MASK;
+ *hp = (uint16_t)(uintptr_t)src;
+ cpy = src - offset;
+ if (cpy >= (uchar_t *)s_start && cpy != src &&
+ src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) {
+ *copymap |= copymask;
+ for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
+ if (src[mlen] != cpy[mlen])
+ break;
+ *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
+ (offset >> NBBY);
+ *dst++ = (uchar_t)offset;
+ src += mlen;
+ } else {
+ *dst++ = *src++;
+ }
+ }
+ return (dst - (uchar_t *)d_start);
+}
+
+/*ARGSUSED*/
+int
+lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *d_end = (uchar_t *)d_start + d_len;
+ uchar_t *cpy, copymap;
+ int copymask = 1 << (NBBY - 1);
+
+ while (dst < d_end) {
+ if ((copymask <<= 1) == (1 << NBBY)) {
+ copymask = 1;
+ copymap = *src++;
+ }
+ if (copymap & copymask) {
+ int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
+ int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
+ src += 2;
+ if ((cpy = dst - offset) < (uchar_t *)d_start)
+ return (-1);
+ while (--mlen >= 0 && dst < d_end)
+ *dst++ = *cpy++;
+ } else {
+ *dst++ = *src++;
+ }
+ }
+ return (0);
+}
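
With MATCH_BITS = 6 and NBBY = 8, a match token is two bytes: the top six bits of the first byte hold mlen - MATCH_MIN, and the low ten bits of the pair hold the backward offset. A tiny round-trip of that packing, with the constants copied from above:

    #include <stdio.h>

    #define NBBY            8
    #define MATCH_BITS      6
    #define MATCH_MIN       3
    #define OFFSET_MASK     ((1 << (16 - MATCH_BITS)) - 1)

    int
    main(void)
    {
            int mlen = 17, offset = 700;    /* an arbitrary legal match */
            unsigned char b0, b1;
            int dmlen, doffset;

            /* Encode, as in lzjb_compress(). */
            b0 = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
                (offset >> NBBY);
            b1 = (unsigned char)offset;

            /* Decode, as in lzjb_decompress(). */
            dmlen = (b0 >> (NBBY - MATCH_BITS)) + MATCH_MIN;
            doffset = ((b0 << NBBY) | b1) & OFFSET_MASK;

            /* Prints: mlen 17 -> 17, offset 700 -> 700 */
            printf("mlen %d -> %d, offset %d -> %d\n",
                mlen, dmlen, offset, doffset);
            return (0);
    }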
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
new file mode 100644
index 0000000..0dba134
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -0,0 +1,1023 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/space_map.h>
+#include <sys/metaslab_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+uint64_t metaslab_aliquot = 512ULL << 10;
+
+/*
+ * ==========================================================================
+ * Metaslab classes
+ * ==========================================================================
+ */
+metaslab_class_t *
+metaslab_class_create(void)
+{
+ metaslab_class_t *mc;
+
+ mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+
+ mc->mc_rotor = NULL;
+
+ return (mc);
+}
+
+void
+metaslab_class_destroy(metaslab_class_t *mc)
+{
+ metaslab_group_t *mg;
+
+ while ((mg = mc->mc_rotor) != NULL) {
+ metaslab_class_remove(mc, mg);
+ metaslab_group_destroy(mg);
+ }
+
+ kmem_free(mc, sizeof (metaslab_class_t));
+}
+
+void
+metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
+{
+ metaslab_group_t *mgprev, *mgnext;
+
+ ASSERT(mg->mg_class == NULL);
+
+ if ((mgprev = mc->mc_rotor) == NULL) {
+ mg->mg_prev = mg;
+ mg->mg_next = mg;
+ } else {
+ mgnext = mgprev->mg_next;
+ mg->mg_prev = mgprev;
+ mg->mg_next = mgnext;
+ mgprev->mg_next = mg;
+ mgnext->mg_prev = mg;
+ }
+ mc->mc_rotor = mg;
+ mg->mg_class = mc;
+}
+
+void
+metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
+{
+ metaslab_group_t *mgprev, *mgnext;
+
+ ASSERT(mg->mg_class == mc);
+
+ mgprev = mg->mg_prev;
+ mgnext = mg->mg_next;
+
+ if (mg == mgnext) {
+ mc->mc_rotor = NULL;
+ } else {
+ mc->mc_rotor = mgnext;
+ mgprev->mg_next = mgnext;
+ mgnext->mg_prev = mgprev;
+ }
+
+ mg->mg_prev = NULL;
+ mg->mg_next = NULL;
+ mg->mg_class = NULL;
+}
+
+/*
+ * ==========================================================================
+ * Metaslab groups
+ * ==========================================================================
+ */
+static int
+metaslab_compare(const void *x1, const void *x2)
+{
+ const metaslab_t *m1 = x1;
+ const metaslab_t *m2 = x2;
+
+ if (m1->ms_weight < m2->ms_weight)
+ return (1);
+ if (m1->ms_weight > m2->ms_weight)
+ return (-1);
+
+ /*
+ * If the weights are identical, use the offset to force uniqueness.
+ */
+ if (m1->ms_map.sm_start < m2->ms_map.sm_start)
+ return (-1);
+ if (m1->ms_map.sm_start > m2->ms_map.sm_start)
+ return (1);
+
+ ASSERT3P(m1, ==, m2);
+
+ return (0);
+}
+
+metaslab_group_t *
+metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
+{
+ metaslab_group_t *mg;
+
+ mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
+ mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&mg->mg_metaslab_tree, metaslab_compare,
+ sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
+ mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
+ mg->mg_vd = vd;
+ metaslab_class_add(mc, mg);
+
+ return (mg);
+}
+
+void
+metaslab_group_destroy(metaslab_group_t *mg)
+{
+ avl_destroy(&mg->mg_metaslab_tree);
+ mutex_destroy(&mg->mg_lock);
+ kmem_free(mg, sizeof (metaslab_group_t));
+}
+
+static void
+metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
+{
+ mutex_enter(&mg->mg_lock);
+ ASSERT(msp->ms_group == NULL);
+ msp->ms_group = mg;
+ msp->ms_weight = 0;
+ avl_add(&mg->mg_metaslab_tree, msp);
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
+{
+ mutex_enter(&mg->mg_lock);
+ ASSERT(msp->ms_group == mg);
+ avl_remove(&mg->mg_metaslab_tree, msp);
+ msp->ms_group = NULL;
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
+{
+ /*
+ * Although in principle the weight can be any value, in
+ * practice we do not use values in the range [1, 510].
+ */
+ ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ mutex_enter(&mg->mg_lock);
+ ASSERT(msp->ms_group == mg);
+ avl_remove(&mg->mg_metaslab_tree, msp);
+ msp->ms_weight = weight;
+ avl_add(&mg->mg_metaslab_tree, msp);
+ mutex_exit(&mg->mg_lock);
+}
+
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static void
+metaslab_ff_load(space_map_t *sm)
+{
+ ASSERT(sm->sm_ppd == NULL);
+ sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+}
+
+static void
+metaslab_ff_unload(space_map_t *sm)
+{
+ kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
+ sm->sm_ppd = NULL;
+}
+
+static uint64_t
+metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ uint64_t align = size & -size;
+ uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+ space_seg_t *ss, ssearch;
+ avl_index_t where;
+
+ ssearch.ss_start = *cursor;
+ ssearch.ss_end = *cursor + size;
+
+ ss = avl_find(t, &ssearch, &where);
+ if (ss == NULL)
+ ss = avl_nearest(t, where, AVL_AFTER);
+
+ while (ss != NULL) {
+ uint64_t offset = P2ROUNDUP(ss->ss_start, align);
+
+ if (offset + size <= ss->ss_end) {
+ *cursor = offset + size;
+ return (offset);
+ }
+ ss = AVL_NEXT(t, ss);
+ }
+
+ /*
+ * If we know we've searched the whole map (*cursor == 0), give up.
+ * Otherwise, reset the cursor to the beginning and try again.
+ */
+ if (*cursor == 0)
+ return (-1ULL);
+
+ *cursor = 0;
+ return (metaslab_ff_alloc(sm, size));
+}
+
+/* ARGSUSED */
+static void
+metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ /* No need to update cursor */
+}
+
+/* ARGSUSED */
+static void
+metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ /* No need to update cursor */
+}
+
+static space_map_ops_t metaslab_ff_ops = {
+ metaslab_ff_load,
+ metaslab_ff_unload,
+ metaslab_ff_alloc,
+ metaslab_ff_claim,
+ metaslab_ff_free
+};
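
metaslab_ff_alloc() keeps one cursor per power-of-two alignment: size & -size isolates the lowest set bit of the request, and highbit(align) - 1 selects one of the 64 cursors zeroed in metaslab_ff_load(). A small demonstration of that indexing trick:

    #include <stdio.h>
    #include <stdint.h>

    /* Index of the highest set bit, 1-based, like highbit(). */
    static int
    highbit64(uint64_t v)
    {
            int h = 0;

            while (v != 0) {
                    h++;
                    v >>= 1;
            }
            return (h);
    }

    int
    main(void)
    {
            uint64_t sizes[] = { 512, 4096, 12288, 131072 };
            int i;

            for (i = 0; i < 4; i++) {
                    uint64_t align = sizes[i] & -sizes[i];

                    printf("size %7llu: align %7llu, cursor slot %d\n",
                        (unsigned long long)sizes[i],
                        (unsigned long long)align, highbit64(align) - 1);
            }
            return (0);
    }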
+
+/*
+ * ==========================================================================
+ * Metaslabs
+ * ==========================================================================
+ */
+metaslab_t *
+metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
+ uint64_t start, uint64_t size, uint64_t txg)
+{
+ vdev_t *vd = mg->mg_vd;
+ metaslab_t *msp;
+
+ msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
+ mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ msp->ms_smo_syncing = *smo;
+
+ /*
+ * We create the main space map here, but we don't create the
+ * allocmaps and freemaps until metaslab_sync_done(). This serves
+ * two purposes: it allows metaslab_sync_done() to detect the
+ * addition of new space; and for debugging, it ensures that we
+ * would take a data fault on any attempt to use this metaslab
+ * before it's ready.
+ */
+ space_map_create(&msp->ms_map, start, size,
+ vd->vdev_ashift, &msp->ms_lock);
+
+ metaslab_group_add(mg, msp);
+
+ /*
+ * If we're opening an existing pool (txg == 0) or creating
+ * a new one (txg == TXG_INITIAL), all space is available now.
+ * If we're adding space to an existing pool, the new space
+ * does not become available until after this txg has synced.
+ */
+ if (txg <= TXG_INITIAL)
+ metaslab_sync_done(msp, 0);
+
+ if (txg != 0) {
+ /*
+ * The vdev is dirty, but the metaslab isn't -- it just needs
+ * to have metaslab_sync_done() invoked from vdev_sync_done().
+ * [We could just dirty the metaslab, but that would cause us
+ * to allocate a space map object for it, which is wasteful
+ * and would mess up the locality logic in metaslab_weight().]
+ */
+ ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa));
+ vdev_dirty(vd, 0, NULL, txg);
+ vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg));
+ }
+
+ return (msp);
+}
+
+void
+metaslab_fini(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ int t;
+
+ vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
+ -msp->ms_smo.smo_alloc);
+
+ metaslab_group_remove(mg, msp);
+
+ mutex_enter(&msp->ms_lock);
+
+ space_map_unload(&msp->ms_map);
+ space_map_destroy(&msp->ms_map);
+
+ for (t = 0; t < TXG_SIZE; t++) {
+ space_map_destroy(&msp->ms_allocmap[t]);
+ space_map_destroy(&msp->ms_freemap[t]);
+ }
+
+ mutex_exit(&msp->ms_lock);
+ mutex_destroy(&msp->ms_lock);
+
+ kmem_free(msp, sizeof (metaslab_t));
+}
+
+#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
+#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
+#define METASLAB_ACTIVE_MASK \
+ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+#define METASLAB_SMO_BONUS_MULTIPLIER 2
+
+static uint64_t
+metaslab_weight(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ space_map_t *sm = &msp->ms_map;
+ space_map_obj_t *smo = &msp->ms_smo;
+ vdev_t *vd = mg->mg_vd;
+ uint64_t weight, space;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * The baseline weight is the metaslab's free space.
+ */
+ space = sm->sm_size - smo->smo_alloc;
+ weight = space;
+
+ /*
+ * Modern disks have uniform bit density and constant angular velocity.
+ * Therefore, the outer recording zones are faster (higher bandwidth)
+ * than the inner zones by the ratio of outer to inner track diameter,
+ * which is typically around 2:1. We account for this by assigning
+ * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
+ * In effect, this means that we'll select the metaslab with the most
+ * free bandwidth rather than simply the one with the most free space.
+ */
+ weight = 2 * weight -
+ ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
+ ASSERT(weight >= space && weight <= 2 * space);
+
+ /*
+ * For locality, assign higher weight to metaslabs we've used before.
+ */
+ if (smo->smo_object != 0)
+ weight *= METASLAB_SMO_BONUS_MULTIPLIER;
+ ASSERT(weight >= space &&
+ weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
+
+ /*
+ * If this metaslab is one we're actively using, adjust its weight to
+ * make it preferable to any inactive metaslab so we'll polish it off.
+ */
+ weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+
+ return (weight);
+}
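
Plugging numbers into the scaling above makes the range concrete: with the metaslab index taken as sm_start >> vdev_ms_shift, index 0 keeps weight 2*space and the innermost metaslab approaches 1*space (the SMO bonus then doubles whichever result). A worked example:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            uint64_t space = 1000000;       /* free bytes in the metaslab */
            uint64_t ms_count = 100;        /* metaslabs on this vdev */
            uint64_t idx;

            for (idx = 0; idx < ms_count; idx += 33) {
                    /* weight = 2*space - (idx * space) / ms_count */
                    uint64_t w = 2 * space - (idx * space) / ms_count;

                    printf("metaslab %2llu: weight %llu (%.2fx space)\n",
                        (unsigned long long)idx, (unsigned long long)w,
                        (double)w / space);
            }
            return (0);
    }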
+
+static int
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
+{
+ space_map_t *sm = &msp->ms_map;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
+ int error = space_map_load(sm, &metaslab_ff_ops,
+ SM_FREE, &msp->ms_smo,
+ msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
+ if (error) {
+ metaslab_group_sort(msp->ms_group, msp, 0);
+ return (error);
+ }
+ metaslab_group_sort(msp->ms_group, msp,
+ msp->ms_weight | activation_weight);
+ }
+ ASSERT(sm->sm_loaded);
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
+
+ return (0);
+}
+
+static void
+metaslab_passivate(metaslab_t *msp, uint64_t size)
+{
+ /*
+ * If size < SPA_MINBLOCKSIZE, then we will not allocate from
+ * this metaslab again. In that case, it had better be empty,
+ * or we would be leaving space on the table.
+ */
+ ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
+ metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
+ ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
+}
+
+/*
+ * Write a metaslab to disk in the context of the specified transaction group.
+ */
+void
+metaslab_sync(metaslab_t *msp, uint64_t txg)
+{
+ vdev_t *vd = msp->ms_group->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
+ space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
+ space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+ space_map_t *sm = &msp->ms_map;
+ space_map_obj_t *smo = &msp->ms_smo_syncing;
+ dmu_buf_t *db;
+ dmu_tx_t *tx;
+ int t;
+
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ /*
+ * The only state that can actually be changing concurrently with
+ * metaslab_sync() is the metaslab's ms_map. No other thread can
+ * be modifying this txg's allocmap, freemap, freed_map, or smo.
+ * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
+ * We drop it whenever we call into the DMU, because the DMU
+ * can call down to us (e.g. via zio_free()) at any time.
+ */
+ mutex_enter(&msp->ms_lock);
+
+ if (smo->smo_object == 0) {
+ ASSERT(smo->smo_objsize == 0);
+ ASSERT(smo->smo_alloc == 0);
+ mutex_exit(&msp->ms_lock);
+ smo->smo_object = dmu_object_alloc(mos,
+ DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
+ DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
+ ASSERT(smo->smo_object != 0);
+ dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
+ (sm->sm_start >> vd->vdev_ms_shift),
+ sizeof (uint64_t), &smo->smo_object, tx);
+ mutex_enter(&msp->ms_lock);
+ }
+
+ space_map_walk(freemap, space_map_add, freed_map);
+
+ if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
+ 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
+ /*
+ * The in-core space map representation is twice as compact
+ * as the on-disk one, so it's time to condense the latter
+ * by generating a pure allocmap from first principles.
+ *
+ * This metaslab is 100% allocated,
+ * minus the content of the in-core map (sm),
+ * minus what's been freed this txg (freed_map),
+ * minus allocations from txgs in the future
+ * (because they haven't been committed yet).
+ */
+ space_map_vacate(allocmap, NULL, NULL);
+ space_map_vacate(freemap, NULL, NULL);
+
+ space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);
+
+ space_map_walk(sm, space_map_remove, allocmap);
+ space_map_walk(freed_map, space_map_remove, allocmap);
+
+ for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+ space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
+ space_map_remove, allocmap);
+
+ mutex_exit(&msp->ms_lock);
+ space_map_truncate(smo, mos, tx);
+ mutex_enter(&msp->ms_lock);
+ }
+
+ space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
+ space_map_sync(freemap, SM_FREE, smo, mos, tx);
+
+ mutex_exit(&msp->ms_lock);
+
+ VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ ASSERT3U(db->db_size, ==, sizeof (*smo));
+ bcopy(smo, db->db_data, db->db_size);
+ dmu_buf_rele(db, FTAG);
+
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Called after a transaction group has completely synced to mark
+ * all of the metaslab's free space as usable.
+ */
+void
+metaslab_sync_done(metaslab_t *msp, uint64_t txg)
+{
+ space_map_obj_t *smo = &msp->ms_smo;
+ space_map_obj_t *smosync = &msp->ms_smo_syncing;
+ space_map_t *sm = &msp->ms_map;
+ space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ int t;
+
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * If this metaslab is just becoming available, initialize its
+ * allocmaps and freemaps and add its capacity to the vdev.
+ */
+ if (freed_map->sm_size == 0) {
+ for (t = 0; t < TXG_SIZE; t++) {
+ space_map_create(&msp->ms_allocmap[t], sm->sm_start,
+ sm->sm_size, sm->sm_shift, sm->sm_lock);
+ space_map_create(&msp->ms_freemap[t], sm->sm_start,
+ sm->sm_size, sm->sm_shift, sm->sm_lock);
+ }
+ vdev_space_update(vd, sm->sm_size, 0);
+ }
+
+ vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc);
+
+ ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
+ ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
+
+ /*
+ * If there's a space_map_load() in progress, wait for it to complete
+ * so that we have a consistent view of the in-core space map.
+ * Then, add everything we freed in this txg to the map.
+ */
+ space_map_load_wait(sm);
+ space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm);
+
+ *smo = *smosync;
+
+ /*
+ * If the map is loaded but no longer active, evict it as soon as all
+ * future allocations have synced. (If we unloaded it now and then
+ * loaded a moment later, the map wouldn't reflect those allocations.)
+ */
+ if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
+ int evictable = 1;
+
+ for (t = 1; t < TXG_CONCURRENT_STATES; t++)
+ if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
+ evictable = 0;
+
+ if (evictable)
+ space_map_unload(sm);
+ }
+
+ metaslab_group_sort(mg, msp, metaslab_weight(msp));
+
+ mutex_exit(&msp->ms_lock);
+}
+
+static uint64_t
+metaslab_distance(metaslab_t *msp, dva_t *dva)
+{
+ uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
+ uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
+ uint64_t start = msp->ms_map.sm_start >> ms_shift;
+
+ if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
+ return (1ULL << 63);
+
+ if (offset < start)
+ return ((start - offset) << ms_shift);
+ if (offset > start)
+ return ((offset - start) << ms_shift);
+ return (0);
+}
+
+static uint64_t
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
+ uint64_t min_distance, dva_t *dva, int d)
+{
+ metaslab_t *msp = NULL;
+ uint64_t offset = -1ULL;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ uint64_t activation_weight;
+ uint64_t target_distance;
+ int i;
+
+ activation_weight = METASLAB_WEIGHT_PRIMARY;
+ for (i = 0; i < d; i++)
+ if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id)
+ activation_weight = METASLAB_WEIGHT_SECONDARY;
+
+ for (;;) {
+ mutex_enter(&mg->mg_lock);
+ for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
+ if (msp->ms_weight < size) {
+ mutex_exit(&mg->mg_lock);
+ return (-1ULL);
+ }
+
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY)
+ break;
+
+ target_distance = min_distance +
+ (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
+
+ for (i = 0; i < d; i++)
+ if (metaslab_distance(msp, &dva[i]) <
+ target_distance)
+ break;
+ if (i == d)
+ break;
+ }
+ mutex_exit(&mg->mg_lock);
+ if (msp == NULL)
+ return (-1ULL);
+
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Ensure that the metaslab we have selected is still
+ * capable of handling our request. It's possible that
+ * another thread may have changed the weight while we
+ * were blocked on the metaslab lock.
+ */
+ if (msp->ms_weight < size) {
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
+ activation_weight == METASLAB_WEIGHT_PRIMARY) {
+ metaslab_passivate(msp,
+ msp->ms_weight & ~METASLAB_ACTIVE_MASK);
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ if (metaslab_activate(msp, activation_weight) != 0) {
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
+ break;
+
+ metaslab_passivate(msp, size - 1);
+
+ mutex_exit(&msp->ms_lock);
+ }
+
+ if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+ vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
+
+ space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+
+ mutex_exit(&msp->ms_lock);
+
+ return (offset);
+}
+
+/*
+ * Allocate a block for the specified i/o.
+ */
+static int
+metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
+ dva_t *hintdva, uint64_t txg, boolean_t hintdva_avoid)
+{
+ metaslab_group_t *mg, *rotor;
+ metaslab_class_t *mc;
+ vdev_t *vd;
+ int dshift = 3;
+ int all_zero;
+ uint64_t offset = -1ULL;
+ uint64_t asize;
+ uint64_t distance;
+
+ ASSERT(!DVA_IS_VALID(&dva[d]));
+
+ mc = spa_metaslab_class_select(spa);
+
+ /*
+ * Start at the rotor and loop through all mgs until we find something.
+ * Note that there's no locking on mc_rotor or mc_allocated because
+ * nothing actually breaks if we miss a few updates -- we just won't
+ * allocate quite as evenly. It all balances out over time.
+ *
+ * If we are doing ditto or log blocks, try to spread them across
+ * consecutive vdevs. If we're forced to reuse a vdev before we've
+ * allocated all of our ditto blocks, then try and spread them out on
+ * that vdev as much as possible. If it turns out to not be possible,
+ * gradually lower our standards until anything becomes acceptable.
+ * Also, allocating on consecutive vdevs (as opposed to random vdevs)
+ * gives us hope of containing our fault domains to something we're
+ * able to reason about. Otherwise, any two top-level vdev failures
+ * will guarantee the loss of data. With consecutive allocation,
+ * only two adjacent top-level vdev failures will result in data loss.
+ *
+ * If we are doing gang blocks (hintdva is non-NULL), try to keep
+ * ourselves on the same vdev as our gang block header. That
+ * way, we can hope for locality in vdev_cache, plus it makes our
+ * fault domains something tractable.
+ */
+ if (hintdva) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
+ if (hintdva_avoid)
+ mg = vd->vdev_mg->mg_next;
+ else
+ mg = vd->vdev_mg;
+ } else if (d != 0) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
+ mg = vd->vdev_mg->mg_next;
+ } else {
+ mg = mc->mc_rotor;
+ }
+ rotor = mg;
+
+top:
+ all_zero = B_TRUE;
+ do {
+ vd = mg->mg_vd;
+
+ distance = vd->vdev_asize >> dshift;
+ if (distance <= (1ULL << vd->vdev_ms_shift))
+ distance = 0;
+ else
+ all_zero = B_FALSE;
+
+ asize = vdev_psize_to_asize(vd, psize);
+ ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
+
+ offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+ if (offset != -1ULL) {
+ /*
+ * If we've just selected this metaslab group,
+ * figure out whether the corresponding vdev is
+ * over- or under-used relative to the pool,
+ * and set an allocation bias to even it out.
+ */
+ if (mc->mc_allocated == 0) {
+ vdev_stat_t *vs = &vd->vdev_stat;
+ uint64_t alloc, space;
+ int64_t vu, su;
+
+ alloc = spa_get_alloc(spa);
+ space = spa_get_space(spa);
+
+ /*
+ * Determine percent used in units of 0..1024.
+ * (This is just to avoid floating point.)
+ */
+ vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
+ su = (alloc << 10) / (space + 1);
+
+ /*
+ * Bias by at most +/- 25% of the aliquot.
+ */
+ mg->mg_bias = ((su - vu) *
+ (int64_t)mg->mg_aliquot) / (1024 * 4);
+ }
+
+ if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
+ mg->mg_aliquot + mg->mg_bias) {
+ mc->mc_rotor = mg->mg_next;
+ mc->mc_allocated = 0;
+ }
+
+ DVA_SET_VDEV(&dva[d], vd->vdev_id);
+ DVA_SET_OFFSET(&dva[d], offset);
+ DVA_SET_GANG(&dva[d], 0);
+ DVA_SET_ASIZE(&dva[d], asize);
+
+ return (0);
+ }
+ mc->mc_rotor = mg->mg_next;
+ mc->mc_allocated = 0;
+ } while ((mg = mg->mg_next) != rotor);
+
+ if (!all_zero) {
+ dshift++;
+ ASSERT(dshift < 64);
+ goto top;
+ }
+
+ bzero(&dva[d], sizeof (dva_t));
+
+ return (ENOSPC);
+}
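
The bias arithmetic in the success path above compares per-vdev and pool-wide utilization in units of 1/1024 and shifts at most a quarter of the aliquot toward the emptier side. The same computation with hypothetical utilization figures:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            /* Hypothetical vdev: 30% full; pool overall: 50% full. */
            uint64_t vs_alloc = 30, vs_space = 100;
            uint64_t alloc = 500, space = 1000;
            int64_t aliquot = 512 * 1024;   /* metaslab_aliquot default */

            int64_t vu = (int64_t)((vs_alloc << 10) / (vs_space + 1));
            int64_t su = (int64_t)((alloc << 10) / (space + 1));
            int64_t bias = ((su - vu) * aliquot) / (1024 * 4);

            /* Positive bias: the under-used vdev gets a larger share. */
            printf("vu=%lld su=%lld bias=%lld bytes (max +/- %lld)\n",
                (long long)vu, (long long)su, (long long)bias,
                (long long)(aliquot / 4));
            return (0);
    }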
+
+/*
+ * Free the block represented by DVA in the context of the specified
+ * transaction group.
+ */
+static void
+metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
+{
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd;
+ metaslab_t *msp;
+
+ ASSERT(DVA_IS_VALID(dva));
+
+ if (txg > spa_freeze_txg(spa))
+ return;
+
+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
+ (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
+ cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
+ (u_longlong_t)vdev, (u_longlong_t)offset);
+ ASSERT(0);
+ return;
+ }
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ if (DVA_GET_GANG(dva))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ mutex_enter(&msp->ms_lock);
+
+ if (now) {
+ space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
+ offset, size);
+ space_map_free(&msp->ms_map, offset, size);
+ } else {
+ if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
+ vdev_dirty(vd, VDD_METASLAB, msp, txg);
+ space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
+
+ /*
+ * verify that this region is actually allocated in
+ * either a ms_allocmap or the ms_map
+ */
+ if (msp->ms_map.sm_loaded) {
+ boolean_t allocd = B_FALSE;
+ int i;
+
+ if (!space_map_contains(&msp->ms_map, offset, size)) {
+ allocd = B_TRUE;
+ } else {
+ for (i = 0; i < TXG_CONCURRENT_STATES; i++) {
+ space_map_t *sm = &msp->ms_allocmap
+ [(txg - i) & TXG_MASK];
+ if (space_map_contains(sm,
+ offset, size)) {
+ allocd = B_TRUE;
+ break;
+ }
+ }
+ }
+
+ if (!allocd) {
+ zfs_panic_recover("freeing free segment "
+ "(vdev=%llu offset=%llx size=%llx)",
+ (longlong_t)vdev, (longlong_t)offset,
+ (longlong_t)size);
+ }
+ }
+ }
+
+ mutex_exit(&msp->ms_lock);
+}
+
+/*
+ * Intent log support: upon opening the pool after a crash, notify the SPA
+ * of blocks that the intent log has allocated for immediate write, but
+ * which are still considered free by the SPA because the last transaction
+ * group didn't commit yet.
+ */
+static int
+metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+{
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd;
+ metaslab_t *msp;
+ int error;
+
+ ASSERT(DVA_IS_VALID(dva));
+
+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
+ (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
+ return (ENXIO);
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ if (DVA_GET_GANG(dva))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ mutex_enter(&msp->ms_lock);
+
+ error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
+ if (error) {
+ mutex_exit(&msp->ms_lock);
+ return (error);
+ }
+
+ if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+ vdev_dirty(vd, VDD_METASLAB, msp, txg);
+
+ space_map_claim(&msp->ms_map, offset, size);
+ space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+
+ mutex_exit(&msp->ms_lock);
+
+ return (0);
+}
+
+int
+metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas,
+ uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid)
+{
+ dva_t *dva = bp->blk_dva;
+ dva_t *hintdva = (hintbp == NULL ? NULL : hintbp->blk_dva);
+ int d;
+ int error = 0;
+
+ ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
+ ASSERT(BP_GET_NDVAS(bp) == 0);
+ ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
+
+ for (d = 0; d < ndvas; d++) {
+ error = metaslab_alloc_dva(spa, psize, dva, d, hintdva,
+ txg, hintbp_avoid);
+ if (error) {
+ for (d--; d >= 0; d--) {
+ metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
+ bzero(&dva[d], sizeof (dva_t));
+ }
+ return (error);
+ }
+ }
+ ASSERT(error == 0);
+ ASSERT(BP_GET_NDVAS(bp) == ndvas);
+
+ return (0);
+}
+
+void
+metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+ int d;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ for (d = 0; d < ndvas; d++)
+ metaslab_free_dva(spa, &dva[d], txg, now);
+}
+
+int
+metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+ int d, error;
+ int last_error = 0;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ for (d = 0; d < ndvas; d++)
+ if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
+ last_error = error;
+
+ return (last_error);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
new file mode 100644
index 0000000..411ed46
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
@@ -0,0 +1,194 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+#if defined(DEBUG) || !defined(_KERNEL)
+
+#ifdef _KERNEL
+int reference_tracking_enable = FALSE; /* runs out of memory too easily */
+#else
+int reference_tracking_enable = TRUE;
+#endif
+int reference_history = 4; /* tunable */
+
+static kmem_cache_t *reference_cache;
+static kmem_cache_t *reference_history_cache;
+
+void
+refcount_init(void)
+{
+ reference_cache = kmem_cache_create("reference_cache",
+ sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ reference_history_cache = kmem_cache_create("reference_history_cache",
+ sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+refcount_fini(void)
+{
+ kmem_cache_destroy(reference_cache);
+ kmem_cache_destroy(reference_history_cache);
+}
+
+void
+refcount_create(refcount_t *rc)
+{
+ list_create(&rc->rc_list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ list_create(&rc->rc_removed, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+refcount_destroy_many(refcount_t *rc, uint64_t number)
+{
+ reference_t *ref;
+
+ ASSERT(rc->rc_count == number);
+ while ((ref = list_head(&rc->rc_list)) != NULL) {
+ list_remove(&rc->rc_list, ref);
+ kmem_cache_free(reference_cache, ref);
+ }
+ list_destroy(&rc->rc_list);
+
+ while ((ref = list_head(&rc->rc_removed)) != NULL) {
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache, ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ }
+ list_destroy(&rc->rc_removed);
+ mutex_destroy(&rc->rc_mtx);
+}
+
+void
+refcount_destroy(refcount_t *rc)
+{
+ refcount_destroy_many(rc, 0);
+}
+
+int
+refcount_is_zero(refcount_t *rc)
+{
+ ASSERT(rc->rc_count >= 0);
+ return (rc->rc_count == 0);
+}
+
+int64_t
+refcount_count(refcount_t *rc)
+{
+ ASSERT(rc->rc_count >= 0);
+ return (rc->rc_count);
+}
+
+int64_t
+refcount_add_many(refcount_t *rc, uint64_t number, void *holder)
+{
+ reference_t *ref;
+ int64_t count;
+
+ if (reference_tracking_enable) {
+ ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
+ ref->ref_holder = holder;
+ ref->ref_number = number;
+ }
+ mutex_enter(&rc->rc_mtx);
+ ASSERT(rc->rc_count >= 0);
+ if (reference_tracking_enable)
+ list_insert_head(&rc->rc_list, ref);
+ rc->rc_count += number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+
+ return (count);
+}
+
+int64_t
+refcount_add(refcount_t *rc, void *holder)
+{
+ return (refcount_add_many(rc, 1, holder));
+}
+
+int64_t
+refcount_remove_many(refcount_t *rc, uint64_t number, void *holder)
+{
+ reference_t *ref;
+ int64_t count;
+
+ mutex_enter(&rc->rc_mtx);
+ ASSERT(rc->rc_count >= number);
+
+ if (!reference_tracking_enable) {
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder && ref->ref_number == number) {
+ list_remove(&rc->rc_list, ref);
+ if (reference_history > 0) {
+ ref->ref_removed =
+ kmem_cache_alloc(reference_history_cache,
+ KM_SLEEP);
+ list_insert_head(&rc->rc_removed, ref);
+ rc->rc_removed_count++;
+ if (rc->rc_removed_count >= reference_history) {
+ ref = list_tail(&rc->rc_removed);
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache,
+ ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ rc->rc_removed_count--;
+ }
+ } else {
+ kmem_cache_free(reference_cache, ref);
+ }
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+ }
+ panic("No such hold %p on refcount %llx", holder,
+ (u_longlong_t)(uintptr_t)rc);
+ return (-1);
+}
+
+int64_t
+refcount_remove(refcount_t *rc, void *holder)
+{
+ return (refcount_remove_many(rc, 1, holder));
+}
+
+#endif
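
The debug build pairs every hold with its holder pointer, so a remove must name exactly the holder that added the reference; anything else is a programming error. A toy userland model of that discipline (hypothetical toy_* names; the real code panics where this aborts):

    #include <stdio.h>
    #include <stdlib.h>

    struct hold {
            void *holder;
            struct hold *next;
    };

    struct toy_refcount {
            long count;
            struct hold *holds;
    };

    static void
    toy_add(struct toy_refcount *rc, void *holder)
    {
            struct hold *h = malloc(sizeof (*h));

            h->holder = holder;
            h->next = rc->holds;
            rc->holds = h;
            rc->count++;
    }

    static void
    toy_remove(struct toy_refcount *rc, void *holder)
    {
            struct hold **hp;

            for (hp = &rc->holds; *hp != NULL; hp = &(*hp)->next) {
                    if ((*hp)->holder == holder) {
                            struct hold *h = *hp;

                            *hp = h->next;
                            free(h);
                            rc->count--;
                            return;
                    }
            }
            fprintf(stderr, "no such hold %p\n", holder);
            abort();        /* the kernel version panics here */
    }

    int
    main(void)
    {
            struct toy_refcount rc = { 0, NULL };
            int tag1, tag2;

            toy_add(&rc, &tag1);
            toy_add(&rc, &tag2);
            toy_remove(&rc, &tag1);
            printf("count = %ld\n", rc.count);      /* prints 1 */
            toy_remove(&rc, &tag2);
            return (0);
    }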
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
new file mode 100644
index 0000000..ce5c261
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
@@ -0,0 +1,131 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * SHA-256 checksum, as specified in FIPS 180-2, available at:
+ * http://csrc.nist.gov/cryptval
+ *
+ * This is a very compact implementation of SHA-256.
+ * It is designed to be simple and portable, not to be fast.
+ */
+
+/*
+ * The literal definitions according to FIPS180-2 would be:
+ *
+ * Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
+ * Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
+ *
+ * We use logical equivalents which require one less op.
+ */
+#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y))))
+#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s)))
+#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22))
+#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25))
+#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3))
+#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10))
+
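
The equivalences claimed above are easy to verify: Ch and Maj are bitwise, so checking all eight combinations of a single bit position proves them for whole words. A brute-force check, where ChRef/MajRef are the literal FIPS 180-2 forms quoted in the comment:

    #include <stdio.h>
    #include <stdint.h>

    #define Ch(x, y, z)     ((z) ^ ((x) & ((y) ^ (z))))
    #define Maj(x, y, z)    (((x) & (y)) ^ ((z) & ((x) ^ (y))))
    #define ChRef(x, y, z)  (((x) & (y)) ^ ((~(x)) & (z)))
    #define MajRef(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))

    int
    main(void)
    {
            uint32_t x, y, z;
            int ok = 1;

            /* Bitwise functions: all 8 one-bit combinations suffice. */
            for (x = 0; x < 2; x++)
                    for (y = 0; y < 2; y++)
                            for (z = 0; z < 2; z++) {
                                    if ((Ch(x, y, z) & 1) !=
                                        (ChRef(x, y, z) & 1))
                                            ok = 0;
                                    if ((Maj(x, y, z) & 1) !=
                                        (MajRef(x, y, z) & 1))
                                            ok = 0;
                            }
            printf("%s\n", ok ? "equivalent" : "MISMATCH");
            return (0);
    }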
+static const uint32_t SHA256_K[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+static void
+SHA256Transform(uint32_t *H, const uint8_t *cp)
+{
+ uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64];
+
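+	/* Load the 64-byte block into W[0..15] as big-endian 32-bit words. */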
+ for (t = 0; t < 16; t++, cp += 4)
+ W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3];
+
+ for (t = 16; t < 64; t++)
+ W[t] = sigma1(W[t - 2]) + W[t - 7] +
+ sigma0(W[t - 15]) + W[t - 16];
+
+ a = H[0]; b = H[1]; c = H[2]; d = H[3];
+ e = H[4]; f = H[5]; g = H[6]; h = H[7];
+
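+	/* The 64 compression rounds, per FIPS 180-2 section 6.2.2. */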
+ for (t = 0; t < 64; t++) {
+ T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t];
+ T2 = SIGMA0(a) + Maj(a, b, c);
+ h = g; g = f; f = e; e = d + T1;
+ d = c; c = b; b = a; a = T1 + T2;
+ }
+
+ H[0] += a; H[1] += b; H[2] += c; H[3] += d;
+ H[4] += e; H[5] += f; H[6] += g; H[7] += h;
+}
+
+void
+zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+ 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
+ uint8_t pad[128];
+ int padsize = size & 63;
+ int i;
+
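+	/* Process each complete 64-byte block of the input. */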
+ for (i = 0; i < size - padsize; i += 64)
+ SHA256Transform(H, (uint8_t *)buf + i);
+
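+	/*
+	 * Pad the tail per FIPS 180-2 section 5.1.1: copy the leftover
+	 * bytes, append a 0x80 byte, zero-fill to 56 mod 64, and append
+	 * the message length in bits as a big-endian 64-bit value.
+	 */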
+ for (i = 0; i < padsize; i++)
+ pad[i] = ((uint8_t *)buf)[i];
+
+ for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
+ pad[padsize] = 0;
+
+ for (i = 0; i < 8; i++)
+ pad[padsize++] = (size << 3) >> (56 - 8 * i);
+
+ for (i = 0; i < padsize; i += 64)
+ SHA256Transform(H, pad + i);
+
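+	/* Pack the eight 32-bit hash words into the 256-bit checksum. */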
+ ZIO_SET_CHECKSUM(zcp,
+ (uint64_t)H[0] << 32 | H[1],
+ (uint64_t)H[2] << 32 | H[3],
+ (uint64_t)H[4] << 32 | H[5],
+ (uint64_t)H[6] << 32 | H[7]);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
new file mode 100644
index 0000000..c218f72
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -0,0 +1,3265 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This file contains all the routines used when modifying on-disk SPA state.
+ * This includes opening, importing, destroying, exporting a pool, and syncing a
+ * pool.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_objset.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/fs/zfs.h>
+#include <sys/callb.h>
+
+int zio_taskq_threads = 8;
+
+/*
+ * ==========================================================================
+ * SPA state manipulation (open/create/destroy/import/export)
+ * ==========================================================================
+ */
+
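+/*
+ * Comparator for the error-list AVL trees; it orders spa_error_entry_t
+ * nodes by the raw bytes of their bookmarks.
+ */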
+static int
+spa_error_entry_compare(const void *a, const void *b)
+{
+ spa_error_entry_t *sa = (spa_error_entry_t *)a;
+ spa_error_entry_t *sb = (spa_error_entry_t *)b;
+ int ret;
+
+ ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
+ sizeof (zbookmark_t));
+
+ if (ret < 0)
+ return (-1);
+ else if (ret > 0)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * Utility function which retrieves copies of the current logs and
+ * re-initializes them in the process.
+ */
+void
+spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
+{
+ ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
+
+ bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
+ bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
+
+ avl_create(&spa->spa_errlist_scrub,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_last,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+}
+
+/*
+ * Activate an uninitialized pool.
+ */
+static void
+spa_activate(spa_t *spa)
+{
+ int t;
+
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+
+ spa->spa_state = POOL_STATE_ACTIVE;
+
+ spa->spa_normal_class = metaslab_class_create();
+
+ for (t = 0; t < ZIO_TYPES; t++) {
+ spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
+ zio_taskq_threads, maxclsyspri, 50, INT_MAX,
+ TASKQ_PREPOPULATE);
+ spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
+ zio_taskq_threads, maxclsyspri, 50, INT_MAX,
+ TASKQ_PREPOPULATE);
+ }
+
+ rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);
+
+ mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ list_create(&spa->spa_dirty_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_dirty_node));
+
+ txg_list_create(&spa->spa_vdev_txg_list,
+ offsetof(struct vdev, vdev_txg_node));
+
+ avl_create(&spa->spa_errlist_scrub,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_last,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+}
+
+/*
+ * Opposite of spa_activate().
+ */
+static void
+spa_deactivate(spa_t *spa)
+{
+ int t;
+
+ ASSERT(spa->spa_sync_on == B_FALSE);
+ ASSERT(spa->spa_dsl_pool == NULL);
+ ASSERT(spa->spa_root_vdev == NULL);
+
+ ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
+
+ txg_list_destroy(&spa->spa_vdev_txg_list);
+
+ list_destroy(&spa->spa_dirty_list);
+
+ for (t = 0; t < ZIO_TYPES; t++) {
+ taskq_destroy(spa->spa_zio_issue_taskq[t]);
+ taskq_destroy(spa->spa_zio_intr_taskq[t]);
+ spa->spa_zio_issue_taskq[t] = NULL;
+ spa->spa_zio_intr_taskq[t] = NULL;
+ }
+
+ metaslab_class_destroy(spa->spa_normal_class);
+ spa->spa_normal_class = NULL;
+
+ /*
+ * If this was part of an import or the open otherwise failed, we may
+ * still have errors left in the queues. Empty them just in case.
+ */
+ spa_errlog_drain(spa);
+
+ avl_destroy(&spa->spa_errlist_scrub);
+ avl_destroy(&spa->spa_errlist_last);
+
+ rw_destroy(&spa->spa_traverse_lock);
+ mutex_destroy(&spa->spa_uberblock_lock);
+ mutex_destroy(&spa->spa_errlog_lock);
+ mutex_destroy(&spa->spa_errlist_lock);
+ mutex_destroy(&spa->spa_config_lock.scl_lock);
+ cv_destroy(&spa->spa_config_lock.scl_cv);
+ mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
+ mutex_destroy(&spa->spa_history_lock);
+ mutex_destroy(&spa->spa_props_lock);
+
+ spa->spa_state = POOL_STATE_UNINITIALIZED;
+}
+
+/*
+ * Verify a pool configuration, and construct the vdev tree appropriately. This
+ * will create all the necessary vdevs in the appropriate layout, with each vdev
+ * in the CLOSED state. This will prep the pool before open/creation/import.
+ * All vdev validation is done by the vdev_alloc() routine.
+ */
+static int
+spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
+ uint_t id, int atype)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ int error;
+
+ if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
+ return (error);
+
+ if ((*vdp)->vdev_ops->vdev_op_leaf)
+ return (0);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) {
+ vdev_free(*vdp);
+ *vdp = NULL;
+ return (EINVAL);
+ }
+
+ for (c = 0; c < children; c++) {
+ vdev_t *vd;
+ if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
+ atype)) != 0) {
+ vdev_free(*vdp);
+ *vdp = NULL;
+ return (error);
+ }
+ }
+
+ ASSERT(*vdp != NULL);
+
+ return (0);
+}
+
+/*
+ * Opposite of spa_load().
+ */
+static void
+spa_unload(spa_t *spa)
+{
+ int i;
+
+ /*
+ * Stop async tasks.
+ */
+ spa_async_suspend(spa);
+
+ /*
+ * Stop syncing.
+ */
+ if (spa->spa_sync_on) {
+ txg_sync_stop(spa->spa_dsl_pool);
+ spa->spa_sync_on = B_FALSE;
+ }
+
+ /*
+ * Wait for any outstanding prefetch I/O to complete.
+ */
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa_config_exit(spa, FTAG);
+
+ /*
+ * Close the dsl pool.
+ */
+ if (spa->spa_dsl_pool) {
+ dsl_pool_close(spa->spa_dsl_pool);
+ spa->spa_dsl_pool = NULL;
+ }
+
+ /*
+ * Close all vdevs.
+ */
+ if (spa->spa_root_vdev)
+ vdev_free(spa->spa_root_vdev);
+ ASSERT(spa->spa_root_vdev == NULL);
+
+ for (i = 0; i < spa->spa_nspares; i++)
+ vdev_free(spa->spa_spares[i]);
+ if (spa->spa_spares) {
+ kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
+ spa->spa_spares = NULL;
+ }
+ if (spa->spa_sparelist) {
+ nvlist_free(spa->spa_sparelist);
+ spa->spa_sparelist = NULL;
+ }
+
+ spa->spa_async_suspended = 0;
+}
+
+/*
+ * Load (or re-load) the current list of vdevs describing the active spares for
+ * this pool. When this is called, we have some form of basic information in
+ * 'spa_sparelist'. We parse this into vdevs, try to open them, and then
+ * re-generate a more complete list including status information.
+ */
+static void
+spa_load_spares(spa_t *spa)
+{
+ nvlist_t **spares;
+ uint_t nspares;
+ int i;
+ vdev_t *vd, *tvd;
+
+ /*
+ * First, close and free any existing spare vdevs.
+ */
+ for (i = 0; i < spa->spa_nspares; i++) {
+ vd = spa->spa_spares[i];
+
+		/* Undo the calls to spa_spare_add()/spa_spare_activate() below */
+ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
+ tvd->vdev_isspare)
+ spa_spare_remove(tvd);
+ vdev_close(vd);
+ vdev_free(vd);
+ }
+
+ if (spa->spa_spares)
+ kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
+
+ if (spa->spa_sparelist == NULL)
+ nspares = 0;
+ else
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+
+ spa->spa_nspares = (int)nspares;
+ spa->spa_spares = NULL;
+
+ if (nspares == 0)
+ return;
+
+ /*
+ * Construct the array of vdevs, opening them to get status in the
+	 * process. For each spare, there are potentially two different vdev_t
+ * structures associated with it: one in the list of spares (used only
+ * for basic validation purposes) and one in the active vdev
+ * configuration (if it's spared in). During this phase we open and
+ * validate each vdev on the spare list. If the vdev also exists in the
+ * active configuration, then we also mark this vdev as an active spare.
+ */
+ spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
+ for (i = 0; i < spa->spa_nspares; i++) {
+ VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
+ VDEV_ALLOC_SPARE) == 0);
+ ASSERT(vd != NULL);
+
+ spa->spa_spares[i] = vd;
+
+ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
+ if (!tvd->vdev_isspare)
+ spa_spare_add(tvd);
+
+ /*
+ * We only mark the spare active if we were successfully
+ * able to load the vdev. Otherwise, importing a pool
+ * with a bad active spare would result in strange
+	 * behavior, because multiple pools would think the spare
+ * is actively in use.
+ *
+ * There is a vulnerability here to an equally bizarre
+ * circumstance, where a dead active spare is later
+ * brought back to life (onlined or otherwise). Given
+ * the rarity of this scenario, and the extra complexity
+ * it adds, we ignore the possibility.
+ */
+ if (!vdev_is_dead(tvd))
+ spa_spare_activate(tvd);
+ }
+
+ if (vdev_open(vd) != 0)
+ continue;
+
+ vd->vdev_top = vd;
+ (void) vdev_validate_spare(vd);
+ }
+
+ /*
+ * Recompute the stashed list of spares, with status information
+ * this time.
+ */
+ VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
+ for (i = 0; i < spa->spa_nspares; i++)
+ spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
+ B_TRUE, B_TRUE);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
+ spares, spa->spa_nspares) == 0);
+ for (i = 0; i < spa->spa_nspares; i++)
+ nvlist_free(spares[i]);
+ kmem_free(spares, spa->spa_nspares * sizeof (void *));
+}
+
+static int
+load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
+{
+ dmu_buf_t *db;
+ char *packed = NULL;
+ size_t nvsize = 0;
+ int error;
+ *value = NULL;
+
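+	/* The object's bonus buffer records the size of the packed nvlist. */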
+ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
+ nvsize = *(uint64_t *)db->db_data;
+ dmu_buf_rele(db, FTAG);
+
+ packed = kmem_alloc(nvsize, KM_SLEEP);
+ error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
+ if (error == 0)
+ error = nvlist_unpack(packed, nvsize, value, 0);
+ kmem_free(packed, nvsize);
+
+ return (error);
+}
+
+/*
+ * Load an existing storage pool, using the pool's builtin spa_config as a
+ * source of configuration information.
+ */
+static int
+spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
+{
+ int error = 0;
+ nvlist_t *nvroot = NULL;
+ vdev_t *rvd;
+ uberblock_t *ub = &spa->spa_uberblock;
+ uint64_t config_cache_txg = spa->spa_config_txg;
+ uint64_t pool_guid;
+ uint64_t version;
+ zio_t *zio;
+
+ spa->spa_load_state = state;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
+ nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * Versioning wasn't explicitly added to the label until later, so if
+	 * it's not present, treat it as the initial version.
+ */
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
+ version = ZFS_VERSION_INITIAL;
+
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ &spa->spa_config_txg);
+
+ if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
+ spa_guid_exists(pool_guid, 0)) {
+ error = EEXIST;
+ goto out;
+ }
+
+ spa->spa_load_guid = pool_guid;
+
+ /*
+ * Parse the configuration into a vdev tree. We explicitly set the
+ * value that will be returned by spa_version() since parsing the
+ * configuration requires knowing the version number.
+ */
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa->spa_ubsync.ub_version = version;
+ error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
+ spa_config_exit(spa, FTAG);
+
+ if (error != 0)
+ goto out;
+
+ ASSERT(spa->spa_root_vdev == rvd);
+ ASSERT(spa_guid(spa) == pool_guid);
+
+ /*
+ * Try to open all vdevs, loading each label in the process.
+ */
+ if (vdev_open(rvd) != 0) {
+ error = ENXIO;
+ goto out;
+ }
+
+ /*
+ * Validate the labels for all leaf vdevs. We need to grab the config
+ * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
+ * flag.
+ */
+ spa_config_enter(spa, RW_READER, FTAG);
+ error = vdev_validate(rvd);
+ spa_config_exit(spa, FTAG);
+
+ if (error != 0) {
+ error = EBADF;
+ goto out;
+ }
+
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+ error = ENXIO;
+ goto out;
+ }
+
+ /*
+ * Find the best uberblock.
+ */
+ bzero(ub, sizeof (uberblock_t));
+
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+ vdev_uberblock_load(zio, rvd, ub);
+ error = zio_wait(zio);
+
+ /*
+ * If we weren't able to find a single valid uberblock, return failure.
+ */
+ if (ub->ub_txg == 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = ENXIO;
+ goto out;
+ }
+
+ /*
+ * If the pool is newer than the code, we can't open it.
+ */
+ if (ub->ub_version > ZFS_VERSION) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_VERSION_NEWER);
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /*
+ * If the vdev guid sum doesn't match the uberblock, we have an
+ * incomplete configuration.
+ */
+ if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_GUID_SUM);
+ error = ENXIO;
+ goto out;
+ }
+
+ /*
+ * Initialize internal SPA structures.
+ */
+ spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
+ error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+ if (error) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ goto out;
+ }
+ spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+
+ if (zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
+ sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ if (!mosconfig) {
+ nvlist_t *newconfig;
+
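+		/*
+		 * The config used so far came from the vdev labels and is
+		 * untrusted; fetch the authoritative copy stored in the MOS
+		 * and restart the load with it.
+		 */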
+ if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ spa_config_set(spa, newconfig);
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_activate(spa);
+
+ return (spa_load(spa, newconfig, state, B_TRUE));
+ }
+
+ if (zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
+ sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ /*
+ * Load the bit that tells us to use the new accounting function
+ * (raid-z deflation). If we have an older pool, this will not
+ * be present.
+ */
+ error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate);
+ if (error != 0 && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ /*
+ * Load the persistent error log. If we have an older pool, this will
+ * not be present.
+ */
+ error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
+ sizeof (uint64_t), 1, &spa->spa_errlog_last);
+ if (error != 0 && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
+ sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
+ if (error != 0 && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ /*
+ * Load the history object. If we have an older pool, this
+ * will not be present.
+ */
+ error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
+ sizeof (uint64_t), 1, &spa->spa_history);
+ if (error != 0 && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ /*
+ * Load any hot spares for this pool.
+ */
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
+ if (error != 0 && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+ if (error == 0) {
+ ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
+ if (load_nvlist(spa, spa->spa_spares_object,
+ &spa->spa_sparelist) != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa_load_spares(spa);
+ spa_config_exit(spa, FTAG);
+ }
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
+
+ if (error && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ if (error == 0) {
+ (void) zap_lookup(spa->spa_meta_objset,
+ spa->spa_pool_props_object,
+ zpool_prop_to_name(ZFS_PROP_BOOTFS),
+ sizeof (uint64_t), 1, &spa->spa_bootfs);
+ }
+
+ /*
+ * Load the vdev state for all toplevel vdevs.
+ */
+ vdev_load(rvd);
+
+ /*
+ * Propagate the leaf DTLs we just loaded all the way up the tree.
+ */
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
+ spa_config_exit(spa, FTAG);
+
+ /*
+ * Check the state of the root vdev. If it can't be opened, it
+ * indicates one or more toplevel vdevs are faulted.
+ */
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+ error = ENXIO;
+ goto out;
+ }
+
+ if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
+ dmu_tx_t *tx;
+ int need_update = B_FALSE;
+ int c;
+
+ /*
+ * Claim log blocks that haven't been committed yet.
+ * This must all happen in a single txg.
+ */
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa),
+ spa_first_txg(spa));
+ (void) dmu_objset_find(spa->spa_name,
+ zil_claim, tx, DS_FIND_CHILDREN);
+ dmu_tx_commit(tx);
+
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(spa->spa_dsl_pool);
+
+ /*
+ * Wait for all claims to sync.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ /*
+ * If the config cache is stale, or we have uninitialized
+ * metaslabs (see spa_vdev_add()), then update the config.
+ */
+ if (config_cache_txg != spa->spa_config_txg ||
+ state == SPA_LOAD_IMPORT)
+ need_update = B_TRUE;
+
+ for (c = 0; c < rvd->vdev_children; c++)
+ if (rvd->vdev_child[c]->vdev_ms_array == 0)
+ need_update = B_TRUE;
+
+ /*
+		 * Update the config cache asynchronously in case we're the
+ * root pool, in which case the config cache isn't writable yet.
+ */
+ if (need_update)
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+ }
+
+ error = 0;
+out:
+ if (error && error != EBADF)
+ zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
+ spa->spa_load_state = SPA_LOAD_NONE;
+ spa->spa_ena = 0;
+
+ return (error);
+}
+
+/*
+ * Pool Open/Import
+ *
+ * The import case is identical to an open except that the configuration is sent
+ * down from userland, instead of grabbed from the configuration cache. For the
+ * case of an open, the pool configuration will exist in the
+ * POOL_STATE_UNINITIALIZED state.
+ *
+ * The stats information (gen/count/ustats) is used to gather vdev statistics at
+ * the same time we open the pool, without having to keep the spa_t around in some
+ * ambiguous state.
+ */
+static int
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
+{
+ spa_t *spa;
+ int error;
+ int loaded = B_FALSE;
+ int locked = B_FALSE;
+
+ *spapp = NULL;
+
+ /*
+ * As disgusting as this is, we need to support recursive calls to this
+ * function because dsl_dir_open() is called during spa_load(), and ends
+ * up calling spa_open() again. The real fix is to figure out how to
+ * avoid dsl_dir_open() calling this in the first place.
+ */
+ if (mutex_owner(&spa_namespace_lock) != curthread) {
+ mutex_enter(&spa_namespace_lock);
+ locked = B_TRUE;
+ }
+
+ if ((spa = spa_lookup(pool)) == NULL) {
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (ENOENT);
+ }
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+
+ spa_activate(spa);
+
+ error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
+
+ if (error == EBADF) {
+ /*
+			 * If vdev_validate() returns failure (indicated by
+			 * EBADF), it means that one of the vdevs reports that
+			 * the pool has been exported or destroyed. If
+ * this is the case, the config cache is out of sync and
+ * we should remove the pool from the namespace.
+ */
+ zfs_post_ok(spa, NULL);
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ spa_config_sync();
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (ENOENT);
+ }
+
+ if (error) {
+ /*
+ * We can't open the pool, but we still have useful
+ * information: the state of each vdev after the
+ * attempted vdev_open(). Return this to the user.
+ */
+ if (config != NULL && spa->spa_root_vdev != NULL) {
+ spa_config_enter(spa, RW_READER, FTAG);
+ *config = spa_config_generate(spa, NULL, -1ULL,
+ B_TRUE);
+ spa_config_exit(spa, FTAG);
+ }
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa->spa_last_open_failed = B_TRUE;
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ *spapp = NULL;
+ return (error);
+ } else {
+ zfs_post_ok(spa, NULL);
+ spa->spa_last_open_failed = B_FALSE;
+ }
+
+ loaded = B_TRUE;
+ }
+
+ spa_open_ref(spa, tag);
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+
+ *spapp = spa;
+
+ if (config != NULL) {
+ spa_config_enter(spa, RW_READER, FTAG);
+ *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+ spa_config_exit(spa, FTAG);
+ }
+
+ /*
+ * If we just loaded the pool, resilver anything that's out of date.
+ */
+ if (loaded && (spa_mode & FWRITE))
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+int
+spa_open(const char *name, spa_t **spapp, void *tag)
+{
+ return (spa_open_common(name, spapp, tag, NULL));
+}
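+
+/*
+ * Typical consumer pattern (illustrative sketch only):
+ *
+ *	spa_t *spa;
+ *	int error;
+ *
+ *	if ((error = spa_open("tank", &spa, FTAG)) != 0)
+ *		return (error);
+ *	... operate on the open pool ...
+ *	spa_close(spa, FTAG);
+ */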
+
+/*
+ * Lookup the given spa_t, incrementing the inject count in the process,
+ * preventing it from being exported or destroyed.
+ */
+spa_t *
+spa_inject_addref(char *name)
+{
+ spa_t *spa;
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(name)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (NULL);
+ }
+ spa->spa_inject_ref++;
+ mutex_exit(&spa_namespace_lock);
+
+ return (spa);
+}
+
+void
+spa_inject_delref(spa_t *spa)
+{
+ mutex_enter(&spa_namespace_lock);
+ spa->spa_inject_ref--;
+ mutex_exit(&spa_namespace_lock);
+}
+
+static void
+spa_add_spares(spa_t *spa, nvlist_t *config)
+{
+ nvlist_t **spares;
+ uint_t i, nspares;
+ nvlist_t *nvroot;
+ uint64_t guid;
+ vdev_stat_t *vs;
+ uint_t vsc;
+ uint64_t pool;
+
+ if (spa->spa_nspares == 0)
+ return;
+
+ VERIFY(nvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+ if (nspares != 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(nvroot,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+
+ /*
+ * Go through and find any spares which have since been
+		 * repurposed as active spares.  If this is the case, update
+ * their status appropriately.
+ */
+ for (i = 0; i < nspares; i++) {
+ VERIFY(nvlist_lookup_uint64(spares[i],
+ ZPOOL_CONFIG_GUID, &guid) == 0);
+ if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
+ VERIFY(nvlist_lookup_uint64_array(
+ spares[i], ZPOOL_CONFIG_STATS,
+ (uint64_t **)&vs, &vsc) == 0);
+ vs->vs_state = VDEV_STATE_CANT_OPEN;
+ vs->vs_aux = VDEV_AUX_SPARED;
+ }
+ }
+ }
+}
+
+int
+spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
+{
+ int error;
+ spa_t *spa;
+
+ *config = NULL;
+ error = spa_open_common(name, &spa, FTAG, config);
+
+ if (spa && *config != NULL) {
+ VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
+ spa_get_errlog_size(spa)) == 0);
+
+ spa_add_spares(spa, *config);
+ }
+
+ /*
+ * We want to get the alternate root even for faulted pools, so we cheat
+ * and call spa_lookup() directly.
+ */
+ if (altroot) {
+ if (spa == NULL) {
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_lookup(name);
+ if (spa)
+ spa_altroot(spa, altroot, buflen);
+ else
+ altroot[0] = '\0';
+ spa = NULL;
+ mutex_exit(&spa_namespace_lock);
+ } else {
+ spa_altroot(spa, altroot, buflen);
+ }
+ }
+
+ if (spa != NULL)
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+/*
+ * Validate that the 'spares' array is well formed. We must have an array of
+ * nvlists, each of which describes a valid leaf vdev.  If this is an import (mode
+ * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long
+ * as they are well-formed.
+ */
+static int
+spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
+{
+ nvlist_t **spares;
+ uint_t i, nspares;
+ vdev_t *vd;
+ int error;
+
+ /*
+ * It's acceptable to have no spares specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) != 0)
+ return (0);
+
+ if (nspares == 0)
+ return (EINVAL);
+
+ /*
+ * Make sure the pool is formatted with a version that supports hot
+ * spares.
+ */
+ if (spa_version(spa) < ZFS_VERSION_SPARES)
+ return (ENOTSUP);
+
+ /*
+ * Set the pending spare list so we correctly handle device in-use
+ * checking.
+ */
+ spa->spa_pending_spares = spares;
+ spa->spa_pending_nspares = nspares;
+
+ for (i = 0; i < nspares; i++) {
+ if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
+ mode)) != 0)
+ goto out;
+
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ vdev_free(vd);
+ error = EINVAL;
+ goto out;
+ }
+
+ vd->vdev_top = vd;
+
+ if ((error = vdev_open(vd)) == 0 &&
+ (error = vdev_label_init(vd, crtxg,
+ VDEV_LABEL_SPARE)) == 0) {
+ VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ }
+
+ vdev_free(vd);
+
+ if (error && mode != VDEV_ALLOC_SPARE)
+ goto out;
+ else
+ error = 0;
+ }
+
+out:
+ spa->spa_pending_spares = NULL;
+ spa->spa_pending_nspares = 0;
+ return (error);
+}
+
+/*
+ * Pool Creation
+ */
+int
+spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
+{
+ spa_t *spa;
+ vdev_t *rvd;
+ dsl_pool_t *dp;
+ dmu_tx_t *tx;
+ int c, error = 0;
+ uint64_t txg = TXG_INITIAL;
+ nvlist_t **spares;
+ uint_t nspares;
+
+ /*
+ * If this pool already exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(pool) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (EEXIST);
+ }
+
+ /*
+ * Allocate a new spa_t structure.
+ */
+ spa = spa_add(pool, altroot);
+ spa_activate(spa);
+
+ spa->spa_uberblock.ub_txg = txg - 1;
+ spa->spa_uberblock.ub_version = ZFS_VERSION;
+ spa->spa_ubsync = spa->spa_uberblock;
+
+ /*
+ * Create the root vdev.
+ */
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
+
+ ASSERT(error != 0 || rvd != NULL);
+ ASSERT(error != 0 || spa->spa_root_vdev == rvd);
+
+ if (error == 0 && rvd->vdev_children == 0)
+ error = EINVAL;
+
+ if (error == 0 &&
+ (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
+ (error = spa_validate_spares(spa, nvroot, txg,
+ VDEV_ALLOC_ADD)) == 0) {
+ for (c = 0; c < rvd->vdev_children; c++)
+ vdev_init(rvd->vdev_child[c], txg);
+ vdev_config_dirty(rvd);
+ }
+
+ spa_config_exit(spa, FTAG);
+
+ if (error != 0) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ /*
+ * Get the list of spares, if specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0) {
+ VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa_load_spares(spa);
+ spa_config_exit(spa, FTAG);
+ spa->spa_sync_spares = B_TRUE;
+ }
+
+ spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
+ spa->spa_meta_objset = dp->dp_meta_objset;
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ /*
+ * Create the pool config object.
+ */
+ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_PACKED_NVLIST, 1 << 14,
+ DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
+
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
+ sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add pool config");
+ }
+
+ /* Newly created pools are always deflated. */
+ spa->spa_deflate = TRUE;
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add deflate");
+ }
+
+ /*
+ * Create the deferred-free bplist object. Turn off compression
+ * because sync-to-convergence takes longer if the blocksize
+ * keeps changing.
+ */
+ spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
+ 1 << 14, tx);
+ dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
+ ZIO_COMPRESS_OFF, tx);
+
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
+ sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add bplist");
+ }
+
+ /*
+ * Create the pool's history object.
+ */
+ spa_history_create_obj(spa, tx);
+
+ dmu_tx_commit(tx);
+
+ spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(spa->spa_dsl_pool);
+
+ /*
+ * We explicitly wait for the first transaction to complete so that our
+ * bean counters are appropriately updated.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ spa_config_sync();
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Import the given pool into the system. We set up the necessary spa_t and
+ * then call spa_load() to do the dirty work.
+ */
+int
+spa_import(const char *pool, nvlist_t *config, const char *altroot)
+{
+ spa_t *spa;
+ int error;
+ nvlist_t *nvroot;
+ nvlist_t **spares;
+ uint_t nspares;
+
+ if (!(spa_mode & FWRITE))
+ return (EROFS);
+
+ /*
+ * If a pool with this name exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(pool) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (EEXIST);
+ }
+
+ /*
+ * Create and initialize the spa structure.
+ */
+ spa = spa_add(pool, altroot);
+ spa_activate(spa);
+
+ /*
+ * Pass off the heavy lifting to spa_load().
+ * Pass TRUE for mosconfig because the user-supplied config
+ * is actually the one to trust when doing an import.
+ */
+ error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ /*
+ * Toss any existing sparelist, as it doesn't have any validity anymore,
+ * and conflicts with spa_has_spare().
+ */
+ if (spa->spa_sparelist) {
+ nvlist_free(spa->spa_sparelist);
+ spa->spa_sparelist = NULL;
+ spa_load_spares(spa);
+ }
+
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ if (error == 0)
+ error = spa_validate_spares(spa, nvroot, -1ULL,
+ VDEV_ALLOC_SPARE);
+ spa_config_exit(spa, FTAG);
+
+ if (error != 0) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ /*
+ * Override any spares as specified by the user, as these may have
+ * correct device names/devids, etc.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0) {
+ if (spa->spa_sparelist)
+ VERIFY(nvlist_remove(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
+ else
+ VERIFY(nvlist_alloc(&spa->spa_sparelist,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa_load_spares(spa);
+ spa_config_exit(spa, FTAG);
+ spa->spa_sync_spares = B_TRUE;
+ }
+
+ /*
+ * Update the config cache to include the newly-imported pool.
+ */
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+
+ mutex_exit(&spa_namespace_lock);
+
+ /*
+ * Resilver anything that's out of date.
+ */
+ if (spa_mode & FWRITE)
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+/*
+ * This (illegal) pool name is used when temporarily importing a spa_t in order
+ * to get the vdev stats associated with the imported devices.
+ */
+#define TRYIMPORT_NAME "$import"
+
+nvlist_t *
+spa_tryimport(nvlist_t *tryconfig)
+{
+ nvlist_t *config = NULL;
+ char *poolname;
+ spa_t *spa;
+ uint64_t state;
+
+ if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
+ return (NULL);
+
+ if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
+ return (NULL);
+
+ /*
+ * Create and initialize the spa structure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_add(TRYIMPORT_NAME, NULL);
+ spa_activate(spa);
+
+ /*
+ * Pass off the heavy lifting to spa_load().
+ * Pass TRUE for mosconfig because the user-supplied config
+ * is actually the one to trust when doing an import.
+ */
+ (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
+
+ /*
+ * If 'tryconfig' was at least parsable, return the current config.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ spa_config_enter(spa, RW_READER, FTAG);
+ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+ spa_config_exit(spa, FTAG);
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
+ poolname) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ state) == 0);
+
+ /*
+ * Add the list of hot spares.
+ */
+ spa_add_spares(spa, config);
+ }
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+
+ return (config);
+}
+
+/*
+ * Pool export/destroy
+ *
+ * The act of destroying or exporting a pool is very simple. We make sure there
+ * is no more pending I/O and any references to the pool are gone. Then, we
+ * update the pool state and sync all the labels to disk, removing the
+ * configuration from the cache afterwards.
+ */
+static int
+spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
+{
+ spa_t *spa;
+
+ if (oldconfig)
+ *oldconfig = NULL;
+
+ if (!(spa_mode & FWRITE))
+ return (EROFS);
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(pool)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (ENOENT);
+ }
+
+ /*
+ * Put a hold on the pool, drop the namespace lock, stop async tasks,
+ * reacquire the namespace lock, and see if we can export.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ /*
+ * The pool will be in core if it's openable,
+ * in which case we can modify its state.
+ */
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
+ /*
+ * Objsets may be open only because they're dirty, so we
+ * have to force it to sync before checking spa_refcnt.
+ */
+ spa_scrub_suspend(spa);
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ /*
+ * A pool cannot be exported or destroyed if there are active
+ * references. If we are resetting a pool, allow references by
+ * fault injection handlers.
+ */
+ if (!spa_refcount_zero(spa) ||
+ (spa->spa_inject_ref != 0 &&
+ new_state != POOL_STATE_UNINITIALIZED)) {
+ spa_scrub_resume(spa);
+ spa_async_resume(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (EBUSY);
+ }
+
+ spa_scrub_resume(spa);
+ VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
+
+ /*
+ * We want this to be reflected on every label,
+ * so mark them all dirty. spa_unload() will do the
+ * final sync that pushes these changes out.
+ */
+ if (new_state != POOL_STATE_UNINITIALIZED) {
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa->spa_state = new_state;
+ spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_config_exit(spa, FTAG);
+ }
+ }
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+
+ if (oldconfig && spa->spa_config)
+ VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
+
+ if (new_state != POOL_STATE_UNINITIALIZED) {
+ spa_remove(spa);
+ spa_config_sync();
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Destroy a storage pool.
+ */
+int
+spa_destroy(char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
+}
+
+/*
+ * Export a storage pool.
+ */
+int
+spa_export(char *pool, nvlist_t **oldconfig)
+{
+ return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
+}
+
+/*
+ * Similar to spa_export(), this unloads the spa_t without actually removing it
+ * from the namespace in any way.
+ */
+int
+spa_reset(char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
+}
+
+
+/*
+ * ==========================================================================
+ * Device manipulation
+ * ==========================================================================
+ */
+
+/*
+ * Add capacity to a storage pool.
+ */
+int
+spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
+{
+ uint64_t txg;
+ int c, error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *tvd;
+ nvlist_t **spares;
+ uint_t i, nspares;
+
+ txg = spa_vdev_enter(spa);
+
+ if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
+ VDEV_ALLOC_ADD)) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ spa->spa_pending_vdev = vd;
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) != 0)
+ nspares = 0;
+
+ if (vd->vdev_children == 0 && nspares == 0) {
+ spa->spa_pending_vdev = NULL;
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+ }
+
+ if (vd->vdev_children != 0) {
+ if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
+ spa->spa_pending_vdev = NULL;
+ return (spa_vdev_exit(spa, vd, txg, error));
+ }
+ }
+
+ /*
+ * We must validate the spares after checking the children. Otherwise,
+ * vdev_inuse() will blindly overwrite the spare.
+ */
+ if ((error = spa_validate_spares(spa, nvroot, txg,
+ VDEV_ALLOC_ADD)) != 0) {
+ spa->spa_pending_vdev = NULL;
+ return (spa_vdev_exit(spa, vd, txg, error));
+ }
+
+ spa->spa_pending_vdev = NULL;
+
+ /*
+ * Transfer each new top-level vdev from vd to rvd.
+ */
+ for (c = 0; c < vd->vdev_children; c++) {
+ tvd = vd->vdev_child[c];
+ vdev_remove_child(vd, tvd);
+ tvd->vdev_id = rvd->vdev_children;
+ vdev_add_child(rvd, tvd);
+ vdev_config_dirty(tvd);
+ }
+
+ if (nspares != 0) {
+ if (spa->spa_sparelist != NULL) {
+ nvlist_t **oldspares;
+ uint_t oldnspares;
+ nvlist_t **newspares;
+
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);
+
+ newspares = kmem_alloc(sizeof (void *) *
+ (nspares + oldnspares), KM_SLEEP);
+ for (i = 0; i < oldnspares; i++)
+ VERIFY(nvlist_dup(oldspares[i],
+ &newspares[i], KM_SLEEP) == 0);
+ for (i = 0; i < nspares; i++)
+ VERIFY(nvlist_dup(spares[i],
+ &newspares[i + oldnspares],
+ KM_SLEEP) == 0);
+
+ VERIFY(nvlist_remove(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, newspares,
+ nspares + oldnspares) == 0);
+ for (i = 0; i < oldnspares + nspares; i++)
+ nvlist_free(newspares[i]);
+ kmem_free(newspares, (oldnspares + nspares) *
+ sizeof (void *));
+ } else {
+ VERIFY(nvlist_alloc(&spa->spa_sparelist,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ }
+
+ spa_load_spares(spa);
+ spa->spa_sync_spares = B_TRUE;
+ }
+
+ /*
+ * We have to be careful when adding new vdevs to an existing pool.
+ * If other threads start allocating from these vdevs before we
+ * sync the config cache, and we lose power, then upon reboot we may
+ * fail to open the pool because there are DVAs that the config cache
+ * can't translate. Therefore, we first add the vdevs without
+ * initializing metaslabs; sync the config cache (via spa_vdev_exit());
+ * and then let spa_config_update() initialize the new metaslabs.
+ *
+ * spa_load() checks for added-but-not-initialized vdevs, so that
+ * if we lose power at any point in this sequence, the remaining
+ * steps will be completed the next time we load the pool.
+ */
+ (void) spa_vdev_exit(spa, vd, txg, 0);
+
+ mutex_enter(&spa_namespace_lock);
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Attach a device to a mirror. The arguments are the path to any device
+ * in the mirror, and the nvroot for the new device. If the path specifies
+ * a device that is not mirrored, we automatically insert the mirror vdev.
+ *
+ * If 'replacing' is specified, the new device is intended to replace the
+ * existing device; in this case the two devices are made into their own
+ * mirror using the 'replacing' vdev, which is functionally identical to
+ * the mirror vdev (it actually reuses all the same ops) but has a few
+ * extra rules: you can't attach to it after it's been created, and upon
+ * completion of resilvering, the first disk (the one being replaced)
+ * is automatically detached.
+ */
+int
+spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
+{
+ uint64_t txg, open_txg;
+ int error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
+ vdev_ops_t *pvops;
+
+ txg = spa_vdev_enter(spa);
+
+ oldvd = vdev_lookup_by_guid(rvd, guid);
+
+ if (oldvd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!oldvd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ pvd = oldvd->vdev_parent;
+
+ if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
+ VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ newvd = newrootvd->vdev_child[0];
+
+ if (!newvd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
+ return (spa_vdev_exit(spa, newrootvd, txg, error));
+
+ if (!replacing) {
+ /*
+ * For attach, the only allowable parent is a mirror or the root
+ * vdev.
+ */
+ if (pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_root_ops)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ pvops = &vdev_mirror_ops;
+ } else {
+ /*
+ * Active hot spares can only be replaced by inactive hot
+ * spares.
+ */
+ if (pvd->vdev_ops == &vdev_spare_ops &&
+ pvd->vdev_child[1] == oldvd &&
+ !spa_has_spare(spa, newvd->vdev_guid))
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ /*
+ * If the source is a hot spare, and the parent isn't already a
+ * spare, then we want to create a new hot spare. Otherwise, we
+ * want to create a replacing vdev. The user is not allowed to
+ * attach to a spared vdev child unless the 'isspare' state is
+ * the same (spare replaces spare, non-spare replaces
+ * non-spare).
+ */
+ if (pvd->vdev_ops == &vdev_replacing_ops)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ else if (pvd->vdev_ops == &vdev_spare_ops &&
+ newvd->vdev_isspare != oldvd->vdev_isspare)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ else if (pvd->vdev_ops != &vdev_spare_ops &&
+ newvd->vdev_isspare)
+ pvops = &vdev_spare_ops;
+ else
+ pvops = &vdev_replacing_ops;
+ }
+
+ /*
+ * Compare the new device size with the replaceable/attachable
+ * device size.
+ */
+ if (newvd->vdev_psize < vdev_get_rsize(oldvd))
+ return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
+
+ /*
+ * The new device cannot have a higher alignment requirement
+ * than the top-level vdev.
+ */
+ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
+ return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
+
+ /*
+ * If this is an in-place replacement, update oldvd's path and devid
+ * to make it distinguishable from newvd, and unopenable from now on.
+ */
+ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+ spa_strfree(oldvd->vdev_path);
+ oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+ KM_SLEEP);
+ (void) sprintf(oldvd->vdev_path, "%s/%s",
+ newvd->vdev_path, "old");
+ if (oldvd->vdev_devid != NULL) {
+ spa_strfree(oldvd->vdev_devid);
+ oldvd->vdev_devid = NULL;
+ }
+ }
+
+ /*
+ * If the parent is not a mirror, or if we're replacing, insert the new
+ * mirror/replacing/spare vdev above oldvd.
+ */
+ if (pvd->vdev_ops != pvops)
+ pvd = vdev_add_parent(oldvd, pvops);
+
+ ASSERT(pvd->vdev_top->vdev_parent == rvd);
+ ASSERT(pvd->vdev_ops == pvops);
+ ASSERT(oldvd->vdev_parent == pvd);
+
+ /*
+ * Extract the new device from its root and add it to pvd.
+ */
+ vdev_remove_child(newrootvd, newvd);
+ newvd->vdev_id = pvd->vdev_children;
+ vdev_add_child(pvd, newvd);
+
+ /*
+ * If newvd is smaller than oldvd, but larger than its rsize,
+ * the addition of newvd may have decreased our parent's asize.
+ */
+ pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
+
+ tvd = newvd->vdev_top;
+ ASSERT(pvd->vdev_top == tvd);
+ ASSERT(tvd->vdev_parent == rvd);
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate
+ * upward when spa_vdev_exit() calls vdev_dtl_reassess().
+ */
+ open_txg = txg + TXG_CONCURRENT_STATES - 1;
+
+ mutex_enter(&newvd->vdev_dtl_lock);
+ space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
+ open_txg - TXG_INITIAL + 1);
+ mutex_exit(&newvd->vdev_dtl_lock);
+
+ if (newvd->vdev_isspare)
+ spa_spare_activate(newvd);
+
+ /*
+ * Mark newvd's DTL dirty in this txg.
+ */
+ vdev_dirty(tvd, VDD_DTL, newvd, txg);
+
+ (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
+
+ /*
+ * Kick off a resilver to update newvd.
+ */
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+/*
+ * Detach a device from a mirror or replacing vdev.
+ * If 'replace_done' is specified, only detach if the parent
+ * is a replacing vdev.
+ */
+int
+spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
+{
+ uint64_t txg;
+ int c, t, error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *pvd, *cvd, *tvd;
+ boolean_t unspare = B_FALSE;
+ uint64_t unspare_guid;
+
+ txg = spa_vdev_enter(spa);
+
+ vd = vdev_lookup_by_guid(rvd, guid);
+
+ if (vd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ pvd = vd->vdev_parent;
+
+ /*
+ * If replace_done is specified, only remove this device if it's
+ * the first child of a replacing vdev. For the 'spare' vdev, either
+ * disk can be removed.
+ */
+ if (replace_done) {
+ if (pvd->vdev_ops == &vdev_replacing_ops) {
+ if (vd->vdev_id != 0)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+ } else if (pvd->vdev_ops != &vdev_spare_ops) {
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+ }
+ }
+
+ ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
+ spa_version(spa) >= ZFS_VERSION_SPARES);
+
+ /*
+ * Only mirror, replacing, and spare vdevs support detach.
+ */
+ if (pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_spare_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * If there's only one replica, you can't detach it.
+ */
+ if (pvd->vdev_children <= 1)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * If all siblings have non-empty DTLs, this device may have the only
+ * valid copy of the data, which means we cannot safely detach it.
+ *
+ * XXX -- as in the vdev_offline() case, we really want a more
+ * precise DTL check.
+ */
+ for (c = 0; c < pvd->vdev_children; c++) {
+ uint64_t dirty;
+
+ cvd = pvd->vdev_child[c];
+ if (cvd == vd)
+ continue;
+ if (vdev_is_dead(cvd))
+ continue;
+ mutex_enter(&cvd->vdev_dtl_lock);
+ dirty = cvd->vdev_dtl_map.sm_space |
+ cvd->vdev_dtl_scrub.sm_space;
+ mutex_exit(&cvd->vdev_dtl_lock);
+ if (!dirty)
+ break;
+ }
+
+ /*
+ * If we are a replacing or spare vdev, then we can always detach the
+ * latter child, as that is how one cancels the operation.
+ */
+ if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
+ c == pvd->vdev_children)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * If we are detaching the original disk from a spare, then it implies
+ * that the spare should become a real disk, and be removed from the
+ * active spare list for the pool.
+ */
+ if (pvd->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_id == 0)
+ unspare = B_TRUE;
+
+ /*
+ * Erase the disk labels so the disk can be used for other things.
+ * This must be done after all other error cases are handled,
+ * but before we disembowel vd (so we can still do I/O to it).
+ * But if we can't do it, don't treat the error as fatal --
+ * it may be that the unwritability of the disk is the reason
+ * it's being detached!
+ */
+ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+
+ /*
+ * Remove vd from its parent and compact the parent's children.
+ */
+ vdev_remove_child(pvd, vd);
+ vdev_compact_children(pvd);
+
+ /*
+ * Remember one of the remaining children so we can get tvd below.
+ */
+ cvd = pvd->vdev_child[0];
+
+ /*
+ * If we need to remove the remaining child from the list of hot spares,
+ * do it now, marking the vdev as no longer a spare in the process. We
+ * must do this before vdev_remove_parent(), because that can change the
+ * GUID if it creates a new toplevel GUID.
+ */
+ if (unspare) {
+ ASSERT(cvd->vdev_isspare);
+ spa_spare_remove(cvd);
+ unspare_guid = cvd->vdev_guid;
+ }
+
+ /*
+ * If the parent mirror/replacing vdev only has one child,
+ * the parent is no longer needed. Remove it from the tree.
+ */
+ if (pvd->vdev_children == 1)
+ vdev_remove_parent(cvd);
+
+ /*
+ * We don't set tvd until now because the parent we just removed
+ * may have been the previous top-level vdev.
+ */
+ tvd = cvd->vdev_top;
+ ASSERT(tvd->vdev_parent == rvd);
+
+ /*
+ * Reevaluate the parent vdev state.
+ */
+ vdev_propagate_state(cvd->vdev_parent);
+
+ /*
+ * If the device we just detached was smaller than the others, it may be
+ * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init()
+ * can't fail because the existing metaslabs are already in core, so
+ * there's nothing to read from disk.
+ */
+ VERIFY(vdev_metaslab_init(tvd, txg) == 0);
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
+ * vd->vdev_detached is set and free vd's DTL object in syncing context.
+ * But first make sure we're not on any *other* txg's DTL list, to
+ * prevent vd from being accessed after it's freed.
+ */
+ for (t = 0; t < TXG_SIZE; t++)
+ (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
+ vd->vdev_detached = B_TRUE;
+ vdev_dirty(tvd, VDD_DTL, vd, txg);
+
+ error = spa_vdev_exit(spa, vd, txg, 0);
+
+ /*
+ * If this was the removal of the original device in a hot spare vdev,
+ * then we want to go through and remove the device from the hot spare
+ * list of every other pool.
+ */
+ if (unspare) {
+ spa = NULL;
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (spa->spa_state != POOL_STATE_ACTIVE)
+ continue;
+
+ (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+ }
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ return (error);
+}
+
+/*
+ * Remove a device from the pool. Currently, this supports removing only hot
+ * spares.
+ */
+int
+spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
+{
+ vdev_t *vd;
+ nvlist_t **spares, *nv, **newspares;
+ uint_t i, j, nspares;
+ int ret = 0;
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ vd = spa_lookup_by_guid(spa, guid);
+
+ nv = NULL;
+ if (spa->spa_spares != NULL &&
+ nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0) {
+ for (i = 0; i < nspares; i++) {
+ uint64_t theguid;
+
+ VERIFY(nvlist_lookup_uint64(spares[i],
+ ZPOOL_CONFIG_GUID, &theguid) == 0);
+ if (theguid == guid) {
+ nv = spares[i];
+ break;
+ }
+ }
+ }
+
+ /*
+ * We only support removing a hot spare, and only if it's not currently
+ * in use in this pool.
+ */
+ if (nv == NULL && vd == NULL) {
+ ret = ENOENT;
+ goto out;
+ }
+
+ if (nv == NULL && vd != NULL) {
+ ret = ENOTSUP;
+ goto out;
+ }
+
+ if (!unspare && nv != NULL && vd != NULL) {
+ ret = EBUSY;
+ goto out;
+ }
+
+ if (nspares == 1) {
+ newspares = NULL;
+ } else {
+ newspares = kmem_alloc((nspares - 1) * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0, j = 0; i < nspares; i++) {
+ if (spares[i] != nv)
+ VERIFY(nvlist_dup(spares[i],
+ &newspares[j++], KM_SLEEP) == 0);
+ }
+ }
+
+ VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
+ newspares, nspares - 1) == 0);
+ for (i = 0; i < nspares - 1; i++)
+ nvlist_free(newspares[i]);
+ kmem_free(newspares, (nspares - 1) * sizeof (void *));
+ spa_load_spares(spa);
+ spa->spa_sync_spares = B_TRUE;
+
+out:
+ spa_config_exit(spa, FTAG);
+
+ return (ret);
+}
+
+/*
+ * Find any device that's done replacing, so we can detach it.
+ */
+static vdev_t *
+spa_vdev_replace_done_hunt(vdev_t *vd)
+{
+ vdev_t *newvd, *oldvd;
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
+ if (oldvd != NULL)
+ return (oldvd);
+ }
+
+ if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
+ oldvd = vd->vdev_child[0];
+ newvd = vd->vdev_child[1];
+
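+		/*
+		 * An empty DTL (both the persistent map and the scrub map)
+		 * on the new device means it has been fully resilvered, so
+		 * the replacement is complete and the old device can go.
+		 */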
+ mutex_enter(&newvd->vdev_dtl_lock);
+ if (newvd->vdev_dtl_map.sm_space == 0 &&
+ newvd->vdev_dtl_scrub.sm_space == 0) {
+ mutex_exit(&newvd->vdev_dtl_lock);
+ return (oldvd);
+ }
+ mutex_exit(&newvd->vdev_dtl_lock);
+ }
+
+ return (NULL);
+}
+
+static void
+spa_vdev_replace_done(spa_t *spa)
+{
+ vdev_t *vd;
+ vdev_t *pvd;
+ uint64_t guid;
+ uint64_t pguid = 0;
+
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
+ guid = vd->vdev_guid;
+ /*
+ * If we have just finished replacing a hot spared device, then
+ * we need to detach the parent's first child (the original hot
+ * spare) as well.
+ */
+ pvd = vd->vdev_parent;
+ if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ pvd->vdev_id == 0) {
+ ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
+ ASSERT(pvd->vdev_parent->vdev_children == 2);
+ pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
+ }
+ spa_config_exit(spa, FTAG);
+ if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
+ return;
+ if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
+ return;
+ spa_config_enter(spa, RW_READER, FTAG);
+ }
+
+ spa_config_exit(spa, FTAG);
+}
+
+/*
+ * Update the stored path for this vdev. Dirty the vdev configuration, relying
+ * on spa_vdev_enter/exit() to synchronize the labels and cache.
+ */
+int
+spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
+{
+ vdev_t *rvd, *vd;
+ uint64_t txg;
+
+ rvd = spa->spa_root_vdev;
+
+ txg = spa_vdev_enter(spa);
+
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
+ /*
+ * Determine if this is a reference to a hot spare. In that
+ * case, update the path as stored in the spare list.
+ */
+ nvlist_t **spares;
+ uint_t i, nspares;
+ if (spa->spa_sparelist != NULL) {
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+ for (i = 0; i < nspares; i++) {
+ uint64_t theguid;
+ VERIFY(nvlist_lookup_uint64(spares[i],
+ ZPOOL_CONFIG_GUID, &theguid) == 0);
+ if (theguid == guid)
+ break;
+ }
+
+ if (i == nspares)
+ return (spa_vdev_exit(spa, NULL, txg, ENOENT));
+
+ VERIFY(nvlist_add_string(spares[i],
+ ZPOOL_CONFIG_PATH, newpath) == 0);
+ spa_load_spares(spa);
+ spa->spa_sync_spares = B_TRUE;
+ return (spa_vdev_exit(spa, NULL, txg, 0));
+ } else {
+ return (spa_vdev_exit(spa, NULL, txg, ENOENT));
+ }
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = spa_strdup(newpath);
+
+ vdev_config_dirty(vd->vdev_top);
+
+ return (spa_vdev_exit(spa, NULL, txg, 0));
+}
+
+/*
+ * ==========================================================================
+ * SPA Scrubbing
+ * ==========================================================================
+ */
+
+static void
+spa_scrub_io_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ zio_data_buf_free(zio->io_data, zio->io_size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
+ spa->spa_scrub_errors++;
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+
+ if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
+ cv_broadcast(&spa->spa_scrub_io_cv);
+
+ ASSERT(spa->spa_scrub_inflight >= 0);
+
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+static void
+spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
+ zbookmark_t *zb)
+{
+ size_t size = BP_GET_LSIZE(bp);
+ void *data;
+
+ mutex_enter(&spa->spa_scrub_lock);
+ /*
+	 * Do not give the vdevs too much work at once; allow at most
+	 * spa_scrub_maxinflight scrub I/Os in flight.
+ */
+ while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ }
+ spa->spa_scrub_inflight++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ data = zio_data_buf_alloc(size);
+
+ if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+ flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */
+
+ flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
+
+ zio_nowait(zio_read(NULL, spa, bp, data, size,
+ spa_scrub_io_done, NULL, priority, flags, zb));
+}
+
+/* ARGSUSED */
+static int
+spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
+{
+ blkptr_t *bp = &bc->bc_blkptr;
+ vdev_t *vd = spa->spa_root_vdev;
+ dva_t *dva = bp->blk_dva;
+ int needs_resilver = B_FALSE;
+ int d;
+
+ if (bc->bc_errno) {
+ /*
+ * We can't scrub this block, but we can continue to scrub
+ * the rest of the pool. Note the error and move along.
+ */
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_errors++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ return (ERESTART);
+ }
+
+ ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
+
+ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
+
+ ASSERT(vd != NULL);
+
+ /*
+ * Keep track of how much data we've examined so that
+ * zpool(1M) status can make useful progress reports.
+ */
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
+ mutex_exit(&vd->vdev_stat_lock);
+
+ if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
+ if (DVA_GET_GANG(&dva[d])) {
+ /*
+ * Gang members may be spread across multiple
+ * vdevs, so the best we can do is look at the
+ * pool-wide DTL.
+ * XXX -- it would be better to change our
+ * allocation policy to ensure that this can't
+ * happen.
+ */
+ vd = spa->spa_root_vdev;
+ }
+ if (vdev_dtl_contains(&vd->vdev_dtl_map,
+ bp->blk_birth, 1))
+ needs_resilver = B_TRUE;
+ }
+ }
+
+ if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
+ spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
+ ZIO_FLAG_SCRUB, &bc->bc_bookmark);
+ else if (needs_resilver)
+ spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
+ ZIO_FLAG_RESILVER, &bc->bc_bookmark);
+
+ return (0);
+}
+
+static void
+spa_scrub_thread(void *arg)
+{
+ spa_t *spa = arg;
+ callb_cpr_t cprinfo;
+ traverse_handle_t *th = spa->spa_scrub_th;
+ vdev_t *rvd = spa->spa_root_vdev;
+ pool_scrub_type_t scrub_type = spa->spa_scrub_type;
+ int error = 0;
+ boolean_t complete;
+
+ CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
+
+ /*
+ * If we're restarting due to a snapshot create/delete,
+ * wait for that to complete.
+ */
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ dprintf("start %s mintxg=%llu maxtxg=%llu\n",
+ scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
+ spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ vdev_reopen(rvd); /* purge all vdev caches */
+ vdev_config_dirty(rvd); /* rewrite all disk labels */
+ vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
+ spa_config_exit(spa, FTAG);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_errors = 0;
+ spa->spa_scrub_active = 1;
+ ASSERT(spa->spa_scrub_inflight == 0);
+
+ while (!spa->spa_scrub_stop) {
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ while (spa->spa_scrub_suspended) {
+ spa->spa_scrub_active = 0;
+ cv_broadcast(&spa->spa_scrub_cv);
+ cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+ spa->spa_scrub_active = 1;
+ }
+ CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
+
+ if (spa->spa_scrub_restart_txg != 0)
+ break;
+
+ mutex_exit(&spa->spa_scrub_lock);
+ error = traverse_more(th);
+ mutex_enter(&spa->spa_scrub_lock);
+ if (error != EAGAIN)
+ break;
+ }
+
+ while (spa->spa_scrub_inflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+
+ spa->spa_scrub_active = 0;
+ cv_broadcast(&spa->spa_scrub_cv);
+
+ mutex_exit(&spa->spa_scrub_lock);
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ mutex_enter(&spa->spa_scrub_lock);
+
+ /*
+ * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
+ * AND the spa config lock to synchronize with any config changes
+ * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
+ */
+ if (spa->spa_scrub_restart_txg != 0)
+ error = ERESTART;
+
+ if (spa->spa_scrub_stop)
+ error = EINTR;
+
+ /*
+ * Even if there were uncorrectable errors, we consider the scrub
+ * completed. The downside is that if there is a transient error during
+ * a resilver, we won't resilver the data properly to the target. But
+ * if the damage is permanent (more likely) we will resilver forever,
+ * which isn't really acceptable. Since there is enough information for
+ * the user to know what has failed and why, this seems like a more
+ * tractable approach.
+ */
+ complete = (error == 0);
+
+ dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
+ scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
+ spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
+ error, spa->spa_scrub_errors, spa->spa_scrub_stop);
+
+ mutex_exit(&spa->spa_scrub_lock);
+
+ /*
+ * If the scrub/resilver completed, update all DTLs to reflect this.
+ * Whether it succeeded or not, vacate all temporary scrub DTLs.
+ */
+ vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
+ complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
+ vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
+ spa_errlog_rotate(spa);
+
+ spa_config_exit(spa, FTAG);
+
+ mutex_enter(&spa->spa_scrub_lock);
+
+ /*
+ * We may have finished replacing a device.
+ * Let the async thread assess this and handle the detach.
+ */
+ spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
+
+ /*
+ * If we were told to restart, our final act is to start a new scrub.
+ */
+ if (error == ERESTART)
+ spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
+ SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
+
+ spa->spa_scrub_type = POOL_SCRUB_NONE;
+ spa->spa_scrub_active = 0;
+ spa->spa_scrub_thread = NULL;
+ cv_broadcast(&spa->spa_scrub_cv);
+ CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */
+ thread_exit();
+}
+
+void
+spa_scrub_suspend(spa_t *spa)
+{
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_suspended++;
+ while (spa->spa_scrub_active) {
+ cv_broadcast(&spa->spa_scrub_cv);
+ cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+ }
+ while (spa->spa_scrub_inflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+void
+spa_scrub_resume(spa_t *spa)
+{
+ mutex_enter(&spa->spa_scrub_lock);
+ ASSERT(spa->spa_scrub_suspended != 0);
+ if (--spa->spa_scrub_suspended == 0)
+ cv_broadcast(&spa->spa_scrub_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+void
+spa_scrub_restart(spa_t *spa, uint64_t txg)
+{
+ /*
+ * Something happened (e.g. snapshot create/delete) that means
+ * we must restart any in-progress scrubs. The itinerary will
+ * fix this properly.
+ */
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_restart_txg = txg;
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+int
+spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
+{
+ space_seg_t *ss;
+ uint64_t mintxg, maxtxg;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ if ((uint_t)type >= POOL_SCRUB_TYPES)
+ return (ENOTSUP);
+
+ mutex_enter(&spa->spa_scrub_lock);
+
+ /*
+ * If there's a scrub or resilver already in progress, stop it.
+ */
+ while (spa->spa_scrub_thread != NULL) {
+ /*
+ * Don't stop a resilver unless forced.
+ */
+ if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
+ mutex_exit(&spa->spa_scrub_lock);
+ return (EBUSY);
+ }
+ spa->spa_scrub_stop = 1;
+ cv_broadcast(&spa->spa_scrub_cv);
+ cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+ }
+
+ /*
+ * Terminate the previous traverse.
+ */
+ if (spa->spa_scrub_th != NULL) {
+ traverse_fini(spa->spa_scrub_th);
+ spa->spa_scrub_th = NULL;
+ }
+
+ if (rvd == NULL) {
+ ASSERT(spa->spa_scrub_stop == 0);
+ ASSERT(spa->spa_scrub_type == type);
+ ASSERT(spa->spa_scrub_restart_txg == 0);
+ mutex_exit(&spa->spa_scrub_lock);
+ return (0);
+ }
+
+ mintxg = TXG_INITIAL - 1;
+ maxtxg = spa_last_synced_txg(spa) + 1;
+
+ mutex_enter(&rvd->vdev_dtl_lock);
+
+ if (rvd->vdev_dtl_map.sm_space == 0) {
+ /*
+ * The pool-wide DTL is empty.
+ * If this is a resilver, there's nothing to do except
+ * check whether any in-progress replacements have completed.
+ */
+ if (type == POOL_SCRUB_RESILVER) {
+ type = POOL_SCRUB_NONE;
+ spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
+ }
+ } else {
+ /*
+ * The pool-wide DTL is non-empty.
+ * If this is a normal scrub, upgrade to a resilver instead.
+ */
+ if (type == POOL_SCRUB_EVERYTHING)
+ type = POOL_SCRUB_RESILVER;
+ }
+
+ if (type == POOL_SCRUB_RESILVER) {
+ /*
+ * Determine the resilvering boundaries.
+ *
+ * Note: (mintxg, maxtxg) is an open interval,
+ * i.e. mintxg and maxtxg themselves are not included.
+ *
+ * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
+ * so we don't claim to resilver a txg that's still changing.
+ */
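+		/*
+		 * Illustrative numbers: if the pool-wide DTL holds the single
+		 * segment [100, 200) and spa_last_synced_txg() is 150, then
+		 * mintxg = 99 and maxtxg = MIN(200, 151) = 151, so txgs
+		 * 100..150 are resilvered.
+		 */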
+ ss = avl_first(&rvd->vdev_dtl_map.sm_root);
+ mintxg = ss->ss_start - 1;
+ ss = avl_last(&rvd->vdev_dtl_map.sm_root);
+ maxtxg = MIN(ss->ss_end, maxtxg);
+ }
+
+ mutex_exit(&rvd->vdev_dtl_lock);
+
+ spa->spa_scrub_stop = 0;
+ spa->spa_scrub_type = type;
+ spa->spa_scrub_restart_txg = 0;
+
+ if (type != POOL_SCRUB_NONE) {
+ spa->spa_scrub_mintxg = mintxg;
+ spa->spa_scrub_maxtxg = maxtxg;
+ spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
+ ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
+ ZIO_FLAG_CANFAIL);
+ traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
+ spa->spa_scrub_thread = thread_create(NULL, 0,
+ spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
+ }
+
+ mutex_exit(&spa->spa_scrub_lock);
+
+ return (0);
+}
+
+/*
+ * ==========================================================================
+ * SPA async task processing
+ * ==========================================================================
+ */
+
+static void
+spa_async_reopen(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *tvd;
+ int c;
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ for (c = 0; c < rvd->vdev_children; c++) {
+ tvd = rvd->vdev_child[c];
+ if (tvd->vdev_reopen_wanted) {
+ tvd->vdev_reopen_wanted = 0;
+ vdev_reopen(tvd);
+ }
+ }
+
+ spa_config_exit(spa, FTAG);
+}
+
+static void
+spa_async_thread(void *arg)
+{
+ spa_t *spa = arg;
+ int tasks;
+
+ ASSERT(spa->spa_sync_on);
+
+ mutex_enter(&spa->spa_async_lock);
+ tasks = spa->spa_async_tasks;
+ spa->spa_async_tasks = 0;
+ mutex_exit(&spa->spa_async_lock);
+
+ /*
+ * See if the config needs to be updated.
+ */
+ if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
+ * See if any devices need to be reopened.
+ */
+ if (tasks & SPA_ASYNC_REOPEN)
+ spa_async_reopen(spa);
+
+ /*
+ * If any devices are done replacing, detach them.
+ */
+ if (tasks & SPA_ASYNC_REPLACE_DONE)
+ spa_vdev_replace_done(spa);
+
+ /*
+ * Kick off a scrub.
+ */
+ if (tasks & SPA_ASYNC_SCRUB)
+ VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
+
+ /*
+ * Kick off a resilver.
+ */
+ if (tasks & SPA_ASYNC_RESILVER)
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ /*
+ * Let the world know that we're done.
+ */
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_thread = NULL;
+ cv_broadcast(&spa->spa_async_cv);
+ mutex_exit(&spa->spa_async_lock);
+ thread_exit();
+}
+
+void
+spa_async_suspend(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_suspended++;
+ while (spa->spa_async_thread != NULL)
+ cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
+ mutex_exit(&spa->spa_async_lock);
+}
+
+void
+spa_async_resume(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ ASSERT(spa->spa_async_suspended != 0);
+ spa->spa_async_suspended--;
+ mutex_exit(&spa->spa_async_lock);
+}
+
+static void
+spa_async_dispatch(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ if (spa->spa_async_tasks && !spa->spa_async_suspended &&
+ spa->spa_async_thread == NULL &&
+ rootdir != NULL && !vn_is_readonly(rootdir))
+ spa->spa_async_thread = thread_create(NULL, 0,
+ spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
+ mutex_exit(&spa->spa_async_lock);
+}
+
+void
+spa_async_request(spa_t *spa, int task)
+{
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_tasks |= task;
+ mutex_exit(&spa->spa_async_lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA syncing routines
+ * ==========================================================================
+ */
+
+static void
+spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
+{
+ bplist_t *bpl = &spa->spa_sync_bplist;
+ dmu_tx_t *tx;
+ blkptr_t blk;
+ uint64_t itor = 0;
+ zio_t *zio;
+ int error;
+ uint8_t c = 1;
+
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
+
+ while (bplist_iterate(bpl, &itor, &blk) == 0)
+ zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
+
+ error = zio_wait(zio);
+ ASSERT3U(error, ==, 0);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ bplist_vacate(bpl, tx);
+
+ /*
+ * Pre-dirty the first block so we sync to convergence faster.
+ * (Usually only the first block is needed.)
+ */
+ dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
+ dmu_tx_commit(tx);
+}
+
+static void
+spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
+{
+ char *packed = NULL;
+ size_t nvsize = 0;
+ dmu_buf_t *db;
+
+ VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
+
+ packed = kmem_alloc(nvsize, KM_SLEEP);
+
+ VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
+ KM_SLEEP) == 0);
+
+ dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
+
+ kmem_free(packed, nvsize);
+
+ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ *(uint64_t *)db->db_data = nvsize;
+ dmu_buf_rele(db, FTAG);
+}
+
+static void
+spa_sync_spares(spa_t *spa, dmu_tx_t *tx)
+{
+ nvlist_t *nvroot;
+ nvlist_t **spares;
+ int i;
+
+ if (!spa->spa_sync_spares)
+ return;
+
+ /*
+ * Update the MOS nvlist describing the list of available spares.
+ * spa_validate_spares() will have already made sure this nvlist is
+ * valid and the vdevs are labelled appropriately.
+ */
+ if (spa->spa_spares_object == 0) {
+ spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_PACKED_NVLIST, 1 << 14,
+ DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
+ VERIFY(zap_update(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
+ sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
+ }
+
+ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ if (spa->spa_nspares == 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ NULL, 0) == 0);
+ } else {
+ spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0; i < spa->spa_nspares; i++)
+ spares[i] = vdev_config_generate(spa,
+ spa->spa_spares[i], B_FALSE, B_TRUE);
+ VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ spares, spa->spa_nspares) == 0);
+ for (i = 0; i < spa->spa_nspares; i++)
+ nvlist_free(spares[i]);
+ kmem_free(spares, spa->spa_nspares * sizeof (void *));
+ }
+
+ spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
+ nvlist_free(nvroot);
+
+ spa->spa_sync_spares = B_FALSE;
+}
+
+static void
+spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
+{
+ nvlist_t *config;
+
+ if (list_is_empty(&spa->spa_dirty_list))
+ return;
+
+ config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
+
+ if (spa->spa_config_syncing)
+ nvlist_free(spa->spa_config_syncing);
+ spa->spa_config_syncing = config;
+
+ spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
+}
+
+static void
+spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ spa_t *spa = arg1;
+ nvlist_t *nvp = arg2;
+ nvpair_t *nvpair;
+ objset_t *mos = spa->spa_meta_objset;
+ uint64_t zapobj;
+
+ mutex_enter(&spa->spa_props_lock);
+ if (spa->spa_pool_props_object == 0) {
+ zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx);
+ VERIFY(zapobj > 0);
+
+ spa->spa_pool_props_object = zapobj;
+
+ VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_PROPS, 8, 1,
+ &spa->spa_pool_props_object, tx) == 0);
+ }
+ mutex_exit(&spa->spa_props_lock);
+
+ nvpair = NULL;
+ while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) {
+ switch (zpool_name_to_prop(nvpair_name(nvpair))) {
+ case ZFS_PROP_BOOTFS:
+ VERIFY(nvlist_lookup_uint64(nvp,
+ nvpair_name(nvpair), &spa->spa_bootfs) == 0);
+ VERIFY(zap_update(mos,
+ spa->spa_pool_props_object,
+ zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1,
+ &spa->spa_bootfs, tx) == 0);
+ break;
+ }
+ }
+}
+
+/*
+ * Sync the specified transaction group. New blocks may be dirtied as
+ * part of the process, so we iterate until it converges.
+ */
+void
+spa_sync(spa_t *spa, uint64_t txg)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ objset_t *mos = spa->spa_meta_objset;
+ bplist_t *bpl = &spa->spa_sync_bplist;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd;
+ dmu_tx_t *tx;
+ int dirty_vdevs;
+
+ /*
+ * Lock out configuration changes.
+ */
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ spa->spa_syncing_txg = txg;
+ spa->spa_sync_pass = 0;
+
+ VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ /*
+ * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg,
+ * set spa_deflate if we have no raid-z vdevs.
+ */
+ if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
+ spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
+ int i;
+
+ for (i = 0; i < rvd->vdev_children; i++) {
+ vd = rvd->vdev_child[i];
+ if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
+ break;
+ }
+ if (i == rvd->vdev_children) {
+ spa->spa_deflate = TRUE;
+ VERIFY(0 == zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate, tx));
+ }
+ }
+
+ /*
+ * If anything has changed in this txg, push the deferred frees
+ * from the previous txg. If not, leave them alone so that we
+ * don't generate work on an otherwise idle system.
+ */
+ if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
+ !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
+ !txg_list_empty(&dp->dp_sync_tasks, txg))
+ spa_sync_deferred_frees(spa, txg);
+
+ /*
+ * Iterate to convergence.
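+	 * Each pass can allocate new blocks, which dirties more metadata
+	 * and more vdevs, so we repeat until a pass completes with no
+	 * dirty vdevs.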
+ */
+ do {
+ spa->spa_sync_pass++;
+
+ spa_sync_config_object(spa, tx);
+ spa_sync_spares(spa, tx);
+ spa_errlog_sync(spa, txg);
+ dsl_pool_sync(dp, txg);
+
+ dirty_vdevs = 0;
+ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
+ vdev_sync(vd, txg);
+ dirty_vdevs++;
+ }
+
+ bplist_sync(bpl, tx);
+ } while (dirty_vdevs);
+
+ bplist_close(bpl);
+
+ dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
+
+ /*
+ * Rewrite the vdev configuration (which includes the uberblock)
+ * to commit the transaction group.
+ *
+ * If there are any dirty vdevs, sync the uberblock to all vdevs.
+ * Otherwise, pick a random top-level vdev that's known to be
+ * visible in the config cache (see spa_vdev_add() for details).
+	 * If the write fails, try the next vdev until we've tried them all.
+ */
+ if (!list_is_empty(&spa->spa_dirty_list)) {
+ VERIFY(vdev_config_sync(rvd, txg) == 0);
+ } else {
+ int children = rvd->vdev_children;
+ int c0 = spa_get_random(children);
+ int c;
+
+ for (c = 0; c < children; c++) {
+ vd = rvd->vdev_child[(c0 + c) % children];
+ if (vd->vdev_ms_array == 0)
+ continue;
+ if (vdev_config_sync(vd, txg) == 0)
+ break;
+ }
+ if (c == children)
+ VERIFY(vdev_config_sync(rvd, txg) == 0);
+ }
+
+ dmu_tx_commit(tx);
+
+ /*
+ * Clear the dirty config list.
+ */
+ while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
+ vdev_config_clean(vd);
+
+ /*
+ * Now that the new config has synced transactionally,
+ * let it become visible to the config cache.
+ */
+ if (spa->spa_config_syncing != NULL) {
+ spa_config_set(spa, spa->spa_config_syncing);
+ spa->spa_config_txg = txg;
+ spa->spa_config_syncing = NULL;
+ }
+
+ /*
+ * Make a stable copy of the fully synced uberblock.
+ * We use this as the root for pool traversals.
+ */
+ spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */
+
+ spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */
+
+ rw_enter(&spa->spa_traverse_lock, RW_WRITER);
+ spa->spa_traverse_wanted = 0;
+ spa->spa_ubsync = spa->spa_uberblock;
+ rw_exit(&spa->spa_traverse_lock);
+
+ spa_scrub_resume(spa); /* resume scrub with new ubsync */
+
+ /*
+ * Clean up the ZIL records for the synced txg.
+ */
+ dsl_pool_zil_clean(dp);
+
+ /*
+ * Update usable space statistics.
+ */
+ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+ vdev_sync_done(vd, txg);
+
+ /*
+ * It had better be the case that we didn't dirty anything
+ * since vdev_config_sync().
+ */
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
+ ASSERT(bpl->bpl_queue == NULL);
+
+ spa_config_exit(spa, FTAG);
+
+ /*
+ * If any async tasks have been requested, kick them off.
+ */
+ spa_async_dispatch(spa);
+}
+
+/*
+ * Sync all pools. We don't want to hold the namespace lock across these
+ * operations, so we take a reference on the spa_t and drop the lock during the
+ * sync.
+ */
+void
+spa_sync_allpools(void)
+{
+ spa_t *spa = NULL;
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (spa_state(spa) != POOL_STATE_ACTIVE)
+ continue;
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * ==========================================================================
+ * Miscellaneous routines
+ * ==========================================================================
+ */
+
+/*
+ * Remove all pools in the system.
+ */
+void
+spa_evict_all(void)
+{
+ spa_t *spa;
+
+ /*
+ * Remove all cached state. All pools should be closed now,
+ * so every spa in the AVL tree should be unreferenced.
+ */
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(NULL)) != NULL) {
+ /*
+ * Stop async tasks. The async thread may need to detach
+ * a device that's been replaced, which requires grabbing
+ * spa_namespace_lock, so we must drop it here.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
+ VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+ spa_remove(spa);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
+vdev_t *
+spa_lookup_by_guid(spa_t *spa, uint64_t guid)
+{
+ return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
+}
+
+void
+spa_upgrade(spa_t *spa)
+{
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ /*
+ * This should only be called for a non-faulted pool, and since a
+ * future version would result in an unopenable pool, this shouldn't be
+ * possible.
+ */
+ ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);
+
+ spa->spa_uberblock.ub_version = ZFS_VERSION;
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ spa_config_exit(spa, FTAG);
+
+ txg_wait_synced(spa_get_dsl(spa), 0);
+}
+
+boolean_t
+spa_has_spare(spa_t *spa, uint64_t guid)
+{
+ int i;
+ uint64_t spareguid;
+
+ for (i = 0; i < spa->spa_nspares; i++)
+ if (spa->spa_spares[i]->vdev_guid == guid)
+ return (B_TRUE);
+
+ for (i = 0; i < spa->spa_pending_nspares; i++) {
+ if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
+ ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
+ spareguid == guid)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+int
+spa_set_props(spa_t *spa, nvlist_t *nvp)
+{
+ return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
+ spa, nvp, 3));
+}
+
+int
+spa_get_props(spa_t *spa, nvlist_t **nvp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ objset_t *mos = spa->spa_meta_objset;
+ zfs_source_t src;
+ zfs_prop_t prop;
+ nvlist_t *propval;
+ uint64_t value;
+ int err;
+
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ mutex_enter(&spa->spa_props_lock);
+	/* If there is no props object, just return the empty nvlist. */
+ if (spa->spa_pool_props_object == 0) {
+ mutex_exit(&spa->spa_props_lock);
+ return (0);
+ }
+
+ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
+ (err = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+
+ if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL)
+ continue;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ switch (za.za_integer_length) {
+ case 8:
+ if (zfs_prop_default_numeric(prop) ==
+ za.za_first_integer)
+ src = ZFS_SRC_DEFAULT;
+ else
+ src = ZFS_SRC_LOCAL;
+ value = za.za_first_integer;
+
+ if (prop == ZFS_PROP_BOOTFS) {
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds = NULL;
+ char strval[MAXPATHLEN];
+
+ dp = spa_get_dsl(spa);
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ if ((err = dsl_dataset_open_obj(dp,
+ za.za_first_integer, NULL, DS_MODE_NONE,
+ FTAG, &ds)) != 0) {
+ rw_exit(&dp->dp_config_rwlock);
+ break;
+ }
+ dsl_dataset_name(ds, strval);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ rw_exit(&dp->dp_config_rwlock);
+
+ VERIFY(nvlist_add_uint64(propval,
+ ZFS_PROP_SOURCE, src) == 0);
+ VERIFY(nvlist_add_string(propval,
+ ZFS_PROP_VALUE, strval) == 0);
+ } else {
+ VERIFY(nvlist_add_uint64(propval,
+ ZFS_PROP_SOURCE, src) == 0);
+ VERIFY(nvlist_add_uint64(propval,
+ ZFS_PROP_VALUE, value) == 0);
+ }
+ VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
+ propval) == 0);
+ break;
+ }
+ nvlist_free(propval);
+ }
+ zap_cursor_fini(&zc);
+ mutex_exit(&spa->spa_props_lock);
+ if (err && err != ENOENT) {
+ nvlist_free(*nvp);
+ return (err);
+ }
+
+ return (0);
+}
+
+/*
+ * If the bootfs property value is dsobj, clear it.
+ */
+void
+spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
+{
+ if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
+ VERIFY(zap_remove(spa->spa_meta_objset,
+ spa->spa_pool_props_object,
+ zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0);
+ spa->spa_bootfs = 0;
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
new file mode 100644
index 0000000..b5d8c38
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
@@ -0,0 +1,361 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/nvpair.h>
+#include <sys/uio.h>
+#include <sys/fs/zfs.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_ioctl.h>
+#ifdef _KERNEL
+#include <sys/kobj.h>
+#endif
+
+/*
+ * Pool configuration repository.
+ *
+ * The configuration for all pools, in addition to being stored on disk, is
+ * stored in /etc/zfs/zpool.cache as a packed nvlist. The kernel maintains
+ * this list as pools are created, destroyed, or modified.
+ *
+ * We have a single nvlist which holds all the configuration information. When
+ * the module loads, we read this information from the cache and populate the
+ * SPA namespace. This namespace is maintained independently in spa.c.
+ * Whenever the namespace is modified, or the configuration of a pool is
+ * changed, we call spa_config_sync(), which walks through all the active pools
+ * and writes the configuration to disk.
+ */
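+
+/*
+ * For illustration only (pool names are made up), the packed nvlist holds
+ * one entry per pool, keyed by pool name:
+ *
+ *	"tank"  -> { version, name, state, txg, pool_guid, vdev_tree }
+ *	"dozer" -> { version, name, state, txg, pool_guid, vdev_tree }
+ *
+ * where each value is the config nvlist built by spa_config_generate().
+ */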
+
+static uint64_t spa_config_generation = 1;
+
+/*
+ * This can be overridden in userland to preserve an alternate namespace for
+ * userland pools when doing testing.
+ */
+const char *spa_config_dir = ZPOOL_CACHE_DIR;
+
+/*
+ * Called when the module is first loaded, this routine loads the configuration
+ * file into the SPA namespace. It does not actually open or load the pools; it
+ * only populates the namespace.
+ */
+void
+spa_config_load(void)
+{
+ void *buf = NULL;
+ nvlist_t *nvlist, *child;
+ nvpair_t *nvpair;
+ spa_t *spa;
+ char pathname[128];
+ struct _buf *file;
+ uint64_t fsize;
+
+ root_mount_wait();
+
+ /*
+ * Open the configuration file.
+ */
+ (void) snprintf(pathname, sizeof (pathname), "%s/%s",
+ spa_config_dir, ZPOOL_CACHE_FILE);
+
+ file = kobj_open_file(pathname);
+ if (file == (struct _buf *)-1)
+ return;
+
+ if (kobj_get_filesize(file, &fsize) != 0)
+ goto out;
+
+ buf = kmem_alloc(fsize, KM_SLEEP);
+
+ /*
+ * Read the nvlist from the file.
+ */
+ if (kobj_read_file(file, buf, fsize, 0) < 0)
+ goto out;
+
+ /*
+ * Unpack the nvlist.
+ */
+ if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
+ goto out;
+
+ /*
+ * Iterate over all elements in the nvlist, creating a new spa_t for
+ * each one with the specified configuration.
+ */
+ mutex_enter(&spa_namespace_lock);
+ nvpair = NULL;
+ while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
+
+ if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
+ continue;
+
+ VERIFY(nvpair_value_nvlist(nvpair, &child) == 0);
+
+ if (spa_lookup(nvpair_name(nvpair)) != NULL)
+ continue;
+ spa = spa_add(nvpair_name(nvpair), NULL);
+
+ /*
+ * We blindly duplicate the configuration here. If it's
+ * invalid, we will catch it when the pool is first opened.
+ */
+ VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0);
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ nvlist_free(nvlist);
+
+out:
+ if (buf != NULL)
+ kmem_free(buf, fsize);
+
+ kobj_close_file(file);
+}
+
+/*
+ * Synchronize all pools to disk. This must be called with the namespace lock
+ * held.
+ */
+void
+spa_config_sync(void)
+{
+ spa_t *spa = NULL;
+ nvlist_t *config;
+ size_t buflen;
+ char *buf;
+ vnode_t *vp;
+ int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
+ char pathname[128];
+ char pathname2[128];
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ /*
+ * Add all known pools to the configuration list, ignoring those with
+ * alternate root paths.
+ */
+ spa = NULL;
+ while ((spa = spa_next(spa)) != NULL) {
+ mutex_enter(&spa->spa_config_cache_lock);
+ if (spa->spa_config && spa->spa_name && spa->spa_root == NULL)
+ VERIFY(nvlist_add_nvlist(config, spa->spa_name,
+ spa->spa_config) == 0);
+ mutex_exit(&spa->spa_config_cache_lock);
+ }
+
+ /*
+ * Pack the configuration into a buffer.
+ */
+ VERIFY(nvlist_size(config, &buflen, NV_ENCODE_XDR) == 0);
+
+ buf = kmem_alloc(buflen, KM_SLEEP);
+
+ VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR,
+ KM_SLEEP) == 0);
+
+ /*
+ * Write the configuration to disk. We need to do the traditional
+ * 'write to temporary file, sync, move over original' to make sure we
+ * always have a consistent view of the data.
+ */
+ (void) snprintf(pathname, sizeof (pathname), "%s/%s", spa_config_dir,
+ ZPOOL_CACHE_TMP);
+
+ if (vn_open(pathname, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) != 0)
+ goto out;
+
+ if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
+ 0, RLIM64_INFINITY, kcred, NULL) == 0 &&
+ VOP_FSYNC(vp, FSYNC, kcred) == 0) {
+ (void) snprintf(pathname2, sizeof (pathname2), "%s/%s",
+ spa_config_dir, ZPOOL_CACHE_FILE);
+ (void) vn_rename(pathname, pathname2, UIO_SYSSPACE);
+ }
+
+ (void) VOP_CLOSE(vp, oflags, 1, 0, kcred);
+ VN_RELE(vp);
+
+out:
+ (void) vn_remove(pathname, UIO_SYSSPACE, RMFILE);
+ spa_config_generation++;
+
+ kmem_free(buf, buflen);
+ nvlist_free(config);
+}
+
+/*
+ * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache,
+ * and we don't want to allow the local zone to see all the pools anyway.
+ * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
+ * information for all pools visible within the zone.
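+ *
+ * A caller passes in the generation number from its previous call; if it
+ * matches the current spa_config_generation, nothing has changed and we
+ * return NULL instead of a fresh nvlist of configs.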
+ */
+nvlist_t *
+spa_all_configs(uint64_t *generation)
+{
+ nvlist_t *pools;
+ spa_t *spa;
+
+ if (*generation == spa_config_generation)
+ return (NULL);
+
+ VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ spa = NULL;
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (INGLOBALZONE(curproc) ||
+ zone_dataset_visible(spa_name(spa), NULL)) {
+ mutex_enter(&spa->spa_config_cache_lock);
+ VERIFY(nvlist_add_nvlist(pools, spa_name(spa),
+ spa->spa_config) == 0);
+ mutex_exit(&spa->spa_config_cache_lock);
+ }
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ *generation = spa_config_generation;
+
+ return (pools);
+}
+
+void
+spa_config_set(spa_t *spa, nvlist_t *config)
+{
+ mutex_enter(&spa->spa_config_cache_lock);
+ if (spa->spa_config != NULL)
+ nvlist_free(spa->spa_config);
+ spa->spa_config = config;
+ mutex_exit(&spa->spa_config_cache_lock);
+}
+
+/*
+ * Generate the pool's configuration based on the current in-core state.
+ * We infer whether to generate a complete config or just one top-level config
+ * based on whether vd is the root vdev.
+ */
+nvlist_t *
+spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
+{
+ nvlist_t *config, *nvroot;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(spa_config_held(spa, RW_READER));
+
+ if (vd == NULL)
+ vd = rvd;
+
+ /*
+ * If txg is -1, report the current value of spa->spa_config_txg.
+ */
+ if (txg == -1ULL)
+ txg = spa->spa_config_txg;
+
+ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
+ spa_name(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ spa_state(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ txg) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ spa_guid(spa)) == 0);
+
+ if (vd != rvd) {
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+ vd->vdev_top->vdev_guid) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ if (vd->vdev_isspare)
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE,
+ 1ULL) == 0);
+ vd = vd->vdev_top; /* label contains top config */
+ }
+
+ nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE);
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+ nvlist_free(nvroot);
+
+ return (config);
+}
+
+/*
+ * Update all disk labels, generate a fresh config based on the current
+ * in-core state, and sync the global config cache.
+ */
+void
+spa_config_update(spa_t *spa, int what)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t txg;
+ int c;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ txg = spa_last_synced_txg(spa) + 1;
+ if (what == SPA_CONFIG_UPDATE_POOL) {
+ vdev_config_dirty(rvd);
+ } else {
+ /*
+ * If we have top-level vdevs that were added but have
+ * not yet been prepared for allocation, do that now.
+ * (It's safe now because the config cache is up to date,
+ * so it will be able to translate the new DVAs.)
+ * See comments in spa_vdev_add() for full details.
+ */
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ if (tvd->vdev_ms_array == 0) {
+ vdev_init(tvd, txg);
+ vdev_config_dirty(tvd);
+ }
+ }
+ }
+ spa_config_exit(spa, FTAG);
+
+ /*
+ * Wait for the mosconfig to be regenerated and synced.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ /*
+ * Update the global config cache to reflect the new mosconfig.
+ */
+ spa_config_sync();
+
+ if (what == SPA_CONFIG_UPDATE_POOL)
+ spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
new file mode 100644
index 0000000..c52acaf
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
@@ -0,0 +1,440 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Routines to manage the on-disk persistent error log.
+ *
+ * Each pool stores a log of all logical data errors seen during normal
+ * operation. This is actually the union of two distinct logs: the last log,
+ * and the current log. All errors seen are logged to the current log. When a
+ * scrub completes, the old last log is thrown out, the current log becomes
+ * the last log, and a new current log is created. This way, if an error is
+ * somehow corrected, a new scrub will show that it no longer exists, and it
+ * will be deleted from the log when the scrub completes.
+ *
+ * The log is stored using a ZAP object whose key is a string form of the
+ * zbookmark tuple (objset, object, level, blkid), and whose value is an
+ * optional 'objset:object' human-readable string describing the data. When an
+ * error is first logged, this string will be empty, indicating that no name is
+ * known. This prevents us from having to issue a potentially large amount of
+ * I/O to discover the object name during an error path. Instead, we do the
+ * calculation when the data is requested, storing the result so future queries
+ * will be faster.
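+ *
+ * For example (illustrative values), the bookmark of objset 0x15, object
+ * 0x1234, level 0, blkid 7 is stored under the ZAP key "15:1234:0:7".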
+ *
+ * This log is then shipped into an nvlist where the key is the dataset name and
+ * the value is the object name. Userland is then responsible for uniquifying
+ * this list and displaying it to the user.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+
+/*
+ * This is a stripped-down version of strtoull, suitable only for converting
+ * lowercase hexadecimal numbers that don't overflow.
+ */
+#ifdef _KERNEL
+static uint64_t
+_strtonum(char *str, char **nptr)
+{
+ uint64_t val = 0;
+ char c;
+ int digit;
+
+ while ((c = *str) != '\0') {
+ if (c >= '0' && c <= '9')
+ digit = c - '0';
+ else if (c >= 'a' && c <= 'f')
+ digit = 10 + c - 'a';
+ else
+ break;
+
+ val *= 16;
+ val += digit;
+
+ str++;
+ }
+
+ *nptr = str;
+
+ return (val);
+}
+#endif
+
+/*
+ * Convert a bookmark to a string.
+ */
+static void
+bookmark_to_name(zbookmark_t *zb, char *buf, size_t len)
+{
+ (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
+ (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
+ (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
+}
+
+/*
+ * Convert a string to a bookmark.
+ */
+#ifdef _KERNEL
+static void
+name_to_bookmark(char *buf, zbookmark_t *zb)
+{
+ zb->zb_objset = _strtonum(buf, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_object = _strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_level = (int)_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_blkid = _strtonum(buf + 1, &buf);
+ ASSERT(*buf == '\0');
+}
+#endif
+
+/*
+ * Log an uncorrectable error to the persistent error log. We add it to the
+ * spa's list of pending errors. The changes are actually synced out to disk
+ * during spa_errlog_sync().
+ */
+void
+spa_log_error(spa_t *spa, zio_t *zio)
+{
+ zbookmark_t *zb = &zio->io_logical->io_bookmark;
+ spa_error_entry_t search;
+ spa_error_entry_t *new;
+ avl_tree_t *tree;
+ avl_index_t where;
+
+ /*
+ * If we are trying to import a pool, ignore any errors, as we won't be
+ * writing to the pool any time soon.
+ */
+ if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+ return;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * If we have had a request to rotate the log, log it to the next list
+ * instead of the current one.
+ */
+ if (spa->spa_scrub_active || spa->spa_scrub_finished)
+ tree = &spa->spa_errlist_scrub;
+ else
+ tree = &spa->spa_errlist_last;
+
+ search.se_bookmark = *zb;
+ if (avl_find(tree, &search, &where) != NULL) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
+ new->se_bookmark = *zb;
+ avl_insert(tree, new, where);
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Return the number of errors currently in the error log. This is actually the
+ * sum of both the last log and the current log, since we don't know the union
+ * of these logs until we reach userland.
+ */
+uint64_t
+spa_get_errlog_size(spa_t *spa)
+{
+ uint64_t total = 0, count;
+
+ mutex_enter(&spa->spa_errlog_lock);
+ if (spa->spa_errlog_scrub != 0 &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
+ &count) == 0)
+ total += count;
+
+ if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+ &count) == 0)
+ total += count;
+ mutex_exit(&spa->spa_errlog_lock);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ total += avl_numnodes(&spa->spa_errlist_last);
+ total += avl_numnodes(&spa->spa_errlist_scrub);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ return (total);
+}
+
+#ifdef _KERNEL
+static int
+process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zbookmark_t zb;
+
+ if (obj == 0)
+ return (0);
+
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+
+ if (*count == 0) {
+ zap_cursor_fini(&zc);
+ return (ENOMEM);
+ }
+
+ name_to_bookmark(za.za_name, &zb);
+
+ if (copyout(&zb, (char *)addr +
+ (*count - 1) * sizeof (zbookmark_t),
+ sizeof (zbookmark_t)) != 0)
+ return (EFAULT);
+
+ *count -= 1;
+ }
+
+ zap_cursor_fini(&zc);
+
+ return (0);
+}
+
+static int
+process_error_list(avl_tree_t *list, void *addr, size_t *count)
+{
+ spa_error_entry_t *se;
+
+ for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+
+ if (*count == 0)
+ return (ENOMEM);
+
+ if (copyout(&se->se_bookmark, (char *)addr +
+ (*count - 1) * sizeof (zbookmark_t),
+ sizeof (zbookmark_t)) != 0)
+ return (EFAULT);
+
+ *count -= 1;
+ }
+
+ return (0);
+}
+#endif
+
+/*
+ * Copy all known errors to userland as an array of bookmarks. This is
+ * actually a union of the on-disk last log and current log, as well as any
+ * pending error requests.
+ *
+ * Because the act of reading the on-disk log could cause errors to be
+ * generated, we have two separate locks: one for the error log and one for the
+ * in-core error lists. We only need the error list lock to log an error, so
+ * we grab the error log lock while we read the on-disk logs, and only pick up
+ * the error list lock when we are finished.
+ */
+int
+spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
+{
+ int ret = 0;
+
+#ifdef _KERNEL
+ mutex_enter(&spa->spa_errlog_lock);
+
+ ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
+
+ if (!ret && !spa->spa_scrub_finished)
+ ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
+ count);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
+ count);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_last, uaddr,
+ count);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ mutex_exit(&spa->spa_errlog_lock);
+#endif
+
+ return (ret);
+}
+
+/*
+ * Called when a scrub completes. This simply sets a bit that tells us which
+ * AVL tree new errors should be added to. spa_errlog_sync() is responsible
+ * for actually syncing the changes to the underlying objects.
+ */
+void
+spa_errlog_rotate(spa_t *spa)
+{
+ mutex_enter(&spa->spa_errlist_lock);
+
+ ASSERT(!spa->spa_scrub_finished);
+ spa->spa_scrub_finished = B_TRUE;
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Discard any pending errors from the spa_t. Called when unloading a faulted
+ * pool, as the errors encountered during the open cannot be synced to disk.
+ */
+void
+spa_errlog_drain(spa_t *spa)
+{
+ spa_error_entry_t *se;
+ void *cookie;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_last,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Process a list of errors into the current on-disk log.
+ */
+static void
+sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
+{
+ spa_error_entry_t *se;
+ char buf[64];
+ void *cookie;
+
+ if (avl_numnodes(t) != 0) {
+ /* create log if necessary */
+ if (*obj == 0)
+ *obj = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE,
+ 0, tx);
+
+ /* add errors to the current log */
+ for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
+ char *name = se->se_name ? se->se_name : "";
+
+ bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
+
+ (void) zap_update(spa->spa_meta_objset,
+ *obj, buf, 1, strlen(name) + 1, name, tx);
+ }
+
+ /* purge the error list */
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ }
+}
+
+/*
+ * Sync the error log out to disk. This is a little tricky because the act of
+ * writing the error log requires the spa_errlist_lock. So, we need to lock the
+ * error lists, take a copy of the lists, and then reinitialize them. Then, we
+ * drop the error list lock and take the error log lock, at which point we
+ * do the errlog processing. That way, if we encounter an I/O error during
+ * this process, we can still add the error to the in-core lists. Note that
+ * this can result in the perpetual recycling of errors, but it is an unlikely
+ * situation and not a performance-critical operation.
+ */
+void
+spa_errlog_sync(spa_t *spa, uint64_t txg)
+{
+ dmu_tx_t *tx;
+ avl_tree_t scrub, last;
+ int scrub_finished;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * Bail out early under normal circumstances.
+ */
+ if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
+ avl_numnodes(&spa->spa_errlist_last) == 0 &&
+ !spa->spa_scrub_finished) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ spa_get_errlists(spa, &last, &scrub);
+ scrub_finished = spa->spa_scrub_finished;
+ spa->spa_scrub_finished = B_FALSE;
+
+ mutex_exit(&spa->spa_errlist_lock);
+ mutex_enter(&spa->spa_errlog_lock);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ /*
+ * Sync out the current list of errors.
+ */
+ sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
+
+ /*
+ * Rotate the log if necessary.
+ */
+ if (scrub_finished) {
+ if (spa->spa_errlog_last != 0)
+ VERIFY(dmu_object_free(spa->spa_meta_objset,
+ spa->spa_errlog_last, tx) == 0);
+ spa->spa_errlog_last = spa->spa_errlog_scrub;
+ spa->spa_errlog_scrub = 0;
+
+ sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
+ }
+
+ /*
+ * Sync out any pending scrub errors.
+ */
+ sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
+
+ /*
+ * Update the MOS to reflect the new values.
+ */
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
+ &spa->spa_errlog_last, tx);
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
+ &spa->spa_errlog_scrub, tx);
+
+ dmu_tx_commit(tx);
+
+ mutex_exit(&spa->spa_errlog_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
new file mode 100644
index 0000000..6642801
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
@@ -0,0 +1,354 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/dsl_synctask.h>
+
+/*
+ * Routines to manage the on-disk history log.
+ *
+ * The history log is stored as a dmu object containing
+ * <packed record length, record nvlist> tuples.
+ *
+ * Where "record nvlist" is an nvlist containing uint64_ts and strings, and
+ * "packed record length" is the packed length of the "record nvlist" stored
+ * as a little-endian uint64_t.
+ *
+ * The log is implemented as a ring buffer, though the original creation
+ * of the pool ('zpool create') is never overwritten.
+ *
+ * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer
+ * of 'spa_history' stores the offsets for logging/retrieving history as
+ * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of
+ * where the 'zpool create' record is stored. This allows us to never
+ * overwrite the original creation of the pool. 'sh_phys_max_off' is the
+ * physical ending offset in bytes of the log. This tells you the length of
+ * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record
+ * is added, 'sh_eof' is incremented by the size of the record.
+ * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes).
+ * This is where the consumer should start reading from after reading in
+ * the 'zpool create' portion of the log.
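+ *
+ * Worked example (illustrative sizes): with 'sh_pool_create_len' = 1024 and
+ * 'sh_phys_max_off' = 8192, the ring covers physical bytes [1024, 8192), and
+ * a logical offset of 9000 maps to ((9000 - 1024) % 7168) + 1024 = 1832;
+ * see spa_history_log_to_phys() below.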
+ *
+ * 'sh_records_lost' keeps track of how many records have been overwritten
+ * and permanently lost.
+ */
+
+typedef enum history_log_type {
+ LOG_CMD_CREATE,
+ LOG_CMD_NO_CREATE
+} history_log_type_t;
+
+typedef struct history_arg {
+ const char *ha_history_str;
+ history_log_type_t ha_log_type;
+} history_arg_t;
+
+/* convert a logical offset to physical */
+static uint64_t
+spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
+{
+ uint64_t phys_len;
+
+ phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len;
+ return ((log_off - shpp->sh_pool_create_len) % phys_len
+ + shpp->sh_pool_create_len);
+}
+
+void
+spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
+{
+ dmu_buf_t *dbp;
+ spa_history_phys_t *shpp;
+ objset_t *mos = spa->spa_meta_objset;
+
+ ASSERT(spa->spa_history == 0);
+ spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
+ SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+ sizeof (spa_history_phys_t), tx);
+
+ VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_HISTORY, sizeof (uint64_t), 1,
+ &spa->spa_history, tx) == 0);
+
+ VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
+ ASSERT(dbp->db_size >= sizeof (spa_history_phys_t));
+
+ shpp = dbp->db_data;
+ dmu_buf_will_dirty(dbp, tx);
+
+ /*
+ * Figure out maximum size of history log. We set it at
+ * 1% of pool size, with a max of 32MB and min of 128KB.
+ */
+ shpp->sh_phys_max_off = spa_get_dspace(spa) / 100;
+ shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20);
+ shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
+
+ dmu_buf_rele(dbp, FTAG);
+}
+
+/*
+ * Change 'sh_bof' to the beginning of the next record.
+ */
+static int
+spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ uint64_t firstread, reclen, phys_bof;
+ char buf[sizeof (reclen)];
+ int err;
+
+ phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp);
+ firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
+
+ if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
+ buf)) != 0)
+ return (err);
+ if (firstread != sizeof (reclen)) {
+ if ((err = dmu_read(mos, spa->spa_history,
+ shpp->sh_pool_create_len, sizeof (reclen) - firstread,
+ buf + firstread)) != 0)
+ return (err);
+ }
+
+ reclen = LE_64(*((uint64_t *)buf));
+ shpp->sh_bof += reclen + sizeof (reclen);
+ shpp->sh_records_lost++;
+ return (0);
+}
+
+static int
+spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp,
+ dmu_tx_t *tx)
+{
+ uint64_t firstwrite, phys_eof;
+ objset_t *mos = spa->spa_meta_objset;
+ int err;
+
+ ASSERT(MUTEX_HELD(&spa->spa_history_lock));
+
+ /* see if we need to reset logical BOF */
+ while (shpp->sh_phys_max_off - shpp->sh_pool_create_len -
+ (shpp->sh_eof - shpp->sh_bof) <= len) {
+ if ((err = spa_history_advance_bof(spa, shpp)) != 0)
+ return (err);
+ }
+
+ phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
+ firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof);
+ shpp->sh_eof += len;
+ dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx);
+
+ len -= firstwrite;
+ if (len > 0) {
+ /* write out the rest at the beginning of physical file */
+ dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len,
+ len, (char *)buf + firstwrite, tx);
+ }
+
+ return (0);
+}
+
+/*
+ * Write out a history event.
+ */
+void
+spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+ spa_t *spa = arg1;
+ history_arg_t *hap = arg2;
+ const char *history_str = hap->ha_history_str;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_buf_t *dbp;
+ spa_history_phys_t *shpp;
+ size_t reclen;
+ uint64_t le_len;
+ nvlist_t *nvrecord;
+ char *record_packed = NULL;
+ int ret;
+
+ if (history_str == NULL)
+ return;
+
+ /*
+ * If we have an older pool that doesn't have a command
+ * history object, create it now.
+ */
+ mutex_enter(&spa->spa_history_lock);
+ if (!spa->spa_history)
+ spa_history_create_obj(spa, tx);
+ mutex_exit(&spa->spa_history_lock);
+
+ /*
+ * Get the offset of where we need to write via the bonus buffer.
+ * Update the offset when the write completes.
+ */
+ VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
+ shpp = dbp->db_data;
+
+ dmu_buf_will_dirty(dbp, tx);
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbp, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
+ }
+#endif
+
+ /* construct an nvlist of the current time and cmd string */
+ VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME,
+ gethrestime_sec()) == 0);
+ VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, history_str) == 0);
+ VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen,
+ NV_ENCODE_XDR, KM_SLEEP) == 0);
+
+ mutex_enter(&spa->spa_history_lock);
+ if (hap->ha_log_type == LOG_CMD_CREATE)
+ VERIFY(shpp->sh_eof == shpp->sh_pool_create_len);
+
+ /* write out the packed length as little endian */
+ le_len = LE_64((uint64_t)reclen);
+ ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
+ if (!ret)
+ ret = spa_history_write(spa, record_packed, reclen, shpp, tx);
+
+ if (!ret && hap->ha_log_type == LOG_CMD_CREATE) {
+ shpp->sh_pool_create_len += sizeof (le_len) + reclen;
+ shpp->sh_bof = shpp->sh_pool_create_len;
+ }
+
+ mutex_exit(&spa->spa_history_lock);
+ nvlist_free(nvrecord);
+ kmem_free(record_packed, reclen);
+ dmu_buf_rele(dbp, FTAG);
+}
+
+/*
+ * Dispatch the writing of a history event as a DSL sync task.
+ */
+int
+spa_history_log(spa_t *spa, const char *history_str, uint64_t pool_create)
+{
+ history_arg_t ha;
+
+ ha.ha_history_str = history_str;
+ ha.ha_log_type = pool_create ? LOG_CMD_CREATE : LOG_CMD_NO_CREATE;
+ return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync,
+ spa, &ha, 0));
+}
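+
+/*
+ * Example call (editor's illustration only): an ioctl handler that has
+ * just changed a pool property might record the command string, passing 0
+ * for 'pool_create' since this is not the initial 'zpool create' record:
+ *
+ *	(void) spa_history_log(spa, "zpool set bootfs=tank/boot tank", 0);
+ */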
+
+/*
+ * Read out the command history.
+ */
+int
+spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_buf_t *dbp;
+ uint64_t read_len, phys_read_off, phys_eof;
+ uint64_t leftover = 0;
+ spa_history_phys_t *shpp;
+ int err;
+
+ /*
+ * If the command history doesn't exist (older pool),
+ * that's ok, just return ENOENT.
+ */
+ if (!spa->spa_history)
+ return (ENOENT);
+
+ if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
+ return (err);
+ shpp = dbp->db_data;
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbp, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
+ }
+#endif
+
+ mutex_enter(&spa->spa_history_lock);
+ phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
+
+ if (*offp < shpp->sh_pool_create_len) {
+ /* read in just the zpool create history */
+ phys_read_off = *offp;
+ read_len = MIN(*len, shpp->sh_pool_create_len -
+ phys_read_off);
+ } else {
+ /*
+ * Need to reset the passed-in offset to BOF if the passed-in
+ * offset has since been overwritten.
+ */
+ *offp = MAX(*offp, shpp->sh_bof);
+ phys_read_off = spa_history_log_to_phys(*offp, shpp);
+
+ /*
+ * Read up to the minimum of what the user passed down or
+ * the EOF (physical or logical). If we hit physical EOF,
+ * use 'leftover' to read from the physical BOF.
+ */
+ if (phys_read_off <= phys_eof) {
+ read_len = MIN(*len, phys_eof - phys_read_off);
+ } else {
+ read_len = MIN(*len,
+ shpp->sh_phys_max_off - phys_read_off);
+ if (phys_read_off + *len > shpp->sh_phys_max_off) {
+ leftover = MIN(*len - read_len,
+ phys_eof - shpp->sh_pool_create_len);
+ }
+ }
+ }
+
+ /* offset for consumer to use next */
+ *offp += read_len + leftover;
+
+ /* tell the consumer how much was actually read */
+ *len = read_len + leftover;
+
+ if (read_len == 0) {
+ mutex_exit(&spa->spa_history_lock);
+ dmu_buf_rele(dbp, FTAG);
+ return (0);
+ }
+
+ err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf);
+ if (leftover && err == 0) {
+ err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
+ leftover, buf + read_len);
+ }
+ mutex_exit(&spa->spa_history_lock);
+
+ dmu_buf_rele(dbp, FTAG);
+ return (err);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
new file mode 100644
index 0000000..1de1e5a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
@@ -0,0 +1,1126 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * SPA locking
+ *
+ * There are four basic locks for managing spa_t structures:
+ *
+ * spa_namespace_lock (global mutex)
+ *
+ * This lock must be acquired to do any of the following:
+ *
+ * - Lookup a spa_t by name
+ * - Add or remove a spa_t from the namespace
+ * - Increase spa_refcount from non-zero
+ * - Check if spa_refcount is zero
+ * - Rename a spa_t
+ * - add/remove/attach/detach devices
+ * - Held for the duration of create/destroy/import/export
+ *
+ * It does not need to handle recursion. A create or destroy may
+ * reference objects (files or zvols) in other pools, but by
+ * definition they must have an existing reference, and will never need
+ * to lookup a spa_t by name.
+ *
+ * spa_refcount (per-spa refcount_t protected by mutex)
+ *
+ * This reference count keeps track of any active users of the spa_t. The
+ * spa_t cannot be destroyed or freed while this is non-zero. Internally,
+ * the refcount is never really 'zero' - opening a pool implicitly keeps
+ * some references in the DMU. Internally we check against SPA_MINREF, but
+ * present the image of a zero/non-zero value to consumers.
+ *
+ * spa_config_lock (per-spa crazy rwlock)
+ *
+ * This special SPA lock is a recursive rwlock, capable of being acquired from
+ * asynchronous threads. It protects the spa_t from config changes,
+ * and must be held in the following circumstances:
+ *
+ * - RW_READER to perform I/O to the spa
+ * - RW_WRITER to change the vdev config
+ *
+ * spa_config_cache_lock (per-spa mutex)
+ *
+ * This mutex prevents the spa_config nvlist from being updated. No
+ * other locks are required to obtain this lock, although implicitly you
+ * must have the namespace lock or non-zero refcount to have any kind
+ * of spa_t pointer at all.
+ *
+ * The locking order is fairly straightforward:
+ *
+ * spa_namespace_lock -> spa_refcount
+ *
+ * The namespace lock must be acquired to increase the refcount from 0
+ * or to check if it is zero.
+ *
+ * spa_refcount -> spa_config_lock
+ *
+ * There must be at least one valid reference on the spa_t to acquire
+ * the config lock.
+ *
+ * spa_namespace_lock -> spa_config_lock
+ *
+ * The namespace lock must always be taken before the config lock.
+ *
+ *
+ * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and
+ * are globally visible.
+ *
+ * The namespace is manipulated using the following functions, all of which
+ * require the spa_namespace_lock to be held.
+ * the spa_namespace_lock to be held.
+ *
+ * spa_lookup() Lookup a spa_t by name.
+ *
+ * spa_add() Create a new spa_t in the namespace.
+ *
+ * spa_remove() Remove a spa_t from the namespace. This also
+ * frees up any memory associated with the spa_t.
+ *
+ * spa_next() Returns the next spa_t in the system, or the
+ * first if NULL is passed.
+ *
+ * spa_evict_all() Shutdown and remove all spa_t structures in
+ * the system.
+ *
+ * spa_guid_exists() Determine whether a pool/device guid exists.
+ *
+ * The spa_refcount is manipulated using the following functions:
+ *
+ * spa_open_ref() Adds a reference to the given spa_t. Must be
+ * called with spa_namespace_lock held if the
+ * refcount is currently zero.
+ *
+ * spa_close() Remove a reference from the spa_t. This will
+ * not free the spa_t or remove it from the
+ * namespace. No locking is required.
+ *
+ * spa_refcount_zero() Returns true if the refcount is currently
+ * zero. Must be called with spa_namespace_lock
+ * held.
+ *
+ * The spa_config_lock is manipulated using the following functions:
+ *
+ * spa_config_enter() Acquire the config lock as RW_READER or
+ * RW_WRITER. At least one reference on the spa_t
+ * must exist.
+ *
+ * spa_config_exit() Release the config lock.
+ *
+ * spa_config_held() Returns true if the config lock is currently
+ * held in the given state.
+ *
+ * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
+ *
+ * spa_vdev_enter() Acquire the namespace lock and the config lock
+ * for writing.
+ *
+ * spa_vdev_exit() Release the config lock, wait for all I/O
+ * to complete, sync the updated configs to the
+ * cache, and release the namespace lock.
+ *
+ * The spa_name() function also requires either the spa_namespace_lock
+ * or the spa_config_lock, as both are needed to do a rename. spa_rename() is
+ * also implemented within this file since is requires manipulation of the
+ * namespace.
+ */
+
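+/*
+ * Editor's sketch (illustrative, not original code): the canonical lock
+ * ordering described above -- namespace lock, then refcount, then config
+ * lock -- applied to a read-only inspection of a pool. Compiled out since
+ * it is an example only.
+ */
+#if 0
+static uint64_t
+example_peek_pool_guid(const char *name)
+{
+	spa_t *spa;
+	uint64_t guid = 0;
+
+	mutex_enter(&spa_namespace_lock);		/* 1: namespace lock */
+	if ((spa = spa_lookup(name)) != NULL) {
+		spa_open_ref(spa, FTAG);		/* 2: refcount */
+		spa_config_enter(spa, RW_READER, FTAG);	/* 3: config lock */
+		guid = spa_guid(spa);
+		spa_config_exit(spa, FTAG);
+		spa_close(spa, FTAG);
+	}
+	mutex_exit(&spa_namespace_lock);
+	return (guid);
+}
+#endif
+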
+static avl_tree_t spa_namespace_avl;
+kmutex_t spa_namespace_lock;
+static kcondvar_t spa_namespace_cv;
+static int spa_active_count;
+int spa_max_replication_override = SPA_DVAS_PER_BP;
+
+static kmutex_t spa_spare_lock;
+static avl_tree_t spa_spare_avl;
+
+kmem_cache_t *spa_buffer_pool;
+int spa_mode;
+
+#ifdef ZFS_DEBUG
+int zfs_flags = ~0;
+#else
+int zfs_flags = 0;
+#endif
+
+/*
+ * zfs_recover can be set to nonzero to attempt to recover from
+ * otherwise-fatal errors, typically caused by on-disk corruption. When
+ * set, calls to zfs_panic_recover() will turn into warning messages.
+ */
+int zfs_recover = 0;
+
+#define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */
+
+/*
+ * ==========================================================================
+ * SPA namespace functions
+ * ==========================================================================
+ */
+
+/*
+ * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held.
+ * Returns NULL if no matching spa_t is found.
+ */
+spa_t *
+spa_lookup(const char *name)
+{
+ spa_t search, *spa;
+ avl_index_t where;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ search.spa_name = (char *)name;
+ spa = avl_find(&spa_namespace_avl, &search, &where);
+
+ return (spa);
+}
+
+/*
+ * Create an uninitialized spa_t with the given name. Requires
+ * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
+ * exist by calling spa_lookup() first.
+ */
+spa_t *
+spa_add(const char *name, const char *altroot)
+{
+ spa_t *spa;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
+
+ spa->spa_name = spa_strdup(name);
+ spa->spa_state = POOL_STATE_UNINITIALIZED;
+ spa->spa_freeze_txg = UINT64_MAX;
+ spa->spa_final_txg = UINT64_MAX;
+
+ mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ cv_init(&spa->spa_scrub_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
+
+ refcount_create(&spa->spa_refcount);
+ refcount_create(&spa->spa_config_lock.scl_count);
+
+ avl_add(&spa_namespace_avl, spa);
+
+ /*
+ * Set the alternate root, if there is one.
+ */
+ if (altroot) {
+ spa->spa_root = spa_strdup(altroot);
+ spa_active_count++;
+ }
+
+ return (spa);
+}
+
+/*
+ * Removes a spa_t from the namespace, freeing up any memory used. Requires
+ * spa_namespace_lock. This is called only after the spa_t has been closed and
+ * deactivated.
+ */
+void
+spa_remove(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+ ASSERT(spa->spa_scrub_thread == NULL);
+
+ avl_remove(&spa_namespace_avl, spa);
+ cv_broadcast(&spa_namespace_cv);
+
+ if (spa->spa_root) {
+ spa_strfree(spa->spa_root);
+ spa_active_count--;
+ }
+
+ if (spa->spa_name)
+ spa_strfree(spa->spa_name);
+
+ spa_config_set(spa, NULL);
+
+ refcount_destroy(&spa->spa_refcount);
+ refcount_destroy(&spa->spa_config_lock.scl_count);
+
+ cv_destroy(&spa->spa_async_cv);
+ cv_destroy(&spa->spa_scrub_io_cv);
+ cv_destroy(&spa->spa_scrub_cv);
+
+ mutex_destroy(&spa->spa_scrub_lock);
+ mutex_destroy(&spa->spa_async_lock);
+ mutex_destroy(&spa->spa_config_cache_lock);
+
+ kmem_free(spa, sizeof (spa_t));
+}
+
+/*
+ * Given a pool, return the next pool in the namespace, or NULL if there is
+ * none. If 'prev' is NULL, return the first pool.
+ */
+spa_t *
+spa_next(spa_t *prev)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ if (prev)
+ return (AVL_NEXT(&spa_namespace_avl, prev));
+ else
+ return (avl_first(&spa_namespace_avl));
+}
+
+/*
+ * ==========================================================================
+ * SPA refcount functions
+ * ==========================================================================
+ */
+
+/*
+ * Add a reference to the given spa_t. Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_open_ref(spa_t *spa, void *tag)
+{
+ ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
+ MUTEX_HELD(&spa_namespace_lock));
+
+ (void) refcount_add(&spa->spa_refcount, tag);
+}
+
+/*
+ * Remove a reference to the given spa_t. Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_close(spa_t *spa, void *tag)
+{
+ ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
+ MUTEX_HELD(&spa_namespace_lock));
+
+ (void) refcount_remove(&spa->spa_refcount, tag);
+}
+
+/*
+ * Check to see if the spa refcount is zero. Must be called with
+ * spa_namespace_lock held. We really compare against SPA_MINREF, which is the
+ * number of references acquired when opening a pool.
+ */
+boolean_t
+spa_refcount_zero(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ return (refcount_count(&spa->spa_refcount) == SPA_MINREF);
+}
+
+/*
+ * ==========================================================================
+ * SPA spare tracking
+ * ==========================================================================
+ */
+
+/*
+ * Spares are tracked globally due to the following constraints:
+ *
+ * - A spare may be part of multiple pools.
+ * - A spare may be added to a pool even if it's actively in use within
+ * another pool.
+ * - A spare in use in any pool can only be the source of a replacement if
+ * the target is a spare in the same pool.
+ *
+ * We keep track of all spares on the system through the use of a reference
+ * counted AVL tree. When a vdev is added as a spare, or used as a replacement
+ * spare, then we bump the reference count in the AVL tree. In addition, we set
+ * the 'vdev_isspare' member to indicate that the device is a spare (active or
+ * inactive). When a spare is made active (used to replace a device in the
+ * pool), we also keep track of which pool it has been made a part of.
+ *
+ * The 'spa_spare_lock' protects the AVL tree. These functions are normally
+ * called under the spa_namespace lock as part of vdev reconfiguration. The
+ * separate spare lock exists for the status query path, which does not need to
+ * be completely consistent with respect to other vdev configuration changes.
+ */
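+
+/*
+ * Example query (editor's illustration): an import path that wants to know
+ * whether a candidate device guid is already a spare, and if so where it
+ * is active (a returned pool guid of 0 means registered but not currently
+ * in use), might do:
+ *
+ *	uint64_t pool;
+ *	if (spa_spare_exists(vd->vdev_guid, &pool) && pool != 0)
+ *		... the device is actively spared into pool 'pool' ...
+ */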
+
+typedef struct spa_spare {
+ uint64_t spare_guid;
+ uint64_t spare_pool;
+ avl_node_t spare_avl;
+ int spare_count;
+} spa_spare_t;
+
+static int
+spa_spare_compare(const void *a, const void *b)
+{
+ const spa_spare_t *sa = a;
+ const spa_spare_t *sb = b;
+
+ if (sa->spare_guid < sb->spare_guid)
+ return (-1);
+ else if (sa->spare_guid > sb->spare_guid)
+ return (1);
+ else
+ return (0);
+}
+
+void
+spa_spare_add(vdev_t *vd)
+{
+ avl_index_t where;
+ spa_spare_t search;
+ spa_spare_t *spare;
+
+ mutex_enter(&spa_spare_lock);
+ ASSERT(!vd->vdev_isspare);
+
+ search.spare_guid = vd->vdev_guid;
+ if ((spare = avl_find(&spa_spare_avl, &search, &where)) != NULL) {
+ spare->spare_count++;
+ } else {
+ spare = kmem_zalloc(sizeof (spa_spare_t), KM_SLEEP);
+ spare->spare_guid = vd->vdev_guid;
+ spare->spare_count = 1;
+ avl_insert(&spa_spare_avl, spare, where);
+ }
+ vd->vdev_isspare = B_TRUE;
+
+ mutex_exit(&spa_spare_lock);
+}
+
+void
+spa_spare_remove(vdev_t *vd)
+{
+ spa_spare_t search;
+ spa_spare_t *spare;
+ avl_index_t where;
+
+ mutex_enter(&spa_spare_lock);
+
+ search.spare_guid = vd->vdev_guid;
+ spare = avl_find(&spa_spare_avl, &search, &where);
+
+ ASSERT(vd->vdev_isspare);
+ ASSERT(spare != NULL);
+
+ if (--spare->spare_count == 0) {
+ avl_remove(&spa_spare_avl, spare);
+ kmem_free(spare, sizeof (spa_spare_t));
+ } else if (spare->spare_pool == spa_guid(vd->vdev_spa)) {
+ spare->spare_pool = 0ULL;
+ }
+
+ vd->vdev_isspare = B_FALSE;
+ mutex_exit(&spa_spare_lock);
+}
+
+boolean_t
+spa_spare_exists(uint64_t guid, uint64_t *pool)
+{
+ spa_spare_t search, *found;
+ avl_index_t where;
+
+ mutex_enter(&spa_spare_lock);
+
+ search.spare_guid = guid;
+ found = avl_find(&spa_spare_avl, &search, &where);
+
+ if (pool) {
+ if (found)
+ *pool = found->spare_pool;
+ else
+ *pool = 0ULL;
+ }
+
+ mutex_exit(&spa_spare_lock);
+
+ return (found != NULL);
+}
+
+void
+spa_spare_activate(vdev_t *vd)
+{
+ spa_spare_t search, *found;
+ avl_index_t where;
+
+ mutex_enter(&spa_spare_lock);
+ ASSERT(vd->vdev_isspare);
+
+ search.spare_guid = vd->vdev_guid;
+ found = avl_find(&spa_spare_avl, &search, &where);
+ ASSERT(found != NULL);
+ ASSERT(found->spare_pool == 0ULL);
+
+ found->spare_pool = spa_guid(vd->vdev_spa);
+ mutex_exit(&spa_spare_lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA config locking
+ * ==========================================================================
+ */
+
+/*
+ * Acquire the config lock. The config lock is a special rwlock that allows for
+ * recursive enters. Because these enters come from the same thread as well as
+ * asynchronous threads working on behalf of the owner, we must unilaterally
+ * allow all read access as long as at least one reader holds the lock (even
+ * if a write is requested). This has the side effect of write starvation,
+ * but write locks are extremely rare, and a solution to this problem would
+ * be significantly more complex (if even possible).
+ *
+ * We would like to assert that the namespace lock isn't held, but this is a
+ * valid use during create.
+ */
+void
+spa_config_enter(spa_t *spa, krw_t rw, void *tag)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+
+ mutex_enter(&scl->scl_lock);
+
+ if (scl->scl_writer != curthread) {
+ if (rw == RW_READER) {
+ while (scl->scl_writer != NULL)
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ } else {
+ while (scl->scl_writer != NULL ||
+ !refcount_is_zero(&scl->scl_count))
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ scl->scl_writer = curthread;
+ }
+ }
+
+ (void) refcount_add(&scl->scl_count, tag);
+
+ mutex_exit(&scl->scl_lock);
+}
+
+/*
+ * Release the spa config lock, notifying any waiters in the process.
+ */
+void
+spa_config_exit(spa_t *spa, void *tag)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+
+ mutex_enter(&scl->scl_lock);
+
+ ASSERT(!refcount_is_zero(&scl->scl_count));
+ if (refcount_remove(&scl->scl_count, tag) == 0) {
+ cv_broadcast(&scl->scl_cv);
+ scl->scl_writer = NULL; /* OK in either case */
+ }
+
+ mutex_exit(&scl->scl_lock);
+}
+
+/*
+ * Returns true if the config lock is held in the given manner.
+ */
+boolean_t
+spa_config_held(spa_t *spa, krw_t rw)
+{
+ spa_config_lock_t *scl = &spa->spa_config_lock;
+ boolean_t held;
+
+ mutex_enter(&scl->scl_lock);
+ if (rw == RW_WRITER)
+ held = (scl->scl_writer == curthread);
+ else
+ held = !refcount_is_zero(&scl->scl_count);
+ mutex_exit(&scl->scl_lock);
+
+ return (held);
+}
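+
+/*
+ * Typical use (editor's note): spa_config_held() mostly backs assertions,
+ * e.g. ASSERT(spa_config_held(spa, RW_WRITER)) at the top of a function
+ * that rewrites the vdev tree.
+ */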
+
+/*
+ * ==========================================================================
+ * SPA vdev locking
+ * ==========================================================================
+ */
+
+/*
+ * Lock the given spa_t for the purpose of adding or removing a vdev.
+ * Grabs the global spa_namespace_lock plus the spa config lock for writing.
+ * It returns the next transaction group for the spa_t.
+ */
+uint64_t
+spa_vdev_enter(spa_t *spa)
+{
+ /*
+ * Suspend scrub activity while we mess with the config.
+ */
+ spa_scrub_suspend(spa);
+
+ mutex_enter(&spa_namespace_lock);
+
+ spa_config_enter(spa, RW_WRITER, spa);
+
+ return (spa_last_synced_txg(spa) + 1);
+}
+
+/*
+ * Unlock the spa_t after adding or removing a vdev. Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions have
+ * synced to disk, and then update the global configuration cache with the new
+ * information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+ int config_changed = B_FALSE;
+
+ ASSERT(txg > spa_last_synced_txg(spa));
+
+ /*
+ * Reassess the DTLs.
+ */
+ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
+
+ /*
+ * If the config changed, notify the scrub thread that it must restart.
+ */
+ if (error == 0 && !list_is_empty(&spa->spa_dirty_list)) {
+ config_changed = B_TRUE;
+ spa_scrub_restart(spa, txg);
+ }
+
+ spa_config_exit(spa, spa);
+
+ /*
+ * Allow scrubbing to resume.
+ */
+ spa_scrub_resume(spa);
+
+ /*
+ * Note: this txg_wait_synced() is important because it ensures
+ * that there won't be more than one config change per txg.
+ * This allows us to use the txg as the generation number.
+ */
+ if (error == 0)
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ if (vd != NULL) {
+ ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0);
+ vdev_free(vd);
+ }
+
+ /*
+ * If the config changed, update the config cache.
+ */
+ if (config_changed)
+ spa_config_sync();
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (error);
+}
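+
+/*
+ * Usage sketch (editor's illustration, not original code): config changes
+ * are bracketed by these two calls, with the error funneled through
+ * spa_vdev_exit() so the unlock path is never skipped:
+ *
+ *	uint64_t txg = spa_vdev_enter(spa);
+ *	... add/remove/modify vdevs, dirtying the config ...
+ *	return (spa_vdev_exit(spa, vd_to_free_or_NULL, txg, error));
+ */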
+
+/*
+ * ==========================================================================
+ * Miscellaneous functions
+ * ==========================================================================
+ */
+
+/*
+ * Rename a spa_t.
+ */
+int
+spa_rename(const char *name, const char *newname)
+{
+ spa_t *spa;
+ int err;
+
+ /*
+ * Lookup the spa_t and grab the config lock for writing. We need to
+ * actually open the pool so that we can sync out the necessary labels.
+ * It's OK to call spa_open() with the namespace lock held because we
+ * allow recursive calls for other reasons.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if ((err = spa_open(name, &spa, FTAG)) != 0) {
+ mutex_exit(&spa_namespace_lock);
+ return (err);
+ }
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ avl_remove(&spa_namespace_avl, spa);
+ spa_strfree(spa->spa_name);
+ spa->spa_name = spa_strdup(newname);
+ avl_add(&spa_namespace_avl, spa);
+
+ /*
+ * Sync all labels to disk with the new names by marking the root vdev
+ * dirty and waiting for it to sync. It will pick up the new pool name
+ * during the sync.
+ */
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ spa_config_exit(spa, FTAG);
+
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ /*
+ * Sync the updated config cache.
+ */
+ spa_config_sync();
+
+ spa_close(spa, FTAG);
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+
+/*
+ * Determine whether a pool with given pool_guid exists. If device_guid is
+ * non-zero, determine whether the pool exists *and* contains a device with the
+ * specified device_guid.
+ */
+boolean_t
+spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+{
+ spa_t *spa;
+ avl_tree_t *t = &spa_namespace_avl;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ continue;
+ if (spa->spa_root_vdev == NULL)
+ continue;
+ if (spa_guid(spa) == pool_guid) {
+ if (device_guid == 0)
+ break;
+
+ if (vdev_lookup_by_guid(spa->spa_root_vdev,
+ device_guid) != NULL)
+ break;
+
+ /*
+ * Check any devices we may be in the process of adding.
+ */
+ if (spa->spa_pending_vdev) {
+ if (vdev_lookup_by_guid(spa->spa_pending_vdev,
+ device_guid) != NULL)
+ break;
+ }
+ }
+ }
+
+ return (spa != NULL);
+}
+
+char *
+spa_strdup(const char *s)
+{
+ size_t len;
+ char *new;
+
+ len = strlen(s);
+ new = kmem_alloc(len + 1, KM_SLEEP);
+ bcopy(s, new, len);
+ new[len] = '\0';
+
+ return (new);
+}
+
+void
+spa_strfree(char *s)
+{
+ kmem_free(s, strlen(s) + 1);
+}
+
+uint64_t
+spa_get_random(uint64_t range)
+{
+ uint64_t r;
+
+ ASSERT(range != 0);
+
+ (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
+
+ return (r % range);
+}
+
+void
+sprintf_blkptr(char *buf, int len, const blkptr_t *bp)
+{
+ int d;
+
+ if (bp == NULL) {
+ (void) snprintf(buf, len, "<NULL>");
+ return;
+ }
+
+ if (BP_IS_HOLE(bp)) {
+ (void) snprintf(buf, len, "<hole>");
+ return;
+ }
+
+ (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ",
+ (u_longlong_t)BP_GET_LEVEL(bp),
+ dmu_ot[BP_GET_TYPE(bp)].ot_name,
+ (u_longlong_t)BP_GET_LSIZE(bp),
+ (u_longlong_t)BP_GET_PSIZE(bp));
+
+ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ const dva_t *dva = &bp->blk_dva[d];
+ (void) snprintf(buf + strlen(buf), len - strlen(buf),
+ "DVA[%d]=<%llu:%llx:%llx> ", d,
+ (u_longlong_t)DVA_GET_VDEV(dva),
+ (u_longlong_t)DVA_GET_OFFSET(dva),
+ (u_longlong_t)DVA_GET_ASIZE(dva));
+ }
+
+ (void) snprintf(buf + strlen(buf), len - strlen(buf),
+ "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
+ zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
+ zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
+ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
+ BP_IS_GANG(bp) ? "gang" : "contiguous",
+ (u_longlong_t)bp->blk_birth,
+ (u_longlong_t)bp->blk_fill,
+ (u_longlong_t)bp->blk_cksum.zc_word[0],
+ (u_longlong_t)bp->blk_cksum.zc_word[1],
+ (u_longlong_t)bp->blk_cksum.zc_word[2],
+ (u_longlong_t)bp->blk_cksum.zc_word[3]);
+}
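+
+/*
+ * Sample output (editor's illustration; the field values are made up):
+ *
+ *	[L0 ZFS plain file] 20000L/20000P DVA[0]=<0:35000:20000>
+ *	fletcher4 uncompressed LE contiguous birth=4 fill=1
+ *	cksum=1234:5678:9abc:def0
+ */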
+
+void
+spa_freeze(spa_t *spa)
+{
+ uint64_t freeze_txg = 0;
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ if (spa->spa_freeze_txg == UINT64_MAX) {
+ freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
+ spa->spa_freeze_txg = freeze_txg;
+ }
+ spa_config_exit(spa, FTAG);
+ if (freeze_txg != 0)
+ txg_wait_synced(spa_get_dsl(spa), freeze_txg);
+}
+
+void
+zfs_panic_recover(const char *fmt, ...)
+{
+ va_list adx;
+
+ va_start(adx, fmt);
+ vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
+ va_end(adx);
+}
+
+/*
+ * ==========================================================================
+ * Accessor functions
+ * ==========================================================================
+ */
+
+krwlock_t *
+spa_traverse_rwlock(spa_t *spa)
+{
+ return (&spa->spa_traverse_lock);
+}
+
+int
+spa_traverse_wanted(spa_t *spa)
+{
+ return (spa->spa_traverse_wanted);
+}
+
+dsl_pool_t *
+spa_get_dsl(spa_t *spa)
+{
+ return (spa->spa_dsl_pool);
+}
+
+blkptr_t *
+spa_get_rootblkptr(spa_t *spa)
+{
+ return (&spa->spa_ubsync.ub_rootbp);
+}
+
+void
+spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
+{
+ spa->spa_uberblock.ub_rootbp = *bp;
+}
+
+void
+spa_altroot(spa_t *spa, char *buf, size_t buflen)
+{
+ if (spa->spa_root == NULL)
+ buf[0] = '\0';
+ else
+ (void) strncpy(buf, spa->spa_root, buflen);
+}
+
+int
+spa_sync_pass(spa_t *spa)
+{
+ return (spa->spa_sync_pass);
+}
+
+char *
+spa_name(spa_t *spa)
+{
+ /*
+ * Accessing the name requires holding either the namespace lock or the
+ * config lock, both of which are required to do a rename.
+ */
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER));
+
+ return (spa->spa_name);
+}
+
+uint64_t
+spa_guid(spa_t *spa)
+{
+ /*
+ * If we fail to parse the config during spa_load(), we can go through
+ * the error path (which posts an ereport) and end up here with no root
+ * vdev. We stash the original pool guid in 'spa_load_guid' to handle
+ * this case.
+ */
+ if (spa->spa_root_vdev != NULL)
+ return (spa->spa_root_vdev->vdev_guid);
+ else
+ return (spa->spa_load_guid);
+}
+
+uint64_t
+spa_last_synced_txg(spa_t *spa)
+{
+ return (spa->spa_ubsync.ub_txg);
+}
+
+uint64_t
+spa_first_txg(spa_t *spa)
+{
+ return (spa->spa_first_txg);
+}
+
+int
+spa_state(spa_t *spa)
+{
+ return (spa->spa_state);
+}
+
+uint64_t
+spa_freeze_txg(spa_t *spa)
+{
+ return (spa->spa_freeze_txg);
+}
+
+/*
+ * In the future, this may select among different metaslab classes
+ * depending on the zdp. For now, there's no such distinction.
+ */
+metaslab_class_t *
+spa_metaslab_class_select(spa_t *spa)
+{
+ return (spa->spa_normal_class);
+}
+
+/*
+ * Return how much space is allocated in the pool (i.e. the sum of all asize).
+ */
+uint64_t
+spa_get_alloc(spa_t *spa)
+{
+ return (spa->spa_root_vdev->vdev_stat.vs_alloc);
+}
+
+/*
+ * Return how much (raid-z inflated) space there is in the pool.
+ */
+uint64_t
+spa_get_space(spa_t *spa)
+{
+ return (spa->spa_root_vdev->vdev_stat.vs_space);
+}
+
+/*
+ * Return the amount of raid-z-deflated space in the pool.
+ */
+uint64_t
+spa_get_dspace(spa_t *spa)
+{
+ if (spa->spa_deflate)
+ return (spa->spa_root_vdev->vdev_stat.vs_dspace);
+ else
+ return (spa->spa_root_vdev->vdev_stat.vs_space);
+}
+
+/* ARGSUSED */
+uint64_t
+spa_get_asize(spa_t *spa, uint64_t lsize)
+{
+ /*
+ * For now, the worst case is 512-byte RAID-Z blocks, in which
+ * case the space requirement is exactly 2x; so just assume that.
+ * Add to this the fact that we can have up to 3 DVAs per bp, and
+ * we have to multiply by a total of 6x.
+ */
+ return (lsize * 6);
+}
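+
+/*
+ * Example (illustrative): reserving for a 128K logical write therefore
+ * charges 128K * 6 = 768K -- the 2x raid-z worst case times up to 3 DVAs.
+ */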
+
+uint64_t
+spa_version(spa_t *spa)
+{
+ return (spa->spa_ubsync.ub_version);
+}
+
+int
+spa_max_replication(spa_t *spa)
+{
+ /*
+ * As of ZFS_VERSION == ZFS_VERSION_DITTO_BLOCKS, we are able to
+ * handle BPs with more than one DVA allocated. Set our max
+ * replication level accordingly.
+ */
+ if (spa_version(spa) < ZFS_VERSION_DITTO_BLOCKS)
+ return (1);
+ return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
+}
+
+uint64_t
+bp_get_dasize(spa_t *spa, const blkptr_t *bp)
+{
+ int sz = 0, i;
+
+ if (!spa->spa_deflate)
+ return (BP_GET_ASIZE(bp));
+
+ for (i = 0; i < SPA_DVAS_PER_BP; i++) {
+ vdev_t *vd =
+ vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
+ sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> SPA_MINBLOCKSHIFT) *
+ vd->vdev_deflate_ratio;
+ }
+ return (sz);
+}
+
+/*
+ * ==========================================================================
+ * Initialization and Termination
+ * ==========================================================================
+ */
+
+static int
+spa_name_compare(const void *a1, const void *a2)
+{
+ const spa_t *s1 = a1;
+ const spa_t *s2 = a2;
+ int s;
+
+ s = strcmp(s1->spa_name, s2->spa_name);
+ if (s > 0)
+ return (1);
+ if (s < 0)
+ return (-1);
+ return (0);
+}
+
+int
+spa_busy(void)
+{
+ return (spa_active_count);
+}
+
+void
+spa_init(int mode)
+{
+ mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
+
+ avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
+ offsetof(spa_t, spa_avl));
+
+ mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_spare_t),
+ offsetof(spa_spare_t, spare_avl));
+
+ spa_mode = mode;
+
+ refcount_init();
+ unique_init();
+ zio_init();
+ dmu_init();
+ zil_init();
+ spa_config_load();
+}
+
+void
+spa_fini(void)
+{
+ spa_evict_all();
+
+ zil_fini();
+ dmu_fini();
+ zio_fini();
+ refcount_fini();
+
+ avl_destroy(&spa_namespace_avl);
+ avl_destroy(&spa_spare_avl);
+
+ cv_destroy(&spa_namespace_cv);
+ mutex_destroy(&spa_namespace_lock);
+ mutex_destroy(&spa_spare_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
new file mode 100644
index 0000000..23313a9
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
@@ -0,0 +1,501 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zio.h>
+#include <sys/space_map.h>
+
+/*
+ * Space map routines.
+ * NOTE: caller is responsible for all locking.
+ */
+static int
+space_map_seg_compare(const void *x1, const void *x2)
+{
+ const space_seg_t *s1 = x1;
+ const space_seg_t *s2 = x2;
+
+ if (s1->ss_start < s2->ss_start) {
+ if (s1->ss_end > s2->ss_start)
+ return (0);
+ return (-1);
+ }
+ if (s1->ss_start > s2->ss_start) {
+ if (s1->ss_start < s2->ss_end)
+ return (0);
+ return (1);
+ }
+ return (0);
+}
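+
+/*
+ * Note (editor's illustration): the comparator treats overlapping segments
+ * as equal, so avl_find() with a search segment returns whichever existing
+ * segment it overlaps -- e.g. searching for [5,15) in a tree holding [0,10)
+ * returns [0,10). The add/remove routines below rely on this.
+ */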
+
+void
+space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
+ kmutex_t *lp)
+{
+ bzero(sm, sizeof (*sm));
+
+ cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL);
+ avl_create(&sm->sm_root, space_map_seg_compare,
+ sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
+
+ sm->sm_start = start;
+ sm->sm_size = size;
+ sm->sm_shift = shift;
+ sm->sm_lock = lp;
+}
+
+void
+space_map_destroy(space_map_t *sm)
+{
+ ASSERT(!sm->sm_loaded && !sm->sm_loading);
+ VERIFY3U(sm->sm_space, ==, 0);
+ avl_destroy(&sm->sm_root);
+ cv_destroy(&sm->sm_load_cv);
+}
+
+void
+space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_index_t where;
+ space_seg_t ssearch, *ss_before, *ss_after, *ss;
+ uint64_t end = start + size;
+ int merge_before, merge_after;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY(size != 0);
+ VERIFY3U(start, >=, sm->sm_start);
+ VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
+ VERIFY(sm->sm_space + size <= sm->sm_size);
+ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+ ssearch.ss_start = start;
+ ssearch.ss_end = end;
+ ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+ if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) {
+ zfs_panic_recover("zfs: allocating allocated segment"
+ "(offset=%llu size=%llu)\n",
+ (longlong_t)start, (longlong_t)size);
+ return;
+ }
+
+ /* Make sure we don't overlap with either of our neighbors */
+ VERIFY(ss == NULL);
+
+ ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
+ ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);
+
+ merge_before = (ss_before != NULL && ss_before->ss_end == start);
+ merge_after = (ss_after != NULL && ss_after->ss_start == end);
+
+ if (merge_before && merge_after) {
+ avl_remove(&sm->sm_root, ss_before);
+ ss_after->ss_start = ss_before->ss_start;
+ kmem_free(ss_before, sizeof (*ss_before));
+ } else if (merge_before) {
+ ss_before->ss_end = end;
+ } else if (merge_after) {
+ ss_after->ss_start = start;
+ } else {
+ ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
+ ss->ss_start = start;
+ ss->ss_end = end;
+ avl_insert(&sm->sm_root, ss, where);
+ }
+
+ sm->sm_space += size;
+}
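+
+/*
+ * Worked example (illustrative, shift 0): with existing segments [0,10)
+ * and [20,30), space_map_add(sm, 10, 10) sees merge_before (ss_end of
+ * [0,10) == 10) and merge_after (ss_start of [20,30) == 20), so the two
+ * neighbors collapse into the single segment [0,30).
+ */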
+
+void
+space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_index_t where;
+ space_seg_t ssearch, *ss, *newseg;
+ uint64_t end = start + size;
+ int left_over, right_over;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY(size != 0);
+ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+ ssearch.ss_start = start;
+ ssearch.ss_end = end;
+ ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+ /* Make sure we completely overlap with someone */
+ if (ss == NULL) {
+ zfs_panic_recover("zfs: freeing free segment "
+ "(offset=%llu size=%llu)",
+ (longlong_t)start, (longlong_t)size);
+ return;
+ }
+ VERIFY3U(ss->ss_start, <=, start);
+ VERIFY3U(ss->ss_end, >=, end);
+ VERIFY(sm->sm_space - size <= sm->sm_size);
+
+ left_over = (ss->ss_start != start);
+ right_over = (ss->ss_end != end);
+
+ if (left_over && right_over) {
+ newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
+ newseg->ss_start = end;
+ newseg->ss_end = ss->ss_end;
+ ss->ss_end = start;
+ avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
+ } else if (left_over) {
+ ss->ss_end = start;
+ } else if (right_over) {
+ ss->ss_start = end;
+ } else {
+ avl_remove(&sm->sm_root, ss);
+ kmem_free(ss, sizeof (*ss));
+ }
+
+ sm->sm_space -= size;
+}
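+
+/*
+ * Worked example (illustrative): removing [10,20) from a map holding
+ * [0,30) is both left_over and right_over, so the segment is split into
+ * [0,10) and a newly allocated [20,30).
+ */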
+
+int
+space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_index_t where;
+ space_seg_t ssearch, *ss;
+ uint64_t end = start + size;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ VERIFY(size != 0);
+ VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+ VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+ ssearch.ss_start = start;
+ ssearch.ss_end = end;
+ ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+ return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end);
+}
+
+void
+space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
+{
+ space_seg_t *ss;
+ void *cookie = NULL;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
+ if (func != NULL)
+ func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
+ kmem_free(ss, sizeof (*ss));
+ }
+ sm->sm_space = 0;
+}
+
+void
+space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
+{
+ space_seg_t *ss;
+
+ for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+ func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
+}
+
+void
+space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ avl_index_t where;
+ space_seg_t *ss, search;
+ uint64_t end = start + size;
+ uint64_t rm_start, rm_end;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ search.ss_start = start;
+ search.ss_end = start;
+
+ for (;;) {
+ ss = avl_find(t, &search, &where);
+
+ if (ss == NULL)
+ ss = avl_nearest(t, where, AVL_AFTER);
+
+ if (ss == NULL || ss->ss_start >= end)
+ break;
+
+ rm_start = MAX(ss->ss_start, start);
+ rm_end = MIN(ss->ss_end, end);
+
+ space_map_remove(sm, rm_start, rm_end - rm_start);
+ }
+}
+
+/*
+ * Replace smd with the union of smd and sms.
+ */
+void
+space_map_union(space_map_t *smd, space_map_t *sms)
+{
+ avl_tree_t *t = &sms->sm_root;
+ space_seg_t *ss;
+
+ ASSERT(MUTEX_HELD(smd->sm_lock));
+
+ /*
+ * For each source segment, remove any intersections with the
+ * destination, then add the source segment to the destination.
+ */
+ for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
+ space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
+ space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
+ }
+}
+
+/*
+ * Wait for any in-progress space_map_load() to complete.
+ */
+void
+space_map_load_wait(space_map_t *sm)
+{
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ while (sm->sm_loading)
+ cv_wait(&sm->sm_load_cv, sm->sm_lock);
+}
+
+/*
+ * Note: space_map_load() will drop sm_lock across dmu_read() calls.
+ * The caller must be OK with this.
+ */
+int
+space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
+ space_map_obj_t *smo, objset_t *os)
+{
+ uint64_t *entry, *entry_map, *entry_map_end;
+ uint64_t bufsize, size, offset, end, space;
+ uint64_t mapstart = sm->sm_start;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ space_map_load_wait(sm);
+
+ if (sm->sm_loaded)
+ return (0);
+
+ sm->sm_loading = B_TRUE;
+ end = smo->smo_objsize;
+ space = smo->smo_alloc;
+
+ ASSERT(sm->sm_ops == NULL);
+ VERIFY3U(sm->sm_space, ==, 0);
+
+ if (maptype == SM_FREE) {
+ space_map_add(sm, sm->sm_start, sm->sm_size);
+ space = sm->sm_size - space;
+ }
+
+ bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT;
+ entry_map = zio_buf_alloc(bufsize);
+
+ mutex_exit(sm->sm_lock);
+ if (end > bufsize)
+ dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize);
+ mutex_enter(sm->sm_lock);
+
+ for (offset = 0; offset < end; offset += bufsize) {
+ size = MIN(end - offset, bufsize);
+ VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
+ VERIFY(size != 0);
+
+ dprintf("object=%llu offset=%llx size=%llx\n",
+ smo->smo_object, offset, size);
+
+ mutex_exit(sm->sm_lock);
+ VERIFY3U(dmu_read(os, smo->smo_object, offset, size,
+ entry_map), ==, 0);
+ mutex_enter(sm->sm_lock);
+
+ entry_map_end = entry_map + (size / sizeof (uint64_t));
+ for (entry = entry_map; entry < entry_map_end; entry++) {
+ uint64_t e = *entry;
+
+ if (SM_DEBUG_DECODE(e)) /* Skip debug entries */
+ continue;
+
+ (SM_TYPE_DECODE(e) == maptype ?
+ space_map_add : space_map_remove)(sm,
+ (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
+ SM_RUN_DECODE(e) << sm->sm_shift);
+ }
+ }
+ VERIFY3U(sm->sm_space, ==, space);
+
+ zio_buf_free(entry_map, bufsize);
+
+ sm->sm_loading = B_FALSE;
+ sm->sm_loaded = B_TRUE;
+ sm->sm_ops = ops;
+
+ cv_broadcast(&sm->sm_load_cv);
+
+ if (ops != NULL)
+ ops->smop_load(sm);
+
+ return (0);
+}
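+
+/*
+ * Example (illustrative): loading the free map of a 1G metaslab with
+ * smo_alloc = 256M first adds the entire 1G range, then replays the log
+ * (alloc entries remove space, free entries add it back); the final
+ * VERIFY in space_map_load() checks that exactly 1G - 256M = 768M of
+ * free space remains.
+ */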
+
+void
+space_map_unload(space_map_t *sm)
+{
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ if (sm->sm_loaded && sm->sm_ops != NULL)
+ sm->sm_ops->smop_unload(sm);
+
+ sm->sm_loaded = B_FALSE;
+ sm->sm_ops = NULL;
+
+ space_map_vacate(sm, NULL, NULL);
+}
+
+uint64_t
+space_map_alloc(space_map_t *sm, uint64_t size)
+{
+ uint64_t start;
+
+ start = sm->sm_ops->smop_alloc(sm, size);
+ if (start != -1ULL)
+ space_map_remove(sm, start, size);
+ return (start);
+}
+
+void
+space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ sm->sm_ops->smop_claim(sm, start, size);
+ space_map_remove(sm, start, size);
+}
+
+void
+space_map_free(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ space_map_add(sm, start, size);
+ sm->sm_ops->smop_free(sm, start, size);
+}
+
+/*
+ * Note: space_map_sync() will drop sm_lock across dmu_write() calls.
+ */
+void
+space_map_sync(space_map_t *sm, uint8_t maptype,
+ space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ void *cookie = NULL;
+ space_seg_t *ss;
+ uint64_t bufsize, start, size, run_len;
+ uint64_t *entry, *entry_map, *entry_map_end;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+
+ if (sm->sm_space == 0)
+ return;
+
+ dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
+ smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
+ maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
+ sm->sm_space);
+
+ if (maptype == SM_ALLOC)
+ smo->smo_alloc += sm->sm_space;
+ else
+ smo->smo_alloc -= sm->sm_space;
+
+ bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
+ bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT);
+ entry_map = zio_buf_alloc(bufsize);
+ entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
+ entry = entry_map;
+
+ *entry++ = SM_DEBUG_ENCODE(1) |
+ SM_DEBUG_ACTION_ENCODE(maptype) |
+ SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
+ SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
+
+ while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
+ size = ss->ss_end - ss->ss_start;
+ start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
+
+ sm->sm_space -= size;
+ size >>= sm->sm_shift;
+
+ while (size) {
+ run_len = MIN(size, SM_RUN_MAX);
+
+ if (entry == entry_map_end) {
+ mutex_exit(sm->sm_lock);
+ dmu_write(os, smo->smo_object, smo->smo_objsize,
+ bufsize, entry_map, tx);
+ mutex_enter(sm->sm_lock);
+ smo->smo_objsize += bufsize;
+ entry = entry_map;
+ }
+
+ *entry++ = SM_OFFSET_ENCODE(start) |
+ SM_TYPE_ENCODE(maptype) |
+ SM_RUN_ENCODE(run_len);
+
+ start += run_len;
+ size -= run_len;
+ }
+ kmem_free(ss, sizeof (*ss));
+ }
+
+ if (entry != entry_map) {
+ size = (entry - entry_map) * sizeof (uint64_t);
+ mutex_exit(sm->sm_lock);
+ dmu_write(os, smo->smo_object, smo->smo_objsize,
+ size, entry_map, tx);
+ mutex_enter(sm->sm_lock);
+ smo->smo_objsize += size;
+ }
+
+ zio_buf_free(entry_map, bufsize);
+
+ VERIFY3U(sm->sm_space, ==, 0);
+}
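+
+/*
+ * Entry encoding example (editor's illustration): a freed run of 3 blocks
+ * starting at block 100 (relative to sm_start, in units of 1 << sm_shift)
+ * becomes the single 64-bit word
+ *
+ *	SM_OFFSET_ENCODE(100) | SM_TYPE_ENCODE(SM_FREE) | SM_RUN_ENCODE(3)
+ *
+ * preceded, once per sync, by the debug entry written above.
+ */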
+
+void
+space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
+{
+ VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0);
+
+ smo->smo_objsize = 0;
+ smo->smo_alloc = 0;
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
new file mode 100644
index 0000000..f58ffc0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
@@ -0,0 +1,109 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ARC_H
+#define _SYS_ARC_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zio.h>
+
+typedef struct arc_buf_hdr arc_buf_hdr_t;
+typedef struct arc_buf arc_buf_t;
+typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
+typedef void arc_byteswap_func_t(void *buf, size_t size);
+typedef int arc_evict_func_t(void *private);
+
+/* generic arc_done_func_t's which you can use */
+arc_done_func_t arc_bcopy_func;
+arc_done_func_t arc_getbuf_func;
+
+struct arc_buf {
+ arc_buf_hdr_t *b_hdr;
+ arc_buf_t *b_next;
+ void *b_data;
+ arc_evict_func_t *b_efunc;
+ void *b_private;
+};
+
+typedef enum arc_buf_contents {
+ ARC_BUFC_UNDEF, /* buffer contents undefined */
+ ARC_BUFC_DATA, /* buffer contains data */
+ ARC_BUFC_METADATA /* buffer contains metadata */
+} arc_buf_contents_t;
+/*
+ * These are the flags we pass into calls to the arc
+ */
+#define ARC_WAIT (1 << 1) /* perform I/O synchronously */
+#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */
+#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
+#define ARC_CACHED (1 << 4) /* I/O was already in cache */
+
+arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
+ arc_buf_contents_t type);
+void arc_buf_add_ref(arc_buf_t *buf, void *tag);
+int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
+int arc_buf_size(arc_buf_t *buf);
+void arc_release(arc_buf_t *buf, void *tag);
+int arc_released(arc_buf_t *buf);
+int arc_has_callback(arc_buf_t *buf);
+void arc_buf_freeze(arc_buf_t *buf);
+void arc_buf_thaw(arc_buf_t *buf);
+#ifdef ZFS_DEBUG
+int arc_referenced(arc_buf_t *buf);
+#endif
+
+int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
+ arc_done_func_t *done, void *private, int priority, int flags,
+ uint32_t *arc_flags, zbookmark_t *zb);
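+
+/*
+ * Typical synchronous read (editor's sketch, not original usage; assumes
+ * the caller has set up 'spa', 'bp', a byteswap routine and a zbookmark,
+ * and elides error handling):
+ *
+ *	arc_buf_t *abuf = NULL;
+ *	uint32_t aflags = ARC_WAIT;
+ *
+ *	(void) arc_read(NULL, spa, bp, byteswap_func, arc_getbuf_func,
+ *	    &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
+ *
+ * With ARC_WAIT set, arc_read() blocks until the I/O completes and
+ * arc_getbuf_func() has stored the resulting buffer in 'abuf'.
+ */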
+zio_t *arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+ int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+ arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
+ int flags, zbookmark_t *zb);
+int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private, uint32_t arc_flags);
+int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
+
+void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
+int arc_buf_evict(arc_buf_t *buf);
+
+void arc_flush(void);
+void arc_tempreserve_clear(uint64_t tempreserve);
+int arc_tempreserve_space(uint64_t tempreserve);
+
+void arc_init(void);
+void arc_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ARC_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
new file mode 100644
index 0000000..b4c8376
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_BPLIST_H
+#define _SYS_BPLIST_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bplist_phys {
+ /*
+ * This is the bonus buffer for the dead lists. The object's
+ * contents are an array of bpl_entries blkptr_t's, representing
+ * a total of bpl_bytes physical space.
+ */
+ uint64_t bpl_entries;
+ uint64_t bpl_bytes;
+ uint64_t bpl_comp;
+ uint64_t bpl_uncomp;
+} bplist_phys_t;
+
+#define BPLIST_SIZE_V0 (2 * sizeof (uint64_t))
+
+typedef struct bplist_q {
+ blkptr_t bpq_blk;
+ void *bpq_next;
+} bplist_q_t;
+
+typedef struct bplist {
+ kmutex_t bpl_lock;
+ objset_t *bpl_mos;
+ uint64_t bpl_object;
+ uint8_t bpl_blockshift;
+ uint8_t bpl_bpshift;
+ uint8_t bpl_havecomp;
+ bplist_q_t *bpl_queue;
+ bplist_phys_t *bpl_phys;
+ dmu_buf_t *bpl_dbuf;
+ dmu_buf_t *bpl_cached_dbuf;
+} bplist_t;
+
+extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
+extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
+extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
+extern void bplist_close(bplist_t *bpl);
+extern boolean_t bplist_empty(bplist_t *bpl);
+extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
+extern int bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx);
+extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp);
+extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
+extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
+extern int bplist_space(bplist_t *bpl,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
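+
+/*
+ * Typical iteration (editor's sketch): walking every deferred-free block
+ * pointer on an open bplist, e.g. from a pool-sync path:
+ *
+ *	uint64_t itor = 0;
+ *	blkptr_t bp;
+ *
+ *	while (bplist_iterate(bpl, &itor, &bp) == 0)
+ *		... free or otherwise process 'bp' ...
+ */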
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BPLIST_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
new file mode 100644
index 0000000..d33657b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
@@ -0,0 +1,334 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DBUF_H
+#define _SYS_DBUF_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DB_BONUS_BLKID (-1ULL)
+#define IN_DMU_SYNC 2
+
+/*
+ * define flags for dbuf_read
+ */
+
+#define DB_RF_MUST_SUCCEED (1 << 0)
+#define DB_RF_CANFAIL (1 << 1)
+#define DB_RF_HAVESTRUCT (1 << 2)
+#define DB_RF_NOPREFETCH (1 << 3)
+#define DB_RF_NEVERWAIT (1 << 4)
+#define DB_RF_CACHED (1 << 5)
+
+/*
+ * The state transition diagram for dbufs looks like:
+ *
+ * +----> READ ----+
+ * | |
+ * | V
+ * (alloc)-->UNCACHED CACHED-->EVICTING-->(free)
+ * | ^
+ * | |
+ * +----> FILL ----+
+ */
+typedef enum dbuf_states {
+ DB_UNCACHED,
+ DB_FILL,
+ DB_READ,
+ DB_CACHED,
+ DB_EVICTING
+} dbuf_states_t;
+
+struct objset_impl;
+struct dnode;
+struct dmu_tx;
+
+/*
+ * level = 0 means the user data
+ * level = 1 means the single indirect block
+ * etc.
+ */
+
+#define LIST_LINK_INACTIVE(link) \
+ ((link)->list_next == NULL && (link)->list_prev == NULL)
+
+struct dmu_buf_impl;
+
+typedef enum override_states {
+ DR_NOT_OVERRIDDEN,
+ DR_IN_DMU_SYNC,
+ DR_OVERRIDDEN
+} override_states_t;
+
+typedef struct dbuf_dirty_record {
+ /* link on our parent's dirty list */
+ list_node_t dr_dirty_node;
+
+ /* transaction group this data will sync in */
+ uint64_t dr_txg;
+
+ /* zio of outstanding write IO */
+ zio_t *dr_zio;
+
+ /* pointer back to our dbuf */
+ struct dmu_buf_impl *dr_dbuf;
+
+ /* pointer to next dirty record */
+ struct dbuf_dirty_record *dr_next;
+
+ /* pointer to parent dirty record */
+ struct dbuf_dirty_record *dr_parent;
+
+ union dirty_types {
+ struct dirty_indirect {
+
+ /* protect access to list */
+ kmutex_t dr_mtx;
+
+ /* Our list of dirty children */
+ list_t dr_children;
+ } di;
+ struct dirty_leaf {
+
+ /*
+ * dr_data is set when we dirty the buffer
+ * so that we can retain the pointer even if it
+ * gets COW'd in a subsequent transaction group.
+ */
+ arc_buf_t *dr_data;
+ blkptr_t dr_overridden_by;
+ override_states_t dr_override_state;
+ } dl;
+ } dt;
+} dbuf_dirty_record_t;
+
+typedef struct dmu_buf_impl {
+ /*
+ * The following members are immutable, with the exception of
+ * db.db_data, which is protected by db_mtx.
+ */
+
+ /* the publicly visible structure */
+ dmu_buf_t db;
+
+ /* the objset we belong to */
+ struct objset_impl *db_objset;
+
+ /*
+ * the dnode we belong to (NULL when evicted)
+ */
+ struct dnode *db_dnode;
+
+ /*
+ * our parent buffer; if the dnode points to us directly,
+ * db_parent == db_dnode->dn_dbuf
+ * only accessed by sync thread ???
+ * (NULL when evicted)
+ */
+ struct dmu_buf_impl *db_parent;
+
+ /*
+ * link for hash table of all dmu_buf_impl_t's
+ */
+ struct dmu_buf_impl *db_hash_next;
+
+ /* our block number */
+ uint64_t db_blkid;
+
+ /*
+ * Pointer to the blkptr_t which points to us. May be NULL if we
+ * don't have one yet. (NULL when evicted)
+ */
+ blkptr_t *db_blkptr;
+
+ /*
+ * Our indirection level. Data buffers have db_level==0.
+ * Indirect buffers which point to data buffers have
+ * db_level==1, etc. Buffers which contain dnodes have
+ * db_level==0, since the dnodes are stored in a file.
+ */
+ uint8_t db_level;
+
+ /* db_mtx protects the members below */
+ kmutex_t db_mtx;
+
+ /*
+ * Current state of the buffer
+ */
+ dbuf_states_t db_state;
+
+ /*
+ * Refcount accessed by dmu_buf_{hold,rele}.
+ * If nonzero, the buffer can't be destroyed.
+ * Protected by db_mtx.
+ */
+ refcount_t db_holds;
+
+ /* buffer holding our data */
+ arc_buf_t *db_buf;
+
+ kcondvar_t db_changed;
+ dbuf_dirty_record_t *db_data_pending;
+
+ /* pointer to most recent dirty record for this buffer */
+ dbuf_dirty_record_t *db_last_dirty;
+
+ /*
+ * Our link on the owner dnode's dn_dbufs list.
+ * Protected by its dn_dbufs_mtx.
+ */
+ list_node_t db_link;
+
+ /* Data which is unique to data (leaf) blocks: */
+
+ /* stuff we store for the user (see dmu_buf_set_user) */
+ void *db_user_ptr;
+ void **db_user_data_ptr_ptr;
+ dmu_buf_evict_func_t *db_evict_func;
+
+ uint8_t db_immediate_evict;
+ uint8_t db_freed_in_flight;
+
+ uint8_t db_dirtycnt;
+} dmu_buf_impl_t;
+
+/* Note: the dbuf hash table is exposed only for the mdb module */
+#define DBUF_MUTEXES 256
+#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
+typedef struct dbuf_hash_table {
+ uint64_t hash_table_mask;
+ dmu_buf_impl_t **hash_table;
+ kmutex_t hash_mutexes[DBUF_MUTEXES];
+} dbuf_hash_table_t;
+
+
+uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
+
+dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
+dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn);
+
+dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
+dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
+ void *tag);
+int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
+ void *tag, dmu_buf_impl_t **dbp);
+
+void dbuf_prefetch(struct dnode *dn, uint64_t blkid);
+
+void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
+uint64_t dbuf_refcount(dmu_buf_impl_t *db);
+
+void dbuf_rele(dmu_buf_impl_t *db, void *tag);
+
+dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
+
+int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
+void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
+void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
+dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+
+void dbuf_clear(dmu_buf_impl_t *db);
+void dbuf_evict(dmu_buf_impl_t *db);
+
+void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_unoverride(dbuf_dirty_record_t *dr);
+void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+
+void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks,
+ struct dmu_tx *);
+
+void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
+
+void dbuf_init(void);
+void dbuf_fini(void);
+
+#define DBUF_GET_BUFC_TYPE(db) \
+ ((((db)->db_level > 0) || \
+ (dmu_ot[(db)->db_dnode->dn_type].ot_metadata)) ? \
+ ARC_BUFC_METADATA : ARC_BUFC_DATA)
+
+#ifdef ZFS_DEBUG
+
+/*
+ * There should be a ## between the string literal and fmt, to make it
+ * clear that we're joining two strings together, but gcc does not
+ * support that preprocessor token.
+ */
+#define dprintf_dbuf(dbuf, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __db_buf[32]; \
+ uint64_t __db_obj = (dbuf)->db.db_object; \
+ if (__db_obj == DMU_META_DNODE_OBJECT) \
+ (void) strcpy(__db_buf, "mdn"); \
+ else \
+ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
+ (u_longlong_t)__db_obj); \
+ dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
+ "obj=%s lvl=%u blkid=%lld " fmt, \
+ __db_buf, (dbuf)->db_level, \
+ (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
+ sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \
+ dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
+ kmem_free(__blkbuf, BP_SPRINTF_LEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#define DBUF_VERIFY(db) dbuf_verify(db)
+
+#else
+
+#define dprintf_dbuf(db, fmt, ...)
+#define dprintf_dbuf_bp(db, bp, fmt, ...)
+#define DBUF_VERIFY(db)
+
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DBUF_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
new file mode 100644
index 0000000..f534015
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -0,0 +1,586 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_H
+#define _SYS_DMU_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This file describes the interface that the DMU provides for its
+ * consumers.
+ *
+ * The DMU also interacts with the SPA. That interface is described in
+ * dmu_spa.h.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct uio;
+struct page;
+struct vnode;
+struct spa;
+struct zilog;
+struct zio;
+struct blkptr;
+struct zap_cursor;
+struct dsl_dataset;
+struct dsl_pool;
+struct dnode;
+struct drr_begin;
+struct drr_end;
+struct zbookmark;
+struct spa;
+struct nvlist;
+struct objset_impl;
+struct file;
+
+typedef struct objset objset_t;
+typedef struct dmu_tx dmu_tx_t;
+typedef struct dsl_dir dsl_dir_t;
+
+typedef enum dmu_object_type {
+ DMU_OT_NONE,
+ /* general: */
+ DMU_OT_OBJECT_DIRECTORY, /* ZAP */
+ DMU_OT_OBJECT_ARRAY, /* UINT64 */
+ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
+ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
+ DMU_OT_BPLIST, /* UINT64 */
+ DMU_OT_BPLIST_HDR, /* UINT64 */
+ /* spa: */
+ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
+ DMU_OT_SPACE_MAP, /* UINT64 */
+ /* zil: */
+ DMU_OT_INTENT_LOG, /* UINT64 */
+ /* dmu: */
+ DMU_OT_DNODE, /* DNODE */
+ DMU_OT_OBJSET, /* OBJSET */
+ /* dsl: */
+ DMU_OT_DSL_DIR, /* UINT64 */
+ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */
+ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */
+ DMU_OT_DSL_PROPS, /* ZAP */
+ DMU_OT_DSL_DATASET, /* UINT64 */
+ /* zpl: */
+ DMU_OT_ZNODE, /* ZNODE */
+ DMU_OT_ACL, /* ACL */
+ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
+ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
+ DMU_OT_MASTER_NODE, /* ZAP */
+ DMU_OT_UNLINKED_SET, /* ZAP */
+ /* zvol: */
+ DMU_OT_ZVOL, /* UINT8 */
+ DMU_OT_ZVOL_PROP, /* ZAP */
+ /* other; for testing only! */
+ DMU_OT_PLAIN_OTHER, /* UINT8 */
+ DMU_OT_UINT64_OTHER, /* UINT64 */
+ DMU_OT_ZAP_OTHER, /* ZAP */
+ /* new object types: */
+ DMU_OT_ERROR_LOG, /* ZAP */
+ DMU_OT_SPA_HISTORY, /* UINT8 */
+ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
+ DMU_OT_POOL_PROPS, /* ZAP */
+
+ DMU_OT_NUMTYPES
+} dmu_object_type_t;
+
+typedef enum dmu_objset_type {
+ DMU_OST_NONE,
+ DMU_OST_META,
+ DMU_OST_ZFS,
+ DMU_OST_ZVOL,
+ DMU_OST_OTHER, /* For testing only! */
+ DMU_OST_ANY, /* Be careful! */
+ DMU_OST_NUMTYPES
+} dmu_objset_type_t;
+
+void byteswap_uint64_array(void *buf, size_t size);
+void byteswap_uint32_array(void *buf, size_t size);
+void byteswap_uint16_array(void *buf, size_t size);
+void byteswap_uint8_array(void *buf, size_t size);
+void zap_byteswap(void *buf, size_t size);
+void zfs_acl_byteswap(void *buf, size_t size);
+void zfs_znode_byteswap(void *buf, size_t size);
+
+#define DS_MODE_NONE 0 /* invalid, to aid debugging */
+#define DS_MODE_STANDARD 1 /* normal access, no special needs */
+#define DS_MODE_PRIMARY 2 /* the "main" access, e.g. a mount */
+#define DS_MODE_EXCLUSIVE 3 /* exclusive access, e.g. to destroy */
+#define DS_MODE_LEVELS 4
+#define DS_MODE_LEVEL(x) ((x) & (DS_MODE_LEVELS - 1))
+#define DS_MODE_READONLY 0x8
+#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY)
+#define DS_MODE_INCONSISTENT 0x10
+#define DS_MODE_IS_INCONSISTENT(x) ((x) & DS_MODE_INCONSISTENT)
+
+#define DS_FIND_SNAPSHOTS (1<<0)
+#define DS_FIND_CHILDREN (1<<1)
+
+/*
+ * The maximum number of bytes that can be accessed as part of one
+ * operation, including metadata.
+ */
+#define DMU_MAX_ACCESS (10<<20) /* 10MB */
+
+/*
+ * Public routines to create, destroy, open, and close objsets.
+ */
+int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+ objset_t **osp);
+void dmu_objset_close(objset_t *os);
+int dmu_objset_evict_dbufs(objset_t *os, int try);
+int dmu_objset_create(const char *name, dmu_objset_type_t type,
+ objset_t *clone_parent,
+ void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
+int dmu_objset_destroy(const char *name);
+int dmu_snapshots_destroy(char *fsname, char *snapname);
+int dmu_objset_rollback(const char *name);
+int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive);
+int dmu_objset_rename(const char *name, const char *newname);
+int dmu_objset_find(char *name, int func(char *, void *), void *arg,
+ int flags);
+void dmu_objset_byteswap(void *buf, size_t size);
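
As a usage sketch of the open/close pair above (hypothetical caller; the
dataset name is a placeholder, and DS_MODE_STANDARD is the mode defined
earlier in this header):

    objset_t *os;
    int err;

    err = dmu_objset_open("tank/home", DMU_OST_ZFS, DS_MODE_STANDARD, &os);
    if (err == 0) {
        /* ... issue reads, writes, and transactions against os ... */
        dmu_objset_close(os);
    }
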
+
+typedef struct dmu_buf {
+ uint64_t db_object; /* object that this buffer is part of */
+ uint64_t db_offset; /* byte offset in this object */
+ uint64_t db_size; /* size of buffer in bytes */
+ void *db_data; /* data in buffer */
+} dmu_buf_t;
+
+typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
+
+/*
+ * Callback function to perform byte swapping on a block.
+ */
+typedef void dmu_byteswap_func_t(void *buf, size_t size);
+
+/*
+ * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
+ */
+#define DMU_POOL_DIRECTORY_OBJECT 1
+#define DMU_POOL_CONFIG "config"
+#define DMU_POOL_ROOT_DATASET "root_dataset"
+#define DMU_POOL_SYNC_BPLIST "sync_bplist"
+#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
+#define DMU_POOL_ERRLOG_LAST "errlog_last"
+#define DMU_POOL_SPARES "spares"
+#define DMU_POOL_DEFLATE "deflate"
+#define DMU_POOL_HISTORY "history"
+#define DMU_POOL_PROPS "pool_props"
+
+/*
+ * Allocate an object from this objset. The range of object numbers
+ * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode.
+ *
+ * The transaction must be assigned to a txg. The newly allocated
+ * object will be "held" in the transaction (ie. you can modify the
+ * newly allocated object in this transaction).
+ *
+ * dmu_object_alloc() chooses an object and returns it in *objectp.
+ *
+ * dmu_object_claim() allocates a specific object number. If that
+ * number is already allocated, it fails and returns EEXIST.
+ *
+ * Return 0 on success, or ENOSPC or EEXIST as specified above.
+ */
+uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
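
Sketching the two allocation paths documented above (hypothetical; assumes
an open objset os, an assigned transaction tx, and that a blocksize of 0
requests the default):

    uint64_t obj;
    int err;

    /* Let the DMU pick the object number. */
    obj = dmu_object_alloc(os, DMU_OT_PLAIN_FILE_CONTENTS, 0,
        DMU_OT_NONE, 0, tx);

    /* Or demand a specific one; fails with EEXIST if it is taken. */
    err = dmu_object_claim(os, 23, DMU_OT_PLAIN_FILE_CONTENTS, 0,
        DMU_OT_NONE, 0, tx);
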
+
+/*
+ * Free an object from this objset.
+ *
+ * The object's data will be freed as well (ie. you don't need to call
+ * dmu_free(object, 0, -1, tx)).
+ *
+ * The object need not be held in the transaction.
+ *
+ * If there are any holds on this object's buffers (via dmu_buf_hold()),
+ * or tx holds on the object (via dmu_tx_hold_object()), you cannot
+ * free it; it fails and returns EBUSY.
+ *
+ * If the object is not allocated, it fails and returns ENOENT.
+ *
+ * Return 0 on success, or EBUSY or ENOENT as specified above.
+ */
+int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
+
+/*
+ * Find the next allocated or free object.
+ *
+ * The objectp parameter is in-out. It will be updated to be the next
+ * object which is allocated. Ignore objects which have not been
+ * modified since txg.
+ *
+ * XXX Can only be called on an objset with no dirty data.
+ *
+ * Returns 0 on success, or ENOENT if there are no more objects.
+ */
+int dmu_object_next(objset_t *os, uint64_t *objectp,
+ boolean_t hole, uint64_t txg);
+
+/*
+ * Set the data blocksize for an object.
+ *
+ * The object cannot have any blocks allocated beyond the first. If
+ * the first block is allocated already, the new size must be greater
+ * than the current block size. If these conditions are not met,
+ * ENOTSUP will be returned.
+ *
+ * Returns 0 on success, or EBUSY if there are any holds on the object
+ * contents, or ENOTSUP as described above.
+ */
+int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
+ int ibs, dmu_tx_t *tx);
+
+/*
+ * Set the checksum property on a dnode. The new checksum algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+ dmu_tx_t *tx);
+
+/*
+ * Set the compress property on a dnode. The new compression algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+ dmu_tx_t *tx);
+
+/*
+ * Decide how many copies of a given block we should make. Can be from
+ * 1 to SPA_DVAS_PER_BP.
+ */
+int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb,
+ dmu_object_type_t ot);
+/*
+ * The bonus data is accessed more or less like a regular buffer.
+ * You must dmu_bonus_hold() to get the buffer, which will give you a
+ * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
+ * data. As with any normal buffer, you must call dmu_buf_read() to
+ * read db_data, dmu_buf_will_dirty() before modifying it, and the
+ * object must be held in an assigned transaction before calling
+ * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
+ * buffer as well. You must release your hold with dmu_buf_rele().
+ */
+int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
+int dmu_bonus_max(void);
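
A minimal sketch of the bonus-buffer protocol described above (hypothetical;
FTAG is the per-function hold tag used throughout this code base, and tx is
assumed to be an assigned transaction holding this object):

    dmu_buf_t *db;

    if (dmu_bonus_hold(os, object, FTAG, &db) == 0) {
        dmu_buf_will_dirty(db, tx);
        /* ... modify up to db->db_size bytes at db->db_data ... */
        dmu_buf_rele(db, FTAG);
    }
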
+
+/*
+ * Obtain the DMU buffer from the specified object which contains the
+ * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so
+ * that it will remain in memory. You must release the hold with
+ * dmu_buf_rele(). You mustn't access the dmu_buf_t after releasing your
+ * hold. You must have a hold on any dmu_buf_t* you pass to the DMU.
+ *
+ * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
+ * on the returned buffer before reading or writing the buffer's
+ * db_data. The comments for those routines describe what particular
+ * operations are valid after calling them.
+ *
+ * The object number must be a valid, allocated object number.
+ */
+int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **);
+void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
+void dmu_buf_rele(dmu_buf_t *db, void *tag);
+uint64_t dmu_buf_refcount(dmu_buf_t *db);
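
The plain data-buffer variant follows the same hold/access/release
discipline (again a hypothetical fragment, with FTAG and an assigned tx
assumed):

    dmu_buf_t *db;

    if (dmu_buf_hold(os, object, offset, FTAG, &db) == 0) {
        dmu_buf_will_dirty(db, tx);    /* required before touching db_data */
        /* ... read or modify db->db_data ... */
        dmu_buf_rele(db, FTAG);
    }
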
+
+/*
+ * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
+ * range of an object. A pointer to an array of dmu_buf_t*'s is
+ * returned (in *dbpp).
+ *
+ * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
+ * frees the array. The hold on the array of buffers MUST be released
+ * with dmu_buf_rele_array. You can NOT release the hold on each buffer
+ * individually with dmu_buf_rele.
+ */
+int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
+void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
+
+/*
+ * Returns NULL on success, or the existing user ptr if it's already
+ * been set.
+ *
+ * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
+ *
+ * user_data_ptr_ptr should be NULL, or a pointer to a pointer which
+ * will be set to db->db_data when you are allowed to access it. Note
+ * that db->db_data (the pointer) can change when you do dmu_buf_read(),
+ * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
+ * *user_data_ptr_ptr will be set to the new value when it changes.
+ *
+ * If non-NULL, pageout func will be called when this buffer is being
+ * excised from the cache, so that you can clean up the data structure
+ * pointed to by user_ptr.
+ *
+ * dmu_evict_user() will call the pageout func for all buffers in an
+ * objset with a given pageout func.
+ */
+void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *pageout_func);
+/*
+ * set_user_ie is the same as set_user, but request immediate eviction
+ * when hold count goes to zero.
+ */
+void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
+ void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
+void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
+ void *user_ptr, void *user_data_ptr_ptr,
+ dmu_buf_evict_func_t *pageout_func);
+void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
+
+/*
+ * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
+ */
+void *dmu_buf_get_user(dmu_buf_t *db);
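
A sketch of attaching per-consumer state with an eviction hook, per the
contract above (my_state_t, its ms_dbdata field, and my_evict are all
hypothetical names; the fragments assume a held dbuf db):

    static void
    my_evict(dmu_buf_t *db, void *ptr)
    {
        kmem_free(ptr, sizeof (my_state_t));
    }

    my_state_t *msp = kmem_zalloc(sizeof (*msp), KM_SLEEP);
    if (dmu_buf_set_user(db, msp, &msp->ms_dbdata, my_evict) != NULL) {
        /* Raced with another setter; back out our copy. */
        kmem_free(msp, sizeof (*msp));
    }
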
+
+/*
+ * Indicate that you are going to modify the buffer's data (db_data).
+ *
+ * The transaction (tx) must be assigned to a txg (ie. you've called
+ * dmu_tx_assign()). The buffer's object must be held in the tx
+ * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
+ */
+void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
+
+/*
+ * You must create a transaction, then hold the objects which you will
+ * (or might) modify as part of this transaction. Then you must assign
+ * the transaction to a transaction group. Once the transaction has
+ * been assigned, you can modify buffers which belong to held objects as
+ * part of this transaction. You can't modify buffers before the
+ * transaction has been assigned; you can't modify buffers which don't
+ * belong to objects which this transaction holds; you can't hold
+ * objects once the transaction has been assigned. You may hold an
+ * object which you are going to free (with dmu_object_free()), but you
+ * don't have to.
+ *
+ * You can abort the transaction before it has been assigned.
+ *
+ * Note that you may hold buffers (with dmu_buf_hold) at any time,
+ * regardless of transaction state.
+ */
+
+#define DMU_NEW_OBJECT (-1ULL)
+#define DMU_OBJECT_END (-1ULL)
+
+dmu_tx_t *dmu_tx_create(objset_t *os);
+void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
+void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
+ uint64_t len);
+void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name);
+void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_abort(dmu_tx_t *tx);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+void dmu_tx_wait(dmu_tx_t *tx);
+void dmu_tx_commit(dmu_tx_t *tx);
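
Put together, the canonical transaction pattern described in the comment
above looks roughly like this (hypothetical; TXG_WAIT is assumed to be the
blocking assignment policy from txg.h):

    dmu_tx_t *tx = dmu_tx_create(os);

    dmu_tx_hold_write(tx, object, offset, len);
    err = dmu_tx_assign(tx, TXG_WAIT);
    if (err != 0) {
        dmu_tx_abort(tx);    /* aborting is legal while unassigned */
        return (err);
    }
    dmu_write(os, object, offset, len, buf, tx);
    dmu_tx_commit(tx);
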
+
+/*
+ * Free up the data blocks for a defined range of a file. If size is
+ * zero, the range from offset to end-of-file is freed.
+ */
+int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx);
+
+/*
+ * Convenience functions.
+ *
+ * Canfail routines will return 0 on success, or an errno if there is a
+ * nonrecoverable I/O error.
+ */
+int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ void *buf);
+void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx);
+int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
+int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
+ dmu_tx_t *tx);
+int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, struct page *pp, dmu_tx_t *tx);
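
Reads, by contrast, need no transaction; a one-liner suffices (sketch):

    char buf[512];

    err = dmu_read(os, object, 0, sizeof (buf), buf);    /* 0 on success */
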
+
+extern int zfs_prefetch_disable;
+
+/*
+ * Asynchronously try to read in the data.
+ */
+void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t len);
+
+typedef struct dmu_object_info {
+ /* All sizes are in bytes. */
+ uint32_t doi_data_block_size;
+ uint32_t doi_metadata_block_size;
+ uint64_t doi_bonus_size;
+ dmu_object_type_t doi_type;
+ dmu_object_type_t doi_bonus_type;
+ uint8_t doi_indirection; /* 2 = dnode->indirect->data */
+ uint8_t doi_checksum;
+ uint8_t doi_compress;
+ uint8_t doi_pad[5];
+ /* Values below are number of 512-byte blocks. */
+ uint64_t doi_physical_blks; /* data + metadata */
+ uint64_t doi_max_block_offset;
+} dmu_object_info_t;
+
+typedef struct dmu_object_type_info {
+ dmu_byteswap_func_t *ot_byteswap;
+ boolean_t ot_metadata;
+ char *ot_name;
+} dmu_object_type_info_t;
+
+extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
+
+/*
+ * Get information on a DMU object.
+ *
+ * Return 0 on success or ENOENT if object is not allocated.
+ *
+ * If doi is NULL, just indicates whether the object exists.
+ */
+int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
+void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
+void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
+ u_longlong_t *nblk512);
+
+typedef struct dmu_objset_stats {
+ uint64_t dds_num_clones; /* number of clones of this dataset */
+ uint64_t dds_creation_txg;
+ dmu_objset_type_t dds_type;
+ uint8_t dds_is_snapshot;
+ uint8_t dds_inconsistent;
+ char dds_clone_of[MAXNAMELEN];
+} dmu_objset_stats_t;
+
+/*
+ * Get stats on a dataset.
+ */
+void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
+
+/*
+ * Add entries to the nvlist for all the objset's properties. See
+ * zfs_prop_table[] and zfs(1m) for details on the properties.
+ */
+void dmu_objset_stats(objset_t *os, struct nvlist *nv);
+
+/*
+ * Get the space usage statistics for statvfs().
+ *
+ * refdbytes is the amount of space "referenced" by this objset.
+ * availbytes is the amount of space available to this objset, taking
+ * into account quotas & reservations, assuming that no other objsets
+ * use the space first. These values correspond to the 'referenced' and
+ * 'available' properties, described in the zfs(1m) manpage.
+ *
+ * usedobjs and availobjs are the number of objects currently allocated,
+ * and available.
+ */
+void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp);
+
+/*
+ * The fsid_guid is a 56-bit ID that can change to avoid collisions.
+ * (Contrast with the ds_guid which is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.)
+ */
+uint64_t dmu_objset_fsid_guid(objset_t *os);
+
+int dmu_objset_is_snapshot(objset_t *os);
+
+extern struct spa *dmu_objset_spa(objset_t *os);
+extern struct zilog *dmu_objset_zil(objset_t *os);
+extern struct dsl_pool *dmu_objset_pool(objset_t *os);
+extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
+extern void dmu_objset_name(objset_t *os, char *buf);
+extern dmu_objset_type_t dmu_objset_type(objset_t *os);
+extern uint64_t dmu_objset_id(objset_t *os);
+extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *id, uint64_t *offp);
+extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *idp, uint64_t *offp);
+
+/*
+ * Return the txg number for the given assigned transaction.
+ */
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
+
+/*
+ * Synchronous write.
+ * If a parent zio is provided this function initiates a write on the
+ * provided buffer as a child of the parent zio.
+ * In the absence of a parent zio, the write is completed synchronously.
+ * At write completion, blk is filled with the bp of the written block.
+ * Note that while the data covered by this function will be on stable
+ * storage when the write completes, this new data does not become a
+ * permanent part of the file until the associated transaction commits.
+ */
+typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg);
+int dmu_sync(struct zio *zio, dmu_buf_t *db,
+ struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg);
+
+/*
+ * Find the next hole or data block in file starting at *off
+ * Return found offset in *off. Return ESRCH for end of file.
+ */
+int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
+ uint64_t *off);
+
+/*
+ * Initial setup and final teardown.
+ */
+extern void dmu_init(void);
+extern void dmu_fini(void);
+
+typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
+ uint64_t object, uint64_t offset, int len);
+void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
+ dmu_traverse_cb_t cb, void *arg);
+
+int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp);
+int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
+ boolean_t force, struct file *fp, uint64_t voffset);
+
+/* CRC64 table */
+#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
+extern uint64_t zfs_crc64_table[256];
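
The table is populated at runtime from this reflected polynomial; a sketch
of the conventional byte-at-a-time construction (the actual initialization
lives in the SPA code, not in this header):

    int i, j;

    for (i = 0; i < 256; i++) {
        uint64_t c = i;
        for (j = 0; j < 8; j++)
            c = (c >> 1) ^ (-(c & 1) & ZFS_CRC64_POLY);
        zfs_crc64_table[i] = c;
    }
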
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
new file mode 100644
index 0000000..807011e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
@@ -0,0 +1,237 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_IMPL_H
+#define _SYS_DMU_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/txg_impl.h>
+#include <sys/zio.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is the locking strategy for the DMU. Numbers in parentheses are
+ * cases that use that lock order, referenced below:
+ *
+ * ARC is self-contained
+ * bplist is self-contained
+ * refcount is self-contained
+ * txg is self-contained (hopefully!)
+ * zst_lock
+ * zf_rwlock
+ *
+ * XXX try to improve evicting path?
+ *
+ * dp_config_rwlock > os_obj_lock > dn_struct_rwlock >
+ * dn_dbufs_mtx > hash_mutexes > db_mtx > leafs
+ *
+ * dp_config_rwlock
+ * must be held before: everything
+ * protects dd namespace changes
+ * protects property changes globally
+ * held from:
+ * dsl_dir_open/r:
+ * dsl_dir_create_sync/w:
+ * dsl_dir_sync_destroy/w:
+ * dsl_dir_rename_sync/w:
+ * dsl_prop_changed_notify/r:
+ *
+ * os_obj_lock
+ * must be held before:
+ * everything except dp_config_rwlock
+ * protects os_obj_next
+ * held from:
+ * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock
+ *
+ * dn_struct_rwlock
+ * must be held before:
+ * everything except dp_config_rwlock and os_obj_lock
+ * protects structure of dnode (eg. nlevels)
+ * db_blkptr can change when syncing out change to nlevels
+ * dn_maxblkid
+ * dn_nlevels
+ * dn_*blksz*
+ * phys nlevels, maxblkid, physical blkptr_t's (?)
+ * held from:
+ * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch
+ * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz)
+ * dmu_tx_count_free:
+ * dbuf_read_impl: db_mtx, dmu_zfetch()
+ * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch()
+ * dbuf_new_size: db_mtx
+ * dbuf_dirty: db_mtx
+ * dbuf_findbp: (callers, phys? - the real need)
+ * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?)
+ * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx
+ * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp()
+ * dnode_sync/w (increase_indirection): db_mtx (phys)
+ * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*)
+ * dnode_new_blkid/w: (dn_maxblkid)
+ * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid)
+ * dnode_next_offset: (phys)
+ *
+ * dn_dbufs_mtx
+ * must be held before:
+ * db_mtx, hash_mutexes
+ * protects:
+ * dn_dbufs
+ * dn_evicted
+ * held from:
+ * dmu_evict_user: db_mtx (dn_dbufs)
+ * dbuf_free_range: db_mtx (dn_dbufs)
+ * dbuf_remove_ref: db_mtx, callees:
+ * dbuf_hash_remove: hash_mutexes, db_mtx
+ * dbuf_create: hash_mutexes, db_mtx (dn_dbufs)
+ * dnode_set_blksz: (dn_dbufs)
+ *
+ * hash_mutexes (global)
+ * must be held before:
+ * db_mtx
+ * protects dbuf_hash_table (global) and db_hash_next
+ * held from:
+ * dbuf_find: db_mtx
+ * dbuf_hash_insert: db_mtx
+ * dbuf_hash_remove: db_mtx
+ *
+ * db_mtx (meta-leaf)
+ * must be held before:
+ * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes)
+ * protects:
+ * db_state
+ * db_holds
+ * db_buf
+ * db_changed
+ * db_data_pending
+ * db_dirtied
+ * db_link
+ * db_dirty_node (??)
+ * db_dirtycnt
+ * db_d.*
+ * db.*
+ * held from:
+ * dbuf_dirty: dn_mtx, dn_dirty_mtx
+ * dbuf_dirty->dsl_dir_willuse_space: dd_lock
+ * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock
+ * dbuf_undirty: dn_dirty_mtx (db_d)
+ * dbuf_write_done: dn_dirty_mtx (db_state)
+ * dbuf_*
+ * dmu_buf_update_user: none (db_d)
+ * dmu_evict_user: none (db_d) (maybe can eliminate)
+ * dbuf_find: none (db_holds)
+ * dbuf_hash_insert: none (db_holds)
+ * dmu_buf_read_array_impl: none (db_state, db_changed)
+ * dmu_sync: none (db_dirty_node, db_d)
+ * dnode_reallocate: none (db)
+ *
+ * dn_mtx (leaf)
+ * protects:
+ * dn_dirty_dbufs
+ * dn_ranges
+ * phys accounting
+ * dn_allocated_txg
+ * dn_free_txg
+ * dn_assigned_txg
+ * dd_assigned_tx
+ * dn_notxholds
+ * dn_dirtyctx
+ * dn_dirtyctx_firstset
+ * (dn_phys copy fields?)
+ * (dn_phys contents?)
+ * held from:
+ * dnode_*
+ * dbuf_dirty: none
+ * dbuf_sync: none (phys accounting)
+ * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs)
+ * dbuf_write_done: none (phys accounting)
+ * dmu_object_info_from_dnode: none (accounting)
+ * dmu_tx_commit: none
+ * dmu_tx_hold_object_impl: none
+ * dmu_tx_try_assign: dn_notxholds(cv)
+ * dmu_tx_unassign: none
+ *
+ * dd_lock (leaf)
+ * protects:
+ * dd_prop_cbs
+ * dd_sync_*
+ * dd_used_bytes
+ * dd_tempreserved
+ * dd_space_towrite
+ * dd_myname
+ * dd_phys accounting?
+ * held from:
+ * dsl_dir_*
+ * dsl_prop_changed_notify: none (dd_prop_cbs)
+ * dsl_prop_register: none (dd_prop_cbs)
+ * dsl_prop_unregister: none (dd_prop_cbs)
+ * dsl_dataset_block_freeable: none (dd_sync_*)
+ *
+ * os_lock (leaf)
+ * protects:
+ * os_dirty_dnodes
+ * os_free_dnodes
+ * os_dnodes
+ * os_downgraded_dbufs
+ * dn_dirtyblksz
+ * dn_dirty_link
+ * held from:
+ * dnode_create: none (os_dnodes)
+ * dnode_destroy: none (os_dnodes)
+ * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes)
+ * dnode_free: none (dn_dirtyblksz, os_*_dnodes)
+ *
+ * ds_lock (leaf)
+ * protects:
+ * ds_user_ptr
+ * ds_user_evict_func
+ * ds_open_refcount
+ * ds_snapname
+ * ds_phys accounting
+ * held from:
+ * dsl_dataset_*
+ *
+ * dr_mtx (leaf)
+ * protects:
+ * dr_children
+ * held from:
+ * dbuf_dirty
+ * dbuf_undirty
+ * dbuf_sync_indirect
+ * dnode_new_blkid
+ */
+
+struct objset;
+struct dmu_pool;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
new file mode 100644
index 0000000..8293a3b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_OBJSET_H
+#define _SYS_DMU_OBJSET_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/arc.h>
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+struct dmu_tx;
+struct objset_impl;
+
+typedef struct objset_phys {
+ dnode_phys_t os_meta_dnode;
+ zil_header_t os_zil_header;
+ uint64_t os_type;
+ char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) -
+ sizeof (uint64_t)];
+} objset_phys_t;
+
+struct objset {
+ struct objset_impl *os;
+ int os_mode;
+};
+
+typedef struct objset_impl {
+ /* Immutable: */
+ struct dsl_dataset *os_dsl_dataset;
+ spa_t *os_spa;
+ arc_buf_t *os_phys_buf;
+ objset_phys_t *os_phys;
+ dnode_t *os_meta_dnode;
+ zilog_t *os_zil;
+ objset_t os;
+ uint8_t os_checksum; /* can change, under dsl_dir's locks */
+ uint8_t os_compress; /* can change, under dsl_dir's locks */
+ uint8_t os_copies; /* can change, under dsl_dir's locks */
+ uint8_t os_md_checksum;
+ uint8_t os_md_compress;
+
+ /* no lock needed: */
+ struct dmu_tx *os_synctx; /* XXX sketchy */
+ blkptr_t *os_rootbp;
+
+ /* Protected by os_obj_lock */
+ kmutex_t os_obj_lock;
+ uint64_t os_obj_next;
+
+ /* Protected by os_lock */
+ kmutex_t os_lock;
+ list_t os_dirty_dnodes[TXG_SIZE];
+ list_t os_free_dnodes[TXG_SIZE];
+ list_t os_dnodes;
+ list_t os_downgraded_dbufs;
+} objset_impl_t;
+
+#define DMU_META_DNODE_OBJECT 0
+
+/* called from zpl */
+int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+ objset_t **osp);
+void dmu_objset_close(objset_t *os);
+int dmu_objset_create(const char *name, dmu_objset_type_t type,
+ objset_t *clone_parent,
+ void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
+int dmu_objset_destroy(const char *name);
+int dmu_objset_rollback(const char *name);
+int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive);
+void dmu_objset_stats(objset_t *os, nvlist_t *nv);
+void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
+void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp);
+uint64_t dmu_objset_fsid_guid(objset_t *os);
+int dmu_objset_find(char *name, int func(char *, void *), void *arg,
+ int flags);
+void dmu_objset_byteswap(void *buf, size_t size);
+int dmu_objset_evict_dbufs(objset_t *os, int try);
+
+/* called from dsl */
+void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx);
+objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
+ blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
+int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
+ objset_impl_t **osip);
+void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_OBJSET_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
new file mode 100644
index 0000000..ea9fa6c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -0,0 +1,120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_TRAVERSE_H
+#define _SYS_DMU_TRAVERSE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/arc.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ADVANCE_POST 0 /* post-order traversal */
+#define ADVANCE_PRE 0x01 /* pre-order traversal */
+#define ADVANCE_PRUNE 0x02 /* prune by prev snapshot birth time */
+#define ADVANCE_DATA 0x04 /* read user data blocks */
+#define ADVANCE_HOLES 0x08 /* visit holes */
+#define ADVANCE_ZIL 0x10 /* visit intent log blocks */
+#define ADVANCE_NOLOCK 0x20 /* Don't grab SPA sync lock */
+
+#define ZB_NO_LEVEL -2
+#define ZB_MAXLEVEL 32 /* Next power of 2 >= DN_MAX_LEVELS */
+#define ZB_MAXBLKID (1ULL << 62)
+#define ZB_MAXOBJSET (1ULL << 62)
+#define ZB_MAXOBJECT (1ULL << 62)
+
+#define ZB_MOS_CACHE 0
+#define ZB_MDN_CACHE 1
+#define ZB_DN_CACHE 2
+#define ZB_DEPTH 3
+
+typedef struct zseg {
+ uint64_t seg_mintxg;
+ uint64_t seg_maxtxg;
+ zbookmark_t seg_start;
+ zbookmark_t seg_end;
+ list_node_t seg_node;
+} zseg_t;
+
+typedef struct traverse_blk_cache {
+ zbookmark_t bc_bookmark;
+ blkptr_t bc_blkptr;
+ void *bc_data;
+ dnode_phys_t *bc_dnode;
+ int bc_errno;
+ int bc_pad1;
+ uint64_t bc_pad2;
+} traverse_blk_cache_t;
+
+typedef int (blkptr_cb_t)(traverse_blk_cache_t *bc, spa_t *spa, void *arg);
+
+struct traverse_handle {
+ spa_t *th_spa;
+ blkptr_cb_t *th_func;
+ void *th_arg;
+ uint16_t th_advance;
+ uint16_t th_locked;
+ int th_zio_flags;
+ list_t th_seglist;
+ traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL];
+ traverse_blk_cache_t th_zil_cache;
+ uint64_t th_hits;
+ uint64_t th_arc_hits;
+ uint64_t th_reads;
+ uint64_t th_callbacks;
+ uint64_t th_syncs;
+ uint64_t th_restarts;
+ zbookmark_t th_noread;
+ zbookmark_t th_lastcb;
+};
+
+int traverse_dsl_dataset(struct dsl_dataset *ds, uint64_t txg_start,
+ int advance, blkptr_cb_t func, void *arg);
+
+traverse_handle_t *traverse_init(spa_t *spa, blkptr_cb_t *func, void *arg,
+ int advance, int zio_flags);
+void traverse_fini(traverse_handle_t *th);
+
+void traverse_add_dnode(traverse_handle_t *th,
+ uint64_t mintxg, uint64_t maxtxg, uint64_t objset, uint64_t object);
+void traverse_add_objset(traverse_handle_t *th,
+ uint64_t mintxg, uint64_t maxtxg, uint64_t objset);
+void traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg);
+
+int traverse_more(traverse_handle_t *th);
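
Use of the traversal engine follows an init/add/drain/fini shape; a rough
sketch (my_blkptr_cb is a hypothetical callback, the txg range is a guess,
and traverse_more() is assumed to return EAGAIN while work remains):

    traverse_handle_t *th;

    th = traverse_init(spa, my_blkptr_cb, NULL, ADVANCE_PRE, ZIO_FLAG_CANFAIL);
    traverse_add_pool(th, 0, -1ULL);
    while (traverse_more(th) == EAGAIN)
        continue;
    traverse_fini(th);
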
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_TRAVERSE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
new file mode 100644
index 0000000..89f4799
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
@@ -0,0 +1,134 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_TX_H
+#define _SYS_DMU_TX_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/txg.h>
+#include <sys/refcount.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf_impl;
+struct dmu_tx_hold;
+struct dnode_link;
+struct dsl_pool;
+struct dnode;
+struct dsl_dir;
+
+struct dmu_tx {
+ /*
+ * No synchronization is needed because a tx can only be handled
+ * by one thread.
+ */
+ list_t tx_holds; /* list of dmu_tx_hold_t */
+ objset_t *tx_objset;
+ struct dsl_dir *tx_dir;
+ struct dsl_pool *tx_pool;
+ uint64_t tx_txg;
+ uint64_t tx_lastsnap_txg;
+ uint64_t tx_lasttried_txg;
+ txg_handle_t tx_txgh;
+ void *tx_tempreserve_cookie;
+ struct dmu_tx_hold *tx_needassign_txh;
+ uint8_t tx_anyobj;
+ int tx_err;
+#ifdef ZFS_DEBUG
+ uint64_t tx_space_towrite;
+ uint64_t tx_space_tofree;
+ uint64_t tx_space_tooverwrite;
+ refcount_t tx_space_written;
+ refcount_t tx_space_freed;
+#endif
+};
+
+enum dmu_tx_hold_type {
+ THT_NEWOBJECT,
+ THT_WRITE,
+ THT_BONUS,
+ THT_FREE,
+ THT_ZAP,
+ THT_SPACE,
+ THT_NUMTYPES
+};
+
+typedef struct dmu_tx_hold {
+ dmu_tx_t *txh_tx;
+ list_node_t txh_node;
+ struct dnode *txh_dnode;
+ uint64_t txh_space_towrite;
+ uint64_t txh_space_tofree;
+ uint64_t txh_space_tooverwrite;
+#ifdef ZFS_DEBUG
+ enum dmu_tx_hold_type txh_type;
+ uint64_t txh_arg1;
+ uint64_t txh_arg2;
+#endif
+} dmu_tx_hold_t;
+
+
+/*
+ * These routines are defined in dmu.h, and are called by the user.
+ */
+dmu_tx_t *dmu_tx_create(objset_t *dd);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+void dmu_tx_commit(dmu_tx_t *tx);
+void dmu_tx_abort(dmu_tx_t *tx);
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
+void dmu_tx_wait(dmu_tx_t *tx);
+
+/*
+ * These routines are defined in dmu_spa.h, and are called by the SPA.
+ */
+extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * These routines are only called by the DMU.
+ */
+dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd);
+int dmu_tx_is_syncing(dmu_tx_t *tx);
+int dmu_tx_private_ok(dmu_tx_t *tx);
+void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object);
+void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta);
+void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db);
+int dmu_tx_holds(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space);
+
+#ifdef ZFS_DEBUG
+#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db)
+#else
+#define DMU_TX_DIRTY_BUF(tx, db)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_TX_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
new file mode 100644
index 0000000..c94bced
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _DFETCH_H
+#define _DFETCH_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint64_t zfetch_array_rd_sz;
+
+struct dnode; /* so we can reference dnode */
+
+typedef enum zfetch_dirn {
+ ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */
+ ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */
+} zfetch_dirn_t;
+
+typedef struct zstream {
+ uint64_t zst_offset; /* offset of starting block in range */
+ uint64_t zst_len; /* length of range, in blocks */
+ zfetch_dirn_t zst_direction; /* direction of prefetch */
+ uint64_t zst_stride; /* length of stride, in blocks */
+ uint64_t zst_ph_offset; /* prefetch offset, in blocks */
+ uint64_t zst_cap; /* prefetch limit (cap), in blocks */
+ kmutex_t zst_lock; /* protects stream */
+ clock_t zst_last; /* lbolt of last prefetch */
+ avl_node_t zst_node; /* embed avl node here */
+} zstream_t;
+
+typedef struct zfetch {
+ krwlock_t zf_rwlock; /* protects zfetch structure */
+ list_t zf_stream; /* list of zstream_t's */
+ struct dnode *zf_dnode; /* dnode that owns this zfetch */
+ uint32_t zf_stream_cnt; /* # of active streams */
+ uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */
+} zfetch_t;
+
+void dmu_zfetch_init(zfetch_t *, struct dnode *);
+void dmu_zfetch_rele(zfetch_t *);
+void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DFETCH_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
new file mode 100644
index 0000000..327e538
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
@@ -0,0 +1,267 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DNODE_H
+#define _SYS_DNODE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/refcount.h>
+#include <sys/dmu_zfetch.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Flags.
+ */
+#define DNODE_MUST_BE_ALLOCATED 1
+#define DNODE_MUST_BE_FREE 2
+
+/*
+ * Fixed constants.
+ */
+#define DNODE_SHIFT 9 /* 512 bytes */
+#define DN_MIN_INDBLKSHIFT 10 /* 1k */
+#define DN_MAX_INDBLKSHIFT 14 /* 16k */
+#define DNODE_BLOCK_SHIFT 14 /* 16k */
+#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
+#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
+#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
+
+/*
+ * Derived constants.
+ */
+#define DNODE_SIZE (1 << DNODE_SHIFT)
+#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
+#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
+#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
+
+#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
+#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
+#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+
+/* The +2 here is a cheesy way to round up */
+#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
+ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
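
Worked through with the constants above (and taking SPA_MINBLOCKSHIFT == 9
and SPA_BLKPTRSHIFT == 7 from spa.h, which are assumptions here), the worst
case is 512-byte data blocks under 1K indirect blocks holding only
2^(10-7) = 8 blkptrs apiece, giving
DN_MAX_LEVELS = 2 + (64 - 9) / (10 - 7) = 2 + 18 = 20 levels.
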
+
+#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
+ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
+
+#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
+ (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
+
+#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift))
+
+struct dmu_buf_impl;
+struct objset_impl;
+struct zio;
+
+enum dnode_dirtycontext {
+ DN_UNDIRTIED,
+ DN_DIRTY_OPEN,
+ DN_DIRTY_SYNC
+};
+
+/* Is dn_used in bytes? If not, it's in multiples of SPA_MINBLOCKSIZE */
+#define DNODE_FLAG_USED_BYTES (1<<0)
+
+typedef struct dnode_phys {
+ uint8_t dn_type; /* dmu_object_type_t */
+ uint8_t dn_indblkshift; /* ln2(indirect block size) */
+ uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */
+ uint8_t dn_nblkptr; /* length of dn_blkptr */
+ uint8_t dn_bonustype; /* type of data in bonus buffer */
+ uint8_t dn_checksum; /* ZIO_CHECKSUM type */
+ uint8_t dn_compress; /* ZIO_COMPRESS type */
+ uint8_t dn_flags; /* DNODE_FLAG_* */
+ uint16_t dn_datablkszsec; /* data block size in 512b sectors */
+ uint16_t dn_bonuslen; /* length of dn_bonus */
+ uint8_t dn_pad2[4];
+
+ /* accounting is protected by dn_dirty_mtx */
+ uint64_t dn_maxblkid; /* largest allocated block ID */
+ uint64_t dn_used; /* bytes (or sectors) of disk space */
+
+ uint64_t dn_pad3[4];
+
+ blkptr_t dn_blkptr[1];
+ uint8_t dn_bonus[DN_MAX_BONUSLEN];
+} dnode_phys_t;
+
+typedef struct dnode {
+ /*
+ * dn_struct_rwlock protects the structure of the dnode,
+ * including the number of levels of indirection (dn_nlevels),
+ * dn_maxblkid, and dn_next_*
+ */
+ krwlock_t dn_struct_rwlock;
+
+ /*
+ * Our link on the dataset's dd_dnodes list.
+ * Protected by dd_accounting_mtx.
+ */
+ list_node_t dn_link;
+
+ /* immutable: */
+ struct objset_impl *dn_objset;
+ uint64_t dn_object;
+ struct dmu_buf_impl *dn_dbuf;
+ dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
+
+ /*
+ * Copies of stuff in dn_phys. They're valid in the open
+ * context (eg. even before the dnode is first synced).
+ * Where necessary, these are protected by dn_struct_rwlock.
+ */
+ dmu_object_type_t dn_type; /* object type */
+ uint16_t dn_bonuslen; /* bonus length */
+ uint8_t dn_bonustype; /* bonus type */
+ uint8_t dn_nblkptr; /* number of blkptrs (immutable) */
+ uint8_t dn_checksum; /* ZIO_CHECKSUM type */
+ uint8_t dn_compress; /* ZIO_COMPRESS type */
+ uint8_t dn_nlevels;
+ uint8_t dn_indblkshift;
+ uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
+ uint16_t dn_datablkszsec; /* in 512b sectors */
+ uint32_t dn_datablksz; /* in bytes */
+ uint64_t dn_maxblkid;
+ uint8_t dn_next_nlevels[TXG_SIZE];
+ uint8_t dn_next_indblkshift[TXG_SIZE];
+ uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
+
+ /* protected by os_lock: */
+ list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
+
+ /* protected by dn_mtx: */
+ kmutex_t dn_mtx;
+ list_t dn_dirty_records[TXG_SIZE];
+ avl_tree_t dn_ranges[TXG_SIZE];
+ uint64_t dn_allocated_txg;
+ uint64_t dn_free_txg;
+ uint64_t dn_assigned_txg;
+ kcondvar_t dn_notxholds;
+ enum dnode_dirtycontext dn_dirtyctx;
+ uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */
+
+ /* protected by own devices */
+ refcount_t dn_tx_holds;
+ refcount_t dn_holds;
+
+ kmutex_t dn_dbufs_mtx;
+ list_t dn_dbufs; /* linked list of descendant dbuf_t's */
+ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
+
+ /* parent IO for current sync write */
+ zio_t *dn_zio;
+
+ /* holds prefetch structure */
+ struct zfetch dn_zfetch;
+} dnode_t;
+
+typedef struct free_range {
+ avl_node_t fr_node;
+ uint64_t fr_blkid;
+ uint64_t fr_nblks;
+} free_range_t;
+
+dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp,
+ uint64_t object);
+void dnode_special_close(dnode_t *dn);
+
+int dnode_hold(struct objset_impl *dd, uint64_t object,
+ void *ref, dnode_t **dnp);
+int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
+ void *ref, dnode_t **dnp);
+void dnode_add_ref(dnode_t *dn, void *ref);
+void dnode_rele(dnode_t *dn, void *ref);
+void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
+void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
+void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+void dnode_free(dnode_t *dn, dmu_tx_t *tx);
+void dnode_byteswap(dnode_phys_t *dnp);
+void dnode_buf_byteswap(void *buf, size_t size);
+void dnode_verify(dnode_t *dn);
+int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
+uint64_t dnode_current_max_length(dnode_t *dn);
+void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
+void dnode_clear_range(dnode_t *dn, uint64_t blkid,
+ uint64_t nblks, dmu_tx_t *tx);
+void dnode_diduse_space(dnode_t *dn, int64_t space);
+void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
+void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx);
+uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
+void dnode_init(void);
+void dnode_fini(void);
+int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl,
+ uint64_t blkfill, uint64_t txg);
+int dnode_evict_dbufs(dnode_t *dn, int try);
+
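+/*
+ * Editorial sketch, not part of the original header: the usual
+ * hold/release pattern for the functions above ("os" is a
+ * struct objset_impl pointer; FTAG comes from sys/refcount.h).
+ *
+ *	dnode_t *dn;
+ *	int err = dnode_hold(os, object, FTAG, &dn);
+ *	if (err == 0) {
+ *		... use dn ...
+ *		dnode_rele(dn, FTAG);
+ *	}
+ */
+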
+#ifdef ZFS_DEBUG
+
+/*
+ * There should be a ## between the string literal and fmt, to make it
+ * clear that we're joining two strings together, but gcc does not
+ * support that use of the preprocessor token.
+ */
+#define dprintf_dnode(dn, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char __db_buf[32]; \
+ uint64_t __db_obj = (dn)->dn_object; \
+ if (__db_obj == DMU_META_DNODE_OBJECT) \
+ (void) strcpy(__db_buf, "mdn"); \
+ else \
+ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
+ (u_longlong_t)__db_obj);\
+ dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
+ __db_buf, __VA_ARGS__); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+
+#define DNODE_VERIFY(dn) dnode_verify(dn)
+#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx)
+
+#else
+
+#define dprintf_dnode(dn, fmt, ...)
+#define DNODE_VERIFY(dn)
+#define FREE_VERIFY(db, start, end, tx)
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DNODE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
new file mode 100644
index 0000000..8929dbc
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -0,0 +1,185 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_DATASET_H
+#define _SYS_DSL_DATASET_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/bplist.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+struct dsl_dir;
+struct dsl_pool;
+
+typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
+
+#define DS_FLAG_INCONSISTENT (1ULL<<0)
+/*
+ * NB: nopromote cannot yet be set, but we want support for it in this
+ * on-disk version, so that we don't need to upgrade for it later. It
+ * will be needed when we implement 'zfs split' (where the split off
+ * clone should not be promoted).
+ */
+#define DS_FLAG_NOPROMOTE (1ULL<<1)
+
+typedef struct dsl_dataset_phys {
+ uint64_t ds_dir_obj;
+ uint64_t ds_prev_snap_obj;
+ uint64_t ds_prev_snap_txg;
+ uint64_t ds_next_snap_obj;
+ uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */
+ uint64_t ds_num_children; /* clone/snap children; ==0 for head */
+ uint64_t ds_creation_time; /* seconds since 1970 */
+ uint64_t ds_creation_txg;
+ uint64_t ds_deadlist_obj;
+ uint64_t ds_used_bytes;
+ uint64_t ds_compressed_bytes;
+ uint64_t ds_uncompressed_bytes;
+ uint64_t ds_unique_bytes; /* only relevant to snapshots */
+ /*
+ * The ds_fsid_guid is a 56-bit ID that can change to avoid
+ * collisions. The ds_guid is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.
+ */
+ uint64_t ds_fsid_guid;
+ uint64_t ds_guid;
+ uint64_t ds_flags;
+ blkptr_t ds_bp;
+ uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */
+} dsl_dataset_phys_t;
+
+typedef struct dsl_dataset {
+ /* Immutable: */
+ struct dsl_dir *ds_dir;
+ dsl_dataset_phys_t *ds_phys;
+ dmu_buf_t *ds_dbuf;
+ uint64_t ds_object;
+
+ /* only used in syncing context: */
+ struct dsl_dataset *ds_prev; /* only valid for non-snapshots */
+
+ /* has internal locking: */
+ bplist_t ds_deadlist;
+
+ /* protected by lock on pool's dp_dirty_datasets list */
+ txg_node_t ds_dirty_link;
+ list_node_t ds_synced_link;
+
+ /*
+ * Protected by ds_lock (note that the ds_phys->ds_<accounting>
+ * fields are also protected by ds_lock):
+ */
+ kmutex_t ds_lock;
+ void *ds_user_ptr;
+ dsl_dataset_evict_func_t *ds_user_evict_func;
+ uint64_t ds_open_refcount;
+
+ /* no locking; only for making guesses */
+ uint64_t ds_trysnap_txg;
+
+ /* Protected by ds_lock; keep at end of struct for better locality */
+ char ds_snapname[MAXNAMELEN];
+} dsl_dataset_t;
+
+#define dsl_dataset_is_snapshot(ds) \
+ ((ds)->ds_phys->ds_num_children != 0)
+
+int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
+ void *tag, dsl_dataset_t **dsp);
+int dsl_dataset_open(const char *name, int mode, void *tag,
+ dsl_dataset_t **dsp);
+int dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj,
+ const char *tail, int mode, void *tag, dsl_dataset_t **);
+void dsl_dataset_name(dsl_dataset_t *ds, char *name);
+void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag);
+uint64_t dsl_dataset_create_sync(dsl_dir_t *pds,
+ const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx);
+int dsl_dataset_destroy(const char *name);
+int dsl_snapshots_destroy(char *fsname, char *snapname);
+dsl_checkfunc_t dsl_dataset_snapshot_check;
+dsl_syncfunc_t dsl_dataset_snapshot_sync;
+int dsl_dataset_rollback(dsl_dataset_t *ds);
+int dsl_dataset_rename(const char *name, const char *newname);
+int dsl_dataset_promote(const char *name);
+
+void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
+ void *p, dsl_dataset_evict_func_t func);
+void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds);
+
+blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
+void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+
+spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
+
+void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
+
+void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+ dmu_tx_t *tx);
+int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
+uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
+
+void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
+void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv);
+void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat);
+void dsl_dataset_space(dsl_dataset_t *ds,
+ uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp);
+uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds);
+
+void dsl_dataset_create_root(struct dsl_pool *dp, uint64_t *ddobjp,
+ dmu_tx_t *tx);
+
+int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
+
+#ifdef ZFS_DEBUG
+#define dprintf_ds(ds, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
+ dsl_dataset_name(ds, __ds_name); \
+ dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
+ kmem_free(__ds_name, MAXNAMELEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_ds(ds, fmt, ...)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DATASET_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
new file mode 100644
index 0000000..f33776a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
@@ -0,0 +1,142 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_DIR_H
+#define _SYS_DSL_DIR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/refcount.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+
+typedef struct dsl_dir_phys {
+ uint64_t dd_creation_time; /* not actually used */
+ uint64_t dd_head_dataset_obj;
+ uint64_t dd_parent_obj;
+ uint64_t dd_clone_parent_obj;
+ uint64_t dd_child_dir_zapobj;
+ /*
+ * how much space our children are accounting for; for leaf
+ * datasets, == physical space used by fs + snaps
+ */
+ uint64_t dd_used_bytes;
+ uint64_t dd_compressed_bytes;
+ uint64_t dd_uncompressed_bytes;
+ /* Administrative quota setting */
+ uint64_t dd_quota;
+ /* Administrative reservation setting */
+ uint64_t dd_reserved;
+ uint64_t dd_props_zapobj;
+ uint64_t dd_pad[21]; /* pad out to 256 bytes for good measure */
+} dsl_dir_phys_t;
+
+struct dsl_dir {
+ /* These are immutable; no lock needed: */
+ uint64_t dd_object;
+ dsl_dir_phys_t *dd_phys;
+ dmu_buf_t *dd_dbuf;
+ dsl_pool_t *dd_pool;
+
+ /* protected by lock on pool's dp_dirty_dirs list */
+ txg_node_t dd_dirty_link;
+
+ /* protected by dp_config_rwlock */
+ dsl_dir_t *dd_parent;
+
+ /* Protected by dd_lock */
+ kmutex_t dd_lock;
+ list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
+
+ /* Accounting */
+ /* reflects any changes to dd_phys->dd_used_bytes made this syncing */
+ int64_t dd_used_bytes;
+ /* gross estimate of space used by in-flight tx's */
+ uint64_t dd_tempreserved[TXG_SIZE];
+ /* amount of space we expect to write; == amount of dirty data */
+ int64_t dd_space_towrite[TXG_SIZE];
+
+ /* protected by dd_lock; keep at end of struct for better locality */
+ char dd_myname[MAXNAMELEN];
+};
+
+void dsl_dir_close(dsl_dir_t *dd, void *tag);
+int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail);
+int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **,
+ const char **tailp);
+int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+ const char *tail, void *tag, dsl_dir_t **);
+void dsl_dir_name(dsl_dir_t *dd, char *buf);
+int dsl_dir_is_private(dsl_dir_t *dd);
+uint64_t dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx);
+void dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx);
+dsl_checkfunc_t dsl_dir_destroy_check;
+dsl_syncfunc_t dsl_dir_destroy_sync;
+void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv);
+uint64_t dsl_dir_space_available(dsl_dir_t *dd,
+ dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
+void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
+void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
+int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
+ uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx);
+void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx);
+void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx);
+void dsl_dir_diduse_space(dsl_dir_t *dd,
+ int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
+int dsl_dir_set_quota(const char *ddname, uint64_t quota);
+int dsl_dir_set_reservation(const char *ddname, uint64_t reservation);
+int dsl_dir_rename(dsl_dir_t *dd, const char *newname);
+int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
+
+/* internal reserved dir name */
+#define MOS_DIR_NAME "$MOS"
+
+#ifdef ZFS_DEBUG
+#define dprintf_dd(dd, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__ds_name = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, \
+ KM_SLEEP); \
+ dsl_dir_name(dd, __ds_name); \
+ dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \
+ kmem_free(__ds_name, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_dd(dd, fmt, ...)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DIR_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
new file mode 100644
index 0000000..f7ec67a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
@@ -0,0 +1,82 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_POOL_H
+#define _SYS_DSL_POOL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/txg_impl.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+
+typedef struct dsl_pool {
+ /* Immutable */
+ spa_t *dp_spa;
+ struct objset *dp_meta_objset;
+ struct dsl_dir *dp_root_dir;
+ struct dsl_dir *dp_mos_dir;
+ uint64_t dp_root_dir_obj;
+
+ /* No lock needed - sync context only */
+ blkptr_t dp_meta_rootbp;
+ list_t dp_synced_objsets;
+
+ /* Has its own locking */
+ tx_state_t dp_tx;
+ txg_list_t dp_dirty_datasets;
+ txg_list_t dp_dirty_dirs;
+ txg_list_t dp_sync_tasks;
+
+ /*
+ * Protects administrative changes (properties, namespace)
+ * It is only held for write in syncing context. Therefore
+ * syncing context does not need to ever have it for read, since
+ * nobody else could possibly have it for write.
+ */
+ krwlock_t dp_config_rwlock;
+} dsl_pool_t;
+
+int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
+void dsl_pool_close(dsl_pool_t *dp);
+dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg);
+void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
+void dsl_pool_zil_clean(dsl_pool_t *dp);
+int dsl_pool_sync_context(dsl_pool_t *dp);
+uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_POOL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
new file mode 100644
index 0000000..d2debff
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_PROP_H
+#define _SYS_DSL_PROP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+
+/* The callback func may not call into the DMU or DSL! */
+typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval);
+
+typedef struct dsl_prop_cb_record {
+ list_node_t cbr_node; /* link on dd_prop_cbs */
+ struct dsl_dataset *cbr_ds;
+ const char *cbr_propname;
+ dsl_prop_changed_cb_t *cbr_func;
+ void *cbr_arg;
+} dsl_prop_cb_record_t;
+
+int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg);
+int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg);
+int dsl_prop_numcb(struct dsl_dataset *ds);
+
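+/*
+ * Editorial sketch, not part of the original header: registering a
+ * property callback. The callback body and names are hypothetical;
+ * note that the callback may not call back into the DMU or DSL.
+ *
+ *	static void
+ *	recordsize_changed_cb(void *arg, uint64_t newval)
+ *	{
+ *		*(uint64_t *)arg = newval;
+ *	}
+ *
+ *	err = dsl_prop_register(ds, "recordsize",
+ *	    recordsize_changed_cb, &my_recordsize);
+ *	...
+ *	err = dsl_prop_unregister(ds, "recordsize",
+ *	    recordsize_changed_cb, &my_recordsize);
+ */
+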
+int dsl_prop_get(const char *ddname, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint);
+int dsl_prop_get_integer(const char *ddname, const char *propname,
+ uint64_t *valuep, char *setpoint);
+int dsl_prop_get_all(objset_t *os, nvlist_t **nvp);
+
+int dsl_prop_set(const char *ddname, const char *propname,
+ int intsz, int numints, const void *buf);
+int dsl_prop_set_dd(dsl_dir_t *dd, const char *propname,
+ int intsz, int numints, const void *buf);
+
+void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
+void dsl_prop_nvlist_add_string(nvlist_t *nv,
+ zfs_prop_t prop, const char *value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_PROP_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
new file mode 100644
index 0000000..e695b18
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DSL_SYNCTASK_H
+#define _SYS_DSL_SYNCTASK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_pool;
+
+typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *);
+typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *);
+
+typedef struct dsl_sync_task {
+ list_node_t dst_node;
+ dsl_checkfunc_t *dst_checkfunc;
+ dsl_syncfunc_t *dst_syncfunc;
+ void *dst_arg1;
+ void *dst_arg2;
+ int dst_err;
+} dsl_sync_task_t;
+
+typedef struct dsl_sync_task_group {
+ txg_node_t dstg_node;
+ list_t dstg_tasks;
+ struct dsl_pool *dstg_pool;
+ uint64_t dstg_txg;
+ int dstg_err;
+ int dstg_space;
+} dsl_sync_task_group_t;
+
+dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp);
+void dsl_sync_task_create(dsl_sync_task_group_t *dstg,
+ dsl_checkfunc_t *, dsl_syncfunc_t *,
+ void *arg1, void *arg2, int blocks_modified);
+int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg);
+void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg);
+void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx);
+
+int dsl_sync_task_do(struct dsl_pool *dp,
+ dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
+ void *arg1, void *arg2, int blocks_modified);
+
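+/*
+ * Editorial sketch, not part of the original header: the check/sync
+ * callback pattern consumed by dsl_sync_task_do(). The callbacks are
+ * hypothetical; a nonzero return from the checkfunc aborts the task,
+ * and the syncfunc then applies the change in syncing context.
+ *
+ *	static int
+ *	my_check(void *arg1, void *arg2, dmu_tx_t *tx)
+ *	{
+ *		return (0);
+ *	}
+ *
+ *	static void
+ *	my_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+ *	{
+ *		... modify on-disk state under tx ...
+ *	}
+ *
+ *	err = dsl_sync_task_do(dp, my_check, my_sync, arg1, arg2, 0);
+ */
+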
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_SYNCTASK_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
new file mode 100644
index 0000000..095dd3c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
@@ -0,0 +1,69 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_METASLAB_H
+#define _SYS_METASLAB_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/space_map.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct metaslab_class metaslab_class_t;
+typedef struct metaslab_group metaslab_group_t;
+
+extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
+ uint64_t start, uint64_t size, uint64_t txg);
+extern void metaslab_fini(metaslab_t *msp);
+extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
+extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
+
+extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp,
+ int ncopies, uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid);
+extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
+ boolean_t now);
+extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
+
+extern metaslab_class_t *metaslab_class_create(void);
+extern void metaslab_class_destroy(metaslab_class_t *mc);
+extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
+extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
+
+extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
+ vdev_t *vd);
+extern void metaslab_group_destroy(metaslab_group_t *mg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_METASLAB_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
new file mode 100644
index 0000000..5980cbc
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -0,0 +1,81 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_METASLAB_IMPL_H
+#define _SYS_METASLAB_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/metaslab.h>
+#include <sys/space_map.h>
+#include <sys/vdev.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct metaslab_class {
+ metaslab_group_t *mc_rotor;
+ uint64_t mc_allocated;
+};
+
+struct metaslab_group {
+ kmutex_t mg_lock;
+ avl_tree_t mg_metaslab_tree;
+ uint64_t mg_aliquot;
+ int64_t mg_bias;
+ metaslab_class_t *mg_class;
+ vdev_t *mg_vd;
+ metaslab_group_t *mg_prev;
+ metaslab_group_t *mg_next;
+};
+
+/*
+ * Each metaslab's free space is tracked in a space map object in the MOS,
+ * which is only updated in syncing context. Each time we sync a txg,
+ * we append the allocs and frees from that txg to the space map object.
+ * When the txg is done syncing, metaslab_sync_done() updates ms_smo
+ * to ms_smo_syncing. Everything in ms_smo is always safe to allocate.
+ */
+struct metaslab {
+ kmutex_t ms_lock; /* metaslab lock */
+ space_map_obj_t ms_smo; /* synced space map object */
+ space_map_obj_t ms_smo_syncing; /* syncing space map object */
+ space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */
+ space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */
+ space_map_t ms_map; /* in-core free space map */
+ uint64_t ms_weight; /* weight vs. others in group */
+ metaslab_group_t *ms_group; /* metaslab group */
+ avl_node_t ms_group_node; /* node in metaslab group tree */
+ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_METASLAB_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
new file mode 100644
index 0000000..4de1cae
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
@@ -0,0 +1,103 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_REFCOUNT_H
+#define _SYS_REFCOUNT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/list.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * If the reference is held only by the calling function and not any
+ * particular object, use FTAG (which is a string) for the holder_tag.
+ * Otherwise, use the object that holds the reference.
+ */
+#define FTAG ((char *)__func__)
+
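+/*
+ * Editorial example, not part of the original header: a hold taken
+ * and released within a single function, using FTAG as the holder
+ * tag (the refcount_t "rc" is hypothetical).
+ *
+ *	(void) refcount_add(&rc, FTAG);
+ *	... temporary use of the guarded object ...
+ *	(void) refcount_remove(&rc, FTAG);
+ */
+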
+#if defined(DEBUG) || !defined(_KERNEL)
+typedef struct reference {
+ list_node_t ref_link;
+ void *ref_holder;
+ uint64_t ref_number;
+ uint8_t *ref_removed;
+} reference_t;
+
+typedef struct refcount {
+ kmutex_t rc_mtx;
+ list_t rc_list;
+ list_t rc_removed;
+ int64_t rc_count;
+ int64_t rc_removed_count;
+} refcount_t;
+
+/* Note: refcount_t should be initialized to zero before use. */
+
+void refcount_create(refcount_t *rc);
+void refcount_destroy(refcount_t *rc);
+void refcount_destroy_many(refcount_t *rc, uint64_t number);
+int refcount_is_zero(refcount_t *rc);
+int64_t refcount_count(refcount_t *rc);
+int64_t refcount_add(refcount_t *rc, void *holder_tag);
+int64_t refcount_remove(refcount_t *rc, void *holder_tag);
+int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
+int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
+
+void refcount_init(void);
+void refcount_fini(void);
+
+#else /* DEBUG */
+
+typedef struct refcount {
+ uint64_t rc_count;
+} refcount_t;
+
+#define refcount_create(rc) ((rc)->rc_count = 0)
+#define refcount_destroy(rc) ((rc)->rc_count = 0)
+#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
+#define refcount_is_zero(rc) ((rc)->rc_count == 0)
+#define refcount_count(rc) ((rc)->rc_count)
+#define refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1)
+#define refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1)
+#define refcount_add_many(rc, number, holder) \
+ atomic_add_64_nv(&(rc)->rc_count, number)
+#define refcount_remove_many(rc, number, holder) \
+ atomic_add_64_nv(&(rc)->rc_count, -(number))
+
+#define refcount_init()
+#define refcount_fini()
+
+#endif /* DEBUG */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_REFCOUNT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
new file mode 100644
index 0000000..2bcf4c8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
@@ -0,0 +1,491 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPA_H
+#define _SYS_SPA_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/avl.h>
+#include <sys/zfs_context.h>
+#include <sys/nvpair.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Forward references that lots of things need.
+ */
+typedef struct spa spa_t;
+typedef struct vdev vdev_t;
+typedef struct metaslab metaslab_t;
+typedef struct zilog zilog_t;
+typedef struct traverse_handle traverse_handle_t;
+struct dsl_pool;
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ */
+#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
+#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
+#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
+#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
+
+#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
+#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
+
+#define BF32_SET(x, low, len, val) \
+ ((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len))
+#define BF64_SET(x, low, len, val) \
+ ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len))
+
+#define BF32_GET_SB(x, low, len, shift, bias) \
+ ((BF32_GET(x, low, len) + (bias)) << (shift))
+#define BF64_GET_SB(x, low, len, shift, bias) \
+ ((BF64_GET(x, low, len) + (bias)) << (shift))
+
+#define BF32_SET_SB(x, low, len, shift, bias, val) \
+ BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
+#define BF64_SET_SB(x, low, len, shift, bias, val) \
+ BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
+
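+/*
+ * Worked example (editorial, not in the original source): starting
+ * from x == 0, BF64_SET(x, 32, 8, 0x2a) stores 0x2a in bits 32..39,
+ * so x == 0x2a00000000ULL and BF64_GET(x, 32, 8) == 0x2a. The _SB
+ * ("shift/bias") variants rescale on the way in and out; e.g.
+ * BF64_GET_SB(x, 0, 16, SPA_MINBLOCKSHIFT, 1) turns a field stored
+ * as (512-byte sectors - 1) back into a byte count.
+ */
+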
+/*
+ * We currently support nine block sizes, from 512 bytes to 128K.
+ * We could go higher, but the benefits are near-zero and the cost
+ * of COWing a giant block to modify one byte would become excessive.
+ */
+#define SPA_MINBLOCKSHIFT 9
+#define SPA_MAXBLOCKSHIFT 17
+#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
+#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
+
+#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
+
+/*
+ * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
+ * The ASIZE encoding should be at least 64 times larger (6 more bits)
+ * to support up to 4-way RAID-Z mirror mode with worst-case gang block
+ * overhead, three DVAs per bp, plus one more bit in case we do anything
+ * else that expands the ASIZE.
+ */
+#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */
+#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
+#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
+
+/*
+ * All SPA data is represented by 128-bit data virtual addresses (DVAs).
+ * The members of the dva_t should be considered opaque outside the SPA.
+ */
+typedef struct dva {
+ uint64_t dva_word[2];
+} dva_t;
+
+/*
+ * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
+ */
+typedef struct zio_cksum {
+ uint64_t zc_word[4];
+} zio_cksum_t;
+
+/*
+ * Each block is described by its DVAs, time of birth, checksum, etc.
+ * The word-by-word, bit-by-bit layout of the blkptr is as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | vdev1 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 1 |G| offset1 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 2 | vdev2 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 3 |G| offset2 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 4 | vdev3 | GRID | ASIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 5 |G| offset3 |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 8 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 9 | padding |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a | birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b | fill count |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * c | checksum[0] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * d | checksum[1] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * e | checksum[2] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * f | checksum[3] |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * vdev virtual device ID
+ * offset offset into virtual device
+ * LSIZE logical size
+ * PSIZE physical size (after compression)
+ * ASIZE allocated size (including RAID-Z parity and gang block headers)
+ * GRID RAID-Z layout information (reserved for future use)
+ * cksum checksum function
+ * comp compression function
+ * G gang block indicator
+ * E endianness
+ * type DMU object type
+ * lvl level of indirection
+ * birth txg transaction group in which the block was born
+ * fill count number of non-zero blocks under this bp
+ * checksum[4] 256-bit checksum of the data this bp describes
+ */
+typedef struct blkptr {
+ dva_t blk_dva[3]; /* 128-bit Data Virtual Address */
+ uint64_t blk_prop; /* size, compression, type, etc */
+ uint64_t blk_pad[3]; /* Extra space for the future */
+ uint64_t blk_birth; /* transaction group at birth */
+ uint64_t blk_fill; /* fill count */
+ zio_cksum_t blk_cksum; /* 256-bit checksum */
+} blkptr_t;
+
+#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
+#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+
+/*
+ * Macros to get and set fields in a bp or DVA.
+ */
+#define DVA_GET_ASIZE(dva) \
+ BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_ASIZE(dva, x) \
+ BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
+#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
+
+#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32)
+#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x)
+
+#define DVA_GET_OFFSET(dva) \
+ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define DVA_SET_OFFSET(dva, x) \
+ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
+
+#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1)
+#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
+
+#define BP_GET_LSIZE(bp) \
+ (BP_IS_HOLE(bp) ? 0 : \
+ BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1))
+#define BP_SET_LSIZE(bp, x) \
+ BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define BP_GET_PSIZE(bp) \
+ BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define BP_SET_PSIZE(bp, x) \
+ BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
+#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+
+#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
+#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
+
+#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
+#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
+
+#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
+#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+
+#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+
+#define BP_GET_ASIZE(bp) \
+ (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define BP_GET_UCSIZE(bp) \
+ ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
+ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
+
+#define BP_GET_NDVAS(bp) \
+ (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define BP_COUNT_GANG(bp) \
+ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \
+ DVA_GET_GANG(&(bp)->blk_dva[1]) + \
+ DVA_GET_GANG(&(bp)->blk_dva[2]))
+
+#define DVA_EQUAL(dva1, dva2) \
+ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
+ (dva1)->dva_word[0] == (dva2)->dva_word[0])
+
+#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
+ (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
+ ((zc1).zc_word[1] - (zc2).zc_word[1]) | \
+ ((zc1).zc_word[2] - (zc2).zc_word[2]) | \
+ ((zc1).zc_word[3] - (zc2).zc_word[3])))
+
+
+#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)
+
+#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \
+{ \
+ (zcp)->zc_word[0] = w0; \
+ (zcp)->zc_word[1] = w1; \
+ (zcp)->zc_word[2] = w2; \
+ (zcp)->zc_word[3] = w3; \
+}
+
+#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
+#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
+#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
+#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
+
+#define BP_ZERO(bp) \
+{ \
+ (bp)->blk_dva[0].dva_word[0] = 0; \
+ (bp)->blk_dva[0].dva_word[1] = 0; \
+ (bp)->blk_dva[1].dva_word[0] = 0; \
+ (bp)->blk_dva[1].dva_word[1] = 0; \
+ (bp)->blk_dva[2].dva_word[0] = 0; \
+ (bp)->blk_dva[2].dva_word[1] = 0; \
+ (bp)->blk_prop = 0; \
+ (bp)->blk_pad[0] = 0; \
+ (bp)->blk_pad[1] = 0; \
+ (bp)->blk_pad[2] = 0; \
+ (bp)->blk_birth = 0; \
+ (bp)->blk_fill = 0; \
+ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
+}
+
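+/*
+ * Editorial example, not part of the original header: sizes are stored
+ * biased and in sectors, which the accessors hide. Note BP_GET_LSIZE()
+ * reports 0 for a hole, so blk_birth must be set first here ("txg" is
+ * a hypothetical variable).
+ *
+ *	blkptr_t bp;
+ *	BP_ZERO(&bp);
+ *	bp.blk_birth = txg;
+ *	BP_SET_LSIZE(&bp, 4096);	stored as 4096/512 - 1 == 7
+ *	ASSERT(BP_GET_LSIZE(&bp) == 4096);
+ */
+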
+/*
+ * Note: the byteorder is either 0 or -1, both of which are palindromes.
+ * This simplifies the endianness handling a bit.
+ */
+#ifdef _BIG_ENDIAN
+#define ZFS_HOST_BYTEORDER (0ULL)
+#else
+#define ZFS_HOST_BYTEORDER (-1ULL)
+#endif
+
+#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
+
+#define BP_SPRINTF_LEN 320
+
+#include <sys/dmu.h>
+
+#define BP_GET_BUFC_TYPE(bp) \
+ (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \
+ ARC_BUFC_METADATA : ARC_BUFC_DATA)
+
+/*
+ * Routines found in spa.c
+ */
+
+/* state manipulation functions */
+extern int spa_open(const char *pool, spa_t **, void *tag);
+extern int spa_get_stats(const char *pool, nvlist_t **config,
+ char *altroot, size_t buflen);
+extern int spa_create(const char *pool, nvlist_t *config, const char *altroot);
+extern int spa_import(const char *pool, nvlist_t *config, const char *altroot);
+extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
+extern int spa_destroy(char *pool);
+extern int spa_export(char *pool, nvlist_t **oldconfig);
+extern int spa_reset(char *pool);
+extern void spa_async_request(spa_t *spa, int flag);
+extern void spa_async_suspend(spa_t *spa);
+extern void spa_async_resume(spa_t *spa);
+extern spa_t *spa_inject_addref(char *pool);
+extern void spa_inject_delref(spa_t *spa);
+
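+/*
+ * Editorial sketch, not part of the original header: the usual way to
+ * take and drop a reference on a pool (pool name and error handling
+ * are illustrative).
+ *
+ *	spa_t *spa;
+ *	int err = spa_open("tank", &spa, FTAG);
+ *	if (err == 0) {
+ *		... use the pool ...
+ *		spa_close(spa, FTAG);
+ *	}
+ */
+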
+#define SPA_ASYNC_REOPEN 0x01
+#define SPA_ASYNC_REPLACE_DONE 0x02
+#define SPA_ASYNC_SCRUB 0x04
+#define SPA_ASYNC_RESILVER 0x08
+#define SPA_ASYNC_CONFIG_UPDATE 0x10
+
+/* device manipulation */
+extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
+extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
+ int replacing);
+extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done);
+extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
+extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
+
+/* spare state (which is global across all pools) */
+extern void spa_spare_add(vdev_t *vd);
+extern void spa_spare_remove(vdev_t *vd);
+extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool);
+extern void spa_spare_activate(vdev_t *vd);
+
+/* scrubbing */
+extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force);
+extern void spa_scrub_suspend(spa_t *spa);
+extern void spa_scrub_resume(spa_t *spa);
+extern void spa_scrub_restart(spa_t *spa, uint64_t txg);
+
+/* spa syncing */
+extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
+extern void spa_sync_allpools(void);
+
+/*
+ * SPA configuration functions in spa_config.c
+ */
+
+#define SPA_CONFIG_UPDATE_POOL 0
+#define SPA_CONFIG_UPDATE_VDEVS 1
+
+extern void spa_config_sync(void);
+extern void spa_config_load(void);
+extern nvlist_t *spa_all_configs(uint64_t *);
+extern void spa_config_set(spa_t *spa, nvlist_t *config);
+extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
+ int getstats);
+extern void spa_config_update(spa_t *spa, int what);
+
+/*
+ * Miscellaneous SPA routines in spa_misc.c
+ */
+
+/* Namespace manipulation */
+extern spa_t *spa_lookup(const char *name);
+extern spa_t *spa_add(const char *name, const char *altroot);
+extern void spa_remove(spa_t *spa);
+extern spa_t *spa_next(spa_t *prev);
+
+/* Refcount functions */
+extern void spa_open_ref(spa_t *spa, void *tag);
+extern void spa_close(spa_t *spa, void *tag);
+extern boolean_t spa_refcount_zero(spa_t *spa);
+
+/* Pool configuration lock */
+extern void spa_config_enter(spa_t *spa, krw_t rw, void *tag);
+extern void spa_config_exit(spa_t *spa, void *tag);
+extern boolean_t spa_config_held(spa_t *spa, krw_t rw);
+
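+/*
+ * Editorial sketch, not part of the original header: reading pool
+ * state under the configuration lock.
+ *
+ *	spa_config_enter(spa, RW_READER, FTAG);
+ *	... examine vdevs / config ...
+ *	spa_config_exit(spa, FTAG);
+ */
+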
+/* Pool vdev add/remove lock */
+extern uint64_t spa_vdev_enter(spa_t *spa);
+extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
+
+/* Accessor functions */
+extern krwlock_t *spa_traverse_rwlock(spa_t *spa);
+extern int spa_traverse_wanted(spa_t *spa);
+extern struct dsl_pool *spa_get_dsl(spa_t *spa);
+extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
+extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
+extern void spa_altroot(spa_t *, char *, size_t);
+extern int spa_sync_pass(spa_t *spa);
+extern char *spa_name(spa_t *spa);
+extern uint64_t spa_guid(spa_t *spa);
+extern uint64_t spa_last_synced_txg(spa_t *spa);
+extern uint64_t spa_first_txg(spa_t *spa);
+extern uint64_t spa_version(spa_t *spa);
+extern int spa_state(spa_t *spa);
+extern uint64_t spa_freeze_txg(spa_t *spa);
+struct metaslab_class;
+extern struct metaslab_class *spa_metaslab_class_select(spa_t *spa);
+extern uint64_t spa_get_alloc(spa_t *spa);
+extern uint64_t spa_get_space(spa_t *spa);
+extern uint64_t spa_get_dspace(spa_t *spa);
+extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
+extern int spa_max_replication(spa_t *spa);
+extern int spa_busy(void);
+
+/* Miscellaneous support routines */
+extern int spa_rename(const char *oldname, const char *newname);
+extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
+extern char *spa_strdup(const char *);
+extern void spa_strfree(char *);
+extern uint64_t spa_get_random(uint64_t range);
+extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp);
+extern void spa_freeze(spa_t *spa);
+extern void spa_upgrade(spa_t *spa);
+extern void spa_evict_all(void);
+extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid);
+extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
+extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp);
+
+/* history logging */
+extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
+extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
+ char *his_buf);
+extern int spa_history_log(spa_t *spa, const char *his_buf,
+ uint64_t pool_create);
+
+/* error handling */
+struct zbookmark;
+struct zio;
+extern void spa_log_error(spa_t *spa, struct zio *zio);
+extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
+ struct zio *zio, uint64_t stateoroffset, uint64_t length);
+extern void zfs_post_ok(spa_t *spa, vdev_t *vd);
+extern uint64_t spa_get_errlog_size(spa_t *spa);
+extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
+extern void spa_errlog_rotate(spa_t *spa);
+extern void spa_errlog_drain(spa_t *spa);
+extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
+extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
+
+/* Initialization and termination */
+extern void spa_init(int flags);
+extern void spa_fini(void);
+
+/* properties */
+extern int spa_set_props(spa_t *spa, nvlist_t *nvp);
+extern int spa_get_props(spa_t *spa, nvlist_t **nvp);
+extern void spa_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
+extern boolean_t spa_has_bootfs(spa_t *spa);
+
+#ifdef ZFS_DEBUG
+#define dprintf_bp(bp, fmt, ...) do { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
+ sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \
+ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
+ kmem_free(__blkbuf, BP_SPRINTF_LEN); \
+ } \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define dprintf_bp(bp, fmt, ...)
+#endif
+
+extern int spa_mode; /* mode, e.g. FREAD | FWRITE */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPA_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
new file mode 100644
index 0000000..8c57123
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
@@ -0,0 +1,168 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPA_IMPL_H
+#define _SYS_SPA_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/metaslab.h>
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/uberblock_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/refcount.h>
+#include <sys/bplist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct spa_config_lock {
+ kmutex_t scl_lock;
+ refcount_t scl_count;
+ kthread_t *scl_writer;
+ kcondvar_t scl_cv;
+} spa_config_lock_t;
+
+typedef struct spa_error_entry {
+ zbookmark_t se_bookmark;
+ char *se_name;
+ avl_node_t se_avl;
+} spa_error_entry_t;
+
+typedef struct spa_history_phys {
+ uint64_t sh_pool_create_len; /* ending offset of zpool create */
+ uint64_t sh_phys_max_off; /* physical EOF */
+ uint64_t sh_bof; /* logical BOF */
+ uint64_t sh_eof; /* logical EOF */
+ uint64_t sh_records_lost; /* num of records overwritten */
+} spa_history_phys_t;
+
+typedef struct spa_props {
+ nvlist_t *spa_props_nvp;
+ list_node_t spa_list_node;
+} spa_props_t;
+
+struct spa {
+ /*
+ * Fields protected by spa_namespace_lock.
+ */
+ char *spa_name; /* pool name */
+ avl_node_t spa_avl; /* node in spa_namespace_avl */
+ nvlist_t *spa_config; /* last synced config */
+ nvlist_t *spa_config_syncing; /* currently syncing config */
+ uint64_t spa_config_txg; /* txg of last config change */
+ kmutex_t spa_config_cache_lock; /* for spa_config RW_READER */
+ int spa_sync_pass; /* iterate-to-convergence */
+ int spa_state; /* pool state */
+ int spa_inject_ref; /* injection references */
+ uint8_t spa_traverse_wanted; /* traverse lock wanted */
+ uint8_t spa_sync_on; /* sync threads are running */
+ spa_load_state_t spa_load_state; /* current load operation */
+ taskq_t *spa_zio_issue_taskq[ZIO_TYPES];
+ taskq_t *spa_zio_intr_taskq[ZIO_TYPES];
+ dsl_pool_t *spa_dsl_pool;
+ metaslab_class_t *spa_normal_class; /* normal data class */
+ uint64_t spa_first_txg; /* first txg after spa_open() */
+ uint64_t spa_final_txg; /* txg of export/destroy */
+ uint64_t spa_freeze_txg; /* freeze pool at this txg */
+ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
+ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
+ vdev_t *spa_root_vdev; /* top-level vdev container */
+ uint64_t spa_load_guid; /* initial guid for spa_load */
+ list_t spa_dirty_list; /* vdevs with dirty labels */
+ uint64_t spa_spares_object; /* MOS object for spare list */
+ nvlist_t *spa_sparelist; /* cached spare config */
+ vdev_t **spa_spares; /* available hot spares */
+ int spa_nspares; /* number of hot spares */
+ boolean_t spa_sync_spares; /* sync the spares list */
+ uint64_t spa_config_object; /* MOS object for pool config */
+ uint64_t spa_syncing_txg; /* txg currently syncing */
+ uint64_t spa_sync_bplist_obj; /* object for deferred frees */
+ bplist_t spa_sync_bplist; /* deferred-free bplist */
+ krwlock_t spa_traverse_lock; /* traverse vs. spa_sync() */
+ uberblock_t spa_ubsync; /* last synced uberblock */
+ uberblock_t spa_uberblock; /* current uberblock */
+ kmutex_t spa_scrub_lock; /* resilver/scrub lock */
+ kthread_t *spa_scrub_thread; /* scrub/resilver thread */
+ traverse_handle_t *spa_scrub_th; /* scrub traverse handle */
+ uint64_t spa_scrub_restart_txg; /* need to restart */
+ uint64_t spa_scrub_mintxg; /* min txg we'll scrub */
+ uint64_t spa_scrub_maxtxg; /* max txg we'll scrub */
+ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
+ uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */
+ uint64_t spa_scrub_errors; /* scrub I/O error count */
+ int spa_scrub_suspended; /* tell scrubber to suspend */
+ kcondvar_t spa_scrub_cv; /* scrub thread state change */
+ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
+ uint8_t spa_scrub_stop; /* tell scrubber to stop */
+ uint8_t spa_scrub_active; /* active or suspended? */
+ uint8_t spa_scrub_type; /* type of scrub we're doing */
+ uint8_t spa_scrub_finished; /* indicator to rotate logs */
+ kmutex_t spa_async_lock; /* protect async state */
+ kthread_t *spa_async_thread; /* thread doing async task */
+ int spa_async_suspended; /* async tasks suspended */
+ kcondvar_t spa_async_cv; /* wait for thread_exit() */
+ uint16_t spa_async_tasks; /* async task mask */
+ char *spa_root; /* alternate root directory */
+ kmutex_t spa_uberblock_lock; /* vdev_uberblock_load_done() */
+ uint64_t spa_ena; /* spa-wide ereport ENA */
+ boolean_t spa_last_open_failed; /* true if last open failed */
+ kmutex_t spa_errlog_lock; /* error log lock */
+ uint64_t spa_errlog_last; /* last error log object */
+ uint64_t spa_errlog_scrub; /* scrub error log object */
+ kmutex_t spa_errlist_lock; /* error list/ereport lock */
+ avl_tree_t spa_errlist_last; /* last error list */
+ avl_tree_t spa_errlist_scrub; /* scrub error list */
+ uint64_t spa_deflate; /* should we deflate? */
+ uint64_t spa_history; /* history object */
+ kmutex_t spa_history_lock; /* history lock */
+ vdev_t *spa_pending_vdev; /* pending vdev additions */
+ nvlist_t **spa_pending_spares; /* pending spare additions */
+ uint_t spa_pending_nspares; /* # pending spares */
+ kmutex_t spa_props_lock; /* property lock */
+ uint64_t spa_pool_props_object; /* object for properties */
+ uint64_t spa_bootfs; /* default boot filesystem */
+ /*
+ * spa_config_lock and spa_refcount must be the last elements
+ * because refcount_t changes size based on compilation options.
+ * In order for the MDB module to function correctly, the other
+ * fields must remain in the same location.
+ */
+ spa_config_lock_t spa_config_lock; /* configuration changes */
+ refcount_t spa_refcount; /* number of opens */
+};
+
+extern const char *spa_config_dir;
+extern kmutex_t spa_namespace_lock;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPA_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
new file mode 100644
index 0000000..db9daef
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
@@ -0,0 +1,162 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPACE_MAP_H
+#define _SYS_SPACE_MAP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/avl.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct space_map_ops space_map_ops_t;
+
+typedef struct space_map {
+ avl_tree_t sm_root; /* AVL tree of map segments */
+ uint64_t sm_space; /* sum of all segments in the map */
+ uint64_t sm_start; /* start of map */
+ uint64_t sm_size; /* size of map */
+ uint8_t sm_shift; /* unit shift */
+ uint8_t sm_pad[3]; /* unused */
+ uint8_t sm_loaded; /* map loaded? */
+ uint8_t sm_loading; /* map loading? */
+ kcondvar_t sm_load_cv; /* map load completion */
+ space_map_ops_t *sm_ops; /* space map block picker ops vector */
+ void *sm_ppd; /* picker-private data */
+ kmutex_t *sm_lock; /* pointer to lock that protects map */
+} space_map_t;
+
+typedef struct space_seg {
+ avl_node_t ss_node; /* AVL node */
+ uint64_t ss_start; /* starting offset of this segment */
+ uint64_t ss_end; /* ending offset (non-inclusive) */
+} space_seg_t;
+
+typedef struct space_map_obj {
+ uint64_t smo_object; /* on-disk space map object */
+ uint64_t smo_objsize; /* size of the object */
+ uint64_t smo_alloc; /* space allocated from the map */
+} space_map_obj_t;
+
+struct space_map_ops {
+ void (*smop_load)(space_map_t *sm);
+ void (*smop_unload)(space_map_t *sm);
+ uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size);
+ void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
+ void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
+};
+
+/*
+ * debug entry
+ *
+ * 1 3 10 50
+ * ,---+--------+------------+---------------------------------.
+ * | 1 | action | syncpass | txg (lower bits) |
+ * `---+--------+------------+---------------------------------'
+ * 63 62 60 59 50 49 0
+ *
+ *
+ *
+ * non-debug entry
+ *
+ * 1 47 1 15
+ * ,-----------------------------------------------------------.
+ * | 0 | offset (sm_shift units) | type | run |
+ * `-----------------------------------------------------------'
+ * 63 62 16 15 14 0
+ */
+
+/* All this stuff takes and returns bytes */
+#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1)
+#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15)
+#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
+#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
+#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47)
+#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47)
+#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1)
+#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1)
+
+#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3)
+#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3)
+
+#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10)
+#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10)
+
+#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50)
+#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50)
+
+#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
+
+#define SM_ALLOC 0x0
+#define SM_FREE 0x1
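+
+/*
+ * Example (illustrative only): building and decoding a non-debug entry
+ * with the macros above. 'sm', 'start' and 'size' are placeholders;
+ * offsets and runs are expressed in sm_shift units, and a single entry
+ * can describe at most SM_RUN_MAX (32768) units.
+ *
+ *	uint64_t entry = SM_OFFSET_ENCODE(start >> sm->sm_shift) |
+ *	    SM_TYPE_ENCODE(SM_ALLOC) |
+ *	    SM_RUN_ENCODE(size >> sm->sm_shift);
+ *
+ *	ASSERT(SM_DEBUG_DECODE(entry) == 0);
+ *	ASSERT(SM_TYPE_DECODE(entry) == SM_ALLOC);
+ *	ASSERT(SM_RUN_DECODE(entry) == size >> sm->sm_shift);
+ */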
+
+/*
+ * The data for a given space map can be kept on blocks of any size.
+ * Larger blocks entail fewer i/o operations, but they also cause the
+ * DMU to keep more data in-core, and also to waste more i/o bandwidth
+ * when only a few blocks have changed since the last transaction group.
+ * This could use a lot more research, but for now, set the freelist
+ * block size to 4k (2^12).
+ */
+#define SPACE_MAP_BLOCKSHIFT 12
+
+typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size);
+
+extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
+ uint8_t shift, kmutex_t *lp);
+extern void space_map_destroy(space_map_t *sm);
+extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
+extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_vacate(space_map_t *sm,
+ space_map_func_t *func, space_map_t *mdest);
+extern void space_map_walk(space_map_t *sm,
+ space_map_func_t *func, space_map_t *mdest);
+extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_union(space_map_t *smd, space_map_t *sms);
+
+extern void space_map_load_wait(space_map_t *sm);
+extern int space_map_load(space_map_t *sm, space_map_ops_t *ops,
+ uint8_t maptype, space_map_obj_t *smo, objset_t *os);
+extern void space_map_unload(space_map_t *sm);
+
+extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size);
+extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size);
+
+extern void space_map_sync(space_map_t *sm, uint8_t maptype,
+ space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx);
+extern void space_map_truncate(space_map_obj_t *smo,
+ objset_t *os, dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SPACE_MAP_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
new file mode 100644
index 0000000..dae129c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
@@ -0,0 +1,120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TXG_H
+#define _SYS_TXG_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */
+#define TXG_SIZE 4 /* next power of 2 */
+#define TXG_MASK (TXG_SIZE - 1) /* mask for size */
+#define TXG_INITIAL TXG_SIZE /* initial txg */
+#define TXG_IDX (txg & TXG_MASK)
+
+#define TXG_WAIT 1ULL
+#define TXG_NOWAIT 2ULL
+
+typedef struct tx_cpu tx_cpu_t;
+
+typedef struct txg_handle {
+ tx_cpu_t *th_cpu;
+ uint64_t th_txg;
+} txg_handle_t;
+
+typedef struct txg_node {
+ struct txg_node *tn_next[TXG_SIZE];
+ uint8_t tn_member[TXG_SIZE];
+} txg_node_t;
+
+typedef struct txg_list {
+ kmutex_t tl_lock;
+ size_t tl_offset;
+ txg_node_t *tl_head[TXG_SIZE];
+} txg_list_t;
+
+struct dsl_pool;
+
+extern void txg_init(struct dsl_pool *dp, uint64_t txg);
+extern void txg_fini(struct dsl_pool *dp);
+extern void txg_sync_start(struct dsl_pool *dp);
+extern void txg_sync_stop(struct dsl_pool *dp);
+extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp);
+extern void txg_rele_to_quiesce(txg_handle_t *txghp);
+extern void txg_rele_to_sync(txg_handle_t *txghp);
+extern void txg_suspend(struct dsl_pool *dp);
+extern void txg_resume(struct dsl_pool *dp);
+
+/*
+ * Wait until the given transaction group has finished syncing.
+ * Try to make this happen as soon as possible (eg. kick off any
+ * necessary syncs immediately). If txg==0, wait for the currently open
+ * txg to finish syncing.
+ */
+extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * Wait until the given transaction group, or one after it, is
+ * the open transaction group. Try to make this happen as soon
+ * as possible (eg. kick off any necessary syncs immediately).
+ * If txg == 0, wait for the next open txg.
+ */
+extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * Returns TRUE if we are "backed up" waiting for the syncing
+ * transaction to complete; otherwise returns FALSE.
+ */
+extern int txg_stalled(struct dsl_pool *dp);
+
+/*
+ * Per-txg object lists.
+ */
+
+#define TXG_CLEAN(txg) ((txg) - 1)
+
+extern void txg_list_create(txg_list_t *tl, size_t offset);
+extern void txg_list_destroy(txg_list_t *tl);
+extern int txg_list_empty(txg_list_t *tl, uint64_t txg);
+extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
+extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
+extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg);
+extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
+extern void *txg_list_head(txg_list_t *tl, uint64_t txg);
+extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg);
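+
+/*
+ * Example (illustrative only): the usual walk over a per-txg list.
+ * 'tl', 'foo_t' and process() are placeholders; the list must have
+ * been created with the offset of the element's txg_node_t, and the
+ * TXG_MASK indexing is handled inside the list routines.
+ *
+ *	txg_list_create(&tl, offsetof(foo_t, foo_txg_node));
+ *	...
+ *	foo_t *f;
+ *	for (f = txg_list_head(&tl, txg); f != NULL;
+ *	    f = txg_list_next(&tl, f, txg))
+ *		process(f);
+ */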
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TXG_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
new file mode 100644
index 0000000..45a138a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TXG_IMPL_H
+#define _SYS_TXG_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct tx_cpu {
+ kmutex_t tc_lock;
+ kcondvar_t tc_cv[TXG_SIZE];
+ uint64_t tc_count[TXG_SIZE];
+ char tc_pad[16];
+};
+
+typedef struct tx_state {
+ tx_cpu_t *tx_cpu; /* protects right to enter txg */
+ kmutex_t tx_sync_lock; /* protects tx_state_t */
+ krwlock_t tx_suspend;
+ uint64_t tx_open_txg; /* currently open txg id */
+ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
+ uint64_t tx_syncing_txg; /* currently syncing txg id */
+ uint64_t tx_synced_txg; /* last synced txg id */
+
+ uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */
+ uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */
+
+ kcondvar_t tx_sync_more_cv;
+ kcondvar_t tx_sync_done_cv;
+ kcondvar_t tx_quiesce_more_cv;
+ kcondvar_t tx_quiesce_done_cv;
+ kcondvar_t tx_timeout_exit_cv;
+ kcondvar_t tx_exit_cv; /* wait for all threads to exit */
+
+ uint8_t tx_threads; /* number of threads */
+ uint8_t tx_exiting; /* set when we're exiting */
+
+ kthread_t *tx_sync_thread;
+ kthread_t *tx_quiesce_thread;
+ kthread_t *tx_timelimit_thread;
+} tx_state_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_TXG_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
new file mode 100644
index 0000000..93d936a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
@@ -0,0 +1,50 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UBERBLOCK_H
+#define _SYS_UBERBLOCK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct uberblock uberblock_t;
+
+extern int uberblock_verify(uberblock_t *ub);
+extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UBERBLOCK_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
new file mode 100644
index 0000000..ab0f2dc
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UBERBLOCK_IMPL_H
+#define _SYS_UBERBLOCK_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/uberblock.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The uberblock version is incremented whenever an incompatible on-disk
+ * format change is made to the SPA, DMU, or ZAP.
+ *
+ * Note: the first two fields should never be moved. When a storage pool
+ * is opened, the uberblock must be read off the disk before the version
+ * can be checked. If the ub_version field is moved, we may not detect
+ * version mismatch. If the ub_magic field is moved, applications that
+ * expect the magic number in the first word won't work.
+ */
+#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
+#define UBERBLOCK_SHIFT 10 /* up to 1K */
+
+struct uberblock {
+ uint64_t ub_magic; /* UBERBLOCK_MAGIC */
+ uint64_t ub_version; /* ZFS_VERSION */
+ uint64_t ub_txg; /* txg of last sync */
+ uint64_t ub_guid_sum; /* sum of all vdev guids */
+ uint64_t ub_timestamp; /* UTC time of last sync */
+ blkptr_t ub_rootbp; /* MOS objset_phys_t */
+};
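+
+/*
+ * Example (illustrative only): because ub_magic is pinned to the first
+ * word, a reader can check validity and byte order before interpreting
+ * anything else; a byteswapped magic means the pool was last written on
+ * a machine of the opposite endianness.
+ *
+ *	if (ub->ub_magic == UBERBLOCK_MAGIC)
+ *		...			(native byte order)
+ *	else if (ub->ub_magic == BSWAP_64(UBERBLOCK_MAGIC))
+ *		...			(byteswapped uberblock)
+ *	else
+ *		...			(not a valid uberblock)
+ */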
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UBERBLOCK_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
new file mode 100644
index 0000000..c8c177e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UNIQUE_H
+#define _SYS_UNIQUE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* The number of significant bits in each unique value. */
+#define UNIQUE_BITS 56
+
+void unique_init(void);
+
+/* Return a new unique value. */
+uint64_t unique_create(void);
+
+/* Return a unique value, which equals the one passed in if possible. */
+uint64_t unique_insert(uint64_t value);
+
+/* Indicate that this value no longer needs to be uniquified against. */
+void unique_remove(uint64_t value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_UNIQUE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
new file mode 100644
index 0000000..3120811
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
@@ -0,0 +1,132 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_H
+#define _SYS_VDEV_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+#include <sys/space_map.h>
+#include <sys/fs/zfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern boolean_t zfs_nocacheflush;
+
+/*
+ * Fault injection modes.
+ */
+#define VDEV_FAULT_NONE 0
+#define VDEV_FAULT_RANDOM 1
+#define VDEV_FAULT_COUNT 2
+
+extern int vdev_open(vdev_t *);
+extern int vdev_validate(vdev_t *);
+extern void vdev_close(vdev_t *);
+extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
+extern void vdev_init(vdev_t *, uint64_t txg);
+extern void vdev_reopen(vdev_t *);
+extern int vdev_validate_spare(vdev_t *);
+
+extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
+extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
+extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size);
+extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size);
+extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
+ int scrub_done);
+
+extern const char *vdev_description(vdev_t *vd);
+
+extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
+extern void vdev_metaslab_fini(vdev_t *vd);
+
+extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
+extern void vdev_stat_update(zio_t *zio);
+extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
+ boolean_t complete);
+extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
+extern void vdev_propagate_state(vdev_t *vd);
+extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
+ vdev_aux_t aux);
+
+extern void vdev_space_update(vdev_t *vd, int64_t space_delta,
+ int64_t alloc_delta);
+
+extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
+
+extern void vdev_io_start(zio_t *zio);
+extern void vdev_io_done(zio_t *zio);
+
+extern int vdev_online(spa_t *spa, uint64_t guid);
+extern int vdev_offline(spa_t *spa, uint64_t guid, int istmp);
+extern void vdev_clear(spa_t *spa, vdev_t *vd);
+
+extern int vdev_error_inject(vdev_t *vd, zio_t *zio);
+extern int vdev_is_dead(vdev_t *vd);
+
+extern void vdev_cache_init(vdev_t *vd);
+extern void vdev_cache_fini(vdev_t *vd);
+extern int vdev_cache_read(zio_t *zio);
+extern void vdev_cache_write(zio_t *zio);
+
+extern void vdev_queue_init(vdev_t *vd);
+extern void vdev_queue_fini(vdev_t *vd);
+extern zio_t *vdev_queue_io(zio_t *zio);
+extern void vdev_queue_io_done(zio_t *zio);
+
+extern void vdev_config_dirty(vdev_t *vd);
+extern void vdev_config_clean(vdev_t *vd);
+extern int vdev_config_sync(vdev_t *vd, uint64_t txg);
+
+extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
+ boolean_t getstats, boolean_t isspare);
+
+/*
+ * Label routines
+ */
+struct uberblock;
+extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
+extern nvlist_t *vdev_label_read_config(vdev_t *vd);
+extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub);
+
+typedef enum {
+ VDEV_LABEL_CREATE, /* create/add a new device */
+ VDEV_LABEL_REPLACE, /* replace an existing device */
+ VDEV_LABEL_SPARE, /* add a new hot spare */
+ VDEV_LABEL_REMOVE /* remove an existing device */
+} vdev_labeltype_t;
+
+extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
new file mode 100644
index 0000000..95536a7
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_DISK_H
+#define _SYS_VDEV_DISK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/vdev.h>
+#ifdef _KERNEL
+#include <sys/sunldi.h>
+#include <sys/sunddi.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_disk {
+ ddi_devid_t vd_devid;
+ char *vd_minor;
+ ldi_handle_t vd_lh;
+} vdev_disk_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_DISK_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
new file mode 100644
index 0000000..cd49673
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_FILE_H
+#define _SYS_VDEV_FILE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/vdev.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_file {
+ vnode_t *vf_vnode;
+} vdev_file_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_FILE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
new file mode 100644
index 0000000..aba7567
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
@@ -0,0 +1,298 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_IMPL_H
+#define _SYS_VDEV_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/avl.h>
+#include <sys/dmu.h>
+#include <sys/metaslab.h>
+#include <sys/nvpair.h>
+#include <sys/space_map.h>
+#include <sys/vdev.h>
+#include <sys/dkio.h>
+#include <sys/uberblock_impl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Virtual device descriptors.
+ *
+ * All storage pool operations go through the virtual device framework,
+ * which provides data replication and I/O scheduling.
+ */
+
+/*
+ * Forward declarations that lots of things need.
+ */
+typedef struct vdev_queue vdev_queue_t;
+typedef struct vdev_cache vdev_cache_t;
+typedef struct vdev_cache_entry vdev_cache_entry_t;
+
+/*
+ * Virtual device operations
+ */
+typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift);
+typedef void vdev_close_func_t(vdev_t *vd);
+typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
+typedef void vdev_io_start_func_t(zio_t *zio);
+typedef void vdev_io_done_func_t(zio_t *zio);
+typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
+
+typedef struct vdev_ops {
+ vdev_open_func_t *vdev_op_open;
+ vdev_close_func_t *vdev_op_close;
+ vdev_asize_func_t *vdev_op_asize;
+ vdev_io_start_func_t *vdev_op_io_start;
+ vdev_io_done_func_t *vdev_op_io_done;
+ vdev_state_change_func_t *vdev_op_state_change;
+ char vdev_op_type[16];
+ boolean_t vdev_op_leaf;
+} vdev_ops_t;
+
+/*
+ * Virtual device properties
+ */
+struct vdev_cache_entry {
+ char *ve_data;
+ uint64_t ve_offset;
+ uint64_t ve_lastused;
+ avl_node_t ve_offset_node;
+ avl_node_t ve_lastused_node;
+ uint32_t ve_hits;
+ uint16_t ve_missed_update;
+ zio_t *ve_fill_io;
+};
+
+struct vdev_cache {
+ avl_tree_t vc_offset_tree;
+ avl_tree_t vc_lastused_tree;
+ kmutex_t vc_lock;
+};
+
+struct vdev_queue {
+ avl_tree_t vq_deadline_tree;
+ avl_tree_t vq_read_tree;
+ avl_tree_t vq_write_tree;
+ avl_tree_t vq_pending_tree;
+ kmutex_t vq_lock;
+};
+
+/*
+ * Virtual device descriptor
+ */
+struct vdev {
+ /*
+ * Common to all vdev types.
+ */
+ uint64_t vdev_id; /* child number in vdev parent */
+ uint64_t vdev_guid; /* unique ID for this vdev */
+ uint64_t vdev_guid_sum; /* self guid + all child guids */
+ uint64_t vdev_asize; /* allocatable device capacity */
+ uint64_t vdev_ashift; /* block alignment shift */
+ uint64_t vdev_state; /* see VDEV_STATE_* #defines */
+ uint64_t vdev_prevstate; /* used when reopening a vdev */
+ vdev_ops_t *vdev_ops; /* vdev operations */
+ spa_t *vdev_spa; /* spa for this vdev */
+ void *vdev_tsd; /* type-specific data */
+ vdev_t *vdev_top; /* top-level vdev */
+ vdev_t *vdev_parent; /* parent vdev */
+ vdev_t **vdev_child; /* array of children */
+ uint64_t vdev_children; /* number of children */
+ space_map_t vdev_dtl_map; /* dirty time log in-core state */
+ space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */
+ vdev_stat_t vdev_stat; /* virtual device statistics */
+
+ /*
+ * Top-level vdev state.
+ */
+ uint64_t vdev_ms_array; /* metaslab array object */
+ uint64_t vdev_ms_shift; /* metaslab size shift */
+ uint64_t vdev_ms_count; /* number of metaslabs */
+ metaslab_group_t *vdev_mg; /* metaslab group */
+ metaslab_t **vdev_ms; /* metaslab array */
+ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
+ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
+ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
+ uint8_t vdev_reopen_wanted; /* async reopen wanted? */
+ list_node_t vdev_dirty_node; /* config dirty list */
+ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
+
+ /*
+ * Leaf vdev state.
+ */
+ uint64_t vdev_psize; /* physical device capacity */
+ space_map_obj_t vdev_dtl; /* dirty time log on-disk state */
+ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
+ uint64_t vdev_wholedisk; /* true if this is a whole disk */
+ uint64_t vdev_offline; /* device taken offline? */
+ uint64_t vdev_nparity; /* number of parity devices for raidz */
+ char *vdev_path; /* vdev path (if any) */
+ char *vdev_devid; /* vdev devid (if any) */
+ uint64_t vdev_fault_arg; /* fault injection parameter */
+ int vdev_fault_mask; /* zio types to fault */
+ uint8_t vdev_fault_mode; /* fault injection mode */
+ uint8_t vdev_cache_active; /* vdev_cache and vdev_queue */
+ uint8_t vdev_tmpoffline; /* device taken offline temporarily? */
+ uint8_t vdev_detached; /* device detached? */
+ uint64_t vdev_isspare; /* was a hot spare */
+ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
+ vdev_cache_t vdev_cache; /* physical block cache */
+ uint64_t vdev_not_present; /* not present during import */
+ hrtime_t vdev_last_try; /* last reopen time */
+ boolean_t vdev_nowritecache; /* true if flushwritecache failed */
+
+ /*
+ * For DTrace to work in userland (libzpool) context, these fields must
+ * remain at the end of the structure. DTrace will use the kernel's
+ * CTF definition for 'struct vdev', and since the size of a kmutex_t is
+ * larger in userland, the offsets for the rest fields would be
+ * incorrect.
+ */
+ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,scrub} */
+ kmutex_t vdev_stat_lock; /* vdev_stat */
+};
+
+#define VDEV_SKIP_SIZE (8 << 10)
+#define VDEV_BOOT_HEADER_SIZE (8 << 10)
+#define VDEV_PHYS_SIZE (112 << 10)
+#define VDEV_UBERBLOCK_RING (128 << 10)
+
+#define VDEV_UBERBLOCK_SHIFT(vd) \
+ MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)
+#define VDEV_UBERBLOCK_COUNT(vd) \
+ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
+#define VDEV_UBERBLOCK_OFFSET(vd, n) \
+ offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
+#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
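+
+/*
+ * Worked example: for a leaf whose top-level vdev has ashift 9
+ * (512-byte sectors), VDEV_UBERBLOCK_SHIFT() is MAX(9, 10) = 10, so
+ * each slot is 1K and the 128K ring holds 128 uberblocks; with
+ * ashift 12 (4K sectors), each slot is 4K and the ring holds 32.
+ */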
+
+/* ZFS boot block */
+#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL
+#define VDEV_BOOT_VERSION 1 /* version number */
+
+typedef struct vdev_boot_header {
+ uint64_t vb_magic; /* VDEV_BOOT_MAGIC */
+ uint64_t vb_version; /* VDEV_BOOT_VERSION */
+ uint64_t vb_offset; /* start offset (bytes) */
+ uint64_t vb_size; /* size (bytes) */
+ char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)];
+} vdev_boot_header_t;
+
+typedef struct vdev_phys {
+ char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
+ zio_block_tail_t vp_zbt;
+} vdev_phys_t;
+
+typedef struct vdev_label {
+ char vl_pad[VDEV_SKIP_SIZE]; /* 8K */
+ vdev_boot_header_t vl_boot_header; /* 8K */
+ vdev_phys_t vl_vdev_phys; /* 112K */
+ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
+} vdev_label_t; /* 256K total */
+
+/*
+ * vdev_dirty() flags
+ */
+#define VDD_METASLAB 0x01
+#define VDD_DTL 0x02
+
+/*
+ * Size and offset of embedded boot loader region on each label.
+ * The total size of the first two labels plus the boot area is 4MB.
+ */
+#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
+#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */
+
+/*
+ * Size of label regions at the start and end of each leaf device.
+ */
+#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
+#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
+#define VDEV_LABELS 4
+
+#define VDEV_ALLOC_LOAD 0
+#define VDEV_ALLOC_ADD 1
+#define VDEV_ALLOC_SPARE 2
+
+/*
+ * Allocate or free a vdev
+ */
+extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
+ vdev_t *parent, uint_t id, int alloctype);
+extern void vdev_free(vdev_t *vd);
+
+/*
+ * Add or remove children and parents
+ */
+extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
+extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
+extern void vdev_compact_children(vdev_t *pvd);
+extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
+extern void vdev_remove_parent(vdev_t *cvd);
+
+/*
+ * vdev sync load and sync
+ */
+extern void vdev_load(vdev_t *vd);
+extern void vdev_sync(vdev_t *vd, uint64_t txg);
+extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
+extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
+
+/*
+ * Available vdev types.
+ */
+extern vdev_ops_t vdev_root_ops;
+extern vdev_ops_t vdev_mirror_ops;
+extern vdev_ops_t vdev_replacing_ops;
+extern vdev_ops_t vdev_raidz_ops;
+#ifdef _KERNEL
+extern vdev_ops_t vdev_geom_ops;
+#else
+extern vdev_ops_t vdev_disk_ops;
+extern vdev_ops_t vdev_file_ops;
+#endif
+extern vdev_ops_t vdev_missing_ops;
+extern vdev_ops_t vdev_spare_ops;
+
+/*
+ * Common size functions
+ */
+extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
+extern uint64_t vdev_get_rsize(vdev_t *vd);
+
+/*
+ * zdb uses this tunable, so it must be declared here to make lint happy.
+ */
+extern int zfs_vdev_cache_size;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
new file mode 100644
index 0000000..f89d938
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
@@ -0,0 +1,359 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZAP_H
+#define _SYS_ZAP_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZAP - ZFS Attribute Processor
+ *
+ * The ZAP is a module which sits on top of the DMU (Data Management
+ * Unit) and implements a higher-level storage primitive using DMU
+ * objects. Its primary consumer is the ZPL (ZFS Posix Layer).
+ *
+ * A "zapobj" is a DMU object which the ZAP uses to stores attributes.
+ * Users should use only zap routines to access a zapobj - they should
+ * not access the DMU object directly using DMU routines.
+ *
+ * The attributes stored in a zapobj are name-value pairs. The name is
+ * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including
+ * terminating NULL). The value is an array of integers, which may be
+ * 1, 2, 4, or 8 bytes long. The total space used by the array (number
+ * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes.
+ * Note that an 8-byte integer value can be used to store the location
+ * (object number) of another dmu object (which may be itself a zapobj).
+ * Note that you can use a zero-length attribute to store a single bit
+ * of information - the attribute is present or not.
+ *
+ * The ZAP routines are thread-safe. However, you must observe the
+ * DMU's restriction that a transaction may not be operated on
+ * concurrently.
+ *
+ * Any of the routines that return an int may return an I/O error (EIO
+ * or ECHECKSUM).
+ *
+ *
+ * Implementation / Performance Notes:
+ *
+ * The ZAP is intended to operate most efficiently on attributes with
+ * short (49 bytes or less) names and single 8-byte values, for which
+ * the microzap will be used. The ZAP should be efficient enough so
+ * that the user does not need to cache these attributes.
+ *
+ * The ZAP's locking scheme makes its routines thread-safe. Operations
+ * on different zapobjs will be processed concurrently. Operations on
+ * the same zapobj which only read data will be processed concurrently.
+ * Operations on the same zapobj which modify data will be processed
+ * concurrently when there are many attributes in the zapobj (because
+ * the ZAP uses per-block locking - more than 128 * (number of cpus)
+ * small attributes will suffice).
+ */
+
+/*
+ * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C
+ * strings) for the names of attributes, rather than a byte string
+ * bounded by an explicit length. If some day we want to support names
+ * in character sets which have embedded zeros (eg. UTF-16, UTF-32),
+ * we'll have to add routines for using length-bounded strings.
+ */
+
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZAP_MAXNAMELEN 256
+#define ZAP_MAXVALUELEN 1024
+
+/*
+ * Create a new zapobj with no attributes and return its object number.
+ */
+uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+
+/*
+ * Create a new zapobj with no attributes from the given (unallocated)
+ * object number.
+ */
+int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+
+/*
+ * The zapobj passed in must be a valid ZAP object for all of the
+ * following routines.
+ */
+
+/*
+ * Destroy this zapobj and all its attributes.
+ *
+ * Frees the object number using dmu_object_free.
+ */
+int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
+
+/*
+ * Manipulate attributes.
+ *
+ * 'integer_size' is in bytes, and must be 1, 2, 4, or 8.
+ */
+
+/*
+ * Retrieve the contents of the attribute with the given name.
+ *
+ * If the requested attribute does not exist, the call will fail and
+ * return ENOENT.
+ *
+ * If 'integer_size' is smaller than the attribute's integer size, the
+ * call will fail and return EINVAL.
+ *
+ * If 'integer_size' is equal to or larger than the attribute's integer
+ * size, the call will succeed and return 0. When converting to a
+ * larger integer size, the integers will be treated as unsigned (ie. no
+ * sign-extension will be performed).
+ *
+ * 'num_integers' is the length (in integers) of 'buf'.
+ *
+ * If the attribute is longer than the buffer, as many integers as will
+ * fit will be transferred to 'buf'. If the entire attribute was not
+ * transferred, the call will return EOVERFLOW.
+ */
+int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf);
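+
+/*
+ * Example (illustrative only; 'os', 'zapobj' and "myattr" are
+ * placeholders): fetching a single 8-byte integer attribute and
+ * separating the documented failure modes.
+ *
+ *	uint64_t val;
+ *	int err = zap_lookup(os, zapobj, "myattr",
+ *	    sizeof (uint64_t), 1, &val);
+ *
+ *	err == 0:		val holds the attribute's value
+ *	err == ENOENT:		no attribute by that name
+ *	err == EOVERFLOW:	attribute has more than one integer
+ */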
+
+/*
+ * Create an attribute with the given name and value.
+ *
+ * If an attribute with the given name already exists, the call will
+ * fail and return EEXIST.
+ */
+int zap_add(objset_t *ds, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
+
+/*
+ * Set the attribute with the given name to the given value. If an
+ * attribute with the given name does not exist, it will be created. If
+ * an attribute with the given name already exists, the previous value
+ * will be overwritten. The integer_size may be different from the
+ * existing attribute's integer size, in which case the attribute's
+ * integer size will be updated to the new value.
+ */
+int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+
+/*
+ * Get the length (in integers) and the integer size of the specified
+ * attribute.
+ *
+ * If the requested attribute does not exist, the call will fail and
+ * return ENOENT.
+ */
+int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers);
+
+/*
+ * Remove the specified attribute.
+ *
+ * If the specified attribute does not exist, the call will fail and
+ * return ENOENT.
+ */
+int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
+
+/*
+ * Returns (in *count) the number of attributes in the specified zap
+ * object.
+ */
+int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
+
+
+/*
+ * Returns (in name) the name of the entry whose value
+ * (za_first_integer) equals the given value, or ENOENT if no such
+ * entry exists. The buffer pointed to by name must be at least
+ * 256 bytes long.
+ */
+int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name);
+
+struct zap;
+struct zap_leaf;
+typedef struct zap_cursor {
+ /* This structure is opaque! */
+ objset_t *zc_objset;
+ struct zap *zc_zap;
+ struct zap_leaf *zc_leaf;
+ uint64_t zc_zapobj;
+ uint64_t zc_hash;
+ uint32_t zc_cd;
+} zap_cursor_t;
+
+typedef struct {
+ int za_integer_length;
+ uint64_t za_num_integers;
+ uint64_t za_first_integer; /* no sign extension for <8byte ints */
+ char za_name[MAXNAMELEN];
+} zap_attribute_t;
+
+/*
+ * The interface for listing all the attributes of a zapobj can be
+ * thought of as cursor moving down a list of the attributes one by
+ * one. The cookie returned by the zap_cursor_serialize routine is
+ * persistent across system calls (and across reboot, even).
+ */
+
+/*
+ * Initialize a zap cursor, pointing to the "first" attribute of the
+ * zapobj. You must _fini the cursor when you are done with it.
+ */
+void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
+void zap_cursor_fini(zap_cursor_t *zc);
+
+/*
+ * Get the attribute currently pointed to by the cursor. Returns
+ * ENOENT if at the end of the attributes.
+ */
+int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za);
+
+/*
+ * Advance the cursor to the next attribute.
+ */
+void zap_cursor_advance(zap_cursor_t *zc);
+
+/*
+ * Get a persistent cookie pointing to the current position of the zap
+ * cursor. The low 4 bits in the cookie are always zero, and thus can
+ * be used to differentiate a serialized cookie from a different type
+ * of value. The cookie will be less than 2^32 as long as there are
+ * fewer than 2^22 (4.2 million) entries in the zap object.
+ */
+uint64_t zap_cursor_serialize(zap_cursor_t *zc);
+
+/*
+ * Initialize a zap cursor pointing to the position recorded by
+ * zap_cursor_serialize (in the "serialized" argument). You can also
+ * use a "serialized" argument of 0 to start at the beginning of the
+ * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to
+ * zap_cursor_init(...).)
+ */
+void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds,
+ uint64_t zapobj, uint64_t serialized);
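+
+/*
+ * Example (illustrative only; 'os' and 'zapobj' are placeholders): the
+ * canonical loop over every attribute of a zapobj.
+ *
+ *	zap_cursor_t zc;
+ *	zap_attribute_t za;
+ *
+ *	for (zap_cursor_init(&zc, os, zapobj);
+ *	    zap_cursor_retrieve(&zc, &za) == 0;
+ *	    zap_cursor_advance(&zc))
+ *		process(za.za_name, za.za_first_integer);
+ *	zap_cursor_fini(&zc);
+ */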
+
+
+#define ZAP_HISTOGRAM_SIZE 10
+
+typedef struct zap_stats {
+ /*
+ * Size of the pointer table (in number of entries).
+ * This is always a power of 2, or zero if it's a microzap.
+ * In general, it should be considerably greater than zs_num_leafs.
+ */
+ uint64_t zs_ptrtbl_len;
+
+ uint64_t zs_blocksize; /* size of zap blocks */
+
+ /*
+ * The number of blocks used. Note that some blocks may be
+ * wasted because old ptrtbl's and large name/value blocks are
+ * not reused. (Although their space is reclaimed, we don't
+ * reuse those offsets in the object.)
+ */
+ uint64_t zs_num_blocks;
+
+ /*
+ * Pointer table values from zap_ptrtbl in the zap_phys_t
+ */
+ uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */
+ uint64_t zs_ptrtbl_blks_copied; /* number of source blocks copied */
+ uint64_t zs_ptrtbl_zt_blk; /* starting block number */
+ uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */
+ uint64_t zs_ptrtbl_zt_shift; /* bits to index it */
+
+ /*
+ * Values of the other members of the zap_phys_t
+ */
+ uint64_t zs_block_type; /* ZBT_HEADER */
+ uint64_t zs_magic; /* ZAP_MAGIC */
+ uint64_t zs_num_leafs; /* The number of leaf blocks */
+ uint64_t zs_num_entries; /* The number of zap entries */
+ uint64_t zs_salt; /* salt to stir into hash function */
+
+ /*
+ * Histograms. For all histograms, the last index
+ * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater
+ * than what can be represented. For example
+ * zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number
+ * of leafs with more than 45 entries.
+ */
+
+ /*
+ * zs_leafs_with_2n_pointers[n] is the number of leafs with
+ * 2^n pointers to it.
+ */
+ uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_blocks_with_n5_entries[n] is the number of leafs with
+ * [n*5, (n+1)*5) entries. In the current implementation, there
+ * can be at most 55 entries in any block, but there may be
+ * fewer if the name or value is large, or the block is not
+ * completely full.
+ */
+ uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_blocks_n_tenths_full[n] is the number of leafs whose
+ * fullness is in the range [n/10, (n+1)/10).
+ */
+ uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_entries_using_n_chunks[n] is the number of entries which
+ * consume n 24-byte chunks. (Note, large names/values only use
+ * one chunk, but contribute to zs_num_blocks_large.)
+ */
+ uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE];
+
+ /*
+ * zs_buckets_with_n_entries[n] is the number of buckets (each
+ * leaf has 64 buckets) with n entries.
+ * zs_buckets_with_n_entries[1] should be very close to
+ * zs_num_entries.
+ */
+ uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE];
+} zap_stats_t;
+
+/*
+ * Get statistics about a ZAP object. Note: you need to be aware of the
+ * internal implementation of the ZAP to correctly interpret some of the
+ * statistics. This interface shouldn't be relied on unless you really
+ * know what you're doing.
+ */
+int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
new file mode 100644
index 0000000..4e43f4a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
@@ -0,0 +1,204 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZAP_IMPL_H
+#define _SYS_ZAP_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int fzap_default_block_shift;
+
+#define ZAP_MAGIC 0x2F52AB2ABULL
+
+#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift)
+
+#define ZAP_MAXCD (uint32_t)(-1)
+#define ZAP_HASHBITS 28
+#define MZAP_ENT_LEN 64
+#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
+#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
+#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
+
+typedef struct mzap_ent_phys {
+ uint64_t mze_value;
+ uint32_t mze_cd;
+ uint16_t mze_pad; /* in case we want to chain them someday */
+ char mze_name[MZAP_NAME_LEN];
+} mzap_ent_phys_t;
+
+typedef struct mzap_phys {
+ uint64_t mz_block_type; /* ZBT_MICRO */
+ uint64_t mz_salt;
+ uint64_t mz_pad[6];
+ mzap_ent_phys_t mz_chunk[1];
+ /* actually variable size depending on block size */
+} mzap_phys_t;
+
+typedef struct mzap_ent {
+ avl_node_t mze_node;
+ int mze_chunkid;
+ uint64_t mze_hash;
+ mzap_ent_phys_t mze_phys;
+} mzap_ent_t;
+
+
+/*
+ * The (fat) zap is stored in one object. It is an array of
+ * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
+ *
+ * ptrtbl fits in first block:
+ * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
+ *
+ * ptrtbl too big for first block:
+ * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
+ *
+ */
+
+struct dmu_buf;
+struct zap_leaf;
+
+#define ZBT_LEAF ((1ULL << 63) + 0)
+#define ZBT_HEADER ((1ULL << 63) + 1)
+#define ZBT_MICRO ((1ULL << 63) + 3)
+/* any other values are ptrtbl blocks */
+
+/*
+ * the embedded pointer table takes up half a block:
+ * block size / entry size (2^3) / 2
+ */
+#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1)
+
+/*
+ * The embedded pointer table starts half-way through the block. Since
+ * the pointer table itself is half the block, it starts at (64-bit)
+ * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
+ */
+#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
+ ((uint64_t *)(zap)->zap_f.zap_phys) \
+ [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
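+
+/*
+ * Worked example: with 16K fat zap blocks (FZAP_BLOCK_SHIFT == 14),
+ * ZAP_EMBEDDED_PTRTBL_SHIFT() is 14 - 3 - 1 = 10, i.e. 1024 embedded
+ * pointers; at 8 bytes each they fill exactly the second half (8K)
+ * of the block.
+ */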
+
+/*
+ * TAKE NOTE:
+ * If zap_phys_t is modified, zap_byteswap() must be modified.
+ */
+typedef struct zap_phys {
+ uint64_t zap_block_type; /* ZBT_HEADER */
+ uint64_t zap_magic; /* ZAP_MAGIC */
+
+ struct zap_table_phys {
+ uint64_t zt_blk; /* starting block number */
+ uint64_t zt_numblks; /* number of blocks */
+ uint64_t zt_shift; /* bits to index it */
+ uint64_t zt_nextblk; /* next (larger) copy start block */
+ uint64_t zt_blks_copied; /* number of source blocks copied */
+ } zap_ptrtbl;
+
+ uint64_t zap_freeblk; /* the next free block */
+ uint64_t zap_num_leafs; /* number of leafs */
+ uint64_t zap_num_entries; /* number of entries */
+ uint64_t zap_salt; /* salt to stir into hash function */
+ /*
+ * This structure is followed by padding, and then the embedded
+ * pointer table. The embedded pointer table takes up second
+ * half of the block. It is accessed using the
+ * ZAP_EMBEDDED_PTRTBL_ENT() macro.
+ */
+} zap_phys_t;
+
+typedef struct zap_table_phys zap_table_phys_t;
+
+typedef struct zap {
+ objset_t *zap_objset;
+ uint64_t zap_object;
+ struct dmu_buf *zap_dbuf;
+ krwlock_t zap_rwlock;
+ int zap_ismicro;
+ uint64_t zap_salt;
+ union {
+ struct {
+ zap_phys_t *zap_phys;
+
+ /*
+ * zap_num_entries_mtx protects
+ * zap_num_entries
+ */
+ kmutex_t zap_num_entries_mtx;
+ int zap_block_shift;
+ } zap_fat;
+ struct {
+ mzap_phys_t *zap_phys;
+ int16_t zap_num_entries;
+ int16_t zap_num_chunks;
+ int16_t zap_alloc_next;
+ avl_tree_t zap_avl;
+ } zap_micro;
+ } zap_u;
+} zap_t;
+
+#define zap_f zap_u.zap_fat
+#define zap_m zap_u.zap_micro
+
+uint64_t zap_hash(zap_t *zap, const char *name);
+int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+ krw_t lti, int fatreader, zap_t **zapp);
+void zap_unlockdir(zap_t *zap);
+void zap_evict(dmu_buf_t *db, void *vmzap);
+
+#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
+
+void fzap_byteswap(void *buf, size_t size);
+int fzap_count(zap_t *zap, uint64_t *count);
+int fzap_lookup(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf);
+int fzap_add(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
+int fzap_update(zap_t *zap, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+int fzap_length(zap_t *zap, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers);
+int fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx);
+int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
+void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
+void zap_put_leaf(struct zap_leaf *l);
+
+int fzap_add_cd(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, uint32_t cd, dmu_tx_t *tx);
+void fzap_upgrade(zap_t *zap, dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
new file mode 100644
index 0000000..147fb72
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
@@ -0,0 +1,234 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZAP_LEAF_H
+#define _SYS_ZAP_LEAF_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct zap;
+
+#define ZAP_LEAF_MAGIC 0x2AB1EAF
+
+/* chunk size = 24 bytes */
+#define ZAP_LEAF_CHUNKSIZE 24
+
+/*
+ * The amount of space available for chunks is:
+ * block size (1<<l->l_bs) - hash entry size (2) * number of hash
+ * entries - header space (2*chunksize)
+ */
+#define ZAP_LEAF_NUMCHUNKS(l) \
+ (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
+ ZAP_LEAF_CHUNKSIZE - 2)
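+
+/*
+ * Worked example: for a 16K leaf (l_bs == 14) the hash table has
+ * 1 << (14 - 5) = 512 two-byte entries (1K), leaving
+ * (16384 - 1024) / 24 - 2 = 638 chunks for entries and arrays.
+ */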
+
+/*
+ * The amount of space within the chunk available for the array is:
+ * chunk size - space for type (1) - space for next pointer (2)
+ */
+#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)
+
+#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \
+ (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
+
+/*
+ * Low water mark: when there are only this many chunks free, start
+ * growing the ptrtbl. Ideally, this should be larger than a
+ * "reasonably-sized" entry. 20 chunks is more than enough for the
+ * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value),
+ * while still being only around 3% for 16k blocks.
+ */
+#define ZAP_LEAF_LOW_WATER (20)
+
+/*
+ * The leaf hash table has block size / 2^5 (32) number of entries,
+ * which should be more than enough for the maximum number of entries,
+ * which is less than block size / CHUNKSIZE (24) / minimum number of
+ * chunks per entry (3).
+ */
+#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5)
+#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l))
+
+/*
+ * The chunks start immediately after the hash table. The end of the
+ * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
+ * chunk_t.
+ */
+#define ZAP_LEAF_CHUNK(l, idx) \
+ ((zap_leaf_chunk_t *) \
+ ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
+#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
+
+typedef enum zap_chunk_type {
+ ZAP_CHUNK_FREE = 253,
+ ZAP_CHUNK_ENTRY = 252,
+ ZAP_CHUNK_ARRAY = 251,
+ ZAP_CHUNK_TYPE_MAX = 250
+} zap_chunk_type_t;
+
+/*
+ * TAKE NOTE:
+ * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
+ */
+typedef struct zap_leaf_phys {
+ struct zap_leaf_header {
+ uint64_t lh_block_type; /* ZBT_LEAF */
+ uint64_t lh_pad1;
+ uint64_t lh_prefix; /* hash prefix of this leaf */
+ uint32_t lh_magic; /* ZAP_LEAF_MAGIC */
+ uint16_t lh_nfree; /* number of free chunks */
+ uint16_t lh_nentries; /* number of entries */
+ uint16_t lh_prefix_len; /* num bits used to id this */
+
+/* above is accessible to zap, below is zap_leaf private */
+
+ uint16_t lh_freelist; /* chunk head of free list */
+ uint8_t lh_pad2[12];
+ } l_hdr; /* 2 24-byte chunks */
+
+ /*
+ * The header is followed by a hash table with
+ * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is
+ * followed by an array of ZAP_LEAF_NUMCHUNKS(zap)
+ * zap_leaf_chunk structures. These structures are accessed
+ * with the ZAP_LEAF_CHUNK() macro.
+ */
+
+ uint16_t l_hash[1];
+} zap_leaf_phys_t;
+
+typedef union zap_leaf_chunk {
+ struct zap_leaf_entry {
+ uint8_t le_type; /* always ZAP_CHUNK_ENTRY */
+ uint8_t le_int_size; /* size of ints */
+ uint16_t le_next; /* next entry in hash chain */
+ uint16_t le_name_chunk; /* first chunk of the name */
+ uint16_t le_name_length; /* bytes in name, incl null */
+ uint16_t le_value_chunk; /* first chunk of the value */
+ uint16_t le_value_length; /* value length in ints */
+ uint32_t le_cd; /* collision differentiator */
+ uint64_t le_hash; /* hash value of the name */
+ } l_entry;
+ struct zap_leaf_array {
+ uint8_t la_type; /* always ZAP_CHUNK_ARRAY */
+ uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t la_next; /* next blk or CHAIN_END */
+ } l_array;
+ struct zap_leaf_free {
+ uint8_t lf_type; /* always ZAP_CHUNK_FREE */
+ uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
+ uint16_t lf_next; /* next in free list, or CHAIN_END */
+ } l_free;
+} zap_leaf_chunk_t;
+
+typedef struct zap_leaf {
+ krwlock_t l_rwlock; /* only used on head of chain */
+ uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */
+ int l_bs; /* block size shift */
+ dmu_buf_t *l_dbuf;
+ zap_leaf_phys_t *l_phys;
+} zap_leaf_t;
+
+
+typedef struct zap_entry_handle {
+ /* below is set by zap_leaf.c and is public to zap.c */
+ uint64_t zeh_num_integers;
+ uint64_t zeh_hash;
+ uint32_t zeh_cd;
+ uint8_t zeh_integer_size;
+
+ /* below is private to zap_leaf.c */
+ uint16_t zeh_fakechunk;
+ uint16_t *zeh_chunkp;
+ zap_leaf_t *zeh_leaf;
+} zap_entry_handle_t;
+
+/*
+ * Return a handle to the named entry, or ENOENT if not found. The hash
+ * value must equal zap_hash(name).
+ */
+extern int zap_leaf_lookup(zap_leaf_t *l,
+ const char *name, uint64_t h, zap_entry_handle_t *zeh);
+
+/*
+ * Return a handle to the entry with this hash+cd, or the entry with the
+ * next closest hash+cd.
+ */
+extern int zap_leaf_lookup_closest(zap_leaf_t *l,
+ uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh);
+
+/*
+ * Read the first num_integers in the attribute. Integer size
+ * conversion will be done without sign extension. Return EINVAL if
+ * integer_size is too small. Return EOVERFLOW if there are more than
+ * num_integers in the attribute.
+ */
+extern int zap_entry_read(const zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, void *buf);
+
+extern int zap_entry_read_name(const zap_entry_handle_t *zeh,
+ uint16_t buflen, char *buf);
+
+/*
+ * Replace the value of an existing entry.
+ *
+ * zap_entry_update may fail if it runs out of space (ENOSPC).
+ */
+extern int zap_entry_update(zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, const void *buf);
+
+/*
+ * Remove an entry.
+ */
+extern void zap_entry_remove(zap_entry_handle_t *zeh);
+
+/*
+ * Create an entry. An equal entry must not exist, and this entry must
+ * belong in this leaf (according to its hash value). Fills in the
+ * entry handle on success. Returns 0 on success or ENOSPC on failure.
+ */
+extern int zap_entry_create(zap_leaf_t *l,
+ const char *name, uint64_t h, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh);
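+
+/*
+ * Usage sketch (editor's illustration, not part of the original
+ * header; "l" and "name" are hypothetical). A caller that already
+ * holds the leaf can look an entry up by name and read its value as
+ * a single 64-bit integer, following the contracts documented above:
+ *
+ *	zap_entry_handle_t zeh;
+ *	uint64_t value;
+ *	int err = zap_leaf_lookup(l, name, zap_hash(name), &zeh);
+ *	if (err == 0)
+ *		err = zap_entry_read(&zeh, sizeof (uint64_t), 1, &value);
+ */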
+
+/*
+ * Other stuff.
+ */
+
+extern void zap_leaf_init(zap_leaf_t *l);
+extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len);
+extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl);
+extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_LEAF_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
new file mode 100644
index 0000000..3250b76
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
@@ -0,0 +1,115 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_ACL_H
+#define _SYS_FS_ZFS_ACL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef _KERNEL
+#include <sys/cred.h>
+#endif
+#include <sys/acl.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct znode_phys;
+
+#define ACCESS_UNDETERMINED -1
+
+#define ACE_SLOT_CNT 6
+
+typedef struct zfs_znode_acl {
+ uint64_t z_acl_extern_obj; /* ext acl pieces */
+ uint32_t z_acl_count; /* Number of ACEs */
+ uint16_t z_acl_version; /* acl version */
+ uint16_t z_acl_pad; /* pad */
+ ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */
+} zfs_znode_acl_t;
+
+#define ACL_DATA_ALLOCED 0x1
+
+/*
+ * Max ACL size is a prepended deny ACE for every entry plus the
+ * canonical six tacked on the end.
+ */
+#define MAX_ACL_SIZE (MAX_ACL_ENTRIES * 2 + 6)
+
+typedef struct zfs_acl {
+ int z_slots; /* number of allocated slots for ACEs */
+ int z_acl_count;
+ uint_t z_state;
+ ace_t *z_acl;
+} zfs_acl_t;
+
+#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt))
+
+/*
+ * Property values for acl_mode and acl_inherit.
+ *
+ * acl_mode can take discard, noallow, groupmask and passthrough,
+ * whereas acl_inherit has secure instead of groupmask.
+ */
+
+#define ZFS_ACL_DISCARD 0
+#define ZFS_ACL_NOALLOW 1
+#define ZFS_ACL_GROUPMASK 2
+#define ZFS_ACL_PASSTHROUGH 3
+#define ZFS_ACL_SECURE 4
+
+struct znode;
+
+#ifdef _KERNEL
+void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *,
+ dmu_tx_t *, cred_t *);
+#ifdef TODO
+int zfs_getacl(struct znode *, vsecattr_t *, cred_t *);
+#endif
+int zfs_mode_update(struct znode *, uint64_t, dmu_tx_t *);
+#ifdef TODO
+int zfs_setacl(struct znode *, vsecattr_t *, cred_t *);
+#endif
+void zfs_acl_rele(void *);
+void zfs_ace_byteswap(ace_t *, int);
+extern int zfs_zaccess(struct znode *, int, cred_t *);
+extern int zfs_zaccess_rwx(struct znode *, mode_t, cred_t *);
+extern int zfs_acl_access(struct znode *, int, cred_t *);
+int zfs_acl_chmod_setattr(struct znode *, uint64_t, dmu_tx_t *);
+int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
+int zfs_zaccess_rename(struct znode *, struct znode *,
+ struct znode *, struct znode *, cred_t *cr);
+int zfs_zaccess_v4_perm(struct znode *, int, cred_t *);
+void zfs_acl_free(zfs_acl_t *);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_ACL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
new file mode 100644
index 0000000..c91d807
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
@@ -0,0 +1,122 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_CONTEXT_H
+#define _SYS_ZFS_CONTEXT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/param.h>
+#include <sys/stdint.h>
+#include <sys/note.h>
+#include <sys/kernel.h>
+#include <sys/debug.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/sysmacros.h>
+#include <sys/bitmap.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/kobj.h>
+#include <sys/conf.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/random.h>
+#include <sys/byteorder.h>
+#include <sys/list.h>
+#include <sys/uio.h>
+#include <sys/dirent.h>
+#include <sys/time.h>
+#include <sys/fcntl.h>
+#include <sys/limits.h>
+#include <sys/string.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/cred.h>
+#include <sys/sdt.h>
+#include <sys/file.h>
+#include <sys/vfs.h>
+#include <sys/sysctl.h>
+#include <sys/sbuf.h>
+#include <sys/priv.h>
+#include <sys/kdb.h>
+#include <sys/ktr.h>
+#include <sys/stack.h>
+#include <sys/lockf.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/eventhandler.h>
+#include <sys/zfs_debug.h>
+
+#include <machine/stdarg.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+/* There is a clash: vm_map.h defines the two macros below and vdev_cache.c uses them. */
+#ifdef min_offset
+#undef min_offset
+#endif
+#ifdef max_offset
+#undef max_offset
+#endif
+#include <vm/vm_extern.h>
+#include <vm/vnode_pager.h>
+
+#define CPU_SEQID (curcpu)
+
+#ifdef __cplusplus
+}
+#endif
+
+#define physmem (vm_kmem_size / PAGE_SIZE)
+
+extern int zfs_debug_level;
+extern struct mtx zfs_debug_mtx;
+#define ZFS_LOG(lvl, ...) do { \
+ if (((lvl) & 0xff) <= zfs_debug_level) { \
+ mtx_lock(&zfs_debug_mtx); \
+ printf("%s:%u[%d]: ", __func__, __LINE__, (lvl)); \
+ printf(__VA_ARGS__); \
+ printf("\n"); \
+ if ((lvl) & 0x100) \
+ kdb_backtrace(); \
+ mtx_unlock(&zfs_debug_mtx); \
+ } \
+} while (0)
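+
+/*
+ * Example (editor's note; "osname" hypothetical):
+ * ZFS_LOG(1, "mounting %s", osname) prints only when zfs_debug_level
+ * is at least 1; or'ing 0x100 into the level additionally dumps a
+ * stack via kdb_backtrace().
+ */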
+
+#endif /* _SYS_ZFS_CONTEXT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
new file mode 100644
index 0000000..a676533
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
@@ -0,0 +1,71 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZFS_CTLDIR_H
+#define _ZFS_CTLDIR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/vnode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_CTLDIR_NAME ".zfs"
+
+#define zfs_has_ctldir(zdp) \
+ ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \
+ ((zdp)->z_zfsvfs->z_ctldir != NULL))
+#define zfs_show_ctldir(zdp) \
+ (zfs_has_ctldir(zdp) && \
+ ((zdp)->z_zfsvfs->z_show_ctldir))
+
+void zfsctl_create(zfsvfs_t *);
+void zfsctl_destroy(zfsvfs_t *);
+vnode_t *zfsctl_root(znode_t *);
+void zfsctl_init(void);
+void zfsctl_fini(void);
+
+int zfsctl_rename_snapshot(const char *from, const char *to);
+int zfsctl_destroy_snapshot(const char *snapname, int force);
+int zfsctl_umount_snapshots(vfs_t *, int, cred_t *);
+
+int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+ int flags, vnode_t *rdir, cred_t *cr);
+
+int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
+
+#define ZFSCTL_INO_ROOT 0x1
+#define ZFSCTL_INO_SNAPDIR 0x2
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_CTLDIR_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
new file mode 100644
index 0000000..450ac1c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_DEBUG_H
+#define _SYS_ZFS_DEBUG_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+/*
+ * ZFS debugging
+ */
+
+#if defined(DEBUG) || !defined(_KERNEL)
+#define ZFS_DEBUG
+#endif
+
+extern int zfs_flags;
+
+#define ZFS_DEBUG_DPRINTF 0x0001
+#define ZFS_DEBUG_DBUF_VERIFY 0x0002
+#define ZFS_DEBUG_DNODE_VERIFY 0x0004
+#define ZFS_DEBUG_SNAPNAMES 0x0008
+#define ZFS_DEBUG_MODIFY 0x0010
+
+#ifdef ZFS_DEBUG
+extern void __dprintf(const char *file, const char *func,
+ int line, const char *fmt, ...);
+#define dprintf(...) do { \
+	if (zfs_flags & ZFS_DEBUG_DPRINTF) \
+		__dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__); \
+} while (0)
+#else
+#define dprintf(...) ((void)0)
+#endif /* ZFS_DEBUG */
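+
+/*
+ * Example (editor's note): dprintf("txg %llu", (u_longlong_t)txg)
+ * costs nothing unless ZFS_DEBUG is defined, and even then prints
+ * only when the ZFS_DEBUG_DPRINTF bit is set in zfs_flags.
+ */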
+
+extern void zfs_panic_recover(const char *fmt, ...);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_DEBUG_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
new file mode 100644
index 0000000..f60d614
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
@@ -0,0 +1,71 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_DIR_H
+#define _SYS_FS_ZFS_DIR_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/zfs_znode.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* zfs_dirent_lock() flags */
+#define ZNEW 0x0001 /* entry should not exist */
+#define ZEXISTS 0x0002 /* entry should exist */
+#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */
+#define ZXATTR 0x0008 /* we want the xattr dir */
+#define ZRENAMING 0x0010 /* znode is being renamed */
+
+/* mknode flags */
+#define IS_ROOT_NODE 0x01 /* create a root node */
+#define IS_XATTR 0x02 /* create an extended attribute node */
+#define IS_REPLAY 0x04 /* we are replaying intent log */
+
+extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
+ int);
+extern void zfs_dirent_unlock(zfs_dirlock_t *);
+extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int);
+extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
+ boolean_t *);
+extern int zfs_dirlook(znode_t *, char *, vnode_t **);
+extern void zfs_mknode(znode_t *, vattr_t *, uint64_t *,
+ dmu_tx_t *, cred_t *, uint_t, znode_t **, int);
+extern void zfs_rmnode(znode_t *);
+extern boolean_t zfs_dirempty(znode_t *);
+extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
+extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);
+extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr);
+extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *, int);
+extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_DIR_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
new file mode 100644
index 0000000..d28729b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -0,0 +1,162 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_IOCTL_H
+#define _SYS_ZFS_IOCTL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/cred.h>
+#include <sys/dmu.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Property values for snapdir
+ */
+#define ZFS_SNAPDIR_HIDDEN 0
+#define ZFS_SNAPDIR_VISIBLE 1
+
+#define DMU_BACKUP_VERSION (1ULL)
+#define DMU_BACKUP_MAGIC 0x2F5bacbacULL
+
+/*
+ * zfs ioctl command structure
+ */
+typedef struct dmu_replay_record {
+ enum {
+ DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
+ DRR_WRITE, DRR_FREE, DRR_END,
+ } drr_type;
+ uint32_t drr_pad;
+ union {
+ struct drr_begin {
+ uint64_t drr_magic;
+ uint64_t drr_version;
+ uint64_t drr_creation_time;
+ dmu_objset_type_t drr_type;
+ uint32_t drr_pad;
+ uint64_t drr_toguid;
+ uint64_t drr_fromguid;
+ char drr_toname[MAXNAMELEN];
+ } drr_begin;
+ struct drr_end {
+ zio_cksum_t drr_checksum;
+ } drr_end;
+ struct drr_object {
+ uint64_t drr_object;
+ dmu_object_type_t drr_type;
+ dmu_object_type_t drr_bonustype;
+ uint32_t drr_blksz;
+ uint32_t drr_bonuslen;
+ uint8_t drr_checksum;
+ uint8_t drr_compress;
+ uint8_t drr_pad[6];
+ /* bonus content follows */
+ } drr_object;
+ struct drr_freeobjects {
+ uint64_t drr_firstobj;
+ uint64_t drr_numobjs;
+ } drr_freeobjects;
+ struct drr_write {
+ uint64_t drr_object;
+ dmu_object_type_t drr_type;
+ uint32_t drr_pad;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ /* content follows */
+ } drr_write;
+ struct drr_free {
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ uint64_t drr_length;
+ } drr_free;
+ } drr_u;
+} dmu_replay_record_t;
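+
+/*
+ * Stream sketch (editor's illustration): a backup stream is a
+ * DRR_BEGIN record, then a sequence of DRR_OBJECT, DRR_FREEOBJECTS,
+ * DRR_WRITE and DRR_FREE records (each followed by any payload its
+ * field comments note), terminated by a DRR_END record carrying the
+ * running checksum. A reader might validate the head like this,
+ * where "drr" was filled from the stream:
+ *
+ *	if (drr.drr_type != DRR_BEGIN ||
+ *	    drr.drr_u.drr_begin.drr_magic != DMU_BACKUP_MAGIC ||
+ *	    drr.drr_u.drr_begin.drr_version != DMU_BACKUP_VERSION)
+ *		return (EINVAL);
+ */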
+
+typedef struct zinject_record {
+ uint64_t zi_objset;
+ uint64_t zi_object;
+ uint64_t zi_start;
+ uint64_t zi_end;
+ uint64_t zi_guid;
+ uint32_t zi_level;
+ uint32_t zi_error;
+ uint64_t zi_type;
+ uint32_t zi_freq;
+} zinject_record_t;
+
+#define ZINJECT_NULL 0x1
+#define ZINJECT_FLUSH_ARC 0x2
+#define ZINJECT_UNLOAD_SPA 0x4
+
+typedef struct zfs_cmd {
+ char zc_name[MAXPATHLEN];
+ char zc_value[MAXPATHLEN * 2];
+ uint64_t zc_guid;
+ uint64_t zc_nvlist_src; /* really (char *) */
+ uint64_t zc_nvlist_src_size;
+ uint64_t zc_nvlist_dst; /* really (char *) */
+ uint64_t zc_nvlist_dst_size;
+ uint64_t zc_cookie;
+ uint64_t zc_cred;
+ uint64_t zc_dev;
+ uint64_t zc_objset_type;
+ uint64_t zc_history; /* really (char *) */
+ uint64_t zc_history_len;
+ uint64_t zc_history_offset;
+ uint64_t zc_obj;
+ uint64_t zc_jailid;
+ dmu_objset_stats_t zc_objset_stats;
+ struct drr_begin zc_begin_record;
+ zinject_record_t zc_inject_record;
+} zfs_cmd_t;
+
+#ifdef _KERNEL
+typedef struct zfs_create_data {
+ cred_t *zc_cred;
+ dev_t zc_dev;
+ nvlist_t *zc_props;
+} zfs_create_data_t;
+#endif
+
+#define ZVOL_MAX_MINOR (1 << 16)
+#define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1)
+
+#ifdef _KERNEL
+
+extern int zfs_secpolicy_write(const char *dataset, cred_t *cr);
+extern int zfs_busy(void);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_IOCTL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
new file mode 100644
index 0000000..f302b66
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_RLOCK_H
+#define _SYS_FS_ZFS_RLOCK_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#include <sys/zfs_znode.h>
+
+typedef enum {
+ RL_READER,
+ RL_WRITER,
+ RL_APPEND
+} rl_type_t;
+
+typedef struct rl {
+ znode_t *r_zp; /* znode this lock applies to */
+ avl_node_t r_node; /* avl node link */
+ uint64_t r_off; /* file range offset */
+ uint64_t r_len; /* file range length */
+ uint_t r_cnt; /* range reference count in tree */
+ rl_type_t r_type; /* range type */
+ kcondvar_t r_wr_cv; /* cv for waiting writers */
+ kcondvar_t r_rd_cv; /* cv for waiting readers */
+ uint8_t r_proxy; /* acting for original range */
+ uint8_t r_write_wanted; /* writer wants to lock this range */
+ uint8_t r_read_wanted; /* reader wants to lock this range */
+} rl_t;
+
+/*
+ * Lock a range (offset, length) as either shared (READER)
+ * or exclusive (WRITER or APPEND). APPEND is a special type that
+ * is converted to a WRITER lock covering from the end of the file
+ * onward. zfs_range_lock() returns the range lock structure.
+ */
+rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type);
+
+/*
+ * Unlock range and destroy range lock structure.
+ */
+void zfs_range_unlock(rl_t *rl);
+
+/*
+ * Reduce a range locked as RL_WRITER from the whole file to the
+ * specified range. Asserts the whole file was previously locked.
+ */
+void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len);
+
+/*
+ * AVL comparison function used to compare range locks
+ */
+int zfs_range_compare(const void *arg1, const void *arg2);
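+
+/*
+ * Usage sketch (editor's illustration; "zp", "off" and "len" are
+ * hypothetical):
+ *
+ *	rl_t *rl = zfs_range_lock(zp, off, len, RL_WRITER);
+ *	... modify the locked byte range, updating zp_size if needed ...
+ *	zfs_range_unlock(rl);
+ */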
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_RLOCK_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
new file mode 100644
index 0000000..aa82cc1
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
@@ -0,0 +1,100 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_VFSOPS_H
+#define _SYS_FS_ZFS_VFSOPS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/list.h>
+#include <sys/vfs.h>
+#include <sys/zil.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct zfsvfs zfsvfs_t;
+
+struct zfsvfs {
+ vfs_t *z_vfs; /* generic fs struct */
+ zfsvfs_t *z_parent; /* parent fs */
+ objset_t *z_os; /* objset reference */
+ uint64_t z_root; /* id of root znode */
+ uint64_t z_unlinkedobj; /* id of unlinked zapobj */
+ uint64_t z_max_blksz; /* maximum block size for files */
+ uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */
+ zilog_t *z_log; /* intent log pointer */
+ uint_t z_acl_mode; /* acl chmod/mode behavior */
+ uint_t z_acl_inherit; /* acl inheritance behavior */
+ boolean_t z_atime; /* enable atimes mount option */
+ boolean_t z_unmounted1; /* unmounted phase 1 */
+ boolean_t z_unmounted2; /* unmounted phase 2 */
+ uint32_t z_op_cnt; /* vnode/vfs operations ref count */
+ krwlock_t z_um_lock; /* rw lock for umount phase 2 */
+ list_t z_all_znodes; /* all vnodes in the fs */
+ kmutex_t z_znodes_lock; /* lock for z_all_znodes */
+ vnode_t *z_ctldir; /* .zfs directory pointer */
+ boolean_t z_show_ctldir; /* expose .zfs in the root dir */
+ boolean_t z_issnap; /* true if this is a snapshot */
+#define ZFS_OBJ_MTX_SZ 64
+ kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
+};
+
+/*
+ * The total file ID size is limited to 12 bytes (including the length
+ * field) in the NFSv2 protocol. For historical reasons, this same limit
+ * is currently being imposed by the Solaris NFSv3 implementation...
+ * although the protocol actually permits a maximum of 64 bytes. It will
+ * not be possible to expand beyond 12 bytes without abandoning support
+ * of NFSv2 and making some changes to the Solaris NFSv3 implementation.
+ *
+ * For the time being, we will partition up the available space as follows:
+ * 2 bytes fid length (required)
+ * 6 bytes object number (48 bits)
+ * 4 bytes generation number (32 bits)
+ * We reserve only 48 bits for the object number, as this is the limit
+ * currently defined and imposed by the DMU.
+ */
+typedef struct zfid_short {
+ uint16_t zf_len;
+ uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */
+ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */
+} zfid_short_t;
+
+typedef struct zfid_long {
+ zfid_short_t z_fid;
+ uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */
+ uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */
+} zfid_long_t;
+
+#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t))
+#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t))
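+
+/*
+ * Encoding sketch (editor's illustration) following the obj[i] and
+ * gen[i] convention noted above; "object" and "gen" are hypothetical
+ * uint64_t values:
+ *
+ *	zfid_short_t zfid;
+ *	int i;
+ *	zfid.zf_len = SHORT_FID_LEN;
+ *	for (i = 0; i < sizeof (zfid.zf_object); i++)
+ *		zfid.zf_object[i] = (uint8_t)(object >> (8 * i));
+ *	for (i = 0; i < sizeof (zfid.zf_gen); i++)
+ *		zfid.zf_gen[i] = (uint8_t)(gen >> (8 * i));
+ */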
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_VFSOPS_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
new file mode 100644
index 0000000..c9c317e
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
@@ -0,0 +1,298 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FS_ZFS_ZNODE_H
+#define _SYS_FS_ZFS_ZNODE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef _KERNEL
+#include <sys/list.h>
+#include <sys/dmu.h>
+#include <sys/zfs_vfsops.h>
+#endif
+#include <sys/zfs_acl.h>
+#include <sys/zil.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Define special zfs pflags
+ */
+#define ZFS_XATTR 0x1 /* is an extended attribute */
+#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */
+#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */
+
+#define MASTER_NODE_OBJ 1
+
+/*
+ * special attributes for master node.
+ */
+
+#define ZFS_FSID "FSID"
+#define ZFS_UNLINKED_SET "DELETE_QUEUE"
+#define ZFS_ROOT_OBJ "ROOT"
+#define ZPL_VERSION_OBJ "VERSION"
+#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE"
+#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS"
+
+#define ZFS_FLAG_BLOCKPERPAGE 0x1
+#define ZFS_FLAG_NOGROWBLOCKS 0x2
+
+/*
+ * ZPL version - rev'd whenever an incompatible on-disk format change
+ * occurs. Independent of SPA/DMU/ZAP versioning.
+ */
+
+#define ZPL_VERSION 1ULL
+
+#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
+
+/* Path component length */
+/*
+ * The generic fs code uses MAXNAMELEN to represent
+ * what the largest component length is. Unfortunately,
+ * this length includes the terminating NULL. ZFS needs
+ * to tell the users via pathconf() and statvfs() what the
+ * true maximum length of a component is, excluding the NULL.
+ */
+#define ZFS_MAXNAMELEN (MAXNAMELEN - 1)
+
+/*
+ * The directory entry has the type (currently unused on Solaris) in the
+ * top 4 bits, and the object number in the low 48 bits. The "middle"
+ * 12 bits are unused.
+ */
+#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4)
+#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
+#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)type << 60) | obj)
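+
+/*
+ * For example (editor's note): an entry for object number 5 with
+ * directory-entry type 8 (DT_REG) is stored as ZFS_DIRENT_MAKE(8, 5)
+ * == 0x8000000000000005ULL; ZFS_DIRENT_OBJ() recovers the 5 and
+ * ZFS_DIRENT_TYPE() the 8.
+ */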
+
+
+/*
+ * This is the persistent portion of the znode. It is stored
+ * in the "bonus buffer" of the file. Short symbolic links
+ * are also stored in the bonus buffer.
+ */
+typedef struct znode_phys {
+ uint64_t zp_atime[2]; /* 0 - last file access time */
+ uint64_t zp_mtime[2]; /* 16 - last file modification time */
+ uint64_t zp_ctime[2]; /* 32 - last file change time */
+ uint64_t zp_crtime[2]; /* 48 - creation time */
+ uint64_t zp_gen; /* 64 - generation (txg of creation) */
+ uint64_t zp_mode; /* 72 - file mode bits */
+ uint64_t zp_size; /* 80 - size of file */
+ uint64_t zp_parent; /* 88 - directory parent (`..') */
+ uint64_t zp_links; /* 96 - number of links to file */
+ uint64_t zp_xattr; /* 104 - DMU object for xattrs */
+ uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
+ uint64_t zp_flags; /* 120 - persistent flags */
+ uint64_t zp_uid; /* 128 - file owner */
+ uint64_t zp_gid; /* 136 - owning group */
+ uint64_t zp_pad[4]; /* 144 - future */
+ zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */
+ /*
+ * Data may pad out any remaining bytes in the znode buffer, e.g.:
+ *
+ * |<---------------------- dnode_phys (512) ------------------------>|
+ * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+ * |<---- znode (264) ---->|<---- data (56) ---->|
+ *
+ * At present, we only use this space to store symbolic links.
+ */
+} znode_phys_t;
+
+/*
+ * Directory entry locks control access to directory entries.
+ * They are used to protect creates, deletes, and renames.
+ * Each directory znode has a mutex and a list of locked names.
+ */
+#ifdef _KERNEL
+typedef struct zfs_dirlock {
+ char *dl_name; /* directory entry being locked */
+ uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */
+ uint16_t dl_namesize; /* set if dl_name was allocated */
+ kcondvar_t dl_cv; /* wait for entry to be unlocked */
+ struct znode *dl_dzp; /* directory znode */
+ struct zfs_dirlock *dl_next; /* next in z_dirlocks list */
+} zfs_dirlock_t;
+
+typedef struct znode {
+ struct zfsvfs *z_zfsvfs;
+ vnode_t *z_vnode;
+ uint64_t z_id; /* object ID for this znode */
+ kmutex_t z_lock; /* znode modification lock */
+ krwlock_t z_map_lock; /* page map lock */
+ krwlock_t z_parent_lock; /* parent lock for directories */
+ krwlock_t z_name_lock; /* "master" lock for dirent locks */
+ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */
+ kmutex_t z_range_lock; /* protects changes to z_range_avl */
+ avl_tree_t z_range_avl; /* avl tree of file range locks */
+ uint8_t z_unlinked; /* file has been unlinked */
+ uint8_t z_atime_dirty; /* atime needs to be synced */
+ uint8_t z_dbuf_held; /* Is z_dbuf already held? */
+ uint8_t z_zn_prefetch; /* Prefetch znodes? */
+ uint_t z_blksz; /* block size in bytes */
+ uint_t z_seq; /* modification sequence number */
+ uint64_t z_mapcnt; /* number of pages mapped to file */
+ uint64_t z_last_itx; /* last ZIL itx on this znode */
+ uint32_t z_sync_cnt; /* synchronous open count */
+ kmutex_t z_acl_lock; /* acl data lock */
+ list_node_t z_link_node; /* all znodes in fs link */
+ struct lockf *z_lockf; /* Head of byte-level lock list. */
+ /*
+ * These are dmu managed fields.
+ */
+ znode_phys_t *z_phys; /* pointer to persistent znode */
+ dmu_buf_t *z_dbuf; /* buffer containing the z_phys */
+} znode_t;
+
+
+/*
+ * Range locking rules
+ * --------------------
+ * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole
+ * file range needs to be locked as RL_WRITER. Only then can the pages
+ * be freed, etc., and zp_size reset. zp_size must be set within the
+ * range lock.
+ * 2. For writes and punching holes (zfs_write & zfs_space) just the range
+ * being written or freed needs to be locked as RL_WRITER.
+ * Multiple writes at the end of the file must coordinate zp_size updates
+ * to ensure data isn't lost. A compare and swap loop is currently used
+ * to ensure the file size is at least the offset last written.
+ * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being
+ * read needs to be locked as RL_READER. A check against zp_size can then
+ * be made for reading beyond end of file.
+ */
+
+/*
+ * Convert between znode pointers and vnode pointers
+ */
+#define ZTOV(ZP) ((ZP)->z_vnode)
+#define VTOZ(VP) ((znode_t *)(VP)->v_data)
+
+/*
+ * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation.
+ * ZFS_EXIT() must be called before exiting the vop.
+ */
+#define ZFS_ENTER(zfsvfs) \
+ { \
+ atomic_add_32(&(zfsvfs)->z_op_cnt, 1); \
+ if ((zfsvfs)->z_unmounted1) { \
+ ZFS_EXIT(zfsvfs); \
+ return (EIO); \
+ } \
+ }
+#define ZFS_EXIT(zfsvfs) atomic_add_32(&(zfsvfs)->z_op_cnt, -1)
+
+/*
+ * Macros for dealing with dmu_buf_hold
+ */
+#define ZFS_OBJ_HASH(obj_num) (obj_num & (ZFS_OBJ_MTX_SZ - 1))
+#define ZFS_OBJ_MUTEX(zp) \
+ (&zp->z_zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(zp->z_id)])
+#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \
+	mutex_enter(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)])
+
+#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \
+ mutex_exit(&zfsvfs->z_hold_mtx[ZFS_OBJ_HASH(obj_num)])
+
+/*
+ * Macros to encode/decode ZFS stored time values from/to struct timespec
+ */
+#define ZFS_TIME_ENCODE(tp, stmp) \
+{ \
+ stmp[0] = (uint64_t)(tp)->tv_sec; \
+ stmp[1] = (uint64_t)(tp)->tv_nsec; \
+}
+
+#define ZFS_TIME_DECODE(tp, stmp) \
+{ \
+ (tp)->tv_sec = (time_t)stmp[0]; \
+ (tp)->tv_nsec = (long)stmp[1]; \
+}
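+
+/*
+ * Example (editor's note): ZFS_TIME_ENCODE(&vap->va_atime,
+ * zp->z_phys->zp_atime) stores the seconds in zp_atime[0] and the
+ * nanoseconds in zp_atime[1]; ZFS_TIME_DECODE reverses the mapping.
+ */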
+
+/*
+ * Timestamp defines
+ */
+#define ACCESSED (AT_ATIME)
+#define STATE_CHANGED (AT_CTIME)
+#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME)
+
+#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
+ if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
+ zfs_time_stamper(zp, ACCESSED, NULL)
+
+extern int zfs_init_fs(zfsvfs_t *, znode_t **, cred_t *);
+extern void zfs_set_dataprop(objset_t *);
+extern void zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx);
+extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *);
+extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *);
+extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
+extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
+extern void zfs_znode_init(void);
+extern void zfs_znode_fini(void);
+extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
+extern void zfs_zinactive(znode_t *);
+extern void zfs_znode_delete(znode_t *, dmu_tx_t *);
+extern void zfs_znode_free(znode_t *);
+extern void zfs_remove_op_tables();
+extern int zfs_create_op_tables();
+extern dev_t zfs_cmpldev(uint64_t);
+
+extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name);
+extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, char *name);
+extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name);
+extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name, char *link);
+extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
+extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, offset_t off, ssize_t len, int ioflag);
+extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, uint64_t off, uint64_t len);
+extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, vattr_t *vap, uint_t mask_applied);
+#ifndef ZFS_NO_ACL
+extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, int aclcnt, ace_t *z_ace);
+#endif
+
+extern zil_get_data_t zfs_get_data;
+extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
+extern int zfsfstype;
+
+#endif /* _KERNEL */
+
+extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_ZNODE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
new file mode 100644
index 0000000..947ba9f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
@@ -0,0 +1,276 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIL_H
+#define _SYS_ZIL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Intent log format:
+ *
+ * Each objset has its own intent log. The log header (zil_header_t)
+ * for objset N's intent log is kept in the Nth object of the SPA's
+ * intent_log objset. The log header points to a chain of log blocks,
+ * each of which contains log records (i.e., transactions) followed by
+ * a log block trailer (zil_trailer_t). The format of a log record
+ * depends on the record (or transaction) type, but all records begin
+ * with a common structure that defines the type, length, and txg.
+ */
+
+/*
+ * Intent log header - this on disk structure holds fields to manage
+ * the log. All fields are 64 bit to easily handle cross architectures.
+ */
+typedef struct zil_header {
+ uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
+ uint64_t zh_replay_seq; /* highest replayed sequence number */
+ blkptr_t zh_log; /* log chain */
+ uint64_t zh_claim_seq; /* highest claimed sequence number */
+ uint64_t zh_pad[5];
+} zil_header_t;
+
+/*
+ * Log block trailer - structure at the end of the header and each log block
+ *
+ * The zit_bt contains a zbt_cksum which for the intent log is
+ * the sequence number of this log block. A seq of 0 is invalid.
+ * The zbt_cksum is checked by the SPA against the sequence
+ * number passed in the blk_cksum field of the blkptr_t
+ */
+typedef struct zil_trailer {
+ uint64_t zit_pad;
+ blkptr_t zit_next_blk; /* next block in chain */
+ uint64_t zit_nused; /* bytes in log block used */
+ zio_block_tail_t zit_bt; /* block trailer */
+} zil_trailer_t;
+
+#define ZIL_MIN_BLKSZ 4096ULL
+#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE
+#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t))
+
+/*
+ * The words of a log block checksum.
+ */
+#define ZIL_ZC_GUID_0 0
+#define ZIL_ZC_GUID_1 1
+#define ZIL_ZC_OBJSET 2
+#define ZIL_ZC_SEQ 3
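+
+/*
+ * For example (editor's note): a parser can check that
+ * bp->blk_cksum.zc_word[ZIL_ZC_SEQ] matches the sequence number it
+ * expects next, per the zil_trailer comment above (a seq of 0 is
+ * invalid).
+ */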
+
+/*
+ * Intent log transaction types and record structures
+ */
+#define TX_CREATE 1 /* Create file */
+#define TX_MKDIR 2 /* Make directory */
+#define TX_MKXATTR 3 /* Make XATTR directory */
+#define TX_SYMLINK 4 /* Create symbolic link to a file */
+#define TX_REMOVE 5 /* Remove file */
+#define TX_RMDIR 6 /* Remove directory */
+#define TX_LINK 7 /* Create hard link to a file */
+#define TX_RENAME 8 /* Rename a file */
+#define TX_WRITE 9 /* File write */
+#define TX_TRUNCATE 10 /* Truncate a file */
+#define TX_SETATTR 11 /* Set file attributes */
+#define TX_ACL 12 /* Set acl */
+#define TX_MAX_TYPE 13 /* Max transaction type */
+
+/*
+ * Format of log records.
+ * The fields are carefully defined to allow them to be aligned
+ * and sized the same on sparc & intel architectures.
+ * Each log record has a common structure at the beginning.
+ *
+ * Note, lrc_seq holds two different sequence numbers: while in memory
+ * it contains the transaction sequence number, whereas the log record
+ * on disk holds the sequence number over all log records, which is
+ * used to ensure we don't replay the same record twice. The two
+ * numbers differ because transactions can now be pushed out of order.
+ */
+typedef struct { /* common log record header */
+ uint64_t lrc_txtype; /* intent log transaction type */
+ uint64_t lrc_reclen; /* transaction record length */
+ uint64_t lrc_txg; /* dmu transaction group number */
+ uint64_t lrc_seq; /* see comment above */
+} lr_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* object id of directory */
+ uint64_t lr_foid; /* object id of created file object */
+ uint64_t lr_mode; /* mode of object */
+ uint64_t lr_uid; /* uid of object */
+ uint64_t lr_gid; /* gid of object */
+ uint64_t lr_gen; /* generation (txg of creation) */
+ uint64_t lr_crtime[2]; /* creation time */
+ uint64_t lr_rdev; /* rdev of object to create */
+ /* name of object to create follows this */
+ /* for symlinks, link content follows name */
+} lr_create_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* obj id of directory */
+ /* name of object to remove follows this */
+} lr_remove_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_doid; /* obj id of directory */
+ uint64_t lr_link_obj; /* obj id of link */
+ /* name of object to link follows this */
+} lr_link_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_sdoid; /* obj id of source directory */
+ uint64_t lr_tdoid; /* obj id of target directory */
+ /* 2 strings: names of source and destination follow this */
+} lr_rename_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* file object to write */
+ uint64_t lr_offset; /* offset to write to */
+ uint64_t lr_length; /* user data length to write */
+ uint64_t lr_blkoff; /* offset represented by lr_blkptr */
+ blkptr_t lr_blkptr; /* spa block pointer for replay */
+ /* write data will follow for small writes */
+} lr_write_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* object id of file to truncate */
+ uint64_t lr_offset; /* offset to truncate from */
+ uint64_t lr_length; /* length to truncate */
+} lr_truncate_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* file object to change attributes */
+ uint64_t lr_mask; /* mask of attributes to set */
+ uint64_t lr_mode; /* mode to set */
+ uint64_t lr_uid; /* uid to set */
+ uint64_t lr_gid; /* gid to set */
+ uint64_t lr_size; /* size to set */
+ uint64_t lr_atime[2]; /* access time */
+ uint64_t lr_mtime[2]; /* modification time */
+} lr_setattr_t;
+
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* obj id of file */
+ uint64_t lr_aclcnt; /* number of acl entries */
+ /* lr_aclcnt number of ace_t entries follow this */
+} lr_acl_t;
+
+/*
+ * ZIL structure definitions, interface function prototype and globals.
+ */
+
+/*
+ * ZFS intent log transaction structure
+ */
+typedef enum {
+ WR_INDIRECT, /* indirect - a large write (dmu_sync() data */
+ /* and put blkptr in log, rather than actual data) */
+ WR_COPIED, /* immediate - data is copied into lr_write_t */
+ WR_NEED_COPY, /* immediate - data needs to be copied if pushed */
+} itx_wr_state_t;
+
+typedef struct itx {
+ list_node_t itx_node; /* linkage on zl_itx_list */
+ void *itx_private; /* type-specific opaque data */
+ itx_wr_state_t itx_wr_state; /* write state */
+ uint8_t itx_sync; /* synchronous transaction */
+ lr_t itx_lr; /* common part of log record */
+ /* followed by type-specific part of lr_xx_t and its immediate data */
+} itx_t;
+
+
+/*
+ * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done()
+ * to handle the cleanup of the dmu_sync() buffer write
+ */
+typedef struct {
+ zilog_t *zgd_zilog; /* zilog */
+ blkptr_t *zgd_bp; /* block pointer */
+ struct rl *zgd_rl; /* range lock */
+} zgd_t;
+
+
+typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
+ uint64_t txg);
+typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
+ uint64_t txg);
+typedef int zil_replay_func_t();
+typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
+
+extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
+
+extern void zil_init(void);
+extern void zil_fini(void);
+
+extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys);
+extern void zil_free(zilog_t *zilog);
+
+extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
+extern void zil_close(zilog_t *zilog);
+
+extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp,
+ zil_replay_func_t *replay_func[TX_MAX_TYPE]);
+extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
+
+extern itx_t *zil_itx_create(int txtype, size_t lrsize);
+extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
+
+extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
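+
+/*
+ * Logging sketch (editor's illustration, modeled on the zfs_log_*
+ * callers; "zilog", "tx", "dzp", "name" and "namesize" (which counts
+ * the terminating NUL) are hypothetical):
+ *
+ *	itx_t *itx;
+ *	lr_remove_t *lr;
+ *	uint64_t seq;
+ *
+ *	itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
+ *	lr = (lr_remove_t *)&itx->itx_lr;
+ *	lr->lr_doid = dzp->z_id;
+ *	bcopy(name, (char *)(lr + 1), namesize);
+ *	seq = zil_itx_assign(zilog, itx, tx);
+ *	zil_commit(zilog, seq, dzp->z_id);
+ */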
+
+extern int zil_claim(char *osname, void *txarg);
+extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
+extern void zil_clean(zilog_t *zilog);
+extern int zil_is_committed(zilog_t *zilog);
+
+extern int zil_suspend(zilog_t *zilog);
+extern void zil_resume(zilog_t *zilog);
+
+extern void zil_add_vdev(zilog_t *zilog, uint64_t vdev);
+
+extern int zil_disable;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
new file mode 100644
index 0000000..3ecf4e4
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
@@ -0,0 +1,111 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIL_IMPL_H
+#define _SYS_ZIL_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zil.h>
+#include <sys/dmu_objset.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Log write buffer.
+ */
+typedef struct lwb {
+ zilog_t *lwb_zilog; /* back pointer to log struct */
+ blkptr_t lwb_blk; /* on disk address of this log blk */
+ int lwb_nused; /* # used bytes in buffer */
+ int lwb_sz; /* size of block and buffer */
+ char *lwb_buf; /* log write buffer */
+ zio_t *lwb_zio; /* zio for this buffer */
+ uint64_t lwb_max_txg; /* highest txg in this lwb */
+ txg_handle_t lwb_txgh; /* txg handle for txg_exit() */
+ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
+} lwb_t;
+
+/*
+ * Vdev flushing: We use a bit map of ZIL_VDEV_BMSZ bytes.
+ * Any vdev numbers beyond that use a linked list of zil_vdev_t structures.
+ */
+
+#define ZIL_VDEV_BMSZ 16 /* 16 * 8 = 128 vdevs */
+typedef struct zil_vdev {
+ uint64_t vdev; /* device written */
+ list_node_t vdev_seq_node; /* zilog->zl_vdev_list linkage */
+} zil_vdev_t;
+
+/*
+ * Stable storage intent log management structure. One per dataset.
+ */
+struct zilog {
+ kmutex_t zl_lock; /* protects most zilog_t fields */
+ struct dsl_pool *zl_dmu_pool; /* DSL pool */
+ spa_t *zl_spa; /* handle for read/write log */
+ const zil_header_t *zl_header; /* log header buffer */
+ objset_t *zl_os; /* object set we're logging */
+ zil_get_data_t *zl_get_data; /* callback to get object content */
+ zio_t *zl_root_zio; /* log writer root zio */
+ uint64_t zl_itx_seq; /* next itx sequence number */
+ uint64_t zl_commit_seq; /* committed up to this number */
+ uint64_t zl_lr_seq; /* log record sequence number */
+ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
+ uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */
+ uint32_t zl_suspend; /* log suspend count */
+ kcondvar_t zl_cv_writer; /* log writer thread completion */
+ kcondvar_t zl_cv_suspend; /* log suspend completion */
+ uint8_t zl_suspending; /* log is currently suspending */
+ uint8_t zl_keep_first; /* keep first log block in destroy */
+ uint8_t zl_stop_replay; /* don't replay any further */
+ uint8_t zl_stop_sync; /* for debugging */
+ uint8_t zl_writer; /* boolean: write setup in progress */
+ uint8_t zl_log_error; /* boolean: log write error */
+ list_t zl_itx_list; /* in-memory itx list */
+ uint64_t zl_itx_list_sz; /* total size of records on list */
+ uint64_t zl_cur_used; /* current commit log size used */
+ uint64_t zl_prev_used; /* previous commit log size used */
+ list_t zl_lwb_list; /* in-flight log write list */
+ list_t zl_vdev_list; /* list of [vdev, seq] pairs */
+ uint8_t zl_vdev_bmap[ZIL_VDEV_BMSZ]; /* bitmap of vdevs */
+ taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */
+ avl_tree_t zl_dva_tree; /* track DVAs during log parse */
+ clock_t zl_replay_time; /* lbolt of when replay started */
+ uint64_t zl_replay_blks; /* number of log blocks replayed */
+};
+
+typedef struct zil_dva_node {
+ dva_t zn_dva;
+ avl_node_t zn_node;
+} zil_dva_node_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIL_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
new file mode 100644
index 0000000..b026ae6
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -0,0 +1,366 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZIO_H
+#define _ZIO_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/dkio.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio_impl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data block tail */
+
+typedef struct zio_block_tail {
+ uint64_t zbt_magic; /* for validation, endianness */
+ zio_cksum_t zbt_cksum; /* 256-bit checksum */
+} zio_block_tail_t;
+
+/*
+ * Gang block headers are self-checksumming and contain an array
+ * of block pointers.
+ */
+#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
+#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
+#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
+ sizeof (zio_block_tail_t) - \
+ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
+ sizeof (uint64_t))
+
+#define ZIO_GET_IOSIZE(zio) \
+ (BP_IS_GANG((zio)->io_bp) ? \
+ SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp))
+
+typedef struct zio_gbh {
+ blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
+ uint64_t zg_filler[SPA_GBH_FILLER];
+ zio_block_tail_t zg_tail;
+} zio_gbh_phys_t;
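+
+/*
+ * Illustrative note (added here; not in the original header): zg_filler is
+ * sized so that the gang header exactly fills one SPA_GANGBLOCKSIZE block.
+ * A hypothetical compile-time check of that invariant would be:
+ *
+ *	CTASSERT(sizeof (zio_gbh_phys_t) == SPA_GANGBLOCKSIZE);
+ */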
+
+enum zio_checksum {
+ ZIO_CHECKSUM_INHERIT = 0,
+ ZIO_CHECKSUM_ON,
+ ZIO_CHECKSUM_OFF,
+ ZIO_CHECKSUM_LABEL,
+ ZIO_CHECKSUM_GANG_HEADER,
+ ZIO_CHECKSUM_ZILOG,
+ ZIO_CHECKSUM_FLETCHER_2,
+ ZIO_CHECKSUM_FLETCHER_4,
+ ZIO_CHECKSUM_SHA256,
+ ZIO_CHECKSUM_FUNCTIONS
+};
+
+#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2
+#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
+
+enum zio_compress {
+ ZIO_COMPRESS_INHERIT = 0,
+ ZIO_COMPRESS_ON,
+ ZIO_COMPRESS_OFF,
+ ZIO_COMPRESS_LZJB,
+ ZIO_COMPRESS_EMPTY,
+ ZIO_COMPRESS_GZIP_1,
+ ZIO_COMPRESS_GZIP_2,
+ ZIO_COMPRESS_GZIP_3,
+ ZIO_COMPRESS_GZIP_4,
+ ZIO_COMPRESS_GZIP_5,
+ ZIO_COMPRESS_GZIP_6,
+ ZIO_COMPRESS_GZIP_7,
+ ZIO_COMPRESS_GZIP_8,
+ ZIO_COMPRESS_GZIP_9,
+ ZIO_COMPRESS_FUNCTIONS
+};
+
+#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
+#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+
+#define ZIO_PRIORITY_NOW (zio_priority_table[0])
+#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1])
+#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2])
+#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3])
+#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4])
+#define ZIO_PRIORITY_FREE (zio_priority_table[5])
+#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6])
+#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7])
+#define ZIO_PRIORITY_RESILVER (zio_priority_table[8])
+#define ZIO_PRIORITY_SCRUB (zio_priority_table[9])
+#define ZIO_PRIORITY_TABLE_SIZE 10
+
+#define ZIO_FLAG_MUSTSUCCEED 0x00000
+#define ZIO_FLAG_CANFAIL 0x00001
+#define ZIO_FLAG_FAILFAST 0x00002
+#define ZIO_FLAG_CONFIG_HELD 0x00004
+#define ZIO_FLAG_CONFIG_GRABBED 0x00008
+
+#define ZIO_FLAG_DONT_CACHE 0x00010
+#define ZIO_FLAG_DONT_QUEUE 0x00020
+#define ZIO_FLAG_DONT_PROPAGATE 0x00040
+#define ZIO_FLAG_DONT_RETRY 0x00080
+
+#define ZIO_FLAG_PHYSICAL 0x00100
+#define ZIO_FLAG_IO_BYPASS 0x00200
+#define ZIO_FLAG_IO_REPAIR 0x00400
+#define ZIO_FLAG_SPECULATIVE 0x00800
+
+#define ZIO_FLAG_RESILVER 0x01000
+#define ZIO_FLAG_SCRUB 0x02000
+#define ZIO_FLAG_SCRUB_THREAD 0x04000
+#define ZIO_FLAG_SUBBLOCK 0x08000
+
+#define ZIO_FLAG_NOBOOKMARK 0x10000
+#define ZIO_FLAG_USER 0x20000
+
+#define ZIO_FLAG_METADATA 0x40000
+
+#define ZIO_FLAG_GANG_INHERIT \
+ (ZIO_FLAG_CANFAIL | \
+ ZIO_FLAG_FAILFAST | \
+ ZIO_FLAG_CONFIG_HELD | \
+ ZIO_FLAG_DONT_RETRY | \
+ ZIO_FLAG_IO_REPAIR | \
+ ZIO_FLAG_SPECULATIVE | \
+ ZIO_FLAG_RESILVER | \
+ ZIO_FLAG_SCRUB | \
+ ZIO_FLAG_SCRUB_THREAD)
+
+#define ZIO_FLAG_VDEV_INHERIT \
+ (ZIO_FLAG_GANG_INHERIT | \
+ ZIO_FLAG_DONT_CACHE | \
+ ZIO_FLAG_PHYSICAL)
+
+/*
+ * We'll take the EILSEQ (Illegal byte sequence) errno
+ * to indicate checksum errors.
+ */
+#define ECKSUM EILSEQ
+
+typedef struct zio zio_t;
+typedef void zio_done_func_t(zio_t *zio);
+
+extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
+extern char *zio_type_name[ZIO_TYPES];
+
+/*
+ * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
+ * identifies any block in the pool. By convention, the meta-objset (MOS)
+ * is objset 0, the meta-dnode is object 0, the root block (objset_phys_t) is
+ * level -1 of the meta-dnode, and intent log blocks (which are chained
+ * off the root block) have blkid == sequence number. In summary:
+ *
+ * mos is objset 0
+ * meta-dnode is object 0
+ * root block is <objset, 0, -1, 0>
+ * intent log is <objset, 0, -1, ZIL sequence number>
+ *
+ * Note: this structure is called a bookmark because its first purpose was
+ * to remember where to resume a pool-wide traverse. The absolute ordering
+ * for block visitation during traversal is defined in compare_bookmark().
+ *
+ * Note: this structure is passed between userland and the kernel.
+ * Therefore it must not change size or alignment between 32/64 bit
+ * compilation options.
+ */
+typedef struct zbookmark {
+ uint64_t zb_objset;
+ uint64_t zb_object;
+ int64_t zb_level;
+ uint64_t zb_blkid;
+} zbookmark_t;
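+
+/*
+ * Example (added for illustration; not in the original header): under the
+ * convention above, the root block of objset 42 would be named by
+ *
+ *	zbookmark_t zb = { 42, 0, -1, 0 };
+ *
+ * i.e. <objset 42, object 0 (meta-dnode), level -1, blkid 0>.
+ */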
+
+struct zio {
+ /* Core information about this I/O */
+ zio_t *io_parent;
+ zio_t *io_root;
+ spa_t *io_spa;
+ zbookmark_t io_bookmark;
+ enum zio_checksum io_checksum;
+ enum zio_compress io_compress;
+ int io_ndvas;
+ uint64_t io_txg;
+ blkptr_t *io_bp;
+ blkptr_t io_bp_copy;
+ zio_t *io_child;
+ zio_t *io_sibling_prev;
+ zio_t *io_sibling_next;
+ zio_transform_t *io_transform_stack;
+ zio_t *io_logical;
+
+ /* Callback info */
+ zio_done_func_t *io_ready;
+ zio_done_func_t *io_done;
+ void *io_private;
+ blkptr_t io_bp_orig;
+
+ /* Data represented by this I/O */
+ void *io_data;
+ uint64_t io_size;
+
+ /* Stuff for the vdev stack */
+ vdev_t *io_vd;
+ void *io_vsd;
+ uint64_t io_offset;
+ uint64_t io_deadline;
+ uint64_t io_timestamp;
+ avl_node_t io_offset_node;
+ avl_node_t io_deadline_node;
+ avl_tree_t *io_vdev_tree;
+ zio_t *io_delegate_list;
+ zio_t *io_delegate_next;
+
+ /* Internal pipeline state */
+ int io_flags;
+ enum zio_type io_type;
+ enum zio_stage io_stage;
+ uint8_t io_stalled;
+ uint8_t io_priority;
+ struct dk_callback io_dk_callback;
+ int io_cmd;
+ int io_retries;
+ int io_error;
+ uint32_t io_numerrors;
+ uint32_t io_pipeline;
+ uint32_t io_async_stages;
+ uint64_t io_children_notready;
+ uint64_t io_children_notdone;
+ void *io_waiter;
+ kmutex_t io_lock;
+ kcondvar_t io_cv;
+
+ /* FMA state */
+ uint64_t io_ena;
+};
+
+extern zio_t *zio_null(zio_t *pio, spa_t *spa,
+ zio_done_func_t *done, void *private, int flags);
+
+extern zio_t *zio_root(spa_t *spa,
+ zio_done_func_t *done, void *private, int flags);
+
+extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
+ uint64_t size, zio_done_func_t *done, void *private,
+ int priority, int flags, zbookmark_t *zb);
+
+extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+ int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
+ int flags, zbookmark_t *zb);
+
+extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags,
+ zbookmark_t *zb);
+
+extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private);
+
+extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private);
+
+extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *data, int checksum,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *data, int checksum,
+ zio_done_func_t *done, void *private, int priority, int flags);
+
+extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp,
+ blkptr_t *old_bp, uint64_t txg);
+extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);
+
+extern int zio_wait(zio_t *zio);
+extern void zio_nowait(zio_t *zio);
+
+extern void *zio_buf_alloc(size_t size);
+extern void zio_buf_free(void *buf, size_t size);
+extern void *zio_data_buf_alloc(size_t size);
+extern void zio_data_buf_free(void *buf, size_t size);
+
+/*
+ * Move an I/O to the next stage of the pipeline and execute that stage.
+ * There's no locking on io_stage because there's no legitimate way for
+ * multiple threads to be attempting to process the same I/O.
+ */
+extern void zio_next_stage(zio_t *zio);
+extern void zio_next_stage_async(zio_t *zio);
+extern void zio_wait_children_done(zio_t *zio);
+
+/*
+ * Delegate I/O to a child vdev.
+ */
+extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
+ uint64_t offset, void *data, uint64_t size, int type, int priority,
+ int flags, zio_done_func_t *done, void *private);
+
+extern void zio_vdev_io_bypass(zio_t *zio);
+extern void zio_vdev_io_reissue(zio_t *zio);
+extern void zio_vdev_io_redone(zio_t *zio);
+
+extern void zio_checksum_verified(zio_t *zio);
+extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp);
+
+extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
+extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
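+
+/*
+ * Illustrative sketch (added here; not in the original header) of how the
+ * select helpers above are expected to resolve property inheritance:
+ *
+ *	zio_checksum_select(ZIO_CHECKSUM_INHERIT, parent) -> parent
+ *	zio_checksum_select(ZIO_CHECKSUM_ON, parent) -> ZIO_CHECKSUM_ON_VALUE
+ *	zio_checksum_select(child, parent) -> child, otherwise
+ *
+ * zio_compress_select() presumably behaves analogously with
+ * ZIO_COMPRESS_ON_VALUE.
+ */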
+
+boolean_t zio_should_retry(zio_t *zio);
+
+/*
+ * Initial setup and teardown.
+ */
+extern void zio_init(void);
+extern void zio_fini(void);
+
+/*
+ * Fault injection
+ */
+struct zinject_record;
+extern uint32_t zio_injection_enabled;
+extern int zio_inject_fault(char *name, int flags, int *id,
+ struct zinject_record *record);
+extern int zio_inject_list_next(int *id, char *name, size_t buflen,
+ struct zinject_record *record);
+extern int zio_clear_fault(int id);
+extern int zio_handle_fault_injection(zio_t *zio, int error);
+extern int zio_handle_device_injection(vdev_t *vd, int error);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
new file mode 100644
index 0000000..bb7bd41
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIO_CHECKSUM_H
+#define _SYS_ZIO_CHECKSUM_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Signature for checksum functions.
+ */
+typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
+
+/*
+ * Information about each checksum function.
+ */
+typedef struct zio_checksum_info {
+ zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
+ int ci_correctable; /* number of correctable bits */
+ int ci_zbt; /* uses zio block tail? */
+ char *ci_name; /* descriptive name */
+} zio_checksum_info_t;
+
+extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
+
+/*
+ * Checksum routines.
+ */
+extern zio_checksum_t fletcher_2_native;
+extern zio_checksum_t fletcher_4_native;
+extern zio_checksum_t fletcher_4_incremental_native;
+
+extern zio_checksum_t fletcher_2_byteswap;
+extern zio_checksum_t fletcher_4_byteswap;
+extern zio_checksum_t fletcher_4_incremental_byteswap;
+
+extern zio_checksum_t zio_checksum_SHA256;
+
+extern void zio_checksum(uint_t checksum, zio_cksum_t *zcp,
+ void *data, uint64_t size);
+extern int zio_checksum_error(zio_t *zio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIO_CHECKSUM_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
new file mode 100644
index 0000000..66ee8d4
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
@@ -0,0 +1,82 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIO_COMPRESS_H
+#define _SYS_ZIO_COMPRESS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Common signature for all zio compress/decompress functions.
+ */
+typedef size_t zio_compress_func_t(void *src, void *dst,
+ size_t s_len, size_t d_len, int);
+typedef int zio_decompress_func_t(void *src, void *dst,
+ size_t s_len, size_t d_len, int);
+
+/*
+ * Information about each compression function.
+ */
+typedef struct zio_compress_info {
+ zio_compress_func_t *ci_compress; /* compression function */
+ zio_decompress_func_t *ci_decompress; /* decompression function */
+ int ci_level; /* level parameter */
+ char *ci_name; /* algorithm name */
+} zio_compress_info_t;
+
+extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS];
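+
+/*
+ * Illustrative sketch (added here; not in the original header): a caller
+ * would presumably dispatch through the table above roughly as follows,
+ * passing the table's own level parameter to the algorithm:
+ *
+ *	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ *	size_t c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
+ */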
+
+/*
+ * Compression routines.
+ */
+extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+
+/*
+ * Compress and decompress data if necessary.
+ */
+extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize,
+ void **destp, uint64_t *destsizep, uint64_t *destbufsizep);
+extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
+ void *dest, uint64_t destsize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZIO_COMPRESS_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
new file mode 100644
index 0000000..d2ddbc3
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
@@ -0,0 +1,205 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZIO_IMPL_H
+#define _ZIO_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * I/O Groups: pipeline stage definitions.
+ */
+
+typedef enum zio_stage {
+ ZIO_STAGE_OPEN = 0, /* RWFCI */
+ ZIO_STAGE_WAIT_CHILDREN_READY, /* RWFCI */
+
+ ZIO_STAGE_WRITE_COMPRESS, /* -W--- */
+ ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */
+
+ ZIO_STAGE_GANG_PIPELINE, /* -WFC- */
+
+ ZIO_STAGE_GET_GANG_HEADER, /* -WFC- */
+ ZIO_STAGE_REWRITE_GANG_MEMBERS, /* -W--- */
+ ZIO_STAGE_FREE_GANG_MEMBERS, /* --F-- */
+ ZIO_STAGE_CLAIM_GANG_MEMBERS, /* ---C- */
+
+ ZIO_STAGE_DVA_ALLOCATE, /* -W--- */
+ ZIO_STAGE_DVA_FREE, /* --F-- */
+ ZIO_STAGE_DVA_CLAIM, /* ---C- */
+
+ ZIO_STAGE_GANG_CHECKSUM_GENERATE, /* -W--- */
+
+ ZIO_STAGE_READY, /* RWFCI */
+
+ ZIO_STAGE_VDEV_IO_START, /* RW--I */
+ ZIO_STAGE_VDEV_IO_DONE, /* RW--I */
+ ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */
+
+ ZIO_STAGE_WAIT_CHILDREN_DONE, /* RWFCI */
+
+ ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */
+ ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */
+ ZIO_STAGE_READ_DECOMPRESS, /* R---- */
+
+ ZIO_STAGE_DONE /* RWFCI */
+} zio_stage_t;
+
+/*
+ * The stages for which there's some performance value in going async.
+ * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well.
+ */
+#define ZIO_ASYNC_PIPELINE_STAGES \
+ ((1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
+ (1U << ZIO_STAGE_VDEV_IO_DONE) | \
+ (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
+ (1U << ZIO_STAGE_READ_DECOMPRESS))
+
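+/*
+ * Illustrative sketch (added here; not in the original header): per the
+ * comment above, a compressed write would presumably set up its stages as
+ *
+ *	io_async_stages = ZIO_ASYNC_PIPELINE_STAGES |
+ *	    (1U << ZIO_STAGE_WRITE_COMPRESS);
+ */
+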
+#define ZIO_VDEV_IO_PIPELINE \
+ ((1U << ZIO_STAGE_VDEV_IO_START) | \
+ (1U << ZIO_STAGE_VDEV_IO_DONE) | \
+ (1U << ZIO_STAGE_VDEV_IO_ASSESS))
+
+#define ZIO_READ_PHYS_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_READY) | \
+ ZIO_VDEV_IO_PIPELINE | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_READ_PIPELINE \
+ ZIO_READ_PHYS_PIPELINE
+
+#define ZIO_WRITE_PHYS_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
+ (1U << ZIO_STAGE_READY) | \
+ ZIO_VDEV_IO_PIPELINE | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_WRITE_COMMON_PIPELINE \
+ ZIO_WRITE_PHYS_PIPELINE
+
+#define ZIO_WRITE_PIPELINE \
+ ((1U << ZIO_STAGE_WRITE_COMPRESS) | \
+ ZIO_WRITE_COMMON_PIPELINE)
+
+#define ZIO_GANG_STAGES \
+ ((1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
+ (1U << ZIO_STAGE_READ_GANG_MEMBERS))
+
+#define ZIO_REWRITE_PIPELINE \
+ ((1U << ZIO_STAGE_GANG_PIPELINE) | \
+ (1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
+ ZIO_WRITE_COMMON_PIPELINE)
+
+#define ZIO_WRITE_ALLOCATE_PIPELINE \
+ ((1U << ZIO_STAGE_DVA_ALLOCATE) | \
+ ZIO_WRITE_COMMON_PIPELINE)
+
+#define ZIO_GANG_FREE_STAGES \
+ ((1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_FREE_GANG_MEMBERS))
+
+#define ZIO_FREE_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_GANG_PIPELINE) | \
+ (1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_DVA_FREE) | \
+ (1U << ZIO_STAGE_READY) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_CLAIM_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_GANG_PIPELINE) | \
+ (1U << ZIO_STAGE_GET_GANG_HEADER) | \
+ (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
+ (1U << ZIO_STAGE_DVA_CLAIM) | \
+ (1U << ZIO_STAGE_READY) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_IOCTL_PIPELINE \
+ ((1U << ZIO_STAGE_OPEN) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_READY) | \
+ ZIO_VDEV_IO_PIPELINE | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_WAIT_FOR_CHILDREN_PIPELINE \
+ ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
+ (1U << ZIO_STAGE_READY) | \
+ (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \
+ ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
+ (1U << ZIO_STAGE_DONE))
+
+#define ZIO_VDEV_CHILD_PIPELINE \
+ (ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE | \
+ ZIO_VDEV_IO_PIPELINE)
+
+#define ZIO_ERROR_PIPELINE_MASK \
+ ZIO_WAIT_FOR_CHILDREN_PIPELINE
+
+typedef struct zio_transform zio_transform_t;
+struct zio_transform {
+ void *zt_data;
+ uint64_t zt_size;
+ uint64_t zt_bufsize;
+ zio_transform_t *zt_next;
+};
+
+extern void zio_inject_init(void);
+extern void zio_inject_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZIO_IMPL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
new file mode 100644
index 0000000..df85824
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
@@ -0,0 +1,68 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZVOL_H
+#define _SYS_ZVOL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
+extern int zvol_check_volblocksize(uint64_t volblocksize);
+extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
+extern void zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx);
+extern int zvol_create_minor(const char *, dev_t);
+extern int zvol_remove_minor(const char *);
+extern int zvol_set_volsize(const char *, dev_t, uint64_t);
+extern int zvol_set_volblocksize(const char *, uint64_t);
+
+extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
+extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
+#ifndef __FreeBSD__
+extern int zvol_strategy(buf_t *bp);
+extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr);
+extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
+extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
+extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
+#endif
+extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
+ int *rvalp);
+extern int zvol_busy(void);
+extern void zvol_init(void);
+extern void zvol_fini(void);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZVOL_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
new file mode 100644
index 0000000..844beb6
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
@@ -0,0 +1,611 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/txg_impl.h>
+#include <sys/dmu_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/callb.h>
+
+/*
+ * Pool-wide transaction groups.
+ */
+
+static void txg_sync_thread(void *arg);
+static void txg_quiesce_thread(void *arg);
+static void txg_timelimit_thread(void *arg);
+
+int txg_time = 5; /* max 5 seconds worth of delta per txg */
+
+/*
+ * Prepare the txg subsystem.
+ */
+void
+txg_init(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int c, i;
+ bzero(tx, sizeof (tx_state_t));
+
+ tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
+ for (c = 0; c < max_ncpus; c++) {
+ mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
+ for (i = 0; i < TXG_SIZE; i++)
+ cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL);
+ }
+
+ rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL);
+ mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_timeout_exit_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);
+
+ tx->tx_open_txg = txg;
+}
+
+/*
+ * Close down the txg subsystem.
+ */
+void
+txg_fini(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int c, i;
+
+ ASSERT(tx->tx_threads == 0);
+
+ cv_destroy(&tx->tx_exit_cv);
+ cv_destroy(&tx->tx_timeout_exit_cv);
+ cv_destroy(&tx->tx_quiesce_done_cv);
+ cv_destroy(&tx->tx_quiesce_more_cv);
+ cv_destroy(&tx->tx_sync_done_cv);
+ cv_destroy(&tx->tx_sync_more_cv);
+ rw_destroy(&tx->tx_suspend);
+ mutex_destroy(&tx->tx_sync_lock);
+
+ for (c = 0; c < max_ncpus; c++) {
+ for (i = 0; i < TXG_SIZE; i++)
+ cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
+ mutex_destroy(&tx->tx_cpu[c].tc_lock);
+ }
+
+ kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
+
+ bzero(tx, sizeof (tx_state_t));
+}
+
+/*
+ * Start syncing transaction groups.
+ */
+void
+txg_sync_start(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ mutex_enter(&tx->tx_sync_lock);
+
+ dprintf("pool %p\n", dp);
+
+ ASSERT(tx->tx_threads == 0);
+
+ tx->tx_threads = 3;
+
+ tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
+ dp, 0, &p0, TS_RUN, minclsyspri);
+
+ tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
+ dp, 0, &p0, TS_RUN, minclsyspri);
+
+ tx->tx_timelimit_thread = thread_create(NULL, 0, txg_timelimit_thread,
+ dp, 0, &p0, TS_RUN, minclsyspri);
+
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+static void
+txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
+{
+ CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
+ mutex_enter(&tx->tx_sync_lock);
+}
+
+static void
+txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
+{
+ ASSERT(*tpp != NULL);
+ *tpp = NULL;
+ tx->tx_threads--;
+ cv_broadcast(&tx->tx_exit_cv);
+ CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */
+ thread_exit();
+}
+
+static void
+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, int secmax)
+{
+ CALLB_CPR_SAFE_BEGIN(cpr);
+
+ if (secmax)
+ (void) cv_timedwait(cv, &tx->tx_sync_lock, secmax * hz);
+ else
+ cv_wait(cv, &tx->tx_sync_lock);
+
+ CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
+}
+
+/*
+ * Stop syncing transaction groups.
+ */
+void
+txg_sync_stop(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ dprintf("pool %p\n", dp);
+ /*
+ * Finish off any work in progress.
+ */
+ ASSERT(tx->tx_threads == 3);
+ txg_wait_synced(dp, 0);
+
+ /*
+ * Wake all three txg threads (quiesce, sync, and timelimit) and
+ * wait for them to exit.
+ */
+ mutex_enter(&tx->tx_sync_lock);
+
+ ASSERT(tx->tx_threads == 3);
+
+ tx->tx_exiting = 1;
+
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ cv_broadcast(&tx->tx_quiesce_done_cv);
+ cv_broadcast(&tx->tx_sync_more_cv);
+ cv_broadcast(&tx->tx_timeout_exit_cv);
+
+ while (tx->tx_threads != 0)
+ cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
+
+ tx->tx_exiting = 0;
+
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+uint64_t
+txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
+ uint64_t txg;
+
+ mutex_enter(&tc->tc_lock);
+
+ txg = tx->tx_open_txg;
+ tc->tc_count[txg & TXG_MASK]++;
+
+ th->th_cpu = tc;
+ th->th_txg = txg;
+
+ return (txg);
+}
+
+void
+txg_rele_to_quiesce(txg_handle_t *th)
+{
+ tx_cpu_t *tc = th->th_cpu;
+
+ mutex_exit(&tc->tc_lock);
+}
+
+void
+txg_rele_to_sync(txg_handle_t *th)
+{
+ tx_cpu_t *tc = th->th_cpu;
+ int g = th->th_txg & TXG_MASK;
+
+ mutex_enter(&tc->tc_lock);
+ ASSERT(tc->tc_count[g] != 0);
+ if (--tc->tc_count[g] == 0)
+ cv_broadcast(&tc->tc_cv[g]);
+ mutex_exit(&tc->tc_lock);
+
+ th->th_cpu = NULL; /* defensive */
+}
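+
+/*
+ * Illustrative usage (added here; not in the original file): a transaction
+ * would presumably bracket its lifetime with the handle as follows:
+ *
+ *	txg_handle_t th;
+ *	uint64_t txg = txg_hold_open(dp, &th);	(joins the open txg)
+ *	txg_rele_to_quiesce(&th);		(drops the per-CPU lock)
+ *	... dirty data in txg ...
+ *	txg_rele_to_sync(&th);			(lets the txg quiesce)
+ */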
+
+static void
+txg_quiesce(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int g = txg & TXG_MASK;
+ int c;
+
+ /*
+ * Grab all tx_cpu locks so nobody else can get into this txg.
+ */
+ for (c = 0; c < max_ncpus; c++)
+ mutex_enter(&tx->tx_cpu[c].tc_lock);
+
+ ASSERT(txg == tx->tx_open_txg);
+ tx->tx_open_txg++;
+
+ /*
+ * Now that we've incremented tx_open_txg, we can let threads
+ * enter the next transaction group.
+ */
+ for (c = 0; c < max_ncpus; c++)
+ mutex_exit(&tx->tx_cpu[c].tc_lock);
+
+ /*
+ * Quiesce the transaction group by waiting for everyone to txg_exit().
+ */
+ for (c = 0; c < max_ncpus; c++) {
+ tx_cpu_t *tc = &tx->tx_cpu[c];
+ mutex_enter(&tc->tc_lock);
+ while (tc->tc_count[g] != 0)
+ cv_wait(&tc->tc_cv[g], &tc->tc_lock);
+ mutex_exit(&tc->tc_lock);
+ }
+}
+
+static void
+txg_sync_thread(void *arg)
+{
+ dsl_pool_t *dp = arg;
+ tx_state_t *tx = &dp->dp_tx;
+ callb_cpr_t cpr;
+
+ txg_thread_enter(tx, &cpr);
+
+ for (;;) {
+ uint64_t txg;
+
+ /*
+ * We sync when there's someone waiting on us, or the
+ * quiesce thread has handed off a txg to us.
+ */
+ while (!tx->tx_exiting &&
+ tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
+ tx->tx_quiesced_txg == 0) {
+ dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
+ tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+ txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, 0);
+ }
+
+ /*
+ * Wait until the quiesce thread hands off a txg to us,
+ * prompting it to do so if necessary.
+ */
+ while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
+ if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
+ tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
+ }
+
+ if (tx->tx_exiting)
+ txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
+
+ rw_enter(&tx->tx_suspend, RW_WRITER);
+
+ /*
+ * Consume the quiesced txg which has been handed off to
+ * us. This may cause the quiescing thread to now be
+ * able to quiesce another txg, so we must signal it.
+ */
+ txg = tx->tx_quiesced_txg;
+ tx->tx_quiesced_txg = 0;
+ tx->tx_syncing_txg = txg;
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ rw_exit(&tx->tx_suspend);
+
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting,
+ tx->tx_sync_txg_waiting);
+ mutex_exit(&tx->tx_sync_lock);
+ spa_sync(dp->dp_spa, txg);
+ mutex_enter(&tx->tx_sync_lock);
+ rw_enter(&tx->tx_suspend, RW_WRITER);
+ tx->tx_synced_txg = txg;
+ tx->tx_syncing_txg = 0;
+ rw_exit(&tx->tx_suspend);
+ cv_broadcast(&tx->tx_sync_done_cv);
+ }
+}
+
+static void
+txg_quiesce_thread(void *arg)
+{
+ dsl_pool_t *dp = arg;
+ tx_state_t *tx = &dp->dp_tx;
+ callb_cpr_t cpr;
+
+ txg_thread_enter(tx, &cpr);
+
+ for (;;) {
+ uint64_t txg;
+
+ /*
+ * We quiesce when there's someone waiting on us.
+ * However, we can only have one txg in "quiescing" or
+ * "quiesced, waiting to sync" state. So we wait until
+ * the "quiesced, waiting to sync" txg has been consumed
+ * by the sync thread.
+ */
+ while (!tx->tx_exiting &&
+ (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
+ tx->tx_quiesced_txg != 0))
+ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
+
+ if (tx->tx_exiting)
+ txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
+
+ txg = tx->tx_open_txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting,
+ tx->tx_sync_txg_waiting);
+ mutex_exit(&tx->tx_sync_lock);
+ txg_quiesce(dp, txg);
+ mutex_enter(&tx->tx_sync_lock);
+
+ /*
+ * Hand this txg off to the sync thread.
+ */
+ dprintf("quiesce done, handing off txg %llu\n", txg);
+ tx->tx_quiesced_txg = txg;
+ cv_broadcast(&tx->tx_sync_more_cv);
+ cv_broadcast(&tx->tx_quiesce_done_cv);
+ }
+}
+
+void
+txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ mutex_enter(&tx->tx_sync_lock);
+ ASSERT(tx->tx_threads == 3);
+ if (txg == 0)
+ txg = tx->tx_open_txg;
+ if (tx->tx_sync_txg_waiting < txg)
+ tx->tx_sync_txg_waiting = txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+ while (tx->tx_synced_txg < txg) {
+ dprintf("broadcasting sync more "
+ "tx_synced=%llu waiting=%llu dp=%p\n",
+ tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+ cv_broadcast(&tx->tx_sync_more_cv);
+ cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
+ }
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+void
+txg_wait_open(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ mutex_enter(&tx->tx_sync_lock);
+ ASSERT(tx->tx_threads == 3);
+ if (txg == 0)
+ txg = tx->tx_open_txg + 1;
+ if (tx->tx_quiesce_txg_waiting < txg)
+ tx->tx_quiesce_txg_waiting = txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+ while (tx->tx_open_txg < txg) {
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
+ }
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+static void
+txg_timelimit_thread(void *arg)
+{
+ dsl_pool_t *dp = arg;
+ tx_state_t *tx = &dp->dp_tx;
+ callb_cpr_t cpr;
+
+ txg_thread_enter(tx, &cpr);
+
+ while (!tx->tx_exiting) {
+ uint64_t txg = tx->tx_open_txg + 1;
+
+ txg_thread_wait(tx, &cpr, &tx->tx_timeout_exit_cv, txg_time);
+
+ if (tx->tx_quiesce_txg_waiting < txg)
+ tx->tx_quiesce_txg_waiting = txg;
+
+ while (!tx->tx_exiting && tx->tx_open_txg < txg) {
+ dprintf("pushing out %llu\n", txg);
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
+ }
+ }
+ txg_thread_exit(tx, &cpr, &tx->tx_timelimit_thread);
+}
+
+int
+txg_stalled(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
+}
+
+void
+txg_suspend(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ /* XXX some code paths suspend when they are already suspended! */
+ rw_enter(&tx->tx_suspend, RW_READER);
+}
+
+void
+txg_resume(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ rw_exit(&tx->tx_suspend);
+}
+
+/*
+ * Per-txg object lists.
+ */
+void
+txg_list_create(txg_list_t *tl, size_t offset)
+{
+ int t;
+
+ mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ tl->tl_offset = offset;
+
+ for (t = 0; t < TXG_SIZE; t++)
+ tl->tl_head[t] = NULL;
+}
+
+void
+txg_list_destroy(txg_list_t *tl)
+{
+ int t;
+
+ for (t = 0; t < TXG_SIZE; t++)
+ ASSERT(txg_list_empty(tl, t));
+
+ mutex_destroy(&tl->tl_lock);
+}
+
+int
+txg_list_empty(txg_list_t *tl, uint64_t txg)
+{
+ return (tl->tl_head[txg & TXG_MASK] == NULL);
+}
+
+/*
+ * Add an entry to the list.
+ * Returns 0 if it's a new entry, 1 if it's already there.
+ */
+int
+txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+ int already_on_list;
+
+ mutex_enter(&tl->tl_lock);
+ already_on_list = tn->tn_member[t];
+ if (!already_on_list) {
+ tn->tn_member[t] = 1;
+ tn->tn_next[t] = tl->tl_head[t];
+ tl->tl_head[t] = tn;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (already_on_list);
+}
+
+/*
+ * Remove the head of the list and return it.
+ */
+void *
+txg_list_remove(txg_list_t *tl, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn;
+ void *p = NULL;
+
+ mutex_enter(&tl->tl_lock);
+ if ((tn = tl->tl_head[t]) != NULL) {
+ p = (char *)tn - tl->tl_offset;
+ tl->tl_head[t] = tn->tn_next[t];
+ tn->tn_next[t] = NULL;
+ tn->tn_member[t] = 0;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (p);
+}
+
+/*
+ * Remove a specific item from the list and return it.
+ */
+void *
+txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn, **tp;
+
+ mutex_enter(&tl->tl_lock);
+
+ for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
+ if ((char *)tn - tl->tl_offset == p) {
+ *tp = tn->tn_next[t];
+ tn->tn_next[t] = NULL;
+ tn->tn_member[t] = 0;
+ mutex_exit(&tl->tl_lock);
+ return (p);
+ }
+ }
+
+ mutex_exit(&tl->tl_lock);
+
+ return (NULL);
+}
+
+int
+txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+ return (tn->tn_member[t]);
+}
+
+/*
+ * Walk a txg list -- only safe if you know it's not changing.
+ */
+void *
+txg_list_head(txg_list_t *tl, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = tl->tl_head[t];
+
+ return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
+
+void *
+txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+ tn = tn->tn_next[t];
+
+ return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
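+
+/*
+ * Illustrative usage (added here; not in the original file): walking a txg
+ * list with the helpers above, assuming the list is stable for the duration:
+ *
+ *	void *p;
+ *
+ *	for (p = txg_list_head(tl, txg); p != NULL;
+ *	    p = txg_list_next(tl, p, txg))
+ *		visit(p);	(visit() is hypothetical)
+ */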
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
new file mode 100644
index 0000000..34d7e0c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/uberblock_impl.h>
+#include <sys/vdev_impl.h>
+
+int
+uberblock_verify(uberblock_t *ub)
+{
+ if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC))
+ byteswap_uint64_array(ub, sizeof (uberblock_t));
+
+ if (ub->ub_magic != UBERBLOCK_MAGIC)
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * Update the uberblock and return a boolean value indicating whether
+ * anything changed in this transaction group.
+ */
+int
+uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
+{
+ ASSERT(ub->ub_txg < txg);
+
+ /*
+ * We explicitly do not set ub_version here, so that older versions
+ * continue to be written with the previous uberblock version.
+ */
+ ub->ub_magic = UBERBLOCK_MAGIC;
+ ub->ub_txg = txg;
+ ub->ub_guid_sum = rvd->vdev_guid_sum;
+ ub->ub_timestamp = gethrestime_sec();
+
+ return (ub->ub_rootbp.blk_birth == txg);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c
new file mode 100644
index 0000000..b52e729
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c
@@ -0,0 +1,107 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+
+static avl_tree_t unique_avl;
+static kmutex_t unique_mtx; /* Initialized at boot via SX_SYSINIT() below. */
+SX_SYSINIT(unique, &unique_mtx, "unique lock");
+
+typedef struct unique {
+ avl_node_t un_link;
+ uint64_t un_value;
+} unique_t;
+
+#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1)
+
+static int
+unique_compare(const void *a, const void *b)
+{
+ const unique_t *una = a;
+ const unique_t *unb = b;
+
+ if (una->un_value < unb->un_value)
+ return (-1);
+ if (una->un_value > unb->un_value)
+ return (+1);
+ return (0);
+}
+
+void
+unique_init(void)
+{
+ avl_create(&unique_avl, unique_compare,
+ sizeof (unique_t), offsetof(unique_t, un_link));
+}
+
+uint64_t
+unique_create(void)
+{
+ return (unique_insert(0));
+}
+
+uint64_t
+unique_insert(uint64_t value)
+{
+ avl_index_t idx;
+ unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP);
+
+ un->un_value = value;
+
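+ /*
+ * Comment added for clarity: keep regenerating until the value is
+ * nonzero, fits within UNIQUE_MASK, and is not already in the tree.
+ * The lock is dropped around random_get_pseudo_bytes(), presumably
+ * because that call may sleep.
+ */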
+ mutex_enter(&unique_mtx);
+ while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK ||
+ avl_find(&unique_avl, un, &idx)) {
+ mutex_exit(&unique_mtx);
+ (void) random_get_pseudo_bytes((void *)&un->un_value,
+ sizeof (un->un_value));
+ un->un_value &= UNIQUE_MASK;
+ mutex_enter(&unique_mtx);
+ }
+
+ avl_insert(&unique_avl, un, idx);
+ mutex_exit(&unique_mtx);
+
+ return (un->un_value);
+}
+
+void
+unique_remove(uint64_t value)
+{
+ unique_t un_tofind;
+ unique_t *un;
+
+ un_tofind.un_value = value;
+ mutex_enter(&unique_mtx);
+ un = avl_find(&unique_avl, &un_tofind, NULL);
+ if (un != NULL) {
+ avl_remove(&unique_avl, un);
+ kmem_free(un, sizeof (unique_t));
+ }
+ mutex_exit(&unique_mtx);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
new file mode 100644
index 0000000..0fceb8d
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -0,0 +1,1905 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/space_map.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
+
+/*
+ * Virtual device management.
+ */
+
+static vdev_ops_t *vdev_ops_table[] = {
+ &vdev_root_ops,
+ &vdev_raidz_ops,
+ &vdev_mirror_ops,
+ &vdev_replacing_ops,
+ &vdev_spare_ops,
+#ifdef _KERNEL
+ &vdev_geom_ops,
+#else
+ &vdev_disk_ops,
+ &vdev_file_ops,
+#endif
+ &vdev_missing_ops,
+ NULL
+};
+
+/* maximum scrub/resilver I/O queue */
+int zfs_scrub_limit = 70;
+
+/*
+ * Given a vdev type, return the appropriate ops vector.
+ */
+static vdev_ops_t *
+vdev_getops(const char *type)
+{
+ vdev_ops_t *ops, **opspp;
+
+ for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
+ if (strcmp(ops->vdev_op_type, type) == 0)
+ break;
+
+ return (ops);
+}
+
+/*
+ * Default asize function: return the MAX of psize with the asize of
+ * all children. This is what's used by anything other than RAID-Z.
+ */
+uint64_t
+vdev_default_asize(vdev_t *vd, uint64_t psize)
+{
+ uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
+ uint64_t csize;
+ uint64_t c;
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
+ asize = MAX(asize, csize);
+ }
+
+ return (asize);
+}
+
+/*
+ * Get the replaceable or attachable device size.
+ * If the parent is a mirror or raidz, the replaceable size is the minimum
+ * psize of all its children. For the rest, just return our own psize.
+ *
+ * e.g.
+ * psize rsize
+ * root - -
+ * mirror/raidz - -
+ * disk1 20g 20g
+ * disk2 40g 20g
+ * disk3 80g 80g
+ */
+uint64_t
+vdev_get_rsize(vdev_t *vd)
+{
+ vdev_t *pvd, *cvd;
+ uint64_t c, rsize;
+
+ pvd = vd->vdev_parent;
+
+ /*
+ * If our parent is NULL or the root, just return our own psize.
+ */
+ if (pvd == NULL || pvd->vdev_parent == NULL)
+ return (vd->vdev_psize);
+
+ rsize = 0;
+
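+ /*
+ * Comment added for clarity: rsize starts at 0, so on the first
+ * pass rsize - 1 wraps to UINT64_MAX and the MIN below selects the
+ * first child's psize; thereafter rsize tracks the minimum psize
+ * across all children.
+ */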
+ for (c = 0; c < pvd->vdev_children; c++) {
+ cvd = pvd->vdev_child[c];
+ rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
+ }
+
+ return (rsize);
+}
+
+vdev_t *
+vdev_lookup_top(spa_t *spa, uint64_t vdev)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ if (vdev < rvd->vdev_children)
+ return (rvd->vdev_child[vdev]);
+
+ return (NULL);
+}
+
+vdev_t *
+vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
+{
+ int c;
+ vdev_t *mvd;
+
+ if (vd->vdev_guid == guid)
+ return (vd);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
+ NULL)
+ return (mvd);
+
+ return (NULL);
+}
+
+void
+vdev_add_child(vdev_t *pvd, vdev_t *cvd)
+{
+ size_t oldsize, newsize;
+ uint64_t id = cvd->vdev_id;
+ vdev_t **newchild;
+
+ ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
+ ASSERT(cvd->vdev_parent == NULL);
+
+ cvd->vdev_parent = pvd;
+
+ if (pvd == NULL)
+ return;
+
+ ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
+
+ oldsize = pvd->vdev_children * sizeof (vdev_t *);
+ pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
+ newsize = pvd->vdev_children * sizeof (vdev_t *);
+
+ newchild = kmem_zalloc(newsize, KM_SLEEP);
+ if (pvd->vdev_child != NULL) {
+ bcopy(pvd->vdev_child, newchild, oldsize);
+ kmem_free(pvd->vdev_child, oldsize);
+ }
+
+ pvd->vdev_child = newchild;
+ pvd->vdev_child[id] = cvd;
+
+ cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
+ ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
+
+ /*
+ * Walk up all ancestors to update guid sum.
+ */
+ for (; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum += cvd->vdev_guid_sum;
+
+ if (cvd->vdev_ops->vdev_op_leaf)
+ cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
+}
+
+void
+vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
+{
+ int c;
+ uint_t id = cvd->vdev_id;
+
+ ASSERT(cvd->vdev_parent == pvd);
+
+ if (pvd == NULL)
+ return;
+
+ ASSERT(id < pvd->vdev_children);
+ ASSERT(pvd->vdev_child[id] == cvd);
+
+ pvd->vdev_child[id] = NULL;
+ cvd->vdev_parent = NULL;
+
+ for (c = 0; c < pvd->vdev_children; c++)
+ if (pvd->vdev_child[c])
+ break;
+
+ if (c == pvd->vdev_children) {
+ kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
+ pvd->vdev_child = NULL;
+ pvd->vdev_children = 0;
+ }
+
+ /*
+ * Walk up all ancestors to update guid sum.
+ */
+ for (; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
+
+ if (cvd->vdev_ops->vdev_op_leaf)
+ cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
+}
+
+/*
+ * Remove any holes in the child array.
+ */
+void
+vdev_compact_children(vdev_t *pvd)
+{
+ vdev_t **newchild, *cvd;
+ int oldc = pvd->vdev_children;
+ int newc, c;
+
+ ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));
+
+ for (c = newc = 0; c < oldc; c++)
+ if (pvd->vdev_child[c])
+ newc++;
+
+ newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
+
+ for (c = newc = 0; c < oldc; c++) {
+ if ((cvd = pvd->vdev_child[c]) != NULL) {
+ newchild[newc] = cvd;
+ cvd->vdev_id = newc++;
+ }
+ }
+
+ kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
+ pvd->vdev_child = newchild;
+ pvd->vdev_children = newc;
+}
+
+/*
+ * Allocate and minimally initialize a vdev_t.
+ */
+static vdev_t *
+vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
+{
+ vdev_t *vd;
+
+ vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
+
+ if (spa->spa_root_vdev == NULL) {
+ ASSERT(ops == &vdev_root_ops);
+ spa->spa_root_vdev = vd;
+ }
+
+ if (guid == 0) {
+ if (spa->spa_root_vdev == vd) {
+ /*
+ * The root vdev's guid will also be the pool guid,
+ * which must be unique among all pools.
+ */
+ while (guid == 0 || spa_guid_exists(guid, 0))
+ guid = spa_get_random(-1ULL);
+ } else {
+ /*
+ * Any other vdev's guid must be unique within the pool.
+ */
+ while (guid == 0 ||
+ spa_guid_exists(spa_guid(spa), guid))
+ guid = spa_get_random(-1ULL);
+ }
+ ASSERT(!spa_guid_exists(spa_guid(spa), guid));
+ }
+
+ vd->vdev_spa = spa;
+ vd->vdev_id = id;
+ vd->vdev_guid = guid;
+ vd->vdev_guid_sum = guid;
+ vd->vdev_ops = ops;
+ vd->vdev_state = VDEV_STATE_CLOSED;
+
+ mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
+ space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+ space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+ txg_list_create(&vd->vdev_ms_list,
+ offsetof(struct metaslab, ms_txg_node));
+ txg_list_create(&vd->vdev_dtl_list,
+ offsetof(struct vdev, vdev_dtl_node));
+ vd->vdev_stat.vs_timestamp = gethrtime();
+
+ return (vd);
+}
+
+/*
+ * Free a vdev_t that has been removed from service.
+ */
+static void
+vdev_free_common(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ if (vd->vdev_path)
+ spa_strfree(vd->vdev_path);
+ if (vd->vdev_devid)
+ spa_strfree(vd->vdev_devid);
+
+ if (vd->vdev_isspare)
+ spa_spare_remove(vd);
+
+ txg_list_destroy(&vd->vdev_ms_list);
+ txg_list_destroy(&vd->vdev_dtl_list);
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_unload(&vd->vdev_dtl_map);
+ space_map_destroy(&vd->vdev_dtl_map);
+ space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+ space_map_destroy(&vd->vdev_dtl_scrub);
+ mutex_exit(&vd->vdev_dtl_lock);
+ mutex_destroy(&vd->vdev_dtl_lock);
+ mutex_destroy(&vd->vdev_stat_lock);
+
+ if (vd == spa->spa_root_vdev)
+ spa->spa_root_vdev = NULL;
+
+ kmem_free(vd, sizeof (vdev_t));
+}
+
+/*
+ * Allocate a new vdev. The 'alloctype' is used to control whether we are
+ * creating a new vdev or loading an existing one - the behavior is slightly
+ * different for each case.
+ */
+int
+vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
+ int alloctype)
+{
+ vdev_ops_t *ops;
+ char *type;
+ uint64_t guid = 0;
+ vdev_t *vd;
+
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+ return (EINVAL);
+
+ if ((ops = vdev_getops(type)) == NULL)
+ return (EINVAL);
+
+ /*
+ * If this is a load, get the vdev guid from the nvlist.
+ * Otherwise, vdev_alloc_common() will generate one for us.
+ */
+ if (alloctype == VDEV_ALLOC_LOAD) {
+ uint64_t label_id;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
+ label_id != id)
+ return (EINVAL);
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (EINVAL);
+ } else if (alloctype == VDEV_ALLOC_SPARE) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (EINVAL);
+ }
+
+ /*
+ * The first allocated vdev must be of type 'root'.
+ */
+ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
+ return (EINVAL);
+
+ vd = vdev_alloc_common(spa, id, guid, ops);
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
+ vd->vdev_path = spa_strdup(vd->vdev_path);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
+ vd->vdev_devid = spa_strdup(vd->vdev_devid);
+
+ /*
+ * Set the nparity property for RAID-Z vdevs.
+ */
+ if (ops == &vdev_raidz_ops) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
+ &vd->vdev_nparity) == 0) {
+ /*
+ * Currently, we can only support 2 parity devices.
+ */
+ if (vd->vdev_nparity > 2)
+ return (EINVAL);
+ /*
+ * Older versions can only support 1 parity device.
+ */
+ if (vd->vdev_nparity == 2 &&
+ spa_version(spa) < ZFS_VERSION_RAID6)
+ return (ENOTSUP);
+
+ } else {
+ /*
+ * We require the parity to be specified for SPAs that
+ * support multiple parity levels.
+ */
+ if (spa_version(spa) >= ZFS_VERSION_RAID6)
+ return (EINVAL);
+
+ /*
+ * Otherwise, we default to 1 parity device for RAID-Z.
+ */
+ vd->vdev_nparity = 1;
+ }
+ } else {
+ vd->vdev_nparity = 0;
+ }
+
+ /*
+ * Set the whole_disk property. If it's not specified, leave the value
+ * as -1.
+ */
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ &vd->vdev_wholedisk) != 0)
+ vd->vdev_wholedisk = -1ULL;
+
+ /*
+ * Look for the 'not present' flag. This will only be set if the device
+ * was not present at the time of import.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+ &vd->vdev_not_present);
+
+ /*
+ * Get the alignment requirement.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
+
+ /*
+ * If we're a top-level vdev, try to load the allocation parameters.
+ */
+ if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ &vd->vdev_ms_array);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ &vd->vdev_ms_shift);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
+ &vd->vdev_asize);
+ }
+
+ /*
+ * If we're a leaf vdev, try to load the DTL object and offline state.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
+ &vd->vdev_dtl.smo_object);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
+ &vd->vdev_offline);
+ }
+
+ /*
+ * Add ourselves to the parent's list of children.
+ */
+ vdev_add_child(parent, vd);
+
+ *vdp = vd;
+
+ return (0);
+}
+
+void
+vdev_free(vdev_t *vd)
+{
+ int c;
+
+ /*
+ * vdev_free() implies closing the vdev first. This is simpler than
+ * trying to ensure complicated semantics for all callers.
+ */
+ vdev_close(vd);
+
+ ASSERT(!list_link_active(&vd->vdev_dirty_node));
+
+ /*
+ * Free all children.
+ */
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_free(vd->vdev_child[c]);
+
+ ASSERT(vd->vdev_child == NULL);
+ ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+
+ /*
+ * Discard allocation state.
+ */
+ if (vd == vd->vdev_top)
+ vdev_metaslab_fini(vd);
+
+ ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
+ ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
+ ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
+
+ /*
+ * Remove this vdev from its parent's child list.
+ */
+ vdev_remove_child(vd->vdev_parent, vd);
+
+ ASSERT(vd->vdev_parent == NULL);
+
+ vdev_free_common(vd);
+}
+
+/*
+ * Transfer top-level vdev state from svd to tvd.
+ */
+static void
+vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
+{
+ spa_t *spa = svd->vdev_spa;
+ metaslab_t *msp;
+ vdev_t *vd;
+ int t;
+
+ ASSERT(tvd == tvd->vdev_top);
+
+ tvd->vdev_ms_array = svd->vdev_ms_array;
+ tvd->vdev_ms_shift = svd->vdev_ms_shift;
+ tvd->vdev_ms_count = svd->vdev_ms_count;
+
+ svd->vdev_ms_array = 0;
+ svd->vdev_ms_shift = 0;
+ svd->vdev_ms_count = 0;
+
+ tvd->vdev_mg = svd->vdev_mg;
+ tvd->vdev_ms = svd->vdev_ms;
+
+ svd->vdev_mg = NULL;
+ svd->vdev_ms = NULL;
+
+ if (tvd->vdev_mg != NULL)
+ tvd->vdev_mg->mg_vd = tvd;
+
+ tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
+ tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
+ tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
+
+ svd->vdev_stat.vs_alloc = 0;
+ svd->vdev_stat.vs_space = 0;
+ svd->vdev_stat.vs_dspace = 0;
+
+ for (t = 0; t < TXG_SIZE; t++) {
+ while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
+ (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
+ while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
+ (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
+ if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
+ (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
+ }
+
+ if (list_link_active(&svd->vdev_dirty_node)) {
+ vdev_config_clean(svd);
+ vdev_config_dirty(tvd);
+ }
+
+ tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted;
+ svd->vdev_reopen_wanted = 0;
+
+ tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
+ svd->vdev_deflate_ratio = 0;
+}
+
+static void
+vdev_top_update(vdev_t *tvd, vdev_t *vd)
+{
+ int c;
+
+ if (vd == NULL)
+ return;
+
+ vd->vdev_top = tvd;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_top_update(tvd, vd->vdev_child[c]);
+}
+
+/*
+ * Add a mirror/replacing vdev above an existing vdev.
+ */
+vdev_t *
+vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
+{
+ spa_t *spa = cvd->vdev_spa;
+ vdev_t *pvd = cvd->vdev_parent;
+ vdev_t *mvd;
+
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
+ mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
+
+ mvd->vdev_asize = cvd->vdev_asize;
+ mvd->vdev_ashift = cvd->vdev_ashift;
+ mvd->vdev_state = cvd->vdev_state;
+
+ vdev_remove_child(pvd, cvd);
+ vdev_add_child(pvd, mvd);
+ cvd->vdev_id = mvd->vdev_children;
+ vdev_add_child(mvd, cvd);
+ vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+ if (mvd == mvd->vdev_top)
+ vdev_top_transfer(cvd, mvd);
+
+ return (mvd);
+}
+
+/*
+ * Remove a 1-way mirror/replacing vdev from the tree.
+ */
+void
+vdev_remove_parent(vdev_t *cvd)
+{
+ vdev_t *mvd = cvd->vdev_parent;
+ vdev_t *pvd = mvd->vdev_parent;
+
+ ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
+
+ ASSERT(mvd->vdev_children == 1);
+ ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
+ mvd->vdev_ops == &vdev_replacing_ops ||
+ mvd->vdev_ops == &vdev_spare_ops);
+ cvd->vdev_ashift = mvd->vdev_ashift;
+
+ vdev_remove_child(mvd, cvd);
+ vdev_remove_child(pvd, mvd);
+ cvd->vdev_id = mvd->vdev_id;
+ vdev_add_child(pvd, cvd);
+ /*
+ * If we created a new toplevel vdev, then we need to change the child's
+ * vdev GUID to match the old toplevel vdev. Otherwise, we could have
+ * detached an offline device, and when we go to import the pool we'll
+ * think we have two toplevel vdevs, instead of a different version of
+ * the same toplevel vdev.
+ */
+ if (cvd->vdev_top == cvd) {
+ pvd->vdev_guid_sum -= cvd->vdev_guid;
+ cvd->vdev_guid_sum -= cvd->vdev_guid;
+ cvd->vdev_guid = mvd->vdev_guid;
+ cvd->vdev_guid_sum += mvd->vdev_guid;
+ pvd->vdev_guid_sum += cvd->vdev_guid;
+ }
+ vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+ if (cvd == cvd->vdev_top)
+ vdev_top_transfer(mvd, cvd);
+
+ ASSERT(mvd->vdev_children == 0);
+ vdev_free(mvd);
+}
+
+int
+vdev_metaslab_init(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ metaslab_class_t *mc = spa_metaslab_class_select(spa);
+ uint64_t m;
+ uint64_t oldc = vd->vdev_ms_count;
+ uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
+ metaslab_t **mspp;
+ int error;
+
+ if (vd->vdev_ms_shift == 0) /* not being allocated from yet */
+ return (0);
+
+ dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
+
+ ASSERT(oldc <= newc);
+
+ if (vd->vdev_mg == NULL)
+ vd->vdev_mg = metaslab_group_create(mc, vd);
+
+ mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
+
+ if (oldc != 0) {
+ bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
+ kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
+ }
+
+ vd->vdev_ms = mspp;
+ vd->vdev_ms_count = newc;
+
+ for (m = oldc; m < newc; m++) {
+ space_map_obj_t smo = { 0, 0, 0 };
+ if (txg == 0) {
+ uint64_t object = 0;
+ error = dmu_read(mos, vd->vdev_ms_array,
+ m * sizeof (uint64_t), sizeof (uint64_t), &object);
+ if (error)
+ return (error);
+ if (object != 0) {
+ dmu_buf_t *db;
+ error = dmu_bonus_hold(mos, object, FTAG, &db);
+ if (error)
+ return (error);
+ ASSERT3U(db->db_size, ==, sizeof (smo));
+ bcopy(db->db_data, &smo, db->db_size);
+ ASSERT3U(smo.smo_object, ==, object);
+ dmu_buf_rele(db, FTAG);
+ }
+ }
+ vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
+ m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
+ }
+
+ return (0);
+}
+
+void
+vdev_metaslab_fini(vdev_t *vd)
+{
+ uint64_t m;
+ uint64_t count = vd->vdev_ms_count;
+
+ if (vd->vdev_ms != NULL) {
+ for (m = 0; m < count; m++)
+ if (vd->vdev_ms[m] != NULL)
+ metaslab_fini(vd->vdev_ms[m]);
+ kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
+ vd->vdev_ms = NULL;
+ }
+}
+
+/*
+ * Prepare a virtual device for access.
+ */
+int
+vdev_open(vdev_t *vd)
+{
+ int error;
+ int c;
+ uint64_t osize = 0;
+ uint64_t asize, psize;
+ uint64_t ashift = 0;
+
+ ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
+ vd->vdev_state == VDEV_STATE_CANT_OPEN ||
+ vd->vdev_state == VDEV_STATE_OFFLINE);
+
+ if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
+ vd->vdev_fault_arg >>= 1;
+ else
+ vd->vdev_fault_mode = VDEV_FAULT_NONE;
+
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vdev_cache_init(vd);
+ vdev_queue_init(vd);
+ vd->vdev_cache_active = B_TRUE;
+ }
+
+ if (vd->vdev_offline) {
+ ASSERT(vd->vdev_children == 0);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
+ return (ENXIO);
+ }
+
+ error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
+
+ if (zio_injection_enabled && error == 0)
+ error = zio_handle_device_injection(vd, ENXIO);
+
+ dprintf("%s = %d, osize %llu, state = %d\n",
+ vdev_description(vd), error, osize, vd->vdev_state);
+
+ if (error) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ vd->vdev_stat.vs_aux);
+ return (error);
+ }
+
+ vd->vdev_state = VDEV_STATE_HEALTHY;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_NONE);
+ break;
+ }
+
+ osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
+
+ if (vd->vdev_children == 0) {
+ if (osize < SPA_MINDEVSIZE) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
+ return (EOVERFLOW);
+ }
+ psize = osize;
+ asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
+ } else {
+ if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
+ (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
+ return (EOVERFLOW);
+ }
+ psize = 0;
+ asize = osize;
+ }
+
+ vd->vdev_psize = psize;
+
+ if (vd->vdev_asize == 0) {
+ /*
+ * This is the first-ever open, so use the computed values.
+ * For testing purposes, a higher ashift can be requested.
+ */
+ vd->vdev_asize = asize;
+ vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
+ } else {
+ /*
+ * Make sure the alignment requirement hasn't increased.
+ */
+ if (ashift > vd->vdev_top->vdev_ashift) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (EINVAL);
+ }
+
+ /*
+ * Make sure the device hasn't shrunk.
+ */
+ if (asize < vd->vdev_asize) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (EINVAL);
+ }
+
+ /*
+ * If all children are healthy and the asize has increased,
+ * then we've experienced dynamic LUN growth.
+ */
+ if (vd->vdev_state == VDEV_STATE_HEALTHY &&
+ asize > vd->vdev_asize) {
+ vd->vdev_asize = asize;
+ }
+ }
+
+ /*
+ * If this is a top-level vdev, compute the raidz-deflation
+ * ratio. Note, we hard-code in 128k (1<<17) because it is the
+ * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE
+ * changes, this algorithm must never change, or we will
+ * inconsistently account for existing bp's.
+ */
+ if (vd->vdev_top == vd) {
+ vd->vdev_deflate_ratio = (1<<17) /
+ (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT);
+ }
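+
+	/*
+	 * Worked example (hypothetical numbers): on a plain disk or mirror,
+	 * vdev_psize_to_asize(vd, 1<<17) returns 1<<17, so the ratio is
+	 * (1<<17) / ((1<<17) >> 9) = 512 and dspace equals space.  A raidz
+	 * vdev that inflates a 128k psize to 160k of asize would instead
+	 * get (1<<17) / ((160<<10) >> 9) = 409, recording its parity
+	 * overhead in the deflation.
+	 */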
+
+ /*
+ * This allows the ZFS DE to close cases appropriately. If a device
+ * goes away and later returns, we want to close the associated case.
+ * But it's not enough to simply post this only when a device goes from
+ * CANT_OPEN -> HEALTHY. If we reboot the system and the device is
+ * back, we also need to close the case (otherwise we will try to replay
+ * it). So we have to post this notifier every time. Since this only
+ * occurs during pool open or error recovery, this should not be an
+ * issue.
+ */
+ zfs_post_ok(vd->vdev_spa, vd);
+
+ return (0);
+}
+
+/*
+ * Called once the vdevs are all opened, this routine validates the label
+ * contents. This needs to be done before vdev_load() so that we don't
+ * inadvertently do repair I/Os to the wrong device, and so that vdev_reopen()
+ * won't succeed if the device has been changed underneath.
+ *
+ * This function will only return failure if one of the vdevs indicates that it
+ * has since been destroyed or exported. This is only possible if
+ * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
+ * will be updated but the function will return 0.
+ */
+int
+vdev_validate(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ int c;
+ nvlist_t *label;
+ uint64_t guid;
+ uint64_t state;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if (vdev_validate(vd->vdev_child[c]) != 0)
+ return (-1);
+
+ /*
+ * If the device has already failed, or was marked offline, don't do
+ * any further validation. Otherwise, label I/O will fail and we will
+ * overwrite the previous state.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) {
+
+ if ((label = vdev_label_read_config(vd)) == NULL) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+ &guid) != 0 || guid != spa_guid(spa)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
+ &guid) != 0 || guid != vd->vdev_guid) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (0);
+ }
+
+ nvlist_free(label);
+
+ if (spa->spa_load_state == SPA_LOAD_OPEN &&
+ state != POOL_STATE_ACTIVE)
+ return (-1);
+ }
+
+ /*
+ * If we were able to open and validate a vdev that was previously
+ * marked permanently unavailable, clear that state now.
+ */
+ if (vd->vdev_not_present)
+ vd->vdev_not_present = 0;
+
+ return (0);
+}
+
+/*
+ * Close a virtual device.
+ */
+void
+vdev_close(vdev_t *vd)
+{
+ vd->vdev_ops->vdev_op_close(vd);
+
+ if (vd->vdev_cache_active) {
+ vdev_cache_fini(vd);
+ vdev_queue_fini(vd);
+ vd->vdev_cache_active = B_FALSE;
+ }
+
+ /*
+ * We record the previous state before we close it, so that if we are
+ * doing a reopen(), we don't generate FMA ereports if we notice that
+ * it's still faulted.
+ */
+ vd->vdev_prevstate = vd->vdev_state;
+
+ if (vd->vdev_offline)
+ vd->vdev_state = VDEV_STATE_OFFLINE;
+ else
+ vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+}
+
+void
+vdev_reopen(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
+ vdev_close(vd);
+ (void) vdev_open(vd);
+
+ /*
+ * Call vdev_validate() here to make sure we have the same device.
+ * Otherwise, a device with an invalid label could be successfully
+ * opened in response to vdev_reopen().
+ *
+ * The downside to this is that if the user is simply experimenting by
+ * overwriting an entire disk, we'll fault the device rather than
+ * demonstrate self-healing capabilities. On the other hand, with
+ * proper FMA integration, the series of errors we'd see from the device
+ * would result in a faulted device anyway. Given that this doesn't
+ * model any real-world corruption, it's better to catch this here and
+ * correctly identify that the device has either changed beneath us, or
+ * is corrupted beyond recognition.
+ */
+ (void) vdev_validate(vd);
+
+ /*
+ * Reassess root vdev's health.
+ */
+ vdev_propagate_state(spa->spa_root_vdev);
+}
+
+int
+vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
+{
+ int error;
+
+ /*
+ * Normally, partial opens (e.g. of a mirror) are allowed.
+ * For a create, however, we want to fail the request if
+ * there are any components we can't open.
+ */
+ error = vdev_open(vd);
+
+ if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
+ vdev_close(vd);
+ return (error ? error : ENXIO);
+ }
+
+ /*
+ * Recursively initialize all labels.
+ */
+ if ((error = vdev_label_init(vd, txg, isreplacing ?
+ VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
+ vdev_close(vd);
+ return (error);
+ }
+
+ return (0);
+}
+
+/*
+ * This is the latter half of vdev_create().  It is distinct because it
+ * involves initiating transactions in order to do metaslab creation.
+ * For creation, we want to try to create all vdevs at once and then undo it
+ * if anything fails; this is much harder if we have pending transactions.
+ */
+void
+vdev_init(vdev_t *vd, uint64_t txg)
+{
+ /*
+ * Aim for roughly 200 metaslabs per vdev.
+ */
+ vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
+ vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
+
+ /*
+ * Initialize the vdev's metaslabs. This can't fail because
+ * there's nothing to read when creating all new metaslabs.
+ */
+ VERIFY(vdev_metaslab_init(vd, txg) == 0);
+}
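+
+/*
+ * Sizing example (hypothetical 128GB vdev): vdev_asize / 200 is roughly
+ * 687e6 bytes, whose highest set bit is bit 29, so highbit() returns 30
+ * and each metaslab spans 1GB -- 128 metaslabs in all.  The MAX() against
+ * SPA_MAXBLOCKSHIFT merely ensures a metaslab is never smaller than the
+ * largest supported block (128k).
+ */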
+
+void
+vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
+{
+ ASSERT(vd == vd->vdev_top);
+ ASSERT(ISP2(flags));
+
+ if (flags & VDD_METASLAB)
+ (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
+
+ if (flags & VDD_DTL)
+ (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
+
+ (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
+}
+
+void
+vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
+{
+ mutex_enter(sm->sm_lock);
+ if (!space_map_contains(sm, txg, size))
+ space_map_add(sm, txg, size);
+ mutex_exit(sm->sm_lock);
+}
+
+int
+vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
+{
+ int dirty;
+
+ /*
+ * Quick test without the lock -- covers the common case that
+ * there are no dirty time segments.
+ */
+ if (sm->sm_space == 0)
+ return (0);
+
+ mutex_enter(sm->sm_lock);
+ dirty = space_map_contains(sm, txg, size);
+ mutex_exit(sm->sm_lock);
+
+ return (dirty);
+}
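+
+/*
+ * Usage sketch (hypothetical txg numbers): the "offsets" and "sizes" in a
+ * DTL space map are transaction group numbers and counts, not byte ranges.
+ * If a leaf device misses the writes of txg 100, the write-error path
+ * records that fact and a later resilver consults it:
+ *
+ *	vdev_dtl_dirty(&vd->vdev_dtl_map, 100, 1);	-- one txg of damage
+ *	...
+ *	if (vdev_dtl_contains(&vd->vdev_dtl_map, 100, 1))
+ *		-- txg 100 must be resilvered onto this device
+ */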
+
+/*
+ * Reassess DTLs after a config change or scrub completion.
+ */
+void
+vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
+{
+ spa_t *spa = vd->vdev_spa;
+ int c;
+
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
+ if (vd->vdev_children == 0) {
+ mutex_enter(&vd->vdev_dtl_lock);
+ /*
+		 * We've successfully scrubbed everything up to scrub_txg.
+ * Therefore, excise all old DTLs up to that point, then
+ * fold in the DTLs for everything we couldn't scrub.
+ */
+ if (scrub_txg != 0) {
+ space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
+ space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
+ }
+ if (scrub_done)
+ space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+ mutex_exit(&vd->vdev_dtl_lock);
+ if (txg != 0)
+ vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
+ return;
+ }
+
+ /*
+ * Make sure the DTLs are always correct under the scrub lock.
+ */
+ if (vd == spa->spa_root_vdev)
+ mutex_enter(&spa->spa_scrub_lock);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
+ space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
+ space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
+ mutex_exit(&vd->vdev_dtl_lock);
+ }
+
+ if (vd == spa->spa_root_vdev)
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+static int
+vdev_dtl_load(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ space_map_obj_t *smo = &vd->vdev_dtl;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_buf_t *db;
+ int error;
+
+ ASSERT(vd->vdev_children == 0);
+
+ if (smo->smo_object == 0)
+ return (0);
+
+ if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
+ return (error);
+
+ ASSERT3U(db->db_size, ==, sizeof (*smo));
+ bcopy(db->db_data, smo, db->db_size);
+ dmu_buf_rele(db, FTAG);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ return (error);
+}
+
+void
+vdev_dtl_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ space_map_obj_t *smo = &vd->vdev_dtl;
+ space_map_t *sm = &vd->vdev_dtl_map;
+ objset_t *mos = spa->spa_meta_objset;
+ space_map_t smsync;
+ kmutex_t smlock;
+ dmu_buf_t *db;
+ dmu_tx_t *tx;
+
+ dprintf("%s in txg %llu pass %d\n",
+ vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ if (vd->vdev_detached) {
+ if (smo->smo_object != 0) {
+ int err = dmu_object_free(mos, smo->smo_object, tx);
+ ASSERT3U(err, ==, 0);
+ smo->smo_object = 0;
+ }
+ dmu_tx_commit(tx);
+ dprintf("detach %s committed in txg %llu\n",
+ vdev_description(vd), txg);
+ return;
+ }
+
+ if (smo->smo_object == 0) {
+ ASSERT(smo->smo_objsize == 0);
+ ASSERT(smo->smo_alloc == 0);
+ smo->smo_object = dmu_object_alloc(mos,
+ DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
+ DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
+ ASSERT(smo->smo_object != 0);
+ vdev_config_dirty(vd->vdev_top);
+ }
+
+ bzero(&smlock, sizeof(smlock));
+ mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
+
+ space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
+ &smlock);
+
+ mutex_enter(&smlock);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_walk(sm, space_map_add, &smsync);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ space_map_truncate(smo, mos, tx);
+ space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
+
+ space_map_destroy(&smsync);
+
+ mutex_exit(&smlock);
+ mutex_destroy(&smlock);
+
+ VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ ASSERT3U(db->db_size, ==, sizeof (*smo));
+ bcopy(smo, db->db_data, db->db_size);
+ dmu_buf_rele(db, FTAG);
+
+ dmu_tx_commit(tx);
+}
+
+void
+vdev_load(vdev_t *vd)
+{
+ int c;
+
+ /*
+ * Recursively load all children.
+ */
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_load(vd->vdev_child[c]);
+
+ /*
+ * If this is a top-level vdev, initialize its metaslabs.
+ */
+ if (vd == vd->vdev_top &&
+ (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
+ vdev_metaslab_init(vd, 0) != 0))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+
+ /*
+ * If this is a leaf vdev, load its DTL.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+}
+
+/*
+ * This special case of vdev_spare() is used for hot spares.  Its sole purpose
+ * is to set the vdev state for the associated vdev.  To do this, we make sure
+ * that we can open the underlying device, then try to read the label, and make
+ * sure that the label is sane and that it hasn't been repurposed for another
+ * pool.
+ */
+int
+vdev_validate_spare(vdev_t *vd)
+{
+ nvlist_t *label;
+ uint64_t guid, version;
+ uint64_t state;
+
+ if ((label = vdev_label_read_config(vd)) == NULL) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (-1);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
+ version > ZFS_VERSION ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
+ guid != vd->vdev_guid ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (-1);
+ }
+
+ spa_spare_add(vd);
+
+ /*
+ * We don't actually check the pool state here. If it's in fact in
+ * use by another pool, we update this fact on the fly when requested.
+ */
+ nvlist_free(label);
+ return (0);
+}
+
+void
+vdev_sync_done(vdev_t *vd, uint64_t txg)
+{
+ metaslab_t *msp;
+
+ dprintf("%s txg %llu\n", vdev_description(vd), txg);
+
+	while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL)
+ metaslab_sync_done(msp, txg);
+}
+
+void
+vdev_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *lvd;
+ metaslab_t *msp;
+ dmu_tx_t *tx;
+
+ dprintf("%s txg %llu pass %d\n",
+ vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
+
+ if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
+ ASSERT(vd == vd->vdev_top);
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
+ ASSERT(vd->vdev_ms_array != 0);
+ vdev_config_dirty(vd);
+ dmu_tx_commit(tx);
+ }
+
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
+ metaslab_sync(msp, txg);
+ (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
+ }
+
+ while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
+ vdev_dtl_sync(lvd, txg);
+
+ (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
+}
+
+uint64_t
+vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
+{
+ return (vd->vdev_ops->vdev_op_asize(vd, psize));
+}
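+
+/*
+ * Example (hypothetical layouts): for plain disks and files, asize equals
+ * psize (vdev_default_asize() in the ops vectors below); for a
+ * single-parity raidz of 5 disks, the op inflates a 128k psize to roughly
+ * 160k of asize to cover the parity sectors.
+ */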
+
+void
+vdev_io_start(zio_t *zio)
+{
+ zio->io_vd->vdev_ops->vdev_op_io_start(zio);
+}
+
+void
+vdev_io_done(zio_t *zio)
+{
+ zio->io_vd->vdev_ops->vdev_op_io_done(zio);
+}
+
+const char *
+vdev_description(vdev_t *vd)
+{
+ if (vd == NULL || vd->vdev_ops == NULL)
+ return ("<unknown>");
+
+ if (vd->vdev_path != NULL)
+ return (vd->vdev_path);
+
+ if (vd->vdev_parent == NULL)
+ return (spa_name(vd->vdev_spa));
+
+ return (vd->vdev_ops->vdev_op_type);
+}
+
+int
+vdev_online(spa_t *spa, uint64_t guid)
+{
+ vdev_t *rvd, *vd;
+ uint64_t txg;
+
+ txg = spa_vdev_enter(spa);
+
+ rvd = spa->spa_root_vdev;
+
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ dprintf("ONLINE: %s\n", vdev_description(vd));
+
+ vd->vdev_offline = B_FALSE;
+ vd->vdev_tmpoffline = B_FALSE;
+ vdev_reopen(vd->vdev_top);
+
+ vdev_config_dirty(vd->vdev_top);
+
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ return (0);
+}
+
+int
+vdev_offline(spa_t *spa, uint64_t guid, int istmp)
+{
+ vdev_t *rvd, *vd;
+ uint64_t txg;
+
+ txg = spa_vdev_enter(spa);
+
+ rvd = spa->spa_root_vdev;
+
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ dprintf("OFFLINE: %s\n", vdev_description(vd));
+
+ /*
+ * If the device isn't already offline, try to offline it.
+ */
+ if (!vd->vdev_offline) {
+ /*
+ * If this device's top-level vdev has a non-empty DTL,
+ * don't allow the device to be offlined.
+ *
+ * XXX -- make this more precise by allowing the offline
+ * as long as the remaining devices don't have any DTL holes.
+ */
+ if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * Offline this device and reopen its top-level vdev.
+ * If this action results in the top-level vdev becoming
+ * unusable, undo it and fail the request.
+ */
+ vd->vdev_offline = B_TRUE;
+ vdev_reopen(vd->vdev_top);
+ if (vdev_is_dead(vd->vdev_top)) {
+ vd->vdev_offline = B_FALSE;
+ vdev_reopen(vd->vdev_top);
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+ }
+ }
+
+ vd->vdev_tmpoffline = istmp;
+
+ vdev_config_dirty(vd->vdev_top);
+
+ return (spa_vdev_exit(spa, NULL, txg, 0));
+}
+
+/*
+ * Clear the error counts associated with this vdev. Unlike vdev_online() and
+ * vdev_offline(), we assume the spa config is locked. We also clear all
+ * children. If 'vd' is NULL, then the user wants to clear all vdevs.
+ */
+void
+vdev_clear(spa_t *spa, vdev_t *vd)
+{
+ int c;
+
+ if (vd == NULL)
+ vd = spa->spa_root_vdev;
+
+ vd->vdev_stat.vs_read_errors = 0;
+ vd->vdev_stat.vs_write_errors = 0;
+ vd->vdev_stat.vs_checksum_errors = 0;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_clear(spa, vd->vdev_child[c]);
+}
+
+int
+vdev_is_dead(vdev_t *vd)
+{
+ return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
+}
+
+int
+vdev_error_inject(vdev_t *vd, zio_t *zio)
+{
+ int error = 0;
+
+ if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
+ return (0);
+
+ if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
+ return (0);
+
+ switch (vd->vdev_fault_mode) {
+ case VDEV_FAULT_RANDOM:
+ if (spa_get_random(vd->vdev_fault_arg) == 0)
+ error = EIO;
+ break;
+
+ case VDEV_FAULT_COUNT:
+ if ((int64_t)--vd->vdev_fault_arg <= 0)
+ vd->vdev_fault_mode = VDEV_FAULT_NONE;
+ error = EIO;
+ break;
+ }
+
+ if (error != 0) {
+ dprintf("returning %d for type %d on %s state %d offset %llx\n",
+ error, zio->io_type, vdev_description(vd),
+ vd->vdev_state, zio->io_offset);
+ }
+
+ return (error);
+}
+
+/*
+ * Get statistics for the given vdev.
+ */
+void
+vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
+{
+ vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ int c, t;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+ vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
+ vs->vs_state = vd->vdev_state;
+ vs->vs_rsize = vdev_get_rsize(vd);
+ mutex_exit(&vd->vdev_stat_lock);
+
+ /*
+ * If we're getting stats on the root vdev, aggregate the I/O counts
+ * over all top-level vdevs (i.e. the direct children of the root).
+ */
+ if (vd == rvd) {
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *cvd = rvd->vdev_child[c];
+ vdev_stat_t *cvs = &cvd->vdev_stat;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ for (t = 0; t < ZIO_TYPES; t++) {
+ vs->vs_ops[t] += cvs->vs_ops[t];
+ vs->vs_bytes[t] += cvs->vs_bytes[t];
+ }
+ vs->vs_read_errors += cvs->vs_read_errors;
+ vs->vs_write_errors += cvs->vs_write_errors;
+ vs->vs_checksum_errors += cvs->vs_checksum_errors;
+ vs->vs_scrub_examined += cvs->vs_scrub_examined;
+ vs->vs_scrub_errors += cvs->vs_scrub_errors;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ }
+}
+
+void
+vdev_stat_update(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *pvd;
+ uint64_t txg = zio->io_txg;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ zio_type_t type = zio->io_type;
+ int flags = zio->io_flags;
+
+ if (zio->io_error == 0) {
+ if (!(flags & ZIO_FLAG_IO_BYPASS)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_ops[type]++;
+ vs->vs_bytes[type] += zio->io_size;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ if ((flags & ZIO_FLAG_IO_REPAIR) &&
+ zio->io_delegate_list == NULL) {
+ mutex_enter(&vd->vdev_stat_lock);
+ if (flags & ZIO_FLAG_SCRUB_THREAD)
+ vs->vs_scrub_repaired += zio->io_size;
+ else
+ vs->vs_self_healed += zio->io_size;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ return;
+ }
+
+ if (flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ if (!vdev_is_dead(vd)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ if (type == ZIO_TYPE_READ) {
+ if (zio->io_error == ECKSUM)
+ vs->vs_checksum_errors++;
+ else
+ vs->vs_read_errors++;
+ }
+ if (type == ZIO_TYPE_WRITE)
+ vs->vs_write_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+
+ if (type == ZIO_TYPE_WRITE) {
+ if (txg == 0 || vd->vdev_children != 0)
+ return;
+ if (flags & ZIO_FLAG_SCRUB_THREAD) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
+ }
+ if (!(flags & ZIO_FLAG_IO_REPAIR)) {
+ if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
+ return;
+ vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
+ for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
+ }
+ }
+}
+
+void
+vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
+{
+ int c;
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
+
+ mutex_enter(&vd->vdev_stat_lock);
+
+ if (type == POOL_SCRUB_NONE) {
+ /*
+ * Update completion and end time. Leave everything else alone
+ * so we can report what happened during the previous scrub.
+ */
+ vs->vs_scrub_complete = complete;
+ vs->vs_scrub_end = gethrestime_sec();
+ } else {
+ vs->vs_scrub_type = type;
+ vs->vs_scrub_complete = 0;
+ vs->vs_scrub_examined = 0;
+ vs->vs_scrub_repaired = 0;
+ vs->vs_scrub_errors = 0;
+ vs->vs_scrub_start = gethrestime_sec();
+ vs->vs_scrub_end = 0;
+ }
+
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+/*
+ * Update the in-core space usage stats for this vdev and the root vdev.
+ */
+void
+vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta)
+{
+ ASSERT(vd == vd->vdev_top);
+ int64_t dspace_delta = space_delta;
+
+ do {
+ if (vd->vdev_ms_count) {
+ /*
+ * If this is a top-level vdev, apply the
+			 * inverse of its psize-to-asize (i.e. RAID-Z)
+			 * space-expansion factor.  We must calculate
+			 * this here and not at the root vdev because
+			 * the root vdev's psize-to-asize is simply the
+			 * max of its children's, thus not accurate
+ * enough for us.
+ */
+ ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
+ dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
+ vd->vdev_deflate_ratio;
+ }
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_space += space_delta;
+ vd->vdev_stat.vs_alloc += alloc_delta;
+ vd->vdev_stat.vs_dspace += dspace_delta;
+ mutex_exit(&vd->vdev_stat_lock);
+ } while ((vd = vd->vdev_parent) != NULL);
+}
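+
+/*
+ * Deflation example (hypothetical numbers): crediting space_delta = 1MB
+ * (2048 sectors) to a plain top-level vdev with vdev_deflate_ratio 512
+ * yields dspace_delta = 2048 * 512 = 1MB -- unchanged.  On a raidz vdev
+ * with ratio 409, the same delta deflates to 2048 * 409 bytes, roughly
+ * 818k, the user-visible share of the allocation.
+ */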
+
+/*
+ * Mark a top-level vdev's config as dirty, placing it on the dirty list
+ * so that it will be written out next time the vdev configuration is synced.
+ * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
+ */
+void
+vdev_config_dirty(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ int c;
+
+ /*
+ * The dirty list is protected by the config lock. The caller must
+ * either hold the config lock as writer, or must be the sync thread
+ * (which holds the lock as reader). There's only one sync thread,
+ * so this is sufficient to ensure mutual exclusion.
+ */
+ ASSERT(spa_config_held(spa, RW_WRITER) ||
+ dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ if (vd == rvd) {
+ for (c = 0; c < rvd->vdev_children; c++)
+ vdev_config_dirty(rvd->vdev_child[c]);
+ } else {
+ ASSERT(vd == vd->vdev_top);
+
+ if (!list_link_active(&vd->vdev_dirty_node))
+ list_insert_head(&spa->spa_dirty_list, vd);
+ }
+}
+
+void
+vdev_config_clean(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, RW_WRITER) ||
+ dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ ASSERT(list_link_active(&vd->vdev_dirty_node));
+ list_remove(&spa->spa_dirty_list, vd);
+}
+
+void
+vdev_propagate_state(vdev_t *vd)
+{
+ vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ int degraded = 0, faulted = 0;
+ int corrupted = 0;
+ int c;
+ vdev_t *child;
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ child = vd->vdev_child[c];
+ if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
+ faulted++;
+ else if (child->vdev_state == VDEV_STATE_DEGRADED)
+ degraded++;
+
+ if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
+ corrupted++;
+ }
+
+ vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
+
+ /*
+ * Root special: if there is a toplevel vdev that cannot be
+ * opened due to corrupted metadata, then propagate the root
+ * vdev's aux state as 'corrupt' rather than 'insufficient
+ * replicas'.
+ */
+ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN)
+ vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+}
+
+/*
+ * Set a vdev's state. If this is during an open, we don't update the parent
+ * state, because we're in the process of opening children depth-first.
+ * Otherwise, we propagate the change to the parent.
+ *
+ * If this routine places a device in a faulted state, an appropriate ereport is
+ * generated.
+ */
+void
+vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
+{
+ uint64_t save_state;
+
+ if (state == vd->vdev_state) {
+ vd->vdev_stat.vs_aux = aux;
+ return;
+ }
+
+ save_state = vd->vdev_state;
+
+ vd->vdev_state = state;
+ vd->vdev_stat.vs_aux = aux;
+
+ if (state == VDEV_STATE_CANT_OPEN) {
+ /*
+ * If we fail to open a vdev during an import, we mark it as
+ * "not available", which signifies that it was never there to
+ * begin with. Failure to open such a device is not considered
+ * an error.
+ */
+ if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT &&
+ vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_not_present = 1;
+
+ /*
+ * Post the appropriate ereport. If the 'prevstate' field is
+ * set to something other than VDEV_STATE_UNKNOWN, it indicates
+ * that this is part of a vdev_reopen(). In this case, we don't
+ * want to post the ereport if the device was already in the
+ * CANT_OPEN state beforehand.
+ */
+ if (vd->vdev_prevstate != state && !vd->vdev_not_present &&
+ vd != vd->vdev_spa->spa_root_vdev) {
+ const char *class;
+
+ switch (aux) {
+ case VDEV_AUX_OPEN_FAILED:
+ class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
+ break;
+ case VDEV_AUX_CORRUPT_DATA:
+ class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
+ break;
+ case VDEV_AUX_NO_REPLICAS:
+ class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
+ break;
+ case VDEV_AUX_BAD_GUID_SUM:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
+ break;
+ case VDEV_AUX_TOO_SMALL:
+ class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
+ break;
+ case VDEV_AUX_BAD_LABEL:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
+ break;
+ default:
+ class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
+ }
+
+ zfs_ereport_post(class, vd->vdev_spa,
+ vd, NULL, save_state, 0);
+ }
+ }
+
+ if (isopen)
+ return;
+
+ if (vd->vdev_parent != NULL)
+ vdev_propagate_state(vd->vdev_parent);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
new file mode 100644
index 0000000..b4fb960
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
@@ -0,0 +1,394 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+/*
+ * Virtual device read-ahead caching.
+ *
+ * This file implements a simple LRU read-ahead cache. When the DMU reads
+ * a given block, it will often want other, nearby blocks soon thereafter.
+ * We take advantage of this by reading a larger disk region and caching
+ * the result.  With the default 64k cache block, this can in the best case
+ * turn 128 back-to-back 512-byte reads into a single 64k read followed by
+ * 127 cache hits; this reduces latency dramatically.  In the worst case, it
+ * can turn an isolated 512-byte read into a 64k read, which doesn't affect
+ * latency all that much but is
+ * terribly wasteful of bandwidth. A more intelligent version of the cache
+ * could keep track of access patterns and not do read-ahead unless it sees
+ * at least two temporally close I/Os to the same region. It could also
+ * take advantage of semantic information about the I/O. And it could use
+ * something faster than an AVL tree; that was chosen solely for convenience.
+ *
+ * There are five cache operations: allocate, fill, read, write, evict.
+ *
+ * (1) Allocate. This reserves a cache entry for the specified region.
+ * We separate the allocate and fill operations so that multiple threads
+ * don't generate I/O for the same cache miss.
+ *
+ * (2) Fill. When the I/O for a cache miss completes, the fill routine
+ * places the data in the previously allocated cache entry.
+ *
+ * (3) Read. Read data from the cache.
+ *
+ * (4) Write. Update cache contents after write completion.
+ *
+ * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry
+ * if the total cache size exceeds zfs_vdev_cache_size.
+ */
+
+/*
+ * These tunables are for performance analysis.
+ */
+/*
+ * All i/os smaller than zfs_vdev_cache_max will be turned into
+ * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
+ * track buffer).  At most zfs_vdev_cache_size bytes will be kept in each
+ * vdev's vdev_cache.
+ */
+int zfs_vdev_cache_max = 1<<14;
+int zfs_vdev_cache_size = 10ULL << 20;
+int zfs_vdev_cache_bshift = 16;
+
+SYSCTL_DECL(_vfs_zfs_vdev);
+SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
+TUNABLE_INT("vfs.zfs.vdev.cache.max", &zfs_vdev_cache_max);
+SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN,
+    &zfs_vdev_cache_max, 0, "Maximum I/O request size that increases read size");
+TUNABLE_INT("vfs.zfs.vdev.cache.size", &zfs_vdev_cache_size);
+SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN,
+ &zfs_vdev_cache_size, 0, "Size of VDEV cache");
+
+#define VCBS (1 << zfs_vdev_cache_bshift)
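+
+/*
+ * Geometry example (hypothetical offsets, default zfs_vdev_cache_bshift of
+ * 16, i.e. 64k cache blocks): a 2k read at offset 0x12800 maps to
+ *
+ *	cache_offset = P2ALIGN(0x12800, VCBS) = 0x10000
+ *	cache_phase  = P2PHASE(0x12800, VCBS) = 0x2800
+ *
+ * so a hit is served from bytes 0x2800..0x2fff of that 64k line.
+ */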
+
+static int
+vdev_cache_offset_compare(const void *a1, const void *a2)
+{
+ const vdev_cache_entry_t *ve1 = a1;
+ const vdev_cache_entry_t *ve2 = a2;
+
+ if (ve1->ve_offset < ve2->ve_offset)
+ return (-1);
+ if (ve1->ve_offset > ve2->ve_offset)
+ return (1);
+ return (0);
+}
+
+static int
+vdev_cache_lastused_compare(const void *a1, const void *a2)
+{
+ const vdev_cache_entry_t *ve1 = a1;
+ const vdev_cache_entry_t *ve2 = a2;
+
+ if (ve1->ve_lastused < ve2->ve_lastused)
+ return (-1);
+ if (ve1->ve_lastused > ve2->ve_lastused)
+ return (1);
+
+ /*
+ * Among equally old entries, sort by offset to ensure uniqueness.
+ */
+ return (vdev_cache_offset_compare(a1, a2));
+}
+
+/*
+ * Evict the specified entry from the cache.
+ */
+static void
+vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
+{
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+ ASSERT(ve->ve_fill_io == NULL);
+ ASSERT(ve->ve_data != NULL);
+
+ dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n",
+ vc, ve->ve_offset, ve->ve_lastused, lbolt - ve->ve_lastused,
+ ve->ve_hits, ve->ve_missed_update);
+
+ avl_remove(&vc->vc_lastused_tree, ve);
+ avl_remove(&vc->vc_offset_tree, ve);
+ zio_buf_free(ve->ve_data, VCBS);
+ kmem_free(ve, sizeof (vdev_cache_entry_t));
+}
+
+/*
+ * Allocate an entry in the cache.  At this point we don't have the data;
+ * we're just creating a placeholder so that multiple threads don't all
+ * go off and read the same blocks.
+ */
+static vdev_cache_entry_t *
+vdev_cache_allocate(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
+ vdev_cache_entry_t *ve;
+
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+
+ if (zfs_vdev_cache_size == 0)
+ return (NULL);
+
+ /*
+ * If adding a new entry would exceed the cache size,
+ * evict the oldest entry (LRU).
+ */
+ if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
+ zfs_vdev_cache_size) {
+ ve = avl_first(&vc->vc_lastused_tree);
+ if (ve->ve_fill_io != NULL) {
+ dprintf("can't evict in %p, still filling\n", vc);
+ return (NULL);
+ }
+ ASSERT(ve->ve_hits != 0);
+ vdev_cache_evict(vc, ve);
+ }
+
+ ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
+ ve->ve_offset = offset;
+ ve->ve_lastused = lbolt;
+ ve->ve_data = zio_buf_alloc(VCBS);
+
+ avl_add(&vc->vc_offset_tree, ve);
+ avl_add(&vc->vc_lastused_tree, ve);
+
+ return (ve);
+}
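+
+/*
+ * Capacity example (default tunables): with zfs_vdev_cache_size at 10MB and
+ * 64k lines, the check above -- avl_numnodes << 16 > 10 << 20 -- trips once
+ * the tree holds more than 160 entries, so each vdev caches at most about
+ * 160 read-ahead lines (10MB of data).
+ */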
+
+static void
+vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
+{
+ uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
+
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+ ASSERT(ve->ve_fill_io == NULL);
+
+ if (ve->ve_lastused != lbolt) {
+ avl_remove(&vc->vc_lastused_tree, ve);
+ ve->ve_lastused = lbolt;
+ avl_add(&vc->vc_lastused_tree, ve);
+ }
+
+ ve->ve_hits++;
+ bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
+}
+
+/*
+ * Fill a previously allocated cache entry with data.
+ */
+static void
+vdev_cache_fill(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve = zio->io_private;
+ zio_t *dio;
+
+ ASSERT(zio->io_size == VCBS);
+
+ /*
+ * Add data to the cache.
+ */
+ mutex_enter(&vc->vc_lock);
+
+ ASSERT(ve->ve_fill_io == zio);
+ ASSERT(ve->ve_offset == zio->io_offset);
+ ASSERT(ve->ve_data == zio->io_data);
+
+ ve->ve_fill_io = NULL;
+
+ /*
+ * Even if this cache line was invalidated by a missed write update,
+ * any reads that were queued up before the missed update are still
+ * valid, so we can satisfy them from this line before we evict it.
+ */
+ for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next)
+ vdev_cache_hit(vc, ve, dio);
+
+ if (zio->io_error || ve->ve_missed_update)
+ vdev_cache_evict(vc, ve);
+
+ mutex_exit(&vc->vc_lock);
+
+ while ((dio = zio->io_delegate_list) != NULL) {
+ zio->io_delegate_list = dio->io_delegate_next;
+ dio->io_delegate_next = NULL;
+ dio->io_error = zio->io_error;
+ zio_next_stage(dio);
+ }
+}
+
+/*
+ * Read data from the cache.  Returns 0 if the read was satisfied by (or
+ * delegated to) the cache; otherwise returns an errno and the caller must
+ * issue the I/O itself.
+ */
+int
+vdev_cache_read(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ vdev_cache_entry_t *ve, ve_search;
+ uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
+ uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
+ zio_t *fio;
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
+ return (EINVAL);
+
+ if (zio->io_size > zfs_vdev_cache_max)
+ return (EOVERFLOW);
+
+ /*
+ * If the I/O straddles two or more cache blocks, don't cache it.
+ */
+ if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS))
+ return (EXDEV);
+
+ ASSERT(cache_phase + zio->io_size <= VCBS);
+
+ mutex_enter(&vc->vc_lock);
+
+ ve_search.ve_offset = cache_offset;
+ ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);
+
+ if (ve != NULL) {
+ if (ve->ve_missed_update) {
+ mutex_exit(&vc->vc_lock);
+ return (ESTALE);
+ }
+
+ if ((fio = ve->ve_fill_io) != NULL) {
+ zio->io_delegate_next = fio->io_delegate_list;
+ fio->io_delegate_list = zio;
+ zio_vdev_io_bypass(zio);
+ mutex_exit(&vc->vc_lock);
+ return (0);
+ }
+
+ vdev_cache_hit(vc, ve, zio);
+ zio_vdev_io_bypass(zio);
+
+ mutex_exit(&vc->vc_lock);
+ zio_next_stage(zio);
+ return (0);
+ }
+
+ ve = vdev_cache_allocate(zio);
+
+ if (ve == NULL) {
+ mutex_exit(&vc->vc_lock);
+ return (ENOMEM);
+ }
+
+ fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
+ ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
+ vdev_cache_fill, ve);
+
+ ve->ve_fill_io = fio;
+ fio->io_delegate_list = zio;
+ zio_vdev_io_bypass(zio);
+
+ mutex_exit(&vc->vc_lock);
+ zio_nowait(fio);
+
+ return (0);
+}
+
+/*
+ * Update cache contents upon write completion.
+ */
+void
+vdev_cache_write(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ vdev_cache_entry_t *ve, ve_search;
+ uint64_t io_start = zio->io_offset;
+ uint64_t io_end = io_start + zio->io_size;
+ uint64_t min_offset = P2ALIGN(io_start, VCBS);
+ uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
+ avl_index_t where;
+
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+ mutex_enter(&vc->vc_lock);
+
+ ve_search.ve_offset = min_offset;
+ ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
+
+ if (ve == NULL)
+ ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
+
+ while (ve != NULL && ve->ve_offset < max_offset) {
+ uint64_t start = MAX(ve->ve_offset, io_start);
+ uint64_t end = MIN(ve->ve_offset + VCBS, io_end);
+
+ if (ve->ve_fill_io != NULL) {
+ ve->ve_missed_update = 1;
+ } else {
+ bcopy((char *)zio->io_data + start - io_start,
+ ve->ve_data + start - ve->ve_offset, end - start);
+ }
+ ve = AVL_NEXT(&vc->vc_offset_tree, ve);
+ }
+ mutex_exit(&vc->vc_lock);
+}
+
+void
+vdev_cache_init(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+
+ mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
+ sizeof (vdev_cache_entry_t),
+ offsetof(struct vdev_cache_entry, ve_offset_node));
+
+ avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
+ sizeof (vdev_cache_entry_t),
+ offsetof(struct vdev_cache_entry, ve_lastused_node));
+}
+
+void
+vdev_cache_fini(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve;
+
+ mutex_enter(&vc->vc_lock);
+ while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
+ vdev_cache_evict(vc, ve);
+ mutex_exit(&vc->vc_lock);
+
+ avl_destroy(&vc->vc_offset_tree);
+ avl_destroy(&vc->vc_lastused_tree);
+
+ mutex_destroy(&vc->vc_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
new file mode 100644
index 0000000..b965b1c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
@@ -0,0 +1,363 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/sunldi.h>
+
+/*
+ * Virtual device vector for disks.
+ */
+
+extern ldi_ident_t zfs_li;
+
+typedef struct vdev_disk_buf {
+ buf_t vdb_buf;
+ zio_t *vdb_io;
+} vdev_disk_buf_t;
+
+static int
+vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+ vdev_disk_t *dvd;
+ struct dk_minfo dkm;
+ int error;
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+ /*
+ * When opening a disk device, we want to preserve the user's original
+ * intent. We always want to open the device by the path the user gave
+	 * us, even if it is one of multiple paths to the same device.  But we
+ * also want to be able to survive disks being removed/recabled.
+ * Therefore the sequence of opening devices is:
+ *
+ * 1. Try opening the device by path. For legacy pools without the
+ * 'whole_disk' property, attempt to fix the path by appending 's0'.
+ *
+ * 2. If the devid of the device matches the stored value, return
+ * success.
+ *
+ * 3. Otherwise, the device may have moved. Try opening the device
+ * by the devid instead.
+ *
+ */
+ if (vd->vdev_devid != NULL) {
+ if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
+ &dvd->vd_minor) != 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+ }
+
+ error = EINVAL; /* presume failure */
+
+ if (vd->vdev_path != NULL) {
+ ddi_devid_t devid;
+
+ if (vd->vdev_wholedisk == -1ULL) {
+ size_t len = strlen(vd->vdev_path) + 3;
+ char *buf = kmem_alloc(len, KM_SLEEP);
+ ldi_handle_t lh;
+
+ (void) snprintf(buf, len, "%ss0", vd->vdev_path);
+
+ if (ldi_open_by_name(buf, spa_mode, kcred,
+ &lh, zfs_li) == 0) {
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = buf;
+ vd->vdev_wholedisk = 1ULL;
+ (void) ldi_close(lh, spa_mode, kcred);
+ } else {
+ kmem_free(buf, len);
+ }
+ }
+
+ error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred,
+ &dvd->vd_lh, zfs_li);
+
+ /*
+ * Compare the devid to the stored value.
+ */
+ if (error == 0 && vd->vdev_devid != NULL &&
+ ldi_get_devid(dvd->vd_lh, &devid) == 0) {
+ if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
+ error = EINVAL;
+ (void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+ dvd->vd_lh = NULL;
+ }
+ ddi_devid_free(devid);
+ }
+
+ /*
+ * If we succeeded in opening the device, but 'vdev_wholedisk'
+ * is not yet set, then this must be a slice.
+ */
+ if (error == 0 && vd->vdev_wholedisk == -1ULL)
+ vd->vdev_wholedisk = 0;
+ }
+
+ /*
+ * If we were unable to open by path, or the devid check fails, open by
+ * devid instead.
+ */
+ if (error != 0 && vd->vdev_devid != NULL)
+ error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
+ spa_mode, kcred, &dvd->vd_lh, zfs_li);
+
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ /*
+ * Determine the actual size of the device.
+ */
+ if (ldi_get_size(dvd->vd_lh, psize) != 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (EINVAL);
+ }
+
+ /*
+ * If we own the whole disk, try to enable disk write caching.
+ * We ignore errors because it's OK if we can't do it.
+ */
+ if (vd->vdev_wholedisk == 1) {
+ int wce = 1;
+ (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
+ FKIOCTL, kcred, NULL);
+ }
+
+ /*
+ * Determine the device's minimum transfer size.
+ * If the ioctl isn't supported, assume DEV_BSIZE.
+ */
+ if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm,
+ FKIOCTL, kcred, NULL) != 0)
+ dkm.dki_lbsize = DEV_BSIZE;
+
+ *ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1;
+
+ /*
+ * Clear the nowritecache bit, so that on a vdev_reopen() we will
+ * try again.
+ */
+ vd->vdev_nowritecache = B_FALSE;
+
+ return (0);
+}
+
+static void
+vdev_disk_close(vdev_t *vd)
+{
+ vdev_disk_t *dvd = vd->vdev_tsd;
+
+ if (dvd == NULL)
+ return;
+
+ dprintf("removing disk %s, devid %s\n",
+ vd->vdev_path ? vd->vdev_path : "<none>",
+ vd->vdev_devid ? vd->vdev_devid : "<none>");
+
+ if (dvd->vd_minor != NULL)
+ ddi_devid_str_free(dvd->vd_minor);
+
+ if (dvd->vd_devid != NULL)
+ ddi_devid_free(dvd->vd_devid);
+
+ if (dvd->vd_lh != NULL)
+ (void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+
+ kmem_free(dvd, sizeof (vdev_disk_t));
+ vd->vdev_tsd = NULL;
+}
+
+static void
+vdev_disk_io_intr(buf_t *bp)
+{
+ vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
+ zio_t *zio = vdb->vdb_io;
+
+ if ((zio->io_error = geterror(bp)) == 0 && bp->b_resid != 0)
+ zio->io_error = EIO;
+
+ kmem_free(vdb, sizeof (vdev_disk_buf_t));
+
+ zio_next_stage_async(zio);
+}
+
+static void
+vdev_disk_ioctl_done(void *zio_arg, int error)
+{
+ zio_t *zio = zio_arg;
+
+ zio->io_error = error;
+
+ zio_next_stage_async(zio);
+}
+
+static void
+vdev_disk_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_disk_t *dvd = vd->vdev_tsd;
+ vdev_disk_buf_t *vdb;
+ buf_t *bp;
+ int flags, error;
+
+ if (zio->io_type == ZIO_TYPE_IOCTL) {
+ zio_vdev_io_bypass(zio);
+
+ /* XXPOLICY */
+ if (vdev_is_dead(vd)) {
+ zio->io_error = ENXIO;
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+
+ case DKIOCFLUSHWRITECACHE:
+
+ if (zfs_nocacheflush)
+ break;
+
+ if (vd->vdev_nowritecache) {
+ zio->io_error = ENOTSUP;
+ break;
+ }
+
+ zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done;
+ zio->io_dk_callback.dkc_cookie = zio;
+
+ error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
+ (uintptr_t)&zio->io_dk_callback,
+ FKIOCTL, kcred, NULL);
+
+ if (error == 0) {
+ /*
+				 * The ioctl will be done asynchronously,
+ * and will call vdev_disk_ioctl_done()
+ * upon completion.
+ */
+ return;
+ } else if (error == ENOTSUP) {
+ /*
+ * If we get ENOTSUP, we know that no future
+ * attempts will ever succeed. In this case we
+ * set a persistent bit so that we don't bother
+ * with the ioctl in the future.
+ */
+ vd->vdev_nowritecache = B_TRUE;
+ }
+ zio->io_error = error;
+
+ break;
+
+ default:
+ zio->io_error = ENOTSUP;
+ }
+
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
+ return;
+
+ if ((zio = vdev_queue_io(zio)) == NULL)
+ return;
+
+ flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
+ flags |= B_BUSY | B_NOCACHE;
+ if (zio->io_flags & ZIO_FLAG_FAILFAST)
+ flags |= B_FAILFAST;
+
+ vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);
+
+ vdb->vdb_io = zio;
+ bp = &vdb->vdb_buf;
+
+ bioinit(bp);
+ bp->b_flags = flags;
+ bp->b_bcount = zio->io_size;
+ bp->b_un.b_addr = zio->io_data;
+ bp->b_lblkno = lbtodb(zio->io_offset);
+ bp->b_bufsize = zio->io_size;
+ bp->b_iodone = (int (*)())vdev_disk_io_intr;
+
+ /* XXPOLICY */
+ error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
+ if (error) {
+ zio->io_error = error;
+ bioerror(bp, error);
+ bp->b_resid = bp->b_bcount;
+ bp->b_iodone(bp);
+ return;
+ }
+
+ error = ldi_strategy(dvd->vd_lh, bp);
+ /* ldi_strategy() will return non-zero only on programming errors */
+ ASSERT(error == 0);
+}
+
+static void
+vdev_disk_io_done(zio_t *zio)
+{
+ vdev_queue_io_done(zio);
+
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ vdev_cache_write(zio);
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+
+ zio_next_stage(zio);
+}
+
+vdev_ops_t vdev_disk_ops = {
+ vdev_disk_open,
+ vdev_disk_close,
+ vdev_default_asize,
+ vdev_disk_io_start,
+ vdev_disk_io_done,
+ NULL,
+ VDEV_TYPE_DISK, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
new file mode 100644
index 0000000..b8e79f8
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
@@ -0,0 +1,225 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for files.
+ */
+
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+ vdev_file_t *vf;
+ vnode_t *vp;
+ vattr_t vattr;
+ int error;
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+
+ /*
+ * We always open the files from the root of the global zone, even if
+ * we're in a local zone. If the user has gotten to this point, the
+ * administrator has already decided that the pool should be available
+ * to local zone users, so the underlying devices should be as well.
+ */
+ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+ error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode | FOFFMAX,
+ 0, &vp, 0, 0, rootdir);
+
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ vf->vf_vnode = vp;
+
+#ifdef _KERNEL
+ /*
+ * Make sure it's a regular file.
+ */
+ if (vp->v_type != VREG) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (ENODEV);
+ }
+#endif
+
+ /*
+ * Determine the physical size of the file.
+ */
+ vattr.va_mask = AT_SIZE;
+ error = VOP_GETATTR(vp, &vattr, 0, kcred);
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
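+	/*
+	 * A plain file has no native sector size, so advertise the
+	 * 512-byte (SPA_MINBLOCKSHIFT) minimum alignment for ashift.
+	 */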
+ *psize = vattr.va_size;
+ *ashift = SPA_MINBLOCKSHIFT;
+
+ return (0);
+}
+
+static void
+vdev_file_close(vdev_t *vd)
+{
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (vf == NULL)
+ return;
+
+ if (vf->vf_vnode != NULL) {
+ (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred);
+ (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred);
+ VN_RELE(vf->vf_vnode);
+ }
+
+ kmem_free(vf, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
+}
+
+static void
+vdev_file_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf = vd->vdev_tsd;
+ ssize_t resid;
+ int error;
+
+ if (zio->io_type == ZIO_TYPE_IOCTL) {
+ zio_vdev_io_bypass(zio);
+
+ /* XXPOLICY */
+ if (vdev_is_dead(vd)) {
+ zio->io_error = ENXIO;
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+ zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
+ kcred);
+ dprintf("fsync(%s) = %d\n", vdev_description(vd),
+ zio->io_error);
+ break;
+ default:
+ zio->io_error = ENOTSUP;
+ }
+
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ /*
+ * In the kernel, don't bother double-caching, but in userland,
+ * we want to test the vdev_cache code.
+ */
+#ifndef _KERNEL
+ if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
+ return;
+#endif
+
+ if ((zio = vdev_queue_io(zio)) == NULL)
+ return;
+
+ /* XXPOLICY */
+ error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
+ if (error) {
+ zio->io_error = error;
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
+ UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
+ zio->io_size, zio->io_offset, UIO_SYSSPACE,
+ 0, RLIM64_INFINITY, kcred, &resid);
+
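+	/*
+	 * vn_rdwr() sets 'resid' to the number of bytes it could not
+	 * transfer.  A short transfer with no error most likely means we
+	 * ran past the end of the file, so report it as ENOSPC.
+	 */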
+ if (resid != 0 && zio->io_error == 0)
+ zio->io_error = ENOSPC;
+
+ zio_next_stage_async(zio);
+}
+
+static void
+vdev_file_io_done(zio_t *zio)
+{
+ vdev_queue_io_done(zio);
+
+#ifndef _KERNEL
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ vdev_cache_write(zio);
+#endif
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+
+ zio_next_stage(zio);
+}
+
+vdev_ops_t vdev_file_ops = {
+ vdev_file_open,
+ vdev_file_close,
+ vdev_default_asize,
+ vdev_file_io_start,
+ vdev_file_io_done,
+ NULL,
+ VDEV_TYPE_FILE, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
+
+/*
+ * From userland we access disks just like files.
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {
+ vdev_file_open,
+ vdev_file_close,
+ vdev_default_asize,
+ vdev_file_io_start,
+ vdev_file_io_done,
+ NULL,
+ VDEV_TYPE_DISK, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
+
+#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
new file mode 100644
index 0000000..9699171
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
@@ -0,0 +1,432 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <geom/geom.h>
+
+/*
+ * Virtual device vector for GEOM.
+ */
+
+struct g_class zfs_vdev_class = {
+ .name = "ZFS::VDEV",
+ .version = G_VERSION,
+};
+
+DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
+
+typedef struct vdev_geom_ctx {
+ struct g_consumer *gc_consumer;
+ int gc_state;
+ struct bio_queue_head gc_queue;
+ struct mtx gc_queue_mtx;
+} vdev_geom_ctx_t;
+
+static void
+vdev_geom_release(vdev_t *vd)
+{
+ vdev_geom_ctx_t *ctx;
+
+ ctx = vd->vdev_tsd;
+ vd->vdev_tsd = NULL;
+
+ mtx_lock(&ctx->gc_queue_mtx);
+ ctx->gc_state = 1;
+ wakeup_one(&ctx->gc_queue);
+ while (ctx->gc_state != 2)
+ msleep(&ctx->gc_state, &ctx->gc_queue_mtx, 0, "vgeom:w", 0);
+ mtx_unlock(&ctx->gc_queue_mtx);
+ mtx_destroy(&ctx->gc_queue_mtx);
+ kmem_free(ctx, sizeof(*ctx));
+}
+
+static void
+vdev_geom_orphan(struct g_consumer *cp)
+{
+ struct g_geom *gp;
+ vdev_t *vd;
+ int error;
+
+ g_topology_assert();
+
+ vd = cp->private;
+ gp = cp->geom;
+ error = cp->provider->error;
+
+ ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
+ g_access(cp, -cp->acr, -cp->acw, -cp->ace);
+ g_detach(cp);
+ ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
+ g_destroy_consumer(cp);
+ /* Destroy geom if there are no consumers left. */
+ if (LIST_EMPTY(&gp->consumer)) {
+ ZFS_LOG(1, "Destroyed geom %s.", gp->name);
+		/* 'cp' has just been destroyed; use the saved 'gp'. */
+		g_wither_geom(gp, error);
+ }
+ vdev_geom_release(vd);
+	/* Both methods below work, but in slightly different ways. */
+#if 0
+ vd->vdev_reopen_wanted = 1;
+#else
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux);
+#endif
+}
+
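+/*
+ * Attach a consumer to the given provider.  An existing, non-withering
+ * geom is reused when one exists, and an existing consumer for the same
+ * provider is reused when possible; in the reuse case the extra
+ * g_access() reference keeps the provider open across openers.
+ */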
+static struct g_consumer *
+vdev_geom_attach(struct g_provider *pp, int write)
+{
+ struct g_geom *gp;
+ struct g_consumer *cp;
+
+ g_topology_assert();
+
+ ZFS_LOG(1, "Attaching to %s.", pp->name);
+ /* Do we have geom already? No? Create one. */
+ LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
+ if (!(gp->flags & G_GEOM_WITHER))
+ break;
+ }
+ if (gp == NULL) {
+ gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
+ gp->orphan = vdev_geom_orphan;
+ cp = g_new_consumer(gp);
+ if (g_attach(cp, pp) != 0) {
+ g_wither_geom(gp, ENXIO);
+ return (NULL);
+ }
+ if (g_access(cp, 1, write, 1) != 0) {
+ g_wither_geom(gp, ENXIO);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
+ } else {
+ /* Check if we are already connected to this provider. */
+ LIST_FOREACH(cp, &gp->consumer, consumer) {
+ if (cp->provider == pp) {
+ ZFS_LOG(1, "Found consumer for %s.", pp->name);
+ break;
+ }
+ }
+ if (cp == NULL) {
+ cp = g_new_consumer(gp);
+ if (g_attach(cp, pp) != 0) {
+ g_destroy_consumer(cp);
+ return (NULL);
+ }
+ if (g_access(cp, 1, write, 1) != 0) {
+ g_detach(cp);
+ g_destroy_consumer(cp);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Created consumer for %s.", pp->name);
+ } else {
+ if (g_access(cp, 1, cp->acw > 0 ? 0 : write, 1) != 0)
+ return (NULL);
+ ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
+ }
+ }
+ return (cp);
+}
+
+static void
+vdev_geom_detach(void *arg, int flag __unused)
+{
+ struct g_geom *gp;
+ struct g_consumer *cp;
+
+ g_topology_assert();
+ cp = arg;
+ gp = cp->geom;
+
+ ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
+ g_access(cp, -1, 0, -1);
+ /* Destroy consumer on last close. */
+ if (cp->acr == 0 && cp->ace == 0) {
+ ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
+ if (cp->acw > 0)
+ g_access(cp, 0, -cp->acw, 0);
+ g_detach(cp);
+ g_destroy_consumer(cp);
+ }
+ /* Destroy geom if there are no consumers left. */
+ if (LIST_EMPTY(&gp->consumer)) {
+ ZFS_LOG(1, "Destroyed geom %s.", gp->name);
+ g_wither_geom(gp, ENXIO);
+ }
+}
+
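+/*
+ * Per-vdev worker thread (created in vdev_geom_open() below).  Completed
+ * bios are queued by vdev_geom_io_intr() and finished here, presumably to
+ * keep zio completion work out of GEOM's g_up completion thread.
+ */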
+static void
+vdev_geom_worker(void *arg)
+{
+ vdev_geom_ctx_t *ctx;
+ zio_t *zio;
+ struct bio *bp;
+
+ ctx = arg;
+ for (;;) {
+ mtx_lock(&ctx->gc_queue_mtx);
+ bp = bioq_takefirst(&ctx->gc_queue);
+ if (bp == NULL) {
+ if (ctx->gc_state == 1) {
+ ctx->gc_state = 2;
+ wakeup_one(&ctx->gc_state);
+ mtx_unlock(&ctx->gc_queue_mtx);
+ kthread_exit(0);
+ }
+ msleep(&ctx->gc_queue, &ctx->gc_queue_mtx,
+ PRIBIO | PDROP, "vgeom:io", 0);
+ continue;
+ }
+ mtx_unlock(&ctx->gc_queue_mtx);
+ zio = bp->bio_caller1;
+ zio->io_error = bp->bio_error;
+ if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == ENOTSUP) {
+ vdev_t *vd;
+
+ /*
+ * If we get ENOTSUP, we know that no future
+ * attempts will ever succeed. In this case we
+ * set a persistent bit so that we don't bother
+ * with the ioctl in the future.
+ */
+ vd = zio->io_vd;
+ vd->vdev_nowritecache = B_TRUE;
+ }
+ g_destroy_bio(bp);
+ zio_next_stage_async(zio);
+ }
+}
+
+static int
+vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+ vdev_geom_ctx_t *ctx;
+ struct g_provider *pp;
+ struct g_consumer *cp;
+ int owned;
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ if ((owned = mtx_owned(&Giant)))
+ mtx_unlock(&Giant);
+ g_topology_lock();
+ pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
+ if (pp == NULL) {
+ g_topology_unlock();
+ if (owned)
+ mtx_lock(&Giant);
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+ cp = vdev_geom_attach(pp, !!(spa_mode & FWRITE));
+ g_topology_unlock();
+ if (owned)
+ mtx_lock(&Giant);
+ if (cp == NULL) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (EACCES);
+ }
+
+ /*
+ * Determine the actual size of the device.
+ */
+ *psize = pp->mediasize;
+
+ /*
+ * Determine the device's minimum transfer size.
+ */
+ *ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
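+	/* e.g. a 512-byte sector gives highbit(512) - 1 == 9, i.e. 2^9. */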
+
+ /*
+ * Clear the nowritecache bit, so that on a vdev_reopen() we will
+ * try again.
+ */
+ vd->vdev_nowritecache = B_FALSE;
+
+ cp->private = vd;
+
+ ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP);
+ bioq_init(&ctx->gc_queue);
+ mtx_init(&ctx->gc_queue_mtx, "zfs:vdev:geom:queue", NULL, MTX_DEF);
+ ctx->gc_consumer = cp;
+ ctx->gc_state = 0;
+
+ vd->vdev_tsd = ctx;
+
+ kthread_create(vdev_geom_worker, ctx, NULL, 0, 0, "vdev:worker %s",
+ pp->name);
+
+ return (0);
+}
+
+static void
+vdev_geom_close(vdev_t *vd)
+{
+ vdev_geom_ctx_t *ctx;
+ struct g_consumer *cp;
+
+ if ((ctx = vd->vdev_tsd) == NULL)
+ return;
+ if ((cp = ctx->gc_consumer) == NULL)
+ return;
+ vdev_geom_release(vd);
+ g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL);
+}
+
+static void
+vdev_geom_io_intr(struct bio *bp)
+{
+ vdev_geom_ctx_t *ctx;
+ zio_t *zio;
+
+ zio = bp->bio_caller1;
+ ctx = zio->io_vd->vdev_tsd;
+
+ mtx_lock(&ctx->gc_queue_mtx);
+ bioq_insert_tail(&ctx->gc_queue, bp);
+ wakeup_one(&ctx->gc_queue);
+ mtx_unlock(&ctx->gc_queue_mtx);
+}
+
+static void
+vdev_geom_io_start(zio_t *zio)
+{
+ vdev_t *vd;
+ vdev_geom_ctx_t *ctx;
+ struct g_consumer *cp;
+ struct bio *bp;
+ int error;
+
+ cp = NULL;
+
+ vd = zio->io_vd;
+ ctx = vd->vdev_tsd;
+ if (ctx != NULL)
+ cp = ctx->gc_consumer;
+
+ if (zio->io_type == ZIO_TYPE_IOCTL) {
+ zio_vdev_io_bypass(zio);
+
+ /* XXPOLICY */
+ if (vdev_is_dead(vd)) {
+ zio->io_error = ENXIO;
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+
+ case DKIOCFLUSHWRITECACHE:
+ if (vd->vdev_nowritecache) {
+ zio->io_error = ENOTSUP;
+ break;
+ }
+
+ goto sendreq;
+ default:
+ zio->io_error = ENOTSUP;
+ }
+
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
+ return;
+
+ if ((zio = vdev_queue_io(zio)) == NULL)
+ return;
+
+sendreq:
+
+ error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
+ if (error == 0 && cp == NULL)
+ error = ENXIO;
+ if (error) {
+ zio->io_error = error;
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ bp = g_alloc_bio();
+ bp->bio_caller1 = zio;
+ switch (zio->io_type) {
+ case ZIO_TYPE_READ:
+ case ZIO_TYPE_WRITE:
+		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ?
+		    BIO_READ : BIO_WRITE;
+ bp->bio_data = zio->io_data;
+ bp->bio_offset = zio->io_offset;
+ bp->bio_length = zio->io_size;
+ break;
+ case ZIO_TYPE_IOCTL:
+ bp->bio_cmd = BIO_FLUSH;
+ bp->bio_data = NULL;
+ bp->bio_offset = cp->provider->mediasize;
+ bp->bio_length = 0;
+ break;
+ }
+ bp->bio_done = vdev_geom_io_intr;
+
+ g_io_request(bp, cp);
+}
+
+static void
+vdev_geom_io_done(zio_t *zio)
+{
+ vdev_queue_io_done(zio);
+
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ vdev_cache_write(zio);
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+
+ zio_next_stage(zio);
+}
+
+vdev_ops_t vdev_geom_ops = {
+ vdev_geom_open,
+ vdev_geom_close,
+ vdev_default_asize,
+ vdev_geom_io_start,
+ vdev_geom_io_done,
+ NULL,
+ VDEV_TYPE_DISK, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
new file mode 100644
index 0000000..9d9f555
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
@@ -0,0 +1,1011 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Virtual Device Labels
+ * ---------------------
+ *
+ * The vdev label serves several distinct purposes:
+ *
+ * 1. Uniquely identify this device as part of a ZFS pool and confirm its
+ * identity within the pool.
+ *
+ * 2. Verify that all the devices given in a configuration are present
+ * within the pool.
+ *
+ * 3. Determine the uberblock for the pool.
+ *
+ * 4. In case of an import operation, determine the configuration of the
+ *    top-level vdev of which it is a part.
+ *
+ * 5. If an import operation cannot find all the devices in the pool,
+ * provide enough information to the administrator to determine which
+ * devices are missing.
+ *
+ * It is important to note that while the kernel is responsible for writing the
+ * label, it only consumes the information in the first three cases. The
+ * latter information is only consumed in userland when determining the
+ * configuration to import a pool.
+ *
+ *
+ * Label Organization
+ * ------------------
+ *
+ * Before describing the contents of the label, it's important to understand how
+ * the labels are written and updated with respect to the uberblock.
+ *
+ * When the pool configuration is altered, either because it was newly created
+ * or a device was added, we want to update all the labels such that we can deal
+ * with fatal failure at any point. To this end, each disk has two labels which
+ * are updated before and after the uberblock is synced. Assuming we have
+ * labels and an uberblock with the following transaction groups:
+ *
+ * L1 UB L2
+ * +------+ +------+ +------+
+ * | | | | | |
+ * | t10 | | t10 | | t10 |
+ * | | | | | |
+ * +------+ +------+ +------+
+ *
+ * In this stable state, the labels and the uberblock were all updated within
+ * the same transaction group (10). Each label is mirrored and checksummed, so
+ * that we can detect when we fail partway through writing the label.
+ *
+ * In order to identify which labels are valid, the labels are written in the
+ * following manner:
+ *
+ * 1. For each vdev, update 'L1' to the new label
+ * 2. Update the uberblock
+ * 3. For each vdev, update 'L2' to the new label
+ *
+ * Given arbitrary failure, we can determine the correct label to use based on
+ * the transaction group. If we fail after updating L1 but before updating the
+ * UB, we will notice that L1's transaction group is greater than the uberblock,
+ * so L2 must be valid. If we fail after writing the uberblock but before
+ * writing L2, we will notice that L2's transaction group is less than L1, and
+ * therefore L1 is valid.
+ *
+ * Another added complexity is that not every label is updated when the config
+ * is synced. If we add a single device, we do not want to have to re-write
+ * every label for every device in the pool. This means that both L1 and L2 may
+ * be older than the pool uberblock, because the necessary information is stored
+ * on another vdev.
+ *
+ *
+ * On-disk Format
+ * --------------
+ *
+ * The vdev label consists of two distinct parts and is wrapped within the
+ * vdev_label_t structure.  The label begins with 8k of padding to permit
+ * legacy VTOC disk labels; the padding is otherwise ignored.
+ *
+ * The first half of the label is a packed nvlist which contains pool wide
+ * properties, per-vdev properties, and configuration information. It is
+ * described in more detail below.
+ *
+ * The latter half of the label consists of a redundant array of uberblocks.
+ * These uberblocks are updated whenever a transaction group is committed,
+ * or when the configuration is updated. When a pool is loaded, we scan each
+ * vdev for the 'best' uberblock.
+ *
+ *
+ * Configuration Information
+ * -------------------------
+ *
+ * The nvlist describing the pool and vdev contains the following elements:
+ *
+ * version ZFS on-disk version
+ * name Pool name
+ * state Pool state
+ * txg Transaction group in which this label was written
+ * pool_guid Unique identifier for this pool
+ * vdev_tree An nvlist describing vdev tree.
+ *
+ * Each leaf device label also contains the following:
+ *
+ * top_guid Unique ID for top-level vdev in which this is contained
+ * guid Unique ID for the leaf vdev
+ *
+ * The 'vs' configuration follows the format described in 'spa_config.c'.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Basic routines to read and write from a vdev label.
+ * Used throughout the rest of this file.
+ */
+uint64_t
+vdev_label_offset(uint64_t psize, int l, uint64_t offset)
+{
+ ASSERT(offset < sizeof (vdev_label_t));
+
+ return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
+ 0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
+}
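+
+/*
+ * The computation above yields, for example, with the usual on-disk
+ * constants (VDEV_LABELS == 4, sizeof (vdev_label_t) == 256K):
+ *
+ *	l = 0:	offset + 0
+ *	l = 1:	offset + 256K
+ *	l = 2:	offset + psize - 512K
+ *	l = 3:	offset + psize - 256K
+ *
+ * i.e. two labels at the front of the device and two at the end.
+ */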
+
+static void
+vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+ uint64_t size, zio_done_func_t *done, void *private)
+{
+ ASSERT(vd->vdev_children == 0);
+
+ zio_nowait(zio_read_phys(zio, vd,
+ vdev_label_offset(vd->vdev_psize, l, offset),
+ size, buf, ZIO_CHECKSUM_LABEL, done, private,
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
+}
+
+static void
+vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+ uint64_t size, zio_done_func_t *done, void *private)
+{
+ ASSERT(vd->vdev_children == 0);
+
+ zio_nowait(zio_write_phys(zio, vd,
+ vdev_label_offset(vd->vdev_psize, l, offset),
+ size, buf, ZIO_CHECKSUM_LABEL, done, private,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL));
+}
+
+/*
+ * Generate the nvlist representing this vdev's config.
+ */
+nvlist_t *
+vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
+ boolean_t isspare)
+{
+ nvlist_t *nv = NULL;
+
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
+ vd->vdev_ops->vdev_op_type) == 0);
+ if (!isspare)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
+ == 0);
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
+
+ if (vd->vdev_path != NULL)
+ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
+ vd->vdev_path) == 0);
+
+ if (vd->vdev_devid != NULL)
+ VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID,
+ vd->vdev_devid) == 0);
+
+ if (vd->vdev_nparity != 0) {
+ ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
+ VDEV_TYPE_RAIDZ) == 0);
+
+ /*
+ * Make sure someone hasn't managed to sneak a fancy new vdev
+ * into a crufty old storage pool.
+ */
+ ASSERT(vd->vdev_nparity == 1 ||
+ (vd->vdev_nparity == 2 &&
+ spa_version(spa) >= ZFS_VERSION_RAID6));
+
+ /*
+ * Note that we'll add the nparity tag even on storage pools
+ * that only support a single parity device -- older software
+ * will just ignore it.
+ */
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY,
+ vd->vdev_nparity) == 0);
+ }
+
+ if (vd->vdev_wholedisk != -1ULL)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ vd->vdev_wholedisk) == 0);
+
+ if (vd->vdev_not_present)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0);
+
+ if (vd->vdev_isspare)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
+
+ if (!isspare && vd == vd->vdev_top) {
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ vd->vdev_ms_array) == 0);
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ vd->vdev_ms_shift) == 0);
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT,
+ vd->vdev_ashift) == 0);
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
+ vd->vdev_asize) == 0);
+ }
+
+ if (vd->vdev_dtl.smo_object != 0)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
+ vd->vdev_dtl.smo_object) == 0);
+
+ if (getstats) {
+ vdev_stat_t vs;
+ vdev_get_stats(vd, &vs);
+ VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
+ (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ nvlist_t **child;
+ int c;
+
+ child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
+ KM_SLEEP);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ child[c] = vdev_config_generate(spa, vd->vdev_child[c],
+ getstats, isspare);
+
+ VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ child, vd->vdev_children) == 0);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ nvlist_free(child[c]);
+
+ kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
+
+ } else {
+ if (vd->vdev_offline && !vd->vdev_tmpoffline)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
+ B_TRUE) == 0);
+ else
+ (void) nvlist_remove(nv, ZPOOL_CONFIG_OFFLINE,
+ DATA_TYPE_UINT64);
+ }
+
+ return (nv);
+}
+
+nvlist_t *
+vdev_label_read_config(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvlist_t *config = NULL;
+ vdev_phys_t *vp;
+ zio_t *zio;
+ int l;
+
+ ASSERT(spa_config_held(spa, RW_READER));
+
+ if (vdev_is_dead(vd))
+ return (NULL);
+
+ vp = zio_buf_alloc(sizeof (vdev_phys_t));
+
+ for (l = 0; l < VDEV_LABELS; l++) {
+
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CONFIG_HELD);
+
+ vdev_label_read(zio, vd, l, vp,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t), NULL, NULL);
+
+ if (zio_wait(zio) == 0 &&
+ nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
+ &config, 0) == 0)
+ break;
+
+ if (config != NULL) {
+ nvlist_free(config);
+ config = NULL;
+ }
+ }
+
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+
+ return (config);
+}
+
+/*
+ * Determine if a device is in use. The 'spare_guid' parameter will be filled
+ * in with the device guid if this spare is active elsewhere on the system.
+ */
+static boolean_t
+vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
+ uint64_t *spare_guid)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t state, pool_guid, device_guid, txg, spare_pool;
+ uint64_t vdtxg = 0;
+ nvlist_t *label;
+
+ if (spare_guid)
+ *spare_guid = 0ULL;
+
+ /*
+ * Read the label, if any, and perform some basic sanity checks.
+ */
+ if ((label = vdev_label_read_config(vd)) == NULL)
+ return (B_FALSE);
+
+ (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
+ &vdtxg);
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0 ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
+ &device_guid) != 0) {
+ nvlist_free(label);
+ return (B_FALSE);
+ }
+
+ if (state != POOL_STATE_SPARE &&
+ (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+ &pool_guid) != 0 ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
+ &txg) != 0)) {
+ nvlist_free(label);
+ return (B_FALSE);
+ }
+
+ nvlist_free(label);
+
+ /*
+ * Check to see if this device indeed belongs to the pool it claims to
+ * be a part of. The only way this is allowed is if the device is a hot
+ * spare (which we check for later on).
+ */
+ if (state != POOL_STATE_SPARE &&
+ !spa_guid_exists(pool_guid, device_guid) &&
+ !spa_spare_exists(device_guid, NULL))
+ return (B_FALSE);
+
+ /*
+	 * If the transaction group is zero, then this is an initialized (but
+ * unused) label. This is only an error if the create transaction
+ * on-disk is the same as the one we're using now, in which case the
+ * user has attempted to add the same vdev multiple times in the same
+ * transaction.
+ */
+ if (state != POOL_STATE_SPARE && txg == 0 && vdtxg == crtxg)
+ return (B_TRUE);
+
+ /*
+ * Check to see if this is a spare device. We do an explicit check for
+ * spa_has_spare() here because it may be on our pending list of spares
+ * to add.
+ */
+ if (spa_spare_exists(device_guid, &spare_pool) ||
+ spa_has_spare(spa, device_guid)) {
+ if (spare_guid)
+ *spare_guid = device_guid;
+
+ switch (reason) {
+ case VDEV_LABEL_CREATE:
+ return (B_TRUE);
+
+ case VDEV_LABEL_REPLACE:
+ return (!spa_has_spare(spa, device_guid) ||
+ spare_pool != 0ULL);
+
+ case VDEV_LABEL_SPARE:
+ return (spa_has_spare(spa, device_guid));
+ }
+ }
+
+ /*
+ * If the device is marked ACTIVE, then this device is in use by another
+ * pool on the system.
+ */
+ return (state == POOL_STATE_ACTIVE);
+}
+
+/*
+ * Initialize a vdev label. We check to make sure each leaf device is not in
+ * use and is writable.  We put down an initial label which we will later
+ * overwrite with a complete label. Note that it's important to do this
+ * sequentially, not in parallel, so that we catch cases of multiple use of the
+ * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
+ * itself.
+ */
+int
+vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ vdev_boot_header_t *vb;
+ uberblock_t *ub;
+ zio_t *zio;
+ int l, c, n;
+ char *buf;
+ size_t buflen;
+ int error;
+ uint64_t spare_guid;
+
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if ((error = vdev_label_init(vd->vdev_child[c],
+ crtxg, reason)) != 0)
+ return (error);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (0);
+
+ /*
+ * Dead vdevs cannot be initialized.
+ */
+ if (vdev_is_dead(vd))
+ return (EIO);
+
+ /*
+ * Determine if the vdev is in use.
+ */
+ if (reason != VDEV_LABEL_REMOVE &&
+ vdev_inuse(vd, crtxg, reason, &spare_guid))
+ return (EBUSY);
+
+ ASSERT(reason != VDEV_LABEL_REMOVE ||
+ vdev_inuse(vd, crtxg, reason, NULL));
+
+ /*
+ * If this is a request to add or replace a spare that is in use
+ * elsewhere on the system, then we must update the guid (which was
+ * initialized to a random value) to reflect the actual GUID (which is
+ * shared between multiple pools).
+ */
+ if (reason != VDEV_LABEL_REMOVE && spare_guid != 0ULL) {
+ vdev_t *pvd = vd->vdev_parent;
+
+ for (; pvd != NULL; pvd = pvd->vdev_parent) {
+ pvd->vdev_guid_sum -= vd->vdev_guid;
+ pvd->vdev_guid_sum += spare_guid;
+ }
+
+ vd->vdev_guid = vd->vdev_guid_sum = spare_guid;
+
+ /*
+ * If this is a replacement, then we want to fallthrough to the
+ * rest of the code. If we're adding a spare, then it's already
+ * labelled appropriately and we can just return.
+ */
+ if (reason == VDEV_LABEL_SPARE)
+ return (0);
+ ASSERT(reason == VDEV_LABEL_REPLACE);
+ }
+
+ /*
+ * Initialize its label.
+ */
+ vp = zio_buf_alloc(sizeof (vdev_phys_t));
+ bzero(vp, sizeof (vdev_phys_t));
+
+ /*
+ * Generate a label describing the pool and our top-level vdev.
+ * We mark it as being from txg 0 to indicate that it's not
+ * really part of an active pool just yet. The labels will
+ * be written again with a meaningful txg by spa_sync().
+ */
+ if (reason == VDEV_LABEL_SPARE ||
+ (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) {
+ /*
+ * For inactive hot spares, we generate a special label that
+	 * identifies it as a mutually shared hot spare. We write the
+ * label if we are adding a hot spare, or if we are removing an
+ * active hot spare (in which case we want to revert the
+ * labels).
+ */
+ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ POOL_STATE_SPARE) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ } else {
+ label = spa_config_generate(spa, vd, 0ULL, B_FALSE);
+
+ /*
+ * Add our creation time. This allows us to detect multiple
+		 * vdev uses as described above, and the label automatically
+		 * expires if the creating transaction never commits.
+ */
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
+ crtxg) == 0);
+ }
+
+ buf = vp->vp_nvlist;
+ buflen = sizeof (vp->vp_nvlist);
+
+ error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
+ if (error != 0) {
+ nvlist_free(label);
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+ /* EFAULT means nvlist_pack ran out of room */
+ return (error == EFAULT ? ENAMETOOLONG : EINVAL);
+ }
+
+ /*
+ * Initialize boot block header.
+ */
+ vb = zio_buf_alloc(sizeof (vdev_boot_header_t));
+ bzero(vb, sizeof (vdev_boot_header_t));
+ vb->vb_magic = VDEV_BOOT_MAGIC;
+ vb->vb_version = VDEV_BOOT_VERSION;
+ vb->vb_offset = VDEV_BOOT_OFFSET;
+ vb->vb_size = VDEV_BOOT_SIZE;
+
+ /*
+ * Initialize uberblock template.
+ */
+ ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
+ bzero(ub, VDEV_UBERBLOCK_SIZE(vd));
+ *ub = spa->spa_uberblock;
+ ub->ub_txg = 0;
+
+ /*
+ * Write everything in parallel.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+
+ for (l = 0; l < VDEV_LABELS; l++) {
+
+ vdev_label_write(zio, vd, l, vp,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t), NULL, NULL);
+
+ vdev_label_write(zio, vd, l, vb,
+ offsetof(vdev_label_t, vl_boot_header),
+ sizeof (vdev_boot_header_t), NULL, NULL);
+
+ for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+ vdev_label_write(zio, vd, l, ub,
+ VDEV_UBERBLOCK_OFFSET(vd, n),
+ VDEV_UBERBLOCK_SIZE(vd), NULL, NULL);
+ }
+ }
+
+ error = zio_wait(zio);
+
+ nvlist_free(label);
+ zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
+ zio_buf_free(vb, sizeof (vdev_boot_header_t));
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+
+ /*
+ * If this vdev hasn't been previously identified as a spare, then we
+ * mark it as such only if a) we are labelling it as a spare, or b) it
+ * exists as a spare elsewhere in the system.
+ */
+ if (error == 0 && !vd->vdev_isspare &&
+ (reason == VDEV_LABEL_SPARE ||
+ spa_spare_exists(vd->vdev_guid, NULL)))
+ spa_spare_add(vd);
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * uberblock load/sync
+ * ==========================================================================
+ */
+
+/*
+ * Consider the following situation: txg is safely synced to disk. We've
+ * written the first uberblock for txg + 1, and then we lose power. When we
+ * come back up, we fail to see the uberblock for txg + 1 because, say,
+ * it was on a mirrored device and the replica to which we wrote txg + 1
+ * is now offline. If we then make some changes and sync txg + 1, and then
+ * the missing replica comes back, then for a few seconds we'll have two
+ * conflicting uberblocks on disk with the same txg. The solution is simple:
+ * among uberblocks with equal txg, choose the one with the latest timestamp.
+ */
+static int
+vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
+{
+ if (ub1->ub_txg < ub2->ub_txg)
+ return (-1);
+ if (ub1->ub_txg > ub2->ub_txg)
+ return (1);
+
+ if (ub1->ub_timestamp < ub2->ub_timestamp)
+ return (-1);
+ if (ub1->ub_timestamp > ub2->ub_timestamp)
+ return (1);
+
+ return (0);
+}
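+
+/*
+ * For example, ub1 = { txg 10, timestamp 500 } vs. ub2 = { txg 10,
+ * timestamp 507 }: the txgs tie, the timestamps break the tie, and
+ * vdev_uberblock_compare(ub1, ub2) returns -1, so the later write
+ * (ub2) wins.
+ */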
+
+static void
+vdev_uberblock_load_done(zio_t *zio)
+{
+ uberblock_t *ub = zio->io_data;
+ uberblock_t *ubbest = zio->io_private;
+ spa_t *spa = zio->io_spa;
+
+ ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd));
+
+ if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
+ mutex_enter(&spa->spa_uberblock_lock);
+ if (vdev_uberblock_compare(ub, ubbest) > 0)
+ *ubbest = *ub;
+ mutex_exit(&spa->spa_uberblock_lock);
+ }
+
+ zio_buf_free(zio->io_data, zio->io_size);
+}
+
+void
+vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
+{
+ int l, c, n;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_uberblock_load(zio, vd->vdev_child[c], ubbest);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (vdev_is_dead(vd))
+ return;
+
+ for (l = 0; l < VDEV_LABELS; l++) {
+ for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+ vdev_label_read(zio, vd, l,
+ zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)),
+ VDEV_UBERBLOCK_OFFSET(vd, n),
+ VDEV_UBERBLOCK_SIZE(vd),
+ vdev_uberblock_load_done, ubbest);
+ }
+ }
+}
+
+/*
+ * Write the uberblock to all labels of all leaves of the specified vdev.
+ * We only get credit for writes to known-visible vdevs; see spa_vdev_add().
+ */
+static void
+vdev_uberblock_sync_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_root->io_private;
+
+ if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
+ atomic_add_64(good_writes, 1);
+}
+
+static void
+vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, uint64_t txg)
+{
+ int l, c, n;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_uberblock_sync(zio, ub, vd->vdev_child[c], txg);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (vdev_is_dead(vd))
+ return;
+
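+	/*
+	 * The uberblock area in each label is a ring: txg N lands in slot
+	 * N mod VDEV_UBERBLOCK_COUNT, so recent uberblocks survive until
+	 * the ring wraps around.
+	 */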
+ n = txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
+
+ ASSERT(ub->ub_txg == txg);
+
+ for (l = 0; l < VDEV_LABELS; l++)
+ vdev_label_write(zio, vd, l, ub,
+ VDEV_UBERBLOCK_OFFSET(vd, n),
+ VDEV_UBERBLOCK_SIZE(vd),
+ vdev_uberblock_sync_done, NULL);
+
+ dprintf("vdev %s in txg %llu\n", vdev_description(vd), txg);
+}
+
+static int
+vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *vd, uint64_t txg)
+{
+ uberblock_t *ubbuf;
+ size_t size = vd->vdev_top ? VDEV_UBERBLOCK_SIZE(vd) : SPA_MAXBLOCKSIZE;
+ uint64_t *good_writes;
+ zio_t *zio;
+ int error;
+
+ ubbuf = zio_buf_alloc(size);
+ bzero(ubbuf, size);
+ *ubbuf = *ub;
+
+ good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+
+ zio = zio_root(spa, NULL, good_writes,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+
+ vdev_uberblock_sync(zio, ubbuf, vd, txg);
+
+ error = zio_wait(zio);
+
+ if (error && *good_writes != 0) {
+ dprintf("partial success: good_writes = %llu\n", *good_writes);
+ error = 0;
+ }
+
+ /*
+ * It's possible to have no good writes and no error if every vdev is in
+ * the CANT_OPEN state.
+ */
+ if (*good_writes == 0 && error == 0)
+ error = EIO;
+
+ kmem_free(good_writes, sizeof (uint64_t));
+ zio_buf_free(ubbuf, size);
+
+ return (error);
+}
+
+/*
+ * Sync out an individual vdev.
+ */
+static void
+vdev_sync_label_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_root->io_private;
+
+ if (zio->io_error == 0)
+ atomic_add_64(good_writes, 1);
+}
+
+static void
+vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg)
+{
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ char *buf;
+ size_t buflen;
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_sync_label(zio, vd->vdev_child[c], l, txg);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (vdev_is_dead(vd))
+ return;
+
+ /*
+ * Generate a label describing the top-level config to which we belong.
+ */
+ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
+
+ vp = zio_buf_alloc(sizeof (vdev_phys_t));
+ bzero(vp, sizeof (vdev_phys_t));
+
+ buf = vp->vp_nvlist;
+ buflen = sizeof (vp->vp_nvlist);
+
+ if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0)
+ vdev_label_write(zio, vd, l, vp,
+ offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t),
+ vdev_sync_label_done, NULL);
+
+ zio_buf_free(vp, sizeof (vdev_phys_t));
+ nvlist_free(label);
+
+ dprintf("%s label %d txg %llu\n", vdev_description(vd), l, txg);
+}
+
+static int
+vdev_sync_labels(vdev_t *vd, int l, uint64_t txg)
+{
+ uint64_t *good_writes;
+ zio_t *zio;
+ int error;
+
+ ASSERT(vd == vd->vdev_top);
+
+ good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+
+ zio = zio_root(vd->vdev_spa, NULL, good_writes,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+
+ /*
+ * Recursively kick off writes to all labels.
+ */
+ vdev_sync_label(zio, vd, l, txg);
+
+ error = zio_wait(zio);
+
+ if (error && *good_writes != 0) {
+ dprintf("partial success: good_writes = %llu\n", *good_writes);
+ error = 0;
+ }
+
+ if (*good_writes == 0 && error == 0)
+ error = ENODEV;
+
+ kmem_free(good_writes, sizeof (uint64_t));
+
+ return (error);
+}
+
+/*
+ * Sync the entire vdev configuration.
+ *
+ * The order of operations is carefully crafted to ensure that
+ * if the system panics or loses power at any time, the state on disk
+ * is still transactionally consistent. The in-line comments below
+ * describe the failure semantics at each stage.
+ *
+ * Moreover, it is designed to be idempotent: if vdev_config_sync() fails
+ * at any time, you can just call it again, and it will resume its work.
+ */
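+/*
+ * In outline, the sequence below is:
+ *
+ *	flush write caches
+ *	write the even labels (L0, L2)
+ *	flush write caches
+ *	write the uberblocks
+ *	flush write caches
+ *	write the odd labels (L1, L3)
+ *	flush write caches
+ *
+ * so that each stage is on stable storage before the next one begins.
+ */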
+int
+vdev_config_sync(vdev_t *uvd, uint64_t txg)
+{
+ spa_t *spa = uvd->vdev_spa;
+ uberblock_t *ub = &spa->spa_uberblock;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd;
+ zio_t *zio;
+ int l, error;
+
+ ASSERT(ub->ub_txg <= txg);
+
+ /*
+ * If this isn't a resync due to I/O errors, and nothing changed
+ * in this transaction group, and the vdev configuration hasn't changed,
+ * then there's nothing to do.
+ */
+ if (ub->ub_txg < txg && uberblock_update(ub, rvd, txg) == B_FALSE &&
+ list_is_empty(&spa->spa_dirty_list)) {
+ dprintf("nothing to sync in %s in txg %llu\n",
+ spa_name(spa), txg);
+ return (0);
+ }
+
+ if (txg > spa_freeze_txg(spa))
+ return (0);
+
+ ASSERT(txg <= spa->spa_final_txg);
+
+ dprintf("syncing %s txg %llu\n", spa_name(spa), txg);
+
+ /*
+ * Flush the write cache of every disk that's been written to
+ * in this transaction group. This ensures that all blocks
+ * written in this txg will be committed to stable storage
+ * before any uberblock that references them.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+ for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
+ vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) {
+ zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+ }
+ (void) zio_wait(zio);
+
+ /*
+ * Sync out the even labels (L0, L2) for every dirty vdev. If the
+ * system dies in the middle of this process, that's OK: all of the
+ * even labels that made it to disk will be newer than any uberblock,
+ * and will therefore be considered invalid. The odd labels (L1, L3),
+ * which have not yet been touched, will still be valid.
+ */
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ for (l = 0; l < VDEV_LABELS; l++) {
+ if (l & 1)
+ continue;
+ if ((error = vdev_sync_labels(vd, l, txg)) != 0)
+ return (error);
+ }
+ }
+
+ /*
+ * Flush the new labels to disk. This ensures that all even-label
+ * updates are committed to stable storage before the uberblock update.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+ }
+ (void) zio_wait(zio);
+
+ /*
+ * Sync the uberblocks to all vdevs in the tree specified by uvd.
+ * If the system dies in the middle of this step, there are two cases
+ * to consider, and the on-disk state is consistent either way:
+ *
+ * (1) If none of the new uberblocks made it to disk, then the
+ * previous uberblock will be the newest, and the odd labels
+ * (which had not yet been touched) will be valid with respect
+ * to that uberblock.
+ *
+ * (2) If one or more new uberblocks made it to disk, then they
+ * will be the newest, and the even labels (which had all
+ * been successfully committed) will be valid with respect
+ * to the new uberblocks.
+ */
+ if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0)
+ return (error);
+
+ /*
+ * Flush the uberblocks to disk. This ensures that the odd labels
+ * are no longer needed (because the new uberblocks and the even
+ * labels are safely on disk), so it is safe to overwrite them.
+ */
+ (void) zio_wait(zio_ioctl(NULL, spa, uvd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+
+ /*
+ * Sync out odd labels for every dirty vdev. If the system dies
+ * in the middle of this process, the even labels and the new
+ * uberblocks will suffice to open the pool. The next time
+ * the pool is opened, the first thing we'll do -- before any
+ * user data is modified -- is mark every vdev dirty so that
+ * all labels will be brought up to date.
+ */
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ for (l = 0; l < VDEV_LABELS; l++) {
+ if ((l & 1) == 0)
+ continue;
+ if ((error = vdev_sync_labels(vd, l, txg)) != 0)
+ return (error);
+ }
+ }
+
+ /*
+ * Flush the new labels to disk. This ensures that all odd-label
+ * updates are committed to stable storage before the next
+ * transaction group begins.
+ */
+ zio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+ for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+ vd = list_next(&spa->spa_dirty_list, vd)) {
+ zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+ }
+ (void) zio_wait(zio);
+
+ return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
new file mode 100644
index 0000000..73d1a83
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
@@ -0,0 +1,495 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for mirroring.
+ */
+
+typedef struct mirror_child {
+ vdev_t *mc_vd;
+ uint64_t mc_offset;
+ int mc_error;
+ short mc_tried;
+ short mc_skipped;
+} mirror_child_t;
+
+typedef struct mirror_map {
+ int mm_children;
+ int mm_replacing;
+ int mm_preferred;
+ int mm_root;
+ mirror_child_t mm_child[1];
+} mirror_map_t;
+
+int vdev_mirror_shift = 21;
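+
+/*
+ * With vdev_mirror_shift == 21, vdev_mirror_map_alloc() below rotates
+ * the preferred child every 2MB (1 << 21) of offset, spreading
+ * sequential read load across the mirror's children.
+ */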
+
+static mirror_map_t *
+vdev_mirror_map_alloc(zio_t *zio)
+{
+ mirror_map_t *mm = NULL;
+ mirror_child_t *mc;
+ vdev_t *vd = zio->io_vd;
+ int c, d;
+
+ if (vd == NULL) {
+ dva_t *dva = zio->io_bp->blk_dva;
+ spa_t *spa = zio->io_spa;
+
+ c = BP_GET_NDVAS(zio->io_bp);
+
+ mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
+ mm->mm_children = c;
+ mm->mm_replacing = B_FALSE;
+ mm->mm_preferred = spa_get_random(c);
+ mm->mm_root = B_TRUE;
+
+ /*
+ * Check the other, lower-index DVAs to see if they're on
+ * the same vdev as the child we picked. If they are, use
+ * them since they are likely to have been allocated from
+ * the primary metaslab in use at the time, and hence are
+ * more likely to have locality with single-copy data.
+ */
+ for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
+ if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
+ mm->mm_preferred = d;
+ }
+
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+
+ mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
+ mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
+ }
+ } else {
+ c = vd->vdev_children;
+
+ mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
+ mm->mm_children = c;
+ mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops);
+ mm->mm_preferred = mm->mm_replacing ? 0 :
+ (zio->io_offset >> vdev_mirror_shift) % c;
+ mm->mm_root = B_FALSE;
+
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+ mc->mc_vd = vd->vdev_child[c];
+ mc->mc_offset = zio->io_offset;
+ }
+ }
+
+ zio->io_vsd = mm;
+ return (mm);
+}
+
+static void
+vdev_mirror_map_free(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+
+ kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
+ zio->io_vsd = NULL;
+}
+
+static int
+vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+ vdev_t *cvd;
+ uint64_t c;
+ int numerrors = 0;
+ int ret, lasterror = 0;
+
+ if (vd->vdev_children == 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ cvd = vd->vdev_child[c];
+
+ if ((ret = vdev_open(cvd)) != 0) {
+ lasterror = ret;
+ numerrors++;
+ continue;
+ }
+
+ *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+ *ashift = MAX(*ashift, cvd->vdev_ashift);
+ }
+
+ if (numerrors == vd->vdev_children) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ return (0);
+}
+
+static void
+vdev_mirror_close(vdev_t *vd)
+{
+ uint64_t c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_close(vd->vdev_child[c]);
+}
+
+static void
+vdev_mirror_child_done(zio_t *zio)
+{
+ mirror_child_t *mc = zio->io_private;
+
+ mc->mc_error = zio->io_error;
+ mc->mc_tried = 1;
+ mc->mc_skipped = 0;
+}
+
+static void
+vdev_mirror_scrub_done(zio_t *zio)
+{
+ mirror_child_t *mc = zio->io_private;
+
+ if (zio->io_error == 0) {
+ zio_t *pio = zio->io_parent;
+ mutex_enter(&pio->io_lock);
+ ASSERT3U(zio->io_size, >=, pio->io_size);
+ bcopy(zio->io_data, pio->io_data, pio->io_size);
+ mutex_exit(&pio->io_lock);
+ }
+
+ zio_buf_free(zio->io_data, zio->io_size);
+
+ mc->mc_error = zio->io_error;
+ mc->mc_tried = 1;
+ mc->mc_skipped = 0;
+}
+
+static void
+vdev_mirror_repair_done(zio_t *zio)
+{
+ ASSERT(zio->io_private == zio->io_parent);
+ vdev_mirror_map_free(zio->io_private);
+}
+
+/*
+ * Try to find a child whose DTL doesn't contain the block we want to read.
+ * If we can't, try the read on any vdev we haven't already tried.
+ */
+static int
+vdev_mirror_child_select(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ mirror_child_t *mc;
+ uint64_t txg = zio->io_txg;
+ int i, c;
+
+ ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
+
+ /*
+ * Try to find a child whose DTL doesn't contain the block to read.
+ * If a child is known to be completely inaccessible (indicated by
+ * vdev_is_dead() returning B_TRUE), don't even try.
+ */
+ for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
+ if (c >= mm->mm_children)
+ c = 0;
+ mc = &mm->mm_child[c];
+ if (mc->mc_tried || mc->mc_skipped)
+ continue;
+ if (vdev_is_dead(mc->mc_vd)) {
+ mc->mc_error = ENXIO;
+ mc->mc_tried = 1; /* don't even try */
+ mc->mc_skipped = 1;
+ continue;
+ }
+ if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1))
+ return (c);
+ mc->mc_error = ESTALE;
+ mc->mc_skipped = 1;
+ }
+
+ /*
+ * Every device is either missing or has this txg in its DTL.
+ * Look for any child we haven't already tried before giving up.
+ */
+ for (c = 0; c < mm->mm_children; c++)
+ if (!mm->mm_child[c].mc_tried)
+ return (c);
+
+ /*
+ * Every child failed. There's no place left to look.
+ */
+ return (-1);
+}
+
+static void
+vdev_mirror_io_start(zio_t *zio)
+{
+ mirror_map_t *mm;
+ mirror_child_t *mc;
+ int c, children;
+
+ mm = vdev_mirror_map_alloc(zio);
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
+ /*
+ * For scrubbing reads we need to allocate a read
+ * buffer for each child and issue reads to all
+ * children. If any child succeeds, it will copy its
+ * data into zio->io_data in vdev_mirror_scrub_done.
+ */
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset,
+ zio_buf_alloc(zio->io_size), zio->io_size,
+ zio->io_type, zio->io_priority,
+ ZIO_FLAG_CANFAIL,
+ vdev_mirror_scrub_done, mc));
+ }
+ zio_wait_children_done(zio);
+ return;
+ }
+ /*
+ * For normal reads just pick one child.
+ */
+ c = vdev_mirror_child_select(zio);
+ children = (c >= 0);
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+ /*
+ * If this is a resilvering I/O to a replacing vdev,
+ * only the last child should be written -- unless the
+ * first child happens to have a DTL entry here as well.
+ * All other writes go to all children.
+ */
+ if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing &&
+ !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map,
+ zio->io_txg, 1)) {
+ c = mm->mm_children - 1;
+ children = 1;
+ } else {
+ c = 0;
+ children = mm->mm_children;
+ }
+ }
+
+ while (children--) {
+ mc = &mm->mm_child[c];
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset,
+ zio->io_data, zio->io_size, zio->io_type, zio->io_priority,
+ ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc));
+ c++;
+ }
+
+ zio_wait_children_done(zio);
+}
+
+static void
+vdev_mirror_io_done(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ mirror_child_t *mc;
+ int c;
+ int good_copies = 0;
+ int unexpected_errors = 0;
+
+ zio->io_error = 0;
+ zio->io_numerrors = 0;
+
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+
+ if (mc->mc_tried && mc->mc_error == 0) {
+ good_copies++;
+ continue;
+ }
+
+ /*
+		 * We preserve any EIOs because those may be worth retrying,
+		 * whereas ECKSUM and ENXIO are more likely to be persistent.
+ */
+ if (mc->mc_error) {
+ if (zio->io_error != EIO)
+ zio->io_error = mc->mc_error;
+ if (!mc->mc_skipped)
+ unexpected_errors++;
+ zio->io_numerrors++;
+ }
+ }
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * XXX -- for now, treat partial writes as success.
+ * XXX -- For a replacing vdev, we need to make sure the
+ * new child succeeds.
+ */
+ /* XXPOLICY */
+ if (good_copies != 0)
+ zio->io_error = 0;
+ vdev_mirror_map_free(zio);
+ zio_next_stage(zio);
+ return;
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ /*
+ * If we don't have a good copy yet, keep trying other children.
+ */
+ /* XXPOLICY */
+ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
+ ASSERT(c >= 0 && c < mm->mm_children);
+ mc = &mm->mm_child[c];
+ dprintf("retrying i/o (err=%d) on child %s\n",
+ zio->io_error, vdev_description(mc->mc_vd));
+ zio->io_error = 0;
+ zio_vdev_io_redone(zio);
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
+ ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
+ vdev_mirror_child_done, mc));
+ zio_wait_children_done(zio);
+ return;
+ }
+
+ /* XXPOLICY */
+ if (good_copies)
+ zio->io_error = 0;
+ else
+ ASSERT(zio->io_error != 0);
+
+ if (good_copies && (spa_mode & FWRITE) &&
+ (unexpected_errors ||
+ (zio->io_flags & ZIO_FLAG_RESILVER) ||
+ ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
+ zio_t *rio;
+
+ /*
+ * Use the good data we have in hand to repair damaged children.
+ *
+ * We issue all repair I/Os as children of 'rio' to arrange
+ * that vdev_mirror_map_free(zio) will be invoked after all
+ * repairs complete, but before we advance to the next stage.
+ */
+ rio = zio_null(zio, zio->io_spa,
+ vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL);
+
+ for (c = 0; c < mm->mm_children; c++) {
+ /*
+ * Don't rewrite known good children.
+ * Not only is it unnecessary, it could
+ * actually be harmful: if the system lost
+ * power while rewriting the only good copy,
+ * there would be no good copies left!
+ */
+ mc = &mm->mm_child[c];
+
+ if (mc->mc_error == 0) {
+ if (mc->mc_tried)
+ continue;
+ if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
+ !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
+ zio->io_txg, 1))
+ continue;
+ mc->mc_error = ESTALE;
+ }
+
+ dprintf("resilvered %s @ 0x%llx error %d\n",
+ vdev_description(mc->mc_vd), mc->mc_offset,
+ mc->mc_error);
+
+ zio_nowait(zio_vdev_child_io(rio, zio->io_bp, mc->mc_vd,
+ mc->mc_offset, zio->io_data, zio->io_size,
+ ZIO_TYPE_WRITE, zio->io_priority,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
+ }
+
+ zio_nowait(rio);
+ zio_wait_children_done(zio);
+ return;
+ }
+
+ vdev_mirror_map_free(zio);
+ zio_next_stage(zio);
+}
+
+static void
+vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ if (faulted == vd->vdev_children)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ else if (degraded + faulted != 0)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ else
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+vdev_ops_t vdev_mirror_ops = {
+ vdev_mirror_open,
+ vdev_mirror_close,
+ vdev_default_asize,
+ vdev_mirror_io_start,
+ vdev_mirror_io_done,
+ vdev_mirror_state_change,
+ VDEV_TYPE_MIRROR, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
+
+vdev_ops_t vdev_replacing_ops = {
+ vdev_mirror_open,
+ vdev_mirror_close,
+ vdev_default_asize,
+ vdev_mirror_io_start,
+ vdev_mirror_io_done,
+ vdev_mirror_state_change,
+ VDEV_TYPE_REPLACING, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
+
+vdev_ops_t vdev_spare_ops = {
+ vdev_mirror_open,
+ vdev_mirror_close,
+ vdev_default_asize,
+ vdev_mirror_io_start,
+ vdev_mirror_io_done,
+ vdev_mirror_state_change,
+ VDEV_TYPE_SPARE, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
new file mode 100644
index 0000000..b35f4a5
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * The 'missing' vdev is a special vdev type used only during import. It
+ * signifies a placeholder in the root vdev for some vdev that we know is
+ * missing. We pass it down to the kernel to allow the rest of the
+ * configuration to be parsed and an attempt made to open all available
+ * devices.
+ * Because its GUID is always 0, we know that the guid sum will mismatch and we
+ * won't be able to open the pool anyway.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+
+/* ARGSUSED */
+static int
+vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+ /*
+ * Really this should just fail. But then the root vdev will be in the
+ * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is
+ * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
+ * will fail the GUID sum check before ever trying to open the pool.
+ */
+ *psize = SPA_MINDEVSIZE;
+ *ashift = SPA_MINBLOCKSHIFT;
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+vdev_missing_close(vdev_t *vd)
+{
+}
+
+/* ARGSUSED */
+static void
+vdev_missing_io_start(zio_t *zio)
+{
+ zio->io_error = ENOTSUP;
+ zio_next_stage_async(zio);
+}
+
+/* ARGSUSED */
+static void
+vdev_missing_io_done(zio_t *zio)
+{
+ zio_next_stage(zio);
+}
+
+vdev_ops_t vdev_missing_ops = {
+ vdev_missing_open,
+ vdev_missing_close,
+ vdev_default_asize,
+ vdev_missing_io_start,
+ vdev_missing_io_done,
+ NULL,
+ VDEV_TYPE_MISSING, /* name of this vdev type */
+ B_TRUE /* leaf vdev */
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
new file mode 100644
index 0000000..7e99c1f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
@@ -0,0 +1,323 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+
+/*
+ * These tunables are for performance analysis.
+ */
+/*
+ * zfs_vdev_max_pending is the maximum number of i/os concurrently
+ * pending to each device. zfs_vdev_min_pending is the initial number
+ * of i/os pending to each device (before it starts ramping up to
+ * max_pending).
+ */
+int zfs_vdev_max_pending = 35;
+int zfs_vdev_min_pending = 4;
+
+/* deadline = pri + (lbolt >> time_shift) */
+int zfs_vdev_time_shift = 6;
+
+/* exponential I/O issue ramp-up rate */
+int zfs_vdev_ramp_rate = 2;
+
+/*
+ * i/os will be aggregated into a single large i/o up to
+ * zfs_vdev_aggregation_limit bytes long.
+ */
+int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+
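+/*
+ * A minimal sketch of the deadline formula above (illustrative only;
+ * example_deadline() is not a function used by this file): the clock
+ * term ages queued I/Os, so a low-priority request eventually sorts
+ * ahead of a steady stream of newer, higher-priority ones.
+ */
+#if 0
+static uint64_t
+example_deadline(uint64_t timestamp, uint8_t priority)
+{
+	/* mirrors zio->io_deadline as computed in vdev_queue_io() */
+	return ((timestamp >> zfs_vdev_time_shift) + priority);
+}
+#endif
+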
+/*
+ * Virtual device vector for disk I/O scheduling.
+ */
+int
+vdev_queue_deadline_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = x1;
+ const zio_t *z2 = x2;
+
+ if (z1->io_deadline < z2->io_deadline)
+ return (-1);
+ if (z1->io_deadline > z2->io_deadline)
+ return (1);
+
+ if (z1->io_offset < z2->io_offset)
+ return (-1);
+ if (z1->io_offset > z2->io_offset)
+ return (1);
+
+ if (z1 < z2)
+ return (-1);
+ if (z1 > z2)
+ return (1);
+
+ return (0);
+}
+
+int
+vdev_queue_offset_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = x1;
+ const zio_t *z2 = x2;
+
+ if (z1->io_offset < z2->io_offset)
+ return (-1);
+ if (z1->io_offset > z2->io_offset)
+ return (1);
+
+ if (z1 < z2)
+ return (-1);
+ if (z1 > z2)
+ return (1);
+
+ return (0);
+}
+
+void
+vdev_queue_init(vdev_t *vd)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+
+ mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
+ sizeof (zio_t), offsetof(struct zio, io_deadline_node));
+
+ avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
+ sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+ avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
+ sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+ avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
+ sizeof (zio_t), offsetof(struct zio, io_offset_node));
+}
+
+void
+vdev_queue_fini(vdev_t *vd)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+
+ avl_destroy(&vq->vq_deadline_tree);
+ avl_destroy(&vq->vq_read_tree);
+ avl_destroy(&vq->vq_write_tree);
+ avl_destroy(&vq->vq_pending_tree);
+
+ mutex_destroy(&vq->vq_lock);
+}
+
+static void
+vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
+{
+ avl_add(&vq->vq_deadline_tree, zio);
+ avl_add(zio->io_vdev_tree, zio);
+}
+
+static void
+vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
+{
+ avl_remove(&vq->vq_deadline_tree, zio);
+ avl_remove(zio->io_vdev_tree, zio);
+}
+
+static void
+vdev_queue_agg_io_done(zio_t *aio)
+{
+ zio_t *dio;
+ uint64_t offset = 0;
+
+ while ((dio = aio->io_delegate_list) != NULL) {
+ if (aio->io_type == ZIO_TYPE_READ)
+ bcopy((char *)aio->io_data + offset, dio->io_data,
+ dio->io_size);
+ offset += dio->io_size;
+ aio->io_delegate_list = dio->io_delegate_next;
+ dio->io_delegate_next = NULL;
+ dio->io_error = aio->io_error;
+ zio_next_stage(dio);
+ }
+ ASSERT3U(offset, ==, aio->io_size);
+
+ zio_buf_free(aio->io_data, aio->io_size);
+}
+
+#define IS_ADJACENT(io, nio) \
+ ((io)->io_offset + (io)->io_size == (nio)->io_offset)
+
+typedef void zio_issue_func_t(zio_t *);
+
+static zio_t *
+vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
+ zio_issue_func_t **funcp)
+{
+ zio_t *fio, *lio, *aio, *dio;
+ avl_tree_t *tree;
+ uint64_t size;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ *funcp = NULL;
+
+ if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
+ avl_numnodes(&vq->vq_deadline_tree) == 0)
+ return (NULL);
+
+ fio = lio = avl_first(&vq->vq_deadline_tree);
+
+ tree = fio->io_vdev_tree;
+ size = fio->io_size;
+
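+	/*
+	 * Starting from the I/O with the earliest deadline, walk
+	 * backward and then forward through the offset-sorted tree,
+	 * chaining on any physically adjacent I/Os for as long as the
+	 * aggregate stays within zfs_vdev_aggregation_limit.
+	 */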
+ while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
+ size + dio->io_size <= zfs_vdev_aggregation_limit) {
+ dio->io_delegate_next = fio;
+ fio = dio;
+ size += dio->io_size;
+ }
+
+ while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
+ size + dio->io_size <= zfs_vdev_aggregation_limit) {
+ lio->io_delegate_next = dio;
+ lio = dio;
+ size += dio->io_size;
+ }
+
+ if (fio != lio) {
+ char *buf = zio_buf_alloc(size);
+ uint64_t offset = 0;
+ int nagg = 0;
+
+ ASSERT(size <= zfs_vdev_aggregation_limit);
+
+ aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
+ fio->io_offset, buf, size, fio->io_type,
+ ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_NOBOOKMARK,
+ vdev_queue_agg_io_done, NULL);
+
+ aio->io_delegate_list = fio;
+
+ for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
+ ASSERT(dio->io_type == aio->io_type);
+ ASSERT(dio->io_vdev_tree == tree);
+ if (dio->io_type == ZIO_TYPE_WRITE)
+ bcopy(dio->io_data, buf + offset, dio->io_size);
+ offset += dio->io_size;
+ vdev_queue_io_remove(vq, dio);
+ zio_vdev_io_bypass(dio);
+ nagg++;
+ }
+
+ ASSERT(offset == size);
+
+ dprintf("%5s T=%llu off=%8llx agg=%3d "
+ "old=%5llx new=%5llx\n",
+ zio_type_name[fio->io_type],
+ fio->io_deadline, fio->io_offset, nagg, fio->io_size, size);
+
+ avl_add(&vq->vq_pending_tree, aio);
+
+ *funcp = zio_nowait;
+ return (aio);
+ }
+
+ ASSERT(fio->io_vdev_tree == tree);
+ vdev_queue_io_remove(vq, fio);
+
+ avl_add(&vq->vq_pending_tree, fio);
+
+ *funcp = zio_next_stage;
+
+ return (fio);
+}
+
+zio_t *
+vdev_queue_io(zio_t *zio)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ zio_t *nio;
+ zio_issue_func_t *func;
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
+ return (zio);
+
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ zio->io_vdev_tree = &vq->vq_read_tree;
+ else
+ zio->io_vdev_tree = &vq->vq_write_tree;
+
+ mutex_enter(&vq->vq_lock);
+
+ zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
+ zio->io_priority;
+
+ vdev_queue_io_add(vq, zio);
+
+ nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func);
+
+ mutex_exit(&vq->vq_lock);
+
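+	/*
+	 * If there is nothing to issue, or the issuable I/O is an
+	 * ordinary queued zio, hand it back so the caller can advance
+	 * its pipeline directly; an aggregated I/O is fired off here
+	 * with zio_nowait() and NULL is returned instead.
+	 */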
+ if (nio == NULL || func != zio_nowait)
+ return (nio);
+
+ func(nio);
+ return (NULL);
+}
+
+void
+vdev_queue_io_done(zio_t *zio)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ zio_t *nio;
+ zio_issue_func_t *func;
+ int i;
+
+ mutex_enter(&vq->vq_lock);
+
+ avl_remove(&vq->vq_pending_tree, zio);
+
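+	/*
+	 * Each completion may issue up to zfs_vdev_ramp_rate new I/Os,
+	 * so the number in flight grows geometrically from
+	 * zfs_vdev_min_pending toward the zfs_vdev_max_pending cap.
+	 */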
+ for (i = 0; i < zfs_vdev_ramp_rate; i++) {
+ nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func);
+ if (nio == NULL)
+ break;
+ mutex_exit(&vq->vq_lock);
+ if (func == zio_next_stage)
+ zio_vdev_io_reissue(nio);
+ func(nio);
+ mutex_enter(&vq->vq_lock);
+ }
+
+ mutex_exit(&vq->vq_lock);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
new file mode 100644
index 0000000..08df7e0
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -0,0 +1,1223 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+
+/*
+ * Virtual device vector for RAID-Z.
+ *
+ * This vdev supports both single and double parity. For single parity, we
+ * use a simple XOR of all the data columns. For double parity, we use both
+ * the simple XOR as well as a technique described in "The mathematics of
+ * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
+ * over the integers expressible in a single byte. Briefly, the operations on
+ * the field are defined as follows:
+ *
+ * o addition (+) is represented by a bitwise XOR
+ * o subtraction (-) is therefore identical to addition: A + B = A - B
+ * o multiplication of A by 2 is defined by the following bitwise expression:
+ * (A * 2)_7 = A_6
+ * (A * 2)_6 = A_5
+ * (A * 2)_5 = A_4
+ * (A * 2)_4 = A_3 + A_7
+ * (A * 2)_3 = A_2 + A_7
+ * (A * 2)_2 = A_1 + A_7
+ * (A * 2)_1 = A_0
+ * (A * 2)_0 = A_7
+ *
+ * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
+ *
+ * Observe that any number in the field (except for 0) can be expressed as a
+ * power of 2 -- a generator for the field. We store a table of the powers of
+ * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
+ * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
+ * than field addition). The inverse of a field element A (A^-1) is A^254.
+ *
+ * The two parity columns, P and Q, over several data columns,
+ * D_0, ..., D_(n-1), can be expressed by field operations:
+ *
+ *	P = D_0 + D_1 + ... + D_(n-2) + D_(n-1)
+ *	Q = 2^(n-1) * D_0 + 2^(n-2) * D_1 + ... + 2^1 * D_(n-2) + 2^0 * D_(n-1)
+ *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_(n-2)) * 2 + D_(n-1)
+ *
+ * See the reconstruction code below for how P and Q can be used individually
+ * in concert to recover missing data columns.
+ */
+
+typedef struct raidz_col {
+ uint64_t rc_devidx; /* child device index for I/O */
+ uint64_t rc_offset; /* device offset */
+ uint64_t rc_size; /* I/O size */
+ void *rc_data; /* I/O data */
+ int rc_error; /* I/O error for this device */
+ uint8_t rc_tried; /* Did we attempt this I/O column? */
+ uint8_t rc_skipped; /* Did we skip this I/O column? */
+} raidz_col_t;
+
+typedef struct raidz_map {
+ uint64_t rm_cols; /* Column count */
+ uint64_t rm_bigcols; /* Number of oversized columns */
+ uint64_t rm_asize; /* Actual total I/O size */
+ uint64_t rm_missingdata; /* Count of missing data devices */
+ uint64_t rm_missingparity; /* Count of missing parity devices */
+ uint64_t rm_firstdatacol; /* First data column/parity count */
+ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
+} raidz_map_t;
+
+#define VDEV_RAIDZ_P 0
+#define VDEV_RAIDZ_Q 1
+
+#define VDEV_RAIDZ_MAXPARITY 2
+
+#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
+
+/*
+ * These two tables represent powers and logs of 2 in the Galois field defined
+ * above. These values were computed by repeatedly multiplying by 2 as above.
+ */
+static const uint8_t vdev_raidz_pow2[256] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
+ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
+ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
+ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
+ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
+ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
+ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
+ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
+ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
+ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
+ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
+ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
+ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
+ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
+ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
+ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
+ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
+ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
+ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
+ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
+ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
+ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
+ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
+ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
+ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
+ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
+ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
+ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
+ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
+ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
+ 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
+};
+static const uint8_t vdev_raidz_log2[256] = {
+ 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
+ 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
+ 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
+ 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
+ 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
+ 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
+ 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
+ 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
+ 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
+ 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
+ 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
+ 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
+ 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
+ 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
+ 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
+ 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
+ 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
+ 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
+ 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
+ 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
+ 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
+ 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
+ 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
+ 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
+ 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
+ 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
+ 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
+ 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
+ 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
+ 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
+ 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
+ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
+};
+
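+/*
+ * A minimal sketch of the field arithmetic described at the top of this
+ * file (illustrative only; gf_mul2() and gf_mul() are example names,
+ * not functions used elsewhere in this file): multiplication by 2
+ * applies the bitwise rule directly, and a general multiply combines
+ * the log and power tables, with exponents taken mod 255 since the
+ * non-zero field elements form a cyclic group of order 255.
+ */
+#if 0
+static uint8_t
+gf_mul2(uint8_t a)
+{
+	/* same rule as VDEV_RAIDZ_MUL_2() above */
+	return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
+}
+
+static uint8_t
+gf_mul(uint8_t a, uint8_t b)
+{
+	if (a == 0 || b == 0)
+		return (0);
+	return (vdev_raidz_pow2[(vdev_raidz_log2[a] +
+	    vdev_raidz_log2[b]) % 255]);
+}
+#endif
+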
+/*
+ * Multiply a given number by 2 raised to the given power.
+ */
+static uint8_t
+vdev_raidz_exp2(uint_t a, int exp)
+{
+ if (a == 0)
+ return (0);
+
+ ASSERT(exp >= 0);
+ ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
+
+ exp += vdev_raidz_log2[a];
+ if (exp > 255)
+ exp -= 255;
+
+ return (vdev_raidz_pow2[exp]);
+}
+
+static raidz_map_t *
+vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
+ uint64_t nparity)
+{
+ raidz_map_t *rm;
+ uint64_t b = zio->io_offset >> unit_shift;
+ uint64_t s = zio->io_size >> unit_shift;
+ uint64_t f = b % dcols;
+ uint64_t o = (b / dcols) << unit_shift;
+ uint64_t q, r, c, bc, col, acols, coff, devidx;
+
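+	/*
+	 * Here b and s are the I/O offset and size in units of
+	 * 1 << unit_shift, f is the child column on which the first
+	 * unit falls, and o is the byte offset of that row on each
+	 * child. Below, q is the number of full rows, r the leftover
+	 * data units, bc the count of "big" columns that get one extra
+	 * unit (the r leftover data columns plus their parity), and
+	 * acols the total number of columns this I/O touches.
+	 */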
+ q = s / (dcols - nparity);
+ r = s - q * (dcols - nparity);
+ bc = (r == 0 ? 0 : r + nparity);
+
+ acols = (q == 0 ? bc : dcols);
+
+ rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
+
+ rm->rm_cols = acols;
+ rm->rm_bigcols = bc;
+ rm->rm_asize = 0;
+ rm->rm_missingdata = 0;
+ rm->rm_missingparity = 0;
+ rm->rm_firstdatacol = nparity;
+
+ for (c = 0; c < acols; c++) {
+ col = f + c;
+ coff = o;
+ if (col >= dcols) {
+ col -= dcols;
+ coff += 1ULL << unit_shift;
+ }
+ rm->rm_col[c].rc_devidx = col;
+ rm->rm_col[c].rc_offset = coff;
+ rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
+ rm->rm_col[c].rc_data = NULL;
+ rm->rm_col[c].rc_error = 0;
+ rm->rm_col[c].rc_tried = 0;
+ rm->rm_col[c].rc_skipped = 0;
+ rm->rm_asize += rm->rm_col[c].rc_size;
+ }
+
+ rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
+
+ for (c = 0; c < rm->rm_firstdatacol; c++)
+ rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
+
+ rm->rm_col[c].rc_data = zio->io_data;
+
+ for (c = c + 1; c < acols; c++)
+ rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
+ rm->rm_col[c - 1].rc_size;
+
+ /*
+ * If all data stored spans all columns, there's a danger that parity
+ * will always be on the same device and, since parity isn't read
+ * during normal operation, that that device's I/O bandwidth won't be
+ * used effectively. We therefore switch the parity every 1MB.
+ *
+ * ... at least that was, ostensibly, the theory. As a practical
+ * matter unless we juggle the parity between all devices evenly, we
+ * won't see any benefit. Further, occasional writes that aren't a
+ * multiple of the LCM of the number of children and the minimum
+ * stripe width are sufficient to avoid pessimal behavior.
+ * Unfortunately, this decision created an implicit on-disk format
+ * requirement that we need to support for all eternity, but only
+ * for single-parity RAID-Z.
+ */
+ ASSERT(rm->rm_cols >= 2);
+ ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
+
+ if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
+ devidx = rm->rm_col[0].rc_devidx;
+ o = rm->rm_col[0].rc_offset;
+ rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
+ rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
+ rm->rm_col[1].rc_devidx = devidx;
+ rm->rm_col[1].rc_offset = o;
+ }
+
+ zio->io_vsd = rm;
+ return (rm);
+}
+
+static void
+vdev_raidz_map_free(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+ int c;
+
+ for (c = 0; c < rm->rm_firstdatacol; c++)
+ zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
+
+ kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
+ zio->io_vsd = NULL;
+}
+
+static void
+vdev_raidz_generate_parity_p(raidz_map_t *rm)
+{
+ uint64_t *p, *src, pcount, ccount, i;
+ int c;
+
+ pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+ if (c == rm->rm_firstdatacol) {
+ ASSERT(ccount == pcount);
+ for (i = 0; i < ccount; i++, p++, src++) {
+ *p = *src;
+ }
+ } else {
+ ASSERT(ccount <= pcount);
+ for (i = 0; i < ccount; i++, p++, src++) {
+ *p ^= *src;
+ }
+ }
+ }
+}
+
+static void
+vdev_raidz_generate_parity_pq(raidz_map_t *rm)
+{
+ uint64_t *q, *p, *src, pcount, ccount, mask, i;
+ int c;
+
+ pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+ rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+ if (c == rm->rm_firstdatacol) {
+ ASSERT(ccount == pcount || ccount == 0);
+ for (i = 0; i < ccount; i++, p++, q++, src++) {
+ *q = *src;
+ *p = *src;
+ }
+ for (; i < pcount; i++, p++, q++, src++) {
+ *q = 0;
+ *p = 0;
+ }
+ } else {
+ ASSERT(ccount <= pcount);
+
+			/*
+			 * Rather than multiplying each byte individually (as
+			 * described above), we are able to handle 8 at once
+			 * by generating a mask based on the high bit in each
+			 * byte and using that to conditionally XOR in 0x1d:
+			 * (mask << 1) - (mask >> 7) expands each set high bit
+			 * into 0xff across its own byte, selecting exactly
+			 * the bytes that need the 0x1d reduction after the
+			 * shift.
+			 */
+ for (i = 0; i < ccount; i++, p++, q++, src++) {
+ mask = *q & 0x8080808080808080ULL;
+ mask = (mask << 1) - (mask >> 7);
+ *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
+ (mask & 0x1d1d1d1d1d1d1d1dULL);
+ *q ^= *src;
+ *p ^= *src;
+ }
+
+ /*
+ * Treat short columns as though they are full of 0s.
+ */
+ for (; i < pcount; i++, q++) {
+ mask = *q & 0x8080808080808080ULL;
+ mask = (mask << 1) - (mask >> 7);
+ *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
+ (mask & 0x1d1d1d1d1d1d1d1dULL);
+ }
+ }
+ }
+}
+
+static void
+vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
+{
+ uint64_t *dst, *src, xcount, ccount, count, i;
+ int c;
+
+ xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
+ ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
+ ASSERT(xcount > 0);
+
+ src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ dst = rm->rm_col[x].rc_data;
+ for (i = 0; i < xcount; i++, dst++, src++) {
+ *dst = *src;
+ }
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ dst = rm->rm_col[x].rc_data;
+
+ if (c == x)
+ continue;
+
+ ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+ count = MIN(ccount, xcount);
+
+ for (i = 0; i < count; i++, dst++, src++) {
+ *dst ^= *src;
+ }
+ }
+}
+
+static void
+vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
+{
+ uint64_t *dst, *src, xcount, ccount, count, mask, i;
+ uint8_t *b;
+ int c, j, exp;
+
+ xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
+ ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ src = rm->rm_col[c].rc_data;
+ dst = rm->rm_col[x].rc_data;
+
+ if (c == x)
+ ccount = 0;
+ else
+ ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+ count = MIN(ccount, xcount);
+
+ if (c == rm->rm_firstdatacol) {
+ for (i = 0; i < count; i++, dst++, src++) {
+ *dst = *src;
+ }
+ for (; i < xcount; i++, dst++) {
+ *dst = 0;
+ }
+
+ } else {
+ /*
+ * For an explanation of this, see the comment in
+ * vdev_raidz_generate_parity_pq() above.
+ */
+ for (i = 0; i < count; i++, dst++, src++) {
+ mask = *dst & 0x8080808080808080ULL;
+ mask = (mask << 1) - (mask >> 7);
+ *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
+ (mask & 0x1d1d1d1d1d1d1d1dULL);
+ *dst ^= *src;
+ }
+
+ for (; i < xcount; i++, dst++) {
+ mask = *dst & 0x8080808080808080ULL;
+ mask = (mask << 1) - (mask >> 7);
+ *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
+ (mask & 0x1d1d1d1d1d1d1d1dULL);
+ }
+ }
+ }
+
+ src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ dst = rm->rm_col[x].rc_data;
+ exp = 255 - (rm->rm_cols - 1 - x);
+
+ for (i = 0; i < xcount; i++, dst++, src++) {
+ *dst ^= *src;
+ for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
+ *b = vdev_raidz_exp2(*b, exp);
+ }
+ }
+}
+
+static void
+vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
+{
+ uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
+ void *pdata, *qdata;
+ uint64_t xsize, ysize, i;
+
+ ASSERT(x < y);
+ ASSERT(x >= rm->rm_firstdatacol);
+ ASSERT(y < rm->rm_cols);
+
+ ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
+
+ /*
+ * Move the parity data aside -- we're going to compute parity as
+ * though columns x and y were full of zeros -- Pxy and Qxy. We want to
+ * reuse the parity generation mechanism without trashing the actual
+ * parity so we make those columns appear to be full of zeros by
+ * setting their lengths to zero.
+ */
+ pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ xsize = rm->rm_col[x].rc_size;
+ ysize = rm->rm_col[y].rc_size;
+
+ rm->rm_col[VDEV_RAIDZ_P].rc_data =
+ zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
+ rm->rm_col[VDEV_RAIDZ_Q].rc_data =
+ zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ rm->rm_col[x].rc_size = 0;
+ rm->rm_col[y].rc_size = 0;
+
+ vdev_raidz_generate_parity_pq(rm);
+
+ rm->rm_col[x].rc_size = xsize;
+ rm->rm_col[y].rc_size = ysize;
+
+ p = pdata;
+ q = qdata;
+ pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+ qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ xd = rm->rm_col[x].rc_data;
+ yd = rm->rm_col[y].rc_data;
+
+ /*
+ * We now have:
+ * Pxy = P + D_x + D_y
+ * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
+ *
+ * We can then solve for D_x:
+ * D_x = A * (P + Pxy) + B * (Q + Qxy)
+ * where
+ * A = 2^(x - y) * (2^(x - y) + 1)^-1
+ *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
+ *
+ * With D_x in hand, we can easily solve for D_y:
+ * D_y = P + Pxy + D_x
+ */
+
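+	/*
+	 * (Derivation sketch: XOR the defining equations to get
+	 * P + Pxy = D_x + D_y and
+	 * Q + Qxy = 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y,
+	 * substitute D_y = (P + Pxy) + D_x into the second equation,
+	 * multiply through by 2^-(ndevs - 1 - x), and solve for D_x.
+	 * Exponents are taken mod 255, which is why the code below
+	 * expresses B as vdev_raidz_pow2[255 - (ndevs - 1 - x)].)
+	 */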
+ a = vdev_raidz_pow2[255 + x - y];
+ b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
+ tmp = 255 - vdev_raidz_log2[a ^ 1];
+
+ aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
+ bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
+
+ for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
+ *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
+ vdev_raidz_exp2(*q ^ *qxy, bexp);
+
+ if (i < ysize)
+ *yd = *p ^ *pxy ^ *xd;
+ }
+
+ zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
+ rm->rm_col[VDEV_RAIDZ_P].rc_size);
+ zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
+ rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+
+ /*
+ * Restore the saved parity data.
+ */
+ rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
+ rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
+}
+
+
+static int
+vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+ vdev_t *cvd;
+ uint64_t nparity = vd->vdev_nparity;
+ int c, error;
+ int lasterror = 0;
+ int numerrors = 0;
+
+ ASSERT(nparity > 0);
+
+ if (nparity > VDEV_RAIDZ_MAXPARITY ||
+ vd->vdev_children < nparity + 1) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ cvd = vd->vdev_child[c];
+
+ if ((error = vdev_open(cvd)) != 0) {
+ lasterror = error;
+ numerrors++;
+ continue;
+ }
+
+ *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+ *ashift = MAX(*ashift, cvd->vdev_ashift);
+ }
+
+ *asize *= vd->vdev_children;
+
+ if (numerrors > nparity) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ return (0);
+}
+
+static void
+vdev_raidz_close(vdev_t *vd)
+{
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_close(vd->vdev_child[c]);
+}
+
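+/*
+ * A worked example of the computation below, assuming 512-byte sectors
+ * (ashift = 9): on a 5-wide raidz1 (cols = 5, nparity = 1), a psize of
+ * 3584 bytes is 7 sectors; parity adds ceil(7 / 4) = 2 more, and
+ * rounding 9 up to a multiple of nparity + 1 = 2 yields an asize of
+ * 10 sectors, i.e. 5120 bytes.
+ */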
+static uint64_t
+vdev_raidz_asize(vdev_t *vd, uint64_t psize)
+{
+ uint64_t asize;
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ uint64_t cols = vd->vdev_children;
+ uint64_t nparity = vd->vdev_nparity;
+
+ asize = ((psize - 1) >> ashift) + 1;
+ asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
+ asize = roundup(asize, nparity + 1) << ashift;
+
+ return (asize);
+}
+
+static void
+vdev_raidz_child_done(zio_t *zio)
+{
+ raidz_col_t *rc = zio->io_private;
+
+ rc->rc_error = zio->io_error;
+ rc->rc_tried = 1;
+ rc->rc_skipped = 0;
+}
+
+static void
+vdev_raidz_repair_done(zio_t *zio)
+{
+ ASSERT(zio->io_private == zio->io_parent);
+ vdev_raidz_map_free(zio->io_private);
+}
+
+static void
+vdev_raidz_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
+ vdev_t *cvd;
+ blkptr_t *bp = zio->io_bp;
+ raidz_map_t *rm;
+ raidz_col_t *rc;
+ int c;
+
+ rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
+ vd->vdev_nparity);
+
+ ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * Generate RAID parity in the first virtual columns.
+ */
+ if (rm->rm_firstdatacol == 1)
+ vdev_raidz_generate_parity_p(rm);
+ else
+ vdev_raidz_generate_parity_pq(rm);
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_data, rc->rc_size,
+ zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+ vdev_raidz_child_done, rc));
+ }
+ zio_wait_children_done(zio);
+ return;
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ /*
+ * Iterate over the columns in reverse order so that we hit the parity
+ * last -- any errors along the way will force us to read the parity
+ * data.
+ */
+ for (c = rm->rm_cols - 1; c >= 0; c--) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+ if (vdev_is_dead(cvd)) {
+ if (c >= rm->rm_firstdatacol)
+ rm->rm_missingdata++;
+ else
+ rm->rm_missingparity++;
+ rc->rc_error = ENXIO;
+ rc->rc_tried = 1; /* don't even try */
+ rc->rc_skipped = 1;
+ continue;
+ }
+ if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
+ if (c >= rm->rm_firstdatacol)
+ rm->rm_missingdata++;
+ else
+ rm->rm_missingparity++;
+ rc->rc_error = ESTALE;
+ rc->rc_skipped = 1;
+ continue;
+ }
+ if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
+ (zio->io_flags & ZIO_FLAG_SCRUB)) {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_data, rc->rc_size,
+ zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+ vdev_raidz_child_done, rc));
+ }
+ }
+
+ zio_wait_children_done(zio);
+}
+
+/*
+ * Report a checksum error for a child of a RAID-Z device.
+ */
+static void
+raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
+{
+ vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
+ dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
+ vdev_description(vd));
+
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
+ zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+ zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
+}
+
+/*
+ * Generate the parity from the data columns. If we tried and were able to
+ * read the parity without error, verify that the generated parity matches the
+ * data we read. If it doesn't, we fire off a checksum error. Return the
+ * number of such failures.
+ */
+static int
+raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
+{
+ void *orig[VDEV_RAIDZ_MAXPARITY];
+ int c, ret = 0;
+ raidz_col_t *rc;
+
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
+ rc = &rm->rm_col[c];
+ if (!rc->rc_tried || rc->rc_error != 0)
+ continue;
+ orig[c] = zio_buf_alloc(rc->rc_size);
+ bcopy(rc->rc_data, orig[c], rc->rc_size);
+ }
+
+ if (rm->rm_firstdatacol == 1)
+ vdev_raidz_generate_parity_p(rm);
+ else
+ vdev_raidz_generate_parity_pq(rm);
+
+ for (c = 0; c < rm->rm_firstdatacol; c++) {
+ rc = &rm->rm_col[c];
+ if (!rc->rc_tried || rc->rc_error != 0)
+ continue;
+ if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
+ raidz_checksum_error(zio, rc);
+ rc->rc_error = ECKSUM;
+ ret++;
+ }
+ zio_buf_free(orig[c], rc->rc_size);
+ }
+
+ return (ret);
+}
+
+static uint64_t raidz_corrected_p;
+static uint64_t raidz_corrected_q;
+static uint64_t raidz_corrected_pq;
+
+static void
+vdev_raidz_io_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *cvd;
+ raidz_map_t *rm = zio->io_vsd;
+ raidz_col_t *rc, *rc1;
+ int unexpected_errors = 0;
+ int parity_errors = 0;
+ int parity_untried = 0;
+ int data_errors = 0;
+ int n, c, c1;
+
+ ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
+
+ zio->io_error = 0;
+ zio->io_numerrors = 0;
+
+ ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
+ ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+
+ /*
+ * We preserve any EIOs because those may be worth retrying;
+ * whereas ECKSUM and ENXIO are more likely to be persistent.
+ */
+ if (rc->rc_error) {
+ if (zio->io_error != EIO)
+ zio->io_error = rc->rc_error;
+
+ if (c < rm->rm_firstdatacol)
+ parity_errors++;
+ else
+ data_errors++;
+
+ if (!rc->rc_skipped)
+ unexpected_errors++;
+
+ zio->io_numerrors++;
+ } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
+ parity_untried++;
+ }
+ }
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * If this is not a failfast write, and we were able to
+ * write enough columns to reconstruct the data, good enough.
+ */
+ /* XXPOLICY */
+ if (zio->io_numerrors <= rm->rm_firstdatacol &&
+ !(zio->io_flags & ZIO_FLAG_FAILFAST))
+ zio->io_error = 0;
+
+ vdev_raidz_map_free(zio);
+ zio_next_stage(zio);
+ return;
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+ /*
+ * There are three potential phases for a read:
+ * 1. produce valid data from the columns read
+ * 2. read all disks and try again
+ * 3. perform combinatorial reconstruction
+ *
+ * Each phase is progressively both more expensive and less likely to
+ * occur. If we encounter more errors than we can repair or all phases
+ * fail, we have no choice but to return an error.
+ */
+
+ /*
+ * If the number of errors we saw was correctable -- less than or equal
+ * to the number of parity disks read -- attempt to produce data that
+ * has a valid checksum. Naturally, this case applies in the absence of
+ * any errors.
+ */
+ if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
+ switch (data_errors) {
+ case 0:
+ if (zio_checksum_error(zio) == 0) {
+ zio->io_error = 0;
+ if (parity_errors + parity_untried <
+ rm->rm_firstdatacol) {
+ n = raidz_parity_verify(zio, rm);
+ unexpected_errors += n;
+ ASSERT(parity_errors + n <=
+ rm->rm_firstdatacol);
+ }
+ goto done;
+ }
+ break;
+
+ case 1:
+ /*
+ * We either attempt to read all the parity columns or
+ * none of them. If we didn't try to read parity, we
+ * wouldn't be here in the correctable case. There must
+ * also have been fewer parity errors than parity
+ * columns or, again, we wouldn't be in this code path.
+ */
+ ASSERT(parity_untried == 0);
+ ASSERT(parity_errors < rm->rm_firstdatacol);
+
+ /*
+ * Find the column that reported the error.
+ */
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ if (rc->rc_error != 0)
+ break;
+ }
+ ASSERT(c != rm->rm_cols);
+ ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
+ rc->rc_error == ESTALE);
+
+ if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
+ vdev_raidz_reconstruct_p(rm, c);
+ } else {
+ ASSERT(rm->rm_firstdatacol > 1);
+ vdev_raidz_reconstruct_q(rm, c);
+ }
+
+ if (zio_checksum_error(zio) == 0) {
+ zio->io_error = 0;
+ if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
+ atomic_inc_64(&raidz_corrected_p);
+ else
+ atomic_inc_64(&raidz_corrected_q);
+
+ /*
+ * If there's more than one parity disk that
+ * was successfully read, confirm that the
+ * other parity disk produced the correct data.
+ * This routine is suboptimal in that it
+ * regenerates both the parity we wish to test
+ * as well as the parity we just used to
+ * perform the reconstruction, but this should
+ * be a relatively uncommon case, and can be
+ * optimized if it becomes a problem.
+ */
+ if (parity_errors < rm->rm_firstdatacol - 1) {
+ n = raidz_parity_verify(zio, rm);
+ unexpected_errors += n;
+ ASSERT(parity_errors + n <=
+ rm->rm_firstdatacol);
+ }
+
+ goto done;
+ }
+ break;
+
+ case 2:
+ /*
+ * Two data column errors require double parity.
+ */
+ ASSERT(rm->rm_firstdatacol == 2);
+
+ /*
+ * Find the two columns that reported errors.
+ */
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ if (rc->rc_error != 0)
+ break;
+ }
+ ASSERT(c != rm->rm_cols);
+ ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
+ rc->rc_error == ESTALE);
+
+ for (c1 = c++; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ if (rc->rc_error != 0)
+ break;
+ }
+ ASSERT(c != rm->rm_cols);
+ ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
+ rc->rc_error == ESTALE);
+
+ vdev_raidz_reconstruct_pq(rm, c1, c);
+
+ if (zio_checksum_error(zio) == 0) {
+ zio->io_error = 0;
+ atomic_inc_64(&raidz_corrected_pq);
+
+ goto done;
+ }
+ break;
+
+ default:
+ ASSERT(rm->rm_firstdatacol <= 2);
+ ASSERT(0);
+ }
+ }
+
+ /*
+ * This isn't a typical situation -- either we got a read error or
+ * a child silently returned bad data. Read every block so we can
+ * try again with as much data and parity as we can track down. If
+ * we've already been through once before, all children will be marked
+ * as tried so we'll proceed to combinatorial reconstruction.
+ */
+ unexpected_errors = 1;
+ rm->rm_missingdata = 0;
+ rm->rm_missingparity = 0;
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ if (rm->rm_col[c].rc_tried)
+ continue;
+
+ zio->io_error = 0;
+ zio_vdev_io_redone(zio);
+ do {
+ rc = &rm->rm_col[c];
+ if (rc->rc_tried)
+ continue;
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[rc->rc_devidx],
+ rc->rc_offset, rc->rc_data, rc->rc_size,
+ zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+ vdev_raidz_child_done, rc));
+ } while (++c < rm->rm_cols);
+ dprintf("rereading\n");
+ zio_wait_children_done(zio);
+ return;
+ }
+
+ /*
+ * At this point we've attempted to reconstruct the data given the
+ * errors we detected, and we've attempted to read all columns. There
+ * must, therefore, be one or more additional problems -- silent errors
+ * resulting in invalid data rather than explicit I/O errors resulting
+ * in absent data. Before we attempt combinatorial reconstruction make
+ * sure we have a chance of coming up with the right answer.
+ */
+ if (zio->io_numerrors >= rm->rm_firstdatacol) {
+ ASSERT(zio->io_error != 0);
+ goto done;
+ }
+
+ if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
+ /*
+ * Attempt to reconstruct the data from parity P.
+ */
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ void *orig;
+ rc = &rm->rm_col[c];
+
+ orig = zio_buf_alloc(rc->rc_size);
+ bcopy(rc->rc_data, orig, rc->rc_size);
+ vdev_raidz_reconstruct_p(rm, c);
+
+ if (zio_checksum_error(zio) == 0) {
+ zio_buf_free(orig, rc->rc_size);
+ zio->io_error = 0;
+ atomic_inc_64(&raidz_corrected_p);
+
+ /*
+ * If this child didn't know that it returned
+ * bad data, inform it.
+ */
+ if (rc->rc_tried && rc->rc_error == 0)
+ raidz_checksum_error(zio, rc);
+ rc->rc_error = ECKSUM;
+ goto done;
+ }
+
+ bcopy(orig, rc->rc_data, rc->rc_size);
+ zio_buf_free(orig, rc->rc_size);
+ }
+ }
+
+ if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+ /*
+ * Attempt to reconstruct the data from parity Q.
+ */
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ void *orig;
+ rc = &rm->rm_col[c];
+
+ orig = zio_buf_alloc(rc->rc_size);
+ bcopy(rc->rc_data, orig, rc->rc_size);
+ vdev_raidz_reconstruct_q(rm, c);
+
+ if (zio_checksum_error(zio) == 0) {
+ zio_buf_free(orig, rc->rc_size);
+ zio->io_error = 0;
+ atomic_inc_64(&raidz_corrected_q);
+
+ /*
+ * If this child didn't know that it returned
+ * bad data, inform it.
+ */
+ if (rc->rc_tried && rc->rc_error == 0)
+ raidz_checksum_error(zio, rc);
+ rc->rc_error = ECKSUM;
+ goto done;
+ }
+
+ bcopy(orig, rc->rc_data, rc->rc_size);
+ zio_buf_free(orig, rc->rc_size);
+ }
+ }
+
+ if (rm->rm_firstdatacol > 1 &&
+ rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
+ rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+ /*
+ * Attempt to reconstruct the data from both P and Q.
+ */
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
+ void *orig, *orig1;
+ rc = &rm->rm_col[c];
+
+ orig = zio_buf_alloc(rc->rc_size);
+ bcopy(rc->rc_data, orig, rc->rc_size);
+
+ for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
+ rc1 = &rm->rm_col[c1];
+
+ orig1 = zio_buf_alloc(rc1->rc_size);
+ bcopy(rc1->rc_data, orig1, rc1->rc_size);
+
+ vdev_raidz_reconstruct_pq(rm, c, c1);
+
+ if (zio_checksum_error(zio) == 0) {
+ zio_buf_free(orig, rc->rc_size);
+ zio_buf_free(orig1, rc1->rc_size);
+ zio->io_error = 0;
+ atomic_inc_64(&raidz_corrected_pq);
+
+ /*
+ * If these children didn't know they
+ * returned bad data, inform them.
+ */
+ if (rc->rc_tried && rc->rc_error == 0)
+ raidz_checksum_error(zio, rc);
+ if (rc1->rc_tried && rc1->rc_error == 0)
+ raidz_checksum_error(zio, rc1);
+
+ rc->rc_error = ECKSUM;
+ rc1->rc_error = ECKSUM;
+
+ goto done;
+ }
+
+ bcopy(orig1, rc1->rc_data, rc1->rc_size);
+ zio_buf_free(orig1, rc1->rc_size);
+ }
+
+ bcopy(orig, rc->rc_data, rc->rc_size);
+ zio_buf_free(orig, rc->rc_size);
+ }
+ }
+
+ /*
+ * All combinations failed to checksum. Generate checksum ereports for
+ * all children.
+ */
+ zio->io_error = ECKSUM;
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+ zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
+ rc->rc_offset, rc->rc_size);
+ }
+ }
+
+done:
+ zio_checksum_verified(zio);
+
+ if (zio->io_error == 0 && (spa_mode & FWRITE) &&
+ (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+ zio_t *rio;
+
+ /*
+ * Use the good data we have in hand to repair damaged children.
+ *
+ * We issue all repair I/Os as children of 'rio' to arrange
+ * that vdev_raidz_map_free(zio) will be invoked after all
+ * repairs complete, but before we advance to the next stage.
+ */
+ rio = zio_null(zio, zio->io_spa,
+ vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
+
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+
+ if (rc->rc_error == 0)
+ continue;
+
+ dprintf("%s resilvered %s @ 0x%llx error %d\n",
+ vdev_description(vd),
+ vdev_description(cvd),
+ zio->io_offset, rc->rc_error);
+
+ zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
+ rc->rc_offset, rc->rc_data, rc->rc_size,
+ ZIO_TYPE_WRITE, zio->io_priority,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_CANFAIL, NULL, NULL));
+ }
+
+ zio_nowait(rio);
+ zio_wait_children_done(zio);
+ return;
+ }
+
+ vdev_raidz_map_free(zio);
+ zio_next_stage(zio);
+}
+
+static void
+vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ if (faulted > vd->vdev_nparity)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ else if (degraded + faulted != 0)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ else
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+vdev_ops_t vdev_raidz_ops = {
+ vdev_raidz_open,
+ vdev_raidz_close,
+ vdev_raidz_asize,
+ vdev_raidz_io_start,
+ vdev_raidz_io_done,
+ vdev_raidz_state_change,
+ VDEV_TYPE_RAIDZ, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
new file mode 100644
index 0000000..0e8752c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
@@ -0,0 +1,118 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for the pool's root vdev.
+ */
+
+/*
+ * We should be able to tolerate one failure with absolutely no damage
+ * to our metadata. Two failures will take out space maps, a bunch of
+ * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy
+ * place to live. When we get smarter, we can liberalize this policy.
+ * e.g. If we haven't lost two consecutive top-level vdevs, then we are
+ * probably fine. Adding bean counters during alloc/free can make this
+ * future guesswork more accurate.
+ */
+/*ARGSUSED*/
+static int
+too_many_errors(vdev_t *vd, int numerrors)
+{
+ return (numerrors > 0);
+}
+
+static int
+vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+ vdev_t *cvd;
+ int c, error;
+ int lasterror = 0;
+ int numerrors = 0;
+
+ if (vd->vdev_children == 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ cvd = vd->vdev_child[c];
+
+ if ((error = vdev_open(cvd)) != 0) {
+ lasterror = error;
+ numerrors++;
+ continue;
+ }
+ }
+
+ if (too_many_errors(vd, numerrors)) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ *asize = 0;
+ *ashift = 0;
+
+ return (0);
+}
+
+static void
+vdev_root_close(vdev_t *vd)
+{
+ int c;
+
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_close(vd->vdev_child[c]);
+}
+
+static void
+vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ if (too_many_errors(vd, faulted))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ else if (degraded != 0)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ else
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+vdev_ops_t vdev_root_ops = {
+ vdev_root_open,
+ vdev_root_close,
+ vdev_default_asize,
+ NULL, /* io_start - not applicable to the root */
+ NULL, /* io_done - not applicable to the root */
+ vdev_root_state_change,
+ VDEV_TYPE_ROOT, /* name of this vdev type */
+ B_FALSE /* not a leaf vdev */
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
new file mode 100644
index 0000000..533431f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
@@ -0,0 +1,1070 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+
+/*
+ * This file contains the top half of the zfs directory structure
+ * implementation. The bottom half is in zap_leaf.c.
+ *
+ * The zdir is an extendable hash data structure. There is a table of
+ * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
+ * each a constant size and hold a variable number of directory entries.
+ * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
+ *
+ * The pointer table holds a power-of-2 number of pointers
+ * (1 << zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
+ * by the pointer at index i in the table holds entries whose hash
+ * value has a zd_prefix_len-bit prefix equal to i.
+ */
+
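+/*
+ * A minimal sketch of the prefix lookup described above (illustrative
+ * only; example_ptrtbl_idx() is not a function used by this file): the
+ * high-order zd_prefix_len bits of the 64-bit hash select the
+ * pointer-table slot, so doubling the table appends one prefix bit and
+ * each old slot fans out to two adjacent slots that initially point at
+ * the same bucket (see zap_ptrtbl_transfer() below).
+ */
+#if 0
+static uint64_t
+example_ptrtbl_idx(uint64_t hash, int prefix_len)
+{
+	return (prefix_len == 0 ? 0 : hash >> (64 - prefix_len));
+}
+#endif
+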
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/refcount.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+
+int fzap_default_block_shift = 14; /* 16k blocksize */
+
+static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
+static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
+
+
+void
+fzap_byteswap(void *vbuf, size_t size)
+{
+ uint64_t block_type;
+
+ block_type = *(uint64_t *)vbuf;
+
+ if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
+ zap_leaf_byteswap(vbuf, size);
+ else {
+ /* it's a ptrtbl block */
+ byteswap_uint64_array(vbuf, size);
+ }
+}
+
+void
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ zap_leaf_t *l;
+ int i;
+ zap_phys_t *zp;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ zap->zap_ismicro = FALSE;
+
+ (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
+ &zap->zap_f.zap_phys, zap_evict);
+
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, MUTEX_DEFAULT, 0);
+ zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1;
+
+ zp = zap->zap_f.zap_phys;
+ /*
+ * explicitly zero it since it might be coming from an
+ * initialized microzap
+ */
+ bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
+ zp->zap_block_type = ZBT_HEADER;
+ zp->zap_magic = ZAP_MAGIC;
+
+ zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
+
+ zp->zap_freeblk = 2; /* block 1 will be the first leaf */
+ zp->zap_num_leafs = 1;
+ zp->zap_num_entries = 0;
+ zp->zap_salt = zap->zap_salt;
+
+ /* block 1 will be the first leaf */
+ for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
+ ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
+
+ /*
+ * set up block 1 - the first leaf
+ */
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+
+ l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+ l->l_dbuf = db;
+ l->l_phys = db->db_data;
+
+ zap_leaf_init(l);
+
+ kmem_free(l, sizeof (zap_leaf_t));
+ dmu_buf_rele(db, FTAG);
+}
+
+static int
+zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
+{
+ if (RW_WRITE_HELD(&zap->zap_rwlock))
+ return (1);
+ if (rw_tryupgrade(&zap->zap_rwlock)) {
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Generic routines for dealing with the pointer & cookie tables.
+ */
+
+static int
+zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
+ void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
+ dmu_tx_t *tx)
+{
+ uint64_t b, newblk;
+ dmu_buf_t *db_old, *db_new;
+ int err;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ int hepb = 1<<(bs-4);
+ /* hepb = half the number of entries in a block */
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ ASSERT(tbl->zt_blk != 0);
+ ASSERT(tbl->zt_numblks > 0);
+
+ if (tbl->zt_nextblk != 0) {
+ newblk = tbl->zt_nextblk;
+ } else {
+ newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
+ tbl->zt_nextblk = newblk;
+ ASSERT3U(tbl->zt_blks_copied, ==, 0);
+ dmu_prefetch(zap->zap_objset, zap->zap_object,
+ tbl->zt_blk << bs, tbl->zt_numblks << bs);
+ }
+
+	/*
+	 * Copy the ptrtbl from the old to the new location, one old
+	 * block per call; each old block fans out into two new blocks,
+	 * and the copy is complete once zt_blks_copied reaches
+	 * zt_numblks.
+	 */
+
+ b = tbl->zt_blks_copied;
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + b) << bs, FTAG, &db_old);
+ if (err)
+ return (err);
+
+ /* first half of entries in old[b] go to new[2*b+0] */
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (newblk + 2*b+0) << bs, FTAG, &db_new));
+ dmu_buf_will_dirty(db_new, tx);
+ transfer_func(db_old->db_data, db_new->db_data, hepb);
+ dmu_buf_rele(db_new, FTAG);
+
+ /* second half of entries in old[b] go to new[2*b+1] */
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (newblk + 2*b+1) << bs, FTAG, &db_new));
+ dmu_buf_will_dirty(db_new, tx);
+ transfer_func((uint64_t *)db_old->db_data + hepb,
+ db_new->db_data, hepb);
+ dmu_buf_rele(db_new, FTAG);
+
+ dmu_buf_rele(db_old, FTAG);
+
+ tbl->zt_blks_copied++;
+
+ dprintf("copied block %llu of %llu\n",
+ tbl->zt_blks_copied, tbl->zt_numblks);
+
+ if (tbl->zt_blks_copied == tbl->zt_numblks) {
+ (void) dmu_free_range(zap->zap_objset, zap->zap_object,
+ tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
+
+ tbl->zt_blk = newblk;
+ tbl->zt_numblks *= 2;
+ tbl->zt_shift++;
+ tbl->zt_nextblk = 0;
+ tbl->zt_blks_copied = 0;
+
+ dprintf("finished; numblocks now %llu (%lluk entries)\n",
+ tbl->zt_numblks, 1<<(tbl->zt_shift-10));
+ }
+
+ return (0);
+}
+
+static int
+zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
+ dmu_tx_t *tx)
+{
+ int err;
+ uint64_t blk, off;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ dmu_buf_t *db;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT(tbl->zt_blk != 0);
+
+ dprintf("storing %llx at index %llx\n", val, idx);
+
+ blk = idx >> (bs-3);
+ off = idx & ((1<<(bs-3))-1);
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + blk) << bs, FTAG, &db);
+ if (err)
+ return (err);
+ dmu_buf_will_dirty(db, tx);
+
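+	/*
+	 * If the table is in the middle of being migrated to its
+	 * doubled location, also store the value in both corresponding
+	 * entries of the new copy so the update is not lost when the
+	 * copy completes.
+	 */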
+ if (tbl->zt_nextblk != 0) {
+ uint64_t idx2 = idx * 2;
+ uint64_t blk2 = idx2 >> (bs-3);
+ uint64_t off2 = idx2 & ((1<<(bs-3))-1);
+ dmu_buf_t *db2;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_nextblk + blk2) << bs, FTAG, &db2);
+ if (err) {
+ dmu_buf_rele(db, FTAG);
+ return (err);
+ }
+ dmu_buf_will_dirty(db2, tx);
+ ((uint64_t *)db2->db_data)[off2] = val;
+ ((uint64_t *)db2->db_data)[off2+1] = val;
+ dmu_buf_rele(db2, FTAG);
+ }
+
+ ((uint64_t *)db->db_data)[off] = val;
+ dmu_buf_rele(db, FTAG);
+
+ return (0);
+}
+
+static int
+zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
+{
+ uint64_t blk, off;
+ int err;
+ dmu_buf_t *db;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ blk = idx >> (bs-3);
+ off = idx & ((1<<(bs-3))-1);
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + blk) << bs, FTAG, &db);
+ if (err)
+ return (err);
+ *valp = ((uint64_t *)db->db_data)[off];
+ dmu_buf_rele(db, FTAG);
+
+ if (tbl->zt_nextblk != 0) {
+		/*
+		 * Read the corresponding entry in the migration target
+		 * (nextblk) purely for the sake of I/O error checking,
+		 * so that zap_table_load() catches errors on behalf of
+		 * zap_table_store(), which writes both copies.
+		 */
+ blk = (idx*2) >> (bs-3);
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_nextblk + blk) << bs, FTAG, &db);
+ dmu_buf_rele(db, FTAG);
+ }
+ return (err);
+}
+
+/*
+ * Routines for growing the ptrtbl.
+ */
+
+static void
+zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
+{
+ int i;
+ for (i = 0; i < n; i++) {
+ uint64_t lb = src[i];
+ dst[2*i+0] = lb;
+ dst[2*i+1] = lb;
+ }
+}
+
+static int
+zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
+{
+ /* In case things go horribly wrong. */
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2)
+ return (ENOSPC);
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ /*
+ * We are outgrowing the "embedded" ptrtbl (the one
+ * stored in the header block). Give it its own entire
+ * block, which will double the size of the ptrtbl.
+ */
+ uint64_t newblk;
+ dmu_buf_t *db_new;
+ int err;
+
+ ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+ ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
+
+ newblk = zap_allocate_blocks(zap, 1);
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new);
+ if (err)
+ return (err);
+ dmu_buf_will_dirty(db_new, tx);
+ zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
+ db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ dmu_buf_rele(db_new, FTAG);
+
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
+
+ ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
+ (FZAP_BLOCK_SHIFT(zap)-3));
+
+ return (0);
+ } else {
+ return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ zap_ptrtbl_transfer, tx));
+ }
+}
+
+static void
+zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx);
+ ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
+ zap->zap_f.zap_phys->zap_num_entries += delta;
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+}
+
+static uint64_t
+zap_allocate_blocks(zap_t *zap, int nblocks)
+{
+ uint64_t newblk;
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ newblk = zap->zap_f.zap_phys->zap_freeblk;
+ zap->zap_f.zap_phys->zap_freeblk += nblocks;
+ return (newblk);
+}
+
+static zap_leaf_t *
+zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
+{
+ void *winner;
+ zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0);
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = zap_allocate_blocks(zap, 1);
+ l->l_dbuf = NULL;
+ l->l_phys = NULL;
+
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf));
+ winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
+ ASSERT(winner == NULL);
+ dmu_buf_will_dirty(l->l_dbuf, tx);
+
+ zap_leaf_init(l);
+
+ zap->zap_f.zap_phys->zap_num_leafs++;
+
+ return (l);
+}
+
+int
+fzap_count(zap_t *zap, uint64_t *count)
+{
+ ASSERT(!zap->zap_ismicro);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
+ *count = zap->zap_f.zap_phys->zap_num_entries;
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+ return (0);
+}
+
+/*
+ * Routines for obtaining zap_leaf_t's
+ */
+
+void
+zap_put_leaf(zap_leaf_t *l)
+{
+ rw_exit(&l->l_rwlock);
+ dmu_buf_rele(l->l_dbuf, NULL);
+}
+
+_NOTE(ARGSUSED(0))
+static void
+zap_leaf_pageout(dmu_buf_t *db, void *vl)
+{
+ zap_leaf_t *l = vl;
+
+ rw_destroy(&l->l_rwlock);
+ kmem_free(l, sizeof (zap_leaf_t));
+}
+
+static zap_leaf_t *
+zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
+{
+ zap_leaf_t *l, *winner;
+
+ ASSERT(blkid != 0);
+
+ l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+ rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0);
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = blkid;
+ l->l_bs = highbit(db->db_size)-1;
+ l->l_dbuf = db;
+ l->l_phys = NULL;
+
+ winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
+
+ rw_exit(&l->l_rwlock);
+ if (winner != NULL) {
+ /* someone else set it first */
+ zap_leaf_pageout(NULL, l);
+ l = winner;
+ }
+
+ /*
+	 * lh_pad1 was previously used for the next leaf in the leaf
+	 * chain.  There should be no chained leaves (we have removed
+	 * support for them).
+ */
+ ASSERT3U(l->l_phys->l_hdr.lh_pad1, ==, 0);
+
+ /*
+	 * There should be more hash-table entries than there can be
+	 * leaf entries to hash into them; each entry consumes at
+	 * least three chunks.
+ */
+ ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
+
+ /* The chunks should begin at the end of the hash table */
+ ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
+ &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
+
+ /* The chunks should end at the end of the block */
+ ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
+ (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size);
+
+ return (l);
+}
+
+static int
+zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
+ zap_leaf_t **lp)
+{
+ dmu_buf_t *db;
+ zap_leaf_t *l;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ int err;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ blkid << bs, NULL, &db);
+ if (err)
+ return (err);
+
+ ASSERT3U(db->db_object, ==, zap->zap_object);
+ ASSERT3U(db->db_offset, ==, blkid << bs);
+ ASSERT3U(db->db_size, ==, 1 << bs);
+ ASSERT(blkid != 0);
+
+ l = dmu_buf_get_user(db);
+
+ if (l == NULL)
+ l = zap_open_leaf(blkid, db);
+
+ rw_enter(&l->l_rwlock, lt);
+ /*
+ * Must lock before dirtying, otherwise l->l_phys could change,
+ * causing ASSERT below to fail.
+ */
+ if (lt == RW_WRITER)
+ dmu_buf_will_dirty(db, tx);
+ ASSERT3U(l->l_blkid, ==, blkid);
+ ASSERT3P(l->l_dbuf, ==, db);
+ ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
+ ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF);
+ ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ *lp = l;
+ return (0);
+}
+
+static int
+zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
+{
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ ASSERT3U(idx, <,
+ (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
+ *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
+ return (0);
+ } else {
+ return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ idx, valp));
+ }
+}
+
+static int
+zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
+{
+ ASSERT(tx != NULL);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
+ ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
+ return (0);
+ } else {
+ return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ idx, blk, tx));
+ }
+}
+
+static int
+zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
+{
+ uint64_t idx, blk;
+ int err;
+
+ ASSERT(zap->zap_dbuf == NULL ||
+ zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
+ ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
+ idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ err = zap_idx_to_blk(zap, idx, &blk);
+ if (err != 0)
+ return (err);
+ err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
+
+ ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) ==
+ (*lp)->l_phys->l_hdr.lh_prefix);
+ return (err);
+}
+
+static int
+zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx,
+ zap_leaf_t **lp)
+{
+ zap_leaf_t *nl;
+ int prefix_diff, i, err;
+ uint64_t sibling;
+ int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
+
+ ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
+ l->l_phys->l_hdr.lh_prefix);
+
+ if (zap_tryupgradedir(zap, tx) == 0 ||
+ old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+ /* We failed to upgrade, or need to grow the pointer table */
+ objset_t *os = zap->zap_objset;
+ uint64_t object = zap->zap_object;
+
+ zap_put_leaf(l);
+ zap_unlockdir(zap);
+ err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap);
+ if (err)
+ return (err);
+ ASSERT(!zap->zap_ismicro);
+
+ while (old_prefix_len ==
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+ err = zap_grow_ptrtbl(zap, tx);
+ if (err)
+ return (err);
+ }
+
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err)
+ return (err);
+
+ if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) {
+ /* it split while our locks were down */
+ *lp = l;
+ return (0);
+ }
+ }
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
+ l->l_phys->l_hdr.lh_prefix);
+
+ prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
+ (old_prefix_len + 1);
+ sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
+
+ /* check for i/o errors before doing zap_leaf_split */
+ for (i = 0; i < (1ULL<<prefix_diff); i++) {
+ uint64_t blk;
+ err = zap_idx_to_blk(zap, sibling+i, &blk);
+ if (err)
+ return (err);
+ ASSERT3U(blk, ==, l->l_blkid);
+ }
+
+ nl = zap_create_leaf(zap, tx);
+ zap_leaf_split(l, nl);
+
+ /* set sibling pointers */
+ for (i = 0; i < (1ULL<<prefix_diff); i++) {
+ err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
+ ASSERT3U(err, ==, 0); /* we checked for i/o errors above */
+ }
+
+ if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) {
+ /* we want the sibling */
+ zap_put_leaf(l);
+ *lp = nl;
+ } else {
+ zap_put_leaf(nl);
+ *lp = l;
+ }
+
+ return (0);
+}
+
+static void
+zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx)
+{
+ int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+ int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift &&
+ l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
+
+ zap_put_leaf(l);
+
+ if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) {
+ int err;
+
+ /*
+ * We are in the middle of growing the pointer table, or
+ * this leaf will soon make us grow it.
+ */
+ if (zap_tryupgradedir(zap, tx) == 0) {
+ objset_t *os = zap->zap_objset;
+ uint64_t zapobj = zap->zap_object;
+
+ zap_unlockdir(zap);
+ err = zap_lockdir(os, zapobj, tx,
+ RW_WRITER, FALSE, &zap);
+ if (err)
+ return;
+ }
+
+ /* could have finished growing while our locks were down */
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift)
+ (void) zap_grow_ptrtbl(zap, tx);
+ }
+}
+
+
+static int
+fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers)
+{
+ if (name && strlen(name) > ZAP_MAXNAMELEN)
+ return (E2BIG);
+
+ /* Only integer sizes supported by C */
+ switch (integer_size) {
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ if (integer_size * num_integers > ZAP_MAXVALUELEN)
+ return (E2BIG);
+
+ return (0);
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+int
+fzap_lookup(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ zap_leaf_t *l;
+ int err;
+ uint64_t hash;
+ zap_entry_handle_t zeh;
+
+ err = fzap_checksize(name, integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ hash = zap_hash(zap, name);
+ err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err == 0)
+ err = zap_entry_read(&zeh, integer_size, num_integers, buf);
+
+ zap_put_leaf(l);
+ return (err);
+}
+
+int
+fzap_add_cd(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, uint32_t cd, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ uint64_t hash;
+ int err;
+ zap_entry_handle_t zeh;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT(!zap->zap_ismicro);
+ ASSERT(fzap_checksize(name, integer_size, num_integers) == 0);
+
+ hash = zap_hash(zap, name);
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+retry:
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err == 0) {
+ err = EEXIST;
+ goto out;
+ }
+ if (err != ENOENT)
+ goto out;
+
+ err = zap_entry_create(l, name, hash, cd,
+ integer_size, num_integers, val, &zeh);
+
+ if (err == 0) {
+ zap_increment_num_entries(zap, 1, tx);
+ } else if (err == EAGAIN) {
+ err = zap_expand_leaf(zap, l, hash, tx, &l);
+ if (err == 0)
+ goto retry;
+ }
+
+out:
+ zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
+ return (err);
+}
+
+int
+fzap_add(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ int err = fzap_checksize(name, integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ return (fzap_add_cd(zap, name, integer_size, num_integers,
+ val, ZAP_MAXCD, tx));
+}
+
+int
+fzap_update(zap_t *zap, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ uint64_t hash;
+ int err, create;
+ zap_entry_handle_t zeh;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ err = fzap_checksize(name, integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ hash = zap_hash(zap, name);
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+retry:
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ create = (err == ENOENT);
+ ASSERT(err == 0 || err == ENOENT);
+
+ /* XXX If this leaf is chained, split it if we can. */
+
+ if (create) {
+ err = zap_entry_create(l, name, hash, ZAP_MAXCD,
+ integer_size, num_integers, val, &zeh);
+ if (err == 0)
+ zap_increment_num_entries(zap, 1, tx);
+ } else {
+ err = zap_entry_update(&zeh, integer_size, num_integers, val);
+ }
+
+ if (err == EAGAIN) {
+ err = zap_expand_leaf(zap, l, hash, tx, &l);
+ if (err == 0)
+ goto retry;
+ }
+
+ zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
+ return (err);
+}
+
+int
+fzap_length(zap_t *zap, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_leaf_t *l;
+ int err;
+ uint64_t hash;
+ zap_entry_handle_t zeh;
+
+ hash = zap_hash(zap, name);
+ err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err != 0)
+ goto out;
+
+ if (integer_size)
+ *integer_size = zeh.zeh_integer_size;
+ if (num_integers)
+ *num_integers = zeh.zeh_num_integers;
+out:
+ zap_put_leaf(l);
+ return (err);
+}
+
+int
+fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ uint64_t hash;
+ int err;
+ zap_entry_handle_t zeh;
+
+ hash = zap_hash(zap, name);
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, name, hash, &zeh);
+ if (err == 0) {
+ zap_entry_remove(&zeh);
+ zap_increment_num_entries(zap, -1, tx);
+ }
+ zap_put_leaf(l);
+ dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n",
+ zap->zap_objset, zap->zap_object, name, err);
+ return (err);
+}
+
+int
+zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name)
+{
+ zap_cursor_t zc;
+ zap_attribute_t *za;
+ int err;
+
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, zapobj);
+ (err = zap_cursor_retrieve(&zc, za)) == 0;
+ zap_cursor_advance(&zc)) {
+ if (za->za_first_integer == value) {
+ (void) strcpy(name, za->za_name);
+ break;
+ }
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (zap_attribute_t));
+ return (err);
+}
+
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+int
+fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
+{
+ int err = ENOENT;
+ zap_entry_handle_t zeh;
+ zap_leaf_t *l;
+
+ /* retrieve the next entry at or after zc_hash/zc_cd */
+ /* if no entry, return ENOENT */
+
+ if (zc->zc_leaf &&
+ (ZAP_HASH_IDX(zc->zc_hash,
+ zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) !=
+ zc->zc_leaf->l_phys->l_hdr.lh_prefix)) {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ }
+
+again:
+ if (zc->zc_leaf == NULL) {
+ err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
+ &zc->zc_leaf);
+ if (err != 0)
+ return (err);
+ } else {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ }
+ l = zc->zc_leaf;
+
+ err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
+
+ if (err == ENOENT) {
+ uint64_t nocare =
+ (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1;
+ zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
+ zc->zc_cd = 0;
+ if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) {
+ zc->zc_hash = -1ULL;
+ } else {
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ goto again;
+ }
+ }
+
+ if (err == 0) {
+ zc->zc_hash = zeh.zeh_hash;
+ zc->zc_cd = zeh.zeh_cd;
+ za->za_integer_length = zeh.zeh_integer_size;
+ za->za_num_integers = zeh.zeh_num_integers;
+ if (zeh.zeh_num_integers == 0) {
+ za->za_first_integer = 0;
+ } else {
+ err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
+ ASSERT(err == 0 || err == EOVERFLOW);
+ }
+ err = zap_entry_read_name(&zeh,
+ sizeof (za->za_name), za->za_name);
+ ASSERT(err == 0);
+ }
+ rw_exit(&zc->zc_leaf->l_rwlock);
+ return (err);
+}
+
+
+static void
+zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
+{
+ int i, err;
+ uint64_t lastblk = 0;
+
+ /*
+ * NB: if a leaf has more pointers than an entire ptrtbl block
+ * can hold, then it'll be accounted for more than once, since
+	 * lastblk does not carry over between blocks.
+ */
+ for (i = 0; i < len; i++) {
+ zap_leaf_t *l;
+
+ if (tbl[i] == lastblk)
+ continue;
+ lastblk = tbl[i];
+
+ err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
+ if (err == 0) {
+ zap_leaf_stats(zap, l, zs);
+ zap_put_leaf(l);
+ }
+ }
+}
+
+void
+fzap_get_stats(zap_t *zap, zap_stats_t *zs)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ zs->zs_blocksize = 1ULL << bs;
+
+ /*
+ * Set zap_phys_t fields
+ */
+ zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
+ zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
+ zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
+ zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type;
+ zs->zs_magic = zap->zap_f.zap_phys->zap_magic;
+ zs->zs_salt = zap->zap_f.zap_phys->zap_salt;
+
+ /*
+ * Set zap_ptrtbl fields
+ */
+ zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+ zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk;
+ zs->zs_ptrtbl_blks_copied =
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied;
+ zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk;
+ zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
+ zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ /* the ptrtbl is entirely in the header block. */
+ zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
+ 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
+ } else {
+ int b;
+
+ dmu_prefetch(zap->zap_objset, zap->zap_object,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs,
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs);
+
+ for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
+ b++) {
+ dmu_buf_t *db;
+ int err;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
+ FTAG, &db);
+ if (err == 0) {
+ zap_stats_ptrtbl(zap, db->db_data,
+ 1<<(bs-3), zs);
+ dmu_buf_rele(db, FTAG);
+ }
+ }
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
new file mode 100644
index 0000000..5dff514
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
@@ -0,0 +1,741 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * The 512-byte leaf is broken into 32 16-byte chunks.
+ * Chunk number n means l_chunk[n], even though the header precedes it.
+ * The names are stored null-terminated.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+
+#define CHAIN_END 0xffff /* end of the chunk chain */
+
+/* half the (current) minimum block size */
+#define MAX_ARRAY_BYTES (8<<10)
+
+#define LEAF_HASH(l, h) \
+ ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
+ ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))
+
+#define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
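+
+/*
+ * A hedged example of LEAF_HASH (sizes assumed): if a leaf has 16
+ * hash buckets (ZAP_LEAF_HASH_SHIFT(l) == 4) and lh_prefix_len is
+ * 10, the 10 prefix bits have already been consumed by the pointer
+ * table, so the next 4 bits of the hash select the bucket:
+ *
+ *	bucket = (h >> (64 - 4 - 10)) & (16 - 1);
+ */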
+
+
+static void
+zap_memset(void *a, int c, size_t n)
+{
+ char *cp = a;
+ char *cpend = cp + n;
+
+ while (cp < cpend)
+ *cp++ = c;
+}
+
+static void
+stv(int len, void *addr, uint64_t value)
+{
+ switch (len) {
+ case 1:
+ *(uint8_t *)addr = value;
+ return;
+ case 2:
+ *(uint16_t *)addr = value;
+ return;
+ case 4:
+ *(uint32_t *)addr = value;
+ return;
+ case 8:
+ *(uint64_t *)addr = value;
+ return;
+ }
+ ASSERT(!"bad int len");
+}
+
+static uint64_t
+ldv(int len, const void *addr)
+{
+ switch (len) {
+ case 1:
+ return (*(uint8_t *)addr);
+ case 2:
+ return (*(uint16_t *)addr);
+ case 4:
+ return (*(uint32_t *)addr);
+ case 8:
+ return (*(uint64_t *)addr);
+ }
+ ASSERT(!"bad int len");
+ return (0xFEEDFACEDEADBEEFULL);
+}
+
+void
+zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
+{
+ int i;
+ zap_leaf_t l;
+ l.l_bs = highbit(size)-1;
+ l.l_phys = buf;
+
+ buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type);
+ buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix);
+ buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic);
+ buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree);
+ buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries);
+ buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
+ buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
+
+ for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
+ buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
+
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
+ zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
+ struct zap_leaf_entry *le;
+
+ switch (lc->l_free.lf_type) {
+ case ZAP_CHUNK_ENTRY:
+ le = &lc->l_entry;
+
+ le->le_type = BSWAP_8(le->le_type);
+ le->le_int_size = BSWAP_8(le->le_int_size);
+ le->le_next = BSWAP_16(le->le_next);
+ le->le_name_chunk = BSWAP_16(le->le_name_chunk);
+ le->le_name_length = BSWAP_16(le->le_name_length);
+ le->le_value_chunk = BSWAP_16(le->le_value_chunk);
+ le->le_value_length = BSWAP_16(le->le_value_length);
+ le->le_cd = BSWAP_32(le->le_cd);
+ le->le_hash = BSWAP_64(le->le_hash);
+ break;
+ case ZAP_CHUNK_FREE:
+ lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type);
+ lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next);
+ break;
+ case ZAP_CHUNK_ARRAY:
+ lc->l_array.la_type = BSWAP_8(lc->l_array.la_type);
+ lc->l_array.la_next = BSWAP_16(lc->l_array.la_next);
+ /* la_array doesn't need swapping */
+ break;
+ default:
+ ASSERT(!"bad leaf type");
+ }
+ }
+}
+
+void
+zap_leaf_init(zap_leaf_t *l)
+{
+ int i;
+
+ l->l_bs = highbit(l->l_dbuf->db_size)-1;
+ zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
+ zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
+ ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
+ }
+ ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
+ l->l_phys->l_hdr.lh_block_type = ZBT_LEAF;
+ l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
+ l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
+}
+
+/*
+ * Routines which manipulate leaf chunks (l_chunk[]).
+ */
+
+static uint16_t
+zap_leaf_chunk_alloc(zap_leaf_t *l)
+{
+ int chunk;
+
+ ASSERT(l->l_phys->l_hdr.lh_nfree > 0);
+
+ chunk = l->l_phys->l_hdr.lh_freelist;
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
+
+ l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
+
+ l->l_phys->l_hdr.lh_nfree--;
+
+ return (chunk);
+}
+
+static void
+zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
+{
+ struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
+ ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
+
+ zlf->lf_type = ZAP_CHUNK_FREE;
+ zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
+ bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
+ l->l_phys->l_hdr.lh_freelist = chunk;
+
+ l->l_phys->l_hdr.lh_nfree++;
+}
+
+/*
+ * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
+ */
+
+static uint16_t
+zap_leaf_array_create(zap_leaf_t *l, const char *buf,
+ int integer_size, int num_integers)
+{
+ uint16_t chunk_head;
+ uint16_t *chunkp = &chunk_head;
+ int byten = 0;
+ uint64_t value;
+ int shift = (integer_size-1)*8;
+ int len = num_integers;
+
+ ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
+
+ while (len > 0) {
+ uint16_t chunk = zap_leaf_chunk_alloc(l);
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int i;
+
+ la->la_type = ZAP_CHUNK_ARRAY;
+ for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
+ if (byten == 0)
+ value = ldv(integer_size, buf);
+ la->la_array[i] = value >> shift;
+ value <<= 8;
+ if (++byten == integer_size) {
+ byten = 0;
+ buf += integer_size;
+ if (--len == 0)
+ break;
+ }
+ }
+
+ *chunkp = chunk;
+ chunkp = &la->la_next;
+ }
+ *chunkp = CHAIN_END;
+
+ return (chunk_head);
+}
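+
+/*
+ * Byte-order sketch (value assumed): storing the single 4-byte
+ * integer 0x11223344 emits la_array[] bytes 0x11 0x22 0x33 0x44.
+ * The shift/ldv() staging above packs integers big-endian on disk
+ * regardless of host byte order; zap_leaf_array_read() below
+ * reverses it.
+ */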
+
+static void
+zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
+{
+ uint16_t chunk = *chunkp;
+
+ *chunkp = CHAIN_END;
+
+ while (chunk != CHAIN_END) {
+ int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
+ ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
+ ZAP_CHUNK_ARRAY);
+ zap_leaf_chunk_free(l, chunk);
+ chunk = nextchunk;
+ }
+}
+
+/* array_len and buf_len are in integers, not bytes */
+static void
+zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
+ int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
+ char *buf)
+{
+ int len = MIN(array_len, buf_len);
+ int byten = 0;
+ uint64_t value = 0;
+
+ ASSERT3U(array_int_len, <=, buf_int_len);
+
+ /* Fast path for one 8-byte integer */
+ if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ uint8_t *ip = la->la_array;
+ uint64_t *buf64 = (uint64_t *)buf;
+
+ *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
+ (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
+ (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
+ (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
+ return;
+ }
+
+	/* Fast path for an array of 1-byte integers (e.g. the entry name) */
+ if (array_int_len == 1 && buf_int_len == 1 &&
+ buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
+ while (chunk != CHAIN_END) {
+ struct zap_leaf_array *la =
+ &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES);
+ buf += ZAP_LEAF_ARRAY_BYTES;
+ chunk = la->la_next;
+ }
+ return;
+ }
+
+ while (len > 0) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int i;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
+ value = (value << 8) | la->la_array[i];
+ byten++;
+ if (byten == array_int_len) {
+ stv(buf_int_len, buf, value);
+ byten = 0;
+ len--;
+ if (len == 0)
+ return;
+ buf += buf_int_len;
+ }
+ }
+ chunk = la->la_next;
+ }
+}
+
+/*
+ * Only to be used on 8-bit arrays.
+ * array_len is actual len in bytes (not encoded le_value_length).
+ * buf is null-terminated.
+ */
+static int
+zap_leaf_array_equal(zap_leaf_t *l, int chunk,
+ int array_len, const char *buf)
+{
+ int bseen = 0;
+
+ while (bseen < array_len) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ if (bcmp(la->la_array, buf + bseen, toread))
+ break;
+ chunk = la->la_next;
+ bseen += toread;
+ }
+ return (bseen == array_len);
+}
+
+/*
+ * Routines which manipulate leaf entries.
+ */
+
+int
+zap_leaf_lookup(zap_leaf_t *l,
+ const char *name, uint64_t h, zap_entry_handle_t *zeh)
+{
+ uint16_t *chunkp;
+ struct zap_leaf_entry *le;
+
+ ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ for (chunkp = LEAF_HASH_ENTPTR(l, h);
+ *chunkp != CHAIN_END; chunkp = &le->le_next) {
+ uint16_t chunk = *chunkp;
+ le = ZAP_LEAF_ENTRY(l, chunk);
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (le->le_hash != h)
+ continue;
+
+ if (zap_leaf_array_equal(l, le->le_name_chunk,
+ le->le_name_length, name)) {
+ zeh->zeh_num_integers = le->le_value_length;
+ zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+ zeh->zeh_leaf = l;
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+/* Return (h1,cd1 >= h2,cd2) */
+#define HCD_GTEQ(h1, cd1, h2, cd2) \
+ ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
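+
+/*
+ * E.g. (values assumed): HCD_GTEQ(0x10, 5, 0x10, 3) is TRUE and
+ * HCD_GTEQ(0x0f, 9, 0x10, 0) is FALSE; entries are totally ordered
+ * by hash first and collision differentiator second, which matches
+ * the order the cursor walks entries in.
+ */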
+
+int
+zap_leaf_lookup_closest(zap_leaf_t *l,
+ uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
+{
+ uint16_t chunk;
+ uint64_t besth = -1ULL;
+ uint32_t bestcd = ZAP_MAXCD;
+ uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
+ uint16_t lh;
+ struct zap_leaf_entry *le;
+
+ ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
+ for (chunk = l->l_phys->l_hash[lh];
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
+ HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
+ ASSERT3U(bestlh, >=, lh);
+ bestlh = lh;
+ besth = le->le_hash;
+ bestcd = le->le_cd;
+
+ zeh->zeh_num_integers = le->le_value_length;
+ zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_fakechunk = chunk;
+ zeh->zeh_chunkp = &zeh->zeh_fakechunk;
+ zeh->zeh_leaf = l;
+ }
+ }
+ }
+
+ return (bestcd == ZAP_MAXCD ? ENOENT : 0);
+}
+
+int
+zap_entry_read(const zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, void *buf)
+{
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (le->le_int_size > integer_size)
+ return (EINVAL);
+
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_int_size,
+ le->le_value_length, integer_size, num_integers, buf);
+
+ if (zeh->zeh_num_integers > num_integers)
+ return (EOVERFLOW);
+ return (0);
+}
+
+int
+zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
+{
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
+ le->le_name_length, 1, buflen, buf);
+ if (le->le_name_length > buflen)
+ return (EOVERFLOW);
+ return (0);
+}
+
+int
+zap_entry_update(zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, const void *buf)
+{
+ int delta_chunks;
+ zap_leaf_t *l = zeh->zeh_leaf;
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
+
+ delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
+ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * le->le_int_size);
+
+ if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
+ return (EAGAIN);
+
+ /*
+ * We should search other chained leaves (via
+	 * zap_entry_remove/create?); otherwise returning EAGAIN will
+ * just send us into an infinite loop if we have to chain
+ * another leaf block, rather than being able to split this
+ * block.
+ */
+
+ zap_leaf_array_free(l, &le->le_value_chunk);
+ le->le_value_chunk =
+ zap_leaf_array_create(l, buf, integer_size, num_integers);
+ le->le_value_length = num_integers;
+ le->le_int_size = integer_size;
+ return (0);
+}
+
+void
+zap_entry_remove(zap_entry_handle_t *zeh)
+{
+ uint16_t entry_chunk;
+ struct zap_leaf_entry *le;
+ zap_leaf_t *l = zeh->zeh_leaf;
+
+ ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
+
+ entry_chunk = *zeh->zeh_chunkp;
+ le = ZAP_LEAF_ENTRY(l, entry_chunk);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ zap_leaf_array_free(l, &le->le_name_chunk);
+ zap_leaf_array_free(l, &le->le_value_chunk);
+
+ *zeh->zeh_chunkp = le->le_next;
+ zap_leaf_chunk_free(l, entry_chunk);
+
+ l->l_phys->l_hdr.lh_nentries--;
+}
+
+int
+zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh)
+{
+ uint16_t chunk;
+ uint16_t *chunkp;
+ struct zap_leaf_entry *le;
+ uint64_t namelen, valuelen;
+ int numchunks;
+
+ valuelen = integer_size * num_integers;
+ namelen = strlen(name) + 1;
+ ASSERT(namelen >= 2);
+
+ numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) +
+ ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
+ if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
+ return (E2BIG);
+
+ if (cd == ZAP_MAXCD) {
+ for (cd = 0; cd < ZAP_MAXCD; cd++) {
+ for (chunk = *LEAF_HASH_ENTPTR(l, h);
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ if (le->le_hash == h &&
+ le->le_cd == cd) {
+ break;
+ }
+ }
+ /* If this cd is not in use, we are good. */
+ if (chunk == CHAIN_END)
+ break;
+ }
+ /* If we tried all the cd's, we lose. */
+ if (cd == ZAP_MAXCD)
+ return (ENOSPC);
+ }
+
+ if (l->l_phys->l_hdr.lh_nfree < numchunks)
+ return (EAGAIN);
+
+ /* make the entry */
+ chunk = zap_leaf_chunk_alloc(l);
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ le->le_type = ZAP_CHUNK_ENTRY;
+ le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen);
+ le->le_name_length = namelen;
+ le->le_value_chunk =
+ zap_leaf_array_create(l, buf, integer_size, num_integers);
+ le->le_value_length = num_integers;
+ le->le_int_size = integer_size;
+ le->le_hash = h;
+ le->le_cd = cd;
+
+ /* link it into the hash chain */
+ chunkp = LEAF_HASH_ENTPTR(l, h);
+ le->le_next = *chunkp;
+ *chunkp = chunk;
+
+ l->l_phys->l_hdr.lh_nentries++;
+
+ zeh->zeh_leaf = l;
+ zeh->zeh_num_integers = num_integers;
+ zeh->zeh_integer_size = le->le_int_size;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+
+ return (0);
+}
+
+/*
+ * Routines for transferring entries between leaves.
+ */
+
+static void
+zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
+{
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
+ uint16_t *ptr = LEAF_HASH_ENTPTR(l, le->le_hash);
+ le->le_next = *ptr;
+ *ptr = entry;
+}
+
+static uint16_t
+zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
+{
+ uint16_t new_chunk;
+ uint16_t *nchunkp = &new_chunk;
+
+ while (chunk != CHAIN_END) {
+ uint16_t nchunk = zap_leaf_chunk_alloc(nl);
+ struct zap_leaf_array *nla =
+ &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
+ struct zap_leaf_array *la =
+ &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int nextchunk = la->la_next;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
+
+ *nla = *la; /* structure assignment */
+
+ zap_leaf_chunk_free(l, chunk);
+ chunk = nextchunk;
+ *nchunkp = nchunk;
+ nchunkp = &nla->la_next;
+ }
+ *nchunkp = CHAIN_END;
+ return (new_chunk);
+}
+
+static void
+zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
+{
+ struct zap_leaf_entry *le, *nle;
+ uint16_t chunk;
+
+ le = ZAP_LEAF_ENTRY(l, entry);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ chunk = zap_leaf_chunk_alloc(nl);
+ nle = ZAP_LEAF_ENTRY(nl, chunk);
+ *nle = *le; /* structure assignment */
+
+ zap_leaf_rehash_entry(nl, chunk);
+
+ nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
+ nle->le_value_chunk =
+ zap_leaf_transfer_array(l, le->le_value_chunk, nl);
+
+ zap_leaf_chunk_free(l, entry);
+
+ l->l_phys->l_hdr.lh_nentries--;
+ nl->l_phys->l_hdr.lh_nentries++;
+}
+
+/*
+ * Transfer the entries whose hash prefix ends in 1 to the new leaf.
+ */
+void
+zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl)
+{
+ int i;
+ int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;
+
+ /* set new prefix and prefix_len */
+ l->l_phys->l_hdr.lh_prefix <<= 1;
+ l->l_phys->l_hdr.lh_prefix_len++;
+ nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
+ nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
+
+ /* break existing hash chains */
+ zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+
+ /*
+ * Transfer entries whose hash bit 'bit' is set to nl; rehash
+ * the remaining entries
+	 * the remaining entries.
+ * NB: We could find entries via the hashtable instead. That
+ * would be O(hashents+numents) rather than O(numblks+numents),
+ * but this accesses memory more sequentially, and when we're
+ * called, the block is usually pretty full.
+ */
+ for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
+ if (le->le_type != ZAP_CHUNK_ENTRY)
+ continue;
+
+ if (le->le_hash & (1ULL << bit))
+ zap_leaf_transfer_entry(l, i, nl);
+ else
+ zap_leaf_rehash_entry(l, i);
+ }
+}
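+
+/*
+ * Split sketch (prefixes assumed): a leaf with lh_prefix 0b101 and
+ * lh_prefix_len 3 becomes prefix 0b1010 (this leaf) and 0b1011
+ * (nl), both with lh_prefix_len 4.  An entry moves to nl when the
+ * bit tested above (1ULL << (64 - 1 - 3), the fourth bit from the
+ * most significant end) is set in its hash.
+ */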
+
+void
+zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
+{
+ int i, n;
+
+ n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
+ l->l_phys->l_hdr.lh_prefix_len;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_leafs_with_2n_pointers[n]++;
+
+ n = l->l_phys->l_hdr.lh_nentries/5;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_with_n5_entries[n]++;
+
+ n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
+ l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
+ (1<<FZAP_BLOCK_SHIFT(zap));
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_n_tenths_full[n]++;
+
+ for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
+ int nentries = 0;
+ int chunk = l->l_phys->l_hash[i];
+
+ while (chunk != CHAIN_END) {
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(l, chunk);
+
+ n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) +
+ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length *
+ le->le_int_size);
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_entries_using_n_chunks[n]++;
+
+ chunk = le->le_next;
+ nentries++;
+ }
+
+ n = nentries;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_buckets_with_n_entries[n]++;
+ }
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
new file mode 100644
index 0000000..9b7e23c
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
@@ -0,0 +1,855 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/refcount.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/avl.h>
+
+
+static void mzap_upgrade(zap_t *zap, dmu_tx_t *tx);
+
+
+static void
+mzap_byteswap(mzap_phys_t *buf, size_t size)
+{
+ int i, max;
+ buf->mz_block_type = BSWAP_64(buf->mz_block_type);
+ buf->mz_salt = BSWAP_64(buf->mz_salt);
+ max = (size / MZAP_ENT_LEN) - 1;
+ for (i = 0; i < max; i++) {
+ buf->mz_chunk[i].mze_value =
+ BSWAP_64(buf->mz_chunk[i].mze_value);
+ buf->mz_chunk[i].mze_cd =
+ BSWAP_32(buf->mz_chunk[i].mze_cd);
+ }
+}
+
+void
+zap_byteswap(void *buf, size_t size)
+{
+ uint64_t block_type;
+
+ block_type = *(uint64_t *)buf;
+
+ if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
+ /* ASSERT(magic == ZAP_LEAF_MAGIC); */
+ mzap_byteswap(buf, size);
+ } else {
+ fzap_byteswap(buf, size);
+ }
+}
+
+static int
+mze_compare(const void *arg1, const void *arg2)
+{
+ const mzap_ent_t *mze1 = arg1;
+ const mzap_ent_t *mze2 = arg2;
+
+ if (mze1->mze_hash > mze2->mze_hash)
+ return (+1);
+ if (mze1->mze_hash < mze2->mze_hash)
+ return (-1);
+ if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd)
+ return (+1);
+ if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd)
+ return (-1);
+ return (0);
+}
+
+static void
+mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
+{
+ mzap_ent_t *mze;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ ASSERT(mzep->mze_cd < ZAP_MAXCD);
+ ASSERT3U(zap_hash(zap, mzep->mze_name), ==, hash);
+
+ mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
+ mze->mze_chunkid = chunkid;
+ mze->mze_hash = hash;
+ mze->mze_phys = *mzep;
+ avl_add(&zap->zap_m.zap_avl, mze);
+}
+
+static mzap_ent_t *
+mze_find(zap_t *zap, const char *name, uint64_t hash)
+{
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+ avl_index_t idx;
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT3U(zap_hash(zap, name), ==, hash);
+
+ if (strlen(name) >= sizeof (mze_tofind.mze_phys.mze_name))
+ return (NULL);
+
+ mze_tofind.mze_hash = hash;
+ mze_tofind.mze_phys.mze_cd = 0;
+
+ mze = avl_find(avl, &mze_tofind, &idx);
+ if (mze == NULL)
+ mze = avl_nearest(avl, idx, AVL_AFTER);
+ for (; mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ if (strcmp(name, mze->mze_phys.mze_name) == 0)
+ return (mze);
+ }
+ return (NULL);
+}
+
+static uint32_t
+mze_find_unused_cd(zap_t *zap, uint64_t hash)
+{
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+ avl_index_t idx;
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+ uint32_t cd;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ mze_tofind.mze_hash = hash;
+ mze_tofind.mze_phys.mze_cd = 0;
+
+ cd = 0;
+ for (mze = avl_find(avl, &mze_tofind, &idx);
+ mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ if (mze->mze_phys.mze_cd != cd)
+ break;
+ cd++;
+ }
+
+ return (cd);
+}
+
+static void
+mze_remove(zap_t *zap, mzap_ent_t *mze)
+{
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ avl_remove(&zap->zap_m.zap_avl, mze);
+ kmem_free(mze, sizeof (mzap_ent_t));
+}
+
+static void
+mze_destroy(zap_t *zap)
+{
+ mzap_ent_t *mze;
+ void *avlcookie = NULL;
+
+	while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl,
+	    &avlcookie)) != NULL)
+ kmem_free(mze, sizeof (mzap_ent_t));
+ avl_destroy(&zap->zap_m.zap_avl);
+}
+
+static zap_t *
+mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
+{
+ zap_t *winner;
+ zap_t *zap;
+ int i;
+
+ ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
+
+ zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
+ rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, 0);
+ rw_enter(&zap->zap_rwlock, RW_WRITER);
+ zap->zap_objset = os;
+ zap->zap_object = obj;
+ zap->zap_dbuf = db;
+
+ if (((uint64_t *)db->db_data)[0] != ZBT_MICRO) {
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL,
+ MUTEX_DEFAULT, 0);
+ zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
+ } else {
+ zap->zap_ismicro = TRUE;
+ }
+
+ /*
+ * Make sure that zap_ismicro is set before we let others see
+ * it, because zap_lockdir() checks zap_ismicro without the lock
+ * held.
+ */
+ winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);
+
+ if (winner != NULL) {
+ if (!zap->zap_ismicro)
+ mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+ kmem_free(zap, sizeof (zap_t));
+ return (winner);
+ }
+
+ if (zap->zap_ismicro) {
+ zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
+ zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
+ avl_create(&zap->zap_m.zap_avl, mze_compare,
+ sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
+
+ for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze =
+ &zap->zap_m.zap_phys->mz_chunk[i];
+ if (mze->mze_name[0]) {
+ zap->zap_m.zap_num_entries++;
+ mze_insert(zap, i,
+ zap_hash(zap, mze->mze_name), mze);
+ }
+ }
+ } else {
+ zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
+
+ ASSERT3U(sizeof (struct zap_leaf_header), ==,
+ 2*ZAP_LEAF_CHUNKSIZE);
+
+ /*
+ * The embedded pointer table should not overlap the
+ * other members.
+ */
+ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
+ &zap->zap_f.zap_phys->zap_salt);
+
+ /*
+ * The embedded pointer table should end at the end of
+ * the block
+ */
+ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
+ 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
+ (uintptr_t)zap->zap_f.zap_phys, ==,
+ zap->zap_dbuf->db_size);
+ }
+ rw_exit(&zap->zap_rwlock);
+ return (zap);
+}
+
+int
+zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+ krw_t lti, int fatreader, zap_t **zapp)
+{
+ zap_t *zap;
+ dmu_buf_t *db;
+ krw_t lt;
+ int err;
+
+ *zapp = NULL;
+
+ err = dmu_buf_hold(os, obj, 0, NULL, &db);
+ if (err)
+ return (err);
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+ }
+#endif
+
+ zap = dmu_buf_get_user(db);
+ if (zap == NULL)
+ zap = mzap_open(os, obj, db);
+
+ /*
+ * We're checking zap_ismicro without the lock held, in order to
+ * tell what type of lock we want. Once we have some sort of
+ * lock, see if it really is the right type. In practice this
+ * can only be different if it was upgraded from micro to fat,
+ * and micro wanted WRITER but fat only needs READER.
+ */
+ lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
+ rw_enter(&zap->zap_rwlock, lt);
+ if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
+ /* it was upgraded, now we only need reader */
+ ASSERT(lt == RW_WRITER);
+		ASSERT(RW_READER ==
+		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
+ rw_downgrade(&zap->zap_rwlock);
+ lt = RW_READER;
+ }
+
+ zap->zap_objset = os;
+
+ if (lt == RW_WRITER)
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3P(zap->zap_dbuf, ==, db);
+
+ ASSERT(!zap->zap_ismicro ||
+ zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
+ if (zap->zap_ismicro && tx &&
+ zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
+ uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
+ if (newsz > MZAP_MAX_BLKSZ) {
+ dprintf("upgrading obj %llu: num_entries=%u\n",
+ obj, zap->zap_m.zap_num_entries);
+ mzap_upgrade(zap, tx);
+ *zapp = zap;
+ return (0);
+ }
+ err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
+ ASSERT3U(err, ==, 0);
+ zap->zap_m.zap_num_chunks =
+ db->db_size / MZAP_ENT_LEN - 1;
+ }
+
+ *zapp = zap;
+ return (0);
+}
+
+void
+zap_unlockdir(zap_t *zap)
+{
+ rw_exit(&zap->zap_rwlock);
+ dmu_buf_rele(zap->zap_dbuf, NULL);
+}
+
+static void
+mzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+{
+ mzap_phys_t *mzp;
+ int i, sz, nchunks, err;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ sz = zap->zap_dbuf->db_size;
+ mzp = kmem_alloc(sz, KM_SLEEP);
+ bcopy(zap->zap_dbuf->db_data, mzp, sz);
+ nchunks = zap->zap_m.zap_num_chunks;
+
+ err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
+ 1ULL << fzap_default_block_shift, 0, tx);
+ ASSERT(err == 0);
+
+ dprintf("upgrading obj=%llu with %u chunks\n",
+ zap->zap_object, nchunks);
+ mze_destroy(zap);
+
+ fzap_upgrade(zap, tx);
+
+ for (i = 0; i < nchunks; i++) {
+ int err;
+ mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
+ if (mze->mze_name[0] == 0)
+ continue;
+ dprintf("adding %s=%llu\n",
+ mze->mze_name, mze->mze_value);
+ err = fzap_add_cd(zap,
+ mze->mze_name, 8, 1, &mze->mze_value,
+ mze->mze_cd, tx);
+ ASSERT3U(err, ==, 0);
+ }
+ kmem_free(mzp, sz);
+}
+
+uint64_t
+zap_hash(zap_t *zap, const char *name)
+{
+ const uint8_t *cp;
+ uint8_t c;
+ uint64_t crc = zap->zap_salt;
+
+ ASSERT(crc != 0);
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+ for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
+
+ /*
+ * Only use 28 bits, since we need 4 bits in the cookie for the
+ * collision differentiator. We MUST use the high bits, since
+	 * those are the ones that we first pay attention to when
+	 * choosing the bucket.
+ */
+ crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+
+ return (crc);
+}
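+
+/*
+ * Sketch of the fold above for the name "on" (salt value assumed):
+ *
+ *	crc = zap->zap_salt;
+ *	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ 'o') & 0xFF];
+ *	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ 'n') & 0xFF];
+ *	crc &= ~((1ULL << 36) - 1);	(64 - ZAP_HASHBITS == 36)
+ *
+ * Because the salt seeds the CRC, equal names hash differently in
+ * different zap objects.
+ */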
+
+
+static void
+mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ mzap_phys_t *zp;
+
+ VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db));
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+ }
+#endif
+
+ dmu_buf_will_dirty(db, tx);
+ zp = db->db_data;
+ zp->mz_block_type = ZBT_MICRO;
+ zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
+ ASSERT(zp->mz_salt != 0);
+ dmu_buf_rele(db, FTAG);
+}
+
+int
+zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ int err;
+
+ err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
+ if (err != 0)
+ return (err);
+ mzap_create_impl(os, obj, tx);
+ return (0);
+}
+
+uint64_t
+zap_create(objset_t *os, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+
+ mzap_create_impl(os, obj, tx);
+ return (obj);
+}
+
+int
+zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
+{
+ /*
+ * dmu_object_free will free the object number and free the
+ * data. Freeing the data will cause our pageout function to be
+ * called, which will destroy our data (zap_leaf_t's and zap_t).
+ */
+
+ return (dmu_object_free(os, zapobj, tx));
+}
+
+_NOTE(ARGSUSED(0))
+void
+zap_evict(dmu_buf_t *db, void *vzap)
+{
+ zap_t *zap = vzap;
+
+ rw_destroy(&zap->zap_rwlock);
+
+ if (zap->zap_ismicro)
+ mze_destroy(zap);
+ else
+ mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+
+ kmem_free(zap, sizeof (zap_t));
+}
+
+int
+zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_count(zap, count);
+ } else {
+ *count = zap->zap_m.zap_num_entries;
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+
+int
+zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_lookup(zap, name,
+ integer_size, num_integers, buf);
+ } else {
+ mze = mze_find(zap, name, zap_hash(zap, name));
+ if (mze == NULL) {
+ err = ENOENT;
+ } else {
+ if (num_integers < 1)
+ err = EOVERFLOW;
+ else if (integer_size != 8)
+ err = EINVAL;
+ else
+ *(uint64_t *)buf = mze->mze_phys.mze_value;
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+int
+zap_length(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_length(zap, name, integer_size, num_integers);
+ } else {
+ mze = mze_find(zap, name, zap_hash(zap, name));
+ if (mze == NULL) {
+ err = ENOENT;
+ } else {
+ if (integer_size)
+ *integer_size = 8;
+ if (num_integers)
+ *num_integers = 1;
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+static void
+mzap_addent(zap_t *zap, const char *name, uint64_t hash, uint64_t value)
+{
+ int i;
+ int start = zap->zap_m.zap_alloc_next;
+ uint32_t cd;
+
+ dprintf("obj=%llu %s=%llu\n", zap->zap_object, name, value);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+#ifdef ZFS_DEBUG
+ for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+ ASSERT(strcmp(name, mze->mze_name) != 0);
+ }
+#endif
+
+ cd = mze_find_unused_cd(zap, hash);
+ /* given the limited size of the microzap, this can't happen */
+ ASSERT(cd != ZAP_MAXCD);
+
+again:
+ for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+ if (mze->mze_name[0] == 0) {
+ mze->mze_value = value;
+ mze->mze_cd = cd;
+ (void) strcpy(mze->mze_name, name);
+ zap->zap_m.zap_num_entries++;
+ zap->zap_m.zap_alloc_next = i+1;
+ if (zap->zap_m.zap_alloc_next ==
+ zap->zap_m.zap_num_chunks)
+ zap->zap_m.zap_alloc_next = 0;
+ mze_insert(zap, i, hash, mze);
+ return;
+ }
+ }
+ if (start != 0) {
+ start = 0;
+ goto again;
+ }
+ ASSERT(!"out of entries!");
+}
+
+int
+zap_add(objset_t *os, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+ const uint64_t *intval = val;
+ uint64_t hash;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_add(zap, name, integer_size, num_integers, val, tx);
+ } else if (integer_size != 8 || num_integers != 1 ||
+ strlen(name) >= MZAP_NAME_LEN) {
+ dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+ zapobj, integer_size, num_integers, name);
+ mzap_upgrade(zap, tx);
+ err = fzap_add(zap, name, integer_size, num_integers, val, tx);
+ } else {
+ hash = zap_hash(zap, name);
+ mze = mze_find(zap, name, hash);
+ if (mze != NULL) {
+ err = EEXIST;
+ } else {
+ mzap_addent(zap, name, hash, *intval);
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+int
+zap_update(objset_t *os, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ mzap_ent_t *mze;
+ const uint64_t *intval = val;
+ uint64_t hash;
+ int err;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
+ if (err)
+ return (err);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ if (!zap->zap_ismicro) {
+ err = fzap_update(zap, name,
+ integer_size, num_integers, val, tx);
+ } else if (integer_size != 8 || num_integers != 1 ||
+ strlen(name) >= MZAP_NAME_LEN) {
+ dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+ zapobj, integer_size, num_integers, name);
+ mzap_upgrade(zap, tx);
+ err = fzap_update(zap, name,
+ integer_size, num_integers, val, tx);
+ } else {
+ hash = zap_hash(zap, name);
+ mze = mze_find(zap, name, hash);
+ if (mze != NULL) {
+ mze->mze_phys.mze_value = *intval;
+ zap->zap_m.zap_phys->mz_chunk
+ [mze->mze_chunkid].mze_value = *intval;
+ } else {
+ mzap_addent(zap, name, hash, *intval);
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+int
+zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+ mzap_ent_t *mze;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
+ if (err)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_remove(zap, name, tx);
+ } else {
+ mze = mze_find(zap, name, zap_hash(zap, name));
+ if (mze == NULL) {
+ dprintf("fail: %s\n", name);
+ err = ENOENT;
+ } else {
+ dprintf("success: %s\n", name);
+ zap->zap_m.zap_num_entries--;
+ bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+ sizeof (mzap_ent_phys_t));
+ mze_remove(zap, mze);
+ }
+ }
+ zap_unlockdir(zap);
+ return (err);
+}
+
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+/*
+ * We want to keep the high 32 bits of the cursor zero if we can, so
+ * that 32-bit programs can access this. So use a small hash value so
+ * we can fit 4 bits of cd into the 32-bit cursor.
+ *
+ * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ]
+ */
+void
+zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+ uint64_t serialized)
+{
+ zc->zc_objset = os;
+ zc->zc_zap = NULL;
+ zc->zc_leaf = NULL;
+ zc->zc_zapobj = zapobj;
+ if (serialized == -1ULL) {
+ zc->zc_hash = -1ULL;
+ zc->zc_cd = 0;
+ } else {
+ zc->zc_hash = serialized << (64-ZAP_HASHBITS);
+ zc->zc_cd = serialized >> ZAP_HASHBITS;
+ if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */
+ zc->zc_cd = 0;
+ }
+}
+
+void
+zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+ zap_cursor_init_serialized(zc, os, zapobj, 0);
+}
+
+void
+zap_cursor_fini(zap_cursor_t *zc)
+{
+ if (zc->zc_zap) {
+ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+ zap_unlockdir(zc->zc_zap);
+ zc->zc_zap = NULL;
+ }
+ if (zc->zc_leaf) {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ }
+ zc->zc_objset = NULL;
+}
+
+uint64_t
+zap_cursor_serialize(zap_cursor_t *zc)
+{
+ if (zc->zc_hash == -1ULL)
+ return (-1ULL);
+ ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
+ ASSERT(zc->zc_cd < ZAP_MAXCD);
+ return ((zc->zc_hash >> (64-ZAP_HASHBITS)) |
+ ((uint64_t)zc->zc_cd << ZAP_HASHBITS));
+}
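+
+/*
+ * Round-trip sketch (values assumed): with ZAP_HASHBITS == 28, a
+ * cursor at zc_hash == 0xabcdef0ULL << 36 and zc_cd == 3 serializes
+ * to 0xabcdef0 | (3 << 28); zap_cursor_init_serialized() shifts the
+ * low 28 bits back into the top of zc_hash and recovers zc_cd from
+ * the bits above them.
+ */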
+
+int
+zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
+{
+ int err;
+ avl_index_t idx;
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+
+ if (zc->zc_hash == -1ULL)
+ return (ENOENT);
+
+ if (zc->zc_zap == NULL) {
+ err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+ RW_READER, TRUE, &zc->zc_zap);
+ if (err)
+ return (err);
+ } else {
+ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+ }
+ if (!zc->zc_zap->zap_ismicro) {
+ err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
+ } else {
+ err = ENOENT;
+
+ mze_tofind.mze_hash = zc->zc_hash;
+ mze_tofind.mze_phys.mze_cd = zc->zc_cd;
+
+ mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
+ ASSERT(mze == NULL || 0 == bcmp(&mze->mze_phys,
+ &zc->zc_zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+ sizeof (mze->mze_phys)));
+ if (mze == NULL) {
+ mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
+ idx, AVL_AFTER);
+ }
+ if (mze) {
+ za->za_integer_length = 8;
+ za->za_num_integers = 1;
+ za->za_first_integer = mze->mze_phys.mze_value;
+ (void) strcpy(za->za_name, mze->mze_phys.mze_name);
+ zc->zc_hash = mze->mze_hash;
+ zc->zc_cd = mze->mze_phys.mze_cd;
+ err = 0;
+ } else {
+ zc->zc_hash = -1ULL;
+ }
+ }
+ rw_exit(&zc->zc_zap->zap_rwlock);
+ return (err);
+}
+
+void
+zap_cursor_advance(zap_cursor_t *zc)
+{
+ if (zc->zc_hash == -1ULL)
+ return;
+ zc->zc_cd++;
+ if (zc->zc_cd >= ZAP_MAXCD) {
+ zc->zc_cd = 0;
+ zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS);
+ if (zc->zc_hash == 0) /* EOF */
+ zc->zc_hash = -1ULL;
+ }
+}
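
[Editorial note: taken together, init/retrieve/advance/fini form the standard iteration idiom over a ZAP object. A minimal sketch using only the routines defined above; example_walk_zap is an illustrative name.]

static void
example_walk_zap(objset_t *os, uint64_t zapobj)
{
	zap_cursor_t zc;
	zap_attribute_t za;

	for (zap_cursor_init(&zc, os, zapobj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		/* za_name and za_first_integer are filled in by retrieve */
		dprintf("%s = %llu\n", za.za_name,
		    (u_longlong_t)za.za_first_integer);
	}
	zap_cursor_fini(&zc);
}
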
+
+int
+zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
+{
+ int err;
+ zap_t *zap;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+ if (err)
+ return (err);
+
+ bzero(zs, sizeof (zap_stats_t));
+
+ if (zap->zap_ismicro) {
+ zs->zs_blocksize = zap->zap_dbuf->db_size;
+ zs->zs_num_entries = zap->zap_m.zap_num_entries;
+ zs->zs_num_blocks = 1;
+ } else {
+ fzap_get_stats(zap, zs);
+ }
+ zap_unlockdir(zap);
+ return (0);
+}
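
[Editorial note: zap_get_stats() gives callers a quick way to tell a micro zap from a fat zap. A hedged usage sketch; example_zap_report is an illustrative name.]

static void
example_zap_report(objset_t *os, uint64_t zapobj)
{
	zap_stats_t zs;

	if (zap_get_stats(os, zapobj, &zs) != 0)
		return;
	/* a micro zap always reports exactly one block */
	dprintf("%llu entries in %llu block(s), blocksize %llu\n",
	    (u_longlong_t)zs.zs_num_entries,
	    (u_longlong_t)zs.zs_num_blocks,
	    (u_longlong_t)zs.zs_blocksize);
}
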
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf
new file mode 100644
index 0000000..0988190
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs.conf
@@ -0,0 +1,28 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+name="zfs" parent="pseudo";
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
new file mode 100644
index 0000000..030424b
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
@@ -0,0 +1,1607 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <acl/acl_common.h>
+
+#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
+#define DENY ACE_ACCESS_DENIED_ACE_TYPE
+
+#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
+#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
+#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define WRITE_MASK (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS| \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE)
+
+#define SECURE_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define OGE_PAD 6 /* traditional owner/group/everyone ACES */
+
+static int zfs_ace_can_use(znode_t *zp, ace_t *);
+
+static zfs_acl_t *
+zfs_acl_alloc(int slots)
+{
+ zfs_acl_t *aclp;
+
+ aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
+ if (slots != 0) {
+ aclp->z_acl = kmem_alloc(ZFS_ACL_SIZE(slots), KM_SLEEP);
+ aclp->z_acl_count = 0;
+ aclp->z_state = ACL_DATA_ALLOCED;
+ } else {
+ aclp->z_state = 0;
+ }
+ aclp->z_slots = slots;
+ return (aclp);
+}
+
+void
+zfs_acl_free(zfs_acl_t *aclp)
+{
+ if (aclp->z_state == ACL_DATA_ALLOCED) {
+ kmem_free(aclp->z_acl, ZFS_ACL_SIZE(aclp->z_slots));
+ }
+ kmem_free(aclp, sizeof (zfs_acl_t));
+}
+
+static uint32_t
+zfs_v4_to_unix(uint32_t access_mask)
+{
+ uint32_t new_mask = 0;
+
+ /*
+ * This is used for mapping v4 permissions into permissions
+ * that can be passed to secpolicy_vnode_access()
+ */
+ if (access_mask & (ACE_READ_DATA | ACE_LIST_DIRECTORY |
+ ACE_READ_ATTRIBUTES | ACE_READ_ACL))
+ new_mask |= S_IROTH;
+ if (access_mask & (ACE_WRITE_DATA | ACE_APPEND_DATA |
+ ACE_WRITE_ATTRIBUTES | ACE_ADD_FILE | ACE_WRITE_NAMED_ATTRS))
+ new_mask |= S_IWOTH;
+ if (access_mask & (ACE_EXECUTE | ACE_READ_NAMED_ATTRS))
+ new_mask |= S_IXOTH;
+
+ return (new_mask);
+}
+
+/*
+ * Convert unix access mask to v4 access mask
+ */
+static uint32_t
+zfs_unix_to_v4(uint32_t access_mask)
+{
+ uint32_t new_mask = 0;
+
+ if (access_mask & 01)
+ new_mask |= (ACE_EXECUTE);
+	if (access_mask & 02)
+		new_mask |= ACE_WRITE_DATA;
+	if (access_mask & 04)
+		new_mask |= ACE_READ_DATA;
+ return (new_mask);
+}
+
+static void
+zfs_set_ace(ace_t *zacep, uint32_t access_mask, int access_type,
+ uid_t uid, int entry_type)
+{
+ zacep->a_access_mask = access_mask;
+ zacep->a_type = access_type;
+ zacep->a_who = uid;
+ zacep->a_flags = entry_type;
+}
+
+static uint64_t
+zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
+{
+ int i;
+ int entry_type;
+ mode_t mode = (zp->z_phys->zp_mode &
+ (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+ mode_t seen = 0;
+ ace_t *acep;
+
+ for (i = 0, acep = aclp->z_acl;
+ i != aclp->z_acl_count; i++, acep++) {
+ entry_type = (acep->a_flags & ACE_TYPE_FLAGS);
+ if (entry_type == ACE_OWNER) {
+ if ((acep->a_access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRUSR))) {
+ seen |= S_IRUSR;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWUSR))) {
+ seen |= S_IWUSR;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if ((acep->a_access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXUSR))) {
+ seen |= S_IXUSR;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ } else if (entry_type == OWNING_GROUP) {
+ if ((acep->a_access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRGRP))) {
+ seen |= S_IRGRP;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWGRP))) {
+ seen |= S_IWGRP;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if ((acep->a_access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXGRP))) {
+ seen |= S_IXGRP;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ } else if (entry_type == ACE_EVERYONE) {
+ if ((acep->a_access_mask & ACE_READ_DATA)) {
+ if (!(seen & S_IRUSR)) {
+ seen |= S_IRUSR;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if (!(seen & S_IRGRP)) {
+ seen |= S_IRGRP;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if (!(seen & S_IROTH)) {
+ seen |= S_IROTH;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IROTH;
+ }
+ }
+ }
+ if ((acep->a_access_mask & ACE_WRITE_DATA)) {
+ if (!(seen & S_IWUSR)) {
+ seen |= S_IWUSR;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if (!(seen & S_IWGRP)) {
+ seen |= S_IWGRP;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if (!(seen & S_IWOTH)) {
+ seen |= S_IWOTH;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IWOTH;
+ }
+ }
+ }
+ if ((acep->a_access_mask & ACE_EXECUTE)) {
+ if (!(seen & S_IXUSR)) {
+ seen |= S_IXUSR;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ if (!(seen & S_IXGRP)) {
+ seen |= S_IXGRP;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ if (!(seen & S_IXOTH)) {
+ seen |= S_IXOTH;
+ if (acep->a_type == ALLOW) {
+ mode |= S_IXOTH;
+ }
+ }
+ }
+ }
+ }
+ return (mode);
+}
+
+static zfs_acl_t *
+zfs_acl_node_read_internal(znode_t *zp)
+{
+ zfs_acl_t *aclp;
+
+ aclp = zfs_acl_alloc(0);
+ aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
+ aclp->z_acl = &zp->z_phys->zp_acl.z_ace_data[0];
+
+ return (aclp);
+}
+
+/*
+ * Read an external acl object.
+ */
+static int
+zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp)
+{
+ uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj;
+ zfs_acl_t *aclp;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
+ *aclpp = zfs_acl_node_read_internal(zp);
+ return (0);
+ }
+
+ aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_count);
+
+ error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
+ ZFS_ACL_SIZE(zp->z_phys->zp_acl.z_acl_count), aclp->z_acl);
+ if (error != 0) {
+ zfs_acl_free(aclp);
+ return (error);
+ }
+
+ aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
+
+ *aclpp = aclp;
+ return (0);
+}
+
+static boolean_t
+zfs_acl_valid(znode_t *zp, ace_t *uace, int aclcnt, int *inherit)
+{
+ ace_t *acep;
+ int i;
+
+ *inherit = 0;
+
+ if (aclcnt > MAX_ACL_ENTRIES || aclcnt <= 0) {
+ return (B_FALSE);
+ }
+
+ for (i = 0, acep = uace; i != aclcnt; i++, acep++) {
+
+ /*
+ * first check type of entry
+ */
+
+ switch (acep->a_flags & ACE_TYPE_FLAGS) {
+ case ACE_OWNER:
+ acep->a_who = -1;
+ break;
+ case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+ case ACE_IDENTIFIER_GROUP:
+ if (acep->a_flags & ACE_GROUP) {
+ acep->a_who = -1;
+ }
+ break;
+ case ACE_EVERYONE:
+ acep->a_who = -1;
+ break;
+ }
+
+ /*
+ * next check inheritance level flags
+ */
+
+ if (acep->a_type != ALLOW && acep->a_type != DENY)
+ return (B_FALSE);
+
+ /*
+ * Only directories should have inheritance flags.
+ */
+ if (ZTOV(zp)->v_type != VDIR && (acep->a_flags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE|
+ ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE))) {
+ return (B_FALSE);
+ }
+
+ if (acep->a_flags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))
+ *inherit = 1;
+
+ if (acep->a_flags &
+ (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ if ((acep->a_flags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE)) == 0) {
+ return (B_FALSE);
+ }
+ }
+ }
+
+ return (B_TRUE);
+}
+/*
+ * Common code for setting ACLs.
+ *
+ * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
+ * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's
+ * already checked the acl and knows whether to inherit.
+ */
+int
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, dmu_tx_t *tx, int *ihp)
+{
+ int inherit = 0;
+ int error;
+ znode_phys_t *zphys = zp->z_phys;
+ zfs_znode_acl_t *zacl = &zphys->zp_acl;
+ uint32_t acl_phys_size = ZFS_ACL_SIZE(aclp->z_acl_count);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t aoid = zphys->zp_acl.z_acl_extern_obj;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ if (ihp)
+ inherit = *ihp; /* already determined by caller */
+ else if (!zfs_acl_valid(zp, aclp->z_acl,
+ aclp->z_acl_count, &inherit)) {
+ return (EINVAL);
+ }
+
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+ /*
+ * Will ACL fit internally?
+ */
+ if (aclp->z_acl_count > ACE_SLOT_CNT) {
+ if (aoid == 0) {
+ aoid = dmu_object_alloc(zfsvfs->z_os,
+ DMU_OT_ACL, acl_phys_size, DMU_OT_NONE, 0, tx);
+ } else {
+ (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid,
+ acl_phys_size, 0, tx);
+ }
+ zphys->zp_acl.z_acl_extern_obj = aoid;
+ zphys->zp_acl.z_acl_count = aclp->z_acl_count;
+ dmu_write(zfsvfs->z_os, aoid, 0,
+ acl_phys_size, aclp->z_acl, tx);
+ } else {
+ /*
+ * Migrating back embedded?
+ */
+ if (zphys->zp_acl.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ zp->z_phys->zp_acl.z_acl_extern_obj, tx);
+ if (error)
+ return (error);
+ zphys->zp_acl.z_acl_extern_obj = 0;
+ }
+ bcopy(aclp->z_acl, zacl->z_ace_data,
+ aclp->z_acl_count * sizeof (ace_t));
+ zacl->z_acl_count = aclp->z_acl_count;
+ }
+
+ zp->z_phys->zp_flags &= ~(ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE);
+ if (inherit) {
+ zp->z_phys->zp_flags |= ZFS_INHERIT_ACE;
+ } else if (ace_trivial(zacl->z_ace_data, zacl->z_acl_count) == 0) {
+ zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL;
+ }
+
+ zphys->zp_mode = zfs_mode_compute(zp, aclp);
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+
+ return (0);
+}
+
+/*
+ * Create space for slots_needed ACEs to be appended
+ * to aclp.
+ */
+static void
+zfs_acl_append(zfs_acl_t *aclp, int slots_needed)
+{
+ ace_t *newacep;
+ ace_t *oldaclp;
+ int slot_cnt;
+ int slots_left = aclp->z_slots - aclp->z_acl_count;
+
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ ASSERT(aclp->z_slots >= aclp->z_acl_count);
+ if (slots_left < slots_needed || aclp->z_state != ACL_DATA_ALLOCED) {
+ slot_cnt = aclp->z_slots + 1 + (slots_needed - slots_left);
+ newacep = kmem_alloc(ZFS_ACL_SIZE(slot_cnt), KM_SLEEP);
+ bcopy(aclp->z_acl, newacep,
+ ZFS_ACL_SIZE(aclp->z_acl_count));
+ oldaclp = aclp->z_acl;
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ kmem_free(oldaclp, ZFS_ACL_SIZE(aclp->z_slots));
+ aclp->z_acl = newacep;
+ aclp->z_slots = slot_cnt;
+ aclp->z_state = ACL_DATA_ALLOCED;
+ }
+}
+
+/*
+ * Remove "slot" ACE from aclp
+ */
+static void
+zfs_ace_remove(zfs_acl_t *aclp, int slot)
+{
+ if (aclp->z_acl_count > 1) {
+ (void) memmove(&aclp->z_acl[slot],
+ &aclp->z_acl[slot +1], sizeof (ace_t) *
+ (--aclp->z_acl_count - slot));
+ } else
+ aclp->z_acl_count--;
+}
+
+/*
+ * Update access mask for prepended ACE
+ *
+ * This applies the "groupmask" value of the aclmode property.
+ */
+static void
+zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner)
+{
+
+ int rmask, wmask, xmask;
+ int user_ace;
+
+ user_ace = (!(acep->a_flags &
+ (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP)));
+
+ if (user_ace && (acep->a_who == owner)) {
+ rmask = S_IRUSR;
+ wmask = S_IWUSR;
+ xmask = S_IXUSR;
+ } else {
+ rmask = S_IRGRP;
+ wmask = S_IWGRP;
+ xmask = S_IXGRP;
+ }
+
+ if (origacep->a_access_mask & ACE_READ_DATA) {
+ if (mode & rmask)
+ acep->a_access_mask &= ~ACE_READ_DATA;
+ else
+ acep->a_access_mask |= ACE_READ_DATA;
+ }
+
+ if (origacep->a_access_mask & ACE_WRITE_DATA) {
+ if (mode & wmask)
+ acep->a_access_mask &= ~ACE_WRITE_DATA;
+ else
+ acep->a_access_mask |= ACE_WRITE_DATA;
+ }
+
+ if (origacep->a_access_mask & ACE_APPEND_DATA) {
+ if (mode & wmask)
+ acep->a_access_mask &= ~ACE_APPEND_DATA;
+ else
+ acep->a_access_mask |= ACE_APPEND_DATA;
+ }
+
+ if (origacep->a_access_mask & ACE_EXECUTE) {
+ if (mode & xmask)
+ acep->a_access_mask &= ~ACE_EXECUTE;
+ else
+ acep->a_access_mask |= ACE_EXECUTE;
+ }
+}
+
+/*
+ * Apply mode to canonical six ACEs.
+ */
+static void
+zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode)
+{
+ int cnt;
+ ace_t *acep;
+
+ cnt = aclp->z_acl_count -1;
+ acep = aclp->z_acl;
+
+ /*
+ * Fixup final ACEs to match the mode
+ */
+
+ ASSERT(cnt >= 5);
+ adjust_ace_pair(&acep[cnt - 1], mode); /* everyone@ */
+ adjust_ace_pair(&acep[cnt - 3], (mode & 0070) >> 3); /* group@ */
+ adjust_ace_pair(&acep[cnt - 5], (mode & 0700) >> 6); /* owner@ */
+}
+
+
+static int
+zfs_acl_ace_match(ace_t *acep, int allow_deny, int type, int mask)
+{
+ return (acep->a_access_mask == mask && acep->a_type == allow_deny &&
+ ((acep->a_flags & ACE_TYPE_FLAGS) == type));
+}
+
+/*
+ * Can prepended ACE be reused?
+ */
+static int
+zfs_reuse_deny(ace_t *acep, int i)
+{
+ int okay_masks;
+
+ if (i < 1)
+ return (B_FALSE);
+
+ if (acep[i-1].a_type != DENY)
+ return (B_FALSE);
+
+ if (acep[i-1].a_flags != (acep[i].a_flags & ACE_IDENTIFIER_GROUP))
+ return (B_FALSE);
+
+ okay_masks = (acep[i].a_access_mask & OKAY_MASK_BITS);
+
+ if (acep[i-1].a_access_mask & ~okay_masks)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * Create space to prepend an ACE
+ */
+static void
+zfs_acl_prepend(zfs_acl_t *aclp, int i)
+{
+ ace_t *oldaclp = NULL;
+ ace_t *to, *from;
+ int slots_left = aclp->z_slots - aclp->z_acl_count;
+ int oldslots;
+ int need_free = 0;
+
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ ASSERT(aclp->z_slots >= aclp->z_acl_count);
+
+ if (slots_left == 0 || aclp->z_state != ACL_DATA_ALLOCED) {
+
+ to = kmem_alloc(ZFS_ACL_SIZE(aclp->z_acl_count +
+ OGE_PAD), KM_SLEEP);
+ if (aclp->z_state == ACL_DATA_ALLOCED)
+ need_free++;
+ from = aclp->z_acl;
+ oldaclp = aclp->z_acl;
+ (void) memmove(to, from,
+ sizeof (ace_t) * aclp->z_acl_count);
+ aclp->z_state = ACL_DATA_ALLOCED;
+ } else {
+ from = aclp->z_acl;
+ to = aclp->z_acl;
+ }
+
+
+ (void) memmove(&to[i + 1], &from[i],
+ sizeof (ace_t) * (aclp->z_acl_count - i));
+
+ if (oldaclp) {
+ aclp->z_acl = to;
+ oldslots = aclp->z_slots;
+ aclp->z_slots = aclp->z_acl_count + OGE_PAD;
+ if (need_free)
+ kmem_free(oldaclp, ZFS_ACL_SIZE(oldslots));
+ }
+
+}
+
+/*
+ * Prepend deny ACE
+ */
+static void
+zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, int i,
+ mode_t mode)
+{
+ ace_t *acep;
+
+ zfs_acl_prepend(aclp, i);
+
+ acep = aclp->z_acl;
+ zfs_set_ace(&acep[i], 0, DENY, acep[i + 1].a_who,
+ (acep[i + 1].a_flags & ACE_TYPE_FLAGS));
+ zfs_acl_prepend_fixup(&acep[i], &acep[i+1], mode, zp->z_phys->zp_uid);
+ aclp->z_acl_count++;
+}
+
+/*
+ * Split an inherited ACE into inherit_only ACE
+ * and original ACE with inheritance flags stripped off.
+ */
+static void
+zfs_acl_split_ace(zfs_acl_t *aclp, int i)
+{
+ ace_t *acep = aclp->z_acl;
+
+ zfs_acl_prepend(aclp, i);
+ acep = aclp->z_acl;
+ acep[i] = acep[i + 1];
+ acep[i].a_flags |= ACE_INHERIT_ONLY_ACE;
+ acep[i + 1].a_flags &= ~ALL_INHERIT;
+ aclp->z_acl_count++;
+}
+
+/*
+ * Are the ACEs starting at index i the canonical six ACEs?
+ */
+static int
+zfs_have_canonical_six(zfs_acl_t *aclp, int i)
+{
+ ace_t *acep = aclp->z_acl;
+
+ if ((zfs_acl_ace_match(&acep[i],
+ DENY, ACE_OWNER, 0) &&
+ zfs_acl_ace_match(&acep[i + 1], ALLOW, ACE_OWNER,
+ OWNER_ALLOW_MASK) && zfs_acl_ace_match(&acep[i + 2],
+ DENY, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 3],
+ ALLOW, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 4],
+ DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) &&
+ zfs_acl_ace_match(&acep[i + 5], ALLOW, ACE_EVERYONE,
+ EVERYONE_ALLOW_MASK))) {
+ return (1);
+ } else {
+ return (0);
+ }
+}
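
[Editorial note: the matcher above checks six entries in a fixed order. Spelled out, derived from the zfs_set_ace() calls in zfs_acl_chmod() below, the expected layout is:]

/*
 * Illustration of the canonical six, in the order
 * zfs_have_canonical_six() expects them at index i:
 *
 *	acep[i + 0]	DENY	owner@		0
 *	acep[i + 1]	ALLOW	owner@		OWNER_ALLOW_MASK
 *	acep[i + 2]	DENY	group@		0
 *	acep[i + 3]	ALLOW	group@		0
 *	acep[i + 4]	DENY	everyone@	EVERYONE_DENY_MASK
 *	acep[i + 5]	ALLOW	everyone@	EVERYONE_ALLOW_MASK
 */
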
+
+/*
+ * Apply step 1g to group entries.
+ *
+ * We need to deal with the corner case where the group may have
+ * greater permissions than the owner.  If so, limit the group
+ * permissions based on what extra permissions the group has.
+ */
+static void
+zfs_fixup_group_entries(ace_t *acep, mode_t mode)
+{
+ mode_t extramode = (mode >> 3) & 07;
+ mode_t ownermode = (mode >> 6);
+
+ if (acep[0].a_flags & ACE_IDENTIFIER_GROUP) {
+
+ extramode &= ~ownermode;
+
+ if (extramode) {
+ if (extramode & 04) {
+ acep[0].a_access_mask &= ~ACE_READ_DATA;
+ acep[1].a_access_mask &= ~ACE_READ_DATA;
+ }
+ if (extramode & 02) {
+ acep[0].a_access_mask &=
+ ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ acep[1].a_access_mask &=
+ ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ }
+ if (extramode & 01) {
+ acep[0].a_access_mask &= ~ACE_EXECUTE;
+ acep[1].a_access_mask &= ~ACE_EXECUTE;
+ }
+ }
+ }
+}
+
+/*
+ * Apply the chmod algorithm as described
+ * in PSARC/2002/240
+ */
+static int
+zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp,
+ dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ ace_t *acep;
+ int i;
+ int error;
+ int entry_type;
+ int reuse_deny;
+ int need_canonical_six = 1;
+ int inherit = 0;
+ int iflags;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+
+ i = 0;
+ while (i < aclp->z_acl_count) {
+ acep = aclp->z_acl;
+ entry_type = (acep[i].a_flags & ACE_TYPE_FLAGS);
+ iflags = (acep[i].a_flags & ALL_INHERIT);
+
+ if ((acep[i].a_type != ALLOW && acep[i].a_type != DENY) ||
+ (iflags & ACE_INHERIT_ONLY_ACE)) {
+ i++;
+ if (iflags)
+ inherit = 1;
+ continue;
+ }
+
+
+ if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) {
+ zfs_ace_remove(aclp, i);
+ continue;
+ }
+
+ /*
+ * Need to split ace into two?
+ */
+ if ((iflags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE)) &&
+ (!(iflags & ACE_INHERIT_ONLY_ACE))) {
+ zfs_acl_split_ace(aclp, i);
+ i++;
+ inherit = 1;
+ continue;
+ }
+
+ if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
+ (entry_type == OWNING_GROUP)) {
+ acep[i].a_access_mask &= ~OGE_CLEAR;
+ i++;
+ continue;
+
+ } else {
+ if (acep[i].a_type == ALLOW) {
+
+ /*
+ * Check preceding ACE if any, to see
+ * if we need to prepend a DENY ACE.
+ * This is only applicable when the acl_mode
+ * property == groupmask.
+ */
+ if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) {
+
+ reuse_deny = zfs_reuse_deny(acep, i);
+
+ if (reuse_deny == B_FALSE) {
+ zfs_acl_prepend_deny(zp, aclp,
+ i, mode);
+ i++;
+ acep = aclp->z_acl;
+ } else {
+ zfs_acl_prepend_fixup(
+ &acep[i - 1],
+ &acep[i], mode,
+ zp->z_phys->zp_uid);
+ }
+ zfs_fixup_group_entries(&acep[i - 1],
+ mode);
+ }
+ }
+ i++;
+ }
+ }
+
+ /*
+	 * Check the last six ACEs, if we have six.
+ */
+
+ if (aclp->z_acl_count >= 6) {
+ i = aclp->z_acl_count - 6;
+
+ if (zfs_have_canonical_six(aclp, i)) {
+ need_canonical_six = 0;
+ }
+ }
+
+ if (need_canonical_six) {
+
+ zfs_acl_append(aclp, 6);
+ i = aclp->z_acl_count;
+ acep = aclp->z_acl;
+ zfs_set_ace(&acep[i++], 0, DENY, -1, ACE_OWNER);
+ zfs_set_ace(&acep[i++], OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER);
+ zfs_set_ace(&acep[i++], 0, DENY, -1, OWNING_GROUP);
+ zfs_set_ace(&acep[i++], 0, ALLOW, -1, OWNING_GROUP);
+ zfs_set_ace(&acep[i++], EVERYONE_DENY_MASK,
+ DENY, -1, ACE_EVERYONE);
+ zfs_set_ace(&acep[i++], EVERYONE_ALLOW_MASK,
+ ALLOW, -1, ACE_EVERYONE);
+ aclp->z_acl_count += 6;
+ }
+
+ zfs_acl_fixup_canonical_six(aclp, mode);
+
+ zp->z_phys->zp_mode = mode;
+ error = zfs_aclset_common(zp, aclp, tx, &inherit);
+ return (error);
+}
+
+
+int
+zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx)
+{
+ zfs_acl_t *aclp = NULL;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ mutex_enter(&zp->z_acl_lock);
+ error = zfs_acl_node_read(zp, &aclp);
+ if (error == 0)
+ error = zfs_acl_chmod(zp, mode, aclp, tx);
+ mutex_exit(&zp->z_acl_lock);
+ if (aclp)
+ zfs_acl_free(aclp);
+ return (error);
+}
+
+/*
+ * strip off write_owner and write_acl
+ */
+static void
+zfs_securemode_update(zfsvfs_t *zfsvfs, ace_t *acep)
+{
+ if ((zfsvfs->z_acl_inherit == ZFS_ACL_SECURE) &&
+ (acep->a_type == ALLOW))
+ acep->a_access_mask &= ~SECURE_CLEAR;
+}
+
+/*
+ * inherit inheritable ACEs from parent
+ */
+static zfs_acl_t *
+zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ ace_t *pacep;
+ ace_t *acep;
+ int ace_cnt = 0;
+ int pace_cnt;
+ int i, j;
+ zfs_acl_t *aclp = NULL;
+
+ i = j = 0;
+ pace_cnt = paclp->z_acl_count;
+ pacep = paclp->z_acl;
+ if (zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) {
+ for (i = 0; i != pace_cnt; i++) {
+
+ if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW &&
+ pacep[i].a_type == ALLOW)
+ continue;
+
+ if (zfs_ace_can_use(zp, &pacep[i])) {
+ ace_cnt++;
+ if (!(pacep[i].a_flags &
+ ACE_NO_PROPAGATE_INHERIT_ACE))
+ ace_cnt++;
+ }
+ }
+ }
+
+ aclp = zfs_acl_alloc(ace_cnt + OGE_PAD);
+ if (ace_cnt && zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) {
+ acep = aclp->z_acl;
+ pacep = paclp->z_acl;
+ for (i = 0; i != pace_cnt; i++) {
+
+ if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW &&
+ pacep[i].a_type == ALLOW)
+ continue;
+
+ if (zfs_ace_can_use(zp, &pacep[i])) {
+
+ /*
+ * Now create entry for inherited ace
+ */
+
+ acep[j] = pacep[i];
+
+ /*
+ * When AUDIT/ALARM a_types are supported
+ * they should be inherited here.
+ */
+
+ if ((pacep[i].a_flags &
+ ACE_NO_PROPAGATE_INHERIT_ACE) ||
+ (ZTOV(zp)->v_type != VDIR)) {
+ acep[j].a_flags &= ~ALL_INHERIT;
+ zfs_securemode_update(zfsvfs, &acep[j]);
+ j++;
+ continue;
+ }
+
+ ASSERT(ZTOV(zp)->v_type == VDIR);
+
+ /*
+ * If we are inheriting an ACE targeted for
+ * only files, then make sure inherit_only
+ * is on for future propagation.
+ */
+ if ((pacep[i].a_flags & (ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) !=
+ ACE_FILE_INHERIT_ACE) {
+ j++;
+ acep[j] = acep[j-1];
+ acep[j-1].a_flags |=
+ ACE_INHERIT_ONLY_ACE;
+ acep[j].a_flags &= ~ALL_INHERIT;
+ } else {
+ acep[j].a_flags |= ACE_INHERIT_ONLY_ACE;
+ }
+ zfs_securemode_update(zfsvfs, &acep[j]);
+ j++;
+ }
+ }
+ }
+ aclp->z_acl_count = j;
+ ASSERT(aclp->z_slots >= aclp->z_acl_count);
+
+ return (aclp);
+}
+
+/*
+ * Create file system object initial permissions
+ * including inheritable ACEs.
+ */
+void
+zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
+ vattr_t *vap, dmu_tx_t *tx, cred_t *cr)
+{
+ uint64_t mode;
+ uid_t uid;
+ gid_t gid;
+ int error;
+ int pull_down;
+ zfs_acl_t *aclp, *paclp;
+
+ mode = MAKEIMODE(vap->va_type, vap->va_mode);
+
+ /*
+ * Determine uid and gid.
+ */
+ if ((flag & (IS_ROOT_NODE | IS_REPLAY)) ||
+ ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
+ uid = vap->va_uid;
+ gid = vap->va_gid;
+ } else {
+ uid = crgetuid(cr);
+ if ((vap->va_mask & AT_GID) &&
+ ((vap->va_gid == parent->z_phys->zp_gid) ||
+ groupmember(vap->va_gid, cr) ||
+ secpolicy_vnode_create_gid(cr) == 0))
+ gid = vap->va_gid;
+ else
+#ifdef __FreeBSD__
+ gid = parent->z_phys->zp_gid;
+#else
+ gid = (parent->z_phys->zp_mode & S_ISGID) ?
+ parent->z_phys->zp_gid : crgetgid(cr);
+#endif
+ }
+
+ /*
+ * If we're creating a directory, and the parent directory has the
+ * set-GID bit set, set it on the new directory.
+ * Otherwise, if the user is neither privileged nor a member of the
+ * file's new group, clear the file's set-GID bit.
+ */
+
+ if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR))
+ mode |= S_ISGID;
+ else {
+ if ((mode & S_ISGID) &&
+ secpolicy_vnode_setids_setgids(cr, gid) != 0)
+ mode &= ~S_ISGID;
+ }
+
+ zp->z_phys->zp_uid = uid;
+ zp->z_phys->zp_gid = gid;
+ zp->z_phys->zp_mode = mode;
+
+ mutex_enter(&parent->z_lock);
+ pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE);
+ if (pull_down) {
+ mutex_enter(&parent->z_acl_lock);
+ VERIFY(0 == zfs_acl_node_read(parent, &paclp));
+ mutex_exit(&parent->z_acl_lock);
+ aclp = zfs_acl_inherit(zp, paclp);
+ zfs_acl_free(paclp);
+ } else {
+ aclp = zfs_acl_alloc(6);
+ }
+ mutex_exit(&parent->z_lock);
+ mutex_enter(&zp->z_lock);
+ mutex_enter(&zp->z_acl_lock);
+ error = zfs_acl_chmod(zp, mode, aclp, tx);
+ mutex_exit(&zp->z_lock);
+ mutex_exit(&zp->z_acl_lock);
+ ASSERT3U(error, ==, 0);
+ zfs_acl_free(aclp);
+}
+
+/*
+ * Should ACE be inherited?
+ */
+static int
+zfs_ace_can_use(znode_t *zp, ace_t *acep)
+{
+ int vtype = ZTOV(zp)->v_type;
+
+ int iflags = (acep->a_flags & 0xf);
+
+ if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
+ return (1);
+ else if (iflags & ACE_FILE_INHERIT_ACE)
+ return (!((vtype == VDIR) &&
+ (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
+ return (0);
+}
+
+#ifdef TODO
+/*
+ * Retrieve a file's ACL
+ */
+int
+zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr)
+{
+ zfs_acl_t *aclp;
+ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+ int error;
+
+ if (error = zfs_zaccess(zp, ACE_READ_ACL, cr)) {
+ /*
+ * If owner of file then allow reading of the
+ * ACL.
+ */
+ if (crgetuid(cr) != zp->z_phys->zp_uid)
+ return (error);
+ }
+
+ if (mask == 0)
+ return (ENOSYS);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ error = zfs_acl_node_read(zp, &aclp);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+
+ if (mask & VSA_ACECNT) {
+ vsecp->vsa_aclcnt = aclp->z_acl_count;
+ }
+
+ if (mask & VSA_ACE) {
+ vsecp->vsa_aclentp = kmem_alloc(aclp->z_acl_count *
+ sizeof (ace_t), KM_SLEEP);
+ bcopy(aclp->z_acl, vsecp->vsa_aclentp,
+ aclp->z_acl_count * sizeof (ace_t));
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ zfs_acl_free(aclp);
+
+ return (0);
+}
+#endif /* TODO */
+
+#ifdef TODO
+/*
+ * Set a file's ACL
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ ace_t *acep = vsecp->vsa_aclentp;
+ int aclcnt = vsecp->vsa_aclcnt;
+ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+ dmu_tx_t *tx;
+ int error;
+ int inherit;
+ zfs_acl_t *aclp;
+
+ if (mask == 0)
+ return (EINVAL);
+
+ if (!zfs_acl_valid(zp, acep, aclcnt, &inherit))
+ return (EINVAL);
+top:
+ error = zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr);
+ if (error == EACCES || error == ACCESS_UNDETERMINED) {
+ if ((error = secpolicy_vnode_setdac(cr,
+ zp->z_phys->zp_uid)) != 0) {
+ return (error);
+ }
+ } else if (error) {
+ return (error == EROFS ? error : EPERM);
+ }
+
+ mutex_enter(&zp->z_lock);
+ mutex_enter(&zp->z_acl_lock);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+
+ if (zp->z_phys->zp_acl.z_acl_extern_obj) {
+ dmu_tx_hold_write(tx, zp->z_phys->zp_acl.z_acl_extern_obj,
+ 0, ZFS_ACL_SIZE(aclcnt));
+ } else if (aclcnt > ACE_SLOT_CNT) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ZFS_ACL_SIZE(aclcnt));
+ }
+
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ mutex_exit(&zp->z_acl_lock);
+ mutex_exit(&zp->z_lock);
+
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ aclp = zfs_acl_alloc(aclcnt);
+ bcopy(acep, aclp->z_acl, sizeof (ace_t) * aclcnt);
+ aclp->z_acl_count = aclcnt;
+ error = zfs_aclset_common(zp, aclp, tx, &inherit);
+ ASSERT(error == 0);
+
+ zfs_acl_free(aclp);
+ zfs_log_acl(zilog, tx, TX_ACL, zp, aclcnt, acep);
+ dmu_tx_commit(tx);
+done:
+ mutex_exit(&zp->z_acl_lock);
+ mutex_exit(&zp->z_lock);
+
+ return (error);
+}
+#endif /* TODO */
+
+static int
+zfs_ace_access(ace_t *zacep, int *working_mode)
+{
+ if (*working_mode == 0) {
+ return (0);
+ }
+
+ if (zacep->a_access_mask & *working_mode) {
+ if (zacep->a_type == ALLOW) {
+ *working_mode &=
+ ~(*working_mode & zacep->a_access_mask);
+ if (*working_mode == 0)
+ return (0);
+ } else if (zacep->a_type == DENY) {
+ return (EACCES);
+ }
+ }
+
+ /*
+	 * We haven't been specifically denied at this point,
+ * so return UNDETERMINED.
+ */
+
+ return (ACCESS_UNDETERMINED);
+}
+
+
+static int
+zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr)
+{
+ zfs_acl_t *aclp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ ace_t *zacep;
+ gid_t gid;
+ int cnt;
+ int i;
+ int error;
+ int access_deny = ACCESS_UNDETERMINED;
+ uint_t entry_type;
+ uid_t uid = crgetuid(cr);
+
+ if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */
+ *working_mode = 0;
+ return (0);
+ }
+
+ *working_mode = v4_mode;
+
+ if ((v4_mode & WRITE_MASK) &&
+ (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+ (!IS_DEVVP(ZTOV(zp)))) {
+ return (EROFS);
+ }
+
+ mutex_enter(&zp->z_acl_lock);
+
+ error = zfs_acl_node_read(zp, &aclp);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+
+ zacep = aclp->z_acl;
+ cnt = aclp->z_acl_count;
+
+ for (i = 0; i != cnt; i++) {
+
+ DTRACE_PROBE2(zfs__access__common,
+ ace_t *, &zacep[i], int, *working_mode);
+
+ if (zacep[i].a_flags & ACE_INHERIT_ONLY_ACE)
+ continue;
+
+ entry_type = (zacep[i].a_flags & ACE_TYPE_FLAGS);
+ switch (entry_type) {
+ case ACE_OWNER:
+ if (uid == zp->z_phys->zp_uid) {
+ access_deny = zfs_ace_access(&zacep[i],
+ working_mode);
+ }
+ break;
+ case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+ case ACE_IDENTIFIER_GROUP:
+ /*
+			 * The owning group's gid is in the znode, not the ACL.
+ */
+ if (entry_type == (ACE_IDENTIFIER_GROUP | ACE_GROUP))
+ gid = zp->z_phys->zp_gid;
+ else
+ gid = zacep[i].a_who;
+
+ if (groupmember(gid, cr)) {
+ access_deny = zfs_ace_access(&zacep[i],
+ working_mode);
+ }
+ break;
+ case ACE_EVERYONE:
+ access_deny = zfs_ace_access(&zacep[i], working_mode);
+ break;
+
+ /* USER Entry */
+ default:
+ if (entry_type == 0) {
+ if (uid == zacep[i].a_who) {
+ access_deny = zfs_ace_access(&zacep[i],
+ working_mode);
+ }
+ break;
+ }
+ zfs_acl_free(aclp);
+ mutex_exit(&zp->z_acl_lock);
+ return (EIO);
+ }
+
+ if (access_deny != ACCESS_UNDETERMINED)
+ break;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+ zfs_acl_free(aclp);
+
+ return (access_deny);
+}
+
+
+/*
+ * Determine whether access should be granted or denied, invoking the
+ * least-privilege subsystem when a deny is determined.
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, cred_t *cr)
+{
+ int working_mode;
+ int error;
+ int is_attr;
+ znode_t *xzp;
+ znode_t *check_zp = zp;
+
+ is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) &&
+ (ZTOV(zp)->v_type == VDIR));
+
+ /*
+	 * If this is an attribute, validate against the base file.
+ */
+ if (is_attr) {
+ if ((error = zfs_zget(zp->z_zfsvfs,
+ zp->z_phys->zp_parent, &xzp)) != 0) {
+ return (error);
+ }
+ check_zp = xzp;
+ /*
+ * fixup mode to map to xattr perms
+ */
+
+ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+ mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ mode |= ACE_WRITE_NAMED_ATTRS;
+ }
+
+ if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+ mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+ mode |= ACE_READ_NAMED_ATTRS;
+ }
+ }
+
+ error = zfs_zaccess_common(check_zp, mode, &working_mode, cr);
+
+ if (error == EROFS) {
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+ return (error);
+ }
+
+ if (error || working_mode) {
+ working_mode = (zfs_v4_to_unix(working_mode) << 6);
+ error = secpolicy_vnode_access(cr, ZTOV(check_zp),
+ check_zp->z_phys->zp_uid, working_mode);
+ }
+
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+
+ return (error);
+}
+
+/*
+ * Special zaccess function to check for a special NFSv4 permission.
+ * It doesn't call secpolicy_vnode_access() on failure, since that
+ * would probably be the wrong policy function to call.
+ * Instead, it's up to the caller to handle that situation.
+ */
+
+int
+zfs_zaccess_v4_perm(znode_t *zp, int mode, cred_t *cr)
+{
+ int working_mode = 0;
+ return (zfs_zaccess_common(zp, mode, &working_mode, cr));
+}
+
+/*
+ * Translate the traditional Unix VREAD/VWRITE/VEXEC mode into the
+ * native ACL format and call zfs_zaccess().
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, cred_t *cr)
+{
+ int v4_mode = zfs_unix_to_v4(mode >> 6);
+
+ return (zfs_zaccess(zp, v4_mode, cr));
+}
+
+static int
+zfs_delete_final_check(znode_t *zp, znode_t *dzp, cred_t *cr)
+{
+ int error;
+
+ error = secpolicy_vnode_access(cr, ZTOV(zp),
+ dzp->z_phys->zp_uid, S_IWRITE|S_IEXEC);
+
+ if (error == 0)
+ error = zfs_sticky_remove_access(dzp, zp, cr);
+
+ return (error);
+}
+
+/*
+ * Determine whether access should be granted or denied, without
+ * consulting the least-privilege subsystem.
+ *
+ *
+ * The following chart is the recommended NFSv4 enforcement for
+ * ability to delete an object.
+ *
+ * -------------------------------------------------------
+ * | Parent Dir | Target Object Permissions |
+ * | permissions | |
+ * -------------------------------------------------------
+ * | | ACL Allows | ACL Denies| Delete |
+ * | | Delete | Delete | unspecified|
+ * -------------------------------------------------------
+ * | ACL Allows | Permit | Permit | Permit |
+ * | DELETE_CHILD | |
+ * -------------------------------------------------------
+ * | ACL Denies | Permit | Deny | Deny |
+ * | DELETE_CHILD | | | |
+ * -------------------------------------------------------
+ * | ACL specifies | | | |
+ * | only allow | Permit | Permit | Permit |
+ * | write and | | | |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * | ACL denies | | | |
+ * | write and | Permit | Deny | Deny |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * ^
+ * |
+ * No search privilege, can't even look up file?
+ *
+ */
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+ int dzp_working_mode = 0;
+ int zp_working_mode = 0;
+ int dzp_error, zp_error;
+
+ /*
+ * Arghh, this check is going to require a couple of questions
+ * to be asked. We want specific DELETE permissions to
+ * take precedence over WRITE/EXECUTE. We don't
+ * want an ACL such as this to mess us up.
+ * user:joe:write_data:deny,user:joe:delete:allow
+ *
+ * However, deny permissions may ultimately be overridden
+ * by secpolicy_vnode_access().
+ */
+
+ dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
+ &dzp_working_mode, cr);
+ zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, cr);
+
+ if (dzp_error == EROFS || zp_error == EROFS)
+ return (dzp_error);
+
+ /*
+ * First check the first row.
+ * We only need to see if parent Allows delete_child
+ */
+ if ((dzp_working_mode & ACE_DELETE_CHILD) == 0)
+ return (0);
+
+ /*
+ * Second row
+ * we already have the necessary information in
+ * zp_working_mode, zp_error and dzp_error.
+ */
+
+ if ((zp_working_mode & ACE_DELETE) == 0)
+ return (0);
+
+ /*
+	 * Now zp_error should be either EACCES, which indicates
+	 * a "deny" delete entry, or ACCESS_UNDETERMINED if no "delete"
+	 * entry exists on the target.
+	 *
+	 * dzp_error should be either EACCES, which indicates a "deny"
+	 * entry for delete_child, or ACCESS_UNDETERMINED if no delete_child
+	 * entry exists.  If the value is EACCES then we are done
+	 * and zfs_delete_final_check() will make the final decision
+	 * regarding whether to allow the delete.
+ */
+
+ ASSERT(zp_error != 0 && dzp_error != 0);
+ if (dzp_error == EACCES)
+ return (zfs_delete_final_check(zp, dzp, cr));
+
+ /*
+ * Third Row
+ * Only need to check for write/execute on parent
+ */
+
+ dzp_error = zfs_zaccess_common(dzp, ACE_WRITE_DATA|ACE_EXECUTE,
+ &dzp_working_mode, cr);
+
+ if (dzp_error == EROFS)
+ return (dzp_error);
+
+ if ((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) == 0)
+ return (zfs_sticky_remove_access(dzp, zp, cr));
+
+ /*
+ * Fourth Row
+ */
+
+ if (((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) != 0) &&
+ ((zp_working_mode & ACE_DELETE) == 0))
+ return (zfs_sticky_remove_access(dzp, zp, cr));
+
+ return (zfs_delete_final_check(zp, dzp, cr));
+}
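
[Editorial note: a hedged sketch of how a removal path might consult the chart above; the name example_remove_access and its call site are illustrative, not from this commit.]

static int
example_remove_access(znode_t *dzp, znode_t *zp, cred_t *cr)
{
	int error;

	/* walks rows 1-4 of the chart, falling back to the least-priv check */
	error = zfs_zaccess_delete(dzp, zp, cr);
	if (error)
		return (error);

	/* ... safe to proceed with the actual directory-entry removal ... */
	return (0);
}
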
+
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+ znode_t *tzp, cred_t *cr)
+{
+ int add_perm;
+ int error;
+
+ add_perm = (ZTOV(szp)->v_type == VDIR) ?
+ ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+ /*
+	 * Rename permissions are a combination of delete permission +
+ * add file/subdir permission.
+ */
+
+ /*
+	 * First make sure the delete portion is permitted.
+	 *
+	 * If that succeeds, then check for add_file/add_subdir permissions.
+ */
+
+ if (error = zfs_zaccess_delete(sdzp, szp, cr))
+ return (error);
+
+ /*
+	 * If we have a tzp, see if we can delete it.
+ */
+ if (tzp) {
+ if (error = zfs_zaccess_delete(tdzp, tzp, cr))
+ return (error);
+ }
+
+ /*
+ * Now check for add permissions
+ */
+ error = zfs_zaccess(tdzp, add_perm, cr);
+
+ return (error);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
new file mode 100644
index 0000000..c8450d4
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
@@ -0,0 +1,99 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_acl.h>
+
+void
+zfs_ace_byteswap(ace_t *ace, int ace_cnt)
+{
+ int i;
+
+ for (i = 0; i != ace_cnt; i++, ace++) {
+ ace->a_who = BSWAP_32(ace->a_who);
+ ace->a_access_mask = BSWAP_32(ace->a_access_mask);
+ ace->a_flags = BSWAP_16(ace->a_flags);
+ ace->a_type = BSWAP_16(ace->a_type);
+ }
+}
+
+/* ARGSUSED */
+void
+zfs_acl_byteswap(void *buf, size_t size)
+{
+ int cnt;
+
+ /*
+ * Arggh, since we don't know how many ACEs are in
+ * the array, we have to swap the entire block
+ */
+
+ cnt = size / sizeof (ace_t);
+
+ zfs_ace_byteswap((ace_t *)buf, cnt);
+}
+
+void
+zfs_znode_byteswap(void *buf, size_t size)
+{
+ znode_phys_t *zp = buf;
+
+ ASSERT(size >= sizeof (znode_phys_t));
+
+ zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]);
+ zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]);
+ zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]);
+ zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]);
+ zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]);
+ zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]);
+ zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]);
+ zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]);
+ zp->zp_gen = BSWAP_64(zp->zp_gen);
+ zp->zp_mode = BSWAP_64(zp->zp_mode);
+ zp->zp_size = BSWAP_64(zp->zp_size);
+ zp->zp_parent = BSWAP_64(zp->zp_parent);
+ zp->zp_links = BSWAP_64(zp->zp_links);
+ zp->zp_xattr = BSWAP_64(zp->zp_xattr);
+ zp->zp_rdev = BSWAP_64(zp->zp_rdev);
+ zp->zp_flags = BSWAP_64(zp->zp_flags);
+ zp->zp_uid = BSWAP_64(zp->zp_uid);
+ zp->zp_gid = BSWAP_64(zp->zp_gid);
+ zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]);
+ zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]);
+ zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]);
+ zp->zp_pad[3] = BSWAP_64(zp->zp_pad[3]);
+
+ zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj);
+ zp->zp_acl.z_acl_count = BSWAP_32(zp->zp_acl.z_acl_count);
+ zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version);
+ zp->zp_acl.z_acl_pad = BSWAP_16(zp->zp_acl.z_acl_pad);
+ zfs_ace_byteswap(&zp->zp_acl.z_ace_data[0], ACE_SLOT_CNT);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
new file mode 100644
index 0000000..c759962
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
@@ -0,0 +1,1120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' directory, but this may expand in the
+ * future. The elements are built using the GFS primitives, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab. We have three
+ * types of objects:
+ *
+ * ctldir ------> snapshotdir -------> snapshot
+ * |
+ * |
+ * V
+ * mounted fs
+ *
+ * The 'snapshot' node contains just enough information to look up '..' and act
+ * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding vnode.
+ *
+ * All mounts are handled automatically by the kernel, but unmounts are
+ * (currently) handled from user land. The main reason is that there is no
+ * reliable way to auto-unmount the filesystem when it's "no longer in use".
+ * When the user unmounts a filesystem, we call zfsctl_unmount(), which
+ * unmounts any snapshots within the snapshot directory.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/namei.h>
+#include <sys/gfs.h>
+#include <sys/stat.h>
+#include <sys/dmu.h>
+#include <sys/mount.h>
+
+typedef struct {
+ char *se_name;
+ vnode_t *se_root;
+ avl_node_t se_node;
+} zfs_snapentry_t;
+
+static int
+snapentry_compare(const void *a, const void *b)
+{
+ const zfs_snapentry_t *sa = a;
+ const zfs_snapentry_t *sb = b;
+ int ret = strcmp(sa->se_name, sb->se_name);
+
+ if (ret < 0)
+ return (-1);
+ else if (ret > 0)
+ return (1);
+ else
+ return (0);
+}
+
+static struct vop_vector zfsctl_ops_root;
+static struct vop_vector zfsctl_ops_snapdir;
+static struct vop_vector zfsctl_ops_snapshot;
+
+static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
+static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
+
+typedef struct zfsctl_node {
+ gfs_dir_t zc_gfs_private;
+ uint64_t zc_id;
+ timestruc_t zc_cmtime; /* ctime and mtime, always the same */
+} zfsctl_node_t;
+
+typedef struct zfsctl_snapdir {
+ zfsctl_node_t sd_node;
+ kmutex_t sd_lock;
+ avl_tree_t sd_snaps;
+} zfsctl_snapdir_t;
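
[Editorial note: for reference, a minimal sketch of how sd_lock and sd_snaps would be initialized with snapentry_compare(), assuming the stock Solaris mutex_init()/avl_create() interfaces; example_snapdir_init is an illustrative name.]

static void
example_snapdir_init(zfsctl_snapdir_t *sdp)
{
	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
	/* keyed by se_name via snapentry_compare(), linked at se_node */
	avl_create(&sdp->sd_snaps, snapentry_compare,
	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
}
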
+
+/*
+ * Root directory elements. We have only a single static entry, 'snapshot'.
+ */
+static gfs_dirent_t zfsctl_root_entries[] = {
+ { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
+ { NULL }
+};
+
+/* include . and .. in the calculation */
+#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \
+ sizeof (gfs_dirent_t)) + 1)
+
+
+/*
+ * Initialize the various GFS pieces we'll need to create and manipulate .zfs
+ * directories. This is called from the ZFS init routine, and initializes the
+ * vnode ops vectors that we'll be using.
+ */
+void
+zfsctl_init(void)
+{
+}
+
+void
+zfsctl_fini(void)
+{
+}
+
+/*
+ * Return the inode number associated with the 'snapshot' directory.
+ */
+/* ARGSUSED */
+static ino64_t
+zfsctl_root_inode_cb(vnode_t *vp, int index)
+{
+ ASSERT(index == 0);
+ return (ZFSCTL_INO_SNAPDIR);
+}
+
+/*
+ * Create the '.zfs' directory. This directory is cached as part of the VFS
+ * structure. This results in a hold on the vfs_t. The code in zfs_umount()
+ * therefore checks against a vfs_count of 2 instead of 1. This reference
+ * is removed when the ctldir is destroyed in the unmount.
+ */
+void
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+ vnode_t *vp, *rvp;
+ zfsctl_node_t *zcp;
+
+ ASSERT(zfsvfs->z_ctldir == NULL);
+
+ vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
+ &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
+ zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
+ zcp = vp->v_data;
+ zcp->zc_id = ZFSCTL_INO_ROOT;
+
+ VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp, curthread) == 0);
+ ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
+ VN_URELE(rvp);
+
+ /*
+ * We're only faking the fact that we have a root of a filesystem for
+ * the sake of the GFS interfaces. Undo the flag manipulation it did
+ * for us.
+ */
+ vp->v_vflag &= ~VV_ROOT;
+
+ zfsvfs->z_ctldir = vp;
+}
+
+/*
+ * Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
+ * There might still be more references if we were force unmounted, but only
+ * new zfs_inactive() calls can occur and they don't reference .zfs
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+ VN_RELE(zfsvfs->z_ctldir);
+ zfsvfs->z_ctldir = NULL;
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the vnode and return it.
+ */
+vnode_t *
+zfsctl_root(znode_t *zp)
+{
+ ASSERT(zfs_has_ctldir(zp));
+ VN_HOLD(zp->z_zfsvfs->z_ctldir);
+ return (zp->z_zfsvfs->z_ctldir);
+}
+
+/*
+ * Common open routine. Disallow any write access.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_open(struct vop_open_args *ap)
+{
+ int flags = ap->a_mode;
+
+ if (flags & FWRITE)
+ return (EACCES);
+
+ return (0);
+}
+
+/*
+ * Common close routine. Nothing to do here.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_close(struct vop_close_args *ap)
+{
+ return (0);
+}
+
+/*
+ * Common access routine. Disallow writes.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_access(ap)
+ struct vop_access_args /* {
+ struct vnode *a_vp;
+ int a_mode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ int mode = ap->a_mode;
+
+ if (mode & VWRITE)
+ return (EACCES);
+
+ return (0);
+}
+
+/*
+ * Common getattr function. Fill in basic information.
+ */
+static void
+zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
+{
+ zfsctl_node_t *zcp = vp->v_data;
+ timestruc_t now;
+
+ vap->va_uid = 0;
+ vap->va_gid = 0;
+ vap->va_rdev = 0;
+ /*
+	 * We are a purely virtual object, so we have no
+ * blocksize or allocated blocks.
+ */
+ vap->va_blksize = 0;
+ vap->va_nblocks = 0;
+ vap->va_seq = 0;
+ vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
+ vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
+ S_IROTH | S_IXOTH;
+ vap->va_type = VDIR;
+ /*
+ * We live in the now (for atime).
+ */
+ gethrestime(&now);
+ vap->va_atime = now;
+ vap->va_mtime = vap->va_ctime = vap->va_birthtime = zcp->zc_cmtime;
+ /* FreeBSD: Reset chflags(2) flags. */
+ vap->va_flags = 0;
+}
+
+static int
+zfsctl_common_fid(ap)
+ struct vop_fid_args /* {
+ struct vnode *a_vp;
+ struct fid *a_fid;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ fid_t *fidp = (void *)ap->a_fid;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ zfsctl_node_t *zcp = vp->v_data;
+ uint64_t object = zcp->zc_id;
+ zfid_short_t *zfid;
+ int i;
+
+ ZFS_ENTER(zfsvfs);
+
+ fidp->fid_len = SHORT_FID_LEN;
+
+ zfid = (zfid_short_t *)fidp;
+
+ zfid->zf_len = SHORT_FID_LEN;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* .zfs znodes always have a generation number of 0 */
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = 0;
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static int
+zfsctl_common_reclaim(ap)
+ struct vop_reclaim_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+
+ /*
+ * Destroy the vm object and flush associated pages.
+ */
+ vnode_destroy_vobject(vp);
+ VI_LOCK(vp);
+ vp->v_data = NULL;
+ VI_UNLOCK(vp);
+ return (0);
+}
+
+/*
+ * .zfs inode namespace
+ *
+ * We need to generate unique inode numbers for all files and directories
+ * within the .zfs pseudo-filesystem. We use the following scheme:
+ *
+ * ENTRY ZFSCTL_INODE
+ * .zfs 1
+ * .zfs/snapshot 2
+ * .zfs/snapshot/<snap> objectid(snap)
+ */
+
+#define ZFSCTL_INO_SNAP(id) (id)
+
+/*
+ * Get root directory attributes.
+ */
+/* ARGSUSED */
+static int
+zfsctl_root_getattr(ap)
+ struct vop_getattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+ struct vattr *vap = ap->a_vap;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+
+ ZFS_ENTER(zfsvfs);
+ vap->va_nodeid = ZFSCTL_INO_ROOT;
+ vap->va_nlink = vap->va_size = NROOT_ENTRIES;
+
+ zfsctl_common_getattr(vp, vap);
+ ZFS_EXIT(zfsvfs);
+
+ return (0);
+}
+
+/*
+ * Special case the handling of "..".
+ */
+/* ARGSUSED */
+int
+zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+ int flags, vnode_t *rdir, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+ int err;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (strcmp(nm, "..") == 0) {
+ err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp, curthread);
+ if (err == 0)
+ VOP_UNLOCK(*vpp, 0, curthread);
+ } else {
+ err = gfs_dir_lookup(dvp, nm, vpp);
+ }
+
+ ZFS_EXIT(zfsvfs);
+
+ return (err);
+}
+
+/*
+ * Special case the handling of "..".
+ */
+/* ARGSUSED */
+int
+zfsctl_root_lookup_vop(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ cred_t *cr = ap->a_cnp->cn_cred;
+ int flags = ap->a_cnp->cn_flags;
+ int nameiop = ap->a_cnp->cn_nameiop;
+ char nm[NAME_MAX + 1];
+ int err;
+
+ if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE))
+ return (EOPNOTSUPP);
+
+ ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
+ strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
+
+ err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr);
+ if (err == 0 && (nm[0] != '.' || nm[1] != '\0'))
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
+
+ return (err);
+}
+
+static struct vop_vector zfsctl_ops_root = {
+ .vop_default = &default_vnodeops,
+ .vop_open = zfsctl_common_open,
+ .vop_close = zfsctl_common_close,
+ .vop_ioctl = VOP_EINVAL,
+ .vop_getattr = zfsctl_root_getattr,
+ .vop_access = zfsctl_common_access,
+ .vop_readdir = gfs_vop_readdir,
+ .vop_lookup = zfsctl_root_lookup_vop,
+ .vop_inactive = gfs_vop_inactive,
+ .vop_reclaim = zfsctl_common_reclaim,
+ .vop_fid = zfsctl_common_fid,
+};
+
+static int
+zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
+{
+ objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+
+ dmu_objset_name(os, zname);
+ if (strlen(zname) + 1 + strlen(name) >= len)
+ return (ENAMETOOLONG);
+ (void) strcat(zname, "@");
+ (void) strcat(zname, name);
+ return (0);
+}
+
+static int
+zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
+{
+ zfsctl_snapdir_t *sdp = dvp->v_data;
+ zfs_snapentry_t search, *sep;
+ struct vop_inactive_args ap;
+ avl_index_t where;
+ int err;
+
+ ASSERT(MUTEX_HELD(&sdp->sd_lock));
+
+ search.se_name = (char *)name;
+ if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
+ return (ENOENT);
+
+ ASSERT(vn_ismntpt(sep->se_root));
+
+ /* this will be dropped by dounmount() */
+ if ((err = vn_vfswlock(sep->se_root)) != 0)
+ return (err);
+
+ err = dounmount(vn_mountedvfs(sep->se_root), force, curthread);
+ if (err)
+ return (err);
+ ASSERT(sep->se_root->v_count == 1);
+ ap.a_vp = sep->se_root;
+ gfs_vop_inactive(&ap);
+
+ avl_remove(&sdp->sd_snaps, sep);
+ kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+ kmem_free(sep, sizeof (zfs_snapentry_t));
+
+ return (0);
+}
+
+#if 0
+static void
+zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
+{
+ avl_index_t where;
+ vfs_t *vfsp;
+ refstr_t *pathref;
+ char newpath[MAXNAMELEN];
+ char *tail;
+
+ ASSERT(MUTEX_HELD(&sdp->sd_lock));
+ ASSERT(sep != NULL);
+
+ vfsp = vn_mountedvfs(sep->se_root);
+ ASSERT(vfsp != NULL);
+
+ vfs_lock_wait(vfsp);
+
+ /*
+ * Change the name in the AVL tree.
+ */
+ avl_remove(&sdp->sd_snaps, sep);
+ kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+ sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
+ (void) strcpy(sep->se_name, nm);
+ VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
+ avl_insert(&sdp->sd_snaps, sep, where);
+
+ /*
+ * Change the current mountpoint info:
+ * - update the tail of the mntpoint path
+ * - update the tail of the resource path
+ */
+ pathref = vfs_getmntpoint(vfsp);
+ (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
+ VERIFY((tail = strrchr(newpath, '/')) != NULL);
+ *(tail+1) = '\0';
+ ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
+ (void) strcat(newpath, nm);
+ refstr_rele(pathref);
+ vfs_setmntpoint(vfsp, newpath);
+
+ pathref = vfs_getresource(vfsp);
+ (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
+ VERIFY((tail = strrchr(newpath, '@')) != NULL);
+ *(tail+1) = '\0';
+ ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
+ (void) strcat(newpath, nm);
+ refstr_rele(pathref);
+ vfs_setresource(vfsp, newpath);
+
+ vfs_unlock(vfsp);
+}
+#endif
+
+#if 0
+static int
+zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
+ cred_t *cr)
+{
+ zfsctl_snapdir_t *sdp = sdvp->v_data;
+ zfs_snapentry_t search, *sep;
+ avl_index_t where;
+ char from[MAXNAMELEN], to[MAXNAMELEN];
+ int err;
+
+ err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
+ if (err)
+ return (err);
+ err = zfs_secpolicy_write(from, cr);
+ if (err)
+ return (err);
+
+ /*
+ * Cannot move snapshots out of the snapdir.
+ */
+ if (sdvp != tdvp)
+ return (EINVAL);
+
+ if (strcmp(snm, tnm) == 0)
+ return (0);
+
+ err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
+ if (err)
+ return (err);
+
+ mutex_enter(&sdp->sd_lock);
+
+ search.se_name = (char *)snm;
+ if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
+ mutex_exit(&sdp->sd_lock);
+ return (ENOENT);
+ }
+
+ err = dmu_objset_rename(from, to);
+ if (err == 0)
+ zfsctl_rename_snap(sdp, sep, tnm);
+
+ mutex_exit(&sdp->sd_lock);
+
+ return (err);
+}
+#endif
+
+#if 0
+/* ARGSUSED */
+static int
+zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
+{
+ zfsctl_snapdir_t *sdp = dvp->v_data;
+ char snapname[MAXNAMELEN];
+ int err;
+
+ err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
+ if (err)
+ return (err);
+ err = zfs_secpolicy_write(snapname, cr);
+ if (err)
+ return (err);
+
+ mutex_enter(&sdp->sd_lock);
+
+ err = zfsctl_unmount_snap(dvp, name, 0, cr);
+ if (err) {
+ mutex_exit(&sdp->sd_lock);
+ return (err);
+ }
+
+ err = dmu_objset_destroy(snapname);
+
+ mutex_exit(&sdp->sd_lock);
+
+ return (err);
+}
+#endif
+
+/*
+ * Lookup entry point for the 'snapshot' directory. Try to open the
+ * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
+ * Perform a mount of the associated dataset on top of the vnode.
+ */
+/* ARGSUSED */
+int
+zfsctl_snapdir_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ char nm[NAME_MAX + 1];
+ zfsctl_snapdir_t *sdp = dvp->v_data;
+ objset_t *snap;
+ char snapname[MAXNAMELEN];
+ char *mountpoint;
+ zfs_snapentry_t *sep, search;
+ size_t mountpoint_len;
+ avl_index_t where;
+ zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+ int err;
+
+ ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
+ strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
+
+ ASSERT(dvp->v_type == VDIR);
+
+ if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
+ return (0);
+
+ *vpp = NULL;
+
+ /*
+ * If we get a recursive call, that means we got called
+ * from the domount() code while it was trying to look up the
+ * spec (which looks like a local path for zfs). We need to
+ * add some flag to domount() to tell it not to do this lookup.
+ */
+ if (MUTEX_HELD(&sdp->sd_lock))
+ return (ENOENT);
+
+ ZFS_ENTER(zfsvfs);
+
+ mutex_enter(&sdp->sd_lock);
+ search.se_name = (char *)nm;
+ if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
+ *vpp = sep->se_root;
+ VN_HOLD(*vpp);
+ if ((*vpp)->v_mountedhere == NULL) {
+ /*
+ * The snapshot was unmounted behind our backs,
+ * try to remount it.
+ */
+ goto domount;
+ }
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
+ mutex_exit(&sdp->sd_lock);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /*
+	 * The requested snapshot is not currently mounted; look it up.
+ */
+ err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
+ if (err) {
+ mutex_exit(&sdp->sd_lock);
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ if (dmu_objset_open(snapname, DMU_OST_ZFS,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
+ mutex_exit(&sdp->sd_lock);
+ ZFS_EXIT(zfsvfs);
+ return (ENOENT);
+ }
+
+ sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+ sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
+ (void) strcpy(sep->se_name, nm);
+ *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
+ VN_HOLD(*vpp);
+ avl_insert(&sdp->sd_snaps, sep, where);
+
+ dmu_objset_close(snap);
+domount:
+ mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
+ strlen("/.zfs/snapshot/") + strlen(nm) + 1;
+ mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
+ (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
+ dvp->v_vfsp->mnt_stat.f_mntonname, nm);
+ err = domount(curthread, *vpp, "zfs", mountpoint, snapname, 0);
+ kmem_free(mountpoint, mountpoint_len);
+ /* FreeBSD: This line was moved from below to avoid a lock recursion. */
+ if (err == 0)
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
+ mutex_exit(&sdp->sd_lock);
+
+ /*
+ * If we had an error, drop our hold on the vnode and
+ * zfsctl_snapshot_inactive() will clean up.
+ */
+ if (err) {
+ VN_RELE(*vpp);
+ *vpp = NULL;
+ }
+ return (err);
+}
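+
+/*
+ * Illustration (editor's note, hypothetical names): for a filesystem
+ * "tank/home" mounted on /tank/home, a lookup of "monday" here builds
+ * the dataset name "tank/home@monday" via zfsctl_snapshot_zname() and
+ * mounts that snapshot at "/tank/home/.zfs/snapshot/monday".
+ */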
+
+/* ARGSUSED */
+static int
+zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
+ offset_t *offp, offset_t *nextp, void *data)
+{
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ char snapname[MAXNAMELEN];
+ uint64_t id, cookie;
+
+ ZFS_ENTER(zfsvfs);
+
+ cookie = *offp;
+ if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
+ &cookie) == ENOENT) {
+ *eofp = 1;
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ (void) strcpy(dp->d_name, snapname);
+ dp->d_ino = ZFSCTL_INO_SNAP(id);
+ *nextp = cookie;
+
+ ZFS_EXIT(zfsvfs);
+
+ return (0);
+}
+
+vnode_t *
+zfsctl_mknode_snapdir(vnode_t *pvp)
+{
+ vnode_t *vp;
+ zfsctl_snapdir_t *sdp;
+
+ vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp,
+ &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
+ zfsctl_snapdir_readdir_cb, NULL);
+ sdp = vp->v_data;
+ sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
+ sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
+ mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&sdp->sd_snaps, snapentry_compare,
+ sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
+ return (vp);
+}
+
+/* ARGSUSED */
+static int
+zfsctl_snapdir_getattr(ap)
+ struct vop_getattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+ struct vattr *vap = ap->a_vap;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ zfsctl_snapdir_t *sdp = vp->v_data;
+
+ ZFS_ENTER(zfsvfs);
+ zfsctl_common_getattr(vp, vap);
+ vap->va_nodeid = gfs_file_inode(vp);
+ vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
+ ZFS_EXIT(zfsvfs);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfsctl_snapdir_inactive(ap)
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ zfsctl_snapdir_t *sdp = vp->v_data;
+ void *private;
+
+ private = gfs_dir_inactive(vp);
+ if (private != NULL) {
+ ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
+ mutex_destroy(&sdp->sd_lock);
+ avl_destroy(&sdp->sd_snaps);
+ kmem_free(private, sizeof (zfsctl_snapdir_t));
+ }
+ return (0);
+}
+
+static struct vop_vector zfsctl_ops_snapdir = {
+ .vop_default = &default_vnodeops,
+ .vop_open = zfsctl_common_open,
+ .vop_close = zfsctl_common_close,
+ .vop_ioctl = VOP_EINVAL,
+ .vop_getattr = zfsctl_snapdir_getattr,
+ .vop_access = zfsctl_common_access,
+ .vop_readdir = gfs_vop_readdir,
+ .vop_lookup = zfsctl_snapdir_lookup,
+ .vop_inactive = zfsctl_snapdir_inactive,
+ .vop_reclaim = zfsctl_common_reclaim,
+ .vop_fid = zfsctl_common_fid,
+};
+
+static vnode_t *
+zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
+{
+ vnode_t *vp;
+ zfsctl_node_t *zcp;
+
+ vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
+ &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
+ zcp = vp->v_data;
+ zcp->zc_id = objset;
+
+ return (vp);
+}
+
+static int
+zfsctl_snapshot_inactive(ap)
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ struct vop_inactive_args iap;
+ zfsctl_snapdir_t *sdp;
+ zfs_snapentry_t *sep, *next;
+ int locked;
+ vnode_t *dvp;
+
+ VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
+ sdp = dvp->v_data;
+ VOP_UNLOCK(dvp, 0, ap->a_td);
+
+ if (!(locked = MUTEX_HELD(&sdp->sd_lock)))
+ mutex_enter(&sdp->sd_lock);
+
+ if (vp->v_count > 1) {
+ if (!locked)
+ mutex_exit(&sdp->sd_lock);
+ return (0);
+ }
+ ASSERT(!vn_ismntpt(vp));
+
+ sep = avl_first(&sdp->sd_snaps);
+ while (sep != NULL) {
+ next = AVL_NEXT(&sdp->sd_snaps, sep);
+
+ if (sep->se_root == vp) {
+ avl_remove(&sdp->sd_snaps, sep);
+ kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+ kmem_free(sep, sizeof (zfs_snapentry_t));
+ break;
+ }
+ sep = next;
+ }
+ ASSERT(sep != NULL);
+
+ if (!locked)
+ mutex_exit(&sdp->sd_lock);
+ VN_RELE(dvp);
+
+ /*
+ * Dispose of the vnode for the snapshot mount point.
+ * This is safe to do because once this entry has been removed
+ * from the AVL tree, it can't be found again, so cannot become
+ * "active". If we lookup the same name again we will end up
+ * creating a new vnode.
+ */
+ iap.a_vp = vp;
+ return (gfs_vop_inactive(&iap));
+}
+
+static int
+zfsctl_traverse_begin(vnode_t **vpp, kthread_t *td)
+{
+ int err;
+
+ VN_HOLD(*vpp);
+	/* The snapshot should already be mounted, but just in case. */
+ if (vn_mountedvfs(*vpp) == NULL)
+ return (ENOENT);
+ err = traverse(vpp);
+ if (err == 0)
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+ return (err);
+}
+
+static void
+zfsctl_traverse_end(vnode_t *vp, int err)
+{
+
+ if (err == 0)
+ vput(vp);
+ else
+ VN_RELE(vp);
+}
+
+static int
+zfsctl_snapshot_getattr(ap)
+ struct vop_getattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ int err;
+
+ err = zfsctl_traverse_begin(&vp, ap->a_td);
+ if (err == 0)
+ err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td);
+ zfsctl_traverse_end(vp, err);
+ return (err);
+}
+
+static int
+zfsctl_snapshot_fid(ap)
+ struct vop_fid_args /* {
+ struct vnode *a_vp;
+ struct fid *a_fid;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ int err;
+
+ err = zfsctl_traverse_begin(&vp, curthread);
+ if (err == 0)
+ err = VOP_VPTOFH(vp, (void *)ap->a_fid);
+ zfsctl_traverse_end(vp, err);
+ return (err);
+}
+
+/*
+ * These VPs should never see the light of day. They should always
+ * be covered.
+ */
+static struct vop_vector zfsctl_ops_snapshot = {
+ .vop_default = &default_vnodeops,
+ .vop_inactive = zfsctl_snapshot_inactive,
+ .vop_reclaim = zfsctl_common_reclaim,
+ .vop_getattr = zfsctl_snapshot_getattr,
+ .vop_fid = zfsctl_snapshot_fid,
+};
+
+int
+zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ vnode_t *dvp, *vp;
+ zfsctl_snapdir_t *sdp;
+ zfsctl_node_t *zcp;
+ zfs_snapentry_t *sep;
+ int error;
+
+ ASSERT(zfsvfs->z_ctldir != NULL);
+ error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
+ NULL, 0, NULL, kcred);
+ if (error != 0)
+ return (error);
+ sdp = dvp->v_data;
+
+ mutex_enter(&sdp->sd_lock);
+ sep = avl_first(&sdp->sd_snaps);
+ while (sep != NULL) {
+ vp = sep->se_root;
+ zcp = vp->v_data;
+ if (zcp->zc_id == objsetid)
+ break;
+
+ sep = AVL_NEXT(&sdp->sd_snaps, sep);
+ }
+
+ if (sep != NULL) {
+ VN_HOLD(vp);
+ error = traverse(&vp);
+ if (error == 0) {
+ if (vp == sep->se_root)
+ error = EINVAL;
+ else
+ *zfsvfsp = VTOZ(vp)->z_zfsvfs;
+ }
+ mutex_exit(&sdp->sd_lock);
+ VN_RELE(vp);
+ } else {
+ error = EINVAL;
+ mutex_exit(&sdp->sd_lock);
+ }
+
+ VN_RELE(dvp);
+
+ return (error);
+}
+
+/*
+ * Unmount any snapshots for the given filesystem. This is called from
+ * zfs_umount() - if we have a ctldir, then go through and unmount all the
+ * snapshots.
+ */
+int
+zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
+{
+ struct vop_inactive_args ap;
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ vnode_t *dvp, *svp;
+ zfsctl_snapdir_t *sdp;
+ zfs_snapentry_t *sep, *next;
+ int error;
+
+ ASSERT(zfsvfs->z_ctldir != NULL);
+ error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
+ NULL, 0, NULL, cr);
+ if (error != 0)
+ return (error);
+ sdp = dvp->v_data;
+
+ mutex_enter(&sdp->sd_lock);
+
+ sep = avl_first(&sdp->sd_snaps);
+ while (sep != NULL) {
+ svp = sep->se_root;
+ next = AVL_NEXT(&sdp->sd_snaps, sep);
+
+ /*
+ * If this snapshot is not mounted, then it must
+ * have just been unmounted by somebody else, and
+ * will be cleaned up by zfsctl_snapdir_inactive().
+ */
+ if (vn_ismntpt(svp)) {
+ if ((error = vn_vfswlock(svp)) != 0)
+ goto out;
+
+ /*
+ * Increase usecount, so dounmount() won't vrele() it
+ * to 0 and call zfsctl_snapdir_inactive().
+ */
+ VN_HOLD(svp);
+ vfsp = vn_mountedvfs(svp);
+ mtx_lock(&Giant);
+ error = dounmount(vfsp, fflags, curthread);
+ mtx_unlock(&Giant);
+ if (error != 0) {
+ VN_RELE(svp);
+ goto out;
+ }
+
+ avl_remove(&sdp->sd_snaps, sep);
+ kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+ kmem_free(sep, sizeof (zfs_snapentry_t));
+
+ /*
+ * We can't use VN_RELE(), as that will try to
+ * invoke zfsctl_snapdir_inactive(), and that
+ * would lead to an attempt to re-grab the sd_lock.
+ */
+ ASSERT3U(svp->v_count, ==, 1);
+ ap.a_vp = svp;
+ gfs_vop_inactive(&ap);
+ }
+ sep = next;
+ }
+out:
+ mutex_exit(&sdp->sd_lock);
+ VN_RELE(dvp);
+
+ return (error);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
new file mode 100644
index 0000000..486aa74
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
@@ -0,0 +1,796 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/unistd.h>
+#include <sys/random.h>
+#include <sys/kcondvar.h>
+#include <sys/callb.h>
+#include <sys/smp.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/dnlc.h>
+
+/*
+ * Lock a directory entry. A dirlock on <dzp, name> protects that name
+ * in dzp's directory zap object. As long as you hold a dirlock, you can
+ * assume two things: (1) dzp cannot be reaped, and (2) no other thread
+ * can change the zap entry for (i.e. link or unlink) this name.
+ *
+ * Input arguments:
+ * dzp - znode for directory
+ * name - name of entry to lock
+ * flag - ZNEW: if the entry already exists, fail with EEXIST.
+ * ZEXISTS: if the entry does not exist, fail with ENOENT.
+ * ZSHARED: allow concurrent access with other ZSHARED callers.
+ * ZXATTR: we want dzp's xattr directory
+ *
+ * Output arguments:
+ * zpp - pointer to the znode for the entry (NULL if there isn't one)
+ * dlpp - pointer to the dirlock for this entry (NULL on error)
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ */
+int
+zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
+ int flag)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zfs_dirlock_t *dl;
+ uint64_t zoid;
+ int error;
+ vnode_t *vp;
+
+ *zpp = NULL;
+ *dlpp = NULL;
+
+ /*
+ * Verify that we are not trying to lock '.', '..', or '.zfs'
+ */
+	if ((name[0] == '.' &&
+	    (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||
+	    (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))
+ return (EEXIST);
+
+ /*
+ * Wait until there are no locks on this name.
+ */
+ rw_enter(&dzp->z_name_lock, RW_READER);
+ mutex_enter(&dzp->z_lock);
+ for (;;) {
+ if (dzp->z_unlinked) {
+ mutex_exit(&dzp->z_lock);
+ rw_exit(&dzp->z_name_lock);
+ return (ENOENT);
+ }
+ for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next)
+ if (strcmp(name, dl->dl_name) == 0)
+ break;
+ if (dl == NULL) {
+ /*
+ * Allocate a new dirlock and add it to the list.
+ */
+ dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
+ cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
+ dl->dl_name = name;
+ dl->dl_sharecnt = 0;
+ dl->dl_namesize = 0;
+ dl->dl_dzp = dzp;
+ dl->dl_next = dzp->z_dirlocks;
+ dzp->z_dirlocks = dl;
+ break;
+ }
+ if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
+ break;
+ cv_wait(&dl->dl_cv, &dzp->z_lock);
+ }
+
+ if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
+ /*
+ * We're the second shared reference to dl. Make a copy of
+ * dl_name in case the first thread goes away before we do.
+ * Note that we initialize the new name before storing its
+ * pointer into dl_name, because the first thread may load
+		 * dl->dl_name at any time. It will either see the old value,
+		 * which it owns, or the new shared copy; either is OK.
+ */
+ dl->dl_namesize = strlen(dl->dl_name) + 1;
+ name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
+ bcopy(dl->dl_name, name, dl->dl_namesize);
+ dl->dl_name = name;
+ }
+
+ mutex_exit(&dzp->z_lock);
+
+ /*
+ * We have a dirlock on the name. (Note that it is the dirlock,
+ * not the dzp's z_lock, that protects the name in the zap object.)
+ * See if there's an object by this name; if so, put a hold on it.
+ */
+ if (flag & ZXATTR) {
+ zoid = dzp->z_phys->zp_xattr;
+ error = (zoid == 0 ? ENOENT : 0);
+ } else {
+ vp = dnlc_lookup(ZTOV(dzp), name);
+ if (vp == DNLC_NO_VNODE) {
+ VN_RELE(vp);
+ error = ENOENT;
+ } else if (vp) {
+ if (flag & ZNEW) {
+ zfs_dirent_unlock(dl);
+ VN_RELE(vp);
+ return (EEXIST);
+ }
+ *dlpp = dl;
+ *zpp = VTOZ(vp);
+ return (0);
+ } else {
+ error = zap_lookup(zfsvfs->z_os, dzp->z_id, name,
+ 8, 1, &zoid);
+ zoid = ZFS_DIRENT_OBJ(zoid);
+ if (error == ENOENT)
+ dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
+ }
+ }
+ if (error) {
+ if (error != ENOENT || (flag & ZEXISTS)) {
+ zfs_dirent_unlock(dl);
+ return (error);
+ }
+ } else {
+ if (flag & ZNEW) {
+ zfs_dirent_unlock(dl);
+ return (EEXIST);
+ }
+ error = zfs_zget(zfsvfs, zoid, zpp);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ return (error);
+ }
+ if (!(flag & ZXATTR))
+ dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
+ }
+
+ *dlpp = dl;
+
+ return (0);
+}
+
+/*
+ * Unlock this directory entry and wake anyone who was waiting for it.
+ */
+void
+zfs_dirent_unlock(zfs_dirlock_t *dl)
+{
+ znode_t *dzp = dl->dl_dzp;
+ zfs_dirlock_t **prev_dl, *cur_dl;
+
+ mutex_enter(&dzp->z_lock);
+ rw_exit(&dzp->z_name_lock);
+ if (dl->dl_sharecnt > 1) {
+ dl->dl_sharecnt--;
+ mutex_exit(&dzp->z_lock);
+ return;
+ }
+ prev_dl = &dzp->z_dirlocks;
+ while ((cur_dl = *prev_dl) != dl)
+ prev_dl = &cur_dl->dl_next;
+ *prev_dl = dl->dl_next;
+ cv_broadcast(&dl->dl_cv);
+ mutex_exit(&dzp->z_lock);
+
+ if (dl->dl_namesize != 0)
+ kmem_free(dl->dl_name, dl->dl_namesize);
+ cv_destroy(&dl->dl_cv);
+ kmem_free(dl, sizeof (*dl));
+}
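+
+/*
+ * Usage sketch (editor's illustration; compare zfs_dirlook() below):
+ * a typical caller resolves an existing entry, then drops the dirlock
+ * as soon as it holds the znode (the hold on zp outlives the dirlock):
+ *
+ *	zfs_dirlock_t *dl;
+ *	znode_t *zp;
+ *	int error;
+ *
+ *	error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED);
+ *	if (error == 0) {
+ *		vp = ZTOV(zp);
+ *		zfs_dirent_unlock(dl);
+ *	}
+ */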
+
+/*
+ * Look up an entry in a directory.
+ *
+ * NOTE: '.' and '..' are handled as special cases because
+ * no directory entries are actually stored for them. If this is
+ * the root of a filesystem, then '.zfs' is also treated as a
+ * special pseudo-directory.
+ */
+int
+zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp)
+{
+ zfs_dirlock_t *dl;
+ znode_t *zp;
+ int error = 0;
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ *vpp = ZTOV(dzp);
+ VN_HOLD(*vpp);
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ /*
+ * If we are a snapshot mounted under .zfs, return
+ * the vp for the snapshot directory.
+ */
+ if (dzp->z_phys->zp_parent == dzp->z_id &&
+ zfsvfs->z_parent != zfsvfs) {
+ error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
+ "snapshot", vpp, NULL, 0, NULL, kcred);
+ return (error);
+ }
+ rw_enter(&dzp->z_parent_lock, RW_READER);
+ error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
+ if (error == 0)
+ *vpp = ZTOV(zp);
+ rw_exit(&dzp->z_parent_lock);
+ } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
+ *vpp = zfsctl_root(dzp);
+ } else {
+ error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED);
+ if (error == 0) {
+ *vpp = ZTOV(zp);
+ zfs_dirent_unlock(dl);
+ dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+ }
+ }
+
+ return (error);
+}
+
+static char *
+zfs_unlinked_hexname(char namebuf[17], uint64_t x)
+{
+ char *name = &namebuf[16];
+ const char digits[16] = "0123456789abcdef";
+
+ *name = '\0';
+ do {
+ *--name = digits[x & 0xf];
+ x >>= 4;
+ } while (x != 0);
+
+ return (name);
+}
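+
+/*
+ * Example (editor's note): digits are written backwards from the end
+ * of the 17-byte buffer, so zfs_unlinked_hexname(buf, 0x1a2b) returns
+ * a pointer to the string "1a2b" inside buf, and an id of 0 yields "0".
+ */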
+
+/*
+ * Unlinked Set (formerly known as the "delete queue") Error Handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating. We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem). So on the small
+ * chance that the unlinked set is using a fat zap (i.e. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error. On a
+ * nondebug system, this will result in the space being leaked.
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ char obj_name[17];
+ int error;
+
+ ASSERT(zp->z_unlinked);
+ ASSERT3U(zp->z_phys->zp_links, ==, 0);
+
+ error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+ zfs_unlinked_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx);
+ ASSERT3U(error, ==, 0);
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ dmu_object_info_t doi;
+ znode_t *zp;
+ int error;
+
+ /*
+	 * Iterate over the contents of the unlinked set.
+ */
+ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+ zap_cursor_retrieve(&zc, &zap) == 0;
+ zap_cursor_advance(&zc)) {
+
+ /*
+		 * See what kind of object we have in the list.
+ */
+
+ error = dmu_object_info(zfsvfs->z_os,
+ zap.za_first_integer, &doi);
+ if (error != 0)
+ continue;
+
+ ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+ (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+ /*
+ * We need to re-mark these list entries for deletion,
+ * so we pull them back into core and set zp->z_unlinked.
+ */
+ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+ /*
+ * We may pick up znodes that are already marked for deletion.
+ * This could happen during the purge of an extended attribute
+ * directory. All we need to do is skip over them, since they
+ * are already in the system marked z_unlinked.
+ */
+ if (error != 0)
+ continue;
+
+ zp->z_unlinked = B_TRUE;
+ VN_RELE(ZTOV(zp));
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Delete the entire contents of a directory. Return a count
+ * of the number of entries that could not be deleted.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ * so there is no need to lock its entries before deletion.
+ * Also, it assumes the directory contains *only* regular
+ * files and symlinks.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zfs_dirlock_t dl;
+ int skipped = 0;
+ int error;
+
+ for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+ (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+ zap_cursor_advance(&zc)) {
+ error = zfs_zget(zfsvfs,
+ ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
+ ASSERT3U(error, ==, 0);
+
+ ASSERT((ZTOV(xzp)->v_type == VREG) ||
+ (ZTOV(xzp)->v_type == VLNK));
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
+ dmu_tx_hold_bonus(tx, xzp->z_id);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ VN_RELE(ZTOV(xzp));
+ skipped += 1;
+ continue;
+ }
+ bzero(&dl, sizeof (dl));
+ dl.dl_dzp = dzp;
+ dl.dl_name = zap.za_name;
+
+ error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+ ASSERT3U(error, ==, 0);
+ dmu_tx_commit(tx);
+
+ VN_RELE(ZTOV(xzp));
+ }
+ zap_cursor_fini(&zc);
+ ASSERT(error == ENOENT);
+ return (skipped);
+}
+
+void
+zfs_rmnode(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ znode_t *xzp = NULL;
+ char obj_name[17];
+ dmu_tx_t *tx;
+ uint64_t acl_obj;
+ int error;
+ int vfslocked;
+
+ vfslocked = VFS_LOCK_GIANT(zfsvfs->z_vfs);
+
+ ASSERT(zp->z_phys->zp_links == 0);
+
+ /*
+ * If this is an attribute directory, purge its contents.
+ */
+ if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
+ (zp->z_phys->zp_flags & ZFS_XATTR)) {
+ if (zfs_purgedir(zp) != 0) {
+ /*
+ * Not enough space to delete some xattrs.
+ * Leave it on the unlinked set.
+ */
+ VFS_UNLOCK_GIANT(vfslocked);
+ return;
+ }
+ }
+
+ /*
+ * If the file has extended attributes, we're going to unlink
+ * the xattr dir.
+ */
+ if (zp->z_phys->zp_xattr) {
+ error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
+ ASSERT(error == 0);
+ }
+
+ acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+
+ /*
+ * Set up the transaction.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ if (xzp) {
+ dmu_tx_hold_bonus(tx, xzp->z_id);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+ }
+ if (acl_obj)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ /*
+ * Not enough space to delete the file. Leave it in the
+ * unlinked set, leaking it until the fs is remounted (at
+ * which point we'll call zfs_unlinked_drain() to process it).
+ */
+ dmu_tx_abort(tx);
+ VFS_UNLOCK_GIANT(vfslocked);
+ return;
+ }
+
+ if (xzp) {
+ dmu_buf_will_dirty(xzp->z_dbuf, tx);
+ mutex_enter(&xzp->z_lock);
+ xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
+ xzp->z_phys->zp_links = 0; /* no more links to it */
+ mutex_exit(&xzp->z_lock);
+ zfs_unlinked_add(xzp, tx);
+ }
+
+ /* Remove this znode from the unlinked set */
+ error = zap_remove(os, zfsvfs->z_unlinkedobj,
+ zfs_unlinked_hexname(obj_name, zp->z_id), tx);
+ ASSERT3U(error, ==, 0);
+
+ zfs_znode_delete(zp, tx);
+
+ dmu_tx_commit(tx);
+
+ if (xzp)
+ VN_RELE(ZTOV(xzp));
+ VFS_UNLOCK_GIANT(vfslocked);
+}
+
+/*
+ * Link zp into dl. Can only fail if zp has been unlinked.
+ */
+int
+zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+{
+ znode_t *dzp = dl->dl_dzp;
+ vnode_t *vp = ZTOV(zp);
+ uint64_t value;
+ int zp_is_dir = (vp->v_type == VDIR);
+ int error;
+
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+ mutex_enter(&zp->z_lock);
+
+ if (!(flag & ZRENAMING)) {
+ if (zp->z_unlinked) { /* no new links to unlinked zp */
+ ASSERT(!(flag & (ZNEW | ZEXISTS)));
+ mutex_exit(&zp->z_lock);
+ return (ENOENT);
+ }
+ zp->z_phys->zp_links++;
+ }
+ zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */
+
+ if (!(flag & ZNEW))
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+ mutex_exit(&zp->z_lock);
+
+ dmu_buf_will_dirty(dzp->z_dbuf, tx);
+ mutex_enter(&dzp->z_lock);
+ dzp->z_phys->zp_size++; /* one dirent added */
+ dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */
+ zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+ mutex_exit(&dzp->z_lock);
+
+ /*
+	 * Mac OS X will fill in the 4-bit object type here.
+ */
+ value = ZFS_DIRENT_MAKE(IFTODT(zp->z_phys->zp_mode), zp->z_id);
+ error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
+ 8, 1, &value, tx);
+ ASSERT(error == 0);
+
+ dnlc_update(ZTOV(dzp), dl->dl_name, vp);
+
+ return (0);
+}
+
+/*
+ * Unlink zp from dl, and mark zp for deletion if this was the last link.
+ * Can fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY).
+ * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
+ * If it's non-NULL, we use it to indicate whether the znode needs deletion,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
+ boolean_t *unlinkedp)
+{
+ znode_t *dzp = dl->dl_dzp;
+ vnode_t *vp = ZTOV(zp);
+ int zp_is_dir = (vp->v_type == VDIR);
+ boolean_t unlinked = B_FALSE;
+ int error;
+
+ dnlc_remove(ZTOV(dzp), dl->dl_name);
+
+ if (!(flag & ZRENAMING)) {
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+ if (vn_vfswlock(vp)) /* prevent new mounts on zp */
+ return (EBUSY);
+
+ if (vn_ismntpt(vp)) { /* don't remove mount point */
+ vn_vfsunlock(vp);
+ return (EBUSY);
+ }
+
+ mutex_enter(&zp->z_lock);
+ if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */
+ mutex_exit(&zp->z_lock);
+ vn_vfsunlock(vp);
+ return (ENOTEMPTY);
+ }
+ if (zp->z_phys->zp_links <= zp_is_dir) {
+ zfs_panic_recover("zfs: link count on vnode %p is %u, "
+ "should be at least %u", zp->z_vnode,
+ (int)zp->z_phys->zp_links,
+ zp_is_dir + 1);
+ zp->z_phys->zp_links = zp_is_dir + 1;
+ }
+ if (--zp->z_phys->zp_links == zp_is_dir) {
+ zp->z_unlinked = B_TRUE;
+ zp->z_phys->zp_links = 0;
+ unlinked = B_TRUE;
+ } else {
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+ }
+ mutex_exit(&zp->z_lock);
+ vn_vfsunlock(vp);
+ }
+
+ dmu_buf_will_dirty(dzp->z_dbuf, tx);
+ mutex_enter(&dzp->z_lock);
+ dzp->z_phys->zp_size--; /* one dirent removed */
+ dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */
+ zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+ mutex_exit(&dzp->z_lock);
+
+ error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, tx);
+ ASSERT(error == 0);
+
+ if (unlinkedp != NULL)
+ *unlinkedp = unlinked;
+ else if (unlinked)
+ zfs_unlinked_add(zp, tx);
+
+ return (0);
+}
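+
+/*
+ * Example (editor's note): for a plain file, zp_is_dir == 0, so the
+ * last unlink above takes zp_links from 1 to 0 and marks zp unlinked.
+ * For an empty directory, zp_is_dir == 1 and zp_links includes the "."
+ * self-link, so removal takes zp_links from 2 to 1 == zp_is_dir, which
+ * likewise marks it unlinked (zp_links is then forced to 0).
+ */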
+
+/*
+ * Indicate whether the directory is empty. Works with or without z_lock
+ * held, but can only be considered a hint in the latter case. Returns true
+ * if only "." and ".." remain and there's no work in progress.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+ return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0);
+}
+
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ uint64_t xoid;
+ int error;
+
+ *xvpp = NULL;
+
+ if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, cr))
+ return (error);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ zfs_mknode(zp, vap, &xoid, tx, cr, IS_XATTR, &xzp, 0);
+ ASSERT(xzp->z_id == xoid);
+ ASSERT(xzp->z_phys->zp_parent == zp->z_id);
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+ zp->z_phys->zp_xattr = xoid;
+
+ (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "");
+ dmu_tx_commit(tx);
+
+ *xvpp = ZTOV(xzp);
+
+ return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ * IN: zp - znode to obtain attribute directory from
+ * cr - credentials of caller
+ * flags - flags from the VOP_LOOKUP call
+ *
+ * OUT: xzpp - pointer to extended attribute znode
+ *
+ * RETURN: 0 on success
+ * error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ zfs_dirlock_t *dl;
+ vattr_t va;
+ int error;
+top:
+ error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR);
+ if (error)
+ return (error);
+
+ if (xzp != NULL) {
+ *xvpp = ZTOV(xzp);
+ zfs_dirent_unlock(dl);
+ return (0);
+ }
+
+ ASSERT(zp->z_phys->zp_xattr == 0);
+
+#ifdef TODO
+ if (!(flags & CREATE_XATTR_DIR)) {
+ zfs_dirent_unlock(dl);
+ return (ENOENT);
+ }
+#endif
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ zfs_dirent_unlock(dl);
+ return (EROFS);
+ }
+
+ /*
+ * The ability to 'create' files in an attribute
+ * directory comes from the write_xattr permission on the base file.
+ *
+ * The ability to 'search' an attribute directory requires
+ * read_xattr permission on the base file.
+ *
+ * Once in a directory the ability to read/write attributes
+ * is controlled by the permissions on the attribute file.
+ */
+ va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
+ va.va_type = VDIR;
+ va.va_mode = S_IFDIR | S_ISVTX | 0777;
+ va.va_uid = (uid_t)zp->z_phys->zp_uid;
+ va.va_gid = (gid_t)zp->z_phys->zp_gid;
+
+ error = zfs_make_xattrdir(zp, &va, xvpp, cr);
+ zfs_dirent_unlock(dl);
+
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ /* NB: we already did dmu_tx_wait() if necessary */
+ goto top;
+ }
+
+ return (error);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ * you own the directory,
+ * you own the entry,
+ * the entry is a plain file and you have write access,
+ * or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+ uid_t uid;
+
+ if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */
+ return (0);
+
+ if ((zdp->z_phys->zp_mode & S_ISVTX) == 0 ||
+ (uid = crgetuid(cr)) == zdp->z_phys->zp_uid ||
+ uid == zp->z_phys->zp_uid ||
+ (ZTOV(zp)->v_type == VREG &&
+ zfs_zaccess(zp, ACE_WRITE_DATA, cr) == 0))
+ return (0);
+ else
+ return (secpolicy_vnode_remove(cr));
+}
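+
+/*
+ * Example (editor's note, hypothetical uids): in a sticky directory
+ * owned by uid 0, a caller with uid 100 may remove entry "f" if it
+ * owns "f", or if "f" is a plain file it can write; anything else is
+ * left to secpolicy_vnode_remove() (e.g. privileged callers).
+ */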
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
new file mode 100644
index 0000000..af765ba
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
@@ -0,0 +1,336 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+#include <sys/fm/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+
+#ifdef _KERNEL
+/* Including sys/bus.h is just too hard, so I declare what I need here. */
+extern void devctl_notify(const char *__system, const char *__subsystem,
+ const char *__type, const char *__data);
+#endif
+
+/*
+ * This general routine is responsible for generating all the different ZFS
+ * ereports. The payload is dependent on the class, and which arguments are
+ * supplied to the function:
+ *
+ * EREPORT POOL VDEV IO
+ * block X X X
+ * data X X
+ * device X X
+ * pool X
+ *
+ * If we are in a loading state, all errors are chained together by the same
+ * SPA-wide ENA.
+ *
+ * For isolated I/O requests, we get the ENA from the zio_t. The propagation
+ * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
+ * to chain together all ereports associated with a logical piece of data. For
+ * read I/Os, there are basically three 'types' of I/O, which form a roughly
+ * layered diagram:
+ *
+ * +---------------+
+ * | Aggregate I/O | No associated logical data or device
+ * +---------------+
+ * |
+ * V
+ * +---------------+ Reads associated with a piece of logical data.
+ * | Read I/O | This includes reads on behalf of RAID-Z,
+ * +---------------+ mirrors, gang blocks, retries, etc.
+ * |
+ * V
+ * +---------------+ Reads associated with a particular device, but
+ * | Physical I/O | no logical data. Issued as part of vdev caching
+ * +---------------+ and I/O aggregation.
+ *
+ * Note that 'physical I/O' here is not the same terminology as used in the rest
+ * of ZIO. Typically, 'physical I/O' simply means that there is no attached
+ * blockpointer. But I/O with no associated block pointer can still be related
+ * to a logical piece of data (i.e. RAID-Z requests).
+ *
+ * Purely physical I/Os always have unique ENAs. They are not related to a
+ * particular piece of logical data, and therefore cannot be chained together.
+ * We still generate an ereport, but the DE doesn't correlate it with any
+ * logical piece of data. When such an I/O fails, the delegated I/O requests
+ * will issue a retry, which will trigger the 'real' ereport with the correct
+ * ENA.
+ *
+ * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
+ * When a new logical I/O is issued, we set this to point to itself. Child I/Os
+ * then inherit this pointer, so that when it is first set subsequent failures
+ * will use the same ENA. If a physical I/O is issued (by passing the
+ * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a
+ * unique ENA will be generated. For an aggregate I/O, this pointer is set to
+ * NULL, and no ereport will be generated (since it doesn't actually correspond
+ * to any particular device or piece of data).
+ */
+void
+zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
+ uint64_t stateoroffset, uint64_t size)
+{
+#ifdef _KERNEL
+ char buf[1024];
+ char class[64];
+ struct sbuf sb;
+ struct timespec ts;
+
+ /*
+ * If we are doing a spa_tryimport(), ignore errors.
+ */
+ if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+ return;
+
+ /*
+ * If we are in the middle of opening a pool, and the previous attempt
+ * failed, don't bother logging any new ereports - we're just going to
+ * get the same diagnosis anyway.
+ */
+ if (spa->spa_load_state != SPA_LOAD_NONE &&
+ spa->spa_last_open_failed)
+ return;
+
+ /*
+ * Ignore any errors from I/Os that we are going to retry anyway - we
+ * only generate errors from the final failure.
+ */
+ if (zio && zio_should_retry(zio))
+ return;
+
+ /*
+ * If this is not a read or write zio, ignore the error. This can occur
+ * if the DKIOCFLUSHWRITECACHE ioctl fails.
+ */
+ if (zio && zio->io_type != ZIO_TYPE_READ &&
+ zio->io_type != ZIO_TYPE_WRITE)
+ return;
+
+ nanotime(&ts);
+
+ sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
+ sbuf_printf(&sb, "time %ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);
+
+ /*
+ * Serialize ereport generation
+ */
+ mutex_enter(&spa->spa_errlist_lock);
+
+#if 0
+ /*
+ * Determine the ENA to use for this event. If we are in a loading
+ * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
+ * a root zio-wide ENA. Otherwise, simply use a unique ENA.
+ */
+ if (spa->spa_load_state != SPA_LOAD_NONE) {
+#if 0
+ if (spa->spa_ena == 0)
+ spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
+#endif
+ ena = spa->spa_ena;
+ } else if (zio != NULL && zio->io_logical != NULL) {
+#if 0
+ if (zio->io_logical->io_ena == 0)
+ zio->io_logical->io_ena =
+ fm_ena_generate(0, FM_ENA_FMT1);
+#endif
+ ena = zio->io_logical->io_ena;
+ } else {
+#if 0
+ ena = fm_ena_generate(0, FM_ENA_FMT1);
+#else
+ ena = 0;
+#endif
+ }
+#endif
+
+ /*
+ * Construct the full class, detector, and other standard FMA fields.
+ */
+ sbuf_printf(&sb, " ereport_version %u", FM_EREPORT_VERSION);
+ snprintf(class, sizeof(class), "%s.%s", ZFS_ERROR_CLASS, subclass);
+ sbuf_printf(&sb, " class %s", class);
+
+ sbuf_printf(&sb, " zfs_scheme_version %u", FM_ZFS_SCHEME_VERSION);
+
+ /*
+ * Construct the per-ereport payload, depending on which parameters are
+ * passed in.
+ */
+
+ /*
+ * Generic payload members common to all ereports.
+ *
+ * The direct reference to spa_name is used rather than spa_name()
+ * because of the asynchronous nature of the zio pipeline. spa_name()
+ * asserts that the config lock is held in some form. This is always
+ * the case in I/O context, but because the check for RW_WRITER compares
+ * against 'curthread', we may be in an asynchronous context and blow
+ * this assert. Rather than loosen this assert, we acknowledge that all
+ * contexts in which this function is called (pool open, I/O) are safe,
+ * and dereference the name directly.
+ */
+ sbuf_printf(&sb, " %s %s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa->spa_name);
+ sbuf_printf(&sb, " %s %ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+ spa_guid(spa));
+ sbuf_printf(&sb, " %s %u", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT,
+ spa->spa_load_state);
+
+ if (vd != NULL) {
+ vdev_t *pvd = vd->vdev_parent;
+
+ sbuf_printf(&sb, " %s %ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+ vd->vdev_guid);
+ sbuf_printf(&sb, " %s %s", FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+ vd->vdev_ops->vdev_op_type);
+ if (vd->vdev_path)
+ sbuf_printf(&sb, " %s %s",
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path);
+ if (vd->vdev_devid)
+ sbuf_printf(&sb, " %s %s",
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid);
+
+ if (pvd != NULL) {
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, pvd->vdev_guid);
+ sbuf_printf(&sb, " %s %s",
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
+ pvd->vdev_ops->vdev_op_type);
+ if (pvd->vdev_path)
+ sbuf_printf(&sb, " %s %s",
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
+ pvd->vdev_path);
+ if (pvd->vdev_devid)
+ sbuf_printf(&sb, " %s %s",
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
+ pvd->vdev_devid);
+ }
+ }
+
+ if (zio != NULL) {
+ /*
+ * Payload common to all I/Os.
+ */
+ sbuf_printf(&sb, " %s %u", FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
+ zio->io_error);
+
+ /*
+ * If the 'size' parameter is non-zero, it indicates this is a
+ * RAID-Z or other I/O where the physical offset and length are
+ * provided for us, instead of within the zio_t.
+ */
+ if (vd != NULL) {
+ if (size) {
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ stateoroffset);
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, size);
+ } else {
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ zio->io_offset);
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ zio->io_size);
+ }
+ }
+
+ /*
+ * Payload for I/Os with corresponding logical information.
+ */
+ if (zio->io_logical != NULL) {
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
+ zio->io_logical->io_bookmark.zb_object);
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
+ zio->io_logical->io_bookmark.zb_level);
+ sbuf_printf(&sb, " %s %ju",
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
+ zio->io_logical->io_bookmark.zb_blkid);
+ }
+ } else if (vd != NULL) {
+ /*
+ * If we have a vdev but no zio, this is a device fault, and the
+ * 'stateoroffset' parameter indicates the previous state of the
+ * vdev.
+ */
+ sbuf_printf(&sb, " %s %ju", FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
+ stateoroffset);
+ }
+ mutex_exit(&spa->spa_errlist_lock);
+
+ sbuf_finish(&sb);
+ devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb));
+ if (sbuf_overflowed(&sb))
+ printf("ZFS WARNING: sbuf overflowed\n");
+ sbuf_delete(&sb);
+#endif
+}
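+
+/*
+ * Editor's note: the event assembled above reaches devctl(4) as a
+ * single space-separated "key value" string, schematically (payload
+ * key names are whatever the FM_EREPORT_PAYLOAD_ZFS_* macros expand
+ * to; values here are hypothetical):
+ *
+ *	time <sec>.<nsec> ereport_version <n> class <ZFS_ERROR_CLASS>.<subclass>
+ *	zfs_scheme_version <n> <pool keys> [<vdev keys>] [<zio keys>]
+ */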
+
+/*
+ * The 'resource.fs.zfs.ok' event is an internal signal that the associated
+ * resource (pool or disk) has been identified by ZFS as healthy. This will
+ * then trigger the DE to close the associated case, if any.
+ */
+void
+zfs_post_ok(spa_t *spa, vdev_t *vd)
+{
+#ifdef _KERNEL
+ char buf[1024];
+ char class[64];
+ struct sbuf sb;
+ struct timespec ts;
+
+ nanotime(&ts);
+
+ sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
+ sbuf_printf(&sb, "time %ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);
+
+ snprintf(class, sizeof(class), "%s.%s.%s", FM_RSRC_RESOURCE,
+ ZFS_ERROR_CLASS, FM_RESOURCE_OK);
+ sbuf_printf(&sb, " %s %hhu", FM_VERSION, FM_RSRC_VERSION);
+ sbuf_printf(&sb, " %s %s", FM_CLASS, class);
+ sbuf_printf(&sb, " %s %ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+ spa_guid(spa));
+ if (vd)
+ sbuf_printf(&sb, " %s %ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+ vd->vdev_guid);
+ sbuf_finish(&sb);
+ devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb));
+ if (sbuf_overflowed(&sb))
+ printf("ZFS WARNING: sbuf overflowed\n");
+ sbuf_delete(&sb);
+#endif
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
new file mode 100644
index 0000000..aac1bb1
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
@@ -0,0 +1,1811 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/nvpair.h>
+#include <sys/mount.h>
+#include <sys/taskqueue.h>
+#include <sys/sdt.h>
+#include <sys/varargs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zvol.h>
+
+#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+
+CTASSERT(sizeof(zfs_cmd_t) <= PAGE_SIZE);
+
+static struct cdev *zfsdev;
+
+extern void zfs_init(void);
+extern void zfs_fini(void);
+
+typedef int zfs_ioc_func_t(zfs_cmd_t *);
+typedef int zfs_secpolicy_func_t(const char *, cred_t *);
+
+typedef struct zfs_ioc_vec {
+ zfs_ioc_func_t *zvec_func;
+ zfs_secpolicy_func_t *zvec_secpolicy;
+ enum {
+ no_name,
+ pool_name,
+ dataset_name
+ } zvec_namecheck;
+} zfs_ioc_vec_t;
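+
+/*
+ * Editor's note: the dispatch table built from zfs_ioc_vec_t (defined
+ * further down in this file, outside this hunk) pairs each ioctl
+ * handler with its policy check and name check; a hypothetical entry:
+ *
+ *	{ zfs_ioc_pool_create, zfs_secpolicy_config, pool_name },
+ */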
+
+/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiny */
+void
+__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
+{
+ const char *newfile;
+ char buf[256];
+ va_list adx;
+
+ /*
+ * Get rid of annoying "../common/" prefix to filename.
+ */
+ newfile = strrchr(file, '/');
+ if (newfile != NULL) {
+ newfile = newfile + 1; /* Get rid of leading / */
+ } else {
+ newfile = file;
+ }
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ /*
+ * To get this data, use the zfs-dprintf probe as so:
+ * dtrace -q -n 'zfs-dprintf \
+ * /stringof(arg0) == "dbuf.c"/ \
+ * {printf("%s: %s", stringof(arg1), stringof(arg3))}'
+ * arg0 = file name
+ * arg1 = function name
+ * arg2 = line number
+ * arg3 = message
+ */
+ DTRACE_PROBE4(zfs__dprintf,
+ char *, newfile, char *, func, int, line, char *, buf);
+}
+
+/*
+ * Policy for top-level read operations (list pools). Requires no privileges,
+ * and can be used in the local zone, as there is no associated dataset.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_none(const char *unused1, cred_t *cr)
+{
+ return (0);
+}
+
+/*
+ * Policy for dataset read operations (list children, get statistics). Requires
+ * no privileges, but must be visible in the local zone.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_read(const char *dataset, cred_t *cr)
+{
+ if (INGLOBALZONE(curproc) ||
+ zone_dataset_visible(dataset, NULL))
+ return (0);
+
+ return (ENOENT);
+}
+
+static int
+zfs_dozonecheck(const char *dataset, cred_t *cr)
+{
+ uint64_t zoned;
+ int writable = 1;
+
+ /*
+ * The dataset must be visible by this zone -- check this first
+ * so they don't see EPERM on something they shouldn't know about.
+ */
+ if (!INGLOBALZONE(curproc) &&
+ !zone_dataset_visible(dataset, &writable))
+ return (ENOENT);
+
+ if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL))
+ return (ENOENT);
+
+ if (INGLOBALZONE(curproc)) {
+ /*
+ * If the fs is zoned, only root can access it from the
+ * global zone.
+ */
+ if (secpolicy_zfs(cr) && zoned)
+ return (EPERM);
+ } else {
+ /*
+ * If we are in a local zone, the 'zoned' property must be set.
+ */
+ if (!zoned)
+ return (EPERM);
+
+ /* must be writable by this zone */
+ if (!writable)
+ return (EPERM);
+ }
+ return (0);
+}
+
+/*
+ * Policy for dataset write operations (create children, set properties, etc).
+ * Requires SYS_MOUNT privilege, and must be writable in the local zone.
+ */
+int
+zfs_secpolicy_write(const char *dataset, cred_t *cr)
+{
+ int error;
+
+ if (error = zfs_dozonecheck(dataset, cr))
+ return (error);
+
+ return (secpolicy_zfs(cr));
+}
+
+/*
+ * Policy for operations that want to write a dataset's parent:
+ * create, destroy, snapshot, clone, restore.
+ */
+static int
+zfs_secpolicy_parent(const char *dataset, cred_t *cr)
+{
+ char parentname[MAXNAMELEN];
+ char *cp;
+
+ /*
+ * Remove the @bla or /bla from the end of the name to get the parent.
+ */
+ (void) strncpy(parentname, dataset, sizeof (parentname));
+ cp = strrchr(parentname, '@');
+ if (cp != NULL) {
+ cp[0] = '\0';
+ } else {
+ cp = strrchr(parentname, '/');
+ if (cp == NULL)
+ return (ENOENT);
+ cp[0] = '\0';
+
+ }
+
+ return (zfs_secpolicy_write(parentname, cr));
+}
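+
+/*
+ * Example (editor's note, hypothetical names): "tank/home@snap" checks
+ * write policy on "tank/home", and "tank/home/user" checks "tank/home";
+ * a bare pool name such as "tank" has no parent, so ENOENT is returned.
+ */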
+
+/*
+ * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires
+ * SYS_CONFIG privilege, which is not available in a local zone.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_config(const char *unused, cred_t *cr)
+{
+ if (secpolicy_sys_config(cr, B_FALSE) != 0)
+ return (EPERM);
+
+ return (0);
+}
+
+/*
+ * Policy for fault injection. Requires all privileges.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_inject(const char *unused, cred_t *cr)
+{
+ return (secpolicy_zinject(cr));
+}
+
+/*
+ * Policy for dataset backup operations (sendbackup).
+ * Requires SYS_MOUNT privilege or operator group membership, and the
+ * dataset must be visible in the local zone.
+ */
+static int
+zfs_secpolicy_operator(const char *dataset, cred_t *cr)
+{
+ int writable = 1;
+
+ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable))
+ return (ENOENT);
+ if (secpolicy_zfs(cr) != 0 && !groupmember(GID_OPERATOR, cr))
+ return (EPERM);
+ return (0);
+}
+
+/*
+ * Returns the nvlist as specified by the user in the zfs_cmd_t.
+ */
+static int
+get_nvlist(zfs_cmd_t *zc, nvlist_t **nvp)
+{
+ char *packed;
+ size_t size;
+ int error;
+ nvlist_t *config = NULL;
+
+ /*
+ * Read in and unpack the user-supplied nvlist.
+ */
+ if ((size = zc->zc_nvlist_src_size) == 0)
+ return (EINVAL);
+
+ packed = kmem_alloc(size, KM_SLEEP);
+
+ if ((error = xcopyin((void *)(uintptr_t)zc->zc_nvlist_src, packed,
+ size)) != 0) {
+ kmem_free(packed, size);
+ return (error);
+ }
+
+ if ((error = nvlist_unpack(packed, size, &config, 0)) != 0) {
+ kmem_free(packed, size);
+ return (error);
+ }
+
+ kmem_free(packed, size);
+
+ *nvp = config;
+ return (0);
+}
+
+static int
+put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
+{
+ char *packed = NULL;
+ size_t size;
+ int error;
+
+ VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0);
+
+ if (size > zc->zc_nvlist_dst_size) {
+ /*
+		 * Solaris returns ENOMEM here because, even when an ioctl(2)
+		 * fails, the new zc_nvlist_dst_size is still copied out to
+		 * userland. That is not the case on FreeBSD, so we return 0;
+		 * the kernel then copies zc_nvlist_dst_size back and userland
+		 * can discover that a bigger buffer is needed.
+ */
+ error = 0;
+ } else {
+ VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
+ KM_SLEEP) == 0);
+ error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
+ size);
+ kmem_free(packed, size);
+ }
+
+ zc->zc_nvlist_dst_size = size;
+ return (error);
+}
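+
+/*
+ * Editor's sketch (hypothetical userland side, not part of this file;
+ * the ZFS_IOC_POOL_CONFIGS request name is assumed): a caller can
+ * discover the required buffer size by retrying while the kernel
+ * reports a larger zc_nvlist_dst_size than it supplied:
+ *
+ *	size_t len = 1024;
+ *	void *buf = NULL;
+ *
+ *	for (;;) {
+ *		buf = realloc(buf, len);
+ *		zc.zc_nvlist_dst = (uint64_t)(uintptr_t)buf;
+ *		zc.zc_nvlist_dst_size = len;
+ *		if (ioctl(fd, ZFS_IOC_POOL_CONFIGS, &zc) != 0)
+ *			break;
+ *		if (zc.zc_nvlist_dst_size <= len)
+ *			break;
+ *		len = zc.zc_nvlist_dst_size;
+ *	}
+ */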
+
+static int
+zfs_ioc_pool_create(zfs_cmd_t *zc)
+{
+ int error;
+ nvlist_t *config;
+
+ if ((error = get_nvlist(zc, &config)) != 0)
+ return (error);
+
+ error = spa_create(zc->zc_name, config, zc->zc_value[0] == '\0' ?
+ NULL : zc->zc_value);
+
+ nvlist_free(config);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_destroy(zfs_cmd_t *zc)
+{
+ return (spa_destroy(zc->zc_name));
+}
+
+static int
+zfs_ioc_pool_import(zfs_cmd_t *zc)
+{
+ int error;
+ nvlist_t *config;
+ uint64_t guid;
+
+ if ((error = get_nvlist(zc, &config)) != 0)
+ return (error);
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
+ guid != zc->zc_guid)
+ error = EINVAL;
+ else
+ error = spa_import(zc->zc_name, config,
+ zc->zc_value[0] == '\0' ? NULL : zc->zc_value);
+
+ nvlist_free(config);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_export(zfs_cmd_t *zc)
+{
+ return (spa_export(zc->zc_name, NULL));
+}
+
+static int
+zfs_ioc_pool_configs(zfs_cmd_t *zc)
+{
+ nvlist_t *configs;
+ int error;
+
+ if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
+ return (EEXIST);
+
+ error = put_nvlist(zc, configs);
+
+ nvlist_free(configs);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_stats(zfs_cmd_t *zc)
+{
+ nvlist_t *config;
+ int error;
+ int ret = 0;
+
+ error = spa_get_stats(zc->zc_name, &config, zc->zc_value,
+ sizeof (zc->zc_value));
+
+ if (config != NULL) {
+ ret = put_nvlist(zc, config);
+ nvlist_free(config);
+
+ /*
+ * The config may be present even if 'error' is non-zero.
+ * In this case we return success, and preserve the real errno
+ * in 'zc_cookie'.
+ */
+ zc->zc_cookie = error;
+ } else {
+ ret = error;
+ }
+
+ return (ret);
+}
+
+/*
+ * Try to import the given pool, returning pool stats as appropriate so that
+ * userland knows which devices are available and the overall pool health.
+ */
+static int
+zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
+{
+ nvlist_t *tryconfig, *config;
+ int error;
+
+ if ((error = get_nvlist(zc, &tryconfig)) != 0)
+ return (error);
+
+ config = spa_tryimport(tryconfig);
+
+ nvlist_free(tryconfig);
+
+ if (config == NULL)
+ return (EINVAL);
+
+ error = put_nvlist(zc, config);
+ nvlist_free(config);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_scrub(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_scrub(spa, zc->zc_cookie, B_FALSE);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_freeze(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error == 0) {
+ spa_freeze(spa);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+static int
+zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ spa_upgrade(spa);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_get_history(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ char *hist_buf;
+ uint64_t size;
+ int error;
+
+ if ((size = zc->zc_history_len) == 0)
+ return (EINVAL);
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) {
+ spa_close(spa, FTAG);
+ return (ENOTSUP);
+ }
+
+ hist_buf = kmem_alloc(size, KM_SLEEP);
+ if ((error = spa_history_get(spa, &zc->zc_history_offset,
+ &zc->zc_history_len, hist_buf)) == 0) {
+ error = xcopyout(hist_buf, (char *)(uintptr_t)zc->zc_history,
+ zc->zc_history_len);
+ }
+
+ spa_close(spa, FTAG);
+ kmem_free(hist_buf, size);
+ return (error);
+}
+
+static int
+zfs_ioc_pool_log_history(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ char *history_str = NULL;
+ size_t size;
+ int error;
+
+ size = zc->zc_history_len;
+ if (size == 0 || size > HIS_MAX_RECORD_LEN)
+ return (EINVAL);
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) {
+ spa_close(spa, FTAG);
+ return (ENOTSUP);
+ }
+
+ /* add one for the NULL delimiter */
+ size++;
+ history_str = kmem_alloc(size, KM_SLEEP);
+ if ((error = xcopyin((void *)(uintptr_t)zc->zc_history, history_str,
+ size)) != 0) {
+ spa_close(spa, FTAG);
+ kmem_free(history_str, size);
+ return (error);
+ }
+ history_str[size - 1] = '\0';
+
+ error = spa_history_log(spa, history_str, zc->zc_history_offset);
+
+ spa_close(spa, FTAG);
+ kmem_free(history_str, size);
+
+ return (error);
+}
+
+static int
+zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
+{
+ int error;
+
+ if (error = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value))
+ return (error);
+
+ return (0);
+}
+
+static int
+zfs_ioc_obj_to_path(zfs_cmd_t *zc)
+{
+ objset_t *osp;
+ int error;
+
+ if ((error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS,
+ DS_MODE_NONE | DS_MODE_READONLY, &osp)) != 0)
+ return (error);
+
+ error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value,
+ sizeof (zc->zc_value));
+ dmu_objset_close(osp);
+
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_add(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ nvlist_t *config;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ /*
+ * A root pool with concatenated devices is not supported.
+ * Thus, we cannot add a device to a root pool that has only one device.
+ */
+ if (spa->spa_root_vdev->vdev_children == 1 && spa->spa_bootfs != 0) {
+ spa_close(spa, FTAG);
+ return (EDOM);
+ }
+
+ if ((error = get_nvlist(zc, &config)) == 0) {
+ error = spa_vdev_add(spa, config);
+ nvlist_free(config);
+ }
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_remove(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_online(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+ error = vdev_online(spa, zc->zc_guid);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_offline(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int istmp = zc->zc_cookie;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+ error = vdev_offline(spa, zc->zc_guid, istmp);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_attach(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int replacing = zc->zc_cookie;
+ nvlist_t *config;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if ((error = get_nvlist(zc, &config)) == 0) {
+ error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
+ nvlist_free(config);
+ }
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_detach(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE);
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ char *path = zc->zc_value;
+ uint64_t guid = zc->zc_guid;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = spa_vdev_setpath(spa, guid, path);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_objset_stats(zfs_cmd_t *zc)
+{
+ objset_t *os = NULL;
+ int error;
+ nvlist_t *nv;
+
+retry:
+ error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+ if (error != 0) {
+ /*
+ * This is ugly: dmu_objset_open() can return EBUSY if
+ * the objset is held exclusively. Fortunately this hold is
+ * only for a short while, so we retry here.
+ * This avoids user code having to handle EBUSY,
+ * for example for a "zfs list".
+ */
+ if (error == EBUSY) {
+ delay(1);
+ goto retry;
+ }
+ return (error);
+ }
+
+ dmu_objset_fast_stat(os, &zc->zc_objset_stats);
+
+ if (zc->zc_nvlist_dst != 0 &&
+ (error = dsl_prop_get_all(os, &nv)) == 0) {
+ dmu_objset_stats(os, nv);
+ /*
+ * NB: zvol_get_stats() will read the objset contents,
+ * which we aren't supposed to do with a
+ * DS_MODE_STANDARD open, because it could be
+ * inconsistent. So this is a bit of a workaround...
+ */
+ if (!zc->zc_objset_stats.dds_inconsistent &&
+ dmu_objset_type(os) == DMU_OST_ZVOL)
+ VERIFY(zvol_get_stats(os, nv) == 0);
+ error = put_nvlist(zc, nv);
+ nvlist_free(nv);
+ }
+
+ spa_altroot(dmu_objset_spa(os), zc->zc_value, sizeof (zc->zc_value));
+
+ dmu_objset_close(os);
+ if (error == ENOMEM)
+ error = 0;
+ return (error);
+}
+
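+/*
+ * The ENOMEM-to-0 conversion above implements a grow-and-retry protocol:
+ * on overflow the required size is left in zc_nvlist_dst_size (as the
+ * handling above suggests put_nvlist() does), so the caller can enlarge
+ * its buffer and repeat.  A hypothetical userland sketch, assuming the
+ * ZFS_IOC_OBJSET_STATS request code:
+ *
+ *    for (;;) {
+ *        zc.zc_nvlist_dst = (uintptr_t)buf;
+ *        zc.zc_nvlist_dst_size = buflen;
+ *        if (ioctl(fd, ZFS_IOC_OBJSET_STATS, &zc) != 0)
+ *            break;
+ *        if (zc.zc_nvlist_dst_size <= buflen)
+ *            break;
+ *        buflen = zc.zc_nvlist_dst_size;
+ *        buf = realloc(buf, buflen);
+ *    }
+ */
+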
+static int
+zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+ char *p;
+
+retry:
+ error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+ if (error != 0) {
+ /*
+ * This is ugly: dmu_objset_open() can return EBUSY if
+ * the objset is held exclusively. Fortunately this hold is
+ * only for a short while, so we retry here.
+ * This avoids user code having to handle EBUSY,
+ * for example for a "zfs list".
+ */
+ if (error == EBUSY) {
+ delay(1);
+ goto retry;
+ }
+ if (error == ENOENT)
+ error = ESRCH;
+ return (error);
+ }
+
+ p = strrchr(zc->zc_name, '/');
+ if (p == NULL || p[1] != '\0')
+ (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
+ p = zc->zc_name + strlen(zc->zc_name);
+
+ do {
+ error = dmu_dir_list_next(os,
+ sizeof (zc->zc_name) - (p - zc->zc_name), p,
+ NULL, &zc->zc_cookie);
+ if (error == ENOENT)
+ error = ESRCH;
+ } while (error == 0 && !INGLOBALZONE(curproc) &&
+ !zone_dataset_visible(zc->zc_name, NULL));
+
+ /*
+ * If it's a hidden dataset (i.e. with a '$' in its name), don't
+ * try to get stats for it. Userland will skip over it.
+ */
+ if (error == 0 && strchr(zc->zc_name, '$') == NULL)
+ error = zfs_ioc_objset_stats(zc); /* fill in the stats */
+
+ dmu_objset_close(os);
+ return (error);
+}
+
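+/*
+ * The zc_cookie/ESRCH convention above yields a simple userland
+ * iterator.  A sketch, assuming the ZFS_IOC_DATASET_LIST_NEXT request
+ * code; zc_cookie carries the position between calls, and zc_name must
+ * be reset to the parent each time since the handler rewrites it with
+ * the child's full name:
+ *
+ *    zfs_cmd_t zc = { 0 };
+ *    for (;;) {
+ *        (void) strlcpy(zc.zc_name, "tank", sizeof (zc.zc_name));
+ *        if (ioctl(fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) != 0)
+ *            break;
+ *        printf("%s\n", zc.zc_name);
+ *    }
+ */
+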
+static int
+zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+retry:
+ error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+ if (error != 0) {
+ /*
+ * This is ugly: dmu_objset_open() can return EBUSY if
+ * the objset is held exclusively. Fortunately this hold is
+ * only for a short while, so we retry here.
+ * This avoids user code having to handle EBUSY,
+ * for example for a "zfs list".
+ */
+ if (error == EBUSY) {
+ delay(1);
+ goto retry;
+ }
+ if (error == ENOENT)
+ error = ESRCH;
+ return (error);
+ }
+
+ /*
+ * A dataset name of maximum length cannot have any snapshots,
+ * so exit immediately.
+ */
+ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
+ dmu_objset_close(os);
+ return (ESRCH);
+ }
+
+ error = dmu_snapshot_list_next(os,
+ sizeof (zc->zc_name) - strlen(zc->zc_name),
+ zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie);
+ if (error == ENOENT)
+ error = ESRCH;
+
+ if (error == 0)
+ error = zfs_ioc_objset_stats(zc); /* fill in the stats */
+
+ dmu_objset_close(os);
+ return (error);
+}
+
+static int
+zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl)
+{
+ nvpair_t *elem;
+ int error;
+ const char *propname;
+ zfs_prop_t prop;
+ uint64_t intval;
+ char *strval;
+ char buf[MAXNAMELEN];
+ const char *p;
+ spa_t *spa;
+
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+ propname = nvpair_name(elem);
+
+ if ((prop = zfs_name_to_prop(propname)) ==
+ ZFS_PROP_INVAL) {
+ /*
+ * If this is a user-defined property, it must be a
+ * string, and there is no further validation to do.
+ */
+ if (!zfs_prop_user(propname) ||
+ nvpair_type(elem) != DATA_TYPE_STRING)
+ return (EINVAL);
+
+ VERIFY(nvpair_value_string(elem, &strval) == 0);
+ error = dsl_prop_set(name, propname, 1,
+ strlen(strval) + 1, strval);
+ if (error == 0)
+ continue;
+ else
+ return (error);
+ }
+
+ /*
+ * Check permissions for special properties.
+ */
+ switch (prop) {
+ case ZFS_PROP_ZONED:
+ /*
+ * Disallow setting of 'zoned' from within a local zone.
+ */
+ if (!INGLOBALZONE(curproc))
+ return (EPERM);
+ break;
+
+ case ZFS_PROP_QUOTA:
+ if (error = zfs_dozonecheck(name, cr))
+ return (error);
+
+ if (!INGLOBALZONE(curproc)) {
+ uint64_t zoned;
+ char setpoint[MAXNAMELEN];
+ int dslen;
+ /*
+ * Unprivileged users are allowed to modify the
+ * quota on things *under* (i.e. contained by)
+ * the thing they own.
+ */
+ if (dsl_prop_get_integer(name, "jailed", &zoned,
+ setpoint))
+ return (EPERM);
+ if (!zoned) /* this shouldn't happen */
+ return (EPERM);
+ dslen = strlen(name);
+ if (dslen <= strlen(setpoint))
+ return (EPERM);
+ }
+ break;
+
+ case ZFS_PROP_COMPRESSION:
+ /*
+ * If the user specified gzip compression, make sure
+ * the SPA supports it. We ignore any errors here since
+ * we'll catch them later.
+ */
+ if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
+ nvpair_value_uint64(elem, &intval) == 0 &&
+ intval >= ZIO_COMPRESS_GZIP_1 &&
+ intval <= ZIO_COMPRESS_GZIP_9) {
+ if ((p = strchr(name, '/')) == NULL) {
+ p = name;
+ } else {
+ bcopy(name, buf, p - name);
+ buf[p - name] = '\0';
+ p = buf;
+ }
+
+ if (spa_open(p, &spa, FTAG) == 0) {
+ if (spa_version(spa) <
+ ZFS_VERSION_GZIP_COMPRESSION) {
+ spa_close(spa, FTAG);
+ return (ENOTSUP);
+ }
+
+ spa_close(spa, FTAG);
+ }
+ }
+ break;
+ }
+
+ switch (prop) {
+ case ZFS_PROP_QUOTA:
+ if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
+ (error = dsl_dir_set_quota(name,
+ intval)) != 0)
+ return (error);
+ break;
+
+ case ZFS_PROP_RESERVATION:
+ if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
+ (error = dsl_dir_set_reservation(name,
+ intval)) != 0)
+ return (error);
+ break;
+
+ case ZFS_PROP_VOLSIZE:
+ if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
+ (error = zvol_set_volsize(name, dev,
+ intval)) != 0)
+ return (error);
+ break;
+
+ case ZFS_PROP_VOLBLOCKSIZE:
+ if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
+ (error = zvol_set_volblocksize(name,
+ intval)) != 0)
+ return (error);
+ break;
+
+ default:
+ if (nvpair_type(elem) == DATA_TYPE_STRING) {
+ if (zfs_prop_get_type(prop) !=
+ prop_type_string)
+ return (EINVAL);
+ VERIFY(nvpair_value_string(elem, &strval) == 0);
+ if ((error = dsl_prop_set(name,
+ nvpair_name(elem), 1, strlen(strval) + 1,
+ strval)) != 0)
+ return (error);
+ } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
+ const char *unused;
+
+ VERIFY(nvpair_value_uint64(elem, &intval) == 0);
+
+ switch (zfs_prop_get_type(prop)) {
+ case prop_type_number:
+ break;
+ case prop_type_boolean:
+ if (intval > 1)
+ return (EINVAL);
+ break;
+ case prop_type_string:
+ return (EINVAL);
+ case prop_type_index:
+ if (zfs_prop_index_to_string(prop,
+ intval, &unused) != 0)
+ return (EINVAL);
+ break;
+ default:
+ cmn_err(CE_PANIC, "unknown property "
+ "type");
+ break;
+ }
+
+ if ((error = dsl_prop_set(name, propname,
+ 8, 1, &intval)) != 0)
+ return (error);
+ } else {
+ return (EINVAL);
+ }
+ break;
+ }
+ }
+
+ return (0);
+}
+
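+/*
+ * Callers hand zfs_set_prop_nvlist() an unpacked list of (name, value)
+ * pairs.  A sketch of building one with libnvpair (the property values
+ * are hypothetical; "local:backup" stands for any user property, which
+ * must contain a ':'):
+ *
+ *    nvlist_t *nvl;
+ *    VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ *    VERIFY(nvlist_add_uint64(nvl, "quota", 10ULL << 30) == 0);
+ *    VERIFY(nvlist_add_string(nvl, "local:backup", "weekly") == 0);
+ *    error = zfs_set_prop_nvlist("tank/home", dev, cr, nvl);
+ *    nvlist_free(nvl);
+ */
+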
+static int
+zfs_ioc_set_prop(zfs_cmd_t *zc)
+{
+ nvlist_t *nvl;
+ int error;
+ zfs_prop_t prop;
+
+ /*
+ * If zc_value is set, then this is an attempt to inherit a value.
+ * Otherwise, zc_nvlist_src refers to a list of properties to set.
+ */
+ if (zc->zc_value[0] != '\0') {
+ if (!zfs_prop_user(zc->zc_value) &&
+ ((prop = zfs_name_to_prop(zc->zc_value)) ==
+ ZFS_PROP_INVAL ||
+ !zfs_prop_inheritable(prop)))
+ return (EINVAL);
+
+ return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL));
+ }
+
+ if ((error = get_nvlist(zc, &nvl)) != 0)
+ return (error);
+
+ error = zfs_set_prop_nvlist(zc->zc_name, zc->zc_dev,
+ (cred_t *)(uintptr_t)zc->zc_cred, nvl);
+ nvlist_free(nvl);
+ return (error);
+}
+
+static int
+zfs_ioc_pool_props_set(zfs_cmd_t *zc)
+{
+ nvlist_t *nvl;
+ int error, reset_bootfs = 0;
+ uint64_t objnum;
+ zpool_prop_t prop;
+ nvpair_t *elem;
+ char *propname, *strval;
+ spa_t *spa;
+ vdev_t *rvdev;
+ char *vdev_type;
+ objset_t *os;
+
+ if ((error = get_nvlist(zc, &nvl)) != 0)
+ return (error);
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
+ nvlist_free(nvl);
+ return (error);
+ }
+
+ if (spa_version(spa) < ZFS_VERSION_BOOTFS) {
+ nvlist_free(nvl);
+ spa_close(spa, FTAG);
+ return (ENOTSUP);
+ }
+
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+
+ propname = nvpair_name(elem);
+
+ if ((prop = zpool_name_to_prop(propname)) ==
+ ZFS_PROP_INVAL) {
+ nvlist_free(nvl);
+ spa_close(spa, FTAG);
+ return (EINVAL);
+ }
+
+ switch (prop) {
+ case ZFS_PROP_BOOTFS:
+ /*
+ * A bootable filesystem cannot be on a RAIDZ pool,
+ * nor on a striped pool with more than one device.
+ */
+ rvdev = spa->spa_root_vdev;
+ vdev_type =
+ rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
+ if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
+ (strcmp(vdev_type, VDEV_TYPE_MIRROR) != 0 &&
+ rvdev->vdev_children > 1)) {
+ error = ENOTSUP;
+ break;
+ }
+
+ reset_bootfs = 1;
+
+ VERIFY(nvpair_value_string(elem, &strval) == 0);
+ if (strval == NULL || strval[0] == '\0') {
+ objnum =
+ zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
+ break;
+ }
+
+ if (error = dmu_objset_open(strval, DMU_OST_ZFS,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &os))
+ break;
+ objnum = dmu_objset_id(os);
+ dmu_objset_close(os);
+ break;
+
+ default:
+ error = EINVAL;
+ }
+
+ if (error)
+ break;
+ }
+ if (error == 0) {
+ if (reset_bootfs) {
+ VERIFY(nvlist_remove(nvl,
+ zpool_prop_to_name(ZFS_PROP_BOOTFS),
+ DATA_TYPE_STRING) == 0);
+ VERIFY(nvlist_add_uint64(nvl,
+ zpool_prop_to_name(ZFS_PROP_BOOTFS), objnum) == 0);
+ }
+ error = spa_set_props(spa, nvl);
+ }
+
+ nvlist_free(nvl);
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_props_get(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ nvlist_t *nvp = NULL;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_get_props(spa, &nvp);
+
+ if (error == 0) {
+ if (zc->zc_nvlist_dst != 0)
+ error = put_nvlist(zc, nvp);
+ else
+ error = EFAULT;
+ }
+
+ spa_close(spa, FTAG);
+
+ if (nvp)
+ nvlist_free(nvp);
+ return (error);
+}
+
+static int
+zfs_ioc_create_minor(zfs_cmd_t *zc)
+{
+ return (zvol_create_minor(zc->zc_name, zc->zc_dev));
+}
+
+static int
+zfs_ioc_remove_minor(zfs_cmd_t *zc)
+{
+ return (zvol_remove_minor(zc->zc_name));
+}
+
+/*
+ * Search the vfs list for a specified resource. Returns a pointer to it
+ * or NULL if no suitable entry is found. The caller of this routine
+ * is responsible for releasing the returned vfs pointer.
+ */
+static vfs_t *
+zfs_get_vfs(const char *resource)
+{
+ vfs_t *vfsp;
+
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(vfsp, &mountlist, mnt_list) {
+ if (strcmp(vfsp->mnt_stat.f_mntfromname, resource) == 0) {
+ VFS_HOLD(vfsp);
+ break;
+ }
+ }
+ mtx_unlock(&mountlist_mtx);
+ return (vfsp);
+}
+
+static void
+zfs_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
+{
+ zfs_create_data_t *zc = arg;
+
+ zfs_create_fs(os, (cred_t *)(uintptr_t)zc->zc_cred, tx);
+}
+
+static int
+zfs_ioc_create(zfs_cmd_t *zc)
+{
+ objset_t *clone;
+ int error = 0;
+ zfs_create_data_t cbdata = { 0 };
+ void (*cbfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
+ dmu_objset_type_t type = zc->zc_objset_type;
+
+ switch (type) {
+
+ case DMU_OST_ZFS:
+ cbfunc = zfs_create_cb;
+ break;
+
+ case DMU_OST_ZVOL:
+ cbfunc = zvol_create_cb;
+ break;
+
+ default:
+ cbfunc = NULL;
+ }
+ if (strchr(zc->zc_name, '@'))
+ return (EINVAL);
+
+ if (zc->zc_nvlist_src != 0 &&
+ (error = get_nvlist(zc, &cbdata.zc_props)) != 0)
+ return (error);
+
+ cbdata.zc_cred = (cred_t *)(uintptr_t)zc->zc_cred;
+ cbdata.zc_dev = (dev_t)zc->zc_dev;
+
+ if (zc->zc_value[0] != '\0') {
+ /*
+ * We're creating a clone of an existing snapshot.
+ */
+ zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
+ if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) {
+ nvlist_free(cbdata.zc_props);
+ return (EINVAL);
+ }
+
+ error = dmu_objset_open(zc->zc_value, type,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &clone);
+ if (error) {
+ nvlist_free(cbdata.zc_props);
+ return (error);
+ }
+ error = dmu_objset_create(zc->zc_name, type, clone, NULL, NULL);
+ dmu_objset_close(clone);
+ } else {
+ if (cbfunc == NULL) {
+ nvlist_free(cbdata.zc_props);
+ return (EINVAL);
+ }
+
+ if (type == DMU_OST_ZVOL) {
+ uint64_t volsize, volblocksize;
+
+ if (cbdata.zc_props == NULL ||
+ nvlist_lookup_uint64(cbdata.zc_props,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE),
+ &volsize) != 0) {
+ nvlist_free(cbdata.zc_props);
+ return (EINVAL);
+ }
+
+ if ((error = nvlist_lookup_uint64(cbdata.zc_props,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+ &volblocksize)) != 0 && error != ENOENT) {
+ nvlist_free(cbdata.zc_props);
+ return (EINVAL);
+ }
+
+ if (error != 0)
+ volblocksize = zfs_prop_default_numeric(
+ ZFS_PROP_VOLBLOCKSIZE);
+
+ if ((error = zvol_check_volblocksize(
+ volblocksize)) != 0 ||
+ (error = zvol_check_volsize(volsize,
+ volblocksize)) != 0) {
+ nvlist_free(cbdata.zc_props);
+ return (error);
+ }
+ }
+
+ error = dmu_objset_create(zc->zc_name, type, NULL, cbfunc,
+ &cbdata);
+ }
+
+ /*
+ * It would be nice to do this atomically.
+ */
+ if (error == 0) {
+ if ((error = zfs_set_prop_nvlist(zc->zc_name,
+ zc->zc_dev, (cred_t *)(uintptr_t)zc->zc_cred,
+ cbdata.zc_props)) != 0)
+ (void) dmu_objset_destroy(zc->zc_name);
+ }
+
+ nvlist_free(cbdata.zc_props);
+ return (error);
+}
+
+static int
+zfs_ioc_snapshot(zfs_cmd_t *zc)
+{
+ if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
+ return (EINVAL);
+ return (dmu_objset_snapshot(zc->zc_name,
+ zc->zc_value, zc->zc_cookie));
+}
+
+static int
+zfs_unmount_snap(char *name, void *arg)
+{
+ char *snapname = arg;
+ char *cp;
+ vfs_t *vfsp = NULL;
+
+ /*
+ * Snapshots (which are under .zfs control) must be unmounted
+ * before they can be destroyed.
+ */
+
+ if (snapname) {
+ (void) strcat(name, "@");
+ (void) strcat(name, snapname);
+ vfsp = zfs_get_vfs(name);
+ cp = strchr(name, '@');
+ *cp = '\0';
+ } else if (strchr(name, '@')) {
+ vfsp = zfs_get_vfs(name);
+ }
+
+ if (vfsp) {
+ /*
+ * Always force the unmount for snapshots.
+ */
+ int flag = MS_FORCE;
+ int err;
+
+ if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) {
+ VFS_RELE(vfsp);
+ return (err);
+ }
+ VFS_RELE(vfsp);
+ mtx_lock(&Giant); /* dounmount() */
+ dounmount(vfsp, flag, curthread);
+ mtx_unlock(&Giant); /* dounmount() */
+ }
+ return (0);
+}
+
+static int
+zfs_ioc_destroy_snaps(zfs_cmd_t *zc)
+{
+ int err;
+
+ if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
+ return (EINVAL);
+ err = dmu_objset_find(zc->zc_name,
+ zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN);
+ if (err)
+ return (err);
+ return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value));
+}
+
+static int
+zfs_ioc_destroy(zfs_cmd_t *zc)
+{
+ if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) {
+ int err = zfs_unmount_snap(zc->zc_name, NULL);
+ if (err)
+ return (err);
+ }
+
+ return (dmu_objset_destroy(zc->zc_name));
+}
+
+static int
+zfs_ioc_rollback(zfs_cmd_t *zc)
+{
+ return (dmu_objset_rollback(zc->zc_name));
+}
+
+static int
+zfs_ioc_rename(zfs_cmd_t *zc)
+{
+ zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
+ if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0)
+ return (EINVAL);
+
+ if (strchr(zc->zc_name, '@') != NULL &&
+ zc->zc_objset_type == DMU_OST_ZFS) {
+ int err = zfs_unmount_snap(zc->zc_name, NULL);
+ if (err)
+ return (err);
+ }
+
+ return (dmu_objset_rename(zc->zc_name, zc->zc_value));
+}
+
+static int
+zfs_ioc_recvbackup(zfs_cmd_t *zc)
+{
+ kthread_t *td = curthread;
+ struct file *fp;
+ int error;
+ offset_t new_off;
+
+ if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
+ strchr(zc->zc_value, '@') == NULL)
+ return (EINVAL);
+
+ error = fget_read(td, zc->zc_cookie, &fp);
+ if (error)
+ return (error);
+
+ error = dmu_recvbackup(zc->zc_value, &zc->zc_begin_record,
+ &zc->zc_cookie, (boolean_t)zc->zc_guid, fp,
+ fp->f_offset);
+
+ new_off = fp->f_offset + zc->zc_cookie;
+ fp->f_offset = new_off;
+
+ fdrop(fp, td);
+ return (error);
+}
+
+static int
+zfs_ioc_sendbackup(zfs_cmd_t *zc)
+{
+ kthread_t *td = curthread;
+ struct file *fp;
+ objset_t *fromsnap = NULL;
+ objset_t *tosnap;
+ int error, fd;
+
+ error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap);
+ if (error)
+ return (error);
+
+ if (zc->zc_value[0] != '\0') {
+ char buf[MAXPATHLEN];
+ char *cp;
+
+ (void) strncpy(buf, zc->zc_name, sizeof (buf));
+ cp = strchr(buf, '@');
+ if (cp)
+ *(cp+1) = 0;
+ (void) strlcat(buf, zc->zc_value, sizeof (buf));
+ error = dmu_objset_open(buf, DMU_OST_ANY,
+ DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap);
+ if (error) {
+ dmu_objset_close(tosnap);
+ return (error);
+ }
+ }
+
+ fd = zc->zc_cookie;
+ error = fget_write(td, fd, &fp);
+ if (error) {
+ dmu_objset_close(tosnap);
+ if (fromsnap)
+ dmu_objset_close(fromsnap);
+ return (error);
+ }
+
+ error = dmu_sendbackup(tosnap, fromsnap, fp);
+
+ fdrop(fp, td);
+ if (fromsnap)
+ dmu_objset_close(fromsnap);
+ dmu_objset_close(tosnap);
+ return (error);
+}
+
+static int
+zfs_ioc_inject_fault(zfs_cmd_t *zc)
+{
+ int id, error;
+
+ error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
+ &zc->zc_inject_record);
+
+ if (error == 0)
+ zc->zc_guid = (uint64_t)id;
+
+ return (error);
+}
+
+static int
+zfs_ioc_clear_fault(zfs_cmd_t *zc)
+{
+ return (zio_clear_fault((int)zc->zc_guid));
+}
+
+static int
+zfs_ioc_inject_list_next(zfs_cmd_t *zc)
+{
+ int id = (int)zc->zc_guid;
+ int error;
+
+ error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name),
+ &zc->zc_inject_record);
+
+ zc->zc_guid = id;
+
+ return (error);
+}
+
+static int
+zfs_ioc_error_log(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ size_t count = (size_t)zc->zc_nvlist_dst_size;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
+ &count);
+ if (error == 0)
+ zc->zc_nvlist_dst_size = count;
+ else
+ zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_clear(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ vdev_t *vd;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ if (zc->zc_guid == 0) {
+ vd = NULL;
+ } else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) {
+ spa_config_exit(spa, FTAG);
+ spa_close(spa, FTAG);
+ return (ENODEV);
+ }
+
+ vdev_clear(spa, vd);
+
+ spa_config_exit(spa, FTAG);
+
+ spa_close(spa, FTAG);
+
+ return (0);
+}
+
+static int
+zfs_ioc_promote(zfs_cmd_t *zc)
+{
+ char *cp;
+
+ /*
+ * We don't need to unmount *all* the origin fs's snapshots, but
+ * it's easier.
+ */
+ cp = strchr(zc->zc_value, '@');
+ if (cp)
+ *cp = '\0';
+ (void) dmu_objset_find(zc->zc_value,
+ zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS);
+ return (dsl_dataset_promote(zc->zc_name));
+}
+
+static int
+zfs_ioc_jail(zfs_cmd_t *zc)
+{
+
+ return (zone_dataset_attach((cred_t *)(uintptr_t)zc->zc_cred,
+ zc->zc_name, (int)zc->zc_jailid));
+}
+
+static int
+zfs_ioc_unjail(zfs_cmd_t *zc)
+{
+
+ return (zone_dataset_detach((cred_t *)(uintptr_t)zc->zc_cred,
+ zc->zc_name, (int)zc->zc_jailid));
+}
+
+static zfs_ioc_vec_t zfs_ioc_vec[] = {
+ { zfs_ioc_pool_create, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_destroy, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_import, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_export, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_configs, zfs_secpolicy_none, no_name },
+ { zfs_ioc_pool_stats, zfs_secpolicy_read, pool_name },
+ { zfs_ioc_pool_tryimport, zfs_secpolicy_config, no_name },
+ { zfs_ioc_pool_scrub, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_freeze, zfs_secpolicy_config, no_name },
+ { zfs_ioc_pool_upgrade, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_get_history, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_log_history, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_add, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_remove, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_online, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_offline, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_attach, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_detach, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_vdev_setpath, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_objset_stats, zfs_secpolicy_read, dataset_name },
+ { zfs_ioc_dataset_list_next, zfs_secpolicy_read, dataset_name },
+ { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, dataset_name },
+ { zfs_ioc_set_prop, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_create_minor, zfs_secpolicy_config, dataset_name },
+ { zfs_ioc_remove_minor, zfs_secpolicy_config, dataset_name },
+ { zfs_ioc_create, zfs_secpolicy_parent, dataset_name },
+ { zfs_ioc_destroy, zfs_secpolicy_parent, dataset_name },
+ { zfs_ioc_rollback, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_rename, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_recvbackup, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_sendbackup, zfs_secpolicy_operator, dataset_name },
+ { zfs_ioc_inject_fault, zfs_secpolicy_inject, no_name },
+ { zfs_ioc_clear_fault, zfs_secpolicy_inject, no_name },
+ { zfs_ioc_inject_list_next, zfs_secpolicy_inject, no_name },
+ { zfs_ioc_error_log, zfs_secpolicy_inject, pool_name },
+ { zfs_ioc_clear, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_promote, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_destroy_snaps, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_snapshot, zfs_secpolicy_operator, dataset_name },
+ { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_obj_to_path, zfs_secpolicy_config, no_name },
+ { zfs_ioc_pool_props_set, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_pool_props_get, zfs_secpolicy_read, pool_name },
+ { zfs_ioc_jail, zfs_secpolicy_config, dataset_name },
+ { zfs_ioc_unjail, zfs_secpolicy_config, dataset_name }
+};
+
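+/*
+ * The table above is positional: ZFS_IOC(cmd) in zfsdev_ioctl() below
+ * yields the index, so every entry must sit at the same offset as its
+ * request code in the zfs_ioc_t enumeration (an assumed convention;
+ * the enumeration is declared alongside zfs_cmd_t).  In sketch form:
+ *
+ *    vec = ZFS_IOC(cmd);
+ *    error = zfs_ioc_vec[vec].zvec_func(zc);
+ */
+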
+static int
+zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
+ struct thread *td)
+{
+ zfs_cmd_t *zc = (void *)addr;
+ uint_t vec;
+ int error;
+
+ vec = ZFS_IOC(cmd);
+
+ if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
+ return (EINVAL);
+
+ zc->zc_cred = (uintptr_t)td->td_ucred;
+ zc->zc_dev = (uintptr_t)dev;
+ error = zfs_ioc_vec[vec].zvec_secpolicy(zc->zc_name, td->td_ucred);
+
+ /*
+ * Ensure that all pool/dataset names are valid before we pass down to
+ * the lower layers.
+ */
+ if (error == 0) {
+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ switch (zfs_ioc_vec[vec].zvec_namecheck) {
+ case pool_name:
+ if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = EINVAL;
+ break;
+
+ case dataset_name:
+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = EINVAL;
+ break;
+
+ case no_name:
+ break;
+ }
+ }
+
+ if (error == 0)
+ error = zfs_ioc_vec[vec].zvec_func(zc);
+
+ return (error);
+}
+
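+/*
+ * From userland the whole table reduces to ioctl(2) against the control
+ * node.  A sketch, assuming the ZFS_IOC_POOL_DESTROY request code and
+ * that ZFS_DEV_NAME names the node under /dev:
+ *
+ *    int fd = open("/dev/" ZFS_DEV_NAME, O_RDWR);
+ *    zfs_cmd_t zc = { 0 };
+ *    (void) strlcpy(zc.zc_name, "tank", sizeof (zc.zc_name));
+ *    error = (ioctl(fd, ZFS_IOC_POOL_DESTROY, &zc) == 0) ? 0 : errno;
+ */
+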
+/*
+ * OK, so this is a little weird.
+ *
+ * /dev/zfs is the control node, i.e. minor 0.
+ * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
+ *
+ * /dev/zfs has basically nothing to do except serve up ioctls,
+ * so most of the standard driver entry points are in zvol.c.
+ */
+static struct cdevsw zfs_cdevsw = {
+ .d_version = D_VERSION,
+ .d_ioctl = zfsdev_ioctl,
+ .d_name = ZFS_DEV_NAME
+};
+
+static void
+zfsdev_init(void)
+{
+ zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0660,
+ ZFS_DEV_NAME);
+}
+
+static void
+zfsdev_fini(void)
+{
+ if (zfsdev != NULL)
+ destroy_dev(zfsdev);
+}
+
+static struct task zfs_start_task;
+
+static void
+zfs_start(void *context __unused, int pending __unused)
+{
+
+ zfsdev_init();
+ spa_init(FREAD | FWRITE);
+ zfs_init();
+ zvol_init();
+ printf("ZFS storage pool version " ZFS_VERSION_STRING "\n");
+}
+
+static int
+zfs_modevent(module_t mod, int type, void *unused __unused)
+{
+ int error;
+
+ error = EOPNOTSUPP;
+ switch (type) {
+ case MOD_LOAD:
+ printf("WARNING: ZFS is considered to be an experimental "
+ "feature in FreeBSD.\n");
+ TASK_INIT(&zfs_start_task, 0, zfs_start, NULL);
+ taskqueue_enqueue(taskqueue_thread, &zfs_start_task);
+ error = 0;
+ break;
+ case MOD_UNLOAD:
+ if (spa_busy() || /* zfs_busy() || */ zvol_busy() ||
+ zio_injection_enabled) {
+ error = EBUSY;
+ break;
+ }
+ zvol_fini();
+ zfs_fini();
+ spa_fini();
+ zfsdev_fini();
+ error = 0;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t zfs_mod = {
+ "zfsctrl",
+ zfs_modevent,
+ 0
+};
+DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_MOUNT_ROOT, SI_ORDER_ANY);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
new file mode 100644
index 0000000..06cb95a
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
@@ -0,0 +1,348 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/file.h>
+#include <sys/vfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/byteorder.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+#include <sys/dmu.h>
+#include <sys/spa.h>
+
+/*
+ * All the functions in this file are used to construct the log entries
+ * to record transactions. They allocate an intent log transaction
+ * structure (itx_t) and save within it all the information necessary to
+ * possibly replay the transaction. The itx is then assigned a sequence
+ * number and inserted in the in-memory list anchored in the zilog.
+ */
+
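+/*
+ * The common shape of every logger below, in miniature (lr_xxx_t stands
+ * for the record type of the transaction being logged; a sketch, not a
+ * callable routine):
+ *
+ *    itx = zil_itx_create(txtype, sizeof (lr_xxx_t) + payload_size);
+ *    lr = (lr_xxx_t *)&itx->itx_lr;
+ *    ... fill in lr and copy any payload after it ...
+ *    seq = zil_itx_assign(zilog, itx, tx);
+ *    zp->z_last_itx = seq;
+ */
+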
+/*
+ * zfs_log_create() is used to handle TX_CREATE, TX_MKDIR and TX_MKXATTR
+ * transactions.
+ */
+void
+zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_create_t *lr;
+ size_t namesize = strlen(name) + 1;
+
+ if (zilog == NULL)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_create_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_foid = zp->z_id;
+ lr->lr_mode = zp->z_phys->zp_mode;
+ lr->lr_uid = zp->z_phys->zp_uid;
+ lr->lr_gid = zp->z_phys->zp_gid;
+ lr->lr_gen = zp->z_phys->zp_gen;
+ lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
+ lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+ lr->lr_rdev = zp->z_phys->zp_rdev;
+ bcopy(name, (char *)(lr + 1), namesize);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq;
+ zp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions.
+ */
+void
+zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, char *name)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_remove_t *lr;
+ size_t namesize = strlen(name) + 1;
+
+ if (zilog == NULL)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_remove_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ bcopy(name, (char *)(lr + 1), namesize);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_link() handles TX_LINK transactions.
+ */
+void
+zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_link_t *lr;
+ size_t namesize = strlen(name) + 1;
+
+ if (zilog == NULL)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_link_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_link_obj = zp->z_id;
+ bcopy(name, (char *)(lr + 1), namesize);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq;
+ zp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_symlink() handles TX_SYMLINK transactions.
+ */
+void
+zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *dzp, znode_t *zp, char *name, char *link)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_create_t *lr;
+ size_t namesize = strlen(name) + 1;
+ size_t linksize = strlen(link) + 1;
+
+ if (zilog == NULL)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
+ lr = (lr_create_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_foid = zp->z_id;
+ lr->lr_mode = zp->z_phys->zp_mode;
+ lr->lr_uid = zp->z_phys->zp_uid;
+ lr->lr_gid = zp->z_phys->zp_gid;
+ lr->lr_gen = zp->z_phys->zp_gen;
+ lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
+ lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+ bcopy(name, (char *)(lr + 1), namesize);
+ bcopy(link, (char *)(lr + 1) + namesize, linksize);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ dzp->z_last_itx = seq;
+ zp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_rename() handles TX_RENAME transactions.
+ */
+void
+zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_rename_t *lr;
+ size_t snamesize = strlen(sname) + 1;
+ size_t dnamesize = strlen(dname) + 1;
+
+ if (zilog == NULL)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
+ lr = (lr_rename_t *)&itx->itx_lr;
+ lr->lr_sdoid = sdzp->z_id;
+ lr->lr_tdoid = tdzp->z_id;
+ bcopy(sname, (char *)(lr + 1), snamesize);
+ bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
+
+ seq = zil_itx_assign(zilog, itx, tx);
+ sdzp->z_last_itx = seq;
+ tdzp->z_last_itx = seq;
+ szp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_write() handles TX_WRITE transactions.
+ */
+ssize_t zfs_immediate_write_sz = 32768;
+
+void
+zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, offset_t off, ssize_t len, int ioflag)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_write_t *lr;
+ itx_wr_state_t write_state;
+ int err;
+
+ if (zilog == NULL || zp->z_unlinked)
+ return;
+
+ /*
+ * Writes are handled in three different ways:
+ *
+ * WR_INDIRECT:
+ * If the write is greater than zfs_immediate_write_sz then,
+ * later, *if* we need to log the write, dmu_sync() is used to
+ * write the block immediately and its block pointer is put in
+ * the log record.
+ * WR_COPIED:
+ * If we know we'll immediately be committing the
+ * transaction (FDSYNC (O_DSYNC)), then we allocate a larger
+ * log record here for the data and copy the data in.
+ * WR_NEED_COPY:
+ * Otherwise we don't allocate a buffer, and *if* we need to
+ * flush the write later then a buffer is allocated and
+ * we retrieve the data using the dmu.
+ */
+ if (len > zfs_immediate_write_sz)
+ write_state = WR_INDIRECT;
+ else if (ioflag & IO_SYNC)
+ write_state = WR_COPIED;
+ else
+ write_state = WR_NEED_COPY;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) +
+ (write_state == WR_COPIED ? len : 0));
+ lr = (lr_write_t *)&itx->itx_lr;
+ if (write_state == WR_COPIED) {
+ err = dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1);
+ if (err) {
+ kmem_free(itx, offsetof(itx_t, itx_lr) +
+ itx->itx_lr.lrc_reclen);
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_write_t *)&itx->itx_lr;
+ write_state = WR_NEED_COPY;
+ }
+ }
+
+ itx->itx_wr_state = write_state;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ lr->lr_blkoff = 0;
+ BP_ZERO(&lr->lr_blkptr);
+
+ itx->itx_private = zp->z_zfsvfs;
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_truncate() handles TX_TRUNCATE transactions.
+ */
+void
+zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, uint64_t off, uint64_t len)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_truncate_t *lr;
+
+ if (zilog == NULL || zp->z_unlinked)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_truncate_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_setattr() handles TX_SETATTR transactions.
+ */
+void
+zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, vattr_t *vap, uint_t mask_applied)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_setattr_t *lr;
+
+ if (zilog == NULL || zp->z_unlinked)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_setattr_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_mask = (uint64_t)mask_applied;
+ lr->lr_mode = (uint64_t)vap->va_mode;
+ lr->lr_uid = (uint64_t)vap->va_uid;
+ lr->lr_gid = (uint64_t)vap->va_gid;
+ lr->lr_size = (uint64_t)vap->va_size;
+ ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
+ ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+}
+
+/*
+ * zfs_log_acl() handles TX_ACL transactions.
+ */
+void
+zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, int aclcnt, ace_t *z_ace)
+{
+ itx_t *itx;
+ uint64_t seq;
+ lr_acl_t *lr;
+
+ if (zilog == NULL || zp->z_unlinked)
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + aclcnt * sizeof (ace_t));
+ lr = (lr_acl_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_aclcnt = (uint64_t)aclcnt;
+ bcopy(z_ace, (ace_t *)(lr + 1), aclcnt * sizeof (ace_t));
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ seq = zil_itx_assign(zilog, itx, tx);
+ zp->z_last_itx = seq;
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
new file mode 100644
index 0000000..ad3ad91
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
@@ -0,0 +1,424 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/spa.h>
+#include <sys/zil.h>
+#include <sys/byteorder.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+#include <sys/atomic.h>
+#include <sys/cred.h>
+#include <sys/namei.h>
+
+/*
+ * Functions to replay ZFS intent log (ZIL) records.
+ * The functions are called through a function vector (zfs_replay_vector)
+ * which is indexed by the transaction type.
+ */
+
+static void
+zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
+ uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
+{
+ VATTR_NULL(vap);
+ vap->va_mask = (uint_t)mask;
+ vap->va_type = IFTOVT(mode);
+ vap->va_mode = mode & MODEMASK;
+ vap->va_uid = (uid_t)uid;
+ vap->va_gid = (gid_t)gid;
+ vap->va_rdev = zfs_cmpldev(rdev);
+ vap->va_nodeid = nodeid;
+}
+
+/* ARGSUSED */
+static int
+zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
+{
+ return (ENOTSUP);
+}
+
+static int
+zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
+{
+ char *name = (char *)(lr + 1); /* name follows lr_create_t */
+ char *link; /* symlink content follows name */
+ znode_t *dzp;
+ vnode_t *vp = NULL;
+ vattr_t va;
+ struct componentname cn;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID,
+ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
+
+ /*
+ * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
+ * eventually end up in zfs_mknode(), which assigns the object's
+ * creation time and generation number. The generic VOP_CREATE()
+ * doesn't have either concept, so we smuggle the values inside
+ * the vattr's otherwise unused va_ctime and va_nblocks fields.
+ */
+ ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime);
+ va.va_nblocks = lr->lr_gen;
+
+ cn.cn_nameptr = name;
+ cn.cn_cred = kcred;
+ cn.cn_thread = curthread;
+ cn.cn_flags = SAVENAME;
+
+ vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread);
+ switch ((int)lr->lr_common.lrc_txtype) {
+ case TX_CREATE:
+ error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &va);
+ break;
+ case TX_MKDIR:
+ error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &va);
+ break;
+ case TX_MKXATTR:
+ error = zfs_make_xattrdir(dzp, &va, &vp, kcred);
+ break;
+ case TX_SYMLINK:
+ link = name + strlen(name) + 1;
+ error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &va, link);
+ break;
+ default:
+ error = ENOTSUP;
+ }
+ VOP_UNLOCK(ZTOV(dzp), 0, curthread);
+
+ if (error == 0 && vp != NULL) {
+ VOP_UNLOCK(vp, 0, curthread);
+ VN_RELE(vp);
+ }
+
+ VN_RELE(ZTOV(dzp));
+
+ return (error);
+}
+
+static int
+zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
+{
+ char *name = (char *)(lr + 1); /* name follows lr_remove_t */
+ znode_t *dzp;
+ struct componentname cn;
+ vnode_t *vp;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ cn.cn_nameptr = name;
+ cn.cn_namelen = strlen(name);
+ cn.cn_nameiop = DELETE;
+ cn.cn_flags = ISLASTCN | SAVENAME;
+ cn.cn_cred = kcred;
+ cn.cn_thread = curthread;
+ vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread);
+ error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn);
+ if (error != 0) {
+ VOP_UNLOCK(ZTOV(dzp), 0, curthread);
+ goto fail;
+ }
+
+ switch ((int)lr->lr_common.lrc_txtype) {
+ case TX_REMOVE:
+ error = VOP_REMOVE(ZTOV(dzp), vp, &cn);
+ break;
+ case TX_RMDIR:
+ error = VOP_RMDIR(ZTOV(dzp), vp, &cn);
+ break;
+ default:
+ error = ENOTSUP;
+ }
+ vput(vp);
+ VOP_UNLOCK(ZTOV(dzp), 0, curthread);
+fail:
+ VN_RELE(ZTOV(dzp));
+
+ return (error);
+}
+
+static int
+zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
+{
+ char *name = (char *)(lr + 1); /* name follows lr_link_t */
+ znode_t *dzp, *zp;
+ struct componentname cn;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
+ VN_RELE(ZTOV(dzp));
+ return (error);
+ }
+
+ cn.cn_nameptr = name;
+ cn.cn_cred = kcred;
+ cn.cn_thread = curthread;
+ cn.cn_flags = SAVENAME;
+
+ vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY, curthread);
+ vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY, curthread);
+ error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn);
+ VOP_UNLOCK(ZTOV(zp), 0, curthread);
+ VOP_UNLOCK(ZTOV(dzp), 0, curthread);
+
+ VN_RELE(ZTOV(zp));
+ VN_RELE(ZTOV(dzp));
+
+ return (error);
+}
+
+static int
+zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
+{
+ char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
+ char *tname = sname + strlen(sname) + 1;
+ znode_t *sdzp, *tdzp;
+ struct componentname scn, tcn;
+ vnode_t *svp, *tvp;
+ kthread_t *td = curthread;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
+ return (error);
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
+ VN_RELE(ZTOV(sdzp));
+ return (error);
+ }
+
+ svp = tvp = NULL;
+
+ scn.cn_nameptr = sname;
+ scn.cn_namelen = strlen(sname);
+ scn.cn_nameiop = DELETE;
+ scn.cn_flags = ISLASTCN | SAVENAME;
+ scn.cn_cred = kcred;
+ scn.cn_thread = td;
+ vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn);
+ VOP_UNLOCK(ZTOV(sdzp), 0, td);
+ if (error != 0)
+ goto fail;
+ VOP_UNLOCK(svp, 0, td);
+
+ tcn.cn_nameptr = tname;
+ tcn.cn_namelen = strlen(tname);
+ tcn.cn_nameiop = RENAME;
+ tcn.cn_flags = ISLASTCN | SAVENAME;
+ tcn.cn_cred = kcred;
+ tcn.cn_thread = td;
+ vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY, td);
+ error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn);
+ if (error == EJUSTRETURN)
+ tvp = NULL;
+ else if (error != 0) {
+ VOP_UNLOCK(ZTOV(tdzp), 0, td);
+ goto fail;
+ }
+
+ error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn);
+ return (error);
+fail:
+ if (svp != NULL)
+ vrele(svp);
+ if (tvp != NULL)
+ vrele(tvp);
+ VN_RELE(ZTOV(tdzp));
+ VN_RELE(ZTOV(sdzp));
+
+ return (error);
+}
+
+static int
+zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
+{
+ char *data = (char *)(lr + 1); /* data follows lr_write_t */
+ znode_t *zp;
+ int error;
+ ssize_t resid;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+ /*
+ * As we can log writes out of order, it's possible the
+ * file has been removed. In this case just drop the write
+ * and return success.
+ */
+ if (error == ENOENT)
+ error = 0;
+ return (error);
+ }
+
+ error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
+ lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+
+ VN_RELE(ZTOV(zp));
+
+ return (error);
+}
+
+static int
+zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
+{
+
+ ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
+ return (EOPNOTSUPP);
+}
+
+static int
+zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
+{
+ znode_t *zp;
+ vattr_t va;
+ vnode_t *vp;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+ /*
+ * As we can log setattrs out of order, it's possible the
+ * file has been removed. In this case just drop the setattr
+ * and return success.
+ */
+ if (error == ENOENT)
+ error = 0;
+ return (error);
+ }
+
+ zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode,
+ lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
+
+ va.va_size = lr->lr_size;
+ ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime);
+ ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime);
+
+ vp = ZTOV(zp);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+ error = VOP_SETATTR(vp, &va, kcred, curthread);
+ VOP_UNLOCK(vp, 0, curthread);
+ VN_RELE(vp);
+
+ return (error);
+}
+
+static int
+zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
+{
+ ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */
+#ifdef TODO
+ vsecattr_t vsa;
+#endif
+ znode_t *zp;
+ int error;
+
+ if (byteswap) {
+ byteswap_uint64_array(lr, sizeof (*lr));
+ zfs_ace_byteswap(ace, lr->lr_aclcnt);
+ }
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+ /*
+ * As we can log acls out of order, it's possible the
+ * file has been removed. In this case just drop the acl
+ * and return success.
+ */
+ if (error == ENOENT)
+ error = 0;
+ return (error);
+ }
+
+#ifdef TODO
+ bzero(&vsa, sizeof (vsa));
+ vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
+ vsa.vsa_aclcnt = lr->lr_aclcnt;
+ vsa.vsa_aclentp = ace;
+
+ error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred);
+#else
+ error = EOPNOTSUPP;
+#endif
+
+ VN_RELE(ZTOV(zp));
+
+ return (error);
+}
+
+/*
+ * Callback vectors for replaying records
+ */
+zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
+ zfs_replay_error, /* 0 no such transaction type */
+ zfs_replay_create, /* TX_CREATE */
+ zfs_replay_create, /* TX_MKDIR */
+ zfs_replay_create, /* TX_MKXATTR */
+ zfs_replay_create, /* TX_SYMLINK */
+ zfs_replay_remove, /* TX_REMOVE */
+ zfs_replay_remove, /* TX_RMDIR */
+ zfs_replay_link, /* TX_LINK */
+ zfs_replay_rename, /* TX_RENAME */
+ zfs_replay_write, /* TX_WRITE */
+ zfs_replay_truncate, /* TX_TRUNCATE */
+ zfs_replay_setattr, /* TX_SETATTR */
+ zfs_replay_acl, /* TX_ACL */
+};
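+
+/*
+ * The replay driver in zil.c walks the log and dispatches through this
+ * vector by transaction type; roughly (a sketch of the implied call,
+ * not the zil.c code itself):
+ *
+ *    error = zfs_replay_vector[lr->lrc_txtype](zfsvfs, lr, byteswap);
+ */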
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
new file mode 100644
index 0000000..07ec0f6
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
@@ -0,0 +1,594 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * This file contains the code to implement file range locking in
+ * ZFS, although there isn't much specific to ZFS (all that comes to mind
+ * is support for growing the blocksize).
+ *
+ * Interface
+ * ---------
+ * Defined in zfs_rlock.h but essentially:
+ * rl = zfs_range_lock(zp, off, len, lock_type);
+ * zfs_range_unlock(rl);
+ * zfs_range_reduce(rl, off, len);
+ *
+ * AVL tree
+ * --------
+ * An AVL tree is used to maintain the state of the existing ranges
+ * that are locked for exclusive (writer) or shared (reader) use.
+ * The starting range offset is used for searching and sorting the tree.
+ *
+ * Common case
+ * -----------
+ * The (hopefully) usual case is of no overlaps or contention for
+ * locks. On entry to zfs_range_lock() an rl_t is allocated; the tree
+ * is searched and, finding no overlap, *this* rl_t is placed in the tree.
+ *
+ * Overlaps/Reference counting/Proxy locks
+ * ---------------------------------------
+ * The avl code only allows one node at a particular offset. Also it's very
+ * inefficient to search through all previous entries looking for overlaps
+ * (because the very first entry in the ordered list might be at offset 0 but
+ * cover the whole file).
+ * So this implementation uses reference counts and proxy range locks.
+ * Firstly, only reader locks use reference counts and proxy locks,
+ * because writer locks are exclusive.
+ * When a reader lock overlaps with another then a proxy lock is created
+ * for that range and replaces the original lock. If the overlap
+ * is exact then the reference count of the proxy is simply incremented.
+ * Otherwise, the proxy lock is split into smaller lock ranges and
+ * new proxy locks are created for the non-overlapping ranges.
+ * The reference counts are adjusted accordingly.
+ * Meanwhile, the original lock is kept around (this is the caller's handle)
+ * and its offset and length are used when releasing the lock.
+ *
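+ * A worked example: if reader A locks [0,100) and reader B then locks
+ * [50,150), A's lock is proxified and split, leaving proxies [0,50)
+ * with count 1, [50,100) with count 2, and a new proxy [100,150) with
+ * count 1, while A's and B's own rl_t handles keep their original
+ * off/len for use at unlock time.
+ *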
+ * Thread coordination
+ * -------------------
+ * In order to make wakeups efficient and to ensure multiple continuous
+ * readers on a range don't starve a writer for the same range lock,
+ * two condition variables are allocated in each rl_t.
+ * If a writer (or reader) can't get a range it initializes the writer
+ * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
+ * and waits on that cv. When a thread unlocks that range it wakes up all
+ * writers then all readers before destroying the lock.
+ *
+ * Append mode writes
+ * ------------------
+ * Append mode writes need to lock a range at the end of a file.
+ * The offset of the end of the file is determined under the
+ * range locking mutex; the lock type is converted from RL_APPEND to
+ * RL_WRITER, and the range is locked.
+ *
+ * Grow block handling
+ * -------------------
+ * ZFS supports multiple block sizes, currently up to 128K. The smallest
+ * block size is used for the file, which is grown as needed. During this
+ * growth all other writers and readers must be excluded.
+ * So if the block size needs to be grown then the whole file is
+ * exclusively locked; later the caller reduces the lock
+ * range to just the range to be written, using zfs_range_reduce().
+ */
+
+#include <sys/zfs_rlock.h>
+
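+/*
+ * Typical ZPL usage of the interface above, in sketch form (a
+ * hypothetical write path; zfs_range_lock() blocks until the range is
+ * available):
+ *
+ *    rl_t *rl = zfs_range_lock(zp, off, len, RL_WRITER);
+ *    ... modify the file contents in [off, off + len) ...
+ *    zfs_range_unlock(rl);
+ */
+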
+/*
+ * Check if a write lock can be grabbed, or wait and recheck until available.
+ */
+static void
+zfs_range_lock_writer(znode_t *zp, rl_t *new)
+{
+ avl_tree_t *tree = &zp->z_range_avl;
+ rl_t *rl;
+ avl_index_t where;
+ uint64_t end_size;
+ uint64_t off = new->r_off;
+ uint64_t len = new->r_len;
+
+ for (;;) {
+ /*
+ * Range locking is also used by zvol and uses a
+ * dummied up znode. However, for zvol, we don't need to
+ * append or grow blocksize, and besides we don't have
+ * a z_phys or z_zfsvfs - so skip that processing.
+ *
+ * Yes, this is ugly, and would be solved by not handling
+ * grow or append in range lock code. If that was done then
+ * we could make the range locking code generically available
+ * to other non-zfs consumers.
+ */
+ if (zp->z_vnode) { /* caller is ZPL */
+ /*
+ * If in append mode pick up the current end of file.
+ * This is done under z_range_lock to avoid races.
+ */
+ if (new->r_type == RL_APPEND)
+ new->r_off = zp->z_phys->zp_size;
+
+ /*
+ * If we need to grow the block size then grab the whole
+ * file range. This is also done under z_range_lock to
+ * avoid races.
+ */
+ end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
+ if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+ zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
+ new->r_off = 0;
+ new->r_len = UINT64_MAX;
+ }
+ }
+
+ /*
+ * First check for the usual case of no locks
+ */
+ if (avl_numnodes(tree) == 0) {
+ new->r_type = RL_WRITER; /* convert to writer */
+ avl_add(tree, new);
+ return;
+ }
+
+ /*
+ * Look for any locks in the range.
+ */
+ rl = avl_find(tree, new, &where);
+ if (rl)
+ goto wait; /* already locked at same offset */
+
+ rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
+ if (rl && (rl->r_off < new->r_off + new->r_len))
+ goto wait;
+
+ rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
+ if (rl && rl->r_off + rl->r_len > new->r_off)
+ goto wait;
+
+ new->r_type = RL_WRITER; /* convert possible RL_APPEND */
+ avl_insert(tree, new, where);
+ return;
+wait:
+ if (!rl->r_write_wanted) {
+ cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
+ rl->r_write_wanted = B_TRUE;
+ }
+ cv_wait(&rl->r_wr_cv, &zp->z_range_lock);
+
+ /* reset to original */
+ new->r_off = off;
+ new->r_len = len;
+ }
+}
+
+/*
+ * If this is an original (non-proxy) lock then replace it by
+ * a proxy and return the proxy.
+ */
+static rl_t *
+zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
+{
+ rl_t *proxy;
+
+ if (rl->r_proxy)
+ return (rl); /* already a proxy */
+
+ ASSERT3U(rl->r_cnt, ==, 1);
+ ASSERT(rl->r_write_wanted == B_FALSE);
+ ASSERT(rl->r_read_wanted == B_FALSE);
+ avl_remove(tree, rl);
+ rl->r_cnt = 0;
+
+ /* create a proxy range lock */
+ proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+ proxy->r_off = rl->r_off;
+ proxy->r_len = rl->r_len;
+ proxy->r_cnt = 1;
+ proxy->r_type = RL_READER;
+ proxy->r_proxy = B_TRUE;
+ proxy->r_write_wanted = B_FALSE;
+ proxy->r_read_wanted = B_FALSE;
+ avl_add(tree, proxy);
+
+ return (proxy);
+}
+
+/*
+ * Split the range lock at the supplied offset
+ * returning the *front* proxy.
+ */
+static rl_t *
+zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
+{
+ rl_t *front, *rear;
+
+ ASSERT3U(rl->r_len, >, 1);
+ ASSERT3U(off, >, rl->r_off);
+ ASSERT3U(off, <, rl->r_off + rl->r_len);
+ ASSERT(rl->r_write_wanted == B_FALSE);
+ ASSERT(rl->r_read_wanted == B_FALSE);
+
+ /* create the rear proxy range lock */
+ rear = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+ rear->r_off = off;
+ rear->r_len = rl->r_off + rl->r_len - off;
+ rear->r_cnt = rl->r_cnt;
+ rear->r_type = RL_READER;
+ rear->r_proxy = B_TRUE;
+ rear->r_write_wanted = B_FALSE;
+ rear->r_read_wanted = B_FALSE;
+
+ front = zfs_range_proxify(tree, rl);
+ front->r_len = off - rl->r_off;
+
+ avl_insert_here(tree, rear, front, AVL_AFTER);
+ return (front);
+}
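+
+/*
+ * For example (illustrative): splitting a proxy covering [4, 12) at
+ * offset 8 leaves a front proxy [4, 8) and a rear proxy [8, 12), each
+ * carrying the lock's previous reference count.
+ */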
+
+/*
+ * Create and add a new proxy range lock for the supplied range.
+ */
+static void
+zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
+{
+ rl_t *rl;
+
+ ASSERT(len);
+ rl = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+ rl->r_off = off;
+ rl->r_len = len;
+ rl->r_cnt = 1;
+ rl->r_type = RL_READER;
+ rl->r_proxy = B_TRUE;
+ rl->r_write_wanted = B_FALSE;
+ rl->r_read_wanted = B_FALSE;
+ avl_add(tree, rl);
+}
+
+static void
+zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
+{
+ rl_t *next;
+ uint64_t off = new->r_off;
+ uint64_t len = new->r_len;
+
+ /*
+ * prev arrives either:
+ * - pointing to an entry at the same offset
+ * - pointing to the entry with the closest previous offset whose
+ * range may overlap with the new range
+ * - null, if there were no ranges starting before the new one
+ */
+ if (prev) {
+ if (prev->r_off + prev->r_len <= off) {
+ prev = NULL;
+ } else if (prev->r_off != off) {
+ /*
+ * convert to proxy if needed then
+ * split this entry and bump ref count
+ */
+ prev = zfs_range_split(tree, prev, off);
+ prev = AVL_NEXT(tree, prev); /* move to rear range */
+ }
+ }
+ ASSERT((prev == NULL) || (prev->r_off == off));
+
+ if (prev)
+ next = prev;
+ else
+ next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
+
+ if (next == NULL || off + len <= next->r_off) {
+ /* no overlaps, use the original new rl_t in the tree */
+ avl_insert(tree, new, where);
+ return;
+ }
+
+ if (off < next->r_off) {
+ /* Add a proxy for initial range before the overlap */
+ zfs_range_new_proxy(tree, off, next->r_off - off);
+ }
+
+ new->r_cnt = 0; /* will use proxies in tree */
+ /*
+ * We now search forward through the ranges, until we go past the end
+ * of the new range. For each entry we make it a proxy if it
+ * isn't already, then bump its reference count. If there's any
+ * gaps between the ranges then we create a new proxy range.
+ */
+ for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
+ if (off + len <= next->r_off)
+ break;
+ if (prev && prev->r_off + prev->r_len < next->r_off) {
+ /* there's a gap */
+ ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
+ zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
+ next->r_off - (prev->r_off + prev->r_len));
+ }
+ if (off + len == next->r_off + next->r_len) {
+ /* exact overlap with end */
+ next = zfs_range_proxify(tree, next);
+ next->r_cnt++;
+ return;
+ }
+ if (off + len < next->r_off + next->r_len) {
+ /* new range ends in the middle of this block */
+ next = zfs_range_split(tree, next, off + len);
+ next->r_cnt++;
+ return;
+ }
+ ASSERT3U(off + len, >, next->r_off + next->r_len);
+ next = zfs_range_proxify(tree, next);
+ next->r_cnt++;
+ }
+
+ /* Add the remaining end range. */
+ zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
+ (off + len) - (prev->r_off + prev->r_len));
+}
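+
+/*
+ * For example (illustrative): adding a reader for [0, 30) on top of an
+ * existing reader lock on [10, 20) leaves three proxies in the tree:
+ * [0, 10) with r_cnt 1, [10, 20) with r_cnt 2 and [20, 30) with r_cnt 1,
+ * while the caller's new rl_t stays out of the tree with r_cnt 0.
+ */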
+
+/*
+ * Check if a reader lock can be grabbed, or wait and recheck until available.
+ */
+static void
+zfs_range_lock_reader(znode_t *zp, rl_t *new)
+{
+ avl_tree_t *tree = &zp->z_range_avl;
+ rl_t *prev, *next;
+ avl_index_t where;
+ uint64_t off = new->r_off;
+ uint64_t len = new->r_len;
+
+ /*
+ * Look for any writer locks in the range.
+ */
+retry:
+ prev = avl_find(tree, new, &where);
+ if (prev == NULL)
+ prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
+
+ /*
+ * Check the previous range for a writer lock overlap.
+ */
+ if (prev && (off < prev->r_off + prev->r_len)) {
+ if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
+ if (!prev->r_read_wanted) {
+ cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
+ prev->r_read_wanted = B_TRUE;
+ }
+ cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
+ goto retry;
+ }
+ if (off + len < prev->r_off + prev->r_len)
+ goto got_lock;
+ }
+
+ /*
+	 * Search through the following ranges to see if there's
+	 * any overlapping write lock.
+ */
+ if (prev)
+ next = AVL_NEXT(tree, prev);
+ else
+ next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
+ for (; next; next = AVL_NEXT(tree, next)) {
+ if (off + len <= next->r_off)
+ goto got_lock;
+ if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
+ if (!next->r_read_wanted) {
+ cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
+ next->r_read_wanted = B_TRUE;
+ }
+ cv_wait(&next->r_rd_cv, &zp->z_range_lock);
+ goto retry;
+ }
+ if (off + len <= next->r_off + next->r_len)
+ goto got_lock;
+ }
+
+got_lock:
+ /*
+ * Add the read lock, which may involve splitting existing
+ * locks and bumping ref counts (r_cnt).
+ */
+ zfs_range_add_reader(tree, new, prev, where);
+}
+
+/*
+ * Lock a range (offset, length) as either shared (RL_READER)
+ * or exclusive (RL_WRITER). Returns the range lock structure
+ * for later unlocking or reduce range (if entire file
+ * previously locked as RL_WRITER).
+ */
+rl_t *
+zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
+{
+ rl_t *new;
+
+ ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
+
+ new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+ new->r_zp = zp;
+ new->r_off = off;
+ new->r_len = len;
+ new->r_cnt = 1; /* assume it's going to be in the tree */
+ new->r_type = type;
+ new->r_proxy = B_FALSE;
+ new->r_write_wanted = B_FALSE;
+ new->r_read_wanted = B_FALSE;
+
+ mutex_enter(&zp->z_range_lock);
+ if (type == RL_READER) {
+ /*
+ * First check for the usual case of no locks
+ */
+ if (avl_numnodes(&zp->z_range_avl) == 0)
+ avl_add(&zp->z_range_avl, new);
+ else
+ zfs_range_lock_reader(zp, new);
+ } else
+ zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */
+ mutex_exit(&zp->z_range_lock);
+ return (new);
+}
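+
+/*
+ * Illustrative append usage (a sketch; n and woff are placeholders):
+ * lock with RL_APPEND, then read back the offset chosen under the lock,
+ * as zfs_write() does for IO_APPEND:
+ *
+ *	rl_t *rl = zfs_range_lock(zp, 0, n, RL_APPEND);
+ *	if (rl->r_len == UINT64_MAX)
+ *		woff = zp->z_phys->zp_size;	... whole file was locked
+ *	else
+ *		woff = rl->r_off;
+ *	... write at woff, then zfs_range_unlock(rl) ...
+ */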
+
+/*
+ * Unlock a reader lock
+ */
+static void
+zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
+{
+ avl_tree_t *tree = &zp->z_range_avl;
+ rl_t *rl, *next;
+ uint64_t len;
+
+ /*
+ * The common case is when the remove entry is in the tree
+	 * (cnt == 1), meaning there have been no other reader locks overlapping
+ * with this one. Otherwise the remove entry will have been
+ * removed from the tree and replaced by proxies (one or
+ * more ranges mapping to the entire range).
+ */
+ if (remove->r_cnt == 1) {
+ avl_remove(tree, remove);
+ if (remove->r_write_wanted)
+ cv_broadcast(&remove->r_wr_cv);
+ if (remove->r_read_wanted)
+ cv_broadcast(&remove->r_rd_cv);
+ } else {
+ ASSERT3U(remove->r_cnt, ==, 0);
+ ASSERT3U(remove->r_write_wanted, ==, 0);
+ ASSERT3U(remove->r_read_wanted, ==, 0);
+ /*
+ * Find start proxy representing this reader lock,
+ * then decrement ref count on all proxies
+ * that make up this range, freeing them as needed.
+ */
+ rl = avl_find(tree, remove, NULL);
+ ASSERT(rl);
+ ASSERT(rl->r_cnt);
+ ASSERT(rl->r_type == RL_READER);
+ for (len = remove->r_len; len != 0; rl = next) {
+ len -= rl->r_len;
+ if (len) {
+ next = AVL_NEXT(tree, rl);
+ ASSERT(next);
+ ASSERT(rl->r_off + rl->r_len == next->r_off);
+ ASSERT(next->r_cnt);
+ ASSERT(next->r_type == RL_READER);
+ }
+ rl->r_cnt--;
+ if (rl->r_cnt == 0) {
+ avl_remove(tree, rl);
+ if (rl->r_write_wanted)
+ cv_broadcast(&rl->r_wr_cv);
+ if (rl->r_read_wanted)
+ cv_broadcast(&rl->r_rd_cv);
+ kmem_free(rl, sizeof (rl_t));
+ }
+ }
+ }
+ kmem_free(remove, sizeof (rl_t));
+}
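+
+/*
+ * For example (illustrative): unlocking a [0, 30) reader backed by
+ * proxies [0, 10) r_cnt 1, [10, 20) r_cnt 2 and [20, 30) r_cnt 1 frees
+ * the two proxies that drop to zero and leaves [10, 20) with r_cnt 1
+ * for the remaining reader.
+ */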
+
+/*
+ * Unlock range and destroy range lock structure.
+ */
+void
+zfs_range_unlock(rl_t *rl)
+{
+ znode_t *zp = rl->r_zp;
+
+ ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
+ ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
+ ASSERT(!rl->r_proxy);
+
+ mutex_enter(&zp->z_range_lock);
+ if (rl->r_type == RL_WRITER) {
+ /* writer locks can't be shared or split */
+ avl_remove(&zp->z_range_avl, rl);
+ mutex_exit(&zp->z_range_lock);
+ if (rl->r_write_wanted) {
+ cv_broadcast(&rl->r_wr_cv);
+ cv_destroy(&rl->r_wr_cv);
+ }
+ if (rl->r_read_wanted) {
+ cv_broadcast(&rl->r_rd_cv);
+ cv_destroy(&rl->r_rd_cv);
+ }
+ kmem_free(rl, sizeof (rl_t));
+ } else {
+ /*
+ * lock may be shared, let zfs_range_unlock_reader()
+ * release the lock and free the rl_t
+ */
+ zfs_range_unlock_reader(zp, rl);
+ mutex_exit(&zp->z_range_lock);
+ }
+}
+
+/*
+ * Reduce range locked as RL_WRITER from whole file to specified range.
+ * Asserts the whole file is exclusively locked and so there's only one
+ * entry in the tree.
+ */
+void
+zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
+{
+ znode_t *zp = rl->r_zp;
+
+ /* Ensure there are no other locks */
+ ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
+ ASSERT(rl->r_off == 0);
+ ASSERT(rl->r_type == RL_WRITER);
+ ASSERT(!rl->r_proxy);
+ ASSERT3U(rl->r_len, ==, UINT64_MAX);
+ ASSERT3U(rl->r_cnt, ==, 1);
+
+ mutex_enter(&zp->z_range_lock);
+ rl->r_off = off;
+ rl->r_len = len;
+ mutex_exit(&zp->z_range_lock);
+ if (rl->r_write_wanted)
+ cv_broadcast(&rl->r_wr_cv);
+ if (rl->r_read_wanted)
+ cv_broadcast(&rl->r_rd_cv);
+}
+
+/*
+ * AVL comparison function used to order range locks.
+ * Locks are ordered on the start offset of the range.
+ */
+int
+zfs_range_compare(const void *arg1, const void *arg2)
+{
+ const rl_t *rl1 = arg1;
+ const rl_t *rl2 = arg2;
+
+ if (rl1->r_off > rl2->r_off)
+ return (1);
+ if (rl1->r_off < rl2->r_off)
+ return (-1);
+ return (0);
+}
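+
+/*
+ * Illustrative setup (a sketch; it assumes rl_t embeds its AVL linkage
+ * as an avl_node_t named r_node, as declared in zfs_rlock.h):
+ *
+ *	avl_create(&zp->z_range_avl, zfs_range_compare,
+ *	    sizeof (rl_t), offsetof(rl_t, r_node));
+ */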
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
new file mode 100644
index 0000000..27e00c3
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -0,0 +1,986 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/acl.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/mntent.h>
+#include <sys/mount.h>
+#include <sys/cmn_err.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/varargs.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/dnlc.h>
+
+struct mtx atomic_mtx;
+MTX_SYSINIT(atomic, &atomic_mtx, "atomic", MTX_DEF);
+
+struct mtx zfs_debug_mtx;
+MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
+SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
+int zfs_debug_level = 0;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
+ "Debug level");
+
+static int zfs_mount(vfs_t *vfsp, kthread_t *td);
+static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td);
+static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td);
+static int zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td);
+static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
+static int zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td);
+static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
+static void zfs_objset_close(zfsvfs_t *zfsvfs);
+static void zfs_freevfs(vfs_t *vfsp);
+
+static struct vfsops zfs_vfsops = {
+ .vfs_mount = zfs_mount,
+ .vfs_unmount = zfs_umount,
+ .vfs_root = zfs_root,
+ .vfs_statfs = zfs_statfs,
+ .vfs_vget = zfs_vget,
+ .vfs_sync = zfs_sync,
+ .vfs_fhtovp = zfs_fhtovp,
+};
+
+VFS_SET(zfs_vfsops, zfs, VFCF_JAIL);
+
+/*
+ * We need to keep a count of active fs's.
+ * This is necessary to prevent our module
+ * from being unloaded after a umount -f
+ */
+static uint32_t zfs_active_fs_count = 0;
+
+/*ARGSUSED*/
+static int
+zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td)
+{
+
+ /*
+ * Data integrity is job one. We don't want a compromised kernel
+ * writing to the storage pool, so we never sync during panic.
+ */
+ if (panicstr)
+ return (0);
+
+ if (vfsp != NULL) {
+ /*
+ * Sync a specific filesystem.
+ */
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ int error;
+
+ error = vfs_stdsync(vfsp, waitfor, td);
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+ if (zfsvfs->z_log != NULL)
+ zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
+ else
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ ZFS_EXIT(zfsvfs);
+ } else {
+ /*
+ * Sync all ZFS filesystems. This is what happens when you
+ * run sync(1M). Unlike other filesystems, ZFS honors the
+ * request by waiting for all pools to commit all dirty data.
+ */
+ spa_sync_allpools();
+ }
+
+ return (0);
+}
+
+static void
+atime_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == TRUE) {
+ zfsvfs->z_atime = TRUE;
+ zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
+ } else {
+ zfsvfs->z_atime = FALSE;
+ zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
+ }
+}
+
+static void
+xattr_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == TRUE) {
+ /* XXX locking on vfs_flag? */
+#ifdef TODO
+ zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
+#endif
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
+ } else {
+ /* XXX locking on vfs_flag? */
+#ifdef TODO
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
+#endif
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
+ }
+}
+
+static void
+blksz_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval < SPA_MINBLOCKSIZE ||
+ newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
+ newval = SPA_MAXBLOCKSIZE;
+
+ zfsvfs->z_max_blksz = newval;
+ zfsvfs->z_vfs->vfs_bsize = newval;
+}
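+
+/*
+ * For example (illustrative): a recordsize of 96K fails the ISP2()
+ * check (it is not a power of two), so z_max_blksz falls back to
+ * SPA_MAXBLOCKSIZE (128K).
+ */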
+
+static void
+readonly_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval) {
+ /* XXX locking on vfs_flag? */
+ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
+ } else {
+ /* XXX locking on vfs_flag? */
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
+ }
+}
+
+static void
+setuid_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == FALSE) {
+ zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
+ } else {
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
+ }
+}
+
+static void
+exec_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == FALSE) {
+ zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
+ } else {
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
+ }
+}
+
+static void
+snapdir_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_show_ctldir = newval;
+}
+
+static void
+acl_mode_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_mode = newval;
+}
+
+static void
+acl_inherit_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_inherit = newval;
+}
+
+static int
+zfs_refresh_properties(vfs_t *vfsp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+ /*
+ * Remount operations default to "rw" unless "ro" is explicitly
+ * specified.
+ */
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ } else {
+ if (!dmu_objset_is_snapshot(zfsvfs->z_os))
+ readonly_changed_cb(zfsvfs, B_FALSE);
+ else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
+ return (EROFS);
+ }
+
+ if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
+ setuid_changed_cb(zfsvfs, B_FALSE);
+ } else {
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
+ setuid_changed_cb(zfsvfs, B_FALSE);
+ else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
+ setuid_changed_cb(zfsvfs, B_TRUE);
+ }
+
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
+ exec_changed_cb(zfsvfs, B_FALSE);
+ else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
+ exec_changed_cb(zfsvfs, B_TRUE);
+
+ if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
+ atime_changed_cb(zfsvfs, B_TRUE);
+ else if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
+ atime_changed_cb(zfsvfs, B_FALSE);
+
+ if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
+ xattr_changed_cb(zfsvfs, B_TRUE);
+ else if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL))
+ xattr_changed_cb(zfsvfs, B_FALSE);
+
+ return (0);
+}
+
+static int
+zfs_register_callbacks(vfs_t *vfsp)
+{
+ struct dsl_dataset *ds = NULL;
+ objset_t *os = NULL;
+ zfsvfs_t *zfsvfs = NULL;
+ int readonly, do_readonly = FALSE;
+ int setuid, do_setuid = FALSE;
+ int exec, do_exec = FALSE;
+ int xattr, do_xattr = FALSE;
+ int error = 0;
+
+ ASSERT(vfsp);
+ zfsvfs = vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ os = zfsvfs->z_os;
+
+ /*
+ * The act of registering our callbacks will destroy any mount
+ * options we may have. In order to enable temporary overrides
+ * of mount options, we stash away the current values and
+ * restore them after we register the callbacks.
+ */
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+ readonly = B_TRUE;
+ do_readonly = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+ readonly = B_FALSE;
+ do_readonly = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
+ setuid = B_FALSE;
+ do_setuid = B_TRUE;
+ } else {
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
+ setuid = B_FALSE;
+ do_setuid = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
+ setuid = B_TRUE;
+ do_setuid = B_TRUE;
+ }
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
+ exec = B_FALSE;
+ do_exec = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
+ exec = B_TRUE;
+ do_exec = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
+ xattr = B_FALSE;
+ do_xattr = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
+ xattr = B_TRUE;
+ do_xattr = B_TRUE;
+ }
+
+ /*
+ * Register property callbacks.
+ *
+ * It would probably be fine to just check for i/o error from
+ * the first prop_register(), but I guess I like to go
+ * overboard...
+ */
+ ds = dmu_objset_ds(os);
+ error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "xattr", xattr_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "recordsize", blksz_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "readonly", readonly_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "setuid", setuid_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "exec", exec_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "snapdir", snapdir_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "aclmode", acl_mode_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "aclinherit", acl_inherit_changed_cb, zfsvfs);
+ if (error)
+ goto unregister;
+
+ /*
+ * Invoke our callbacks to restore temporary mount options.
+ */
+ if (do_readonly)
+ readonly_changed_cb(zfsvfs, readonly);
+ if (do_setuid)
+ setuid_changed_cb(zfsvfs, setuid);
+ if (do_exec)
+ exec_changed_cb(zfsvfs, exec);
+ if (do_xattr)
+ xattr_changed_cb(zfsvfs, xattr);
+
+ return (0);
+
+unregister:
+ /*
+ * We may attempt to unregister some callbacks that are not
+ * registered, but this is OK; it will simply return ENOMSG,
+ * which we will ignore.
+ */
+ (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
+ zfsvfs);
+ return (error);
+}
+
+static int
+zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td)
+{
+ cred_t *cr = td->td_ucred;
+ uint64_t recordsize, readonly;
+ int error = 0;
+ int mode;
+ zfsvfs_t *zfsvfs;
+ znode_t *zp = NULL;
+
+ ASSERT(vfsp);
+ ASSERT(osname);
+
+ /*
+ * Initialize the zfs-specific filesystem structure.
+ * Should probably make this a kmem cache, shuffle fields,
+ * and just bzero up to z_hold_mtx[].
+ */
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+ zfsvfs->z_vfs = vfsp;
+ zfsvfs->z_parent = zfsvfs;
+ zfsvfs->z_assign = TXG_NOWAIT;
+ zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
+ zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+ rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL);
+
+ if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
+ NULL))
+ goto out;
+ zfsvfs->z_vfs->vfs_bsize = recordsize;
+
+ vfsp->vfs_data = zfsvfs;
+ vfsp->mnt_flag |= MNT_LOCAL;
+ vfsp->mnt_kern_flag |= MNTK_MPSAFE;
+ vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
+
+ if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
+ goto out;
+
+ if (readonly)
+ mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
+ else
+ mode = DS_MODE_PRIMARY;
+
+ error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
+ if (error == EROFS) {
+ /*
+		 * FreeBSD: In Solaris, DS_MODE_PRIMARY is used here instead
+		 * of DS_MODE_STANDARD, but that doesn't work on FreeBSD and
+		 * I don't know why. It looks like the dataset is opened at
+		 * mount time in DS_MODE_PRIMARY mode, and a snapshot cannot
+		 * open the same dataset in DS_MODE_PRIMARY mode again.
+ */
+ mode = DS_MODE_STANDARD | DS_MODE_READONLY;
+ error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
+ &zfsvfs->z_os);
+ }
+
+ if (error)
+ goto out;
+
+ if (error = zfs_init_fs(zfsvfs, &zp, cr))
+ goto out;
+
+ if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
+ uint64_t xattr;
+
+ ASSERT(mode & DS_MODE_READONLY);
+ atime_changed_cb(zfsvfs, B_FALSE);
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ if (error = dsl_prop_get_integer(osname, "xattr", &xattr, NULL))
+ goto out;
+ xattr_changed_cb(zfsvfs, xattr);
+ zfsvfs->z_issnap = B_TRUE;
+ } else {
+ error = zfs_register_callbacks(vfsp);
+ if (error)
+ goto out;
+
+ zfs_unlinked_drain(zfsvfs);
+
+ /*
+ * Parse and replay the intent log.
+ */
+ zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
+ zfs_replay_vector);
+
+ if (!zil_disable)
+ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+ }
+
+ vfs_mountedfrom(vfsp, osname);
+
+ if (!zfsvfs->z_issnap)
+ zfsctl_create(zfsvfs);
+out:
+ if (error) {
+ if (zfsvfs->z_os)
+ dmu_objset_close(zfsvfs->z_os);
+ rw_destroy(&zfsvfs->z_um_lock);
+ mutex_destroy(&zfsvfs->z_znodes_lock);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ } else {
+ atomic_add_32(&zfs_active_fs_count, 1);
+ }
+
+ return (error);
+}
+
+void
+zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
+{
+ objset_t *os = zfsvfs->z_os;
+ struct dsl_dataset *ds;
+
+ /*
+ * Unregister properties.
+ */
+ if (!dmu_objset_is_snapshot(os)) {
+ ds = dmu_objset_ds(os);
+ VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
+ zfsvfs) == 0);
+
+ VERIFY(dsl_prop_unregister(ds, "aclinherit",
+ acl_inherit_changed_cb, zfsvfs) == 0);
+ }
+}
+
+/*ARGSUSED*/
+static int
+zfs_mount(vfs_t *vfsp, kthread_t *td)
+{
+ char *from;
+ int error;
+
+ /* TODO: For now deny user mounts. */
+ if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
+ return (error);
+
+ /*
+ * When doing a remount, we simply refresh our temporary properties
+ * according to those options set in the current VFS options.
+ */
+ if (vfsp->vfs_flag & MS_REMOUNT)
+ return (zfs_refresh_properties(vfsp));
+
+ if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&from, NULL))
+ return (EINVAL);
+
+ return (zfs_domount(vfsp, from, td));
+}
+
+static int
+zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ uint64_t refdbytes, availbytes, usedobjs, availobjs;
+
+ statp->f_version = STATFS_VERSION;
+
+ ZFS_ENTER(zfsvfs);
+
+ dmu_objset_space(zfsvfs->z_os,
+ &refdbytes, &availbytes, &usedobjs, &availobjs);
+
+ /*
+ * The underlying storage pool actually uses multiple block sizes.
+ * We report the fragsize as the smallest block size we support,
+ * and we report our blocksize as the filesystem's maximum blocksize.
+ */
+ statp->f_bsize = zfsvfs->z_vfs->vfs_bsize;
+ statp->f_iosize = zfsvfs->z_vfs->vfs_bsize;
+
+ /*
+ * The following report "total" blocks of various kinds in the
+ * file system, but reported in terms of f_frsize - the
+ * "fragment" size.
+ */
+
+ statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize;
+ statp->f_bfree = availbytes / statp->f_bsize;
+ statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+ /*
+ * statvfs() should really be called statufs(), because it assumes
+ * static metadata. ZFS doesn't preallocate files, so the best
+ * we can do is report the max that could possibly fit in f_files,
+ * and that minus the number actually used in f_ffree.
+	 * For f_ffree, report the smaller of the number of objects available
+	 * and the number of blocks (each object will take at least a block).
+ */
+ statp->f_ffree = MIN(availobjs, statp->f_bfree);
+ statp->f_files = statp->f_ffree + usedobjs;
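+
+	/*
+	 * Worked example (illustrative): with 2GB referenced, 6GB
+	 * available and a 128K f_bsize, f_blocks = 8GB / 128K = 65536
+	 * and f_bfree = 6GB / 128K = 49152.
+	 */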
+
+ /*
+ * We're a zfs filesystem.
+ */
+ (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
+
+ strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
+ sizeof(statp->f_mntfromname));
+ strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
+ sizeof(statp->f_mntonname));
+
+ statp->f_namemax = ZFS_MAXNAMELEN;
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static int
+zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *rootzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+ if (error == 0) {
+ *vpp = ZTOV(rootzp);
+ error = vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+ (*vpp)->v_vflag |= VV_ROOT;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ cred_t *cr = td->td_ucred;
+ int ret;
+
+ if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0)
+ return (ret);
+
+ (void) dnlc_purge_vfsp(vfsp, 0);
+
+ /*
+ * Unmount any snapshots mounted under .zfs before unmounting the
+ * dataset itself.
+ */
+ if (zfsvfs->z_ctldir != NULL) {
+ if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
+ return (ret);
+ ret = vflush(vfsp, 0, 0, td);
+ ASSERT(ret == EBUSY);
+ if (!(fflag & MS_FORCE)) {
+ if (zfsvfs->z_ctldir->v_count > 1)
+ return (EBUSY);
+ ASSERT(zfsvfs->z_ctldir->v_count == 1);
+ }
+ zfsctl_destroy(zfsvfs);
+ ASSERT(zfsvfs->z_ctldir == NULL);
+ }
+
+ /*
+ * Flush all the files.
+ */
+ ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
+ if (ret != 0) {
+ if (!zfsvfs->z_issnap) {
+ zfsctl_create(zfsvfs);
+ ASSERT(zfsvfs->z_ctldir != NULL);
+ }
+ return (ret);
+ }
+
+ if (fflag & MS_FORCE) {
+ MNT_ILOCK(vfsp);
+ vfsp->mnt_kern_flag |= MNTK_UNMOUNTF;
+ MNT_IUNLOCK(vfsp);
+ zfsvfs->z_unmounted1 = B_TRUE;
+
+ /*
+ * Wait for all zfs threads to leave zfs.
+ * Grabbing a rwlock as reader in all vops and
+		 * as writer here doesn't work because it's too easy to get
+		 * multiple reader enters, as zfs can re-enter itself.
+		 * This can lead to deadlock if there is an intervening
+		 * rw_enter as writer.
+		 * So a count of file system threads (z_op_cnt) is used.
+ * A polling loop on z_op_cnt may seem inefficient, but
+ * - this saves all threads on exit from having to grab a
+ * mutex in order to cv_signal
+ * - only occurs on forced unmount in the rare case when
+ * there are outstanding threads within the file system.
+ */
+ while (zfsvfs->z_op_cnt) {
+ delay(1);
+ }
+ }
+
+ zfs_objset_close(zfsvfs);
+ VFS_RELE(vfsp);
+ zfs_freevfs(vfsp);
+
+ return (0);
+}
+
+static int
+zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *zp;
+ int err;
+
+ ZFS_ENTER(zfsvfs);
+ err = zfs_zget(zfsvfs, ino, &zp);
+ if (err == 0 && zp->z_unlinked) {
+ VN_RELE(ZTOV(zp));
+ err = EINVAL;
+ }
+ if (err != 0)
+ *vpp = NULL;
+ else {
+ *vpp = ZTOV(zp);
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
+ }
+ ZFS_EXIT(zfsvfs);
+	return (err);
+}
+
+static int
+zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
+{
+ kthread_t *td = curthread;
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *zp;
+ uint64_t object = 0;
+ uint64_t fid_gen = 0;
+ uint64_t gen_mask;
+ uint64_t zp_gen;
+ int i, err;
+
+ *vpp = NULL;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (fidp->fid_len == LONG_FID_LEN) {
+ zfid_long_t *zlfid = (zfid_long_t *)fidp;
+ uint64_t objsetid = 0;
+ uint64_t setgen = 0;
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
+
+ ZFS_EXIT(zfsvfs);
+
+ err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
+ if (err)
+ return (EINVAL);
+ ZFS_ENTER(zfsvfs);
+ }
+
+ if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
+ zfid_short_t *zfid = (zfid_short_t *)fidp;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
+ } else {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
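+
+	/*
+	 * Example (illustrative): zf_object bytes { 0x2a, 0x00, ... }
+	 * reassemble, least significant byte first, into object number
+	 * 0x2a (42); i is left holding the zf_gen width used to build
+	 * gen_mask below.
+	 */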
+
+ /* A zero fid_gen means we are in the .zfs control directories */
+ if (fid_gen == 0 &&
+ (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
+ *vpp = zfsvfs->z_ctldir;
+ ASSERT(*vpp != NULL);
+ if (object == ZFSCTL_INO_SNAPDIR) {
+ VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
+ 0, NULL, NULL) == 0);
+ } else {
+ VN_HOLD(*vpp);
+ }
+ ZFS_EXIT(zfsvfs);
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+ return (0);
+ }
+
+ gen_mask = -1ULL >> (64 - 8 * i);
+
+ dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
+ if (err = zfs_zget(zfsvfs, object, &zp)) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ zp_gen = zp->z_phys->zp_gen & gen_mask;
+ if (zp_gen == 0)
+ zp_gen = 1;
+ if (zp->z_unlinked || zp_gen != fid_gen) {
+ dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
+ VN_RELE(ZTOV(zp));
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ *vpp = ZTOV(zp);
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+ vnode_create_vobject(*vpp, zp->z_phys->zp_size, td);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static void
+zfs_objset_close(zfsvfs_t *zfsvfs)
+{
+ znode_t *zp, *nextzp;
+ objset_t *os = zfsvfs->z_os;
+
+ /*
+ * For forced unmount, at this point all vops except zfs_inactive
+	 * are erroring EIO. We now need to suspend zfs_inactive threads
+	 * while we are freeing dbufs, before switching zfs_inactive
+	 * to its behaviour without an objset.
+ */
+ rw_enter(&zfsvfs->z_um_lock, RW_WRITER);
+
+ /*
+ * Release all holds on dbufs
+ * Note, although we have stopped all other vop threads and
+ * zfs_inactive(), the dmu can callback via znode_pageout_func()
+ * which can zfs_znode_free() the znode.
+ * So we lock z_all_znodes; search the list for a held
+	 * dbuf; drop the lock (we know zp can't disappear if we hold
+	 * a dbuf lock); then regrab the lock and restart.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
+ nextzp = list_next(&zfsvfs->z_all_znodes, zp);
+ if (zp->z_dbuf_held) {
+ /* dbufs should only be held when force unmounting */
+ zp->z_dbuf_held = 0;
+ mutex_exit(&zfsvfs->z_znodes_lock);
+ dmu_buf_rele(zp->z_dbuf, NULL);
+ /* Start again */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ nextzp = list_head(&zfsvfs->z_all_znodes);
+ }
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ /*
+ * Unregister properties.
+ */
+ if (!dmu_objset_is_snapshot(os))
+ zfs_unregister_callbacks(zfsvfs);
+
+ /*
+ * Switch zfs_inactive to behaviour without an objset.
+ * It just tosses cached pages and frees the znode & vnode.
+ * Then re-enable zfs_inactive threads in that new behaviour.
+ */
+ zfsvfs->z_unmounted2 = B_TRUE;
+ rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */
+
+ /*
+ * Close the zil. Can't close the zil while zfs_inactive
+ * threads are blocked as zil_close can call zfs_inactive.
+ */
+ if (zfsvfs->z_log) {
+ zil_close(zfsvfs->z_log);
+ zfsvfs->z_log = NULL;
+ }
+
+ /*
+ * Evict all dbufs so that cached znodes will be freed
+ */
+ if (dmu_objset_evict_dbufs(os, 1)) {
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ (void) dmu_objset_evict_dbufs(os, 0);
+ }
+
+ /*
+ * Finally close the objset
+ */
+ dmu_objset_close(os);
+}
+
+static void
+zfs_freevfs(vfs_t *vfsp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ int i;
+
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+ rw_destroy(&zfsvfs->z_um_lock);
+ mutex_destroy(&zfsvfs->z_znodes_lock);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+
+ atomic_add_32(&zfs_active_fs_count, -1);
+}
+
+void
+zfs_init(void)
+{
+
+ printf("ZFS filesystem version " ZFS_VERSION_STRING "\n");
+
+ /*
+ * Initialize .zfs directory structures
+ */
+ zfsctl_init();
+
+ /*
+ * Initialize znode cache, vnode ops, etc...
+ */
+ zfs_znode_init();
+}
+
+void
+zfs_fini(void)
+{
+ zfsctl_fini();
+ zfs_znode_fini();
+}
+
+int
+zfs_busy(void)
+{
+ return (zfs_active_fs_count != 0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
new file mode 100644
index 0000000..6769177
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -0,0 +1,3227 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/uio.h>
+#include <sys/atomic.h>
+#include <sys/namei.h>
+#include <sys/mman.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/dirent.h>
+#include <sys/filio.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/dnlc.h>
+#include <sys/zfs_rlock.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/sf_buf.h>
+#include <sys/sched.h>
+
+/*
+ * Programming rules.
+ *
+ * Each vnode op performs some logical unit of work. To do this, the ZPL must
+ * properly lock its in-core state, create a DMU transaction, do the work,
+ * record this work in the intent log (ZIL), commit the DMU transaction,
+ * and wait for the intent log to commit if it is a synchronous operation.
+ * Moreover, the vnode ops must work in both normal and log replay context.
+ * The ordering of events is important to avoid deadlocks and references
+ * to freed memory. The example below illustrates the following Big Rules:
+ *
+ * (1) A check must be made in each zfs thread for a mounted file system.
+ *	This is done, avoiding races, using ZFS_ENTER(zfsvfs).
+ * A ZFS_EXIT(zfsvfs) is needed before all returns.
+ *
+ * (2) VN_RELE() should always be the last thing except for zil_commit()
+ * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
+ * First, if it's the last reference, the vnode/znode
+ * can be freed, so the zp may point to freed memory. Second, the last
+ * reference will call zfs_zinactive(), which may induce a lot of work --
+ * pushing cached pages (which acquires range locks) and syncing out
+ * cached atime changes. Third, zfs_zinactive() may require a new tx,
+ * which could deadlock the system if you were already holding one.
+ *
+ * (3) All range locks must be grabbed before calling dmu_tx_assign(),
+ * as they can span dmu_tx_assign() calls.
+ *
+ * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
+ * In normal operation, this will be TXG_NOWAIT. During ZIL replay,
+ * it will be a specific txg. Either way, dmu_tx_assign() never blocks.
+ * This is critical because we don't want to block while holding locks.
+ * Note, in particular, that if a lock is sometimes acquired before
+ * the tx assigns, and sometimes after (e.g. z_lock), then failing to
+ * use a non-blocking assign can deadlock the system. The scenario:
+ *
+ * Thread A has grabbed a lock before calling dmu_tx_assign().
+ * Thread B is in an already-assigned tx, and blocks for this lock.
+ * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
+ * forever, because the previous txg can't quiesce until B's tx commits.
+ *
+ * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+ * then drop all locks, call dmu_tx_wait(), and try again.
+ *
+ * (5) If the operation succeeded, generate the intent log entry for it
+ * before dropping locks. This ensures that the ordering of events
+ * in the intent log matches the order in which they actually occurred.
+ *
+ * (6) At the end of each vnode op, the DMU tx must always commit,
+ * regardless of whether there were any errors.
+ *
+ * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid)
+ * to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ * ZFS_ENTER(zfsvfs); // exit if unmounted
+ * top:
+ * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
+ * rw_enter(...); // grab any other locks you need
+ * tx = dmu_tx_create(...); // get DMU tx
+ * dmu_tx_hold_*(); // hold each object you might modify
+ * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign
+ * if (error) {
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ * dmu_tx_wait(tx);
+ * dmu_tx_abort(tx);
+ * goto top;
+ * }
+ * dmu_tx_abort(tx); // abort DMU tx
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // really out of space
+ * }
+ * error = do_real_work(); // do whatever this VOP does
+ * if (error == 0)
+ * zfs_log_*(...); // on success, make ZIL entry
+ * dmu_tx_commit(tx); // commit DMU tx -- error or not
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * zil_commit(zilog, seq, foid); // synchronous when necessary
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // done, report error
+ */
+/* ARGSUSED */
+static int
+zfs_open(ap)
+ struct vop_open_args /* {
+ struct vnode *a_vp;
+ int a_mode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ int flag = ap->a_mode;
+
+ /* Keep a count of the synchronous opens in the znode */
+ if (flag & FFSYNC) {
+ atomic_inc_32(&zp->z_sync_cnt);
+ ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
+ }
+
+ vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_close(ap)
+ struct vop_close_args /* {
+ struct vnode *a_vp;
+ int a_fflag;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ znode_t *zp = VTOZ(ap->a_vp);
+ int flag = ap->a_fflag;
+
+ /* Decrement the synchronous opens in the znode */
+ if (flag & FFSYNC) {
+ atomic_dec_32(&zp->z_sync_cnt);
+ ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
+ }
+
+ return (0);
+}
+
+/*
+ * Lseek support for finding holes (cmd == FIOSEEKHOLE) and
+ * data (cmd == FIOSEEKDATA). "off" is an in/out parameter.
+ */
+static int
+zfs_holey(vnode_t *vp, int cmd, offset_t *off)
+{
+ znode_t *zp = VTOZ(vp);
+ uint64_t noff = (uint64_t)*off; /* new offset */
+ uint64_t file_sz;
+ int error;
+ boolean_t hole;
+
+ file_sz = zp->z_phys->zp_size;
+ if (noff >= file_sz) {
+ return (ENXIO);
+ }
+
+ if (cmd == FIOSEEKHOLE)
+ hole = B_TRUE;
+ else
+ hole = B_FALSE;
+
+ error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
+
+ /* end of file? */
+ if ((error == ESRCH) || (noff > file_sz)) {
+ /*
+ * Handle the virtual hole at the end of file.
+ */
+ if (hole) {
+ *off = file_sz;
+ return (0);
+ }
+ return (ENXIO);
+ }
+
+ if (noff < *off)
+ return (error);
+ *off = noff;
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_ioctl(ap)
+ struct vop_ioctl_args /* {
+ struct vnode *a_vp;
+ u_long a_command;
+ caddr_t a_data;
+ int fflag;
+ struct ucred *cred;
+ struct thread *td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ int com = ap->a_command;
+ caddr_t data = ap->a_data;
+ offset_t off;
+ zfsvfs_t *zfsvfs;
+ int error;
+
+ switch (com) {
+ case FIOSEEKDATA:
+ case FIOSEEKHOLE:
+ off = *(offset_t *)data;
+
+ zfsvfs = VTOZ(vp)->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+
+ /* offset parameter is in/out */
+ error = zfs_holey(vp, com, &off);
+ ZFS_EXIT(zfsvfs);
+ if (error)
+ return (error);
+ *(offset_t *)data = off;
+ return (0);
+ }
+ return (ENOTTY);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Write: If we find a memory mapped page, we write to *both*
+ * the page and the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ * the file is memory mapped.
+ */
+static int
+mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
+{
+ znode_t *zp = VTOZ(vp);
+ objset_t *os = zp->z_zfsvfs->z_os;
+ vm_object_t obj;
+ vm_page_t m;
+ struct sf_buf *sf;
+ int64_t start, off;
+ int len = nbytes;
+ int error = 0;
+
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+
+ start = uio->uio_loffset;
+ off = start & PAGEOFFSET;
+ VM_OBJECT_LOCK(obj);
+ for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+ uint64_t bytes = MIN(PAGESIZE - off, len);
+ uint64_t woff = uio->uio_loffset;
+
+again:
+ if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+ vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
+ caddr_t va;
+
+ if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
+ goto again;
+ vm_page_busy(m);
+ VM_OBJECT_UNLOCK(obj);
+ sched_pin();
+ sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
+ va = (caddr_t)sf_buf_kva(sf);
+ error = uiomove(va+off, bytes, UIO_WRITE, uio);
+ if (error == 0)
+ dmu_write(os, zp->z_id, woff, bytes, va+off, tx);
+ sf_buf_free(sf);
+ sched_unpin();
+ VM_OBJECT_LOCK(obj);
+ vm_page_wakeup(m);
+ } else {
+ VM_OBJECT_UNLOCK(obj);
+ error = dmu_write_uio(os, zp->z_id, uio, bytes, tx);
+ VM_OBJECT_LOCK(obj);
+ }
+ len -= bytes;
+ off = 0;
+ if (error)
+ break;
+ }
+ VM_OBJECT_UNLOCK(obj);
+ return (error);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Read: We "read" preferentially from memory mapped pages,
+ * else we default from the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ * the file is memory mapped.
+ */
+static int
+mappedread(vnode_t *vp, int nbytes, uio_t *uio)
+{
+ znode_t *zp = VTOZ(vp);
+ objset_t *os = zp->z_zfsvfs->z_os;
+ vm_object_t obj;
+ vm_page_t m;
+ struct sf_buf *sf;
+ int64_t start, off;
+ int len = nbytes;
+ int error = 0;
+
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+
+ start = uio->uio_loffset;
+ off = start & PAGEOFFSET;
+ VM_OBJECT_LOCK(obj);
+ for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+ uint64_t bytes = MIN(PAGESIZE - off, len);
+
+again:
+ if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+ vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
+ caddr_t va;
+
+ if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
+ goto again;
+ vm_page_busy(m);
+ VM_OBJECT_UNLOCK(obj);
+ sched_pin();
+ sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
+ va = (caddr_t)sf_buf_kva(sf);
+ error = uiomove(va + off, bytes, UIO_READ, uio);
+ sf_buf_free(sf);
+ sched_unpin();
+ VM_OBJECT_LOCK(obj);
+ vm_page_wakeup(m);
+ } else {
+ VM_OBJECT_UNLOCK(obj);
+ error = dmu_read_uio(os, zp->z_id, uio, bytes);
+ VM_OBJECT_LOCK(obj);
+ }
+ len -= bytes;
+ off = 0;
+ if (error)
+ break;
+ }
+ VM_OBJECT_UNLOCK(obj);
+ return (error);
+}
+
+offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
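+
+/*
+ * For example (illustrative): a read starting at offset 0x180000 (1.5MB)
+ * with the default 1MB chunk size has P2PHASE(off, chunk) = 0x80000, so
+ * zfs_read() caps its first chunk at 512K and subsequent chunks start
+ * chunk-aligned.
+ */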
+
+/*
+ * Read bytes from specified file into supplied buffer.
+ *
+ * IN: vp - vnode of file to be read from.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * ioflag - SYNC flags; used to provide FRSYNC semantics.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Side Effects:
+ * vp - atime updated if byte count > 0
+ */
+/* ARGSUSED */
+static int
+zfs_read(ap)
+ struct vop_read_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ struct ucred *a_cred;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ uio_t *uio = ap->a_uio;
+ int ioflag = ap->a_ioflag;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ ssize_t n, nbytes;
+ int error;
+ rl_t *rl;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * Validate file offset
+ */
+ if (uio->uio_loffset < (offset_t)0) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+ * Fasttrack empty reads
+ */
+ if (uio->uio_resid == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /*
+ * If we're in FRSYNC mode, sync out this znode before reading it.
+ */
+ if (ioflag & IO_SYNC)
+ zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
+
+ /*
+ * Lock the range against changes.
+ */
+ rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
+
+ /*
+ * If we are reading past end-of-file we can skip
+ * to the end; but we might still need to set atime.
+ */
+ if (uio->uio_loffset >= zp->z_phys->zp_size) {
+ error = 0;
+ goto out;
+ }
+
+ ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
+ n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
+
+ while (n > 0) {
+ nbytes = MIN(n, zfs_read_chunk_size -
+ P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
+
+ if (vn_has_cached_data(vp))
+ error = mappedread(vp, nbytes, uio);
+ else
+ error = dmu_read_uio(os, zp->z_id, uio, nbytes);
+ if (error)
+ break;
+ n -= nbytes;
+ }
+
+out:
+ zfs_range_unlock(rl);
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Fault in the pages of the first n bytes specified by the uio structure.
+ * 1 byte in each page is touched and the uio struct is unmodified.
+ * Any error will exit this routine as this is only a best-effort
+ * attempt to get the pages resident. This is a copy of ufs_trans_touch().
+ */
+static void
+zfs_prefault_write(ssize_t n, struct uio *uio)
+{
+ struct iovec *iov;
+ ulong_t cnt, incr;
+ caddr_t p;
+
+ if (uio->uio_segflg != UIO_USERSPACE)
+ return;
+
+ iov = uio->uio_iov;
+
+ while (n) {
+ cnt = MIN(iov->iov_len, n);
+ if (cnt == 0) {
+ /* empty iov entry */
+ iov++;
+ continue;
+ }
+ n -= cnt;
+ /*
+ * touch each page in this segment.
+ */
+ p = iov->iov_base;
+ while (cnt) {
+ if (fubyte(p) == -1)
+ return;
+ incr = MIN(cnt, PAGESIZE);
+ p += incr;
+ cnt -= incr;
+ }
+ /*
+ * touch the last byte in case it straddles a page.
+ */
+ p--;
+ if (fubyte(p) == -1)
+ return;
+ iov++;
+ }
+}
+
+/*
+ * Write the bytes to a file.
+ *
+ * IN: vp - vnode of file to be written to.
+ * uio - structure supplying write location, range info,
+ * and data buffer.
+ * ioflag - IO_APPEND flag set if in append mode.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime|mtime updated if byte count > 0
+ */
+/* ARGSUSED */
+static int
+zfs_write(ap)
+ struct vop_write_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ struct ucred *a_cred;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ uio_t *uio = ap->a_uio;
+ int ioflag = ap->a_ioflag;
+ cred_t *cr = ap->a_cred;
+ znode_t *zp = VTOZ(vp);
+ ssize_t start_resid = uio->uio_resid;
+ ssize_t tx_bytes;
+ uint64_t end_size;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ offset_t woff;
+ ssize_t n, nbytes;
+ rl_t *rl;
+ int max_blksz = zfsvfs->z_max_blksz;
+ int error;
+
+ /*
+ * Fasttrack empty write
+ */
+ n = start_resid;
+ if (n == 0)
+ return (0);
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
+	 * don't hold up the txg.
+ */
+ zfs_prefault_write(n, uio);
+
+ /*
+ * If in append mode, set the io offset pointer to eof.
+ */
+ if (ioflag & IO_APPEND) {
+ /*
+ * Range lock for a file append:
+ * The value for the start of range will be determined by
+ * zfs_range_lock() (to guarantee append semantics).
+ * If this write will cause the block size to increase,
+ * zfs_range_lock() will lock the entire file, so we must
+ * later reduce the range after we grow the block size.
+ */
+ rl = zfs_range_lock(zp, 0, n, RL_APPEND);
+ if (rl->r_len == UINT64_MAX) {
+ /* overlocked, zp_size can't change */
+ woff = uio->uio_loffset = zp->z_phys->zp_size;
+ } else {
+ woff = uio->uio_loffset = rl->r_off;
+ }
+ } else {
+ woff = uio->uio_loffset;
+ /*
+ * Validate file offset
+ */
+ if (woff < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+ * If we need to grow the block size then zfs_range_lock()
+ * will lock a wider range than we request here.
+ * Later after growing the block size we reduce the range.
+ */
+ rl = zfs_range_lock(zp, woff, n, RL_WRITER);
+ }
+
+ end_size = MAX(zp->z_phys->zp_size, woff + n);
+
+ /*
+ * Write the file in reasonable size chunks. Each chunk is written
+ * in a separate transaction; this keeps the intent log records small
+ * and allows us to do more fine-grained space accounting.
+ */
+ while (n > 0) {
+ /*
+ * Start a transaction.
+ */
+ woff = uio->uio_loffset;
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ if (error == ERESTART &&
+ zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ continue;
+ }
+ dmu_tx_abort(tx);
+ break;
+ }
+
+ /*
+ * If zfs_range_lock() over-locked we grow the blocksize
+ * and then reduce the lock range. This will only happen
+ * on the first iteration since zfs_range_reduce() will
+ * shrink down r_len to the appropriate size.
+ */
+ if (rl->r_len == UINT64_MAX) {
+ uint64_t new_blksz;
+
+ if (zp->z_blksz > max_blksz) {
+ ASSERT(!ISP2(zp->z_blksz));
+ new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+ } else {
+ new_blksz = MIN(end_size, max_blksz);
+ }
+ zfs_grow_blocksize(zp, new_blksz, tx);
+ zfs_range_reduce(rl, woff, n);
+ }
+
+ /*
+ * XXX - should we really limit each write to z_max_blksz?
+ * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+ */
+ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+ rw_enter(&zp->z_map_lock, RW_READER);
+
+ tx_bytes = uio->uio_resid;
+ if (woff + nbytes > zp->z_phys->zp_size)
+ vnode_pager_setsize(vp, woff + nbytes);
+
+ if (vn_has_cached_data(vp)) {
+ rw_exit(&zp->z_map_lock);
+ error = mappedwrite(vp, nbytes, uio, tx);
+ } else {
+ error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
+ uio, nbytes, tx);
+ rw_exit(&zp->z_map_lock);
+ }
+ tx_bytes -= uio->uio_resid;
+
+ /*
+ * If we made no progress, we're done. If we made even
+ * partial progress, update the znode and ZIL accordingly.
+ */
+ if (tx_bytes == 0) {
+ dmu_tx_commit(tx);
+ ASSERT(error != 0);
+ break;
+ }
+
+ /*
+		 * Clear Set-UID/Set-GID bits on successful write if not
+		 * privileged and at least one of the execute bits is set.
+		 *
+		 * It would be nice to do this after all writes have
+		 * been done, but that would still expose the ISUID/ISGID
+		 * to another app after the partial write is committed.
+ */
+ mutex_enter(&zp->z_acl_lock);
+ if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
+ (S_IXUSR >> 6))) != 0 &&
+ (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(cr,
+ (zp->z_phys->zp_mode & S_ISUID) != 0 &&
+ zp->z_phys->zp_uid == 0) != 0) {
+ zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ /*
+ * Update time stamp. NOTE: This marks the bonus buffer as
+ * dirty, so we don't have to do it again for zp_size.
+ */
+ zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+
+ /*
+ * Update the file size (zp_size) if it has changed;
+ * account for possible concurrent updates.
+ */
+ while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
+ (void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
+ uio->uio_loffset);
+ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
+ dmu_tx_commit(tx);
+
+ if (error != 0)
+ break;
+ ASSERT(tx_bytes == nbytes);
+ n -= nbytes;
+ }
+
+ zfs_range_unlock(rl);
+
+ /*
+ * If we're in replay mode, or we made no progress, return error.
+ * Otherwise, it's at least a partial write, so it's successful.
+ */
+ if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (ioflag & IO_SYNC)
+ zil_commit(zilog, zp->z_last_itx, zp->z_id);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+void
+zfs_get_done(dmu_buf_t *db, void *vzgd)
+{
+ zgd_t *zgd = (zgd_t *)vzgd;
+ rl_t *rl = zgd->zgd_rl;
+ vnode_t *vp = ZTOV(rl->r_zp);
+ int vfslocked;
+
+ vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
+ dmu_buf_rele(db, vzgd);
+ zfs_range_unlock(rl);
+ VN_RELE(vp);
+ zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
+ kmem_free(zgd, sizeof (zgd_t));
+ VFS_UNLOCK_GIANT(vfslocked);
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+{
+ zfsvfs_t *zfsvfs = arg;
+ objset_t *os = zfsvfs->z_os;
+ znode_t *zp;
+ uint64_t off = lr->lr_offset;
+ dmu_buf_t *db;
+ rl_t *rl;
+ zgd_t *zgd;
+ int dlen = lr->lr_length; /* length of user data */
+ int error = 0;
+
+ ASSERT(zio);
+ ASSERT(dlen != 0);
+
+ /*
+ * Nothing to do if the file has been removed
+ */
+ if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
+ return (ENOENT);
+ if (zp->z_unlinked) {
+ VN_RELE(ZTOV(zp));
+ return (ENOENT);
+ }
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) { /* immediate write */
+ rl = zfs_range_lock(zp, off, dlen, RL_READER);
+ /* test for truncation needs to be done while range locked */
+ if (off >= zp->z_phys->zp_size) {
+ error = ENOENT;
+ goto out;
+ }
+ VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
+ } else { /* indirect write */
+ uint64_t boff; /* block starting offset */
+
+		/*
+		 * We have to lock the whole block to ensure that no one can
+		 * change the data while it is being written out and its
+		 * checksum is being calculated.  We need to re-check the
+		 * blocksize after we get the lock in case it has changed!
+		 */
+ for (;;) {
+ if (ISP2(zp->z_blksz)) {
+ boff = P2ALIGN_TYPED(off, zp->z_blksz,
+ uint64_t);
+ } else {
+ boff = 0;
+ }
+ dlen = zp->z_blksz;
+ rl = zfs_range_lock(zp, boff, dlen, RL_READER);
+ if (zp->z_blksz == dlen)
+ break;
+ zfs_range_unlock(rl);
+ }
+ /* test for truncation needs to be done while range locked */
+ if (off >= zp->z_phys->zp_size) {
+ error = ENOENT;
+ goto out;
+ }
+ zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_rl = rl;
+ zgd->zgd_zilog = zfsvfs->z_log;
+ zgd->zgd_bp = &lr->lr_blkptr;
+ VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
+ ASSERT(boff == db->db_offset);
+ lr->lr_blkoff = off - boff;
+ error = dmu_sync(zio, db, &lr->lr_blkptr,
+ lr->lr_common.lrc_txg, zfs_get_done, zgd);
+ ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz);
+ if (error == 0) {
+ zil_add_vdev(zfsvfs->z_log,
+ DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
+ }
+ /*
+ * If we get EINPROGRESS, then we need to wait for a
+ * write IO initiated by dmu_sync() to complete before
+ * we can release this dbuf. We will finish everything
+ * up in the zfs_get_done() callback.
+ */
+ if (error == EINPROGRESS)
+ return (0);
+ dmu_buf_rele(db, zgd);
+ kmem_free(zgd, sizeof (zgd_t));
+ }
+out:
+ zfs_range_unlock(rl);
+ VN_RELE(ZTOV(zp));
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+zfs_access(ap)
+ struct vop_access_args /* {
+ struct vnode *a_vp;
+ int a_mode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ error = zfs_zaccess_rwx(zp, ap->a_mode, ap->a_cred);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held vnode reference for it.
+ *
+ * IN: dvp - vnode of directory to search.
+ * nm - name of entry to lookup.
+ * pnp - full pathname to lookup [UNUSED].
+ * flags - LOOKUP_XATTR set if looking for an attribute.
+ * rdir - root directory vnode [UNUSED].
+ * cr - credentials of caller.
+ *
+ * OUT: vpp - vnode of located entry, NULL if not found.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * NA
+ */
+/* ARGSUSED */
+int
+zfs_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ kthread_t *td = ap->a_cnp->cn_thread;
+ struct componentname *cnp = ap->a_cnp;
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ u_long flags = cnp->cn_flags;
+ cred_t *cr = cnp->cn_cred;
+ int nameiop = cnp->cn_nameiop;
+ char nm[NAME_MAX + 1];
+ znode_t *zdp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+ int error;
+
+ ASSERT(cnp->cn_namelen < sizeof(nm));
+ strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
+
+ ZFS_ENTER(zfsvfs);
+
+ *vpp = NULL;
+
+#ifdef TODO
+ if (flags & LOOKUP_XATTR) {
+ /*
+ * If the xattr property is off, refuse the lookup request.
+ */
+ if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+		 * We don't allow recursive attributes.
+ * Maybe someday we will.
+ */
+ if (zdp->z_phys->zp_flags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Do we have permission to get into attribute directory?
+ */
+
+ if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) {
+ VN_RELE(*vpp);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+#endif /* TODO */
+
+ if (dvp->v_type != VDIR) {
+ ZFS_EXIT(zfsvfs);
+ return (ENOTDIR);
+ }
+
+ /*
+ * Check accessibility of directory.
+ */
+
+ if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_dirlook(zdp, nm, vpp);
+
+ ZFS_EXIT(zfsvfs);
+
+ /* Translate errors and add SAVENAME when needed. */
+ if (cnp->cn_flags & ISLASTCN) {
+ switch (nameiop) {
+ case CREATE:
+ case RENAME:
+ if (error == ENOENT) {
+ error = EJUSTRETURN;
+ cnp->cn_flags |= SAVENAME;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DELETE:
+ if (error == 0)
+ cnp->cn_flags |= SAVENAME;
+ break;
+ }
+ }
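+	/*
+	 * Lock the vnode we found.  A ".." lookup targets the parent of
+	 * dvp, so locking it while dvp is held would invert the usual
+	 * parent-before-child lock order; drop the lock on dvp first and
+	 * re-acquire it once *vpp is locked.
+	 */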
+ if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
+ if (flags & ISDOTDOT)
+ VOP_UNLOCK(dvp, 0, td);
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
+ if (flags & ISDOTDOT)
+ vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+ }
+
+#ifdef FREEBSD_NAMECACHE
+ /*
+ * Insert name into cache (as non-existent) if appropriate.
+ */
+ if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
+ cache_enter(dvp, *vpp, cnp);
+ /*
+ * Insert name into cache if appropriate.
+ */
+ if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
+ if (!(cnp->cn_flags & ISLASTCN) ||
+ (nameiop != DELETE && nameiop != RENAME)) {
+ cache_enter(dvp, *vpp, cnp);
+ }
+ }
+#endif
+
+ return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory. If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error. Return the vp of the created or trunc'd file.
+ *
+ * IN: dvp - vnode of directory to put new file entry in.
+ * name - name of new file entry.
+ * vap - attributes of new file.
+ * excl - flag indicating exclusive or non-exclusive mode.
+ * mode - mode to open file with.
+ * cr - credentials of caller.
+ * flag - large file flag [UNUSED].
+ *
+ * OUT: vpp - vnode of created or trunc'd entry.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated if new entry created
+ * vp - ctime|mtime always, atime if new
+ */
+/* ARGSUSED */
+static int
+zfs_create(ap)
+ struct vop_create_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap;
+{
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ vattr_t *vap = ap->a_vap;
+ cred_t *cr = ap->a_cnp->cn_cred;
+ char *name = ap->a_cnp->cn_nameptr;
+ znode_t *zp, *dzp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ objset_t *os = zfsvfs->z_os;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+ uint64_t zoid;
+ int mode;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ vattr_init_mask(vap);
+ mode = vap->va_mode & ALLPERMS;
+
+ ZFS_ENTER(zfsvfs);
+
+top:
+ *vpp = NULL;
+
+ if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
+ vap->va_mode &= ~VSVTX;
+
+ if (*name == '\0') {
+ /*
+ * Null component name refers to the directory itself.
+ */
+ VN_HOLD(dvp);
+ zp = dzp;
+ dl = NULL;
+ error = 0;
+ } else {
+ /* possible VN_HOLD(zp) */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) {
+ if (strcmp(name, "..") == 0)
+ error = EISDIR;
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ zoid = zp ? zp->z_id : -1ULL;
+
+ if (zp == NULL) {
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
+ goto out;
+ }
+
+ /*
+ * We only support the creation of regular files in
+ * extended attribute directories.
+ */
+ if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
+ (vap->va_type != VREG)) {
+ error = EINVAL;
+ goto out;
+ }
+
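+		/*
+		 * Reserve space in the transaction for the new znode's
+		 * bonus buffer, the parent's bonus buffer (size and
+		 * timestamp updates), the directory ZAP entry, and, when
+		 * ACEs are inherited, the new file's external ACL block.
+		 */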
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, SPA_MAXBLOCKSIZE);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART &&
+ zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
+ ASSERT(zp->z_id == zoid);
+ (void) zfs_link_create(dl, zp, tx, ZNEW);
+ zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name);
+ dmu_tx_commit(tx);
+ } else {
+ /*
+ * A directory entry already exists for this name.
+ */
+
+ /*
+ * Can't open a directory for writing.
+ */
+ if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
+ error = EISDIR;
+ goto out;
+ }
+ /*
+ * Verify requested access to file.
+ */
+ if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) {
+ goto out;
+ }
+
+ mutex_enter(&dzp->z_lock);
+ dzp->z_seq++;
+ mutex_exit(&dzp->z_lock);
+
+ /*
+ * Truncate regular files if requested.
+ */
+ if ((ZTOV(zp)->v_type == VREG) &&
+ (zp->z_phys->zp_size != 0) &&
+ (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
+ error = zfs_freesp(zp, 0, 0, mode, TRUE);
+ if (error == ERESTART &&
+ zfsvfs->z_assign == TXG_NOWAIT) {
+ /* NB: we already did dmu_tx_wait() */
+ zfs_dirent_unlock(dl);
+ VN_RELE(ZTOV(zp));
+ goto top;
+ }
+ }
+ }
+out:
+
+ if (error == 0) {
+ *vpp = ZTOV(zp);
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
+ }
+
+ if (dl)
+ zfs_dirent_unlock(dl);
+
+ if (error) {
+ if (zp)
+ VN_RELE(ZTOV(zp));
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ * IN: dvp - vnode of directory to remove entry from.
+ * name - name of entry to remove.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime
+ * vp - ctime (if nlink > 0)
+ */
+static int
+zfs_remove(ap)
+ struct vop_remove_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ vnode_t *dvp = ap->a_dvp;
+ cred_t *cr = ap->a_cnp->cn_cred;
+ char *name = ap->a_cnp->cn_nameptr;
+ znode_t *zp, *dzp = VTOZ(dvp);
+#if 0
+ znode_t *xzp = NULL;
+#endif
+ vnode_t *vp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t xattr_obj;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ boolean_t unlinked;
+ int error;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ ZFS_ENTER(zfsvfs);
+
+top:
+ /*
+ * Attempt to lock directory; fail if entry doesn't exist.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ vp = ZTOV(zp);
+ ASSERT(vp == ap->a_vp);
+ ASSERT(vp->v_type != VDIR);
+ /* Drop an extra reference from zfs_dirent_lock(). */
+ VN_RELE(vp);
+ ASSERT(vp->v_count > 0);
+
+ if (error = zfs_zaccess_delete(dzp, zp, cr))
+ goto out;
+
+ dnlc_remove(dvp, name);
+
+ /*
+ * We may delete the znode now, or we may put it in the unlinked set;
+ * it depends on whether we're the last link, and on whether there are
+ * other holds on the vnode. So we dmu_tx_hold() the right things to
+ * allow for either case.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+
+ /* are there any extended attributes? */
+ if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
+ /* XXX - do we need this if we are deleting? */
+ dmu_tx_hold_bonus(tx, xattr_obj);
+ }
+
+ /* charge as an update -- would be nice not to charge at all */
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Remove the directory entry.
+ */
+ error = zfs_link_destroy(dl, zp, tx, 0, &unlinked);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ goto out;
+ }
+
+ if (unlinked)
+ zfs_unlinked_add(zp, tx);
+
+ zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name);
+
+ dmu_tx_commit(tx);
+out:
+ zfs_dirent_unlock(dl);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Create a new directory and insert it into dvp using the name
+ * provided. Return a pointer to the inserted directory.
+ *
+ * IN: dvp - vnode of directory to add subdir to.
+ * dirname - name of new directory.
+ * vap - attributes of new directory.
+ * cr - credentials of caller.
+ *
+ * OUT: vpp - vnode of created directory.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ * vp - ctime|mtime|atime updated
+ */
+static int
+zfs_mkdir(ap)
+ struct vop_mkdir_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ } */ *ap;
+{
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ vattr_t *vap = ap->a_vap;
+ cred_t *cr = ap->a_cnp->cn_cred;
+ char *dirname = ap->a_cnp->cn_nameptr;
+ znode_t *zp, *dzp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ zfs_dirlock_t *dl;
+ uint64_t zoid = 0;
+ dmu_tx_t *tx;
+ int error;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+ ASSERT(vap->va_type == VDIR);
+ vattr_init_mask(vap);
+
+ ZFS_ENTER(zfsvfs);
+
+ if (dzp->z_phys->zp_flags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+top:
+ *vpp = NULL;
+
+ /*
+ * First make sure the new directory doesn't exist.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) {
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Add a new entry to the directory.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, SPA_MAXBLOCKSIZE);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create new node.
+ */
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
+
+ /*
+ * Now put new name in parent dir.
+ */
+ (void) zfs_link_create(dl, zp, tx, ZNEW);
+
+ *vpp = ZTOV(zp);
+
+ zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname);
+ dmu_tx_commit(tx);
+
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, ap->a_cnp->cn_thread);
+
+ zfs_dirent_unlock(dl);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Remove a subdirectory entry.  If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ * IN: dvp - vnode of directory to remove from.
+ * name - name of directory to be removed.
+ * cwd - vnode of current working directory.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+static int
+zfs_rmdir(ap)
+ struct vop_rmdir_args /* {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ vnode_t *dvp = ap->a_dvp;
+ cred_t *cr = ap->a_cnp->cn_cred;
+ char *name = ap->a_cnp->cn_nameptr;
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp;
+ vnode_t *vp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ ZFS_ENTER(zfsvfs);
+top:
+ zp = NULL;
+
+ /*
+ * Attempt to lock directory; fail if entry doesn't exist.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ vp = ZTOV(zp);
+ ASSERT(vp == ap->a_vp);
+ ASSERT(vp->v_type == VDIR);
+ /* Drop an extra reference from zfs_dirent_lock(). */
+ VN_RELE(vp);
+ ASSERT(vp->v_count > 0);
+
+ if (error = zfs_zaccess_delete(dzp, zp, cr)) {
+ goto out;
+ }
+
+ /*
+	 * Grab a lock on the directory to make sure that no one is
+	 * trying to add (or look up) entries while we are removing it.
+ */
+ rw_enter(&zp->z_name_lock, RW_WRITER);
+
+ /*
+ * Grab a lock on the parent pointer to make sure we play well
+ * with the treewalk and directory rename code.
+ */
+ rw_enter(&zp->z_parent_lock, RW_WRITER);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ rw_exit(&zp->z_parent_lock);
+ rw_exit(&zp->z_name_lock);
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+#ifdef FREEBSD_NAMECACHE
+ cache_purge(dvp);
+#endif
+
+ error = zfs_link_destroy(dl, zp, tx, 0, NULL);
+
+ if (error == 0)
+ zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name);
+
+ dmu_tx_commit(tx);
+
+ rw_exit(&zp->z_parent_lock);
+ rw_exit(&zp->z_name_lock);
+#ifdef FREEBSD_NAMECACHE
+ cache_purge(vp);
+#endif
+out:
+ zfs_dirent_unlock(dl);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Read as many directory entries as will fit into the provided
+ * buffer from the given directory cursor position (specified in
+ * the uio structure).
+ *
+ * IN: vp - vnode of directory to read.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ * eofp - set to true if end-of-file detected.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by zap are always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
+/* ARGSUSED */
+static int
+zfs_readdir(ap)
+ struct vop_readdir_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ int *a_eofflag;
+ int *ncookies;
+ u_long **a_cookies;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ uio_t *uio = ap->a_uio;
+ int *eofp = ap->a_eofflag;
+ znode_t *zp = VTOZ(vp);
+ iovec_t *iovp;
+ dirent64_t *odp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os;
+ caddr_t outbuf;
+ size_t bufsize;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ uint_t bytes_wanted;
+ uint64_t offset; /* must be unsigned; checks for < 1 */
+ int local_eof;
+ int outcount;
+ int error;
+ uint8_t prefetch;
+ uint8_t type;
+ int ncookies;
+ u_long *cookies = NULL;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * If we are not given an eof variable,
+ * use a local one.
+ */
+ if (eofp == NULL)
+ eofp = &local_eof;
+
+ /*
+ * Check for valid iov_len.
+ */
+ if (uio->uio_iov->iov_len <= 0) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+	 * Quit if the directory has been removed (POSIX).
+ */
+ if ((*eofp = zp->z_unlinked) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ error = 0;
+ os = zfsvfs->z_os;
+ offset = uio->uio_loffset;
+ prefetch = zp->z_zn_prefetch;
+
+ /*
+ * Initialize the iterator cursor.
+ */
+ if (offset <= 3) {
+ /*
+ * Start iteration from the beginning of the directory.
+ */
+ zap_cursor_init(&zc, os, zp->z_id);
+ } else {
+ /*
+ * The offset is a serialized cursor.
+ */
+ zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
+ }
+
+ /*
+ * Get space to change directory entries into fs independent format.
+ */
+ iovp = uio->uio_iov;
+ bytes_wanted = iovp->iov_len;
+ if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
+ bufsize = bytes_wanted;
+ outbuf = kmem_alloc(bufsize, KM_SLEEP);
+ odp = (struct dirent64 *)outbuf;
+ } else {
+ bufsize = bytes_wanted;
+ odp = (struct dirent64 *)iovp->iov_base;
+ }
+
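+	/*
+	 * If the caller (e.g. the NFS server) wants seek cookies, allocate
+	 * one per directory entry that could possibly fit in the buffer;
+	 * any unused cookies are subtracted back out after the main loop.
+	 */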
+ if (ap->a_ncookies) {
+ /*
+		 * Minimum entry size is the dirent size plus 1 byte for
+		 * a file name.
+ */
+		ncookies = uio->uio_resid / (sizeof(struct dirent) -
+		    sizeof(((struct dirent *)NULL)->d_name) + 1);
+ cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK);
+ *ap->a_cookies = cookies;
+ *ap->a_ncookies = ncookies;
+ }
+
+ /*
+ * Transform to file-system independent format
+ */
+ outcount = 0;
+ while (outcount < bytes_wanted) {
+ ino64_t objnum;
+ ushort_t reclen;
+
+ /*
+ * Special case `.', `..', and `.zfs'.
+ */
+ if (offset == 0) {
+ (void) strcpy(zap.za_name, ".");
+ objnum = zp->z_id;
+ } else if (offset == 1) {
+ (void) strcpy(zap.za_name, "..");
+ objnum = zp->z_phys->zp_parent;
+ } else if (offset == 2 && zfs_show_ctldir(zp)) {
+ (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
+ objnum = ZFSCTL_INO_ROOT;
+ } else {
+ /*
+ * Grab next entry.
+ */
+ if (error = zap_cursor_retrieve(&zc, &zap)) {
+ if ((*eofp = (error == ENOENT)) != 0)
+ break;
+ else
+ goto update;
+ }
+
+ if (zap.za_integer_length != 8 ||
+ zap.za_num_integers != 1) {
+ cmn_err(CE_WARN, "zap_readdir: bad directory "
+ "entry, obj = %lld, offset = %lld\n",
+ (u_longlong_t)zp->z_id,
+ (u_longlong_t)offset);
+ error = ENXIO;
+ goto update;
+ }
+
+ objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
+ /*
+			 * MacOS X can extract the object type here, e.g.:
+ * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+ */
+ type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+ }
+ reclen = DIRENT64_RECLEN(strlen(zap.za_name));
+
+ /*
+ * Will this entry fit in the buffer?
+ */
+ if (outcount + reclen > bufsize) {
+ /*
+ * Did we manage to fit anything in the buffer?
+ */
+ if (!outcount) {
+ error = EINVAL;
+ goto update;
+ }
+ break;
+ }
+ /*
+ * Add this entry:
+ */
+ odp->d_ino = objnum;
+ odp->d_reclen = reclen;
+ odp->d_namlen = strlen(zap.za_name);
+ (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
+ odp->d_type = type;
+ outcount += reclen;
+ odp = (dirent64_t *)((intptr_t)odp + reclen);
+
+ ASSERT(outcount <= bufsize);
+
+ /* Prefetch znode */
+ if (prefetch)
+ dmu_prefetch(os, objnum, 0, 0);
+
+ /*
+ * Move to the next entry, fill in the previous offset.
+ */
+ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+ zap_cursor_advance(&zc);
+ offset = zap_cursor_serialize(&zc);
+ } else {
+ offset += 1;
+ }
+
+ if (cookies != NULL) {
+ *cookies++ = offset;
+ ncookies--;
+ KASSERT(ncookies >= 0, ("ncookies=%d", ncookies));
+ }
+ }
+ zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
+
+ /* Subtract unused cookies */
+ if (ap->a_ncookies)
+ *ap->a_ncookies -= ncookies;
+
+ if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
+ iovp->iov_base += outcount;
+ iovp->iov_len -= outcount;
+ uio->uio_resid -= outcount;
+ } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
+ /*
+ * Reset the pointer.
+ */
+ offset = uio->uio_loffset;
+ }
+
+update:
+ zap_cursor_fini(&zc);
+ if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
+ kmem_free(outbuf, bufsize);
+
+ if (error == ENOENT)
+ error = 0;
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+ uio->uio_loffset = offset;
+ ZFS_EXIT(zfsvfs);
+ if (error != 0) {
+ free(*ap->a_cookies, M_TEMP);
+ *ap->a_cookies = NULL;
+ *ap->a_ncookies = 0;
+ }
+ return (error);
+}
+
+static int
+zfs_fsync(struct vop_fsync_args *ap)
+{
+ znode_t *zp = VTOZ(ap->a_vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
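+	/*
+	 * Flush dirty pages and buffers through the VFS first, then
+	 * commit this file's outstanding ZIL entries so the data is on
+	 * stable storage before returning.
+	 */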
+ vop_stdfsync(ap);
+
+ ZFS_ENTER(zfsvfs);
+ zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Get the requested file attributes and place them in the provided
+ * vattr structure.
+ *
+ * IN: vp - vnode of file.
+ * vap - va_mask identifies requested attributes.
+ * flags - [UNUSED]
+ * cr - credentials of caller.
+ *
+ * OUT: vap - attribute values.
+ *
+ * RETURN: 0 (always succeeds)
+ */
+/* ARGSUSED */
+static int
+zfs_getattr(ap)
+ struct vop_getattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ vattr_t *vap = ap->a_vap;
+ cred_t *cr = ap->a_cred;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_phys_t *pzp = zp->z_phys;
+ uint32_t blksize;
+ u_longlong_t nblocks;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * Return all attributes. It's cheaper to provide the answer
+ * than to determine whether we were asked the question.
+ */
+ mutex_enter(&zp->z_lock);
+
+ vap->va_type = IFTOVT(pzp->zp_mode);
+ vap->va_mode = pzp->zp_mode & ~S_IFMT;
+ vap->va_uid = zp->z_phys->zp_uid;
+ vap->va_gid = zp->z_phys->zp_gid;
+ vap->va_nodeid = zp->z_id;
+ vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX); /* nlink_t limit! */
+ vap->va_size = pzp->zp_size;
+ vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
+ vap->va_seq = zp->z_seq;
+ vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
+
+ ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
+ ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
+ ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
+ ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
+
+ /*
+	 * If the ACL is trivial, don't bother looking for
+	 * ACE_READ_ATTRIBUTES.  Also, if we are the owner, don't bother,
+	 * since the owner is always allowed to read the basic attributes
+	 * of the file.
+ */
+ if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) &&
+ (zp->z_phys->zp_uid != crgetuid(cr))) {
+ if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) {
+ mutex_exit(&zp->z_lock);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ mutex_exit(&zp->z_lock);
+
+ dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
+ vap->va_blksize = blksize;
+ vap->va_bytes = nblocks << 9; /* nblocks * 512 */
+
+ if (zp->z_blksz == 0) {
+ /*
+ * Block size hasn't been set; suggest maximal I/O transfers.
+ */
+ vap->va_blksize = zfsvfs->z_max_blksz;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Set the file attributes to the values contained in the
+ * vattr structure.
+ *
+ * IN: vp - vnode of file to be modified.
+ * vap - new attribute values.
+ * flags - ATTR_UTIME set if non-default time values provided.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - ctime updated, mtime updated if size changed.
+ */
+/* ARGSUSED */
+static int
+zfs_setattr(ap)
+ struct vop_setattr_args /* {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ vattr_t *vap = ap->a_vap;
+ cred_t *cr = ap->a_cred;
+ znode_t *zp = VTOZ(vp);
+ znode_phys_t *pzp = zp->z_phys;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ dmu_tx_t *tx;
+ vattr_t oldva;
+ uint_t mask;
+ uint_t saved_mask;
+ int trim_mask = 0;
+ uint64_t new_mode;
+ znode_t *attrzp;
+ int need_policy = FALSE;
+ int flags = 0;
+ int err;
+
+ /* No support for FreeBSD's chflags(2). */
+ if (vap->va_flags != VNOVAL)
+ return (EOPNOTSUPP);
+
+ vattr_init_mask(vap);
+ mask = vap->va_mask;
+
+ if (mask == 0)
+ return (0);
+
+ if (mask & AT_SIZE && vp->v_type == VDIR)
+ return (EISDIR);
+
+ if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO)
+ return (EINVAL);
+
+ ZFS_ENTER(zfsvfs);
+
+top:
+ attrzp = NULL;
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ ZFS_EXIT(zfsvfs);
+ return (EROFS);
+ }
+
+ /*
+ * First validate permissions
+ */
+
+ if (mask & AT_SIZE) {
+ err = zfs_zaccess(zp, ACE_WRITE_DATA, cr);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ /*
+ * XXX - Note, we are not providing any open
+ * mode flags here (like FNDELAY), so we may
+ * block if there are locks present... this
+ * should be addressed in openat().
+ */
+ do {
+ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
+ /* NB: we already did dmu_tx_wait() if necessary */
+ } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ }
+
+ if (mask & (AT_ATIME|AT_MTIME))
+ need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr);
+
+ if (mask & (AT_UID|AT_GID)) {
+ int idmask = (mask & (AT_UID|AT_GID));
+ int take_owner;
+ int take_group;
+
+ /*
+ * NOTE: even if a new mode is being set,
+ * we may clear S_ISUID/S_ISGID bits.
+ */
+
+ if (!(mask & AT_MODE))
+ vap->va_mode = pzp->zp_mode;
+
+ /*
+ * Take ownership or chgrp to group we are a member of
+ */
+
+ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
+ take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr);
+
+ /*
+ * If both AT_UID and AT_GID are set then take_owner and
+ * take_group must both be set in order to allow taking
+ * ownership.
+ *
+		 * Otherwise, send the check through
+		 * secpolicy_vnode_setattr().
+		 */
+
+ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
+ ((idmask == AT_UID) && take_owner) ||
+ ((idmask == AT_GID) && take_group)) {
+ if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) {
+ /*
+ * Remove setuid/setgid for non-privileged users
+ */
+ secpolicy_setid_clear(vap, cr);
+ trim_mask = (mask & (AT_UID|AT_GID));
+ } else {
+ need_policy = TRUE;
+ }
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ mutex_enter(&zp->z_lock);
+ oldva.va_mode = pzp->zp_mode;
+ oldva.va_uid = zp->z_phys->zp_uid;
+ oldva.va_gid = zp->z_phys->zp_gid;
+ mutex_exit(&zp->z_lock);
+
+ if (mask & AT_MODE) {
+ if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) {
+ err = secpolicy_setid_setsticky_clear(vp, vap,
+ &oldva, cr);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ trim_mask |= AT_MODE;
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ if (need_policy) {
+ /*
+		 * If trim_mask is set then take-ownership has been granted,
+		 * or write_acl is present and the user has the ability to
+		 * modify the mode.  In that case remove UID|GID and/or MODE
+		 * from the mask so that
+ * secpolicy_vnode_setattr() doesn't revoke it.
+ */
+
+ if (trim_mask) {
+ saved_mask = vap->va_mask;
+ vap->va_mask &= ~trim_mask;
+		}
+ err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
+ (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ if (trim_mask)
+ vap->va_mask |= saved_mask;
+ }
+
+ /*
+	 * secpolicy_vnode_setattr() or the take-ownership check may have
+	 * changed va_mask.
+ */
+ mask = vap->va_mask;
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+
+ if (mask & AT_MODE) {
+ uint64_t pmode = pzp->zp_mode;
+
+ new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+ if (zp->z_phys->zp_acl.z_acl_extern_obj)
+ dmu_tx_hold_write(tx,
+ pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE);
+ else
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, ZFS_ACL_SIZE(MAX_ACL_SIZE));
+ }
+
+ if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) {
+ err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp);
+ if (err) {
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ dmu_tx_hold_bonus(tx, attrzp->z_id);
+ }
+
+ err = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (err) {
+ if (attrzp)
+ VN_RELE(ZTOV(attrzp));
+ if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+ /*
+ * Set each attribute requested.
+ * We group settings according to the locks they need to acquire.
+ *
+ * Note: you cannot set ctime directly, although it will be
+ * updated as a side-effect of calling this function.
+ */
+
+ mutex_enter(&zp->z_lock);
+
+ if (mask & AT_MODE) {
+ err = zfs_acl_chmod_setattr(zp, new_mode, tx);
+ ASSERT3U(err, ==, 0);
+ }
+
+ if (attrzp)
+ mutex_enter(&attrzp->z_lock);
+
+ if (mask & AT_UID) {
+ zp->z_phys->zp_uid = (uint64_t)vap->va_uid;
+ if (attrzp) {
+ attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid;
+ }
+ }
+
+ if (mask & AT_GID) {
+ zp->z_phys->zp_gid = (uint64_t)vap->va_gid;
+ if (attrzp)
+ attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid;
+ }
+
+ if (attrzp)
+ mutex_exit(&attrzp->z_lock);
+
+ if (mask & AT_ATIME)
+ ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+
+ if (mask & AT_MTIME)
+ ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+
+ if (mask & AT_SIZE)
+ zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
+ else if (mask != 0)
+ zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+
+ if (mask != 0)
+ zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask);
+
+ mutex_exit(&zp->z_lock);
+
+ if (attrzp)
+ VN_RELE(ZTOV(attrzp));
+
+ dmu_tx_commit(tx);
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+typedef struct zfs_zlock {
+ krwlock_t *zl_rwlock; /* lock we acquired */
+ znode_t *zl_znode; /* znode we held */
+ struct zfs_zlock *zl_next; /* next in list */
+} zfs_zlock_t;
+
+/*
+ * Drop locks and release vnodes that were held by zfs_rename_lock().
+ */
+static void
+zfs_rename_unlock(zfs_zlock_t **zlpp)
+{
+ zfs_zlock_t *zl;
+
+ while ((zl = *zlpp) != NULL) {
+ if (zl->zl_znode != NULL)
+ VN_RELE(ZTOV(zl->zl_znode));
+ rw_exit(zl->zl_rwlock);
+ *zlpp = zl->zl_next;
+ kmem_free(zl, sizeof (*zl));
+ }
+}
+
+/*
+ * Search back through the directory tree, using the ".." entries.
+ * Lock each directory in the chain to prevent concurrent renames.
+ * Fail any attempt to move a directory into one of its own descendants.
+ * XXX - z_parent_lock can overlap with map or grow locks
+ */
+static int
+zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
+{
+ zfs_zlock_t *zl;
+ znode_t *zp = tdzp;
+ uint64_t rootid = zp->z_zfsvfs->z_root;
+ uint64_t *oidp = &zp->z_id;
+ krwlock_t *rwlp = &szp->z_parent_lock;
+ krw_t rw = RW_WRITER;
+
+ /*
+ * First pass write-locks szp and compares to zp->z_id.
+ * Later passes read-lock zp and compare to zp->z_parent.
+ */
+ do {
+ if (!rw_tryenter(rwlp, rw)) {
+ /*
+ * Another thread is renaming in this path.
+ * Note that if we are a WRITER, we don't have any
+ * parent_locks held yet.
+ */
+ if (rw == RW_READER && zp->z_id > szp->z_id) {
+ /*
+ * Drop our locks and restart
+ */
+ zfs_rename_unlock(&zl);
+ *zlpp = NULL;
+ zp = tdzp;
+ oidp = &zp->z_id;
+ rwlp = &szp->z_parent_lock;
+ rw = RW_WRITER;
+ continue;
+ } else {
+ /*
+ * Wait for other thread to drop its locks
+ */
+ rw_enter(rwlp, rw);
+ }
+ }
+
+ zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
+ zl->zl_rwlock = rwlp;
+ zl->zl_znode = NULL;
+ zl->zl_next = *zlpp;
+ *zlpp = zl;
+
+ if (*oidp == szp->z_id) /* We're a descendant of szp */
+ return (EINVAL);
+
+ if (*oidp == rootid) /* We've hit the top */
+ return (0);
+
+ if (rw == RW_READER) { /* i.e. not the first pass */
+ int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
+ if (error)
+ return (error);
+ zl->zl_znode = zp;
+ }
+ oidp = &zp->z_phys->zp_parent;
+ rwlp = &zp->z_parent_lock;
+ rw = RW_READER;
+
+ } while (zp->z_id != sdzp->z_id);
+
+ return (0);
+}
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory. Change the entry name as indicated.
+ *
+ * IN: sdvp - Source directory containing the "old entry".
+ * snm - Old entry name.
+ * tdvp - Target directory to contain the "new entry".
+ * tnm - New entry name.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * sdvp,tdvp - ctime|mtime updated
+ */
+static int
+zfs_rename(ap)
+ struct vop_rename_args /* {
+ struct vnode *a_fdvp;
+ struct vnode *a_fvp;
+ struct componentname *a_fcnp;
+ struct vnode *a_tdvp;
+ struct vnode *a_tvp;
+ struct componentname *a_tcnp;
+ } */ *ap;
+{
+ vnode_t *svp = ap->a_fvp;
+ vnode_t *tvp = ap->a_tvp;
+ vnode_t *sdvp = ap->a_fdvp;
+ vnode_t *tdvp = ap->a_tdvp;
+ char *snm = ap->a_fcnp->cn_nameptr;
+ char *tnm = ap->a_tcnp->cn_nameptr;
+ cred_t *cr = ap->a_fcnp->cn_cred;
+ znode_t *tdzp, *szp, *tzp;
+ znode_t *sdzp = VTOZ(sdvp);
+ zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ zfs_dirlock_t *sdl, *tdl;
+ dmu_tx_t *tx;
+ zfs_zlock_t *zl;
+ int serr, terr, error;
+
+ ASSERT(ap->a_fcnp->cn_flags & SAVENAME);
+ ASSERT(ap->a_tcnp->cn_flags & SAVENAME);
+
+ ZFS_ENTER(zfsvfs);
+
+ if (tdvp->v_vfsp != sdvp->v_vfsp) {
+ error = EXDEV;
+abort:
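+	/*
+	 * Drop the vnode references and locks handed to us by the
+	 * VOP_RENAME protocol.  When tdvp and tvp are the same vnode it
+	 * is locked only once, so vrele() one reference and vput() the
+	 * other.
+	 */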
+ ZFS_EXIT(zfsvfs);
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ if (tvp)
+ vput(tvp);
+ vrele(sdvp);
+ vrele(svp);
+ return (error);
+ }
+
+ tdzp = VTOZ(tdvp);
+top:
+ szp = NULL;
+ tzp = NULL;
+ zl = NULL;
+
+ /*
+ * This is to prevent the creation of links into attribute space
+	 * by renaming a linked file into/out of an attribute directory.
+ * See the comment in zfs_link() for why this is considered bad.
+ */
+ if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
+ (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
+ error = EINVAL;
+ goto abort;
+ }
+
+ /*
+ * POSIX: "If the old argument and the new argument
+ * both refer to links to the same existing file,
+ * the rename() function shall return successfully
+ * and perform no other action."
+ * This case is already handled in kern_rename().
+ */
+ ASSERT(svp != tvp);
+
+ /*
+ * Lock source and target directory entries. To prevent deadlock,
+ * a lock ordering must be defined. We lock the directory with
+ * the smallest object id first, or if it's a tie, the one with
+ * the lexically first name.
+ */
+ if (sdzp->z_id < tdzp->z_id) {
+ serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
+ terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
+ } else /* if (sdzp->z_id > tdzp->z_id) */ {
+ terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
+ serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
+ }
+ if (!terr && tzp != NULL) {
+ if (tvp)
+ vrele(tvp);
+ else {
+ ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
+ tvp = ZTOV(tzp);
+ vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, curthread);
+ }
+ ASSERT(tvp->v_count > 0);
+ }
+
+ if (serr || svp != ZTOV(szp)) {
+ /*
+ * Source entry invalid or not there.
+ */
+ if (!serr)
+ zfs_dirent_unlock(sdl);
+ if (!terr)
+ zfs_dirent_unlock(tdl);
+ if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
+ serr = EINVAL;
+ error = serr;
+ goto abort;
+ }
+ if (terr) {
+ zfs_dirent_unlock(sdl);
+ if (strcmp(tnm, "..") == 0)
+ terr = EINVAL;
+ error = terr;
+ goto abort;
+ }
+ /* Remove an extra reference from zfs_dirent_lock(). */
+ vrele(svp);
+ ASSERT(svp->v_count > 0);
+
+ /*
+ * Must have write access at the source to remove the old entry
+ * and write access at the target to create the new entry.
+ * Note that if target and source are the same, this can be
+ * done in a single check.
+ */
+
+ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
+ goto out;
+
+ if (svp->v_type == VDIR) {
+ /*
+ * Check to make sure rename is valid.
+ * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+ */
+ if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
+ goto out;
+ }
+
+ /*
+ * Does target exist?
+ */
+ if (tzp) {
+ /*
+ * Source and target must be the same type.
+ */
+ if (svp->v_type == VDIR) {
+ if (tvp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+ } else {
+ if (tvp->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
+ }
+ /*
+ * POSIX dictates that when the source and target
+ * entries refer to the same file object, rename
+ * must do nothing and exit without error.
+ */
+ if (szp->z_id == tzp->z_id) {
+ error = 0;
+ goto out;
+ }
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
+ dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
+ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+ if (sdzp != tdzp)
+ dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
+ if (tzp)
+ dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ if (ap->a_tvp == NULL && tvp != NULL) {
+ vput(tvp);
+ tvp = NULL;
+ }
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ goto abort;
+ }
+
+ if (tzp) /* Attempt to remove the existing target */
+ error = zfs_link_destroy(tdl, tzp, tx, 0, NULL);
+
+ if (error == 0) {
+ error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+ if (error == 0) {
+ error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+ ASSERT(error == 0);
+ zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
+ sdl->dl_name, tdzp, tdl->dl_name, szp);
+ }
+#ifdef FREEBSD_NAMECACHE
+ if (error == 0) {
+ cache_purge(sdvp);
+ cache_purge(tdvp);
+ }
+#endif
+ }
+
+ dmu_tx_commit(tx);
+out:
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+
+ if (tzp)
+ vput(tvp);
+ vput(tdvp);
+ vrele(sdvp);
+ vrele(svp);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ * IN: dvp - Directory to contain new symbolic link.
+ * link - Name for new symlink entry.
+ * vap - Attributes of new entry.
+ * target - Target path of new symlink.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+static int
+zfs_symlink(ap)
+ struct vop_symlink_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ char *a_target;
+ } */ *ap;
+{
+ vnode_t *dvp = ap->a_dvp;
+ vattr_t *vap = ap->a_vap;
+ cred_t *cr = ap->a_cnp->cn_cred;
+ char *name = ap->a_cnp->cn_nameptr;
+ char *link = ap->a_target;
+ znode_t *zp, *dzp = VTOZ(dvp);
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t zoid;
+ int len = strlen(link);
+ int error;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */
+ vattr_init_mask(vap);
+
+ ZFS_ENTER(zfsvfs);
+top:
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (len > MAXPATHLEN) {
+ ZFS_EXIT(zfsvfs);
+ return (ENAMETOOLONG);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+ dmu_tx_hold_bonus(tx, dzp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ dmu_buf_will_dirty(dzp->z_dbuf, tx);
+
+ /*
+ * Create a new object for the symlink.
+ * Put the link content into bonus buffer if it will fit;
+ * otherwise, store it just like any other file data.
+ */
+ zoid = 0;
+ if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len);
+ if (len != 0)
+ bcopy(link, zp->z_phys + 1, len);
+ } else {
+ dmu_buf_t *dbp;
+
+ zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
+
+ /*
+		 * Nothing can access the znode yet, so no locking is needed
+ * for growing the znode's blocksize.
+ */
+ zfs_grow_blocksize(zp, len, tx);
+
+ VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp));
+ dmu_buf_will_dirty(dbp, tx);
+
+ ASSERT3U(len, <=, dbp->db_size);
+ bcopy(link, dbp->db_data, len);
+ dmu_buf_rele(dbp, FTAG);
+ }
+ zp->z_phys->zp_size = len;
+
+ /*
+ * Insert the new object into the directory.
+ */
+ (void) zfs_link_create(dl, zp, tx, ZNEW);
+
+ if (error == 0) {
+ zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link);
+
+ *ap->a_vpp = ZTOV(zp);
+
+ vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY,
+ ap->a_cnp->cn_thread);
+ }
+
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by vp.
+ *
+ * IN: vp - vnode of symbolic link.
+ *	uio	- structure to contain the link path.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - structure to contain the link path.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_readlink(ap)
+ struct vop_readlink_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ uio_t *uio = ap->a_uio;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ size_t bufsz;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ bufsz = (size_t)zp->z_phys->zp_size;
+ if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
+ error = uiomove(zp->z_phys + 1,
+ MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+ } else {
+ dmu_buf_t *dbp;
+ error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ error = uiomove(dbp->db_data,
+ MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+ dmu_buf_rele(dbp, FTAG);
+ }
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert a new entry into directory tdvp referencing svp.
+ *
+ * IN: tdvp - Directory to contain new entry.
+ * svp - vnode of new entry.
+ * name - name of new entry.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * tdvp - ctime|mtime updated
+ * svp - ctime updated
+ */
+/* ARGSUSED */
+static int
+zfs_link(ap)
+ struct vop_link_args /* {
+ struct vnode *a_tdvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ vnode_t *svp = ap->a_vp;
+ vnode_t *tdvp = ap->a_tdvp;
+ cred_t *cr = ap->a_cnp->cn_cred;
+ char *name = ap->a_cnp->cn_nameptr;
+ znode_t *dzp = VTOZ(tdvp);
+ znode_t *tzp, *szp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+ ASSERT(tdvp->v_type == VDIR);
+
+ ZFS_ENTER(zfsvfs);
+
+ if (svp->v_vfsp != tdvp->v_vfsp) {
+ ZFS_EXIT(zfsvfs);
+ return (EXDEV);
+ }
+
+ szp = VTOZ(svp);
+top:
+ /*
+ * We do not support links between attributes and non-attributes
+ * because of the potential security risk of creating links
+ * into "normal" file space in order to circumvent restrictions
+ * imposed in attribute space.
+ */
+ if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
+ (dzp->z_phys->zp_flags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ /*
+ * POSIX dictates that we return EPERM here.
+ * Better choices include ENOTSUP or EISDIR.
+ */
+ if (svp->v_type == VDIR) {
+ ZFS_EXIT(zfsvfs);
+ return (EPERM);
+ }
+
+ if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) &&
+ secpolicy_basic_link(cr) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (EPERM);
+ }
+
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, szp->z_id);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_link_create(dl, szp, tx, 0);
+
+ if (error == 0)
+ zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name);
+
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+static int
+zfs_inactive(ap)
+ struct vop_inactive_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ /*
+ * Destroy the vm object and flush associated pages.
+ */
+ vnode_destroy_vobject(vp);
+
+ rw_enter(&zfsvfs->z_um_lock, RW_READER);
+ if (zfsvfs->z_unmounted2) {
+ ASSERT(zp->z_dbuf_held == 0);
+
+ mutex_enter(&zp->z_lock);
+ VI_LOCK(vp);
+ vp->v_count = 0; /* count arrives as 1 */
+ VI_UNLOCK(vp);
+ if (zp->z_dbuf == NULL) {
+ mutex_exit(&zp->z_lock);
+ zfs_znode_free(zp);
+ } else {
+ mutex_exit(&zp->z_lock);
+ }
+ rw_exit(&zfsvfs->z_um_lock);
+ VFS_RELE(zfsvfs->z_vfs);
+ return (0);
+ }
+
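+	/*
+	 * If only the atime is dirty, push it out in its own transaction
+	 * now so the update isn't lost when the vnode is recycled.
+	 */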
+ if (zp->z_atime_dirty && zp->z_unlinked == 0) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+ mutex_enter(&zp->z_lock);
+ zp->z_atime_dirty = 0;
+ mutex_exit(&zp->z_lock);
+ dmu_tx_commit(tx);
+ }
+ }
+
+ zfs_zinactive(zp);
+ rw_exit(&zfsvfs->z_um_lock);
+ return (0);
+}
+
+static int
+zfs_reclaim(ap)
+ struct vop_reclaim_args /* {
+ struct vnode *a_vp;
+ struct thread *a_td;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+
+ if (zp != NULL)
+ mutex_enter(&zp->z_lock);
+#if 0 /*
+ * We do it from zfs_inactive(), because after zfs_inactive() we can't
+ * VOP_WRITE() to the vnode.
+ */
+ /*
+ * Destroy the vm object and flush associated pages.
+ */
+ vnode_destroy_vobject(vp);
+#endif
+ VI_LOCK(vp);
+ vp->v_data = NULL;
+ if (zp != NULL)
+ zp->z_vnode = NULL;
+ ASSERT(vp->v_holdcnt > 1);
+ vdropl(vp);
+ if (zp != NULL)
+ mutex_exit(&zp->z_lock);
+ return (0);
+}
+
+CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
+CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
+
+static int
+zfs_fid(ap)
+ struct vop_fid_args /* {
+ struct vnode *a_vp;
+ struct fid *a_fid;
+ } */ *ap;
+{
+ vnode_t *vp = ap->a_vp;
+ fid_t *fidp = (void *)ap->a_fid;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint32_t gen = (uint32_t)zp->z_phys->zp_gen;
+ uint64_t object = zp->z_id;
+ zfid_short_t *zfid;
+ int size, i;
+
+ ZFS_ENTER(zfsvfs);
+
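+	/*
+	 * A long FID, which additionally encodes the objset id, is needed
+	 * when this file system is not its own parent (e.g. a snapshot
+	 * mounted under .zfs); otherwise the short FID suffices.
+	 */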
+ size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
+ fidp->fid_len = size;
+
+ zfid = (zfid_short_t *)fidp;
+
+ zfid->zf_len = size;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* Must have a non-zero generation number to distinguish from .zfs */
+ if (gen == 0)
+ gen = 1;
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+ if (size == LONG_FID_LEN) {
+ uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
+ zfid_long_t *zlfid;
+
+ zlfid = (zfid_long_t *)fidp;
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+ /* XXX - this should be the generation number for the objset */
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ zlfid->zf_setgen[i] = 0;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static int
+zfs_pathconf(ap)
+ struct vop_pathconf_args /* {
+ struct vnode *a_vp;
+ int a_name;
+ int *a_retval;
+ } */ *ap;
+{
+ int cmd = ap->a_name;
+ int *valp = ap->a_retval;
+#if 0
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp, *xzp;
+ zfsvfs_t *zfsvfs;
+ zfs_dirlock_t *dl;
+ int error;
+#endif
+
+ switch (cmd) {
+ case _PC_LINK_MAX:
+ *valp = INT_MAX;
+ return (0);
+
+ case _PC_FILESIZEBITS:
+ *valp = 64;
+ return (0);
+
+#if 0
+ case _PC_XATTR_EXISTS:
+ zp = VTOZ(vp);
+ zfsvfs = zp->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+ *valp = 0;
+ error = zfs_dirent_lock(&dl, zp, "", &xzp,
+ ZXATTR | ZEXISTS | ZSHARED);
+ if (error == 0) {
+ zfs_dirent_unlock(dl);
+ if (!zfs_dirempty(xzp))
+ *valp = 1;
+ VN_RELE(ZTOV(xzp));
+ } else if (error == ENOENT) {
+ /*
+ * If there aren't extended attributes, it's the
+ * same as having zero of them.
+ */
+ error = 0;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+#endif
+
+ case _PC_ACL_EXTENDED:
+ *valp = 0; /* TODO */
+ return (0);
+
+ case _PC_MIN_HOLE_SIZE:
+ *valp = (int)SPA_MINBLOCKSIZE;
+ return (0);
+
+ default:
+ return (vop_stdpathconf(ap));
+ }
+}
+
+#ifdef TODO
+/*ARGSUSED*/
+static int
+zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ error = zfs_getacl(zp, vsecp, cr);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+#endif /* TODO */
+
+#ifdef TODO
+/*ARGSUSED*/
+static int
+zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ error = zfs_setacl(zp, vsecp, cr);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+#endif /* TODO */
+
+/*
+ * Advisory record locking support
+ */
+static int
+zfs_advlock(ap)
+ struct vop_advlock_args /* {
+ struct vnode *a_vp;
+ caddr_t a_id;
+ int a_op;
+ struct flock *a_fl;
+ int a_flags;
+ } */ *ap;
+{
+ znode_t *zp = VTOZ(ap->a_vp);
+
+ return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size));
+}
+
+struct vop_vector zfs_vnodeops;
+struct vop_vector zfs_fifoops;
+
+struct vop_vector zfs_vnodeops = {
+ .vop_default = &default_vnodeops,
+ .vop_inactive = zfs_inactive,
+ .vop_reclaim = zfs_reclaim,
+ .vop_access = zfs_access,
+#ifdef FREEBSD_NAMECACHE
+ .vop_lookup = vfs_cache_lookup,
+ .vop_cachedlookup = zfs_lookup,
+#else
+ .vop_lookup = zfs_lookup,
+#endif
+ .vop_getattr = zfs_getattr,
+ .vop_setattr = zfs_setattr,
+ .vop_create = zfs_create,
+ .vop_mknod = zfs_create,
+ .vop_mkdir = zfs_mkdir,
+ .vop_readdir = zfs_readdir,
+ .vop_fsync = zfs_fsync,
+ .vop_open = zfs_open,
+ .vop_close = zfs_close,
+ .vop_rmdir = zfs_rmdir,
+ .vop_ioctl = zfs_ioctl,
+ .vop_link = zfs_link,
+ .vop_symlink = zfs_symlink,
+ .vop_readlink = zfs_readlink,
+ .vop_read = zfs_read,
+ .vop_write = zfs_write,
+ .vop_remove = zfs_remove,
+ .vop_rename = zfs_rename,
+ .vop_advlock = zfs_advlock,
+ .vop_pathconf = zfs_pathconf,
+ .vop_bmap = VOP_EOPNOTSUPP,
+ .vop_fid = zfs_fid,
+};
+
+struct vop_vector zfs_fifoops = {
+ .vop_default = &fifo_specops,
+ .vop_fsync = VOP_PANIC,
+ .vop_access = zfs_access,
+ .vop_getattr = zfs_getattr,
+ .vop_inactive = zfs_inactive,
+ .vop_read = VOP_PANIC,
+ .vop_reclaim = zfs_reclaim,
+ .vop_setattr = zfs_setattr,
+ .vop_write = VOP_PANIC,
+ .vop_fid = zfs_fid,
+};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
new file mode 100644
index 0000000..d2806b9
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
@@ -0,0 +1,1061 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/mntent.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/atomic.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_rlock.h>
+#include <sys/fs/zfs.h>
+#endif /* _KERNEL */
+
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+
+/*
+ * Functions needed for userland (i.e., libzpool) are not put under
+ * #ifdef _KERNEL; the rest of the functions have dependencies
+ * (such as VFS logic) that will not compile easily in userland.
+ */
+#ifdef _KERNEL
+struct kmem_cache *znode_cache = NULL;
+
+/*ARGSUSED*/
+static void
+znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
+{
+ znode_t *zp = user_ptr;
+ vnode_t *vp = ZTOV(zp);
+
+ mutex_enter(&zp->z_lock);
+ if (vp == NULL) {
+ mutex_exit(&zp->z_lock);
+ zfs_znode_free(zp);
+ } else if (vp->v_count == 0) {
+ ZTOV(zp) = NULL;
+ mutex_exit(&zp->z_lock);
+ vhold(vp);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+ vrecycle(vp, curthread);
+ VOP_UNLOCK(vp, 0, curthread);
+ vdrop(vp);
+ zfs_znode_free(zp);
+ } else {
+ /* signal forced unmount that this znode can be freed */
+ zp->z_dbuf = NULL;
+ mutex_exit(&zp->z_lock);
+ }
+}
+
+extern struct vop_vector zfs_vnodeops;
+extern struct vop_vector zfs_fifoops;
+
+/*
+ * XXX: We cannot use this function as a cache constructor, because
+ * there is one global cache for all file systems and we need
+ * to pass vfsp here, which is not possible, because argument
+ * 'cdrarg' is defined at kmem_cache_create() time.
+ */
+static int
+zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ znode_t *zp = buf;
+ vfs_t *vfsp = cdrarg;
+ int error;
+
+ if (cdrarg != NULL) {
+ error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &zp->z_vnode);
+ ASSERT(error == 0);
+ zp->z_vnode->v_data = (caddr_t)zp;
+ vhold(zp->z_vnode);
+ } else {
+ zp->z_vnode = NULL;
+ }
+ mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&zp->z_range_avl, zfs_range_compare,
+ sizeof (rl_t), offsetof(rl_t, r_node));
+
+ zp->z_dbuf_held = 0;
+ zp->z_dirlocks = 0;
+ zp->z_lockf = NULL;
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *cdarg)
+{
+ znode_t *zp = buf;
+
+ ASSERT(zp->z_dirlocks == 0);
+ mutex_destroy(&zp->z_lock);
+ rw_destroy(&zp->z_map_lock);
+ rw_destroy(&zp->z_parent_lock);
+ rw_destroy(&zp->z_name_lock);
+ mutex_destroy(&zp->z_acl_lock);
+ mutex_destroy(&zp->z_range_lock);
+ avl_destroy(&zp->z_range_avl);
+
+ ASSERT(zp->z_dbuf_held == 0);
+}
+
+void
+zfs_znode_init(void)
+{
+ /*
+ * Initialize zcache
+ */
+ ASSERT(znode_cache == NULL);
+ znode_cache = kmem_cache_create("zfs_znode_cache",
+ sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
+ zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+zfs_znode_fini(void)
+{
+ /*
+ * Cleanup zcache
+ */
+ if (znode_cache)
+ kmem_cache_destroy(znode_cache);
+ znode_cache = NULL;
+}
+
+/*
+ * zfs_init_fs - Initialize the zfsvfs struct and the file system
+ * incore "master" object. Verify version compatibility.
+ */
+int
+zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
+{
+ objset_t *os = zfsvfs->z_os;
+ uint64_t version = ZPL_VERSION;
+ int i, error;
+ dmu_object_info_t doi;
+ uint64_t fsid_guid;
+
+ *zpp = NULL;
+
+ /*
+ * XXX - hack to auto-create the pool root filesystem at
+ * the first attempted mount.
+ */
+ if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
+ dmu_tx_t *tx = dmu_tx_create(os);
+
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ ASSERT3U(error, ==, 0);
+ zfs_create_fs(os, cr, tx);
+ dmu_tx_commit(tx);
+ }
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_OBJ, 8, 1,
+ &version);
+ if (error) {
+ return (error);
+ } else if (version != ZPL_VERSION) {
+ (void) printf("Mismatched versions: File system "
+ "is version %lld on-disk format, which is "
+ "incompatible with this software version %lld!",
+ (u_longlong_t)version, ZPL_VERSION);
+ return (ENOTSUP);
+ }
+
+ /*
+ * The fsid is 64 bits, composed of an 8-bit fs type, which
+ * separates our fsid from any other filesystem types, and a
+ * 56-bit objset unique ID. The objset unique ID is unique to
+ * all objsets open on this system, provided by unique_create().
+ * The 8-bit fs type must be put in the low bits of fsid[1]
+ * because that's where other Solaris filesystems put it.
+ */
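+ /*
+ * A sketch of the resulting layout (assuming vfc_typenum fits in
+ * 8 bits, as the masking below implies):
+ *
+ * val[0] = low 32 bits of fsid_guid
+ * val[1] = ((fsid_guid >> 32) << 8) | (vfc_typenum & 0xFF)
+ */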
+ fsid_guid = dmu_objset_fsid_guid(os);
+ ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
+ zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
+ zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
+ zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF;
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
+ &zfsvfs->z_root);
+ if (error)
+ return (error);
+ ASSERT(zfsvfs->z_root != 0);
+
+ /*
+ * Create the per-mount vop tables (not used in the FreeBSD port; the
+ * global zfs_vnodeops vector serves all mounts).
+ */
+
+ /*
+ * Initialize the zget mutexes.
+ */
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+ error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
+ if (error)
+ return (error);
+ ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+ &zfsvfs->z_unlinkedobj);
+ if (error)
+ return (error);
+
+ return (0);
+}
+
+/*
+ * Define a few values we need available
+ * for both 64- and 32-bit environments.
+ */
+#ifndef NBITSMINOR64
+#define NBITSMINOR64 32
+#endif
+#ifndef MAXMAJ64
+#define MAXMAJ64 0xffffffffUL
+#endif
+#ifndef MAXMIN64
+#define MAXMIN64 0xffffffffUL
+#endif
+
+/*
+ * Create special expldev for ZFS private use.
+ * Can't use standard expldev since it doesn't do
+ * what we want. The standard expldev() takes a
+ * dev32_t in LP64 and expands it to a long dev_t.
+ * We need an interface that takes a dev32_t in ILP32
+ * and expands it to a long dev_t.
+ */
+static uint64_t
+zfs_expldev(dev_t dev)
+{
+ return ((uint64_t)0);
+}
+/*
+ * Special cmpldev for ZFS private use.
+ * Can't use standard cmpldev since it takes
+ * a long dev_t and compresses it to dev32_t in
+ * LP64. We need to do a compaction of a long dev_t
+ * to a dev32_t in ILP32.
+ */
+dev_t
+zfs_cmpldev(uint64_t dev)
+{
+ return ((dev_t)0);
+}
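+
+/*
+ * Note: in this port both functions above are placeholders that simply
+ * return 0, so device numbers are not yet translated for znodes.
+ */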
+
+/*
+ * Construct a new znode/vnode and initialize it.
+ *
+ * This does not call dmu_buf_set_user(); that is left to the caller
+ * (via zfs_znode_dmu_init()), in case the znode is not going to be
+ * returned.
+ */
+static znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
+{
+ znode_t *zp;
+ vnode_t *vp;
+ int error;
+
+ zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ zfs_znode_cache_constructor(zp, zfsvfs->z_vfs, 0);
+
+ ASSERT(zp->z_dirlocks == NULL);
+
+ zp->z_phys = db->db_data;
+ zp->z_zfsvfs = zfsvfs;
+ zp->z_unlinked = 0;
+ zp->z_atime_dirty = 0;
+ zp->z_dbuf_held = 0;
+ zp->z_mapcnt = 0;
+ zp->z_last_itx = 0;
+ zp->z_dbuf = db;
+ zp->z_id = obj_num;
+ zp->z_blksz = blksz;
+ zp->z_seq = 0x7A4653;
+ zp->z_sync_cnt = 0;
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ vp = ZTOV(zp);
+ if (vp == NULL)
+ return (zp);
+
+ error = insmntque(vp, zfsvfs->z_vfs);
+ KASSERT(error == 0, ("insmntque() failed: error %d", error));
+
+ vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
+ switch (vp->v_type) {
+ case VDIR:
+ zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
+ break;
+ case VFIFO:
+ vp->v_op = &zfs_fifoops;
+ break;
+ }
+
+ return (zp);
+}
+
+static void
+zfs_znode_dmu_init(znode_t *zp)
+{
+ znode_t *nzp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_buf_t *db = zp->z_dbuf;
+
+ mutex_enter(&zp->z_lock);
+
+ nzp = dmu_buf_set_user(db, zp, &zp->z_phys, znode_pageout_func);
+
+ /*
+ * There should be no concurrent zgets on this object.
+ */
+ ASSERT3P(nzp, ==, NULL);
+
+ /*
+ * Slap on VROOT if we are the root znode
+ */
+ if (zp->z_id == zfsvfs->z_root) {
+ ZTOV(zp)->v_flag |= VROOT;
+ }
+
+ ASSERT(zp->z_dbuf_held == 0);
+ zp->z_dbuf_held = 1;
+ VFS_HOLD(zfsvfs->z_vfs);
+ mutex_exit(&zp->z_lock);
+}
+
+/*
+ * Create a new DMU object to hold a zfs znode.
+ *
+ * IN: dzp - parent directory for new znode
+ * vap - file attributes for new znode
+ * tx - dmu transaction id for zap operations
+ * cr - credentials of caller
+ * flag - flags:
+ * IS_ROOT_NODE - new object will be root
+ * IS_XATTR - new object is an attribute
+ * IS_REPLAY - intent log replay
+ *
+ * OUT: oid - ID of created object
+ *
+ */
+void
+zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
+ uint_t flag, znode_t **zpp, int bonuslen)
+{
+ dmu_buf_t *dbp;
+ znode_phys_t *pzp;
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ timestruc_t now;
+ uint64_t gen;
+ int err;
+
+ ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
+
+ if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */
+ *oid = vap->va_nodeid;
+ flag |= IS_REPLAY;
+ now = vap->va_ctime; /* see zfs_replay_create() */
+ gen = vap->va_nblocks; /* ditto */
+ } else {
+ *oid = 0;
+ gethrestime(&now);
+ gen = dmu_tx_get_txg(tx);
+ }
+
+ /*
+ * Create a new DMU object.
+ */
+ /*
+ * There's currently no mechanism for pre-reading the blocks that will
+ * be needed to allocate a new object, so we accept the small chance
+ * that there will be an i/o error and we will fail one of the
+ * assertions below.
+ */
+ if (vap->va_type == VDIR) {
+ if (flag & IS_REPLAY) {
+ err = zap_create_claim(zfsvfs->z_os, *oid,
+ DMU_OT_DIRECTORY_CONTENTS,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ ASSERT3U(err, ==, 0);
+ } else {
+ *oid = zap_create(zfsvfs->z_os,
+ DMU_OT_DIRECTORY_CONTENTS,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ }
+ } else {
+ if (flag & IS_REPLAY) {
+ err = dmu_object_claim(zfsvfs->z_os, *oid,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ ASSERT3U(err, ==, 0);
+ } else {
+ *oid = dmu_object_alloc(zfsvfs->z_os,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+ }
+ }
+ VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp));
+ dmu_buf_will_dirty(dbp, tx);
+
+ /*
+ * Initialize the znode physical data to zero.
+ */
+ ASSERT(dbp->db_size >= sizeof (znode_phys_t));
+ bzero(dbp->db_data, dbp->db_size);
+ pzp = dbp->db_data;
+
+ /*
+ * If this is the root, fix up the half-initialized parent pointer
+ * to reference the just-allocated physical data area.
+ */
+ if (flag & IS_ROOT_NODE) {
+ dzp->z_phys = pzp;
+ dzp->z_id = *oid;
+ }
+
+ /*
+ * If parent is an xattr, so am I.
+ */
+ if (dzp->z_phys->zp_flags & ZFS_XATTR)
+ flag |= IS_XATTR;
+
+ if (vap->va_type == VBLK || vap->va_type == VCHR) {
+ pzp->zp_rdev = zfs_expldev(vap->va_rdev);
+ }
+
+ if (vap->va_type == VDIR) {
+ pzp->zp_size = 2; /* contents ("." and "..") */
+ pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+ }
+
+ pzp->zp_parent = dzp->z_id;
+ if (flag & IS_XATTR)
+ pzp->zp_flags |= ZFS_XATTR;
+
+ pzp->zp_gen = gen;
+
+ ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
+ ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
+
+ if (vap->va_mask & AT_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+ } else {
+ ZFS_TIME_ENCODE(&now, pzp->zp_atime);
+ }
+
+ if (vap->va_mask & AT_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+ } else {
+ ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
+ }
+
+ pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);
+
+ zfs_perm_init(zp, dzp, flag, vap, tx, cr);
+
+ if (zpp) {
+ kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);
+
+ mutex_enter(hash_mtx);
+ zfs_znode_dmu_init(zp);
+ mutex_exit(hash_mtx);
+
+ *zpp = zp;
+ } else {
+ if (ZTOV(zp) != NULL)
+ ZTOV(zp)->v_count = 0;
+ dmu_buf_rele(dbp, NULL);
+ zfs_znode_free(zp);
+ }
+}
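+
+/*
+ * Rough, hypothetical calling pattern (for illustration only; real
+ * callers such as zfs_create() also set up ZAP holds and logging):
+ *
+ * tx = dmu_tx_create(zfsvfs->z_os);
+ * dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ * error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ * ...
+ * zfs_mknode(dzp, &vattr, &oid, tx, cr, 0, &zp, 0);
+ * dmu_tx_commit(tx);
+ */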
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ znode_t *zp;
+ vnode_t *vp;
+ int err;
+
+ *zpp = NULL;
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+ err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ doi.doi_bonus_size < sizeof (znode_phys_t)) {
+ dmu_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (EINVAL);
+ }
+
+ ASSERT(db->db_object == obj_num);
+ ASSERT(db->db_offset == -1);
+ ASSERT(db->db_data != NULL);
+
+ zp = dmu_buf_get_user(db);
+
+ if (zp != NULL) {
+ mutex_enter(&zp->z_lock);
+
+ ASSERT3U(zp->z_id, ==, obj_num);
+ if (zp->z_unlinked) {
+ dmu_buf_rele(db, NULL);
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (ENOENT);
+ } else if (zp->z_dbuf_held) {
+ dmu_buf_rele(db, NULL);
+ } else {
+ zp->z_dbuf_held = 1;
+ VFS_HOLD(zfsvfs->z_vfs);
+ }
+
+ if (ZTOV(zp) != NULL)
+ VN_HOLD(ZTOV(zp));
+ else {
+ err = getnewvnode("zfs", zfsvfs->z_vfs, &zfs_vnodeops,
+ &zp->z_vnode);
+ ASSERT(err == 0);
+ vp = ZTOV(zp);
+ vp->v_data = (caddr_t)zp;
+ vhold(vp);
+ vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
+ if (vp->v_type == VDIR)
+ zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
+ err = insmntque(vp, zfsvfs->z_vfs);
+ KASSERT(err == 0, ("insmntque() failed: error %d", err));
+ }
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ *zpp = zp;
+ return (0);
+ }
+
+ /*
+ * Not found; create a new znode/vnode.
+ */
+ zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
+ ASSERT3U(zp->z_id, ==, obj_num);
+ zfs_znode_dmu_init(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ *zpp = zp;
+ return (0);
+}
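+
+/*
+ * In short, zfs_zget() uses the bonus buffer's user pointer as the znode
+ * cache: an existing znode gets another hold (and a fresh vnode if the
+ * old one was reclaimed), otherwise a new znode/vnode pair is created
+ * and attached to the dbuf.
+ */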
+
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
+ if (zp->z_phys->zp_acl.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ zp->z_phys->zp_acl.z_acl_extern_obj, tx);
+ ASSERT3U(error, ==, 0);
+ }
+ error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
+ ASSERT3U(error, ==, 0);
+ zp->z_dbuf_held = 0;
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
+ dmu_buf_rele(zp->z_dbuf, NULL);
+}
+
+void
+zfs_zinactive(znode_t *zp)
+{
+ vnode_t *vp = ZTOV(zp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t z_id = zp->z_id;
+
+ ASSERT(zp->z_dbuf_held && zp->z_phys);
+
+ /*
+ * Don't allow a zfs_zget() while we're trying to release this znode.
+ */
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
+
+ mutex_enter(&zp->z_lock);
+ VI_LOCK(vp);
+ if (vp->v_count > 0) {
+ /*
+ * If the hold count is greater than zero, somebody has
+ * obtained a new reference on this znode while we were
+ * processing it here, so we are done.
+ */
+ VI_UNLOCK(vp);
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ return;
+ }
+ VI_UNLOCK(vp);
+
+ /*
+ * If this was the last reference to a file with no links,
+ * remove the file from the file system.
+ */
+ if (zp->z_unlinked) {
+ ZTOV(zp) = NULL;
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ ASSERT(vp->v_count == 0);
+ vrecycle(vp, curthread);
+ zfs_rmnode(zp);
+ VFS_RELE(zfsvfs->z_vfs);
+ return;
+ }
+ ASSERT(zp->z_phys);
+ ASSERT(zp->z_dbuf_held);
+
+ zp->z_dbuf_held = 0;
+ mutex_exit(&zp->z_lock);
+ dmu_buf_rele(zp->z_dbuf, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ VFS_RELE(zfsvfs->z_vfs);
+}
+
+/*
+ * FreeBSD: Should be called from ->vop_reclaim().
+ */
+void
+zfs_znode_free(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ kmem_cache_free(znode_cache, zp);
+}
+
+void
+zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+{
+ timestruc_t now;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+
+ gethrestime(&now);
+
+ if (tx) {
+ dmu_buf_will_dirty(zp->z_dbuf, tx);
+ zp->z_atime_dirty = 0;
+ zp->z_seq++;
+ } else {
+ zp->z_atime_dirty = 1;
+ }
+
+ if (flag & AT_ATIME)
+ ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
+
+ if (flag & AT_MTIME)
+ ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
+
+ if (flag & AT_CTIME)
+ ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
+}
+
+/*
+ * Update the requested znode timestamps with the current time.
+ * If we are in a transaction, then go ahead and mark the znode
+ * dirty in the transaction so the timestamps will go to disk.
+ * Otherwise, we will get pushed next time the znode is updated
+ * in a transaction, or when this znode eventually goes inactive.
+ *
+ * Why is this OK?
+ * 1 - Only the ACCESS time is ever updated outside of a transaction.
+ * 2 - Multiple consecutive updates will be collapsed into a single
+ * znode update by the transaction grouping semantics of the DMU.
+ */
+void
+zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+{
+ mutex_enter(&zp->z_lock);
+ zfs_time_stamper_locked(zp, flag, tx);
+ mutex_exit(&zp->z_lock);
+}
+
+/*
+ * Grow the block size for a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * size - requested block size
+ * tx - open transaction.
+ *
+ * NOTE: this function assumes that the znode is write locked.
+ */
+void
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+ int error;
+ u_longlong_t dummy;
+
+ if (size <= zp->z_blksz)
+ return;
+ /*
+ * If the file size is already greater than the current blocksize,
+ * we will not grow. If there is more than one block in a file,
+ * the blocksize cannot change.
+ */
+ if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
+ return;
+
+ error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
+ size, 0, tx);
+ if (error == ENOTSUP)
+ return;
+ ASSERT3U(error, ==, 0);
+
+ /* What blocksize did we actually get? */
+ dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
+}
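+
+/*
+ * Example: growing a 2K file toward 10K requests a 10K blocksize; the
+ * DMU may adjust the request, which is why z_blksz is re-read from the
+ * dbuf above rather than assumed.
+ */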
+
+/*
+ * Free space in a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of section to free.
+ * len - length of section to free (0 => to EOF).
+ * flag - current file open mode flags.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ */
+int
+zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+{
+ vnode_t *vp = ZTOV(zp);
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ rl_t *rl;
+ uint64_t end = off + len;
+ uint64_t size, new_blksz;
+ int error;
+
+ if (ZTOV(zp)->v_type == VFIFO)
+ return (0);
+
+ /*
+ * If we will change zp_size then lock the whole file,
+ * otherwise just lock the range being freed.
+ */
+ if (len == 0 || off + len > zp->z_phys->zp_size) {
+ rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
+ } else {
+ rl = zfs_range_lock(zp, off, len, RL_WRITER);
+ /* recheck, in case zp_size changed */
+ if (off + len > zp->z_phys->zp_size) {
+ /* lost race: file size changed, lock whole file */
+ zfs_range_unlock(rl);
+ rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
+ }
+ }
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ size = zp->z_phys->zp_size;
+ if (len == 0 && size == off) {
+ zfs_range_unlock(rl);
+ return (0);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_bonus(tx, zp->z_id);
+ new_blksz = 0;
+ if (end > size &&
+ (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
+ /*
+ * We are growing the file past the current block size.
+ */
+ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+ ASSERT(!ISP2(zp->z_blksz));
+ new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
+ } else {
+ new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
+ }
+ dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz));
+ } else if (off < size) {
+ /*
+ * If len == 0, we are truncating the file.
+ */
+ dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
+ }
+
+ error = dmu_tx_assign(tx, zfsvfs->z_assign);
+ if (error) {
+ if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ zfs_range_unlock(rl);
+ return (error);
+ }
+
+ if (new_blksz)
+ zfs_grow_blocksize(zp, new_blksz, tx);
+
+ if (end > size || len == 0)
+ zp->z_phys->zp_size = end;
+
+ if (off < size) {
+ objset_t *os = zfsvfs->z_os;
+ uint64_t rlen = len;
+
+ if (len == 0)
+ rlen = -1;
+ else if (end > size)
+ rlen = size - off;
+ VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx));
+ }
+
+ if (log) {
+ zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+ zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+ }
+
+ zfs_range_unlock(rl);
+
+ dmu_tx_commit(tx);
+
+ /*
+ * Clear any mapped pages in the truncated region. This has to
+ * happen outside of the transaction to avoid the possibility of
+ * a deadlock with someone trying to push a page that we are
+ * about to invalidate.
+ */
+ rw_enter(&zp->z_map_lock, RW_WRITER);
+ if (end > size)
+ vnode_pager_setsize(vp, end);
+ else if (len == 0) {
+#if 0
+ error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE);
+#else
+ error = vinvalbuf(vp, V_SAVE, curthread, 0, 0);
+ vnode_pager_setsize(vp, end);
+#endif
+ }
+ rw_exit(&zp->z_map_lock);
+
+ return (0);
+}
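+
+/*
+ * In short: len == 0 sets the file size to off (an extend or truncate),
+ * while len != 0 frees (punches a hole in) the given range; when
+ * extending past a single block the blocksize is grown first, inside
+ * the same transaction.
+ */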
+
+void
+zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
+{
+ zfsvfs_t zfsvfs;
+ uint64_t moid, doid, roid = 0;
+ uint64_t version = ZPL_VERSION;
+ int error;
+ znode_t *rootzp = NULL;
+ vattr_t vattr;
+
+ /*
+ * First attempt to create master node.
+ */
+ /*
+ * In an empty objset, there are no blocks to read and thus
+ * there can be no i/o errors (which we assert below).
+ */
+ moid = MASTER_NODE_OBJ;
+ error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Set starting attributes.
+ */
+
+ error = zap_update(os, moid, ZPL_VERSION_OBJ, 8, 1, &version, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create a delete queue.
+ */
+ doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create root znode. Create minimal znode/vnode/zfsvfs
+ * to allow zfs_mknode to work.
+ */
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
+ vattr.va_type = VDIR;
+ vattr.va_mode = S_IFDIR|0755;
+ vattr.va_uid = UID_ROOT;
+ vattr.va_gid = GID_WHEEL;
+
+ rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ zfs_znode_cache_constructor(rootzp, NULL, 0);
+ rootzp->z_zfsvfs = &zfsvfs;
+ rootzp->z_unlinked = 0;
+ rootzp->z_atime_dirty = 0;
+ rootzp->z_dbuf_held = 0;
+
+ bzero(&zfsvfs, sizeof (zfsvfs_t));
+
+ zfsvfs.z_os = os;
+ zfsvfs.z_assign = TXG_NOWAIT;
+ zfsvfs.z_parent = &zfsvfs;
+
+ mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+
+ zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
+ ASSERT3U(rootzp->z_id, ==, roid);
+ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
+ ASSERT(error == 0);
+
+ kmem_cache_free(znode_cache, rootzp);
+}
+#endif /* _KERNEL */
+
+/*
+ * Given an object number, return its parent object number and whether
+ * or not the object is an extended attribute directory.
+ */
+static int
+zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
+{
+ dmu_buf_t *db;
+ dmu_object_info_t doi;
+ znode_phys_t *zp;
+ int error;
+
+ if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
+ return (error);
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ doi.doi_bonus_size < sizeof (znode_phys_t)) {
+ dmu_buf_rele(db, FTAG);
+ return (EINVAL);
+ }
+
+ zp = db->db_data;
+ *pobjp = zp->zp_parent;
+ *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
+ S_ISDIR(zp->zp_mode);
+ dmu_buf_rele(db, FTAG);
+
+ return (0);
+}
+
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+ char *path = buf + len - 1;
+ int error;
+
+ *path = '\0';
+
+ for (;;) {
+ uint64_t pobj;
+ char component[MAXNAMELEN + 2];
+ size_t complen;
+ int is_xattrdir;
+
+ if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
+ &is_xattrdir)) != 0)
+ break;
+
+ if (pobj == obj) {
+ if (path[0] != '/')
+ *--path = '/';
+ break;
+ }
+
+ component[0] = '/';
+ if (is_xattrdir) {
+ (void) sprintf(component + 1, "<xattrdir>");
+ } else {
+ error = zap_value_search(osp, pobj, obj, component + 1);
+ if (error != 0)
+ break;
+ }
+
+ complen = strlen(component);
+ path -= complen;
+ ASSERT(path >= buf);
+ bcopy(component, path, complen);
+ obj = pobj;
+ }
+
+ if (error == 0)
+ (void) memmove(buf, path, buf + len - path);
+ return (error);
+}
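+
+/*
+ * The path is assembled right-to-left into the tail of buf: e.g. for
+ * "/a/b" the loop writes "/b", then "/a", and the final memmove() shifts
+ * the result to the front of buf.
+ */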
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
new file mode 100644
index 0000000..6b52b55
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
@@ -0,0 +1,1607 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/arc.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vdev.h>
+#include <sys/dmu_tx.h>
+
+/*
+ * The zfs intent log (ZIL) saves transaction records of system calls
+ * that change the file system in memory with enough information
+ * to be able to replay them. These are stored in memory until
+ * either the DMU transaction group (txg) commits them to the stable pool
+ * and they can be discarded, or they are flushed to the stable log
+ * (also in the pool) due to a fsync, O_DSYNC or other synchronous
+ * requirement. In the event of a panic or power fail then those log
+ * records (transactions) are replayed.
+ *
+ * There is one ZIL per file system. Its on-disk (pool) format consists
+ * of 3 parts:
+ *
+ * - ZIL header
+ * - ZIL blocks
+ * - ZIL records
+ *
+ * A log record holds a system call transaction. Log blocks can
+ * hold many log records and the blocks are chained together.
+ * Each ZIL block contains a block pointer (blkptr_t) to the next
+ * ZIL block in the chain. The ZIL header points to the first
+ * block in the chain. Note that there is no fixed place in the pool
+ * to hold ZIL blocks; they are dynamically allocated and freed as
+ * needed from the space available.
+ */
+
+/*
+ * This global ZIL switch affects all pools
+ */
+int zil_disable = 0; /* disable intent logging */
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.zil_disable", &zil_disable);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_disable, CTLFLAG_RDTUN, &zil_disable, 0,
+ "Disable ZFS Intent Log (ZIL)");
+
+/*
+ * Tunable parameter for debugging or performance analysis. Setting
+ * zfs_nocacheflush will cause corruption on power loss if a volatile
+ * out-of-order write cache is enabled.
+ */
+boolean_t zfs_nocacheflush = B_FALSE;
+TUNABLE_INT("vfs.zfs.cache_flush_disable", &zfs_nocacheflush);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN,
+ &zfs_nocacheflush, 0, "Disable cache flush");
+
+static kmem_cache_t *zil_lwb_cache;
+
+static int
+zil_dva_compare(const void *x1, const void *x2)
+{
+ const dva_t *dva1 = x1;
+ const dva_t *dva2 = x2;
+
+ if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
+ return (-1);
+ if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
+ return (1);
+
+ if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
+ return (-1);
+ if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
+ return (1);
+
+ return (0);
+}
+
+static void
+zil_dva_tree_init(avl_tree_t *t)
+{
+ avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
+ offsetof(zil_dva_node_t, zn_node));
+}
+
+static void
+zil_dva_tree_fini(avl_tree_t *t)
+{
+ zil_dva_node_t *zn;
+ void *cookie = NULL;
+
+ while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(zn, sizeof (zil_dva_node_t));
+
+ avl_destroy(t);
+}
+
+static int
+zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
+{
+ zil_dva_node_t *zn;
+ avl_index_t where;
+
+ if (avl_find(t, dva, &where) != NULL)
+ return (EEXIST);
+
+ zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
+ zn->zn_dva = *dva;
+ avl_insert(t, zn, where);
+
+ return (0);
+}
+
+static zil_header_t *
+zil_header_in_syncing_context(zilog_t *zilog)
+{
+ return ((zil_header_t *)zilog->zl_header);
+}
+
+static void
+zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
+{
+ zio_cksum_t *zc = &bp->blk_cksum;
+
+ zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
+ zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
+ zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
+ zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
+}
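+
+/*
+ * The checksum seeded above doubles as the log chain identity: each
+ * block's trailer stores the expected checksum of the next block, i.e.
+ * this block's checksum with ZIL_ZC_SEQ incremented, which is exactly
+ * what zil_read_log_block() verifies.
+ */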
+
+/*
+ * Read a log block, make sure it's valid, and byteswap it if necessary.
+ */
+static int
+zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
+{
+ blkptr_t blk = *bp;
+ zbookmark_t zb;
+ uint32_t aflags = ARC_WAIT;
+ int error;
+
+ zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+ *abufpp = NULL;
+
+ error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array,
+ arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);
+
+ if (error == 0) {
+ char *data = (*abufpp)->b_data;
+ uint64_t blksz = BP_GET_LSIZE(bp);
+ zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
+ zio_cksum_t cksum = bp->blk_cksum;
+
+ /*
+ * Sequence numbers should be... sequential. The checksum
+ * verifier for the next block should be bp's checksum plus 1.
+ */
+ cksum.zc_word[ZIL_ZC_SEQ]++;
+
+ if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)))
+ error = ESTALE;
+ else if (BP_IS_HOLE(&ztp->zit_next_blk))
+ error = ENOENT;
+ else if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))
+ error = EOVERFLOW;
+
+ if (error) {
+ VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
+ *abufpp = NULL;
+ }
+ }
+
+ dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);
+
+ return (error);
+}
+
+/*
+ * Parse the intent log, and call parse_func for each valid record within.
+ * Return the highest sequence number.
+ */
+uint64_t
+zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ uint64_t claim_seq = zh->zh_claim_seq;
+ uint64_t seq = 0;
+ uint64_t max_seq = 0;
+ blkptr_t blk = zh->zh_log;
+ arc_buf_t *abuf;
+ char *lrbuf, *lrp;
+ zil_trailer_t *ztp;
+ int reclen, error;
+
+ if (BP_IS_HOLE(&blk))
+ return (max_seq);
+
+ /*
+ * Starting at the block pointed to by zh_log we read the log chain.
+ * For each block in the chain we strongly check that block to
+ * ensure its validity. We stop when an invalid block is found.
+ * For each block pointer in the chain we call parse_blk_func().
+ * For each record in each valid block we call parse_lr_func().
+ * If the log has been claimed, stop if we encounter a sequence
+ * number greater than the highest claimed sequence number.
+ */
+ zil_dva_tree_init(&zilog->zl_dva_tree);
+ for (;;) {
+ seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+ if (claim_seq != 0 && seq > claim_seq)
+ break;
+
+ ASSERT(max_seq < seq);
+ max_seq = seq;
+
+ error = zil_read_log_block(zilog, &blk, &abuf);
+
+ if (parse_blk_func != NULL)
+ parse_blk_func(zilog, &blk, arg, txg);
+
+ if (error)
+ break;
+
+ lrbuf = abuf->b_data;
+ ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
+ blk = ztp->zit_next_blk;
+
+ if (parse_lr_func == NULL) {
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ continue;
+ }
+
+ for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
+ lr_t *lr = (lr_t *)lrp;
+ reclen = lr->lrc_reclen;
+ ASSERT3U(reclen, >=, sizeof (lr_t));
+ parse_lr_func(zilog, lr, arg, txg);
+ }
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ }
+ zil_dva_tree_fini(&zilog->zl_dva_tree);
+
+ return (max_seq);
+}
+
+/* ARGSUSED */
+static void
+zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
+{
+ spa_t *spa = zilog->zl_spa;
+ int err;
+
+ /*
+ * Claim log block if not already committed and not already claimed.
+ */
+ if (bp->blk_birth >= first_txg &&
+ zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
+ err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL));
+ ASSERT(err == 0);
+ }
+}
+
+static void
+zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
+{
+ if (lrc->lrc_txtype == TX_WRITE) {
+ lr_write_t *lr = (lr_write_t *)lrc;
+ zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg);
+ }
+}
+
+/* ARGSUSED */
+static void
+zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
+{
+ zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
+}
+
+static void
+zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
+{
+ /*
+ * If we previously claimed it, we need to free it.
+ */
+ if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) {
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+ if (bp->blk_birth >= claim_txg &&
+ !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) {
+ (void) arc_free(NULL, zilog->zl_spa,
+ dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT);
+ }
+ }
+}
+
+/*
+ * Create an on-disk intent log.
+ */
+static void
+zil_create(zilog_t *zilog)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ lwb_t *lwb;
+ uint64_t txg = 0;
+ dmu_tx_t *tx = NULL;
+ blkptr_t blk;
+ int error = 0;
+
+ /*
+ * Wait for any previous destroy to complete.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+ ASSERT(zh->zh_claim_txg == 0);
+ ASSERT(zh->zh_replay_seq == 0);
+
+ blk = zh->zh_log;
+
+ /*
+ * If we don't already have an initial log block, allocate one now.
+ */
+ if (BP_IS_HOLE(&blk)) {
+ tx = dmu_tx_create(zilog->zl_os);
+ (void) dmu_tx_assign(tx, TXG_WAIT);
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
+ NULL, txg);
+
+ if (error == 0)
+ zil_init_log_chain(zilog, &blk);
+ }
+
+ /*
+ * Allocate a log write buffer (lwb) for the first log block.
+ */
+ if (error == 0) {
+ lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+ lwb->lwb_zilog = zilog;
+ lwb->lwb_blk = blk;
+ lwb->lwb_nused = 0;
+ lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
+ lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
+ lwb->lwb_max_txg = txg;
+ lwb->lwb_zio = NULL;
+
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_lwb_list, lwb);
+ mutex_exit(&zilog->zl_lock);
+ }
+
+ /*
+ * If we just allocated the first log block, commit our transaction
+ * and wait for zil_sync() to stuff the block pointer into zh_log.
+ * (zh is part of the MOS, so we cannot modify it in open context.)
+ */
+ if (tx != NULL) {
+ dmu_tx_commit(tx);
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ }
+
+ ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
+}
+
+/*
+ * In one tx, free all log blocks and clear the log header.
+ * If keep_first is set, then we're replaying a log with no content.
+ * We want to keep the first block, however, so that the first
+ * synchronous transaction doesn't require a txg_wait_synced()
+ * in zil_create(). We don't need to txg_wait_synced() here either
+ * when keep_first is set, because both zil_create() and zil_destroy()
+ * will wait for any in-progress destroys to complete.
+ */
+void
+zil_destroy(zilog_t *zilog, boolean_t keep_first)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ lwb_t *lwb;
+ dmu_tx_t *tx;
+ uint64_t txg;
+
+ /*
+ * Wait for any previous destroy to complete.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+ if (BP_IS_HOLE(&zh->zh_log))
+ return;
+
+ tx = dmu_tx_create(zilog->zl_os);
+ (void) dmu_tx_assign(tx, TXG_WAIT);
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ mutex_enter(&zilog->zl_lock);
+
+ ASSERT3U(zilog->zl_destroy_txg, <, txg);
+ zilog->zl_destroy_txg = txg;
+ zilog->zl_keep_first = keep_first;
+
+ if (!list_is_empty(&zilog->zl_lwb_list)) {
+ ASSERT(zh->zh_claim_txg == 0);
+ ASSERT(!keep_first);
+ while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+ list_remove(&zilog->zl_lwb_list, lwb);
+ if (lwb->lwb_buf != NULL)
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
+ kmem_cache_free(zil_lwb_cache, lwb);
+ }
+ } else {
+ if (!keep_first) {
+ (void) zil_parse(zilog, zil_free_log_block,
+ zil_free_log_record, tx, zh->zh_claim_txg);
+ }
+ }
+ mutex_exit(&zilog->zl_lock);
+
+ dmu_tx_commit(tx);
+
+ if (keep_first) /* no need to wait in this case */
+ return;
+
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ ASSERT(BP_IS_HOLE(&zh->zh_log));
+}
+
+int
+zil_claim(char *osname, void *txarg)
+{
+ dmu_tx_t *tx = txarg;
+ uint64_t first_txg = dmu_tx_get_txg(tx);
+ zilog_t *zilog;
+ zil_header_t *zh;
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os);
+ if (error) {
+ cmn_err(CE_WARN, "can't process intent log for %s", osname);
+ return (0);
+ }
+
+ zilog = dmu_objset_zil(os);
+ zh = zil_header_in_syncing_context(zilog);
+
+ /*
+ * Claim all log blocks if we haven't already done so, and remember
+ * the highest claimed sequence number. This ensures that if we can
+ * read only part of the log now (e.g. due to a missing device),
+ * but we can read the entire log later, we will not try to replay
+ * or destroy beyond the last block we successfully claimed.
+ */
+ ASSERT3U(zh->zh_claim_txg, <=, first_txg);
+ if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
+ zh->zh_claim_txg = first_txg;
+ zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
+ zil_claim_log_record, tx, first_txg);
+ dsl_dataset_dirty(dmu_objset_ds(os), tx);
+ }
+
+ ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
+ dmu_objset_close(os);
+ return (0);
+}
+
+void
+zil_add_vdev(zilog_t *zilog, uint64_t vdev)
+{
+ zil_vdev_t *zv, *new;
+ uint64_t bmap_sz = sizeof (zilog->zl_vdev_bmap) << 3;
+ uchar_t *cp;
+
+ if (zfs_nocacheflush)
+ return;
+
+ if (vdev < bmap_sz) {
+ cp = zilog->zl_vdev_bmap + (vdev / 8);
+ atomic_or_8(cp, 1 << (vdev % 8));
+ } else {
+ /*
+ * insert into ordered list
+ */
+ mutex_enter(&zilog->zl_lock);
+ for (zv = list_head(&zilog->zl_vdev_list); zv != NULL;
+ zv = list_next(&zilog->zl_vdev_list, zv)) {
+ if (zv->vdev == vdev) {
+ /* duplicate found - just return */
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ if (zv->vdev > vdev) {
+ /* insert before this entry */
+ new = kmem_alloc(sizeof (zil_vdev_t),
+ KM_SLEEP);
+ new->vdev = vdev;
+ list_insert_before(&zilog->zl_vdev_list,
+ zv, new);
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ }
+ /* ran off end of list, insert at the end */
+ ASSERT(zv == NULL);
+ new = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP);
+ new->vdev = vdev;
+ list_insert_tail(&zilog->zl_vdev_list, new);
+ mutex_exit(&zilog->zl_lock);
+ }
+}
+
+/* start an async flush of the write cache for this vdev */
+void
+zil_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio)
+{
+ vdev_t *vd;
+
+ if (*zio == NULL)
+ *zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ vd = vdev_lookup_top(spa, vdev);
+ ASSERT(vd);
+
+ (void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+}
+
+void
+zil_flush_vdevs(zilog_t *zilog)
+{
+ zil_vdev_t *zv;
+ zio_t *zio = NULL;
+ spa_t *spa = zilog->zl_spa;
+ uint64_t vdev;
+ uint8_t b;
+ int i, j;
+
+ ASSERT(zilog->zl_writer);
+
+ for (i = 0; i < sizeof (zilog->zl_vdev_bmap); i++) {
+ b = zilog->zl_vdev_bmap[i];
+ if (b == 0)
+ continue;
+ for (j = 0; j < 8; j++) {
+ if (b & (1 << j)) {
+ vdev = (i << 3) + j;
+ zil_flush_vdev(spa, vdev, &zio);
+ }
+ }
+ zilog->zl_vdev_bmap[i] = 0;
+ }
+
+ while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) {
+ zil_flush_vdev(spa, zv->vdev, &zio);
+ list_remove(&zilog->zl_vdev_list, zv);
+ kmem_free(zv, sizeof (zil_vdev_t));
+ }
+ /*
+ * Wait for all the flushes to complete. Not all devices actually
+ * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
+ */
+ if (zio)
+ (void) zio_wait(zio);
+}
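+
+/*
+ * Note the two-level vdev tracking: zil_add_vdev() records low-numbered
+ * vdevs lock-free in the zl_vdev_bmap bitmap via atomic_or_8(), and
+ * higher-numbered ones on the sorted zl_vdev_list; zil_flush_vdevs()
+ * drains both before waiting on the flush zios.
+ */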
+
+/*
+ * Function called when a log block write completes
+ */
+static void
+zil_lwb_write_done(zio_t *zio)
+{
+ lwb_t *lwb = zio->io_private;
+ zilog_t *zilog = lwb->lwb_zilog;
+
+ /*
+ * Now that we've written this log block, we have a stable pointer
+ * to the next block in the chain, so it's OK to let the txg in
+ * which we allocated the next block sync.
+ */
+ txg_rele_to_sync(&lwb->lwb_txgh);
+
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ mutex_enter(&zilog->zl_lock);
+ lwb->lwb_buf = NULL;
+ if (zio->io_error) {
+ zilog->zl_log_error = B_TRUE;
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ mutex_exit(&zilog->zl_lock);
+}
+
+/*
+ * Initialize the io for a log block.
+ *
+ * Note, we should not initialize the IO until we are about
+ * to use it, since zio_rewrite() does a spa_config_enter().
+ */
+static void
+zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
+{
+ zbookmark_t zb;
+
+ zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+ if (zilog->zl_root_zio == NULL) {
+ zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ }
+ if (lwb->lwb_zio == NULL) {
+ lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
+ ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf,
+ lwb->lwb_sz, zil_lwb_write_done, lwb,
+ ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ }
+}
+
+/*
+ * Start a log block write and advance to the next log block.
+ * Calls are serialized.
+ */
+static lwb_t *
+zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
+{
+ lwb_t *nlwb;
+ zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
+ spa_t *spa = zilog->zl_spa;
+ blkptr_t *bp = &ztp->zit_next_blk;
+ uint64_t txg;
+ uint64_t zil_blksz;
+ int error;
+
+ ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
+
+ /*
+ * Allocate the next block and save its address in this block
+ * before writing it in order to establish the log chain.
+ * Note that if the allocation of nlwb synced before we wrote
+ * the block that points at it (lwb), we'd leak it if we crashed.
+ * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
+ */
+ txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
+ txg_rele_to_quiesce(&lwb->lwb_txgh);
+
+ /*
+ * Pick a ZIL blocksize. We request a size that is the
+ * maximum of the previous used size, the current used size and
+ * the amount waiting in the queue.
+ */
+ zil_blksz = MAX(zilog->zl_prev_used,
+ zilog->zl_cur_used + sizeof (*ztp));
+ zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
+ zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
+ if (zil_blksz > ZIL_MAX_BLKSZ)
+ zil_blksz = ZIL_MAX_BLKSZ;
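+ /*
+ * Example (illustrative numbers only): ~10K of current plus queued
+ * record bytes rounds up to the next ZIL_MIN_BLKSZ multiple and is
+ * capped at ZIL_MAX_BLKSZ.
+ */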
+
+ BP_ZERO(bp);
+ /* pass the old blkptr in order to spread log blocks across devs */
+ error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
+ if (error) {
+ dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);
+
+ /*
+ * We dirty the dataset to ensure that zil_sync() will
+ * be called to remove this lwb from our zl_lwb_list.
+ * Failing to do so may leave an lwb with a NULL lwb_buf
+ * hanging around on the zl_lwb_list.
+ */
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ dmu_tx_commit(tx);
+
+ /*
+ * We've just experienced an allocation failure, so we
+ * terminate the current lwb and send it on its way.
+ */
+ ztp->zit_pad = 0;
+ ztp->zit_nused = lwb->lwb_nused;
+ ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
+ zio_nowait(lwb->lwb_zio);
+
+ /*
+ * By returning NULL the caller will call txg_wait_synced().
+ */
+ return (NULL);
+ }
+
+ ASSERT3U(bp->blk_birth, ==, txg);
+ ztp->zit_pad = 0;
+ ztp->zit_nused = lwb->lwb_nused;
+ ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
+ bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
+
+ /*
+ * Allocate a new log write buffer (lwb).
+ */
+ nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+
+ nlwb->lwb_zilog = zilog;
+ nlwb->lwb_blk = *bp;
+ nlwb->lwb_nused = 0;
+ nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
+ nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
+ nlwb->lwb_max_txg = txg;
+ nlwb->lwb_zio = NULL;
+
+ /*
+ * Put new lwb at the end of the log chain
+ */
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_lwb_list, nlwb);
+ mutex_exit(&zilog->zl_lock);
+
+ /* Record the vdev for later flushing */
+ zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk))));
+
+ /*
+ * kick off the write for the old log block
+ */
+ dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
+ ASSERT(lwb->lwb_zio);
+ zio_nowait(lwb->lwb_zio);
+
+ return (nlwb);
+}
+
+static lwb_t *
+zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
+{
+ lr_t *lrc = &itx->itx_lr; /* common log record */
+ lr_write_t *lr = (lr_write_t *)lrc;
+ uint64_t txg = lrc->lrc_txg;
+ uint64_t reclen = lrc->lrc_reclen;
+ uint64_t dlen;
+
+ if (lwb == NULL)
+ return (NULL);
+ ASSERT(lwb->lwb_buf != NULL);
+
+ if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
+ dlen = P2ROUNDUP_TYPED(
+ lr->lr_length, sizeof (uint64_t), uint64_t);
+ else
+ dlen = 0;
+
+ zilog->zl_cur_used += (reclen + dlen);
+
+ zil_lwb_write_init(zilog, lwb);
+
+ /*
+ * If this record won't fit in the current log block, start a new one.
+ */
+ if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+ lwb = zil_lwb_write_start(zilog, lwb);
+ if (lwb == NULL)
+ return (NULL);
+ zil_lwb_write_init(zilog, lwb);
+ ASSERT(lwb->lwb_nused == 0);
+ if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ return (lwb);
+ }
+ }
+
+ /*
+ * Update lrc_seq to be the log record sequence number (see zil.h).
+ * Then copy the record to the log buffer.
+ */
+ lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
+ bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
+
+ /*
+ * If it's a write, fetch the data or get its blkptr as appropriate.
+ */
+ if (lrc->lrc_txtype == TX_WRITE) {
+ if (txg > spa_freeze_txg(zilog->zl_spa))
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ if (itx->itx_wr_state != WR_COPIED) {
+ char *dbuf;
+ int error;
+
+ /* alignment is guaranteed */
+ lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused);
+ if (dlen) {
+ ASSERT(itx->itx_wr_state == WR_NEED_COPY);
+ dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen;
+ lr->lr_common.lrc_reclen += dlen;
+ } else {
+ ASSERT(itx->itx_wr_state == WR_INDIRECT);
+ dbuf = NULL;
+ }
+ error = zilog->zl_get_data(
+ itx->itx_private, lr, dbuf, lwb->lwb_zio);
+ if (error) {
+ ASSERT(error == ENOENT || error == EEXIST ||
+ error == EALREADY);
+ return (lwb);
+ }
+ }
+ }
+
+ lwb->lwb_nused += reclen + dlen;
+ lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
+ ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
+ ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);
+
+ return (lwb);
+}
+
+itx_t *
+zil_itx_create(int txtype, size_t lrsize)
+{
+ itx_t *itx;
+
+ lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
+
+ itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
+ itx->itx_lr.lrc_txtype = txtype;
+ itx->itx_lr.lrc_reclen = lrsize;
+ itx->itx_lr.lrc_seq = 0; /* defensive */
+
+ return (itx);
+}
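+
+/*
+ * Hypothetical usage sketch (the real record producers live in
+ * zfs_log.c):
+ *
+ * itx = zil_itx_create(TX_CREATE, sizeof (lr_create_t) + namesize);
+ * lr = (lr_create_t *)&itx->itx_lr;
+ * ... fill in the lr_create_t fields ...
+ * seq = zil_itx_assign(zilog, itx, tx);
+ */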
+
+uint64_t
+zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+{
+ uint64_t seq;
+
+ ASSERT(itx->itx_lr.lrc_seq == 0);
+
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_itx_list, itx);
+ zilog->zl_itx_list_sz += itx->itx_lr.lrc_reclen;
+ itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+ itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
+ mutex_exit(&zilog->zl_lock);
+
+ return (seq);
+}
+
+/*
+ * Free up all in-memory intent log transactions that have now been synced.
+ */
+static void
+zil_itx_clean(zilog_t *zilog)
+{
+ uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
+ uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
+ list_t clean_list;
+ itx_t *itx;
+
+ list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
+
+ mutex_enter(&zilog->zl_lock);
+ /* wait for a log writer to finish walking list */
+ while (zilog->zl_writer) {
+ cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+ }
+
+ /*
+ * Move the sync'd log transactions to a separate list so we can call
+ * kmem_free without holding the zl_lock.
+ *
+ * There is no need to set zl_writer as we don't drop zl_lock here
+ */
+ while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
+ itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
+ list_remove(&zilog->zl_itx_list, itx);
+ zilog->zl_itx_list_sz -= itx->itx_lr.lrc_reclen;
+ list_insert_tail(&clean_list, itx);
+ }
+ cv_broadcast(&zilog->zl_cv_writer);
+ mutex_exit(&zilog->zl_lock);
+
+ /* destroy sync'd log transactions */
+ while ((itx = list_head(&clean_list)) != NULL) {
+ list_remove(&clean_list, itx);
+ kmem_free(itx, offsetof(itx_t, itx_lr)
+ + itx->itx_lr.lrc_reclen);
+ }
+ list_destroy(&clean_list);
+}
+
+/*
+ * If there are any in-memory intent log transactions which have now been
+ * synced then start up a taskq to free them.
+ */
+void
+zil_clean(zilog_t *zilog)
+{
+ itx_t *itx;
+
+ mutex_enter(&zilog->zl_lock);
+ itx = list_head(&zilog->zl_itx_list);
+ if ((itx != NULL) &&
+ (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
+ (void) taskq_dispatch(zilog->zl_clean_taskq,
+ (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
+ }
+ mutex_exit(&zilog->zl_lock);
+}
+
+void
+zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
+{
+ uint64_t txg;
+ uint64_t reclen;
+ uint64_t commit_seq = 0;
+ itx_t *itx, *itx_next = (itx_t *)-1;
+ lwb_t *lwb;
+ spa_t *spa;
+
+ zilog->zl_writer = B_TRUE;
+ zilog->zl_root_zio = NULL;
+ spa = zilog->zl_spa;
+
+ if (zilog->zl_suspend) {
+ lwb = NULL;
+ } else {
+ lwb = list_tail(&zilog->zl_lwb_list);
+ if (lwb == NULL) {
+ /*
+ * Return if there's nothing to flush before we
+ * dirty the fs by calling zil_create()
+ */
+ if (list_is_empty(&zilog->zl_itx_list)) {
+ zilog->zl_writer = B_FALSE;
+ return;
+ }
+ mutex_exit(&zilog->zl_lock);
+ zil_create(zilog);
+ mutex_enter(&zilog->zl_lock);
+ lwb = list_tail(&zilog->zl_lwb_list);
+ }
+ }
+
+ /* Loop through in-memory log transactions filling log blocks. */
+ DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
+ for (;;) {
+ /*
+ * Find the next itx to push:
+ * Push all transactions related to specified foid and all
+ * other transactions except TX_WRITE, TX_TRUNCATE,
+ * TX_SETATTR and TX_ACL for all other files.
+ */
+ if (itx_next != (itx_t *)-1)
+ itx = itx_next;
+ else
+ itx = list_head(&zilog->zl_itx_list);
+ for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) {
+ if (foid == 0) /* push all foids? */
+ break;
+ if (itx->itx_sync) /* push all O_[D]SYNC */
+ break;
+ switch (itx->itx_lr.lrc_txtype) {
+ case TX_SETATTR:
+ case TX_WRITE:
+ case TX_TRUNCATE:
+ case TX_ACL:
+ /* lr_foid is same offset for these records */
+ if (((lr_write_t *)&itx->itx_lr)->lr_foid
+ != foid) {
+ continue; /* skip this record */
+ }
+ }
+ break;
+ }
+ if (itx == NULL)
+ break;
+
+ reclen = itx->itx_lr.lrc_reclen;
+ if ((itx->itx_lr.lrc_seq > seq) &&
+ ((lwb == NULL) || (lwb->lwb_nused == 0) ||
+ (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)))) {
+ break;
+ }
+
+ /*
+ * Save the next pointer. Even though we soon drop
+ * zl_lock all threads that may change the list
+ * (another writer or zil_itx_clean) can't do so until
+ * they have zl_writer.
+ */
+ itx_next = list_next(&zilog->zl_itx_list, itx);
+ list_remove(&zilog->zl_itx_list, itx);
+ mutex_exit(&zilog->zl_lock);
+ txg = itx->itx_lr.lrc_txg;
+ ASSERT(txg);
+
+ if (txg > spa_last_synced_txg(spa) ||
+ txg > spa_freeze_txg(spa))
+ lwb = zil_lwb_commit(zilog, itx, lwb);
+ kmem_free(itx, offsetof(itx_t, itx_lr)
+ + itx->itx_lr.lrc_reclen);
+ mutex_enter(&zilog->zl_lock);
+ zilog->zl_itx_list_sz -= reclen;
+ }
+ DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
+ /* determine commit sequence number */
+ itx = list_head(&zilog->zl_itx_list);
+ if (itx)
+ commit_seq = itx->itx_lr.lrc_seq;
+ else
+ commit_seq = zilog->zl_itx_seq;
+ mutex_exit(&zilog->zl_lock);
+
+ /* write the last block out */
+ if (lwb != NULL && lwb->lwb_zio != NULL)
+ lwb = zil_lwb_write_start(zilog, lwb);
+
+ zilog->zl_prev_used = zilog->zl_cur_used;
+ zilog->zl_cur_used = 0;
+
+ /*
+ * Wait if necessary for the log blocks to be on stable storage.
+ */
+ if (zilog->zl_root_zio) {
+ DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
+ (void) zio_wait(zilog->zl_root_zio);
+ DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
+ if (!zfs_nocacheflush)
+ zil_flush_vdevs(zilog);
+ }
+
+ if (zilog->zl_log_error || lwb == NULL) {
+ zilog->zl_log_error = 0;
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+ }
+
+ mutex_enter(&zilog->zl_lock);
+ zilog->zl_writer = B_FALSE;
+
+ ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
+ zilog->zl_commit_seq = commit_seq;
+}
+
+/*
+ * Push zfs transactions to stable storage up to the supplied sequence number.
+ * If foid is 0 push out all transactions, otherwise push only those
+ * for that file or those that might have been used to create that file.
+ */
+void
+zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
+{
+ if (zilog == NULL || seq == 0)
+ return;
+
+ mutex_enter(&zilog->zl_lock);
+
+ seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */
+
+ while (zilog->zl_writer) {
+ cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+ if (seq < zilog->zl_commit_seq) {
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ }
+ zil_commit_writer(zilog, seq, foid); /* drops zl_lock */
+ /* wake up others waiting on the commit */
+ cv_broadcast(&zilog->zl_cv_writer);
+ mutex_exit(&zilog->zl_lock);
+}
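+
+/*
+ * Note the batching above: only one thread at a time becomes the writer;
+ * later callers wait on zl_cv_writer and return early once zl_commit_seq
+ * has passed their sequence number.
+ */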
+
+/*
+ * Called in syncing context to free committed log blocks and update log header.
+ */
+void
+zil_sync(zilog_t *zilog, dmu_tx_t *tx)
+{
+ zil_header_t *zh = zil_header_in_syncing_context(zilog);
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = zilog->zl_spa;
+ lwb_t *lwb;
+
+ mutex_enter(&zilog->zl_lock);
+
+ ASSERT(zilog->zl_stop_sync == 0);
+
+ zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
+
+ if (zilog->zl_destroy_txg == txg) {
+ blkptr_t blk = zh->zh_log;
+
+ ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
+ ASSERT(spa_sync_pass(spa) == 1);
+
+ bzero(zh, sizeof (zil_header_t));
+ bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
+
+ if (zilog->zl_keep_first) {
+ /*
+ * If this block was part of log chain that couldn't
+ * be claimed because a device was missing during
+ * zil_claim(), but that device later returns,
+ * then this block could erroneously appear valid.
+ * To guard against this, assign a new GUID to the new
+ * log chain so it doesn't matter what blk points to.
+ */
+ zil_init_log_chain(zilog, &blk);
+ zh->zh_log = blk;
+ }
+ }
+
+ for (;;) {
+ lwb = list_head(&zilog->zl_lwb_list);
+ if (lwb == NULL) {
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ zh->zh_log = lwb->lwb_blk;
+ if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
+ break;
+ list_remove(&zilog->zl_lwb_list, lwb);
+ zio_free_blk(spa, &lwb->lwb_blk, txg);
+ kmem_cache_free(zil_lwb_cache, lwb);
+
+ /*
+ * If we don't have anything left in the lwb list then
+ * we've had an allocation failure and we need to zero
+ * out the zil_header blkptr so that we don't end
+ * up freeing the same block twice.
+ */
+ if (list_head(&zilog->zl_lwb_list) == NULL)
+ BP_ZERO(&zh->zh_log);
+ }
+ mutex_exit(&zilog->zl_lock);
+}
+
+void
+zil_init(void)
+{
+ zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
+ sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+zil_fini(void)
+{
+ kmem_cache_destroy(zil_lwb_cache);
+}
+
+zilog_t *
+zil_alloc(objset_t *os, zil_header_t *zh_phys)
+{
+ zilog_t *zilog;
+
+ zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
+
+ zilog->zl_header = zh_phys;
+ zilog->zl_os = os;
+ zilog->zl_spa = dmu_objset_spa(os);
+ zilog->zl_dmu_pool = dmu_objset_pool(os);
+ zilog->zl_destroy_txg = TXG_INITIAL - 1;
+
+ mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
+ cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
+
+ list_create(&zilog->zl_itx_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+
+ list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
+ offsetof(lwb_t, lwb_node));
+
+ list_create(&zilog->zl_vdev_list, sizeof (zil_vdev_t),
+ offsetof(zil_vdev_t, vdev_seq_node));
+
+ return (zilog);
+}
+
+void
+zil_free(zilog_t *zilog)
+{
+ lwb_t *lwb;
+ zil_vdev_t *zv;
+
+ zilog->zl_stop_sync = 1;
+
+ while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+ list_remove(&zilog->zl_lwb_list, lwb);
+ if (lwb->lwb_buf != NULL)
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ kmem_cache_free(zil_lwb_cache, lwb);
+ }
+ list_destroy(&zilog->zl_lwb_list);
+
+ while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) {
+ list_remove(&zilog->zl_vdev_list, zv);
+ kmem_free(zv, sizeof (zil_vdev_t));
+ }
+ list_destroy(&zilog->zl_vdev_list);
+
+ ASSERT(list_head(&zilog->zl_itx_list) == NULL);
+ list_destroy(&zilog->zl_itx_list);
+ cv_destroy(&zilog->zl_cv_suspend);
+ cv_destroy(&zilog->zl_cv_writer);
+ mutex_destroy(&zilog->zl_lock);
+
+ kmem_free(zilog, sizeof (zilog_t));
+}
+
+/*
+ * return true if the initial log block is not valid
+ */
+static int
+zil_empty(zilog_t *zilog)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ arc_buf_t *abuf = NULL;
+
+ if (BP_IS_HOLE(&zh->zh_log))
+ return (1);
+
+ if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
+ return (1);
+
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ return (0);
+}
+
+/*
+ * Open an intent log.
+ */
+zilog_t *
+zil_open(objset_t *os, zil_get_data_t *get_data)
+{
+ zilog_t *zilog = dmu_objset_zil(os);
+
+ zilog->zl_get_data = get_data;
+ zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
+ 2, 2, TASKQ_PREPOPULATE);
+
+ return (zilog);
+}
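+
+/*
+ * Note that zil_open() pairs with zil_close() below; the zilog itself
+ * is created by zil_alloc() and lives as long as the objset.  The
+ * get_data callback (supplied by the ZPL) is how the log write path
+ * fetches file data for records that did not embed a copy of it.
+ */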
+
+/*
+ * Close an intent log.
+ */
+void
+zil_close(zilog_t *zilog)
+{
+ /*
+ * If the log isn't already committed, mark the objset dirty
+ * (so zil_sync() will be called) and wait for that txg to sync.
+ */
+ if (!zil_is_committed(zilog)) {
+ uint64_t txg;
+ dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
+ (void) dmu_tx_assign(tx, TXG_WAIT);
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+ dmu_tx_commit(tx);
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ }
+
+ taskq_destroy(zilog->zl_clean_taskq);
+ zilog->zl_clean_taskq = NULL;
+ zilog->zl_get_data = NULL;
+
+ zil_itx_clean(zilog);
+ ASSERT(list_head(&zilog->zl_itx_list) == NULL);
+}
+
+/*
+ * Suspend an intent log. While in suspended mode, we still honor
+ * synchronous semantics, but we rely on txg_wait_synced() to do it.
+ * We suspend the log briefly when taking a snapshot so that the snapshot
+ * contains all the data it's supposed to, and has an empty intent log.
+ */
+int
+zil_suspend(zilog_t *zilog)
+{
+ const zil_header_t *zh = zilog->zl_header;
+
+ mutex_enter(&zilog->zl_lock);
+ if (zh->zh_claim_txg != 0) { /* unplayed log */
+ mutex_exit(&zilog->zl_lock);
+ return (EBUSY);
+ }
+ if (zilog->zl_suspend++ != 0) {
+ /*
+ * Someone else already began a suspend.
+ * Just wait for them to finish.
+ */
+ while (zilog->zl_suspending)
+ cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
+ ASSERT(BP_IS_HOLE(&zh->zh_log));
+ mutex_exit(&zilog->zl_lock);
+ return (0);
+ }
+ zilog->zl_suspending = B_TRUE;
+ mutex_exit(&zilog->zl_lock);
+
+ zil_commit(zilog, UINT64_MAX, 0);
+
+ /*
+ * Wait for any in-flight log writes to complete.
+ */
+ mutex_enter(&zilog->zl_lock);
+ while (zilog->zl_writer)
+ cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+ mutex_exit(&zilog->zl_lock);
+
+ zil_destroy(zilog, B_FALSE);
+
+ mutex_enter(&zilog->zl_lock);
+ ASSERT(BP_IS_HOLE(&zh->zh_log));
+ zilog->zl_suspending = B_FALSE;
+ cv_broadcast(&zilog->zl_cv_suspend);
+ mutex_exit(&zilog->zl_lock);
+
+ return (0);
+}
+
+void
+zil_resume(zilog_t *zilog)
+{
+ mutex_enter(&zilog->zl_lock);
+ ASSERT(zilog->zl_suspend != 0);
+ zilog->zl_suspend--;
+ mutex_exit(&zilog->zl_lock);
+}
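+
+/*
+ * Sketch of the suspend/resume protocol around a snapshot (the actual
+ * caller is the DSL snapshot code; this is illustrative only):
+ *
+ *	if (zil_suspend(zilog) == 0) {
+ *		... take the snapshot: the log is committed and empty ...
+ *		zil_resume(zilog);
+ *	}
+ */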
+
+typedef struct zil_replay_arg {
+ objset_t *zr_os;
+ zil_replay_func_t **zr_replay;
+ void *zr_arg;
+ uint64_t *zr_txgp;
+ boolean_t zr_byteswap;
+ char *zr_lrbuf;
+} zil_replay_arg_t;
+
+static void
+zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
+{
+ zil_replay_arg_t *zr = zra;
+ const zil_header_t *zh = zilog->zl_header;
+ uint64_t reclen = lr->lrc_reclen;
+ uint64_t txtype = lr->lrc_txtype;
+ char *name;
+ int pass, error, sunk;
+
+ if (zilog->zl_stop_replay)
+ return;
+
+ if (lr->lrc_txg < claim_txg) /* already committed */
+ return;
+
+ if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
+ return;
+
+ /*
+ * Make a copy of the data so we can revise and extend it.
+ */
+ bcopy(lr, zr->zr_lrbuf, reclen);
+
+ /*
+ * The log block containing this lr may have been byteswapped
+ * so that we can easily examine common fields like lrc_txtype.
+ * However, the log is a mix of different data types, and only the
+ * replay vectors know how to byteswap their records. Therefore, if
+ * the lr was byteswapped, undo it before invoking the replay vector.
+ */
+ if (zr->zr_byteswap)
+ byteswap_uint64_array(zr->zr_lrbuf, reclen);
+
+ /*
+ * If this is a TX_WRITE with a blkptr, suck in the data.
+ */
+ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
+ lr_write_t *lrw = (lr_write_t *)lr;
+ blkptr_t *wbp = &lrw->lr_blkptr;
+ uint64_t wlen = lrw->lr_length;
+ char *wbuf = zr->zr_lrbuf + reclen;
+
+ if (BP_IS_HOLE(wbp)) { /* compressed to a hole */
+ bzero(wbuf, wlen);
+ } else {
+ /*
+ * A subsequent write may have overwritten this block,
+ * in which case wbp may have been freed and
+ * reallocated, and our read of wbp may fail with a
+ * checksum error. We can safely ignore this because
+ * the later write will provide the correct data.
+ */
+ zbookmark_t zb;
+
+ zb.zb_objset = dmu_objset_id(zilog->zl_os);
+ zb.zb_object = lrw->lr_foid;
+ zb.zb_level = -1;
+ zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp);
+
+ (void) zio_wait(zio_read(NULL, zilog->zl_spa,
+ wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
+ (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
+ }
+ }
+
+ /*
+ * We must now do two things atomically: replay this log record,
+ * and update the log header to reflect the fact that we did so.
+ * We use the DMU's ability to assign into a specific txg to do this.
+ */
+ for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) {
+ uint64_t replay_txg;
+ dmu_tx_t *replay_tx;
+
+ replay_tx = dmu_tx_create(zr->zr_os);
+ error = dmu_tx_assign(replay_tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(replay_tx);
+ break;
+ }
+
+ replay_txg = dmu_tx_get_txg(replay_tx);
+
+ if (txtype == 0 || txtype >= TX_MAX_TYPE) {
+ error = EINVAL;
+ } else {
+ /*
+ * On the first pass, arrange for the replay vector
+ * to fail its dmu_tx_assign(). That's the only way
+ * to ensure that those code paths remain well tested.
+ */
+ *zr->zr_txgp = replay_txg - (pass == 1);
+ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
+ zr->zr_byteswap);
+ *zr->zr_txgp = TXG_NOWAIT;
+ }
+
+ if (error == 0) {
+ dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
+ zilog->zl_replay_seq[replay_txg & TXG_MASK] =
+ lr->lrc_seq;
+ }
+
+ dmu_tx_commit(replay_tx);
+
+ if (!error)
+ return;
+
+ /*
+ * The DMU's dnode layer doesn't see removes until the txg
+ * commits, so a subsequent claim can spuriously fail with
+ * EEXIST. So if we receive any error other than ERESTART
+ * we try syncing out any removes and then retry the
+ * transaction.
+ */
+ if (error != ERESTART && !sunk) {
+ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+ sunk = B_TRUE;
+ continue; /* retry */
+ }
+
+ if (error != ERESTART)
+ break;
+
+ if (pass != 1)
+ txg_wait_open(spa_get_dsl(zilog->zl_spa),
+ replay_txg + 1);
+
+ dprintf("pass %d, retrying\n", pass);
+ }
+
+ ASSERT(error && error != ERESTART);
+ name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+ dmu_objset_name(zr->zr_os, name);
+ cmn_err(CE_WARN, "ZFS replay transaction error %d, "
+ "dataset %s, seq 0x%llx, txtype %llu\n",
+ error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype);
+ zilog->zl_stop_replay = 1;
+ kmem_free(name, MAXNAMELEN);
+}
+
+/* ARGSUSED */
+static void
+zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ zilog->zl_replay_blks++;
+}
+
+/*
+ * If this dataset has a non-empty intent log, replay it and destroy it.
+ */
+void
+zil_replay(objset_t *os, void *arg, uint64_t *txgp,
+ zil_replay_func_t *replay_func[TX_MAX_TYPE])
+{
+ zilog_t *zilog = dmu_objset_zil(os);
+ const zil_header_t *zh = zilog->zl_header;
+ zil_replay_arg_t zr;
+
+ if (zil_empty(zilog)) {
+ zil_destroy(zilog, B_TRUE);
+ return;
+ }
+ //printf("ZFS: Replaying ZIL on %s...\n", os->os->os_spa->spa_name);
+
+ zr.zr_os = os;
+ zr.zr_replay = replay_func;
+ zr.zr_arg = arg;
+ zr.zr_txgp = txgp;
+ zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
+ zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
+
+ /*
+ * Wait for in-progress removes to sync before starting replay.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+
+ zilog->zl_stop_replay = 0;
+ zilog->zl_replay_time = lbolt;
+ ASSERT(zilog->zl_replay_blks == 0);
+ (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
+ zh->zh_claim_txg);
+ kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
+
+ zil_destroy(zilog, B_FALSE);
+ //printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name);
+}
+
+/*
+ * Report whether all transactions are committed
+ */
+int
+zil_is_committed(zilog_t *zilog)
+{
+ lwb_t *lwb;
+ int ret;
+
+ mutex_enter(&zilog->zl_lock);
+ while (zilog->zl_writer)
+ cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+
+ /* recent unpushed intent log transactions? */
+ if (!list_is_empty(&zilog->zl_itx_list)) {
+ ret = B_FALSE;
+ goto out;
+ }
+
+ /* intent log never used? */
+ lwb = list_head(&zilog->zl_lwb_list);
+ if (lwb == NULL) {
+ ret = B_TRUE;
+ goto out;
+ }
+
+ /*
+ * more than 1 log buffer means zil_sync() hasn't yet freed
+ * entries after a txg has committed
+ */
+ if (list_next(&zilog->zl_lwb_list, lwb)) {
+ ret = B_FALSE;
+ goto out;
+ }
+
+ ASSERT(zil_empty(zilog));
+ ret = B_TRUE;
+out:
+ cv_broadcast(&zilog->zl_cv_writer);
+ mutex_exit(&zilog->zl_lock);
+ return (ret);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
new file mode 100644
index 0000000..6bc4a36
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -0,0 +1,1853 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * ==========================================================================
+ * I/O priority table
+ * ==========================================================================
+ */
+uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
+ 0, /* ZIO_PRIORITY_NOW */
+ 0, /* ZIO_PRIORITY_SYNC_READ */
+ 0, /* ZIO_PRIORITY_SYNC_WRITE */
+ 6, /* ZIO_PRIORITY_ASYNC_READ */
+ 4, /* ZIO_PRIORITY_ASYNC_WRITE */
+ 4, /* ZIO_PRIORITY_FREE */
+ 0, /* ZIO_PRIORITY_CACHE_FILL */
+ 0, /* ZIO_PRIORITY_LOG_WRITE */
+ 10, /* ZIO_PRIORITY_RESILVER */
+ 20, /* ZIO_PRIORITY_SCRUB */
+};
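+
+/*
+ * Lower values mean higher priority: the vdev queue uses the table
+ * entry as a deadline offset when scheduling I/O, so ZIO_PRIORITY_NOW
+ * requests issue ahead of resilver and scrub traffic.
+ */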
+
+/*
+ * ==========================================================================
+ * I/O type descriptions
+ * ==========================================================================
+ */
+char *zio_type_name[ZIO_TYPES] = {
+ "null", "read", "write", "free", "claim", "ioctl" };
+
+/* At or above this size, force gang blocking - for testing */
+uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
+
+/* Force an allocation failure when non-zero */
+uint16_t zio_zil_fail_shift = 0;
+
+typedef struct zio_sync_pass {
+ int zp_defer_free; /* defer frees after this pass */
+ int zp_dontcompress; /* don't compress after this pass */
+ int zp_rewrite; /* rewrite new bps after this pass */
+} zio_sync_pass_t;
+
+zio_sync_pass_t zio_sync_pass = {
+ 1, /* zp_defer_free */
+ 4, /* zp_dontcompress */
+ 1, /* zp_rewrite */
+};
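+
+/*
+ * These thresholds are compared against spa_sync_pass(): zio_free()
+ * defers frees once zp_defer_free is exceeded, and zio_write_compress()
+ * stops compressing past zp_dontcompress and forces rewrites past
+ * zp_rewrite (see below).
+ */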
+
+#ifdef ZIO_USE_UMA
+/*
+ * ==========================================================================
+ * I/O kmem caches
+ * ==========================================================================
+ */
+kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+#endif
+
+#ifdef _KERNEL
+extern vmem_t *zio_alloc_arena;
+#endif
+
+void
+zio_init(void)
+{
+#ifdef ZIO_USE_UMA
+ size_t c;
+#endif
+#if 0
+ vmem_t *data_alloc_arena = NULL;
+
+#ifdef _KERNEL
+ data_alloc_arena = zio_alloc_arena;
+#endif
+#endif
+
+#ifdef ZIO_USE_UMA
+ /*
+ * For small buffers, we want a cache for each multiple of
+ * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
+ * for each quarter-power of 2. For large buffers, we want
+ * a cache for each multiple of PAGESIZE.
+ */
+ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+ size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
+ size_t p2 = size;
+ size_t align = 0;
+
+ while (p2 & (p2 - 1))
+ p2 &= p2 - 1;
+
+ if (size <= 4 * SPA_MINBLOCKSIZE) {
+ align = SPA_MINBLOCKSIZE;
+ } else if (P2PHASE(size, PAGESIZE) == 0) {
+ align = PAGESIZE;
+ } else if (P2PHASE(size, p2 >> 2) == 0) {
+ align = p2 >> 2;
+ }
+
+ if (align != 0) {
+ char name[36];
+ (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
+ zio_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
+
+ (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
+ zio_data_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, data_alloc_arena,
+ KMC_NODEBUG);
+
+ dprintf("creating cache for size %5lx align %5lx\n",
+ size, align);
+ }
+ }
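+
+ /*
+ * Sizes for which no cache was created above (align == 0) are
+ * served by the next larger cache: walk down from the top,
+ * copying each cache pointer into any empty slot below it.
+ * E.g. with 4 KB pages, a 4608-byte buffer (9 * 512) has no
+ * cache of its own and is satisfied from the 5120-byte cache.
+ */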
+
+ while (--c != 0) {
+ ASSERT(zio_buf_cache[c] != NULL);
+ if (zio_buf_cache[c - 1] == NULL)
+ zio_buf_cache[c - 1] = zio_buf_cache[c];
+
+ ASSERT(zio_data_buf_cache[c] != NULL);
+ if (zio_data_buf_cache[c - 1] == NULL)
+ zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
+ }
+#endif
+
+ zio_inject_init();
+}
+
+void
+zio_fini(void)
+{
+#ifdef ZIO_USE_UMA
+ size_t c;
+ kmem_cache_t *last_cache = NULL;
+ kmem_cache_t *last_data_cache = NULL;
+
+ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+ if (zio_buf_cache[c] != last_cache) {
+ last_cache = zio_buf_cache[c];
+ kmem_cache_destroy(zio_buf_cache[c]);
+ }
+ zio_buf_cache[c] = NULL;
+
+ if (zio_data_buf_cache[c] != last_data_cache) {
+ last_data_cache = zio_data_buf_cache[c];
+ kmem_cache_destroy(zio_data_buf_cache[c]);
+ }
+ zio_data_buf_cache[c] = NULL;
+ }
+#endif
+
+ zio_inject_fini();
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free I/O buffers
+ * ==========================================================================
+ */
+
+/*
+ * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
+ * crashdump if the kernel panics, so use it judiciously. Obviously, it's
+ * useful to inspect ZFS metadata, but if possible, we should avoid keeping
+ * excess/transient data in-core during a crashdump.
+ */
+void *
+zio_buf_alloc(size_t size)
+{
+#ifdef ZIO_USE_UMA
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
+#else
+ return (kmem_alloc(size, KM_SLEEP));
+#endif
+}
+
+/*
+ * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
+ * crashdump if the kernel panics, which limits the amount of ZFS data,
+ * and thus kernel heap, dumped to disk.
+ */
+void *
+zio_data_buf_alloc(size_t size)
+{
+#ifdef ZIO_USE_UMA
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP));
+#else
+ return (kmem_alloc(size, KM_SLEEP));
+#endif
+}
+
+void
+zio_buf_free(void *buf, size_t size)
+{
+#ifdef ZIO_USE_UMA
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ kmem_cache_free(zio_buf_cache[c], buf);
+#else
+ kmem_free(buf, size);
+#endif
+}
+
+void
+zio_data_buf_free(void *buf, size_t size)
+{
+#ifdef ZIO_USE_UMA
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ kmem_cache_free(zio_data_buf_cache[c], buf);
+#else
+ kmem_free(buf, size);
+#endif
+}
+
+/*
+ * ==========================================================================
+ * Push and pop I/O transform buffers
+ * ==========================================================================
+ */
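+/*
+ * A zio carries a small LIFO stack of (data, size) buffers.  Stages
+ * that change the data representation push a scratch buffer and a
+ * later stage pops it: zio_read() below pushes a buffer for compressed
+ * blocks that zio_read_decompress() pops, and gang reads push the
+ * gang header buffer that zio_read_gang_members() pops.
+ */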
+static void
+zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
+{
+ zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
+
+ zt->zt_data = data;
+ zt->zt_size = size;
+ zt->zt_bufsize = bufsize;
+
+ zt->zt_next = zio->io_transform_stack;
+ zio->io_transform_stack = zt;
+
+ zio->io_data = data;
+ zio->io_size = size;
+}
+
+static void
+zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
+{
+ zio_transform_t *zt = zio->io_transform_stack;
+
+ *data = zt->zt_data;
+ *size = zt->zt_size;
+ *bufsize = zt->zt_bufsize;
+
+ zio->io_transform_stack = zt->zt_next;
+ kmem_free(zt, sizeof (zio_transform_t));
+
+ if ((zt = zio->io_transform_stack) != NULL) {
+ zio->io_data = zt->zt_data;
+ zio->io_size = zt->zt_size;
+ }
+}
+
+static void
+zio_clear_transform_stack(zio_t *zio)
+{
+ void *data;
+ uint64_t size, bufsize;
+
+ ASSERT(zio->io_transform_stack != NULL);
+
+ zio_pop_transform(zio, &data, &size, &bufsize);
+ while (zio->io_transform_stack != NULL) {
+ zio_buf_free(data, bufsize);
+ zio_pop_transform(zio, &data, &size, &bufsize);
+ }
+}
+
+/*
+ * ==========================================================================
+ * Create the various types of I/O (read, write, free)
+ * ==========================================================================
+ */
+static zio_t *
+zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ void *data, uint64_t size, zio_done_func_t *done, void *private,
+ zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
+{
+ zio_t *zio;
+
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
+
+ zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
+ zio->io_parent = pio;
+ zio->io_spa = spa;
+ zio->io_txg = txg;
+ if (bp != NULL) {
+ zio->io_bp = bp;
+ zio->io_bp_copy = *bp;
+ zio->io_bp_orig = *bp;
+ }
+ zio->io_done = done;
+ zio->io_private = private;
+ zio->io_type = type;
+ zio->io_priority = priority;
+ zio->io_stage = stage;
+ zio->io_pipeline = pipeline;
+ zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
+ zio->io_timestamp = lbolt64;
+ zio->io_flags = flags;
+ mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
+ zio_push_transform(zio, data, size, size);
+
+ /*
+ * Note on config lock:
+ *
+ * If CONFIG_HELD is set, then the caller already has the config
+ * lock, so we don't need it for this io.
+ *
+ * We set CONFIG_GRABBED to indicate that we have grabbed the
+ * config lock on behalf of this io, so it should be released
+ * in zio_done.
+ *
+ * Unless CONFIG_HELD is set, we will grab the config lock for
+ * any top-level (parent-less) io, *except* NULL top-level ios.
+ * The NULL top-level ios rarely have any children, so we delay
+ * grabbing the lock until the first child is added (but it is
+ * still grabbed on behalf of the top-level i/o, so additional
+ * children don't need to also grab it). This greatly reduces
+ * contention on the config lock.
+ */
+ if (pio == NULL) {
+ if (type != ZIO_TYPE_NULL &&
+ !(flags & ZIO_FLAG_CONFIG_HELD)) {
+ spa_config_enter(zio->io_spa, RW_READER, zio);
+ zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
+ }
+ zio->io_root = zio;
+ } else {
+ zio->io_root = pio->io_root;
+ if (!(flags & ZIO_FLAG_NOBOOKMARK))
+ zio->io_logical = pio->io_logical;
+ mutex_enter(&pio->io_lock);
+ if (pio->io_parent == NULL &&
+ pio->io_type == ZIO_TYPE_NULL &&
+ !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
+ !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
+ pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
+ spa_config_enter(zio->io_spa, RW_READER, pio);
+ }
+ if (stage < ZIO_STAGE_READY)
+ pio->io_children_notready++;
+ pio->io_children_notdone++;
+ zio->io_sibling_next = pio->io_child;
+ zio->io_sibling_prev = NULL;
+ if (pio->io_child != NULL)
+ pio->io_child->io_sibling_prev = zio;
+ pio->io_child = zio;
+ zio->io_ndvas = pio->io_ndvas;
+ mutex_exit(&pio->io_lock);
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
+ int flags)
+{
+ zio_t *zio;
+
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+ ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
+ ZIO_WAIT_FOR_CHILDREN_PIPELINE);
+
+ return (zio);
+}
+
+zio_t *
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
+{
+ return (zio_null(NULL, spa, done, private, flags));
+}
+
+zio_t *
+zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
+ uint64_t size, zio_done_func_t *done, void *private,
+ int priority, int flags, zbookmark_t *zb)
+{
+ zio_t *zio;
+
+ ASSERT3U(size, ==, BP_GET_LSIZE(bp));
+
+ zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
+ ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
+ ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
+ zio->io_bookmark = *zb;
+
+ zio->io_logical = zio;
+
+ /*
+ * Work off our copy of the bp so the caller can free it.
+ */
+ zio->io_bp = &zio->io_bp_copy;
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+ uint64_t csize = BP_GET_PSIZE(bp);
+ void *cbuf = zio_buf_alloc(csize);
+
+ zio_push_transform(zio, cbuf, csize, csize);
+ zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
+ }
+
+ if (BP_IS_GANG(bp)) {
+ uint64_t gsize = SPA_GANGBLOCKSIZE;
+ void *gbuf = zio_buf_alloc(gsize);
+
+ zio_push_transform(zio, gbuf, gsize, gsize);
+ zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
+ int flags, zbookmark_t *zb)
+{
+ zio_t *zio;
+
+ ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
+ checksum < ZIO_CHECKSUM_FUNCTIONS);
+
+ ASSERT(compress >= ZIO_COMPRESS_OFF &&
+ compress < ZIO_COMPRESS_FUNCTIONS);
+
+ zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
+ ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
+
+ zio->io_ready = ready;
+
+ zio->io_bookmark = *zb;
+
+ zio->io_logical = zio;
+
+ zio->io_checksum = checksum;
+ zio->io_compress = compress;
+ zio->io_ndvas = ncopies;
+
+ if (compress != ZIO_COMPRESS_OFF)
+ zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
+
+ if (bp->blk_birth != txg) {
+ /* XXX the bp usually (always?) gets re-zeroed later */
+ BP_ZERO(bp);
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+ } else {
+ /* Make sure someone doesn't change their mind on overwrites */
+ ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
+ spa_max_replication(spa)) == BP_GET_NDVAS(bp));
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags,
+ zbookmark_t *zb)
+{
+ zio_t *zio;
+
+ zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
+ ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
+
+ zio->io_bookmark = *zb;
+ zio->io_checksum = checksum;
+ zio->io_compress = ZIO_COMPRESS_OFF;
+
+ if (pio != NULL)
+ ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
+
+ return (zio);
+}
+
+static zio_t *
+zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
+ uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+ zio_done_func_t *done, void *private, int priority, int flags)
+{
+ zio_t *zio;
+
+ BP_ZERO(bp);
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+
+ zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags,
+ ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
+
+ zio->io_checksum = checksum;
+ zio->io_compress = ZIO_COMPRESS_OFF;
+
+ return (zio);
+}
+
+zio_t *
+zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private)
+{
+ zio_t *zio;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ if (txg == spa->spa_syncing_txg &&
+ spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
+ bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
+ return (zio_null(pio, spa, NULL, NULL, 0));
+ }
+
+ zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
+ ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
+ ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
+
+ zio->io_bp = &zio->io_bp_copy;
+
+ return (zio);
+}
+
+zio_t *
+zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ zio_done_func_t *done, void *private)
+{
+ zio_t *zio;
+
+ /*
+ * A claim is an allocation of a specific block. Claims are needed
+ * to support immediate writes in the intent log. The issue is that
+ * immediate writes contain committed data, but in a txg that was
+ * *not* committed. Upon opening the pool after an unclean shutdown,
+ * the intent log claims all blocks that contain immediate write data
+ * so that the SPA knows they're in use.
+ *
+ * All claims *must* be resolved in the first txg -- before the SPA
+ * starts allocating blocks -- so that nothing is allocated twice.
+ */
+ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
+ ASSERT3U(spa_first_txg(spa), <=, txg);
+
+ zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
+ ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
+ ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+
+ zio->io_bp = &zio->io_bp_copy;
+
+ return (zio);
+}
+
+zio_t *
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+ zio_done_func_t *done, void *private, int priority, int flags)
+{
+ zio_t *zio;
+ int c;
+
+ if (vd->vdev_children == 0) {
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+ ZIO_TYPE_IOCTL, priority, flags,
+ ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
+
+ zio->io_vd = vd;
+ zio->io_cmd = cmd;
+ } else {
+ zio = zio_null(pio, spa, NULL, NULL, flags);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
+ done, private, priority, flags));
+ }
+
+ return (zio);
+}
+
+static void
+zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
+ int checksum)
+{
+ ASSERT(vd->vdev_children == 0);
+
+ ASSERT(size <= SPA_MAXBLOCKSIZE);
+ ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
+ ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
+
+ ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+ ASSERT3U(offset + size, <=, vd->vdev_psize);
+
+ BP_ZERO(bp);
+
+ BP_SET_LSIZE(bp, size);
+ BP_SET_PSIZE(bp, size);
+
+ BP_SET_CHECKSUM(bp, checksum);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+ if (checksum != ZIO_CHECKSUM_OFF)
+ ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
+}
+
+zio_t *
+zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ void *data, int checksum, zio_done_func_t *done, void *private,
+ int priority, int flags)
+{
+ zio_t *zio;
+ blkptr_t blk;
+
+ zio_phys_bp_init(vd, &blk, offset, size, checksum);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
+ ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
+ ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+
+ zio->io_vd = vd;
+ zio->io_offset = offset;
+
+ /*
+ * Work off our copy of the bp so the caller can free it.
+ */
+ zio->io_bp = &zio->io_bp_copy;
+
+ return (zio);
+}
+
+zio_t *
+zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ void *data, int checksum, zio_done_func_t *done, void *private,
+ int priority, int flags)
+{
+ zio_block_tail_t *zbt;
+ void *wbuf;
+ zio_t *zio;
+ blkptr_t blk;
+
+ zio_phys_bp_init(vd, &blk, offset, size, checksum);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
+ ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+
+ zio->io_vd = vd;
+ zio->io_offset = offset;
+
+ zio->io_bp = &zio->io_bp_copy;
+ zio->io_checksum = checksum;
+
+ if (zio_checksum_table[checksum].ci_zbt) {
+ /*
+ * zbt checksums are necessarily destructive -- they modify
+ * one word of the write buffer to hold the verifier/checksum.
+ * Therefore, we must make a local copy in case the data is
+ * being written to multiple places.
+ */
+ wbuf = zio_buf_alloc(size);
+ bcopy(data, wbuf, size);
+ zio_push_transform(zio, wbuf, size, size);
+
+ zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
+ zbt->zbt_cksum = blk.blk_cksum;
+ }
+
+ return (zio);
+}
+
+/*
+ * Create a child I/O to do some work for us. It has no associated bp.
+ */
+zio_t *
+zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
+ void *data, uint64_t size, int type, int priority, int flags,
+ zio_done_func_t *done, void *private)
+{
+ uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
+ zio_t *cio;
+
+ if (type == ZIO_TYPE_READ && bp != NULL) {
+ /*
+ * If we have the bp, then the child should perform the
+ * checksum and the parent need not. This pushes error
+ * detection as close to the leaves as possible and
+ * eliminates redundant checksums in the interior nodes.
+ */
+ pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
+ zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+ }
+
+ cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
+ done, private, type, priority,
+ (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
+ ZIO_STAGE_VDEV_IO_START - 1, pipeline);
+
+ cio->io_vd = vd;
+ cio->io_offset = offset;
+
+ return (cio);
+}
+
+/*
+ * ==========================================================================
+ * Initiate I/O, either sync or async
+ * ==========================================================================
+ */
+int
+zio_wait(zio_t *zio)
+{
+ int error;
+
+ ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+
+ zio->io_waiter = curthread;
+
+ zio_next_stage_async(zio);
+
+ mutex_enter(&zio->io_lock);
+ while (zio->io_stalled != ZIO_STAGE_DONE)
+ cv_wait(&zio->io_cv, &zio->io_lock);
+ mutex_exit(&zio->io_lock);
+
+ error = zio->io_error;
+ cv_destroy(&zio->io_cv);
+ mutex_destroy(&zio->io_lock);
+ kmem_free(zio, sizeof (zio_t));
+
+ return (error);
+}
+
+void
+zio_nowait(zio_t *zio)
+{
+ zio_next_stage_async(zio);
+}
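+
+/*
+ * Typical usage, as seen elsewhere in this file: a synchronous caller
+ * creates a zio and blocks on it,
+ *
+ *	error = zio_wait(zio_read(NULL, spa, bp, buf, size, ...));
+ *
+ * while pipelined callers fire children with zio_nowait() and collect
+ * them later via zio_wait_children_done() or a parent zio_wait().
+ */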
+
+/*
+ * ==========================================================================
+ * I/O pipeline interlocks: parent/child dependency scoreboarding
+ * ==========================================================================
+ */
+static void
+zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
+{
+ mutex_enter(&zio->io_lock);
+ if (*countp == 0) {
+ ASSERT(zio->io_stalled == 0);
+ mutex_exit(&zio->io_lock);
+ zio_next_stage(zio);
+ } else {
+ zio->io_stalled = stage;
+ mutex_exit(&zio->io_lock);
+ }
+}
+
+static void
+zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
+{
+ zio_t *pio = zio->io_parent;
+
+ mutex_enter(&pio->io_lock);
+ if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ pio->io_error = zio->io_error;
+ if (--*countp == 0 && pio->io_stalled == stage) {
+ pio->io_stalled = 0;
+ mutex_exit(&pio->io_lock);
+ zio_next_stage_async(pio);
+ } else {
+ mutex_exit(&pio->io_lock);
+ }
+}
+
+static void
+zio_wait_children_ready(zio_t *zio)
+{
+ zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
+ &zio->io_children_notready);
+}
+
+void
+zio_wait_children_done(zio_t *zio)
+{
+ zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
+ &zio->io_children_notdone);
+}
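+
+/*
+ * The two wait stages above are matched by notifications from the
+ * children: zio_ready() signals WAIT_CHILDREN_READY and zio_done()
+ * signals WAIT_CHILDREN_DONE, decrementing the parent's outstanding
+ * child counts that zio_create() incremented.
+ */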
+
+static void
+zio_ready(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+
+ if (zio->io_ready)
+ zio->io_ready(zio);
+
+ if (pio != NULL)
+ zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
+ &pio->io_children_notready);
+
+ if (zio->io_bp)
+ zio->io_bp_copy = *zio->io_bp;
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ vdev_t *vd = zio->io_vd;
+
+ ASSERT(zio->io_children_notready == 0);
+ ASSERT(zio->io_children_notdone == 0);
+
+ if (bp != NULL) {
+ ASSERT(bp->blk_pad[0] == 0);
+ ASSERT(bp->blk_pad[1] == 0);
+ ASSERT(bp->blk_pad[2] == 0);
+ ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
+ if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+ !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
+ ASSERT(!BP_SHOULD_BYTESWAP(bp));
+ if (zio->io_ndvas != 0)
+ ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
+ ASSERT(BP_COUNT_GANG(bp) == 0 ||
+ (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
+ }
+ }
+
+ if (vd != NULL)
+ vdev_stat_update(zio);
+
+ if (zio->io_error) {
+ /*
+ * If this I/O is attached to a particular vdev,
+ * generate an error message describing the I/O failure
+ * at the block level. We ignore these errors if the
+ * device is currently unavailable.
+ */
+ if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
+ zfs_ereport_post(FM_EREPORT_ZFS_IO,
+ zio->io_spa, vd, zio, 0, 0);
+
+ if ((zio->io_error == EIO ||
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
+ zio->io_logical == zio) {
+ /*
+ * For root I/O requests, tell the SPA to log the error
+ * appropriately. Also, generate a logical data
+ * ereport.
+ */
+ spa_log_error(zio->io_spa, zio);
+
+ zfs_ereport_post(FM_EREPORT_ZFS_DATA,
+ zio->io_spa, NULL, zio, 0, 0);
+ }
+
+ /*
+ * For I/O requests that cannot fail, panic appropriately.
+ */
+ if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+ char *blkbuf;
+
+ blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
+ if (blkbuf) {
+ sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
+ bp ? bp : &zio->io_bp_copy);
+ }
+ panic("ZFS: %s (%s on %s off %llx: zio %p %s): error "
+ "%d", zio->io_error == ECKSUM ?
+ "bad checksum" : "I/O failure",
+ zio_type_name[zio->io_type],
+ vdev_description(vd),
+ (u_longlong_t)zio->io_offset,
+ zio, blkbuf ? blkbuf : "", zio->io_error);
+ }
+ }
+ zio_clear_transform_stack(zio);
+
+ if (zio->io_done)
+ zio->io_done(zio);
+
+ ASSERT(zio->io_delegate_list == NULL);
+ ASSERT(zio->io_delegate_next == NULL);
+
+ if (pio != NULL) {
+ zio_t *next, *prev;
+
+ mutex_enter(&pio->io_lock);
+ next = zio->io_sibling_next;
+ prev = zio->io_sibling_prev;
+ if (next != NULL)
+ next->io_sibling_prev = prev;
+ if (prev != NULL)
+ prev->io_sibling_next = next;
+ if (pio->io_child == zio)
+ pio->io_child = next;
+ mutex_exit(&pio->io_lock);
+
+ zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
+ &pio->io_children_notdone);
+ }
+
+ /*
+ * Note: this I/O is now done, and will shortly be
+ * kmem_free()'d, so there is no need to clear this (or any
+ * other) flag.
+ */
+ if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)
+ spa_config_exit(spa, zio);
+
+ if (zio->io_waiter != NULL) {
+ mutex_enter(&zio->io_lock);
+ ASSERT(zio->io_stage == ZIO_STAGE_DONE);
+ zio->io_stalled = zio->io_stage;
+ cv_broadcast(&zio->io_cv);
+ mutex_exit(&zio->io_lock);
+ } else {
+ kmem_free(zio, sizeof (zio_t));
+ }
+}
+
+/*
+ * ==========================================================================
+ * Compression support
+ * ==========================================================================
+ */
+static void
+zio_write_compress(zio_t *zio)
+{
+ int compress = zio->io_compress;
+ blkptr_t *bp = zio->io_bp;
+ void *cbuf;
+ uint64_t lsize = zio->io_size;
+ uint64_t csize = lsize;
+ uint64_t cbufsize = 0;
+ int pass;
+
+ if (bp->blk_birth == zio->io_txg) {
+ /*
+ * We're rewriting an existing block, which means we're
+ * working on behalf of spa_sync(). For spa_sync() to
+ * converge, it must eventually be the case that we don't
+ * have to allocate new blocks. But compression changes
+ * the blocksize, which forces a reallocate, and makes
+ * convergence take longer. Therefore, after the first
+ * few passes, stop compressing to ensure convergence.
+ */
+ pass = spa_sync_pass(zio->io_spa);
+ if (pass > zio_sync_pass.zp_dontcompress)
+ compress = ZIO_COMPRESS_OFF;
+ } else {
+ ASSERT(BP_IS_HOLE(bp));
+ pass = 1;
+ }
+
+ if (compress != ZIO_COMPRESS_OFF)
+ if (!zio_compress_data(compress, zio->io_data, zio->io_size,
+ &cbuf, &csize, &cbufsize))
+ compress = ZIO_COMPRESS_OFF;
+
+ if (compress != ZIO_COMPRESS_OFF && csize != 0)
+ zio_push_transform(zio, cbuf, csize, cbufsize);
+
+ /*
+ * The final pass of spa_sync() must be all rewrites, but the first
+ * few passes offer a trade-off: allocating blocks defers convergence,
+ * but newly allocated blocks are sequential, so they can be written
+ * to disk faster. Therefore, we allow the first few passes of
+ * spa_sync() to reallocate new blocks, but force rewrites after that.
+ * There should only be a handful of blocks after pass 1 in any case.
+ */
+ if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
+ pass > zio_sync_pass.zp_rewrite) {
+ ASSERT(csize != 0);
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_COMPRESS(bp, compress);
+ zio->io_pipeline = ZIO_REWRITE_PIPELINE;
+ } else {
+ if (bp->blk_birth == zio->io_txg)
+ BP_ZERO(bp);
+ if (csize == 0) {
+ BP_ZERO(bp);
+ zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
+ } else {
+ ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_PSIZE(bp, csize);
+ BP_SET_COMPRESS(bp, compress);
+ zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
+ }
+ }
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_read_decompress(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ void *data;
+ uint64_t size;
+ uint64_t bufsize;
+ int compress = BP_GET_COMPRESS(bp);
+
+ ASSERT(compress != ZIO_COMPRESS_OFF);
+
+ zio_pop_transform(zio, &data, &size, &bufsize);
+
+ if (zio_decompress_data(compress, data, size,
+ zio->io_data, zio->io_size))
+ zio->io_error = EIO;
+
+ zio_buf_free(data, bufsize);
+
+ zio_next_stage(zio);
+}
+
+/*
+ * ==========================================================================
+ * Gang block support
+ * ==========================================================================
+ */
+static void
+zio_gang_pipeline(zio_t *zio)
+{
+ /*
+ * By default, the pipeline assumes that we're dealing with a gang
+ * block. If we're not, strip out any gang-specific stages.
+ */
+ if (!BP_IS_GANG(zio->io_bp))
+ zio->io_pipeline &= ~ZIO_GANG_STAGES;
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_gang_byteswap(zio_t *zio)
+{
+ ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
+
+ if (BP_SHOULD_BYTESWAP(zio->io_bp))
+ byteswap_uint64_array(zio->io_data, zio->io_size);
+}
+
+static void
+zio_get_gang_header(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ uint64_t gsize = SPA_GANGBLOCKSIZE;
+ void *gbuf = zio_buf_alloc(gsize);
+
+ ASSERT(BP_IS_GANG(bp));
+
+ zio_push_transform(zio, gbuf, gsize, gsize);
+
+ zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
+ NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
+ zio->io_flags & ZIO_FLAG_GANG_INHERIT,
+ ZIO_STAGE_OPEN, ZIO_READ_PIPELINE));
+
+ zio_wait_children_done(zio);
+}
+
+static void
+zio_read_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize, loff, lsize;
+ int i;
+
+ ASSERT(BP_IS_GANG(zio->io_bp));
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ lsize = BP_GET_PSIZE(gbp);
+
+ ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
+ ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
+ ASSERT3U(loff + lsize, <=, zio->io_size);
+ ASSERT(i < SPA_GBH_NBLKPTRS);
+ ASSERT(!BP_IS_HOLE(gbp));
+
+ zio_nowait(zio_read(zio, zio->io_spa, gbp,
+ (char *)zio->io_data + loff, lsize, NULL, NULL,
+ zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
+ &zio->io_bookmark));
+ }
+
+ zio_buf_free(gbh, gbufsize);
+ zio_wait_children_done(zio);
+}
+
+static void
+zio_rewrite_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize, loff, lsize;
+ int i;
+
+ ASSERT(BP_IS_GANG(zio->io_bp));
+ ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ ASSERT(gsize == gbufsize);
+
+ for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ lsize = BP_GET_PSIZE(gbp);
+
+ ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
+ ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
+ ASSERT3U(loff + lsize, <=, zio->io_size);
+ ASSERT(i < SPA_GBH_NBLKPTRS);
+ ASSERT(!BP_IS_HOLE(gbp));
+
+ zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
+ zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
+ NULL, NULL, zio->io_priority, zio->io_flags,
+ &zio->io_bookmark));
+ }
+
+ zio_push_transform(zio, gbh, gsize, gbufsize);
+ zio_wait_children_ready(zio);
+}
+
+static void
+zio_free_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize;
+ int i;
+
+ ASSERT(BP_IS_GANG(zio->io_bp));
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+
+ if (BP_IS_HOLE(gbp))
+ continue;
+ zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
+ gbp, NULL, NULL));
+ }
+
+ zio_buf_free(gbh, gbufsize);
+ zio_next_stage(zio);
+}
+
+static void
+zio_claim_gang_members(zio_t *zio)
+{
+ zio_gbh_phys_t *gbh;
+ uint64_t gsize, gbufsize;
+ int i;
+
+ ASSERT(BP_IS_GANG(zio->io_bp));
+
+ zio_gang_byteswap(zio);
+ zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+ for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ if (BP_IS_HOLE(gbp))
+ continue;
+ zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
+ gbp, NULL, NULL));
+ }
+
+ zio_buf_free(gbh, gbufsize);
+ zio_next_stage(zio);
+}
+
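+/*
+ * Done callback for gang member writes: fold each member's allocated
+ * size into the matching DVA of the parent (gang header) blkptr so the
+ * parent's asize reflects the total space consumed by the gang.
+ */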
+static void
+zio_write_allocate_gang_member_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_parent;
+ dva_t *cdva = zio->io_bp->blk_dva;
+ dva_t *pdva = pio->io_bp->blk_dva;
+ uint64_t asize;
+ int d;
+
+ ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
+ ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
+ ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
+ ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
+
+ mutex_enter(&pio->io_lock);
+ for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
+ ASSERT(DVA_GET_GANG(&pdva[d]));
+ asize = DVA_GET_ASIZE(&pdva[d]);
+ asize += DVA_GET_ASIZE(&cdva[d]);
+ DVA_SET_ASIZE(&pdva[d], asize);
+ }
+ mutex_exit(&pio->io_lock);
+}
+
+static void
+zio_write_allocate_gang_members(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ dva_t *dva = bp->blk_dva;
+ spa_t *spa = zio->io_spa;
+ zio_gbh_phys_t *gbh;
+ uint64_t txg = zio->io_txg;
+ uint64_t resid = zio->io_size;
+ uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
+ uint64_t gsize, loff, lsize;
+ uint32_t gbps_left;
+ int ndvas = zio->io_ndvas;
+ int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
+ int error;
+ int i, d;
+
+ gsize = SPA_GANGBLOCKSIZE;
+ gbps_left = SPA_GBH_NBLKPTRS;
+
+ error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE);
+ if (error == ENOSPC)
+ panic("can't allocate gang block header");
+ ASSERT(error == 0);
+
+ for (d = 0; d < gbh_ndvas; d++)
+ DVA_SET_GANG(&dva[d], 1);
+
+ bp->blk_birth = txg;
+
+ gbh = zio_buf_alloc(gsize);
+ bzero(gbh, gsize);
+
+ /* We need to test multi-level gang blocks */
+ if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0)
+ maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);
+
+ for (loff = 0, i = 0; loff != zio->io_size;
+ loff += lsize, resid -= lsize, gbps_left--, i++) {
+ blkptr_t *gbp = &gbh->zg_blkptr[i];
+ dva = gbp->blk_dva;
+
+ ASSERT(gbps_left != 0);
+ maxalloc = MIN(maxalloc, resid);
+
+ while (resid <= maxalloc * gbps_left) {
+ error = metaslab_alloc(spa, maxalloc, gbp, ndvas,
+ txg, bp, B_FALSE);
+ if (error == 0)
+ break;
+ ASSERT3U(error, ==, ENOSPC);
+ if (maxalloc == SPA_MINBLOCKSIZE)
+ panic("really out of space");
+ maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
+ }
+
+ if (resid <= maxalloc * gbps_left) {
+ lsize = maxalloc;
+ BP_SET_LSIZE(gbp, lsize);
+ BP_SET_PSIZE(gbp, lsize);
+ BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
+ gbp->blk_birth = txg;
+ zio_nowait(zio_rewrite(zio, spa,
+ zio->io_checksum, txg, gbp,
+ (char *)zio->io_data + loff, lsize,
+ zio_write_allocate_gang_member_done, NULL,
+ zio->io_priority, zio->io_flags,
+ &zio->io_bookmark));
+ } else {
+ lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
+ ASSERT(lsize != SPA_MINBLOCKSIZE);
+ zio_nowait(zio_write_allocate(zio, spa,
+ zio->io_checksum, txg, gbp,
+ (char *)zio->io_data + loff, lsize,
+ zio_write_allocate_gang_member_done, NULL,
+ zio->io_priority, zio->io_flags));
+ }
+ }
+
+ ASSERT(resid == 0 && loff == zio->io_size);
+
+ zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
+
+ zio_push_transform(zio, gbh, gsize, gsize);
+ /*
+ * As much as we'd like this to be zio_wait_children_ready(),
+ * updating our ASIZE doesn't happen until the io_done callback,
+ * so we have to wait for that to finish in order for our BP
+ * to be stable.
+ */
+ zio_wait_children_done(zio);
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free blocks
+ * ==========================================================================
+ */
+static void
+zio_dva_allocate(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ int error;
+
+ ASSERT(BP_IS_HOLE(bp));
+ ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
+ ASSERT3U(zio->io_ndvas, >, 0);
+ ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa));
+
+ /* For testing, make some blocks above a certain size be gang blocks */
+ if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
+ zio_write_allocate_gang_members(zio);
+ return;
+ }
+
+ ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+
+ error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas,
+ zio->io_txg, NULL, B_FALSE);
+
+ if (error == 0) {
+ bp->blk_birth = zio->io_txg;
+ } else if (error == ENOSPC) {
+ if (zio->io_size == SPA_MINBLOCKSIZE)
+ panic("really, truly out of space");
+ zio_write_allocate_gang_members(zio);
+ return;
+ } else {
+ zio->io_error = error;
+ }
+ zio_next_stage(zio);
+}
+
+static void
+zio_dva_free(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);
+
+ BP_ZERO(bp);
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_dva_claim(zio_t *zio)
+{
+ zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
+
+ zio_next_stage(zio);
+}
+
+/*
+ * ==========================================================================
+ * Read and write to physical devices
+ * ==========================================================================
+ */
+
+static void
+zio_vdev_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd ? vd->vdev_top : NULL;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t align;
+
+ if (vd == NULL) {
+ /* The mirror_ops handle multiple DVAs in a single BP */
+ vdev_mirror_ops.vdev_op_io_start(zio);
+ return;
+ }
+
+ align = 1ULL << tvd->vdev_ashift;
+
+ if (zio->io_retries == 0 && vd == tvd)
+ zio->io_flags |= ZIO_FLAG_FAILFAST;
+
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
+ vd->vdev_children == 0) {
+ zio->io_flags |= ZIO_FLAG_PHYSICAL;
+ zio->io_offset += VDEV_LABEL_START_SIZE;
+ }
+
+ if (P2PHASE(zio->io_size, align) != 0) {
+ uint64_t asize = P2ROUNDUP(zio->io_size, align);
+ char *abuf = zio_buf_alloc(asize);
+ ASSERT(vd == tvd);
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ bcopy(zio->io_data, abuf, zio->io_size);
+ bzero(abuf + zio->io_size, asize - zio->io_size);
+ }
+ zio_push_transform(zio, abuf, asize, asize);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
+ zio->io_flags |= ZIO_FLAG_SUBBLOCK;
+ }
+
+ ASSERT(P2PHASE(zio->io_offset, align) == 0);
+ ASSERT(P2PHASE(zio->io_size, align) == 0);
+ ASSERT(bp == NULL ||
+ P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
+ ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
+
+ vdev_io_start(zio);
+
+ /* zio_next_stage_async() gets called from io completion interrupt */
+}
+
+static void
+zio_vdev_io_done(zio_t *zio)
+{
+ if (zio->io_vd == NULL)
+ /* The mirror_ops handle multiple DVAs in a single BP */
+ vdev_mirror_ops.vdev_op_io_done(zio);
+ else
+ vdev_io_done(zio);
+}
+
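+/*
+ * Retry policy: only I/Os that actually failed, have no delegated
+ * children, target a top-level vdev (or none at all), aren't marked
+ * DONT_RETRY, and haven't already been retried are eligible.
+ */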
+/* XXPOLICY */
+boolean_t
+zio_should_retry(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+
+ if (zio->io_error == 0)
+ return (B_FALSE);
+ if (zio->io_delegate_list != NULL)
+ return (B_FALSE);
+ if (vd && vd != vd->vdev_top)
+ return (B_FALSE);
+ if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
+ return (B_FALSE);
+ if (zio->io_retries > 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+static void
+zio_vdev_io_assess(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd ? vd->vdev_top : NULL;
+
+ ASSERT(zio->io_vsd == NULL);
+
+ if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
+ void *abuf;
+ uint64_t asize;
+ ASSERT(vd == tvd);
+ zio_pop_transform(zio, &abuf, &asize, &asize);
+ if (zio->io_type == ZIO_TYPE_READ)
+ bcopy(abuf, zio->io_data, zio->io_size);
+ zio_buf_free(abuf, asize);
+ zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
+ }
+
+ if (zio_injection_enabled && !zio->io_error)
+ zio->io_error = zio_handle_fault_injection(zio, EIO);
+
+ /*
+ * If the I/O failed, determine whether we should attempt to retry it.
+ */
+ /* XXPOLICY */
+ if (zio_should_retry(zio)) {
+ ASSERT(tvd == vd);
+
+ zio->io_retries++;
+ zio->io_error = 0;
+ zio->io_flags &= ZIO_FLAG_VDEV_INHERIT |
+ ZIO_FLAG_CONFIG_GRABBED;
+ /* XXPOLICY */
+ zio->io_flags &= ~ZIO_FLAG_FAILFAST;
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
+
+ dprintf("retry #%d for %s to %s offset %llx\n",
+ zio->io_retries, zio_type_name[zio->io_type],
+ vdev_description(vd), zio->io_offset);
+
+ zio_next_stage_async(zio);
+ return;
+ }
+
+ if (zio->io_error != 0 && zio->io_error != ECKSUM &&
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) {
+ /*
+ * Poor man's hotplug support. Even if we're done retrying this
+ * I/O, try to reopen the vdev to see if it's still attached.
+ * To avoid excessive thrashing, we only try it once a minute.
+ * This also has the effect of detecting when missing devices
+ * have come back, by polling the device once a minute.
+ *
+ * We need to do this asynchronously because we can't grab
+ * all the necessary locks way down here.
+ */
+ if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) {
+ vd->vdev_last_try = gethrtime();
+ tvd->vdev_reopen_wanted = 1;
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN);
+ }
+ }
+
+ zio_next_stage(zio);
+}
+
+void
+zio_vdev_io_reissue(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+ ASSERT(zio->io_error == 0);
+
+ zio->io_stage--;
+}
+
+void
+zio_vdev_io_redone(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
+
+ zio->io_stage--;
+}
+
+void
+zio_vdev_io_bypass(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+ ASSERT(zio->io_error == 0);
+
+ zio->io_flags |= ZIO_FLAG_IO_BYPASS;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
+}
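+
+/*
+ * The three helpers above let vdev implementations (e.g. the mirror
+ * code) rewind the pipeline: reissue repeats VDEV_IO_START, redone
+ * repeats VDEV_IO_DONE, and bypass skips ahead to VDEV_IO_ASSESS.
+ */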
+
+/*
+ * ==========================================================================
+ * Generate and verify checksums
+ * ==========================================================================
+ */
+static void
+zio_checksum_generate(zio_t *zio)
+{
+ int checksum = zio->io_checksum;
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+
+ BP_SET_CHECKSUM(bp, checksum);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+ zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_gang_checksum_generate(zio_t *zio)
+{
+ zio_cksum_t zc;
+ zio_gbh_phys_t *gbh = zio->io_data;
+
+ ASSERT(BP_IS_GANG(zio->io_bp));
+ ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
+
+ zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
+
+ zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
+
+ zio_next_stage(zio);
+}
+
+static void
+zio_checksum_verify(zio_t *zio)
+{
+ if (zio->io_bp != NULL) {
+ zio->io_error = zio_checksum_error(zio);
+ if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
+ zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+ zio->io_spa, zio->io_vd, zio, 0, 0);
+ }
+
+ zio_next_stage(zio);
+}
+
+/*
+ * Called by RAID-Z to ensure we don't compute the checksum twice.
+ */
+void
+zio_checksum_verified(zio_t *zio)
+{
+ zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+}
+
+/*
+ * Set the external verifier for a gang block based on the contents of the bp.
+ */
+void
+zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
+ zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
+ zcp->zc_word[2] = bp->blk_birth;
+ zcp->zc_word[3] = 0;
+}
+
+/*
+ * ==========================================================================
+ * Define the pipeline
+ * ==========================================================================
+ */
+typedef void zio_pipe_stage_t(zio_t *zio);
+
+static void
+zio_badop(zio_t *zio)
+{
+ panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
+}
+
+zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
+ zio_badop,
+ zio_wait_children_ready,
+ zio_write_compress,
+ zio_checksum_generate,
+ zio_gang_pipeline,
+ zio_get_gang_header,
+ zio_rewrite_gang_members,
+ zio_free_gang_members,
+ zio_claim_gang_members,
+ zio_dva_allocate,
+ zio_dva_free,
+ zio_dva_claim,
+ zio_gang_checksum_generate,
+ zio_ready,
+ zio_vdev_io_start,
+ zio_vdev_io_done,
+ zio_vdev_io_assess,
+ zio_wait_children_done,
+ zio_checksum_verify,
+ zio_read_gang_members,
+ zio_read_decompress,
+ zio_done,
+ zio_badop
+};
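+
+/*
+ * io_stage indexes this table directly; io_pipeline is a bitmask
+ * selecting which of these stages a given zio visits (the read
+ * pipeline, for instance, skips the write and gang-allocation stages),
+ * with zio_badop guarding both ends.
+ */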
+
+/*
+ * Move an I/O to the next stage of the pipeline and execute that stage.
+ * There's no locking on io_stage because there's no legitimate way for
+ * multiple threads to be attempting to process the same I/O.
+ */
+void
+zio_next_stage(zio_t *zio)
+{
+ uint32_t pipeline = zio->io_pipeline;
+
+ ASSERT(!MUTEX_HELD(&zio->io_lock));
+
+ if (zio->io_error) {
+ dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
+ zio, vdev_description(zio->io_vd),
+ zio->io_offset, zio->io_stage, zio->io_error);
+ if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
+ pipeline &= ZIO_ERROR_PIPELINE_MASK;
+ }
+
+ while (((1U << ++zio->io_stage) & pipeline) == 0)
+ continue;
+
+ ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
+ ASSERT(zio->io_stalled == 0);
+
+ /*
+ * See the comment in zio_next_stage_async() about per-CPU taskqs.
+ */
+ if (((1U << zio->io_stage) & zio->io_async_stages) &&
+ (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) &&
+ !(zio->io_flags & ZIO_FLAG_METADATA)) {
+ taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
+ (void) taskq_dispatch(tq,
+ (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
+ } else {
+ zio_pipeline[zio->io_stage](zio);
+ }
+}
+
+void
+zio_next_stage_async(zio_t *zio)
+{
+ taskq_t *tq;
+ uint32_t pipeline = zio->io_pipeline;
+
+ ASSERT(!MUTEX_HELD(&zio->io_lock));
+
+ if (zio->io_error) {
+ dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
+ zio, vdev_description(zio->io_vd),
+ zio->io_offset, zio->io_stage, zio->io_error);
+ if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
+ pipeline &= ZIO_ERROR_PIPELINE_MASK;
+ }
+
+ while (((1U << ++zio->io_stage) & pipeline) == 0)
+ continue;
+
+ ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
+ ASSERT(zio->io_stalled == 0);
+
+ /*
+ * For performance, we'll probably want two sets of task queues:
+ * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU
+ * part is for read performance: since we have to make a pass over
+ * the data to checksum it anyway, we want to do this on the same CPU
+ * that issued the read, because (assuming CPU scheduling affinity)
+ * that thread is probably still there. Getting this optimization
+ * right avoids performance-hostile cache-to-cache transfers.
+ *
+ * Note that having two sets of task queues is also necessary for
+ * correctness: if all of the issue threads get bogged down waiting
+ * for dependent reads (e.g. metaslab freelist) to complete, then
+ * there won't be any threads available to service I/O completion
+ * interrupts.
+ */
+ if ((1U << zio->io_stage) & zio->io_async_stages) {
+ if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
+ tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
+ else
+ tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
+ (void) taskq_dispatch(tq,
+ (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
+ } else {
+ zio_pipeline[zio->io_stage](zio);
+ }
+}
+
+static boolean_t
+zio_alloc_should_fail(void)
+{
+ static uint16_t allocs = 0;
+
+ return (P2PHASE(allocs++, 1U<<zio_zil_fail_shift) == 0);
+}
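+
+/*
+ * Illustration: P2PHASE(x, align) is x modulo align (a power of two), so
+ * with zio_zil_fail_shift set to 3, every 8th call (allocs 0, 8, 16, ...)
+ * reports a failure.  The tunable exists purely to exercise the ZIL's
+ * fallback behavior when intent log block allocation fails.
+ */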
+
+/*
+ * Try to allocate an intent log block. Return 0 on success, errno on failure.
+ */
+int
+zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
+ uint64_t txg)
+{
+ int error;
+
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ if (zio_zil_fail_shift && zio_alloc_should_fail()) {
+ spa_config_exit(spa, FTAG);
+ return (ENOSPC);
+ }
+
+ /*
+ * We were passed the previous log block's dva_t in old_bp->blk_dva[0].
+ */
+ error = metaslab_alloc(spa, size, new_bp, 1, txg, old_bp, B_TRUE);
+
+ if (error == 0) {
+ BP_SET_LSIZE(new_bp, size);
+ BP_SET_PSIZE(new_bp, size);
+ BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
+ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
+ BP_SET_LEVEL(new_bp, 0);
+ BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
+ new_bp->blk_birth = txg;
+ }
+
+ spa_config_exit(spa, FTAG);
+
+ return (error);
+}
+
+/*
+ * Free an intent log block. We know it can't be a gang block, so there's
+ * nothing to do except metaslab_free() it.
+ */
+void
+zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
+{
+ ASSERT(!BP_IS_GANG(bp));
+
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ metaslab_free(spa, bp, txg, B_FALSE);
+
+ spa_config_exit(spa, FTAG);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
new file mode 100644
index 0000000..f0d9a14
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
@@ -0,0 +1,172 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * Checksum vectors.
+ *
+ * In the SPA, everything is checksummed. We support checksum vectors
+ * for three distinct reasons:
+ *
+ * 1. Different kinds of data need different levels of protection.
+ * For SPA metadata, we always want a very strong checksum.
+ * For user data, we let users make the trade-off between speed
+ * and checksum strength.
+ *
+ * 2. Cryptographic hash and MAC algorithms are an area of active research.
+ *    It is likely that future hash functions will be at least as strong
+ *    as the current best of breed, and may be substantially faster as well.
+ * We want the ability to take advantage of these new hashes as soon as
+ * they become available.
+ *
+ * 3. If someone develops hardware that can compute a strong hash quickly,
+ * we want the ability to take advantage of that hardware.
+ *
+ * Of course, we don't want a checksum upgrade to invalidate existing
+ * data, so we store the checksum *function* in five bits of the DVA.
+ * This gives us room for up to 32 different checksum functions.
+ *
+ * When writing a block, we always checksum it with the latest-and-greatest
+ * checksum function of the appropriate strength. When reading a block,
+ * we compare the expected checksum against the actual checksum, which we
+ * compute via the checksum function specified in the DVA encoding.
+ */
+
+/*ARGSUSED*/
+static void
+zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
+ {{NULL, NULL}, 0, 0, "inherit"},
+ {{NULL, NULL}, 0, 0, "on"},
+ {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"},
+ {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, "zilog"},
+ {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"},
+ {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"},
+};
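+
+/*
+ * The two integer columns above are ci_correctable (number of correctable
+ * bits) and ci_zbt (nonzero when the checksum is stored in a
+ * zio_block_tail_t embedded at the end of the block, as for the label,
+ * gang header and intent log entries).
+ */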
+
+uint8_t
+zio_checksum_select(uint8_t child, uint8_t parent)
+{
+ ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+ if (child == ZIO_CHECKSUM_INHERIT)
+ return (parent);
+
+ if (child == ZIO_CHECKSUM_ON)
+ return (ZIO_CHECKSUM_ON_VALUE);
+
+ return (child);
+}
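+
+/*
+ * Usage sketch (hypothetical values): the child's raw property setting and
+ * the parent's already-resolved value are fed through the function above:
+ *
+ *	uint8_t eff;
+ *	eff = zio_checksum_select(ZIO_CHECKSUM_INHERIT, ZIO_CHECKSUM_FLETCHER_2);
+ *	(eff == ZIO_CHECKSUM_FLETCHER_2: "inherit" takes the parent's value;
+ *	ZIO_CHECKSUM_ON would instead map to ZIO_CHECKSUM_ON_VALUE.)
+ */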
+
+/*
+ * Generate the checksum.
+ */
+void
+zio_checksum(uint_t checksum, zio_cksum_t *zcp, void *data, uint64_t size)
+{
+ zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+ zio_cksum_t zbt_cksum;
+
+ ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(ci->ci_func[0] != NULL);
+
+ if (ci->ci_zbt) {
+ *zcp = zbt->zbt_cksum;
+ zbt->zbt_magic = ZBT_MAGIC;
+ ci->ci_func[0](data, size, &zbt_cksum);
+ zbt->zbt_cksum = zbt_cksum;
+ } else {
+ ci->ci_func[0](data, size, zcp);
+ }
+}
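+
+/*
+ * Note on the ci_zbt case above (inferred from the generate/verify pair in
+ * this file): the real checksum lives in the zio_block_tail_t at the end of
+ * the block itself, while bp->blk_cksum carries a verifier value that is
+ * substituted into zbt_cksum before computing, making the block both
+ * self-checksumming and bound to its expected identity.
+ */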
+
+int
+zio_checksum_error(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ zio_cksum_t zc = bp->blk_cksum;
+ uint_t checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER :
+ BP_GET_CHECKSUM(bp);
+ int byteswap = BP_SHOULD_BYTESWAP(bp);
+ void *data = zio->io_data;
+ uint64_t size = ZIO_GET_IOSIZE(zio);
+ zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+ zio_cksum_t actual_cksum, expected_cksum;
+
+ if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
+ return (EINVAL);
+
+ if (ci->ci_zbt) {
+ if (checksum == ZIO_CHECKSUM_GANG_HEADER)
+ zio_set_gang_verifier(zio, &zc);
+
+ if (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)) {
+ expected_cksum = zbt->zbt_cksum;
+ byteswap_uint64_array(&expected_cksum,
+ sizeof (zio_cksum_t));
+ zbt->zbt_cksum = zc;
+ byteswap_uint64_array(&zbt->zbt_cksum,
+ sizeof (zio_cksum_t));
+ ci->ci_func[1](data, size, &actual_cksum);
+ zbt->zbt_cksum = expected_cksum;
+ byteswap_uint64_array(&zbt->zbt_cksum,
+ sizeof (zio_cksum_t));
+ } else {
+ expected_cksum = zbt->zbt_cksum;
+ zbt->zbt_cksum = zc;
+ ci->ci_func[0](data, size, &actual_cksum);
+ zbt->zbt_cksum = expected_cksum;
+ }
+ zc = expected_cksum;
+ } else {
+ ASSERT(!BP_IS_GANG(bp));
+ ci->ci_func[byteswap](data, size, &actual_cksum);
+ }
+
+ if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc))
+ return (ECKSUM);
+
+ if (zio_injection_enabled && !zio->io_error)
+ return (zio_handle_fault_injection(zio, ECKSUM));
+
+ return (0);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
new file mode 100644
index 0000000..c563be4
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
@@ -0,0 +1,148 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/compress.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+
+/*
+ * Compression vectors.
+ */
+
+zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
+ {NULL, NULL, 0, "inherit"},
+ {NULL, NULL, 0, "on"},
+ {NULL, NULL, 0, "uncompressed"},
+ {lzjb_compress, lzjb_decompress, 0, "lzjb"},
+ {NULL, NULL, 0, "empty"},
+ {gzip_compress, gzip_decompress, 1, "gzip-1"},
+ {gzip_compress, gzip_decompress, 2, "gzip-2"},
+ {gzip_compress, gzip_decompress, 3, "gzip-3"},
+ {gzip_compress, gzip_decompress, 4, "gzip-4"},
+ {gzip_compress, gzip_decompress, 5, "gzip-5"},
+ {gzip_compress, gzip_decompress, 6, "gzip-6"},
+ {gzip_compress, gzip_decompress, 7, "gzip-7"},
+ {gzip_compress, gzip_decompress, 8, "gzip-8"},
+ {gzip_compress, gzip_decompress, 9, "gzip-9"},
+};
+
+uint8_t
+zio_compress_select(uint8_t child, uint8_t parent)
+{
+ ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON);
+
+ if (child == ZIO_COMPRESS_INHERIT)
+ return (parent);
+
+ if (child == ZIO_COMPRESS_ON)
+ return (ZIO_COMPRESS_ON_VALUE);
+
+ return (child);
+}
+
+int
+zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
+ uint64_t *destsizep, uint64_t *destbufsizep)
+{
+ uint64_t *word, *word_end;
+ uint64_t ciosize, gapsize, destbufsize;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ char *dest;
+ uint_t allzero;
+
+ ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
+
+ /*
+ * If the data is all zeroes, we don't even need to allocate
+ * a block for it. We indicate this by setting *destsizep = 0.
+ */
+ allzero = 1;
+ word = src;
+ word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize);
+ while (word < word_end) {
+ if (*word++ != 0) {
+ allzero = 0;
+ break;
+ }
+ }
+ if (allzero) {
+ *destp = NULL;
+ *destsizep = 0;
+ *destbufsizep = 0;
+ return (1);
+ }
+
+ if (cpfunc == ZIO_COMPRESS_EMPTY)
+ return (0);
+
+ /* Compress at least 12.5% */
+ destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
+ if (destbufsize == 0)
+ return (0);
+ dest = zio_buf_alloc(destbufsize);
+
+ ciosize = ci->ci_compress(src, dest, (size_t)srcsize,
+ (size_t)destbufsize, ci->ci_level);
+ if (ciosize > destbufsize) {
+ zio_buf_free(dest, destbufsize);
+ return (0);
+ }
+
+ /* Cool. We compressed at least as much as we were hoping to. */
+
+ /* For security, make sure we don't write random heap crap to disk */
+ gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;
+ if (gapsize != 0) {
+ bzero(dest + ciosize, gapsize);
+ ciosize += gapsize;
+ }
+
+ ASSERT3U(ciosize, <=, destbufsize);
+ ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
+ *destp = dest;
+ *destsizep = ciosize;
+ *destbufsizep = destbufsize;
+
+ return (1);
+}
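+
+/*
+ * Worked example: for a 128K source block, destbufsize =
+ * P2ALIGN(131072 - 16384, 512) = 114688 bytes (112K), so the compressed
+ * output must save at least 12.5%, rounded down to a 512-byte multiple,
+ * or the block is written uncompressed.
+ */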
+
+int
+zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
+ void *dest, uint64_t destsize)
+{
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+
+ ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+
+ return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
new file mode 100644
index 0000000..4cada09
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
@@ -0,0 +1,315 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZFS fault injection
+ *
+ * To handle fault injection, we keep track of a series of zinject_record_t
+ * structures which describe which logical block(s) should be injected with a
+ * fault. These are kept in a global list. Each record corresponds to a given
+ * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
+ * or exported while the injection record exists.
+ *
+ * Device level injection is done using the 'zi_guid' field. If this is set, it
+ * means that the error is destined for a particular device, not a piece of
+ * data.
+ *
+ * This is a rather poor data structure and algorithm, but we don't expect more
+ * than a few faults at any one time, so it should be sufficient for our needs.
+ */
+
+#include <sys/arc.h>
+#include <sys/zio_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+
+uint32_t zio_injection_enabled;
+
+typedef struct inject_handler {
+ int zi_id;
+ spa_t *zi_spa;
+ zinject_record_t zi_record;
+ list_node_t zi_link;
+} inject_handler_t;
+
+static list_t inject_handlers;
+static krwlock_t inject_lock;
+static int inject_next_id = 1;
+
+/*
+ * Returns true if the given record matches the I/O in progress.
+ */
+static boolean_t
+zio_match_handler(zbookmark_t *zb, uint64_t type,
+ zinject_record_t *record, int error)
+{
+ /*
+ * Check for a match against the MOS, which is based on type
+ */
+ if (zb->zb_objset == 0 && record->zi_objset == 0 &&
+ record->zi_object == 0) {
+ if (record->zi_type == DMU_OT_NONE ||
+ type == record->zi_type)
+ return (record->zi_freq == 0 ||
+ spa_get_random(100) < record->zi_freq);
+ else
+ return (B_FALSE);
+ }
+
+ /*
+ * Check for an exact match.
+ */
+ if (zb->zb_objset == record->zi_objset &&
+ zb->zb_object == record->zi_object &&
+ zb->zb_level == record->zi_level &&
+ zb->zb_blkid >= record->zi_start &&
+ zb->zb_blkid <= record->zi_end &&
+ error == record->zi_error)
+ return (record->zi_freq == 0 ||
+ spa_get_random(100) < record->zi_freq);
+
+ return (B_FALSE);
+}
+
+/*
+ * Determine if the I/O in question should return failure. Returns the errno
+ * to be returned to the caller.
+ */
+int
+zio_handle_fault_injection(zio_t *zio, int error)
+{
+ int ret = 0;
+ inject_handler_t *handler;
+
+ /*
+ * Ignore I/O not associated with any logical data.
+ */
+ if (zio->io_logical == NULL)
+ return (0);
+
+ /*
+ * Currently, we only support fault injection on reads.
+ */
+ if (zio->io_type != ZIO_TYPE_READ)
+ return (0);
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ /* Ignore errors not destined for this pool */
+ if (zio->io_spa != handler->zi_spa)
+ continue;
+
+ /* Ignore device errors */
+ if (handler->zi_record.zi_guid != 0)
+ continue;
+
+ /* If this handler matches, return EIO */
+ if (zio_match_handler(&zio->io_logical->io_bookmark,
+ zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
+ &handler->zi_record, error)) {
+ ret = error;
+ break;
+ }
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
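+
+/*
+ * Sketch of a matching record (hypothetical values): reads of object 5 in
+ * objset 21 fail with EIO roughly half the time given:
+ *
+ *	zinject_record_t zr = { 0 };
+ *	zr.zi_objset = 21;
+ *	zr.zi_object = 5;
+ *	zr.zi_end = UINT64_MAX;		(match all block ids)
+ *	zr.zi_error = EIO;
+ *	zr.zi_freq = 50;		(spa_get_random(100) < 50)
+ */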
+
+int
+zio_handle_device_injection(vdev_t *vd, int error)
+{
+ inject_handler_t *handler;
+ int ret = 0;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (vd->vdev_guid == handler->zi_record.zi_guid) {
+ if (handler->zi_record.zi_error == error) {
+ /*
+ * For a failed open, pretend like the device
+ * has gone away.
+ */
+ if (error == ENXIO)
+ vd->vdev_stat.vs_aux =
+ VDEV_AUX_OPEN_FAILED;
+ ret = error;
+ break;
+ }
+ if (handler->zi_record.zi_error == ENXIO) {
+ ret = EIO;
+ break;
+ }
+ }
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+/*
+ * Create a new handler for the given record. We add it to the list, adding
+ * a reference to the spa_t in the process. We increment zio_injection_enabled,
+ * which is the switch to trigger all fault injection.
+ */
+int
+zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
+{
+ inject_handler_t *handler;
+ int error;
+ spa_t *spa;
+
+ /*
+ * If this is pool-wide metadata, make sure we unload the corresponding
+ * spa_t, so that the next attempt to load it will trigger the fault.
+ * We call spa_reset() to unload the pool appropriately.
+ */
+ if (flags & ZINJECT_UNLOAD_SPA)
+ if ((error = spa_reset(name)) != 0)
+ return (error);
+
+ if (!(flags & ZINJECT_NULL)) {
+ /*
+ * spa_inject_addref() will add an injection reference, which will
+ * prevent the pool from being removed from the namespace while
+ * still allowing it to be unloaded.
+ */
+ if ((spa = spa_inject_addref(name)) == NULL)
+ return (ENOENT);
+
+ handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
+
+ rw_enter(&inject_lock, RW_WRITER);
+
+ *id = handler->zi_id = inject_next_id++;
+ handler->zi_spa = spa;
+ handler->zi_record = *record;
+ list_insert_tail(&inject_handlers, handler);
+ atomic_add_32(&zio_injection_enabled, 1);
+
+ rw_exit(&inject_lock);
+ }
+
+ /*
+ * Flush the ARC, so that any attempts to read this data will end up
+ * going to the ZIO layer. Note that this is a little overkill, but
+ * we don't have the necessary ARC interfaces to do anything else, and
+ * fault injection isn't a performance critical path.
+ */
+ if (flags & ZINJECT_FLUSH_ARC)
+ arc_flush();
+
+ return (0);
+}
+
+/*
+ * Returns the next record with an ID greater than that supplied to the
+ * function. Used to iterate over all handlers in the system.
+ */
+int
+zio_inject_list_next(int *id, char *name, size_t buflen,
+ zinject_record_t *record)
+{
+ inject_handler_t *handler;
+ int ret;
+
+ mutex_enter(&spa_namespace_lock);
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler))
+ if (handler->zi_id > *id)
+ break;
+
+ if (handler) {
+ *record = handler->zi_record;
+ *id = handler->zi_id;
+ (void) strncpy(name, spa_name(handler->zi_spa), buflen);
+ ret = 0;
+ } else {
+ ret = ENOENT;
+ }
+
+ rw_exit(&inject_lock);
+ mutex_exit(&spa_namespace_lock);
+
+ return (ret);
+}
+
+/*
+ * Clear the fault handler with the given identifier, or return ENOENT if none
+ * exists.
+ */
+int
+zio_clear_fault(int id)
+{
+ inject_handler_t *handler;
+ int ret;
+
+ rw_enter(&inject_lock, RW_WRITER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler))
+ if (handler->zi_id == id)
+ break;
+
+ if (handler == NULL) {
+ ret = ENOENT;
+ } else {
+ list_remove(&inject_handlers, handler);
+ spa_inject_delref(handler->zi_spa);
+ kmem_free(handler, sizeof (inject_handler_t));
+ atomic_add_32(&zio_injection_enabled, -1);
+ ret = 0;
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+void
+zio_inject_init(void)
+{
+ list_create(&inject_handlers, sizeof (inject_handler_t),
+ offsetof(inject_handler_t, zi_link));
+}
+
+void
+zio_inject_fini(void)
+{
+ list_destroy(&inject_handlers);
+}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
new file mode 100644
index 0000000..9aa8b7f
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -0,0 +1,796 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZFS volume emulation driver.
+ *
+ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
+ * Volumes are accessed through the symbolic links named:
+ *
+ * /dev/zvol/dsk/<pool_name>/<dataset_name>
+ * /dev/zvol/rdsk/<pool_name>/<dataset_name>
+ *
+ * These links are created by the ZFS-specific devfsadm link generator.
+ * Volumes are persistent through reboot. No user command needs to be
+ * run before opening and using a device.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/bio.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dsl_prop.h>
+#include <sys/byteorder.h>
+#include <sys/dirent.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zil.h>
+#include <sys/refcount.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_rlock.h>
+#include <geom/geom.h>
+
+#include "zfs_namecheck.h"
+
+struct g_class zfs_zvol_class = {
+ .name = "ZFS::ZVOL",
+ .version = G_VERSION,
+};
+
+DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
+
+#define ZVOL_OBJ 1ULL
+#define ZVOL_ZAP_OBJ 2ULL
+
+static uint32_t zvol_minors;
+
+/*
+ * The in-core state of each volume.
+ */
+typedef struct zvol_state {
+ char zv_name[MAXPATHLEN]; /* pool/dataset name */
+ uint64_t zv_volsize; /* amount of space we advertise */
+ uint64_t zv_volblocksize; /* volume block size */
+ struct g_provider *zv_provider; /* GEOM provider */
+ uint8_t zv_min_bs; /* minimum addressable block shift */
+ uint8_t zv_readonly; /* hard readonly; like write-protect */
+ objset_t *zv_objset; /* objset handle */
+ uint32_t zv_mode; /* DS_MODE_* flags at open time */
+ uint32_t zv_total_opens; /* total open count */
+ zilog_t *zv_zilog; /* ZIL handle */
+ uint64_t zv_txg_assign; /* txg to assign during ZIL replay */
+ znode_t zv_znode; /* for range locking */
+ int zv_state;
+ struct bio_queue_head zv_queue;
+ struct mtx zv_queue_mtx; /* zv_queue mutex */
+} zvol_state_t;
+
+/*
+ * zvol maximum transfer in one DMU tx.
+ */
+int zvol_maxphys = DMU_MAX_ACCESS/2;
+
+static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
+
+int
+zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
+{
+ if (volsize == 0)
+ return (EINVAL);
+
+ if (volsize % blocksize != 0)
+ return (EINVAL);
+
+#ifdef _ILP32
+ if (volsize - 1 > SPEC_MAXOFFSET_T)
+ return (EOVERFLOW);
+#endif
+ return (0);
+}
+
+int
+zvol_check_volblocksize(uint64_t volblocksize)
+{
+ if (volblocksize < SPA_MINBLOCKSIZE ||
+ volblocksize > SPA_MAXBLOCKSIZE ||
+ !ISP2(volblocksize))
+ return (EDOM);
+
+ return (0);
+}
+
+static void
+zvol_readonly_changed_cb(void *arg, uint64_t newval)
+{
+ zvol_state_t *zv = arg;
+
+ zv->zv_readonly = (uint8_t)newval;
+}
+
+int
+zvol_get_stats(objset_t *os, nvlist_t *nv)
+{
+ int error;
+ dmu_object_info_t doi;
+ uint64_t val;
+
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
+ if (error)
+ return (error);
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
+
+ error = dmu_object_info(os, ZVOL_OBJ, &doi);
+
+ if (error == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
+ doi.doi_data_block_size);
+ }
+
+ return (error);
+}
+
+static zvol_state_t *
+zvol_minor_lookup(const char *name)
+{
+ struct g_provider *pp;
+ struct g_geom *gp;
+
+ g_topology_assert();
+
+ LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
+ LIST_FOREACH(pp, &gp->provider, provider) {
+ if (strcmp(pp->name + sizeof(ZVOL_DEV_DIR), name) == 0)
+ return (pp->private);
+ }
+ }
+
+ return (NULL);
+}
+
+static int
+zvol_access(struct g_provider *pp, int acr, int acw, int ace)
+{
+ zvol_state_t *zv;
+
+ g_topology_assert();
+
+ zv = pp->private;
+ if (zv == NULL) {
+ if (acr <= 0 && acw <= 0 && ace <= 0)
+ return (0);
+ return (pp->error);
+ }
+
+ ASSERT(zv->zv_objset != NULL);
+
+ if (acw > 0 && (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)))
+ return (EROFS);
+
+ zv->zv_total_opens += acr + acw + ace;
+
+ return (0);
+}
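+
+/*
+ * Note: in GEOM's access protocol acr, acw and ace are signed deltas to the
+ * provider's read, write and exclusive reference counts, which is why they
+ * are simply summed into zv_total_opens above and why closes (negative
+ * deltas) need no further checks.
+ */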
+
+/*
+ * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
+ *
+ * We store data in the log buffers if it's small enough.
+ * Otherwise we will later flush the data out via dmu_sync().
+ */
+ssize_t zvol_immediate_write_sz = 32768;
+
+static void
+zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
+{
+ uint32_t blocksize = zv->zv_volblocksize;
+ lr_write_t *lr;
+
+ while (len) {
+ ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
+ itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+
+ itx->itx_wr_state =
+ len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY;
+ itx->itx_private = zv;
+ lr = (lr_write_t *)&itx->itx_lr;
+ lr->lr_foid = ZVOL_OBJ;
+ lr->lr_offset = off;
+ lr->lr_length = nbytes;
+ lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
+ BP_ZERO(&lr->lr_blkptr);
+
+ (void) zil_itx_assign(zv->zv_zilog, itx, tx);
+ len -= nbytes;
+ off += nbytes;
+ }
+}
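+
+/*
+ * Illustration: with the default 32K zvol_immediate_write_sz, a 4K
+ * synchronous write is copied into the log record (WR_NEED_COPY), while the
+ * chunks of a larger write are flagged WR_INDIRECT for as long as more than
+ * 32K remains, so their data is flushed once via dmu_sync() rather than
+ * written twice.
+ */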
+
+static void
+zvol_start(struct bio *bp)
+{
+ zvol_state_t *zv;
+
+ switch (bp->bio_cmd) {
+ case BIO_READ:
+ case BIO_WRITE:
+ case BIO_FLUSH:
+ zv = bp->bio_to->private;
+ ASSERT(zv != NULL);
+ mtx_lock(&zv->zv_queue_mtx);
+ bioq_insert_tail(&zv->zv_queue, bp);
+ wakeup_one(&zv->zv_queue);
+ mtx_unlock(&zv->zv_queue_mtx);
+ break;
+ case BIO_DELETE:
+ case BIO_GETATTR:
+ default:
+ g_io_deliver(bp, EOPNOTSUPP);
+ break;
+ }
+}
+
+static void
+zvol_serve_one(zvol_state_t *zv, struct bio *bp)
+{
+ uint64_t off, volsize;
+ size_t size, resid;
+ char *addr;
+ objset_t *os;
+ rl_t *rl;
+ int error = 0;
+ boolean_t reading;
+
+ off = bp->bio_offset;
+ volsize = zv->zv_volsize;
+
+ os = zv->zv_objset;
+ ASSERT(os != NULL);
+
+ addr = bp->bio_data;
+ resid = bp->bio_length;
+
+ error = 0;
+
+ /*
+ * There must be no buffer changes when doing a dmu_sync() because
+ * we can't change the data whilst calculating the checksum.
+ * Rather than a per-zvol rwlock, we lock just the range in question.
+ */
+ reading = (bp->bio_cmd == BIO_READ);
+ rl = zfs_range_lock(&zv->zv_znode, off, resid,
+ reading ? RL_READER : RL_WRITER);
+
+ while (resid != 0 && off < volsize) {
+
+ size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */
+
+ if (size > volsize - off) /* don't write past the end */
+ size = volsize - off;
+
+ if (reading) {
+ error = dmu_read(os, ZVOL_OBJ, off, size, addr);
+ } else {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
+ zvol_log_write(zv, tx, off, size);
+ dmu_tx_commit(tx);
+ }
+ }
+ if (error)
+ break;
+ off += size;
+ addr += size;
+ resid -= size;
+ }
+ zfs_range_unlock(rl);
+
+ bp->bio_completed = bp->bio_length - resid;
+ if (bp->bio_completed < bp->bio_length)
+ bp->bio_error = (off > volsize ? EINVAL : error);
+
+ /*
+ * XXX: Are we delivering here?  It looks like I am missing
+ * something, as I was sure this was an async request.
+ */
+ g_io_deliver(bp, bp->bio_error);
+}
+
+static void
+zvol_worker(void *arg)
+{
+ zvol_state_t *zv;
+ struct bio *bp;
+
+ zv = arg;
+ for (;;) {
+ mtx_lock(&zv->zv_queue_mtx);
+ bp = bioq_takefirst(&zv->zv_queue);
+ if (bp == NULL) {
+ if (zv->zv_state == 1) {
+ zv->zv_state = 2;
+ wakeup(&zv->zv_state);
+ mtx_unlock(&zv->zv_queue_mtx);
+ kthread_exit(0);
+ }
+ msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
+ "zvol:io", 0);
+ continue;
+ }
+ mtx_unlock(&zv->zv_queue_mtx);
+ if (bp->bio_cmd == BIO_FLUSH) {
+ zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
+ g_io_deliver(bp, 0);
+ } else {
+ zvol_serve_one(zv, bp);
+ }
+ }
+}
+
+void
+zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
+{
+ zfs_create_data_t *zc = arg;
+ int error;
+ uint64_t volblocksize, volsize;
+
+ VERIFY(nvlist_lookup_uint64(zc->zc_props,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
+ if (nvlist_lookup_uint64(zc->zc_props,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
+ volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
+
+ /*
+ * These properties must be removed from the list so the generic
+ * property setting step won't apply to them.
+ */
+ VERIFY(nvlist_remove_all(zc->zc_props,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
+ (void) nvlist_remove_all(zc->zc_props,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
+
+ error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
+ ASSERT(error == 0);
+}
+
+/*
+ * Replay a TX_WRITE ZIL transaction that didn't get committed
+ * after a system failure.
+ */
+static int
+zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
+{
+ objset_t *os = zv->zv_objset;
+ char *data = (char *)(lr + 1); /* data follows lr_write_t */
+ uint64_t off = lr->lr_offset;
+ uint64_t len = lr->lr_length;
+ dmu_tx_t *tx;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
+ error = dmu_tx_assign(tx, zv->zv_txg_assign);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_write(os, ZVOL_OBJ, off, len, data, tx);
+ dmu_tx_commit(tx);
+ }
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
+{
+ return (ENOTSUP);
+}
+
+/*
+ * Callback vectors for replaying records.
+ * Only TX_WRITE is needed for zvol.
+ */
+zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
+ zvol_replay_err, /* 0 no such transaction type */
+ zvol_replay_err, /* TX_CREATE */
+ zvol_replay_err, /* TX_MKDIR */
+ zvol_replay_err, /* TX_MKXATTR */
+ zvol_replay_err, /* TX_SYMLINK */
+ zvol_replay_err, /* TX_REMOVE */
+ zvol_replay_err, /* TX_RMDIR */
+ zvol_replay_err, /* TX_LINK */
+ zvol_replay_err, /* TX_RENAME */
+ zvol_replay_write, /* TX_WRITE */
+ zvol_replay_err, /* TX_TRUNCATE */
+ zvol_replay_err, /* TX_SETATTR */
+ zvol_replay_err, /* TX_ACL */
+};
+
+/*
+ * Create a minor node for the specified volume.
+ */
+int
+zvol_create_minor(const char *name, dev_t dev)
+{
+ struct g_provider *pp;
+ struct g_geom *gp;
+ zvol_state_t *zv;
+ objset_t *os;
+ dmu_object_info_t doi;
+ uint64_t volsize;
+ int ds_mode = DS_MODE_PRIMARY;
+ int error;
+
+ DROP_GIANT();
+ g_topology_lock();
+
+ if ((zv = zvol_minor_lookup(name)) != NULL) {
+ error = EEXIST;
+ goto end;
+ }
+
+ if (strchr(name, '@') != 0)
+ ds_mode |= DS_MODE_READONLY;
+
+ error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);
+ if (error)
+ goto end;
+
+ g_topology_unlock();
+ PICKUP_GIANT();
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+ DROP_GIANT();
+ g_topology_lock();
+ if (error) {
+ dmu_objset_close(os);
+ goto end;
+ }
+
+ gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+ gp->start = zvol_start;
+ gp->access = zvol_access;
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DEV_DIR, name);
+ pp->mediasize = volsize;
+ pp->sectorsize = DEV_BSIZE;
+
+ zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
+ (void) strcpy(zv->zv_name, name);
+ zv->zv_min_bs = DEV_BSHIFT;
+ zv->zv_provider = pp;
+ zv->zv_volsize = pp->mediasize;
+ zv->zv_objset = os;
+ zv->zv_mode = ds_mode;
+ zv->zv_zilog = zil_open(os, zvol_get_data);
+ mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
+ sizeof (rl_t), offsetof(rl_t, r_node));
+
+ /* get and cache the blocksize */
+ error = dmu_object_info(os, ZVOL_OBJ, &doi);
+ ASSERT(error == 0);
+ zv->zv_volblocksize = doi.doi_data_block_size;
+
+ zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector);
+
+ /* XXX this should handle the possible i/o error */
+ VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
+ "readonly", zvol_readonly_changed_cb, zv) == 0);
+
+ pp->private = zv;
+ g_error_provider(pp, 0);
+
+ bioq_init(&zv->zv_queue);
+ mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
+ zv->zv_state = 0;
+ kthread_create(zvol_worker, zv, NULL, 0, 0, "zvol:worker %s", pp->name);
+
+ zvol_minors++;
+end:
+ g_topology_unlock();
+ PICKUP_GIANT();
+
+ return (error);
+}
+
+/*
+ * Remove minor node for the specified volume.
+ */
+int
+zvol_remove_minor(const char *name)
+{
+ struct g_provider *pp;
+ zvol_state_t *zv;
+ int error = 0;
+
+ DROP_GIANT();
+ g_topology_lock();
+
+ if ((zv = zvol_minor_lookup(name)) == NULL) {
+ error = ENXIO;
+ goto end;
+ }
+
+ if (zv->zv_total_opens != 0) {
+ error = EBUSY;
+ goto end;
+ }
+
+ VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
+ "readonly", zvol_readonly_changed_cb, zv) == 0);
+
+ mtx_lock(&zv->zv_queue_mtx);
+ zv->zv_state = 1;
+ wakeup_one(&zv->zv_queue);
+ while (zv->zv_state != 2)
+ msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
+ mtx_unlock(&zv->zv_queue_mtx);
+ mtx_destroy(&zv->zv_queue_mtx);
+
+ pp = zv->zv_provider;
+ pp->private = NULL;
+ g_wither_geom(pp->geom, ENXIO);
+
+ zil_close(zv->zv_zilog);
+ zv->zv_zilog = NULL;
+ dmu_objset_close(zv->zv_objset);
+ zv->zv_objset = NULL;
+ avl_destroy(&zv->zv_znode.z_range_avl);
+ mutex_destroy(&zv->zv_znode.z_range_lock);
+
+ kmem_free(zv, sizeof(*zv));
+
+ zvol_minors--;
+end:
+ g_topology_unlock();
+ PICKUP_GIANT();
+
+ return (error);
+}
+
+int
+zvol_set_volsize(const char *name, dev_t dev, uint64_t volsize)
+{
+ zvol_state_t *zv;
+ dmu_tx_t *tx;
+ int error;
+ dmu_object_info_t doi;
+
+ DROP_GIANT();
+ g_topology_lock();
+
+ if ((zv = zvol_minor_lookup(name)) == NULL) {
+ error = ENXIO;
+ goto end;
+ }
+
+ if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
+ (error = zvol_check_volsize(volsize,
+ doi.doi_data_block_size)) != 0) {
+ goto end;
+ }
+
+ if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
+ error = EROFS;
+ goto end;
+ }
+
+ tx = dmu_tx_create(zv->zv_objset);
+ dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+ dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ goto end;
+ }
+
+ error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
+ &volsize, tx);
+ if (error == 0) {
+ error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, volsize,
+ DMU_OBJECT_END, tx);
+ }
+
+ dmu_tx_commit(tx);
+
+ if (error == 0) {
+ zv->zv_volsize = volsize;
+ zv->zv_provider->mediasize = volsize; /* XXX: Not supported. */
+ }
+end:
+ g_topology_unlock();
+ PICKUP_GIANT();
+
+ return (error);
+}
+
+int
+zvol_set_volblocksize(const char *name, uint64_t volblocksize)
+{
+ zvol_state_t *zv;
+ dmu_tx_t *tx;
+ int error;
+
+ DROP_GIANT();
+ g_topology_lock();
+
+ if ((zv = zvol_minor_lookup(name)) == NULL) {
+ error = ENXIO;
+ goto end;
+ }
+
+ if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
+ error = EROFS;
+ goto end;
+ }
+
+ tx = dmu_tx_create(zv->zv_objset);
+ dmu_tx_hold_bonus(tx, ZVOL_OBJ);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
+ volblocksize, 0, tx);
+ if (error == ENOTSUP)
+ error = EBUSY;
+ dmu_tx_commit(tx);
+ /* XXX: Not supported. */
+#if 0
+ if (error == 0)
+ zv->zv_provider->sectorsize = zc->zc_volblocksize;
+#endif
+ }
+end:
+ g_topology_unlock();
+ PICKUP_GIANT();
+
+ return (error);
+}
+
+void
+zvol_get_done(dmu_buf_t *db, void *vzgd)
+{
+ zgd_t *zgd = (zgd_t *)vzgd;
+ rl_t *rl = zgd->zgd_rl;
+
+ dmu_buf_rele(db, vzgd);
+ zfs_range_unlock(rl);
+ zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
+ kmem_free(zgd, sizeof (zgd_t));
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+static int
+zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+{
+ zvol_state_t *zv = arg;
+ objset_t *os = zv->zv_objset;
+ dmu_buf_t *db;
+ rl_t *rl;
+ zgd_t *zgd;
+ uint64_t boff; /* block starting offset */
+ int dlen = lr->lr_length; /* length of user data */
+ int error;
+
+ ASSERT(zio);
+ ASSERT(dlen != 0);
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) /* immediate write */
+ return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf));
+
+ zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_zilog = zv->zv_zilog;
+ zgd->zgd_bp = &lr->lr_blkptr;
+
+ /*
+ * Lock the range of the block to ensure that while the data is
+ * being written out and its checksum calculated, no other thread
+ * can change the block.
+ */
+ boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t);
+ rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize,
+ RL_READER);
+ zgd->zgd_rl = rl;
+
+ VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
+ error = dmu_sync(zio, db, &lr->lr_blkptr,
+ lr->lr_common.lrc_txg, zvol_get_done, zgd);
+ if (error == 0)
+ zil_add_vdev(zv->zv_zilog,
+ DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
+ /*
+ * If we get EINPROGRESS, then we need to wait for a
+ * write IO initiated by dmu_sync() to complete before
+ * we can release this dbuf. We will finish everything
+ * up in the zvol_get_done() callback.
+ */
+ if (error == EINPROGRESS)
+ return (0);
+ dmu_buf_rele(db, zgd);
+ zfs_range_unlock(rl);
+ kmem_free(zgd, sizeof (zgd_t));
+ return (error);
+}
+
+int
+zvol_busy(void)
+{
+ return (zvol_minors != 0);
+}
+
+void
+zvol_init(void)
+{
+ ZFS_LOG(1, "ZVOL Initialized.");
+}
+
+void
+zvol_fini(void)
+{
+ ZFS_LOG(1, "ZVOL Deinitialized.");
+}