summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--sys/boot/zfs/zfs.c2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c507
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c3
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c28
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c10
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c74
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c22
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c10
-rw-r--r--sys/modules/zfs/Makefile4
-rw-r--r--sys/sys/ioccom.h5
11 files changed, 470 insertions, 201 deletions
diff --git a/sys/boot/zfs/zfs.c b/sys/boot/zfs/zfs.c
index 52df773..99bb60a 100644
--- a/sys/boot/zfs/zfs.c
+++ b/sys/boot/zfs/zfs.c
@@ -397,7 +397,7 @@ zfs_dev_init(void)
/*
* Open all the disks we can find and see if we can reconstruct
* ZFS pools from them. Bogusly assumes that the disks are named
- * diskN or diskNsM.
+ * diskN, diskNpM or diskNsM.
*/
zfs_init();
for (unit = 0; unit < 32 /* XXX */; unit++) {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 83a4c97..ceb4c87 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -186,6 +186,11 @@ SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
&zfs_mdcomp_disable, 0, "Disable metadata compression");
+#ifdef ZIO_USE_UMA
+extern kmem_cache_t *zio_buf_cache[];
+extern kmem_cache_t *zio_data_buf_cache[];
+#endif
+
/*
* Note that buffers can be in one of 6 states:
* ARC_anon - anonymous (discussed below)
@@ -218,13 +223,31 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
* second level ARC benefit from these fast lookups.
*/
+#define ARCS_LOCK_PAD CACHE_LINE_SIZE
+struct arcs_lock {
+ kmutex_t arcs_lock;
+#ifdef _KERNEL
+ unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
+#endif
+};
+
+/*
+ * must be power of two for mask use to work
+ *
+ */
+#define ARC_BUFC_NUMDATALISTS 16
+#define ARC_BUFC_NUMMETADATALISTS 16
+#define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
+
typedef struct arc_state {
- list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
uint64_t arcs_size; /* total amount of data in this state */
- kmutex_t arcs_mtx;
+ list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
+ struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
} arc_state_t;
+#define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock))
+
/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
@@ -248,7 +271,9 @@ typedef struct arc_stats {
kstat_named_t arcstat_mru_ghost_hits;
kstat_named_t arcstat_mfu_hits;
kstat_named_t arcstat_mfu_ghost_hits;
+ kstat_named_t arcstat_allocated;
kstat_named_t arcstat_deleted;
+ kstat_named_t arcstat_stolen;
kstat_named_t arcstat_recycle_miss;
kstat_named_t arcstat_mutex_miss;
kstat_named_t arcstat_evict_skip;
@@ -280,6 +305,19 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_size;
kstat_named_t arcstat_l2_hdr_size;
kstat_named_t arcstat_memory_throttle_count;
+ kstat_named_t arcstat_l2_write_trylock_fail;
+ kstat_named_t arcstat_l2_write_passed_headroom;
+ kstat_named_t arcstat_l2_write_spa_mismatch;
+ kstat_named_t arcstat_l2_write_in_l2;
+ kstat_named_t arcstat_l2_write_hdr_io_in_progress;
+ kstat_named_t arcstat_l2_write_not_cacheable;
+ kstat_named_t arcstat_l2_write_full;
+ kstat_named_t arcstat_l2_write_buffer_iter;
+ kstat_named_t arcstat_l2_write_pios;
+ kstat_named_t arcstat_l2_write_bytes_written;
+ kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
+ kstat_named_t arcstat_l2_write_buffer_list_iter;
+ kstat_named_t arcstat_l2_write_buffer_list_null_iter;
} arc_stats_t;
static arc_stats_t arc_stats = {
@@ -297,7 +335,9 @@ static arc_stats_t arc_stats = {
{ "mru_ghost_hits", KSTAT_DATA_UINT64 },
{ "mfu_hits", KSTAT_DATA_UINT64 },
{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
+ { "allocated", KSTAT_DATA_UINT64 },
{ "deleted", KSTAT_DATA_UINT64 },
+ { "stolen", KSTAT_DATA_UINT64 },
{ "recycle_miss", KSTAT_DATA_UINT64 },
{ "mutex_miss", KSTAT_DATA_UINT64 },
{ "evict_skip", KSTAT_DATA_UINT64 },
@@ -328,7 +368,20 @@ static arc_stats_t arc_stats = {
{ "l2_io_error", KSTAT_DATA_UINT64 },
{ "l2_size", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
- { "memory_throttle_count", KSTAT_DATA_UINT64 }
+ { "memory_throttle_count", KSTAT_DATA_UINT64 },
+ { "l2_write_trylock_fail", KSTAT_DATA_UINT64 },
+ { "l2_write_passed_headroom", KSTAT_DATA_UINT64 },
+ { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 },
+ { "l2_write_in_l2", KSTAT_DATA_UINT64 },
+ { "l2_write_io_in_progress", KSTAT_DATA_UINT64 },
+ { "l2_write_not_cacheable", KSTAT_DATA_UINT64 },
+ { "l2_write_full", KSTAT_DATA_UINT64 },
+ { "l2_write_buffer_iter", KSTAT_DATA_UINT64 },
+ { "l2_write_pios", KSTAT_DATA_UINT64 },
+ { "l2_write_bytes_written", KSTAT_DATA_UINT64 },
+ { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
+ { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 },
+ { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
@@ -336,7 +389,7 @@ static arc_stats_t arc_stats = {
#define ARCSTAT_INCR(stat, val) \
atomic_add_64(&arc_stats.stat.value.ui64, (val));
-#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
+#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
#define ARCSTAT_MAX(stat, val) { \
@@ -370,7 +423,7 @@ static arc_stats_t arc_stats = {
}
kstat_t *arc_ksp;
-static arc_state_t *arc_anon;
+static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
@@ -514,7 +567,7 @@ static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
* Hash table routines
*/
-#define HT_LOCK_PAD 128
+#define HT_LOCK_PAD CACHE_LINE_SIZE
struct ht_lock {
kmutex_t ht_lock;
@@ -527,7 +580,7 @@ struct ht_lock {
typedef struct buf_hash_table {
uint64_t ht_mask;
arc_buf_hdr_t **ht_table;
- struct ht_lock ht_locks[BUF_LOCKS];
+ struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
} buf_hash_table_t;
static buf_hash_table_t buf_hash_table;
@@ -541,13 +594,19 @@ static buf_hash_table_t buf_hash_table;
uint64_t zfs_crc64_table[256];
+#ifdef ZIO_USE_UMA
+extern kmem_cache_t *zio_buf_cache[];
+extern kmem_cache_t *zio_data_buf_cache[];
+#endif
+
/*
* Level 2 ARC
*/
-#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
-#define L2ARC_HEADROOM 4 /* num of writes */
+#define L2ARC_WRITE_SIZE (64 * 1024 * 1024) /* initial write max */
+#define L2ARC_HEADROOM 128 /* num of writes */
#define L2ARC_FEED_SECS 1 /* caching interval */
+#define L2ARC_FEED_SECS_SHIFT 1 /* caching interval shift */
#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
@@ -559,7 +618,66 @@ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
-boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+uint64_t l2arc_feed_secs_shift = L2ARC_FEED_SECS_SHIFT; /* interval seconds shift */
+boolean_t l2arc_noprefetch = B_FALSE; /* don't cache prefetch bufs */
+
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
+ &l2arc_write_max, 0, "max write size");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
+ &l2arc_write_boost, 0, "extra write during warmup");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
+ &l2arc_headroom, 0, "number of dev writes");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
+ &l2arc_feed_secs, 0, "interval seconds");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs_shift, CTLFLAG_RW,
+ &l2arc_feed_secs_shift, 0, "power of 2 division of feed seconds");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
+ &l2arc_noprefetch, 0, "don't cache prefetch bufs");
+
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
+ &ARC_anon.arcs_size, 0, "size of anonymous state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
+ &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
+ &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
+ &ARC_mru.arcs_size, 0, "size of mru state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
+ &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
+ &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
+ "size of metadata in mru ghost state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
+ "size of data in mru ghost state");
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
+ &ARC_mfu.arcs_size, 0, "size of mfu state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
+ &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
+ &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
+ "size of metadata in mfu ghost state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
+ "size of data in mfu ghost state");
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
+ &ARC_l2c_only.arcs_size, 0, "size of mru state");
/*
* L2ARC Internals
@@ -953,18 +1071,38 @@ arc_buf_freeze(arc_buf_t *buf)
}
static void
+get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
+{
+ uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
+
+ if (ab->b_type == ARC_BUFC_METADATA)
+ buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
+ else {
+ buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
+ buf_hashid += ARC_BUFC_NUMMETADATALISTS;
+ }
+
+ *list = &state->arcs_lists[buf_hashid];
+ *lock = ARCS_LOCK(state, buf_hashid);
+}
+
+
+static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
+
ASSERT(MUTEX_HELD(hash_lock));
if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
(ab->b_state != arc_anon)) {
uint64_t delta = ab->b_size * ab->b_datacnt;
- list_t *list = &ab->b_state->arcs_list[ab->b_type];
uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
+ list_t *list;
+ kmutex_t *lock;
- ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
- mutex_enter(&ab->b_state->arcs_mtx);
+ get_buf_info(ab, ab->b_state, &list, &lock);
+ ASSERT(!MUTEX_HELD(lock));
+ mutex_enter(lock);
ASSERT(list_link_active(&ab->b_arc_node));
list_remove(list, ab);
if (GHOST_STATE(ab->b_state)) {
@@ -975,7 +1113,7 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
ASSERT(delta > 0);
ASSERT3U(*size, >=, delta);
atomic_add_64(size, -delta);
- mutex_exit(&ab->b_state->arcs_mtx);
+ mutex_exit(lock);
/* remove the prefetch flag if we get a reference */
if (ab->b_flags & ARC_PREFETCH)
ab->b_flags &= ~ARC_PREFETCH;
@@ -994,14 +1132,17 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
(state != arc_anon)) {
uint64_t *size = &state->arcs_lsize[ab->b_type];
+ list_t *list;
+ kmutex_t *lock;
- ASSERT(!MUTEX_HELD(&state->arcs_mtx));
- mutex_enter(&state->arcs_mtx);
+ get_buf_info(ab, state, &list, &lock);
+ ASSERT(!MUTEX_HELD(lock));
+ mutex_enter(lock);
ASSERT(!list_link_active(&ab->b_arc_node));
- list_insert_head(&state->arcs_list[ab->b_type], ab);
+ list_insert_head(list, ab);
ASSERT(ab->b_datacnt > 0);
atomic_add_64(size, ab->b_size * ab->b_datacnt);
- mutex_exit(&state->arcs_mtx);
+ mutex_exit(lock);
}
return (cnt);
}
@@ -1016,6 +1157,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
arc_state_t *old_state = ab->b_state;
int64_t refcnt = refcount_count(&ab->b_refcnt);
uint64_t from_delta, to_delta;
+ list_t *list;
+ kmutex_t *lock;
ASSERT(MUTEX_HELD(hash_lock));
ASSERT(new_state != old_state);
@@ -1030,14 +1173,16 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
*/
if (refcnt == 0) {
if (old_state != arc_anon) {
- int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
+ int use_mutex;
uint64_t *size = &old_state->arcs_lsize[ab->b_type];
+ get_buf_info(ab, old_state, &list, &lock);
+ use_mutex = !MUTEX_HELD(lock);
if (use_mutex)
- mutex_enter(&old_state->arcs_mtx);
+ mutex_enter(lock);
ASSERT(list_link_active(&ab->b_arc_node));
- list_remove(&old_state->arcs_list[ab->b_type], ab);
+ list_remove(list, ab);
/*
* If prefetching out of the ghost cache,
@@ -1052,16 +1197,18 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
atomic_add_64(size, -from_delta);
if (use_mutex)
- mutex_exit(&old_state->arcs_mtx);
+ mutex_exit(lock);
}
if (new_state != arc_anon) {
- int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
+ int use_mutex;
uint64_t *size = &new_state->arcs_lsize[ab->b_type];
+ get_buf_info(ab, new_state, &list, &lock);
+ use_mutex = !MUTEX_HELD(lock);
if (use_mutex)
- mutex_enter(&new_state->arcs_mtx);
+ mutex_enter(lock);
- list_insert_head(&new_state->arcs_list[ab->b_type], ab);
+ list_insert_head(list, ab);
/* ghost elements have a ghost size */
if (GHOST_STATE(new_state)) {
@@ -1072,7 +1219,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
atomic_add_64(size, to_delta);
if (use_mutex)
- mutex_exit(&new_state->arcs_mtx);
+ mutex_exit(lock);
}
}
@@ -1462,21 +1609,48 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
{
arc_state_t *evicted_state;
uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
+ int64_t bytes_remaining;
arc_buf_hdr_t *ab, *ab_prev = NULL;
- list_t *list = &state->arcs_list[type];
+ list_t *evicted_list, *list, *evicted_list_start, *list_start;
+ kmutex_t *lock, *evicted_lock;
kmutex_t *hash_lock;
boolean_t have_lock;
void *stolen = NULL;
+ static int evict_metadata_offset, evict_data_offset;
+ int i, idx, offset, list_count, count;
ASSERT(state == arc_mru || state == arc_mfu);
evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
- mutex_enter(&state->arcs_mtx);
- mutex_enter(&evicted_state->arcs_mtx);
+ if (type == ARC_BUFC_METADATA) {
+ offset = 0;
+ list_count = ARC_BUFC_NUMMETADATALISTS;
+ list_start = &state->arcs_lists[0];
+ evicted_list_start = &evicted_state->arcs_lists[0];
+ idx = evict_metadata_offset;
+ } else {
+ offset = ARC_BUFC_NUMMETADATALISTS;
+ list_start = &state->arcs_lists[offset];
+ evicted_list_start = &evicted_state->arcs_lists[offset];
+ list_count = ARC_BUFC_NUMDATALISTS;
+ idx = evict_data_offset;
+ }
+ bytes_remaining = evicted_state->arcs_lsize[type];
+ count = 0;
+
+evict_start:
+ list = &list_start[idx];
+ evicted_list = &evicted_list_start[idx];
+ lock = ARCS_LOCK(state, (offset + idx));
+ evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
+
+ mutex_enter(lock);
+ mutex_enter(evicted_lock);
for (ab = list_tail(list); ab; ab = ab_prev) {
ab_prev = list_prev(list, ab);
+ bytes_remaining -= (ab->b_size * ab->b_datacnt);
/* prefetch buffers have a minimum lifespan */
if (HDR_IO_IN_PROGRESS(ab) ||
(spa && ab->b_spa != spa) ||
@@ -1536,17 +1710,35 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
mutex_exit(hash_lock);
if (bytes >= 0 && bytes_evicted >= bytes)
break;
+ if (bytes_remaining > 0) {
+ mutex_exit(evicted_lock);
+ mutex_exit(lock);
+ idx = ((idx + 1) & (list_count - 1));
+ count++;
+ goto evict_start;
+ }
} else {
missed += 1;
}
}
- mutex_exit(&evicted_state->arcs_mtx);
- mutex_exit(&state->arcs_mtx);
+ mutex_exit(evicted_lock);
+ mutex_exit(lock);
- if (bytes_evicted < bytes)
- dprintf("only evicted %lld bytes from %x",
- (longlong_t)bytes_evicted, state);
+ idx = ((idx + 1) & (list_count - 1));
+ count++;
+
+ if (bytes_evicted < bytes) {
+ if (count < list_count)
+ goto evict_start;
+ else
+ dprintf("only evicted %lld bytes from %x",
+ (longlong_t)bytes_evicted, state);
+ }
+ if (type == ARC_BUFC_METADATA)
+ evict_metadata_offset = idx;
+ else
+ evict_data_offset = idx;
if (skipped)
ARCSTAT_INCR(arcstat_evict_skip, skipped);
@@ -1574,6 +1766,8 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
}
}
+ if (stolen)
+ ARCSTAT_BUMP(arcstat_stolen);
return (stolen);
}
@@ -1586,14 +1780,28 @@ static void
arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
{
arc_buf_hdr_t *ab, *ab_prev;
- list_t *list = &state->arcs_list[ARC_BUFC_DATA];
- kmutex_t *hash_lock;
+ list_t *list, *list_start;
+ kmutex_t *hash_lock, *lock;
uint64_t bytes_deleted = 0;
uint64_t bufs_skipped = 0;
+ static int evict_offset;
+ int list_count, idx = evict_offset;
+ int offset, count = 0;
ASSERT(GHOST_STATE(state));
-top:
- mutex_enter(&state->arcs_mtx);
+
+ /*
+ * data lists come after metadata lists
+ */
+ list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
+ list_count = ARC_BUFC_NUMDATALISTS;
+ offset = ARC_BUFC_NUMMETADATALISTS;
+
+evict_start:
+ list = &list_start[idx];
+ lock = ARCS_LOCK(state, idx + offset);
+
+ mutex_enter(lock);
for (ab = list_tail(list); ab; ab = ab_prev) {
ab_prev = list_prev(list, ab);
if (spa && ab->b_spa != spa)
@@ -1623,20 +1831,31 @@ top:
break;
} else {
if (bytes < 0) {
- mutex_exit(&state->arcs_mtx);
+ /*
+ * we're draining the ARC, retry
+ */
+ mutex_exit(lock);
mutex_enter(hash_lock);
mutex_exit(hash_lock);
- goto top;
+ goto evict_start;
}
bufs_skipped += 1;
}
}
- mutex_exit(&state->arcs_mtx);
+ mutex_exit(lock);
+ idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
+ count++;
- if (list == &state->arcs_list[ARC_BUFC_DATA] &&
+ if (count < list_count)
+ goto evict_start;
+
+ evict_offset = idx;
+ if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
(bytes < 0 || bytes_deleted < bytes)) {
- list = &state->arcs_list[ARC_BUFC_METADATA];
- goto top;
+ list_start = &state->arcs_lists[0];
+ list_count = ARC_BUFC_NUMMETADATALISTS;
+ offset = count = 0;
+ goto evict_start;
}
if (bufs_skipped) {
@@ -1718,7 +1937,7 @@ arc_do_user_evicts(void)
/*
* Move list over to avoid LOR
*/
-restart:
+restart:
mutex_enter(&arc_eviction_mtx);
tmp_arc_eviction_list = arc_eviction_list;
arc_eviction_list = NULL;
@@ -1750,22 +1969,22 @@ restart:
void
arc_flush(spa_t *spa)
{
- while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
+ while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA);
if (spa)
break;
}
- while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
+ while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA);
if (spa)
break;
}
- while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
+ while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA);
if (spa)
break;
}
- while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
+ while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA);
if (spa)
break;
@@ -1829,7 +2048,7 @@ arc_reclaim_needed(void)
return (0);
/*
- * If pages are needed or we're within 2048 pages
+ * If pages are needed or we're within 2048 pages
* of needing to page need to reclaim
*/
if (vm_pages_needed || (vm_paging_target() > -2048))
@@ -1896,8 +2115,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
size_t i;
kmem_cache_t *prev_cache = NULL;
kmem_cache_t *prev_data_cache = NULL;
- extern kmem_cache_t *zio_buf_cache[];
- extern kmem_cache_t *zio_data_buf_cache[];
#endif
#ifdef _KERNEL
@@ -2203,6 +2420,7 @@ out:
arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
arc_p = MIN(arc_c, arc_p + size);
}
+ ARCSTAT_BUMP(arcstat_allocated);
}
/*
@@ -2502,7 +2720,6 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
uint32_t *arc_flags, const zbookmark_t *zb)
{
int err;
- arc_buf_hdr_t *hdr = pbuf->b_hdr;
ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
@@ -2510,8 +2727,6 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
err = arc_read_nolock(pio, spa, bp, done, private, priority,
zio_flags, arc_flags, zb);
-
- ASSERT3P(hdr, ==, pbuf->b_hdr);
rw_exit(&pbuf->b_lock);
return (err);
}
@@ -2728,7 +2943,7 @@ top:
* released by l2arc_read_done().
*/
rzio = zio_read_phys(pio, vd, addr, size,
- buf->b_data, ZIO_CHECKSUM_OFF,
+ buf->b_data, ZIO_CHECKSUM_OFF,
l2arc_read_done, cb, priority, zio_flags |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE |
@@ -2823,6 +3038,8 @@ arc_buf_evict(arc_buf_t *buf)
arc_buf_hdr_t *hdr;
kmutex_t *hash_lock;
arc_buf_t **bufp;
+ list_t *list, *evicted_list;
+ kmutex_t *lock, *evicted_lock;
rw_enter(&buf->b_lock, RW_WRITER);
hdr = buf->b_hdr;
@@ -2871,16 +3088,18 @@ arc_buf_evict(arc_buf_t *buf)
evicted_state =
(old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
- mutex_enter(&old_state->arcs_mtx);
- mutex_enter(&evicted_state->arcs_mtx);
+ get_buf_info(hdr, old_state, &list, &lock);
+ get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock);
+ mutex_enter(lock);
+ mutex_enter(evicted_lock);
arc_change_state(evicted_state, hdr, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(hdr));
hdr->b_flags |= ARC_IN_HASH_TABLE;
hdr->b_flags &= ~ARC_BUF_AVAILABLE;
- mutex_exit(&evicted_state->arcs_mtx);
- mutex_exit(&old_state->arcs_mtx);
+ mutex_exit(evicted_lock);
+ mutex_exit(lock);
}
mutex_exit(hash_lock);
rw_exit(&buf->b_lock);
@@ -3426,7 +3645,8 @@ void
arc_init(void)
{
int prefetch_tunable_set = 0;
-
+ int i;
+
mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -3494,33 +3714,33 @@ arc_init(void)
arc_l2c_only = &ARC_l2c_only;
arc_size = 0;
- mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-
- list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
+ mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+
+ list_create(&arc_mru->arcs_lists[i],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru_ghost->arcs_lists[i],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu->arcs_lists[i],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_lists[i],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_lists[i],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_l2c_only->arcs_lists[i],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ }
buf_init();
@@ -3557,7 +3777,7 @@ arc_init(void)
#ifdef _KERNEL
if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
prefetch_tunable_set = 1;
-
+
#ifdef __i386__
if (prefetch_tunable_set == 0) {
printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
@@ -3566,7 +3786,7 @@ arc_init(void)
"to /boot/loader.conf.\n");
zfs_prefetch_disable=1;
}
-#else
+#else
if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
prefetch_tunable_set == 0) {
printf("ZFS NOTICE: Prefetch is disabled by default if less "
@@ -3575,7 +3795,7 @@ arc_init(void)
"to /boot/loader.conf.\n");
zfs_prefetch_disable=1;
}
-#endif
+#endif
/* Warn about ZFS memory and address space requirements. */
if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
@@ -3594,6 +3814,7 @@ arc_init(void)
void
arc_fini(void)
{
+ int i;
mutex_enter(&arc_reclaim_thr_lock);
arc_thread_exit = 1;
@@ -3615,20 +3836,20 @@ arc_fini(void)
mutex_destroy(&arc_reclaim_thr_lock);
cv_destroy(&arc_reclaim_thr_cv);
- list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
- list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
- list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
- list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
- list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
- list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
- list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
- list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
-
- mutex_destroy(&arc_anon->arcs_mtx);
- mutex_destroy(&arc_mru->arcs_mtx);
- mutex_destroy(&arc_mru_ghost->arcs_mtx);
- mutex_destroy(&arc_mfu->arcs_mtx);
- mutex_destroy(&arc_mfu_ghost->arcs_mtx);
+ for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
+ list_destroy(&arc_mru->arcs_lists[i]);
+ list_destroy(&arc_mru_ghost->arcs_lists[i]);
+ list_destroy(&arc_mfu->arcs_lists[i]);
+ list_destroy(&arc_mfu_ghost->arcs_lists[i]);
+ list_destroy(&arc_l2c_only->arcs_lists[i]);
+
+ mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
+ mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
+ mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
+ mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
+ mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
+ mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
+ }
mutex_destroy(&zfs_write_limit_lock);
@@ -4024,26 +4245,27 @@ static list_t *
l2arc_list_locked(int list_num, kmutex_t **lock)
{
list_t *list;
-
- ASSERT(list_num >= 0 && list_num <= 3);
-
- switch (list_num) {
- case 0:
- list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
- *lock = &arc_mfu->arcs_mtx;
- break;
- case 1:
- list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
- *lock = &arc_mru->arcs_mtx;
- break;
- case 2:
- list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
- *lock = &arc_mfu->arcs_mtx;
- break;
- case 3:
- list = &arc_mru->arcs_list[ARC_BUFC_DATA];
- *lock = &arc_mru->arcs_mtx;
- break;
+ int idx;
+
+ ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
+
+ if (list_num < ARC_BUFC_NUMMETADATALISTS) {
+ idx = list_num;
+ list = &arc_mfu->arcs_lists[idx];
+ *lock = ARCS_LOCK(arc_mfu, idx);
+ } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
+ idx = list_num - ARC_BUFC_NUMMETADATALISTS;
+ list = &arc_mru->arcs_lists[idx];
+ *lock = ARCS_LOCK(arc_mru, idx);
+ } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
+ ARC_BUFC_NUMDATALISTS)) {
+ idx = list_num - ARC_BUFC_NUMMETADATALISTS;
+ list = &arc_mfu->arcs_lists[idx];
+ *lock = ARCS_LOCK(arc_mfu, idx);
+ } else {
+ idx = list_num - ARC_BUFC_NUMLISTS;
+ list = &arc_mru->arcs_lists[idx];
+ *lock = ARCS_LOCK(arc_mru, idx);
}
ASSERT(!(MUTEX_HELD(*lock)));
@@ -4210,13 +4432,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
head->b_flags |= ARC_L2_WRITE_HEAD;
+ ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
/*
* Copy buffers for L2ARC writing.
*/
mutex_enter(&l2arc_buflist_mtx);
- for (try = 0; try <= 3; try++) {
+ for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
list = l2arc_list_locked(try, &list_lock);
passed_sz = 0;
+ ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
/*
* L2ARC fast warmup.
@@ -4229,52 +4453,65 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ab = list_head(list);
else
ab = list_tail(list);
+ if (ab == NULL)
+ ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
for (; ab; ab = ab_prev) {
if (arc_warm == B_FALSE)
ab_prev = list_next(list, ab);
else
ab_prev = list_prev(list, ab);
+ ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
hash_lock = HDR_LOCK(ab);
have_lock = MUTEX_HELD(hash_lock);
if (!have_lock && !mutex_tryenter(hash_lock)) {
+ ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
/*
* Skip this buffer rather than waiting.
*/
continue;
}
+ if (ab->b_l2hdr != NULL) {
+ /*
+ * Already in L2ARC.
+ */
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_l2_write_in_l2);
+ continue;
+ }
+
passed_sz += ab->b_size;
if (passed_sz > headroom) {
/*
* Searched too far.
*/
mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
break;
}
if (ab->b_spa != spa) {
mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
continue;
}
- if (ab->b_l2hdr != NULL) {
- /*
- * Already in L2ARC.
- */
+ if (HDR_IO_IN_PROGRESS(ab)) {
mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
continue;
}
-
- if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) {
+ if (!HDR_L2CACHE(ab)) {
mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
continue;
}
-
if ((write_sz + ab->b_size) > target_sz) {
full = B_TRUE;
mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_l2_write_full);
break;
}
@@ -4298,8 +4535,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
cb->l2wcb_head = head;
pio = zio_root(spa, l2arc_write_done, cb,
ZIO_FLAG_CANFAIL);
+ ARCSTAT_BUMP(arcstat_l2_write_pios);
}
+ ARCSTAT_INCR(arcstat_l2_write_bytes_written, ab->b_size);
/*
* Create and add a new L2ARC header.
*/
@@ -4395,7 +4634,7 @@ l2arc_feed_thread(void *dummy __unused)
*/
CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
- hz * l2arc_feed_secs);
+ hz * l2arc_feed_secs >> l2arc_feed_secs_shift);
CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index 2494c1e..3bf0939 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -2210,9 +2210,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
if (BP_IS_HOLE(ibp))
continue;
- ASSERT3U(BP_GET_LSIZE(ibp), ==,
- db->db_level == 1 ? dn->dn_datablksz :
- (1<<dn->dn_phys->dn_indblkshift));
fill += ibp->blk_fill;
}
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
index cddc64d..5cc56a9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
@@ -49,11 +49,11 @@ uint32_t zfetch_block_cap = 256;
uint64_t zfetch_array_rd_sz = 1024 * 1024;
SYSCTL_DECL(_vfs_zfs);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN,
+SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW,
&zfs_prefetch_disable, 0, "Disable prefetch");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH");
TUNABLE_INT("vfs.zfs.zfetch.max_streams", &zfetch_max_streams);
-SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RDTUN,
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RW,
&zfetch_max_streams, 0, "Max # of streams per zfetch");
TUNABLE_INT("vfs.zfs.zfetch.min_sec_reap", &zfetch_min_sec_reap);
SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RDTUN,
@@ -338,8 +338,10 @@ top:
reset = !prefetched && zs->zst_len > 1;
- mutex_enter(&zs->zst_lock);
-
+ if (mutex_tryenter(&zs->zst_lock) == 0) {
+ rc = 1;
+ goto out;
+ }
if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
mutex_exit(&zs->zst_lock);
goto top;
@@ -363,8 +365,10 @@ top:
reset = !prefetched && zs->zst_len > 1;
- mutex_enter(&zs->zst_lock);
-
+ if (mutex_tryenter(&zs->zst_lock) == 0) {
+ rc = 1;
+ goto out;
+ }
if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
mutex_exit(&zs->zst_lock);
goto top;
@@ -391,8 +395,10 @@ top:
zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
/* strided forward access */
- mutex_enter(&zs->zst_lock);
-
+ if (mutex_tryenter(&zs->zst_lock) == 0) {
+ rc = 1;
+ goto out;
+ }
if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
mutex_exit(&zs->zst_lock);
@@ -408,8 +414,10 @@ top:
zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
/* strided reverse access */
- mutex_enter(&zs->zst_lock);
-
+ if (mutex_tryenter(&zs->zst_lock) == 0) {
+ rc = 1;
+ goto out;
+ }
if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
mutex_exit(&zs->zst_lock);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index 163b215..90861ba 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -62,6 +62,14 @@
#include "zfs_prop.h"
#include "zfs_comutil.h"
+/* Check hostid on import? */
+static int check_hostid = 1;
+
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
+ "Check hostid on import?");
+
int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
/* ISSUE INTR */
{ 1, 1 }, /* ZIO_TYPE_NULL */
@@ -1168,7 +1176,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
- if (hostid != 0 && myhostid != 0 &&
+ if (check_hostid && hostid != 0 && myhostid != 0 &&
(unsigned long)hostid != myhostid) {
cmn_err(CE_WARN, "pool '%s' could not be "
"loaded as it was last accessed by "
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
index 4c41f90..819473e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
@@ -29,6 +29,7 @@
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
+#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
@@ -102,7 +103,7 @@ vdev_geom_orphan(struct g_consumer *cp)
}
static struct g_consumer *
-vdev_geom_attach(struct g_provider *pp, int write)
+vdev_geom_attach(struct g_provider *pp)
{
struct g_geom *gp;
struct g_consumer *cp;
@@ -126,7 +127,7 @@ vdev_geom_attach(struct g_provider *pp, int write)
g_wither_geom(gp, ENXIO);
return (NULL);
}
- if (g_access(cp, 1, write, 1) != 0) {
+ if (g_access(cp, 1, 0, 1) != 0) {
g_wither_geom(gp, ENXIO);
return (NULL);
}
@@ -145,14 +146,14 @@ vdev_geom_attach(struct g_provider *pp, int write)
g_destroy_consumer(cp);
return (NULL);
}
- if (g_access(cp, 1, write, 1) != 0) {
+ if (g_access(cp, 1, 0, 1) != 0) {
g_detach(cp);
g_destroy_consumer(cp);
return (NULL);
}
ZFS_LOG(1, "Created consumer for %s.", pp->name);
} else {
- if (g_access(cp, 1, cp->acw > 0 ? 0 : write, 1) != 0)
+ if (g_access(cp, 1, 0, 1) != 0)
return (NULL);
ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
}
@@ -342,7 +343,6 @@ vdev_geom_read_guid(struct g_consumer *cp)
struct vdev_geom_find {
uint64_t guid;
- int write;
struct g_consumer *cp;
};
@@ -394,10 +394,10 @@ vdev_geom_attach_by_guid_event(void *arg, int flags __unused)
g_detach(zcp);
if (guid != ap->guid)
continue;
- ap->cp = vdev_geom_attach(pp, ap->write);
+ ap->cp = vdev_geom_attach(pp);
if (ap->cp == NULL) {
- printf("ZFS WARNING: Cannot open %s "
- "for writting.\n", pp->name);
+ printf("ZFS WARNING: Unable to attach to %s.\n",
+ pp->name);
continue;
}
goto end;
@@ -411,14 +411,13 @@ end:
}
static struct g_consumer *
-vdev_geom_attach_by_guid(uint64_t guid, int write)
+vdev_geom_attach_by_guid(uint64_t guid)
{
struct vdev_geom_find *ap;
struct g_consumer *cp;
ap = kmem_zalloc(sizeof(*ap), KM_SLEEP);
ap->guid = guid;
- ap->write = write;
g_waitfor_event(vdev_geom_attach_by_guid_event, ap, M_WAITOK, NULL);
cp = ap->cp;
kmem_free(ap, sizeof(*ap));
@@ -433,7 +432,7 @@ vdev_geom_open_by_guid(vdev_t *vd)
size_t len;
ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid);
- cp = vdev_geom_attach_by_guid(vd->vdev_guid, !!(spa_mode & FWRITE));
+ cp = vdev_geom_attach_by_guid(vd->vdev_guid);
if (cp != NULL) {
len = strlen(cp->provider->name) + strlen("/dev/") + 1;
buf = kmem_alloc(len, KM_SLEEP);
@@ -464,7 +463,7 @@ vdev_geom_open_by_path(vdev_t *vd, int check_guid)
pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
if (pp != NULL) {
ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
- cp = vdev_geom_attach(pp, !!(spa_mode & FWRITE));
+ cp = vdev_geom_attach(pp);
if (cp != NULL && check_guid) {
g_topology_unlock();
guid = vdev_geom_read_guid(cp);
@@ -492,7 +491,7 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
vdev_geom_ctx_t *ctx;
struct g_provider *pp;
struct g_consumer *cp;
- int owned;
+ int error, owned;
/*
* We must have a pathname, and it must be absolute.
@@ -506,26 +505,47 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
if ((owned = mtx_owned(&Giant)))
mtx_unlock(&Giant);
- cp = vdev_geom_open_by_path(vd, 1);
- if (cp == NULL) {
- /*
- * The device at vd->vdev_path doesn't have the expected guid.
- * The disks might have merely moved around so try all other
- * geom providers to find one with the right guid.
- */
- cp = vdev_geom_open_by_guid(vd);
- }
- if (cp == NULL)
+ error = 0;
+
+ /*
+ * If we're creating a pool, just find the GEOM provider by its name
+ * and ignore GUID mismatches.
+ */
+ if (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE)
cp = vdev_geom_open_by_path(vd, 0);
+ else {
+ cp = vdev_geom_open_by_path(vd, 1);
+ if (cp == NULL) {
+ /*
+ * The device at vd->vdev_path doesn't have the
+ * expected guid. The disks might have merely
+ * moved around so try all other GEOM providers
+ * to find one with the right guid.
+ */
+ cp = vdev_geom_open_by_guid(vd);
+ }
+ }
+
if (cp == NULL) {
ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- if (owned)
- mtx_lock(&Giant);
- return (EACCES);
+ error = ENOENT;
+ } else if (cp->acw == 0 && (spa_mode & FWRITE) != 0) {
+ g_topology_lock();
+ error = g_access(cp, 0, 1, 0);
+ if (error != 0) {
+ printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
+ vd->vdev_path, error);
+ vdev_geom_detach(cp, 0);
+ cp = NULL;
+ }
+ g_topology_unlock();
}
if (owned)
mtx_lock(&Giant);
+ if (cp == NULL) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
cp->private = vd;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
index 1132483..07c9b61 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -214,7 +214,7 @@ blksz_changed_cb(void *arg, uint64_t newval)
newval = SPA_MAXBLOCKSIZE;
zfsvfs->z_max_blksz = newval;
- zfsvfs->z_vfs->vfs_bsize = newval;
+ zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
}
static void
@@ -577,7 +577,8 @@ zfs_domount(vfs_t *vfsp, char *osname)
if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
NULL))
goto out;
- zfsvfs->z_vfs->vfs_bsize = recordsize;
+ zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
+ zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
vfsp->vfs_data = zfsvfs;
vfsp->mnt_flag |= MNT_LOCAL;
@@ -817,8 +818,8 @@ zfs_statfs(vfs_t *vfsp, struct statfs *statp)
* We report the fragsize as the smallest block size we support,
* and we report our blocksize as the filesystem's maximum blocksize.
*/
- statp->f_bsize = zfsvfs->z_vfs->vfs_bsize;
- statp->f_iosize = zfsvfs->z_vfs->vfs_bsize;
+ statp->f_bsize = SPA_MINBLOCKSIZE;
+ statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
/*
* The following report "total" blocks of various kinds in the
@@ -826,7 +827,7 @@ zfs_statfs(vfs_t *vfsp, struct statfs *statp)
* "fragment" size.
*/
- statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize;
+ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
statp->f_bfree = availbytes / statp->f_bsize;
statp->f_bavail = statp->f_bfree; /* no root reservation */
@@ -867,13 +868,15 @@ zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
ZFS_ENTER_NOERROR(zfsvfs);
error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+
+ ZFS_EXIT(zfsvfs);
+
if (error == 0) {
*vpp = ZTOV(rootzp);
error = vn_lock(*vpp, flags);
(*vpp)->v_vflag |= VV_ROOT;
}
- ZFS_EXIT(zfsvfs);
return (error);
}
@@ -1142,13 +1145,13 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
VN_RELE(ZTOV(zp));
err = EINVAL;
}
+ ZFS_EXIT(zfsvfs);
if (err != 0)
*vpp = NULL;
else {
*vpp = ZTOV(zp);
vn_lock(*vpp, flags);
}
- ZFS_EXIT(zfsvfs);
return (err);
}
@@ -1236,8 +1239,8 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
} else {
VN_HOLD(*vpp);
}
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
ZFS_EXIT(zfsvfs);
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
return (0);
}
@@ -1258,10 +1261,11 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
return (EINVAL);
}
+ ZFS_EXIT(zfsvfs);
+
*vpp = ZTOV(zp);
vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
vnode_create_vobject(*vpp, zp->z_phys->zp_size, curthread);
- ZFS_EXIT(zfsvfs);
return (0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index 4f61f5f..59a58dd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -1209,15 +1209,17 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
ltype = VOP_ISLOCKED(dvp);
VOP_UNLOCK(dvp, 0);
}
+ ZFS_EXIT(zfsvfs);
error = vn_lock(*vpp, cnp->cn_lkflags);
if (cnp->cn_flags & ISDOTDOT)
vn_lock(dvp, ltype | LK_RETRY);
if (error != 0) {
VN_RELE(*vpp);
*vpp = NULL;
- ZFS_EXIT(zfsvfs);
return (error);
}
+ } else {
+ ZFS_EXIT(zfsvfs);
}
#ifdef FREEBSD_NAMECACHE
@@ -1237,8 +1239,6 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
}
#endif
- ZFS_EXIT(zfsvfs);
-
return (error);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index 4650d42..e7227f2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -91,13 +91,6 @@ zio_init(void)
#ifdef ZIO_USE_UMA
size_t c;
#endif
-#if 0
- vmem_t *data_alloc_arena = NULL;
-
-#ifdef _KERNEL
- data_alloc_arena = zio_alloc_arena;
-#endif
-#endif
zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
NULL, NULL, NULL, NULL, NULL, 0);
@@ -132,8 +125,7 @@ zio_init(void)
(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
zio_data_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, data_alloc_arena,
- KMC_NODEBUG);
+ align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
}
}
diff --git a/sys/modules/zfs/Makefile b/sys/modules/zfs/Makefile
index c95a840..4554088 100644
--- a/sys/modules/zfs/Makefile
+++ b/sys/modules/zfs/Makefile
@@ -63,8 +63,8 @@ ZFS_SRCS= ${ZFS_OBJS:C/.o$/.c/}
SRCS+= ${ZFS_SRCS}
SRCS+= vdev_geom.c
-# Use UMA for ZIO allocation. This is not stable.
-#CFLAGS+=-DZIO_USE_UMA
+# Use UMA for ZIO allocation.
+CFLAGS+=-DZIO_USE_UMA
# Use FreeBSD's namecache.
CFLAGS+=-DFREEBSD_NAMECACHE
diff --git a/sys/sys/ioccom.h b/sys/sys/ioccom.h
index be2ce66..5669088 100644
--- a/sys/sys/ioccom.h
+++ b/sys/sys/ioccom.h
@@ -38,12 +38,13 @@
* any in or out parameters in the upper word. The high 3 bits of the
* upper word are used to encode the in/out status of the parameter.
*/
-#define IOCPARM_MASK 0x1fff /* parameter length, at most 13 bits */
+#define IOCPARM_SHIFT 13 /* number of bits for ioctl size */
+#define IOCPARM_MASK ((1 << IOCPARM_SHIFT) - 1) /* parameter length mask */
#define IOCPARM_LEN(x) (((x) >> 16) & IOCPARM_MASK)
#define IOCBASECMD(x) ((x) & ~(IOCPARM_MASK << 16))
#define IOCGROUP(x) (((x) >> 8) & 0xff)
-#define IOCPARM_MAX PAGE_SIZE /* max size of ioctl, mult. of PAGE_SIZE */
+#define IOCPARM_MAX (1 << IOCPARM_SHIFT) /* max size of ioctl */
#define IOC_VOID 0x20000000 /* no parameters */
#define IOC_OUT 0x40000000 /* copy out parameters */
#define IOC_IN 0x80000000 /* copy in parameters */
OpenPOWER on IntegriCloud