diff options
-rw-r--r-- | sys/boot/zfs/zfs.c | 2 | ||||
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c | 507 | ||||
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c | 3 | ||||
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c | 28 | ||||
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c | 10 | ||||
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c | 74 | ||||
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c | 22 | ||||
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c | 6 | ||||
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c | 10 | ||||
-rw-r--r-- | sys/modules/zfs/Makefile | 4 | ||||
-rw-r--r-- | sys/sys/ioccom.h | 5 |
11 files changed, 470 insertions, 201 deletions
diff --git a/sys/boot/zfs/zfs.c b/sys/boot/zfs/zfs.c index 52df773..99bb60a 100644 --- a/sys/boot/zfs/zfs.c +++ b/sys/boot/zfs/zfs.c @@ -397,7 +397,7 @@ zfs_dev_init(void) /* * Open all the disks we can find and see if we can reconstruct * ZFS pools from them. Bogusly assumes that the disks are named - * diskN or diskNsM. + * diskN, diskNpM or diskNsM. */ zfs_init(); for (unit = 0; unit < 32 /* XXX */; unit++) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 83a4c97..ceb4c87 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -186,6 +186,11 @@ SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, &zfs_mdcomp_disable, 0, "Disable metadata compression"); +#ifdef ZIO_USE_UMA +extern kmem_cache_t *zio_buf_cache[]; +extern kmem_cache_t *zio_data_buf_cache[]; +#endif + /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) @@ -218,13 +223,31 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, * second level ARC benefit from these fast lookups. */ +#define ARCS_LOCK_PAD CACHE_LINE_SIZE +struct arcs_lock { + kmutex_t arcs_lock; +#ifdef _KERNEL + unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))]; +#endif +}; + +/* + * must be power of two for mask use to work + * + */ +#define ARC_BUFC_NUMDATALISTS 16 +#define ARC_BUFC_NUMMETADATALISTS 16 +#define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS) + typedef struct arc_state { - list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ uint64_t arcs_size; /* total amount of data in this state */ - kmutex_t arcs_mtx; + list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */ + struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE); } arc_state_t; +#define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock)) + /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; @@ -248,7 +271,9 @@ typedef struct arc_stats { kstat_named_t arcstat_mru_ghost_hits; kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; + kstat_named_t arcstat_allocated; kstat_named_t arcstat_deleted; + kstat_named_t arcstat_stolen; kstat_named_t arcstat_recycle_miss; kstat_named_t arcstat_mutex_miss; kstat_named_t arcstat_evict_skip; @@ -280,6 +305,19 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_l2_write_trylock_fail; + kstat_named_t arcstat_l2_write_passed_headroom; + kstat_named_t arcstat_l2_write_spa_mismatch; + kstat_named_t arcstat_l2_write_in_l2; + kstat_named_t arcstat_l2_write_hdr_io_in_progress; + kstat_named_t arcstat_l2_write_not_cacheable; + kstat_named_t arcstat_l2_write_full; + kstat_named_t arcstat_l2_write_buffer_iter; + kstat_named_t arcstat_l2_write_pios; + kstat_named_t arcstat_l2_write_bytes_written; + kstat_named_t arcstat_l2_write_buffer_bytes_scanned; + kstat_named_t arcstat_l2_write_buffer_list_iter; + kstat_named_t arcstat_l2_write_buffer_list_null_iter; } arc_stats_t; static arc_stats_t arc_stats = { @@ -297,7 +335,9 @@ static arc_stats_t arc_stats = { { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, + { "allocated", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, + { "stolen", KSTAT_DATA_UINT64 }, { "recycle_miss", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, @@ -328,7 +368,20 @@ static arc_stats_t arc_stats = { { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "memory_throttle_count", KSTAT_DATA_UINT64 } + { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, + { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, + { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, + { "l2_write_in_l2", KSTAT_DATA_UINT64 }, + { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, + { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, + { "l2_write_full", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, + { "l2_write_pios", KSTAT_DATA_UINT64 }, + { "l2_write_bytes_written", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -336,7 +389,7 @@ static arc_stats_t arc_stats = { #define ARCSTAT_INCR(stat, val) \ atomic_add_64(&arc_stats.stat.value.ui64, (val)); -#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) +#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define ARCSTAT_MAX(stat, val) { \ @@ -370,7 +423,7 @@ static arc_stats_t arc_stats = { } kstat_t *arc_ksp; -static arc_state_t *arc_anon; +static arc_state_t *arc_anon; static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; @@ -514,7 +567,7 @@ static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); * Hash table routines */ -#define HT_LOCK_PAD 128 +#define HT_LOCK_PAD CACHE_LINE_SIZE struct ht_lock { kmutex_t ht_lock; @@ -527,7 +580,7 @@ struct ht_lock { typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; - struct ht_lock ht_locks[BUF_LOCKS]; + struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); } buf_hash_table_t; static buf_hash_table_t buf_hash_table; @@ -541,13 +594,19 @@ static buf_hash_table_t buf_hash_table; uint64_t zfs_crc64_table[256]; +#ifdef ZIO_USE_UMA +extern kmem_cache_t *zio_buf_cache[]; +extern kmem_cache_t *zio_data_buf_cache[]; +#endif + /* * Level 2 ARC */ -#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ -#define L2ARC_HEADROOM 4 /* num of writes */ +#define L2ARC_WRITE_SIZE (64 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 128 /* num of writes */ #define L2ARC_FEED_SECS 1 /* caching interval */ +#define L2ARC_FEED_SECS_SHIFT 1 /* caching interval shift */ #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -559,7 +618,66 @@ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +uint64_t l2arc_feed_secs_shift = L2ARC_FEED_SECS_SHIFT; /* interval seconds shift */ +boolean_t l2arc_noprefetch = B_FALSE; /* don't cache prefetch bufs */ + + +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, + &l2arc_write_max, 0, "max write size"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, + &l2arc_write_boost, 0, "extra write during warmup"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, + &l2arc_headroom, 0, "number of dev writes"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, + &l2arc_feed_secs, 0, "interval seconds"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs_shift, CTLFLAG_RW, + &l2arc_feed_secs_shift, 0, "power of 2 division of feed seconds"); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, + &l2arc_noprefetch, 0, "don't cache prefetch bufs"); + + +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, + &ARC_anon.arcs_size, 0, "size of anonymous state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, + &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, + &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); + +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, + &ARC_mru.arcs_size, 0, "size of mru state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, + &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, + &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); + +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, + &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, + "size of metadata in mru ghost state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, + "size of data in mru ghost state"); + +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, + &ARC_mfu.arcs_size, 0, "size of mfu state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, + &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, + &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); + +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, + "size of metadata in mfu ghost state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, + "size of data in mfu ghost state"); + +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, + &ARC_l2c_only.arcs_size, 0, "size of mru state"); /* * L2ARC Internals @@ -953,18 +1071,38 @@ arc_buf_freeze(arc_buf_t *buf) } static void +get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock) +{ + uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth); + + if (ab->b_type == ARC_BUFC_METADATA) + buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); + else { + buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); + buf_hashid += ARC_BUFC_NUMMETADATALISTS; + } + + *list = &state->arcs_lists[buf_hashid]; + *lock = ARCS_LOCK(state, buf_hashid); +} + + +static void add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) { + ASSERT(MUTEX_HELD(hash_lock)); if ((refcount_add(&ab->b_refcnt, tag) == 1) && (ab->b_state != arc_anon)) { uint64_t delta = ab->b_size * ab->b_datacnt; - list_t *list = &ab->b_state->arcs_list[ab->b_type]; uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; + list_t *list; + kmutex_t *lock; - ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); - mutex_enter(&ab->b_state->arcs_mtx); + get_buf_info(ab, ab->b_state, &list, &lock); + ASSERT(!MUTEX_HELD(lock)); + mutex_enter(lock); ASSERT(list_link_active(&ab->b_arc_node)); list_remove(list, ab); if (GHOST_STATE(ab->b_state)) { @@ -975,7 +1113,7 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) ASSERT(delta > 0); ASSERT3U(*size, >=, delta); atomic_add_64(size, -delta); - mutex_exit(&ab->b_state->arcs_mtx); + mutex_exit(lock); /* remove the prefetch flag if we get a reference */ if (ab->b_flags & ARC_PREFETCH) ab->b_flags &= ~ARC_PREFETCH; @@ -994,14 +1132,17 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && (state != arc_anon)) { uint64_t *size = &state->arcs_lsize[ab->b_type]; + list_t *list; + kmutex_t *lock; - ASSERT(!MUTEX_HELD(&state->arcs_mtx)); - mutex_enter(&state->arcs_mtx); + get_buf_info(ab, state, &list, &lock); + ASSERT(!MUTEX_HELD(lock)); + mutex_enter(lock); ASSERT(!list_link_active(&ab->b_arc_node)); - list_insert_head(&state->arcs_list[ab->b_type], ab); + list_insert_head(list, ab); ASSERT(ab->b_datacnt > 0); atomic_add_64(size, ab->b_size * ab->b_datacnt); - mutex_exit(&state->arcs_mtx); + mutex_exit(lock); } return (cnt); } @@ -1016,6 +1157,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) arc_state_t *old_state = ab->b_state; int64_t refcnt = refcount_count(&ab->b_refcnt); uint64_t from_delta, to_delta; + list_t *list; + kmutex_t *lock; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(new_state != old_state); @@ -1030,14 +1173,16 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) */ if (refcnt == 0) { if (old_state != arc_anon) { - int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); + int use_mutex; uint64_t *size = &old_state->arcs_lsize[ab->b_type]; + get_buf_info(ab, old_state, &list, &lock); + use_mutex = !MUTEX_HELD(lock); if (use_mutex) - mutex_enter(&old_state->arcs_mtx); + mutex_enter(lock); ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&old_state->arcs_list[ab->b_type], ab); + list_remove(list, ab); /* * If prefetching out of the ghost cache, @@ -1052,16 +1197,18 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) atomic_add_64(size, -from_delta); if (use_mutex) - mutex_exit(&old_state->arcs_mtx); + mutex_exit(lock); } if (new_state != arc_anon) { - int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); + int use_mutex; uint64_t *size = &new_state->arcs_lsize[ab->b_type]; + get_buf_info(ab, new_state, &list, &lock); + use_mutex = !MUTEX_HELD(lock); if (use_mutex) - mutex_enter(&new_state->arcs_mtx); + mutex_enter(lock); - list_insert_head(&new_state->arcs_list[ab->b_type], ab); + list_insert_head(list, ab); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { @@ -1072,7 +1219,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) atomic_add_64(size, to_delta); if (use_mutex) - mutex_exit(&new_state->arcs_mtx); + mutex_exit(lock); } } @@ -1462,21 +1609,48 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, { arc_state_t *evicted_state; uint64_t bytes_evicted = 0, skipped = 0, missed = 0; + int64_t bytes_remaining; arc_buf_hdr_t *ab, *ab_prev = NULL; - list_t *list = &state->arcs_list[type]; + list_t *evicted_list, *list, *evicted_list_start, *list_start; + kmutex_t *lock, *evicted_lock; kmutex_t *hash_lock; boolean_t have_lock; void *stolen = NULL; + static int evict_metadata_offset, evict_data_offset; + int i, idx, offset, list_count, count; ASSERT(state == arc_mru || state == arc_mfu); evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; - mutex_enter(&state->arcs_mtx); - mutex_enter(&evicted_state->arcs_mtx); + if (type == ARC_BUFC_METADATA) { + offset = 0; + list_count = ARC_BUFC_NUMMETADATALISTS; + list_start = &state->arcs_lists[0]; + evicted_list_start = &evicted_state->arcs_lists[0]; + idx = evict_metadata_offset; + } else { + offset = ARC_BUFC_NUMMETADATALISTS; + list_start = &state->arcs_lists[offset]; + evicted_list_start = &evicted_state->arcs_lists[offset]; + list_count = ARC_BUFC_NUMDATALISTS; + idx = evict_data_offset; + } + bytes_remaining = evicted_state->arcs_lsize[type]; + count = 0; + +evict_start: + list = &list_start[idx]; + evicted_list = &evicted_list_start[idx]; + lock = ARCS_LOCK(state, (offset + idx)); + evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); + + mutex_enter(lock); + mutex_enter(evicted_lock); for (ab = list_tail(list); ab; ab = ab_prev) { ab_prev = list_prev(list, ab); + bytes_remaining -= (ab->b_size * ab->b_datacnt); /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(ab) || (spa && ab->b_spa != spa) || @@ -1536,17 +1710,35 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, mutex_exit(hash_lock); if (bytes >= 0 && bytes_evicted >= bytes) break; + if (bytes_remaining > 0) { + mutex_exit(evicted_lock); + mutex_exit(lock); + idx = ((idx + 1) & (list_count - 1)); + count++; + goto evict_start; + } } else { missed += 1; } } - mutex_exit(&evicted_state->arcs_mtx); - mutex_exit(&state->arcs_mtx); + mutex_exit(evicted_lock); + mutex_exit(lock); - if (bytes_evicted < bytes) - dprintf("only evicted %lld bytes from %x", - (longlong_t)bytes_evicted, state); + idx = ((idx + 1) & (list_count - 1)); + count++; + + if (bytes_evicted < bytes) { + if (count < list_count) + goto evict_start; + else + dprintf("only evicted %lld bytes from %x", + (longlong_t)bytes_evicted, state); + } + if (type == ARC_BUFC_METADATA) + evict_metadata_offset = idx; + else + evict_data_offset = idx; if (skipped) ARCSTAT_INCR(arcstat_evict_skip, skipped); @@ -1574,6 +1766,8 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, arc_evict_ghost(arc_mfu_ghost, NULL, todelete); } } + if (stolen) + ARCSTAT_BUMP(arcstat_stolen); return (stolen); } @@ -1586,14 +1780,28 @@ static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; - list_t *list = &state->arcs_list[ARC_BUFC_DATA]; - kmutex_t *hash_lock; + list_t *list, *list_start; + kmutex_t *hash_lock, *lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; + static int evict_offset; + int list_count, idx = evict_offset; + int offset, count = 0; ASSERT(GHOST_STATE(state)); -top: - mutex_enter(&state->arcs_mtx); + + /* + * data lists come after metadata lists + */ + list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; + list_count = ARC_BUFC_NUMDATALISTS; + offset = ARC_BUFC_NUMMETADATALISTS; + +evict_start: + list = &list_start[idx]; + lock = ARCS_LOCK(state, idx + offset); + + mutex_enter(lock); for (ab = list_tail(list); ab; ab = ab_prev) { ab_prev = list_prev(list, ab); if (spa && ab->b_spa != spa) @@ -1623,20 +1831,31 @@ top: break; } else { if (bytes < 0) { - mutex_exit(&state->arcs_mtx); + /* + * we're draining the ARC, retry + */ + mutex_exit(lock); mutex_enter(hash_lock); mutex_exit(hash_lock); - goto top; + goto evict_start; } bufs_skipped += 1; } } - mutex_exit(&state->arcs_mtx); + mutex_exit(lock); + idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); + count++; - if (list == &state->arcs_list[ARC_BUFC_DATA] && + if (count < list_count) + goto evict_start; + + evict_offset = idx; + if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && (bytes < 0 || bytes_deleted < bytes)) { - list = &state->arcs_list[ARC_BUFC_METADATA]; - goto top; + list_start = &state->arcs_lists[0]; + list_count = ARC_BUFC_NUMMETADATALISTS; + offset = count = 0; + goto evict_start; } if (bufs_skipped) { @@ -1718,7 +1937,7 @@ arc_do_user_evicts(void) /* * Move list over to avoid LOR */ -restart: +restart: mutex_enter(&arc_eviction_mtx); tmp_arc_eviction_list = arc_eviction_list; arc_eviction_list = NULL; @@ -1750,22 +1969,22 @@ restart: void arc_flush(spa_t *spa) { - while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { + while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } - while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { + while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } - while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { + while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } - while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { + while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; @@ -1829,7 +2048,7 @@ arc_reclaim_needed(void) return (0); /* - * If pages are needed or we're within 2048 pages + * If pages are needed or we're within 2048 pages * of needing to page need to reclaim */ if (vm_pages_needed || (vm_paging_target() > -2048)) @@ -1896,8 +2115,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; - extern kmem_cache_t *zio_buf_cache[]; - extern kmem_cache_t *zio_data_buf_cache[]; #endif #ifdef _KERNEL @@ -2203,6 +2420,7 @@ out: arc_anon->arcs_size + arc_mru->arcs_size > arc_p) arc_p = MIN(arc_c, arc_p + size); } + ARCSTAT_BUMP(arcstat_allocated); } /* @@ -2502,7 +2720,6 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, uint32_t *arc_flags, const zbookmark_t *zb) { int err; - arc_buf_hdr_t *hdr = pbuf->b_hdr; ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); @@ -2510,8 +2727,6 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, err = arc_read_nolock(pio, spa, bp, done, private, priority, zio_flags, arc_flags, zb); - - ASSERT3P(hdr, ==, pbuf->b_hdr); rw_exit(&pbuf->b_lock); return (err); } @@ -2728,7 +2943,7 @@ top: * released by l2arc_read_done(). */ rzio = zio_read_phys(pio, vd, addr, size, - buf->b_data, ZIO_CHECKSUM_OFF, + buf->b_data, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | @@ -2823,6 +3038,8 @@ arc_buf_evict(arc_buf_t *buf) arc_buf_hdr_t *hdr; kmutex_t *hash_lock; arc_buf_t **bufp; + list_t *list, *evicted_list; + kmutex_t *lock, *evicted_lock; rw_enter(&buf->b_lock, RW_WRITER); hdr = buf->b_hdr; @@ -2871,16 +3088,18 @@ arc_buf_evict(arc_buf_t *buf) evicted_state = (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; - mutex_enter(&old_state->arcs_mtx); - mutex_enter(&evicted_state->arcs_mtx); + get_buf_info(hdr, old_state, &list, &lock); + get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock); + mutex_enter(lock); + mutex_enter(evicted_lock); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdr->b_flags |= ARC_IN_HASH_TABLE; hdr->b_flags &= ~ARC_BUF_AVAILABLE; - mutex_exit(&evicted_state->arcs_mtx); - mutex_exit(&old_state->arcs_mtx); + mutex_exit(evicted_lock); + mutex_exit(lock); } mutex_exit(hash_lock); rw_exit(&buf->b_lock); @@ -3426,7 +3645,8 @@ void arc_init(void) { int prefetch_tunable_set = 0; - + int i; + mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); @@ -3494,33 +3714,33 @@ arc_init(void) arc_l2c_only = &ARC_l2c_only; arc_size = 0; - mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - - list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { + mutex_init(&arc_anon->arcs_locks[i].arcs_lock, + NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mru->arcs_locks[i].arcs_lock, + NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, + NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, + NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, + NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, + NULL, MUTEX_DEFAULT, NULL); + + list_create(&arc_mru->arcs_lists[i], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_lists[i], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_lists[i], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_lists[i], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_lists[i], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_lists[i], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + } buf_init(); @@ -3557,7 +3777,7 @@ arc_init(void) #ifdef _KERNEL if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) prefetch_tunable_set = 1; - + #ifdef __i386__ if (prefetch_tunable_set == 0) { printf("ZFS NOTICE: Prefetch is disabled by default on i386 " @@ -3566,7 +3786,7 @@ arc_init(void) "to /boot/loader.conf.\n"); zfs_prefetch_disable=1; } -#else +#else if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && prefetch_tunable_set == 0) { printf("ZFS NOTICE: Prefetch is disabled by default if less " @@ -3575,7 +3795,7 @@ arc_init(void) "to /boot/loader.conf.\n"); zfs_prefetch_disable=1; } -#endif +#endif /* Warn about ZFS memory and address space requirements. */ if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " @@ -3594,6 +3814,7 @@ arc_init(void) void arc_fini(void) { + int i; mutex_enter(&arc_reclaim_thr_lock); arc_thread_exit = 1; @@ -3615,20 +3836,20 @@ arc_fini(void) mutex_destroy(&arc_reclaim_thr_lock); cv_destroy(&arc_reclaim_thr_cv); - list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - - mutex_destroy(&arc_anon->arcs_mtx); - mutex_destroy(&arc_mru->arcs_mtx); - mutex_destroy(&arc_mru_ghost->arcs_mtx); - mutex_destroy(&arc_mfu->arcs_mtx); - mutex_destroy(&arc_mfu_ghost->arcs_mtx); + for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { + list_destroy(&arc_mru->arcs_lists[i]); + list_destroy(&arc_mru_ghost->arcs_lists[i]); + list_destroy(&arc_mfu->arcs_lists[i]); + list_destroy(&arc_mfu_ghost->arcs_lists[i]); + list_destroy(&arc_l2c_only->arcs_lists[i]); + + mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); + mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); + mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); + mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); + mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); + mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); + } mutex_destroy(&zfs_write_limit_lock); @@ -4024,26 +4245,27 @@ static list_t * l2arc_list_locked(int list_num, kmutex_t **lock) { list_t *list; - - ASSERT(list_num >= 0 && list_num <= 3); - - switch (list_num) { - case 0: - list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; - *lock = &arc_mfu->arcs_mtx; - break; - case 1: - list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; - *lock = &arc_mru->arcs_mtx; - break; - case 2: - list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; - *lock = &arc_mfu->arcs_mtx; - break; - case 3: - list = &arc_mru->arcs_list[ARC_BUFC_DATA]; - *lock = &arc_mru->arcs_mtx; - break; + int idx; + + ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS); + + if (list_num < ARC_BUFC_NUMMETADATALISTS) { + idx = list_num; + list = &arc_mfu->arcs_lists[idx]; + *lock = ARCS_LOCK(arc_mfu, idx); + } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) { + idx = list_num - ARC_BUFC_NUMMETADATALISTS; + list = &arc_mru->arcs_lists[idx]; + *lock = ARCS_LOCK(arc_mru, idx); + } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 + + ARC_BUFC_NUMDATALISTS)) { + idx = list_num - ARC_BUFC_NUMMETADATALISTS; + list = &arc_mfu->arcs_lists[idx]; + *lock = ARCS_LOCK(arc_mfu, idx); + } else { + idx = list_num - ARC_BUFC_NUMLISTS; + list = &arc_mru->arcs_lists[idx]; + *lock = ARCS_LOCK(arc_mru, idx); } ASSERT(!(MUTEX_HELD(*lock))); @@ -4210,13 +4432,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); head->b_flags |= ARC_L2_WRITE_HEAD; + ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); /* * Copy buffers for L2ARC writing. */ mutex_enter(&l2arc_buflist_mtx); - for (try = 0; try <= 3; try++) { + for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) { list = l2arc_list_locked(try, &list_lock); passed_sz = 0; + ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); /* * L2ARC fast warmup. @@ -4229,52 +4453,65 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ab = list_head(list); else ab = list_tail(list); + if (ab == NULL) + ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); for (; ab; ab = ab_prev) { if (arc_warm == B_FALSE) ab_prev = list_next(list, ab); else ab_prev = list_prev(list, ab); + ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size); hash_lock = HDR_LOCK(ab); have_lock = MUTEX_HELD(hash_lock); if (!have_lock && !mutex_tryenter(hash_lock)) { + ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); /* * Skip this buffer rather than waiting. */ continue; } + if (ab->b_l2hdr != NULL) { + /* + * Already in L2ARC. + */ + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_l2_write_in_l2); + continue; + } + passed_sz += ab->b_size; if (passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); break; } if (ab->b_spa != spa) { mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); continue; } - if (ab->b_l2hdr != NULL) { - /* - * Already in L2ARC. - */ + if (HDR_IO_IN_PROGRESS(ab)) { mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); continue; } - - if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) { + if (!HDR_L2CACHE(ab)) { mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); continue; } - if ((write_sz + ab->b_size) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_l2_write_full); break; } @@ -4298,8 +4535,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) cb->l2wcb_head = head; pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); + ARCSTAT_BUMP(arcstat_l2_write_pios); } + ARCSTAT_INCR(arcstat_l2_write_bytes_written, ab->b_size); /* * Create and add a new L2ARC header. */ @@ -4395,7 +4634,7 @@ l2arc_feed_thread(void *dummy __unused) */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, - hz * l2arc_feed_secs); + hz * l2arc_feed_secs >> l2arc_feed_secs_shift); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 2494c1e..3bf0939 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -2210,9 +2210,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; - ASSERT3U(BP_GET_LSIZE(ibp), ==, - db->db_level == 1 ? dn->dn_datablksz : - (1<<dn->dn_phys->dn_indblkshift)); fill += ibp->blk_fill; } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c index cddc64d..5cc56a9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c @@ -49,11 +49,11 @@ uint32_t zfetch_block_cap = 256; uint64_t zfetch_array_rd_sz = 1024 * 1024; SYSCTL_DECL(_vfs_zfs); -SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN, +SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW, &zfs_prefetch_disable, 0, "Disable prefetch"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH"); TUNABLE_INT("vfs.zfs.zfetch.max_streams", &zfetch_max_streams); -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RDTUN, +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RW, &zfetch_max_streams, 0, "Max # of streams per zfetch"); TUNABLE_INT("vfs.zfs.zfetch.min_sec_reap", &zfetch_min_sec_reap); SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RDTUN, @@ -338,8 +338,10 @@ top: reset = !prefetched && zs->zst_len > 1; - mutex_enter(&zs->zst_lock); - + if (mutex_tryenter(&zs->zst_lock) == 0) { + rc = 1; + goto out; + } if (zh->zst_offset != zs->zst_offset + zs->zst_len) { mutex_exit(&zs->zst_lock); goto top; @@ -363,8 +365,10 @@ top: reset = !prefetched && zs->zst_len > 1; - mutex_enter(&zs->zst_lock); - + if (mutex_tryenter(&zs->zst_lock) == 0) { + rc = 1; + goto out; + } if (zh->zst_offset != zs->zst_offset - zh->zst_len) { mutex_exit(&zs->zst_lock); goto top; @@ -391,8 +395,10 @@ top: zs->zst_len) && (zs->zst_len != zs->zst_stride)) { /* strided forward access */ - mutex_enter(&zs->zst_lock); - + if (mutex_tryenter(&zs->zst_lock) == 0) { + rc = 1; + goto out; + } if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >= zs->zst_len) || (zs->zst_len == zs->zst_stride)) { mutex_exit(&zs->zst_lock); @@ -408,8 +414,10 @@ top: zs->zst_len) && (zs->zst_len != zs->zst_stride)) { /* strided reverse access */ - mutex_enter(&zs->zst_lock); - + if (mutex_tryenter(&zs->zst_lock) == 0) { + rc = 1; + goto out; + } if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >= zs->zst_len) || (zs->zst_len == zs->zst_stride)) { mutex_exit(&zs->zst_lock); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index 163b215..90861ba 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -62,6 +62,14 @@ #include "zfs_prop.h" #include "zfs_comutil.h" +/* Check hostid on import? */ +static int check_hostid = 1; + +SYSCTL_DECL(_vfs_zfs); +TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid); +SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0, + "Check hostid on import?"); + int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE INTR */ { 1, 1 }, /* ZIO_TYPE_NULL */ @@ -1168,7 +1176,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); - if (hostid != 0 && myhostid != 0 && + if (check_hostid && hostid != 0 && myhostid != 0 && (unsigned long)hostid != myhostid) { cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index 4c41f90..819473e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -29,6 +29,7 @@ #include <sys/bio.h> #include <sys/disk.h> #include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/vdev_impl.h> #include <sys/fs/zfs.h> #include <sys/zio.h> @@ -102,7 +103,7 @@ vdev_geom_orphan(struct g_consumer *cp) } static struct g_consumer * -vdev_geom_attach(struct g_provider *pp, int write) +vdev_geom_attach(struct g_provider *pp) { struct g_geom *gp; struct g_consumer *cp; @@ -126,7 +127,7 @@ vdev_geom_attach(struct g_provider *pp, int write) g_wither_geom(gp, ENXIO); return (NULL); } - if (g_access(cp, 1, write, 1) != 0) { + if (g_access(cp, 1, 0, 1) != 0) { g_wither_geom(gp, ENXIO); return (NULL); } @@ -145,14 +146,14 @@ vdev_geom_attach(struct g_provider *pp, int write) g_destroy_consumer(cp); return (NULL); } - if (g_access(cp, 1, write, 1) != 0) { + if (g_access(cp, 1, 0, 1) != 0) { g_detach(cp); g_destroy_consumer(cp); return (NULL); } ZFS_LOG(1, "Created consumer for %s.", pp->name); } else { - if (g_access(cp, 1, cp->acw > 0 ? 0 : write, 1) != 0) + if (g_access(cp, 1, 0, 1) != 0) return (NULL); ZFS_LOG(1, "Used existing consumer for %s.", pp->name); } @@ -342,7 +343,6 @@ vdev_geom_read_guid(struct g_consumer *cp) struct vdev_geom_find { uint64_t guid; - int write; struct g_consumer *cp; }; @@ -394,10 +394,10 @@ vdev_geom_attach_by_guid_event(void *arg, int flags __unused) g_detach(zcp); if (guid != ap->guid) continue; - ap->cp = vdev_geom_attach(pp, ap->write); + ap->cp = vdev_geom_attach(pp); if (ap->cp == NULL) { - printf("ZFS WARNING: Cannot open %s " - "for writting.\n", pp->name); + printf("ZFS WARNING: Unable to attach to %s.", + pp->name); continue; } goto end; @@ -411,14 +411,13 @@ end: } static struct g_consumer * -vdev_geom_attach_by_guid(uint64_t guid, int write) +vdev_geom_attach_by_guid(uint64_t guid) { struct vdev_geom_find *ap; struct g_consumer *cp; ap = kmem_zalloc(sizeof(*ap), KM_SLEEP); ap->guid = guid; - ap->write = write; g_waitfor_event(vdev_geom_attach_by_guid_event, ap, M_WAITOK, NULL); cp = ap->cp; kmem_free(ap, sizeof(*ap)); @@ -433,7 +432,7 @@ vdev_geom_open_by_guid(vdev_t *vd) size_t len; ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid); - cp = vdev_geom_attach_by_guid(vd->vdev_guid, !!(spa_mode & FWRITE)); + cp = vdev_geom_attach_by_guid(vd->vdev_guid); if (cp != NULL) { len = strlen(cp->provider->name) + strlen("/dev/") + 1; buf = kmem_alloc(len, KM_SLEEP); @@ -464,7 +463,7 @@ vdev_geom_open_by_path(vdev_t *vd, int check_guid) pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1); if (pp != NULL) { ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); - cp = vdev_geom_attach(pp, !!(spa_mode & FWRITE)); + cp = vdev_geom_attach(pp); if (cp != NULL && check_guid) { g_topology_unlock(); guid = vdev_geom_read_guid(cp); @@ -492,7 +491,7 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) vdev_geom_ctx_t *ctx; struct g_provider *pp; struct g_consumer *cp; - int owned; + int error, owned; /* * We must have a pathname, and it must be absolute. @@ -506,26 +505,47 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) if ((owned = mtx_owned(&Giant))) mtx_unlock(&Giant); - cp = vdev_geom_open_by_path(vd, 1); - if (cp == NULL) { - /* - * The device at vd->vdev_path doesn't have the expected guid. - * The disks might have merely moved around so try all other - * geom providers to find one with the right guid. - */ - cp = vdev_geom_open_by_guid(vd); - } - if (cp == NULL) + error = 0; + + /* + * If we're creating pool, just find GEOM provider by its name + * and ignore GUID mismatches. + */ + if (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE) cp = vdev_geom_open_by_path(vd, 0); + else { + cp = vdev_geom_open_by_path(vd, 1); + if (cp == NULL) { + /* + * The device at vd->vdev_path doesn't have the + * expected guid. The disks might have merely + * moved around so try all other GEOM providers + * to find one with the right guid. + */ + cp = vdev_geom_open_by_guid(vd); + } + } + if (cp == NULL) { ZFS_LOG(1, "Provider %s not found.", vd->vdev_path); - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - if (owned) - mtx_lock(&Giant); - return (EACCES); + error = ENOENT; + } else if (cp->acw == 0 && (spa_mode & FWRITE) != 0) { + g_topology_lock(); + error = g_access(cp, 0, 1, 0); + if (error != 0) { + printf("ZFS WARNING: Unable to open %s for writing (error=%d).", + vd->vdev_path, error); + vdev_geom_detach(cp, 0); + cp = NULL; + } + g_topology_unlock(); } if (owned) mtx_lock(&Giant); + if (cp == NULL) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } cp->private = vd; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 1132483..07c9b61 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -214,7 +214,7 @@ blksz_changed_cb(void *arg, uint64_t newval) newval = SPA_MAXBLOCKSIZE; zfsvfs->z_max_blksz = newval; - zfsvfs->z_vfs->vfs_bsize = newval; + zfsvfs->z_vfs->mnt_stat.f_iosize = newval; } static void @@ -577,7 +577,8 @@ zfs_domount(vfs_t *vfsp, char *osname) if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL)) goto out; - zfsvfs->z_vfs->vfs_bsize = recordsize; + zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; + zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; vfsp->vfs_data = zfsvfs; vfsp->mnt_flag |= MNT_LOCAL; @@ -817,8 +818,8 @@ zfs_statfs(vfs_t *vfsp, struct statfs *statp) * We report the fragsize as the smallest block size we support, * and we report our blocksize as the filesystem's maximum blocksize. */ - statp->f_bsize = zfsvfs->z_vfs->vfs_bsize; - statp->f_iosize = zfsvfs->z_vfs->vfs_bsize; + statp->f_bsize = SPA_MINBLOCKSIZE; + statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; /* * The following report "total" blocks of various kinds in the @@ -826,7 +827,7 @@ zfs_statfs(vfs_t *vfsp, struct statfs *statp) * "fragment" size. */ - statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize; + statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; statp->f_bfree = availbytes / statp->f_bsize; statp->f_bavail = statp->f_bfree; /* no root reservation */ @@ -867,13 +868,15 @@ zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) ZFS_ENTER_NOERROR(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + + ZFS_EXIT(zfsvfs); + if (error == 0) { *vpp = ZTOV(rootzp); error = vn_lock(*vpp, flags); (*vpp)->v_vflag |= VV_ROOT; } - ZFS_EXIT(zfsvfs); return (error); } @@ -1142,13 +1145,13 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) VN_RELE(ZTOV(zp)); err = EINVAL; } + ZFS_EXIT(zfsvfs); if (err != 0) *vpp = NULL; else { *vpp = ZTOV(zp); vn_lock(*vpp, flags); } - ZFS_EXIT(zfsvfs); return (err); } @@ -1236,8 +1239,8 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) } else { VN_HOLD(*vpp); } - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); ZFS_EXIT(zfsvfs); + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); return (0); } @@ -1258,10 +1261,11 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) return (EINVAL); } + ZFS_EXIT(zfsvfs); + *vpp = ZTOV(zp); vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); vnode_create_vobject(*vpp, zp->z_phys->zp_size, curthread); - ZFS_EXIT(zfsvfs); return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index 4f61f5f..59a58dd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -1209,15 +1209,17 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); } + ZFS_EXIT(zfsvfs); error = vn_lock(*vpp, cnp->cn_lkflags); if (cnp->cn_flags & ISDOTDOT) vn_lock(dvp, ltype | LK_RETRY); if (error != 0) { VN_RELE(*vpp); *vpp = NULL; - ZFS_EXIT(zfsvfs); return (error); } + } else { + ZFS_EXIT(zfsvfs); } #ifdef FREEBSD_NAMECACHE @@ -1237,8 +1239,6 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, } #endif - ZFS_EXIT(zfsvfs); - return (error); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index 4650d42..e7227f2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -91,13 +91,6 @@ zio_init(void) #ifdef ZIO_USE_UMA size_t c; #endif -#if 0 - vmem_t *data_alloc_arena = NULL; - -#ifdef _KERNEL - data_alloc_arena = zio_alloc_arena; -#endif -#endif zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); @@ -132,8 +125,7 @@ zio_init(void) (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, data_alloc_arena, - KMC_NODEBUG); + align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); } } diff --git a/sys/modules/zfs/Makefile b/sys/modules/zfs/Makefile index c95a840..4554088 100644 --- a/sys/modules/zfs/Makefile +++ b/sys/modules/zfs/Makefile @@ -63,8 +63,8 @@ ZFS_SRCS= ${ZFS_OBJS:C/.o$/.c/} SRCS+= ${ZFS_SRCS} SRCS+= vdev_geom.c -# Use UMA for ZIO allocation. This is not stable. -#CFLAGS+=-DZIO_USE_UMA +# Use UMA for ZIO allocation. +CFLAGS+=-DZIO_USE_UMA # Use FreeBSD's namecache. CFLAGS+=-DFREEBSD_NAMECACHE diff --git a/sys/sys/ioccom.h b/sys/sys/ioccom.h index be2ce66..5669088 100644 --- a/sys/sys/ioccom.h +++ b/sys/sys/ioccom.h @@ -38,12 +38,13 @@ * any in or out parameters in the upper word. The high 3 bits of the * upper word are used to encode the in/out status of the parameter. */ -#define IOCPARM_MASK 0x1fff /* parameter length, at most 13 bits */ +#define IOCPARM_SHIFT 13 /* number of bits for ioctl size */ +#define IOCPARM_MASK ((1 << IOCPARM_SHIFT) - 1) /* parameter length mask */ #define IOCPARM_LEN(x) (((x) >> 16) & IOCPARM_MASK) #define IOCBASECMD(x) ((x) & ~(IOCPARM_MASK << 16)) #define IOCGROUP(x) (((x) >> 8) & 0xff) -#define IOCPARM_MAX PAGE_SIZE /* max size of ioctl, mult. of PAGE_SIZE */ +#define IOCPARM_MAX (1 << IOCPARM_SHIFT) /* max size of ioctl */ #define IOC_VOID 0x20000000 /* no parameters */ #define IOC_OUT 0x40000000 /* copy out parameters */ #define IOC_IN 0x80000000 /* copy in parameters */ |