diff options
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c')
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c | 190 |
1 files changed, 119 insertions, 71 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 3d20e1d..a4fbf04 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -127,6 +127,7 @@ #include <sys/refcount.h> #include <sys/vdev.h> #include <sys/vdev_impl.h> +#include <sys/dsl_pool.h> #ifdef _KERNEL #include <sys/dnlc.h> #endif @@ -150,10 +151,6 @@ static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ static uint8_t arc_thread_exit; -extern int zfs_write_limit_shift; -extern uint64_t zfs_write_limit_max; -extern kmutex_t zfs_write_limit_lock; - #define ARC_REDUCE_DNLC_PERCENT 3 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; @@ -162,6 +159,12 @@ typedef enum arc_reclaim_strategy { ARC_RECLAIM_CONS /* Conservative reclaim strategy */ } arc_reclaim_strategy_t; +/* + * The number of iterations through arc_evict_*() before we + * drop & reacquire the lock. + */ +int arc_evict_iterations = 100; + /* number of seconds before growing cache again */ static int arc_grow_retry = 60; @@ -177,6 +180,11 @@ static int arc_shrink_shift = 5; */ static int arc_min_prefetch_lifespan; +/* + * If this percent of memory is free, don't throttle. + */ +int arc_lotsfree_percent = 10; + static int arc_dead; extern int zfs_prefetch_disable; @@ -526,6 +534,7 @@ typedef struct arc_write_callback arc_write_callback_t; struct arc_write_callback { void *awcb_private; arc_done_func_t *awcb_ready; + arc_done_func_t *awcb_physdone; arc_done_func_t *awcb_done; arc_buf_t *awcb_buf; }; @@ -1312,7 +1321,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) kmutex_t *lock; ASSERT(MUTEX_HELD(hash_lock)); - ASSERT(new_state != old_state); + ASSERT3P(new_state, !=, old_state); ASSERT(refcnt == 0 || ab->b_datacnt > 0); ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); @@ -1937,8 +1946,10 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, kmutex_t *hash_lock; boolean_t have_lock; void *stolen = NULL; + arc_buf_hdr_t marker = { 0 }; + int count = 0; static int evict_metadata_offset, evict_data_offset; - int i, idx, offset, list_count, count; + int i, idx, offset, list_count, lists; ASSERT(state == arc_mru || state == arc_mfu); @@ -1958,7 +1969,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, idx = evict_data_offset; } bytes_remaining = evicted_state->arcs_lsize[type]; - count = 0; + lists = 0; evict_start: list = &list_start[idx]; @@ -1985,6 +1996,33 @@ evict_start: if (recycle && ab->b_size != bytes && ab_prev && ab_prev->b_size == bytes) continue; + + /* ignore markers */ + if (ab->b_spa == 0) + continue; + + /* + * It may take a long time to evict all the bufs requested. + * To avoid blocking all arc activity, periodically drop + * the arcs_mtx and give other threads a chance to run + * before reacquiring the lock. + * + * If we are looking for a buffer to recycle, we are in + * the hot code path, so don't sleep. + */ + if (!recycle && count++ > arc_evict_iterations) { + list_insert_after(list, ab, &marker); + mutex_exit(evicted_lock); + mutex_exit(lock); + kpreempt(KPREEMPT_SYNC); + mutex_enter(lock); + mutex_enter(evicted_lock); + ab_prev = list_prev(list, &marker); + list_remove(list, &marker); + count = 0; + continue; + } + hash_lock = HDR_LOCK(ab); have_lock = MUTEX_HELD(hash_lock); if (have_lock || mutex_tryenter(hash_lock)) { @@ -2051,7 +2089,7 @@ evict_start: mutex_exit(evicted_lock); mutex_exit(lock); idx = ((idx + 1) & (list_count - 1)); - count++; + lists++; goto evict_start; } } else { @@ -2063,10 +2101,10 @@ evict_start: mutex_exit(lock); idx = ((idx + 1) & (list_count - 1)); - count++; + lists++; if (bytes_evicted < bytes) { - if (count < list_count) + if (lists < list_count) goto evict_start; else dprintf("only evicted %lld bytes from %x", @@ -2084,28 +2122,14 @@ evict_start: ARCSTAT_INCR(arcstat_mutex_miss, missed); /* - * We have just evicted some data into the ghost state, make - * sure we also adjust the ghost state size if necessary. + * Note: we have just evicted some data into the ghost state, + * potentially putting the ghost size over the desired size. Rather + * that evicting from the ghost list in this hot code path, leave + * this chore to the arc_reclaim_thread(). */ - if (arc_no_grow && - arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { - int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + - arc_mru_ghost->arcs_size - arc_c; - - if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { - int64_t todelete = - MIN(arc_mru_ghost->arcs_lsize[type], mru_over); - arc_evict_ghost(arc_mru_ghost, 0, todelete); - } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { - int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], - arc_mru_ghost->arcs_size + - arc_mfu_ghost->arcs_size - arc_c); - arc_evict_ghost(arc_mfu_ghost, 0, todelete); - } - } + if (stolen) ARCSTAT_BUMP(arcstat_stolen); - return (stolen); } @@ -2122,9 +2146,10 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) kmutex_t *hash_lock, *lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; + int count = 0; static int evict_offset; int list_count, idx = evict_offset; - int offset, count = 0; + int offset, lists = 0; ASSERT(GHOST_STATE(state)); @@ -2142,6 +2167,8 @@ evict_start: mutex_enter(lock); for (ab = list_tail(list); ab; ab = ab_prev) { ab_prev = list_prev(list, ab); + if (ab->b_type > ARC_BUFC_NUMTYPES) + panic("invalid ab=%p", (void *)ab); if (spa && ab->b_spa != spa) continue; @@ -2153,6 +2180,23 @@ evict_start: /* caller may be trying to modify this buffer, skip it */ if (MUTEX_HELD(hash_lock)) continue; + + /* + * It may take a long time to evict all the bufs requested. + * To avoid blocking all arc activity, periodically drop + * the arcs_mtx and give other threads a chance to run + * before reacquiring the lock. + */ + if (count++ > arc_evict_iterations) { + list_insert_after(list, ab, &marker); + mutex_exit(lock); + kpreempt(KPREEMPT_SYNC); + mutex_enter(lock); + ab_prev = list_prev(list, &marker); + list_remove(list, &marker); + count = 0; + continue; + } if (mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(ab)); ASSERT(ab->b_buf == NULL); @@ -2188,14 +2232,16 @@ evict_start: mutex_enter(lock); ab_prev = list_prev(list, &marker); list_remove(list, &marker); - } else + } else { bufs_skipped += 1; + } + } mutex_exit(lock); idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); - count++; + lists++; - if (count < list_count) + if (lists < list_count) goto evict_start; evict_offset = idx; @@ -2203,7 +2249,7 @@ evict_start: (bytes < 0 || bytes_deleted < bytes)) { list_start = &state->arcs_lists[0]; list_count = ARC_BUFC_NUMMETADATALISTS; - offset = count = 0; + offset = lists = 0; goto evict_start; } @@ -3083,7 +3129,7 @@ arc_read_done(zio_t *zio) */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, - void *private, int priority, int zio_flags, uint32_t *arc_flags, + void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr; @@ -3699,6 +3745,18 @@ arc_write_ready(zio_t *zio) hdr->b_flags |= ARC_IO_IN_PROGRESS; } +/* + * The SPA calls this callback for each physical write that happens on behalf + * of a logical write. See the comment in dbuf_write_physdone() for details. + */ +static void +arc_write_physdone(zio_t *zio) +{ + arc_write_callback_t *cb = zio->io_private; + if (cb->awcb_physdone != NULL) + cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); +} + static void arc_write_done(zio_t *zio) { @@ -3779,8 +3837,9 @@ arc_write_done(zio_t *zio) zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, - const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done, - void *private, int priority, int zio_flags, const zbookmark_t *zb) + const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, + arc_done_func_t *done, void *private, zio_priority_t priority, + int zio_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; @@ -3797,18 +3856,20 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, hdr->b_flags |= ARC_L2COMPRESS; callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; + callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, - arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); + arc_write_ready, arc_write_physdone, arc_write_done, callback, + priority, zio_flags, zb); return (zio); } static int -arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) +arc_memory_throttle(uint64_t reserve, uint64_t txg) { #ifdef _KERNEL uint64_t available_memory = @@ -3822,7 +3883,9 @@ arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); #endif #endif /* sun */ - if (available_memory >= zfs_write_limit_max) + + if (cnt.v_free_count + cnt.v_cache_count > + (uint64_t)physmem * arc_lotsfree_percent / 100) return (0); if (txg > last_txg) { @@ -3846,20 +3909,6 @@ arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) return (SET_ERROR(EAGAIN)); } page_load = 0; - - if (arc_size > arc_c_min) { - uint64_t evictable_memory = - arc_mru->arcs_lsize[ARC_BUFC_DATA] + - arc_mru->arcs_lsize[ARC_BUFC_METADATA] + - arc_mfu->arcs_lsize[ARC_BUFC_DATA] + - arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; - available_memory += MIN(evictable_memory, arc_size - arc_c_min); - } - - if (inflight_data > available_memory / 4) { - ARCSTAT_INCR(arcstat_memory_throttle_count, 1); - return (SET_ERROR(ERESTART)); - } #endif return (0); } @@ -3877,15 +3926,6 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) int error; uint64_t anon_size; -#ifdef ZFS_DEBUG - /* - * Once in a while, fail for no reason. Everything should cope. - */ - if (spa_get_random(10000) == 0) { - dprintf("forcing random failure\n"); - return (SET_ERROR(ERESTART)); - } -#endif if (reserve > arc_c/4 && !arc_no_grow) arc_c = MIN(arc_c_max, reserve * 4); if (reserve > arc_c) @@ -3903,7 +3943,8 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ - if (error = arc_memory_throttle(reserve, anon_size, txg)) + error = arc_memory_throttle(reserve, txg); + if (error != 0) return (error); /* @@ -4094,11 +4135,20 @@ arc_init(void) arc_dead = FALSE; arc_warm = B_FALSE; - if (zfs_write_limit_max == 0) - zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; - else - zfs_write_limit_shift = 0; - mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); + /* + * Calculate maximum amount of dirty data per pool. + * + * If it has been set by /etc/system, take that. + * Otherwise, use a percentage of physical memory defined by + * zfs_dirty_data_max_percent (default 10%) with a cap at + * zfs_dirty_data_max_max (default 4GB). + */ + if (zfs_dirty_data_max == 0) { + zfs_dirty_data_max = ptob(physmem) * + zfs_dirty_data_max_percent / 100; + zfs_dirty_data_max = MIN(zfs_dirty_data_max, + zfs_dirty_data_max_max); + } #ifdef _KERNEL if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) @@ -4177,8 +4227,6 @@ arc_fini(void) mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); } - mutex_destroy(&zfs_write_limit_lock); - buf_fini(); ASSERT(arc_loaned_bytes == 0); |