Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c')
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c  190
1 file changed, 119 insertions, 71 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 3d20e1d..a4fbf04 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -127,6 +127,7 @@
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
@@ -150,10 +151,6 @@ static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
static uint8_t arc_thread_exit;
-extern int zfs_write_limit_shift;
-extern uint64_t zfs_write_limit_max;
-extern kmutex_t zfs_write_limit_lock;
-
#define ARC_REDUCE_DNLC_PERCENT 3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
@@ -162,6 +159,12 @@ typedef enum arc_reclaim_strategy {
ARC_RECLAIM_CONS /* Conservative reclaim strategy */
} arc_reclaim_strategy_t;
+/*
+ * The number of iterations through arc_evict_*() before we
+ * drop & reacquire the lock.
+ */
+int arc_evict_iterations = 100;
+
/* number of seconds before growing cache again */
static int arc_grow_retry = 60;
@@ -177,6 +180,11 @@ static int arc_shrink_shift = 5;
*/
static int arc_min_prefetch_lifespan;
+/*
+ * If this percent of memory is free, don't throttle.
+ */
+int arc_lotsfree_percent = 10;
+
static int arc_dead;
extern int zfs_prefetch_disable;
@@ -526,6 +534,7 @@ typedef struct arc_write_callback arc_write_callback_t;
struct arc_write_callback {
void *awcb_private;
arc_done_func_t *awcb_ready;
+ arc_done_func_t *awcb_physdone;
arc_done_func_t *awcb_done;
arc_buf_t *awcb_buf;
};
@@ -1312,7 +1321,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
kmutex_t *lock;
ASSERT(MUTEX_HELD(hash_lock));
- ASSERT(new_state != old_state);
+ ASSERT3P(new_state, !=, old_state);
ASSERT(refcnt == 0 || ab->b_datacnt > 0);
ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
@@ -1937,8 +1946,10 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
kmutex_t *hash_lock;
boolean_t have_lock;
void *stolen = NULL;
+ arc_buf_hdr_t marker = { 0 };
+ int count = 0;
static int evict_metadata_offset, evict_data_offset;
- int i, idx, offset, list_count, count;
+ int i, idx, offset, list_count, lists;
ASSERT(state == arc_mru || state == arc_mfu);
@@ -1958,7 +1969,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
idx = evict_data_offset;
}
bytes_remaining = evicted_state->arcs_lsize[type];
- count = 0;
+ lists = 0;
evict_start:
list = &list_start[idx];
@@ -1985,6 +1996,33 @@ evict_start:
if (recycle && ab->b_size != bytes &&
ab_prev && ab_prev->b_size == bytes)
continue;
+
+ /* ignore markers */
+ if (ab->b_spa == 0)
+ continue;
+
+ /*
+ * It may take a long time to evict all the bufs requested.
+ * To avoid blocking all arc activity, periodically drop
+ * the arcs_mtx and give other threads a chance to run
+ * before reacquiring the lock.
+ *
+ * If we are looking for a buffer to recycle, we are in
+ * the hot code path, so don't sleep.
+ */
+ if (!recycle && count++ > arc_evict_iterations) {
+ list_insert_after(list, ab, &marker);
+ mutex_exit(evicted_lock);
+ mutex_exit(lock);
+ kpreempt(KPREEMPT_SYNC);
+ mutex_enter(lock);
+ mutex_enter(evicted_lock);
+ ab_prev = list_prev(list, &marker);
+ list_remove(list, &marker);
+ count = 0;
+ continue;
+ }
+
hash_lock = HDR_LOCK(ab);
have_lock = MUTEX_HELD(hash_lock);
if (have_lock || mutex_tryenter(hash_lock)) {
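The block added above relies on a marker buffer (recognized by b_spa == 0) to remember the scan position while arcs_mtx is dropped. A minimal standalone sketch of that marker pattern follows, using a hypothetical pthread-protected list rather than the real arc_buf_hdr_t lists; sched_yield() stands in for kpreempt(KPREEMPT_SYNC):

	/*
	 * Sketch only: a circular doubly-linked list with a dummy head,
	 * protected by one mutex.  The scanner periodically parks a marker
	 * node, drops the lock, yields, and resumes from the marker, the
	 * same idea as the arc_evict_iterations logic above.  All names
	 * here are hypothetical.
	 */
	#include <pthread.h>
	#include <sched.h>

	#define SCAN_BATCH 100         /* plays the role of arc_evict_iterations */

	struct node {
		struct node *prev, *next;
		int is_marker;         /* analogous to the b_spa == 0 test */
	};

	struct nlist {
		struct node head;      /* dummy head; head.prev is the tail */
		pthread_mutex_t mtx;
	};

	static void
	scan_with_yield(struct nlist *l, void (*visit)(struct node *))
	{
		struct node marker = { .is_marker = 1 };
		struct node *n, *n_prev;
		int count = 0;

		pthread_mutex_lock(&l->mtx);
		for (n = l->head.prev; n != &l->head; n = n_prev) {
			n_prev = n->prev;

			if (n->is_marker)      /* skip markers left by other scans */
				continue;

			if (count++ > SCAN_BATCH) {
				/* Park the marker next to n; n itself may be
				 * freed by another thread while the lock is
				 * dropped. */
				marker.prev = n;
				marker.next = n->next;
				n->next->prev = &marker;
				n->next = &marker;

				pthread_mutex_unlock(&l->mtx);
				sched_yield();         /* kpreempt() in the kernel */
				pthread_mutex_lock(&l->mtx);

				/* Resume from whatever now precedes the marker. */
				n_prev = marker.prev;
				marker.prev->next = marker.next;
				marker.next->prev = marker.prev;
				count = 0;
				continue;
			}

			visit(n);              /* e.g. evict the buffer */
		}
		pthread_mutex_unlock(&l->mtx);
	}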
@@ -2051,7 +2089,7 @@ evict_start:
mutex_exit(evicted_lock);
mutex_exit(lock);
idx = ((idx + 1) & (list_count - 1));
- count++;
+ lists++;
goto evict_start;
}
} else {
@@ -2063,10 +2101,10 @@ evict_start:
mutex_exit(lock);
idx = ((idx + 1) & (list_count - 1));
- count++;
+ lists++;
if (bytes_evicted < bytes) {
- if (count < list_count)
+ if (lists < list_count)
goto evict_start;
else
dprintf("only evicted %lld bytes from %x",
@@ -2084,28 +2122,14 @@ evict_start:
ARCSTAT_INCR(arcstat_mutex_miss, missed);
/*
- * We have just evicted some data into the ghost state, make
- * sure we also adjust the ghost state size if necessary.
+ * Note: we have just evicted some data into the ghost state,
+ * potentially putting the ghost size over the desired size. Rather
+ * than evicting from the ghost list in this hot code path, leave
+ * this chore to the arc_reclaim_thread().
*/
- if (arc_no_grow &&
- arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
- int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
- arc_mru_ghost->arcs_size - arc_c;
-
- if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
- int64_t todelete =
- MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
- arc_evict_ghost(arc_mru_ghost, 0, todelete);
- } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
- int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
- arc_mru_ghost->arcs_size +
- arc_mfu_ghost->arcs_size - arc_c);
- arc_evict_ghost(arc_mfu_ghost, 0, todelete);
- }
- }
+
if (stolen)
ARCSTAT_BUMP(arcstat_stolen);
-
return (stolen);
}
@@ -2122,9 +2146,10 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
kmutex_t *hash_lock, *lock;
uint64_t bytes_deleted = 0;
uint64_t bufs_skipped = 0;
+ int count = 0;
static int evict_offset;
int list_count, idx = evict_offset;
- int offset, count = 0;
+ int offset, lists = 0;
ASSERT(GHOST_STATE(state));
@@ -2142,6 +2167,8 @@ evict_start:
mutex_enter(lock);
for (ab = list_tail(list); ab; ab = ab_prev) {
ab_prev = list_prev(list, ab);
+ if (ab->b_type > ARC_BUFC_NUMTYPES)
+ panic("invalid ab=%p", (void *)ab);
if (spa && ab->b_spa != spa)
continue;
@@ -2153,6 +2180,23 @@ evict_start:
/* caller may be trying to modify this buffer, skip it */
if (MUTEX_HELD(hash_lock))
continue;
+
+ /*
+ * It may take a long time to evict all the bufs requested.
+ * To avoid blocking all arc activity, periodically drop
+ * the arcs_mtx and give other threads a chance to run
+ * before reacquiring the lock.
+ */
+ if (count++ > arc_evict_iterations) {
+ list_insert_after(list, ab, &marker);
+ mutex_exit(lock);
+ kpreempt(KPREEMPT_SYNC);
+ mutex_enter(lock);
+ ab_prev = list_prev(list, &marker);
+ list_remove(list, &marker);
+ count = 0;
+ continue;
+ }
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(ab));
ASSERT(ab->b_buf == NULL);
@@ -2188,14 +2232,16 @@ evict_start:
mutex_enter(lock);
ab_prev = list_prev(list, &marker);
list_remove(list, &marker);
- } else
+ } else {
bufs_skipped += 1;
+ }
+
}
mutex_exit(lock);
idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
- count++;
+ lists++;
- if (count < list_count)
+ if (lists < list_count)
goto evict_start;
evict_offset = idx;
@@ -2203,7 +2249,7 @@ evict_start:
(bytes < 0 || bytes_deleted < bytes)) {
list_start = &state->arcs_lists[0];
list_count = ARC_BUFC_NUMMETADATALISTS;
- offset = count = 0;
+ offset = lists = 0;
goto evict_start;
}
@@ -3083,7 +3129,7 @@ arc_read_done(zio_t *zio)
*/
int
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
- void *private, int priority, int zio_flags, uint32_t *arc_flags,
+ void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
@@ -3699,6 +3745,18 @@ arc_write_ready(zio_t *zio)
hdr->b_flags |= ARC_IO_IN_PROGRESS;
}
+/*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write. See the comment in dbuf_write_physdone() for details.
+ */
+static void
+arc_write_physdone(zio_t *zio)
+{
+ arc_write_callback_t *cb = zio->io_private;
+ if (cb->awcb_physdone != NULL)
+ cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
+}
+
static void
arc_write_done(zio_t *zio)
{
@@ -3779,8 +3837,9 @@ arc_write_done(zio_t *zio)
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
- const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
- void *private, int priority, int zio_flags, const zbookmark_t *zb)
+ const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
+ arc_done_func_t *done, void *private, zio_priority_t priority,
+ int zio_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
@@ -3797,18 +3856,20 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
hdr->b_flags |= ARC_L2COMPRESS;
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
+ callback->awcb_physdone = physdone;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
- arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
+ arc_write_ready, arc_write_physdone, arc_write_done, callback,
+ priority, zio_flags, zb);
return (zio);
}
static int
-arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
uint64_t available_memory =
@@ -3822,7 +3883,9 @@ arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
#endif
#endif /* sun */
- if (available_memory >= zfs_write_limit_max)
+
+ if (cnt.v_free_count + cnt.v_cache_count >
+ (uint64_t)physmem * arc_lotsfree_percent / 100)
return (0);
if (txg > last_txg) {
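The replacement check above skips throttling whenever free plus cache pages exceed arc_lotsfree_percent of physical memory. A small standalone illustration with hypothetical page counts (the kernel reads cnt.v_free_count, cnt.v_cache_count and physmem; 4 KiB pages assumed):

	/*
	 * Standalone illustration of the new "lots of memory free" test.
	 * Numbers are hypothetical; all quantities are in pages.
	 */
	#include <stdint.h>
	#include <stdio.h>

	static int
	lots_of_memory_free(uint64_t free_pages, uint64_t cache_pages,
	    uint64_t physmem_pages, uint64_t lotsfree_percent)
	{
		return (free_pages + cache_pages >
		    physmem_pages * lotsfree_percent / 100);
	}

	int
	main(void)
	{
		/* 4 GiB of 4 KiB pages = 1048576 pages; 10% threshold = 104857 pages. */
		printf("%d\n", lots_of_memory_free(90000, 20000, 1048576, 10)); /* 1: skip throttle */
		printf("%d\n", lots_of_memory_free(50000, 30000, 1048576, 10)); /* 0: may throttle */
		return (0);
	}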
@@ -3846,20 +3909,6 @@ arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
return (SET_ERROR(EAGAIN));
}
page_load = 0;
-
- if (arc_size > arc_c_min) {
- uint64_t evictable_memory =
- arc_mru->arcs_lsize[ARC_BUFC_DATA] +
- arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
- arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
- arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
- available_memory += MIN(evictable_memory, arc_size - arc_c_min);
- }
-
- if (inflight_data > available_memory / 4) {
- ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
- return (SET_ERROR(ERESTART));
- }
#endif
return (0);
}
@@ -3877,15 +3926,6 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
int error;
uint64_t anon_size;
-#ifdef ZFS_DEBUG
- /*
- * Once in a while, fail for no reason. Everything should cope.
- */
- if (spa_get_random(10000) == 0) {
- dprintf("forcing random failure\n");
- return (SET_ERROR(ERESTART));
- }
-#endif
if (reserve > arc_c/4 && !arc_no_grow)
arc_c = MIN(arc_c_max, reserve * 4);
if (reserve > arc_c)
@@ -3903,7 +3943,8 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
* in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this.
*/
- if (error = arc_memory_throttle(reserve, anon_size, txg))
+ error = arc_memory_throttle(reserve, txg);
+ if (error != 0)
return (error);
/*
@@ -4094,11 +4135,20 @@ arc_init(void)
arc_dead = FALSE;
arc_warm = B_FALSE;
- if (zfs_write_limit_max == 0)
- zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
- else
- zfs_write_limit_shift = 0;
- mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
+ /*
+ * Calculate maximum amount of dirty data per pool.
+ *
+ * If it has been set by /etc/system, take that.
+ * Otherwise, use a percentage of physical memory defined by
+ * zfs_dirty_data_max_percent (default 10%) with a cap at
+ * zfs_dirty_data_max_max (default 4GB).
+ */
+ if (zfs_dirty_data_max == 0) {
+ zfs_dirty_data_max = ptob(physmem) *
+ zfs_dirty_data_max_percent / 100;
+ zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+ zfs_dirty_data_max_max);
+ }
#ifdef _KERNEL
if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
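The sizing rule introduced in arc_init() above is easy to check with concrete numbers. A worked sketch of the default policy (10% of physical memory, capped at 4 GB per the comment in the diff), using hypothetical machine sizes:

	/*
	 * Worked example of the zfs_dirty_data_max default sizing: 10% of
	 * physical memory, capped at zfs_dirty_data_max_max (4 GB default).
	 */
	#include <stdint.h>
	#include <stdio.h>

	#define GiB (1024ULL * 1024 * 1024)

	static uint64_t
	default_dirty_data_max(uint64_t physmem_bytes, uint64_t percent, uint64_t cap)
	{
		uint64_t v = physmem_bytes * percent / 100;
		return (v < cap ? v : cap);
	}

	int
	main(void)
	{
		/* 16 GiB machine: 10% = ~1.6 GiB, below the 4 GiB cap. */
		printf("%llu\n", (unsigned long long)
		    default_dirty_data_max(16 * GiB, 10, 4 * GiB));
		/* 64 GiB machine: 10% = ~6.4 GiB, clamped to the 4 GiB cap. */
		printf("%llu\n", (unsigned long long)
		    default_dirty_data_max(64 * GiB, 10, 4 * GiB));
		return (0);
	}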
@@ -4177,8 +4227,6 @@ arc_fini(void)
mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
}
- mutex_destroy(&zfs_write_limit_lock);
-
buf_fini();
ASSERT(arc_loaned_bytes == 0);