diff options
Diffstat (limited to 'sys/contrib/opensolaris/uts/common/fs/zfs/arc.c')
-rw-r--r-- | sys/contrib/opensolaris/uts/common/fs/zfs/arc.c | 2858 |
1 files changed, 2858 insertions, 0 deletions
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c new file mode 100644 index 0000000..89cbc1f --- /dev/null +++ b/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -0,0 +1,2858 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * DVA-based Adjustable Replacement Cache + * + * While much of the theory of operation used here is + * based on the self-tuning, low overhead replacement cache + * presented by Megiddo and Modha at FAST 2003, there are some + * significant differences: + * + * 1. The Megiddo and Modha model assumes any page is evictable. + * Pages in its cache cannot be "locked" into memory. This makes + * the eviction algorithm simple: evict the last page in the list. + * This also make the performance characteristics easy to reason + * about. Our cache is not so simple. At any given moment, some + * subset of the blocks in the cache are un-evictable because we + * have handed out a reference to them. Blocks are only evictable + * when there are no external references active. This makes + * eviction far more problematic: we choose to evict the evictable + * blocks that are the "lowest" in the list. + * + * There are times when it is not possible to evict the requested + * space. In these circumstances we are unable to adjust the cache + * size. To prevent the cache growing unbounded at these times we + * implement a "cache throttle" that slowes the flow of new data + * into the cache until we can make space avaiable. + * + * 2. The Megiddo and Modha model assumes a fixed cache size. + * Pages are evicted when the cache is full and there is a cache + * miss. Our model has a variable sized cache. It grows with + * high use, but also tries to react to memory preasure from the + * operating system: decreasing its size when system memory is + * tight. + * + * 3. The Megiddo and Modha model assumes a fixed page size. All + * elements of the cache are therefor exactly the same size. So + * when adjusting the cache size following a cache miss, its simply + * a matter of choosing a single page to evict. In our model, we + * have variable sized cache blocks (rangeing from 512 bytes to + * 128K bytes). We therefor choose a set of blocks to evict to make + * space for a cache miss that approximates as closely as possible + * the space used by the new block. + * + * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" + * by N. Megiddo & D. Modha, FAST 2003 + */ + +/* + * The locking model: + * + * A new reference to a cache buffer can be obtained in two + * ways: 1) via a hash table lookup using the DVA as a key, + * or 2) via one of the ARC lists. The arc_read() inerface + * uses method 1, while the internal arc algorithms for + * adjusting the cache use method 2. We therefor provide two + * types of locks: 1) the hash table lock array, and 2) the + * arc list locks. + * + * Buffers do not have their own mutexs, rather they rely on the + * hash table mutexs for the bulk of their protection (i.e. most + * fields in the arc_buf_hdr_t are protected by these mutexs). + * + * buf_hash_find() returns the appropriate mutex (held) when it + * locates the requested buffer in the hash table. It returns + * NULL for the mutex if the buffer was not in the table. + * + * buf_hash_remove() expects the appropriate hash mutex to be + * already held before it is invoked. + * + * Each arc state also has a mutex which is used to protect the + * buffer list associated with the state. When attempting to + * obtain a hash table lock while holding an arc list lock you + * must use: mutex_tryenter() to avoid deadlock. Also note that + * the active state mutex must be held before the ghost state mutex. + * + * Arc buffers may have an associated eviction callback function. + * This function will be invoked prior to removing the buffer (e.g. + * in arc_do_user_evicts()). Note however that the data associated + * with the buffer may be evicted prior to the callback. The callback + * must be made with *no locks held* (to prevent deadlock). Additionally, + * the users of callbacks must ensure that their private data is + * protected from simultaneous callbacks from arc_buf_evict() + * and arc_do_user_evicts(). + * + * Note that the majority of the performance stats are manipulated + * with atomic operations. + */ + +#include <sys/spa.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/zfs_context.h> +#include <sys/arc.h> +#include <sys/refcount.h> +#ifdef _KERNEL +#include <sys/dnlc.h> +#endif +#include <sys/callb.h> +#include <sys/kstat.h> +#include <sys/sdt.h> + +static kmutex_t arc_reclaim_thr_lock; +static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ +static uint8_t arc_thread_exit; + +#define ARC_REDUCE_DNLC_PERCENT 3 +uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; + +typedef enum arc_reclaim_strategy { + ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ + ARC_RECLAIM_CONS /* Conservative reclaim strategy */ +} arc_reclaim_strategy_t; + +/* number of seconds before growing cache again */ +static int arc_grow_retry = 60; + +/* + * minimum lifespan of a prefetch block in clock ticks + * (initialized in arc_init()) + */ +static int arc_min_prefetch_lifespan; + +static int arc_dead; + +/* + * These tunables are for performance analysis. + */ +u_long zfs_arc_max; +u_long zfs_arc_min; +TUNABLE_ULONG("vfs.zfs.arc_max", &zfs_arc_max); +TUNABLE_ULONG("vfs.zfs.arc_min", &zfs_arc_min); +SYSCTL_DECL(_vfs_zfs); +SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RD, &zfs_arc_max, 0, + "Maximum ARC size"); +SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RD, &zfs_arc_min, 0, + "Minimum ARC size"); + +/* + * Note that buffers can be on one of 5 states: + * ARC_anon - anonymous (discussed below) + * ARC_mru - recently used, currently cached + * ARC_mru_ghost - recentely used, no longer in cache + * ARC_mfu - frequently used, currently cached + * ARC_mfu_ghost - frequently used, no longer in cache + * When there are no active references to the buffer, they + * are linked onto one of the lists in arc. These are the + * only buffers that can be evicted or deleted. + * + * Anonymous buffers are buffers that are not associated with + * a DVA. These are buffers that hold dirty block copies + * before they are written to stable storage. By definition, + * they are "ref'd" and are considered part of arc_mru + * that cannot be freed. Generally, they will aquire a DVA + * as they are written and migrate onto the arc_mru list. + */ + +typedef struct arc_state { + list_t arcs_list; /* linked list of evictable buffer in state */ + uint64_t arcs_lsize; /* total size of buffers in the linked list */ + uint64_t arcs_size; /* total size of all buffers in this state */ + kmutex_t arcs_mtx; +} arc_state_t; + +/* The 5 states: */ +static arc_state_t ARC_anon; +static arc_state_t ARC_mru; +static arc_state_t ARC_mru_ghost; +static arc_state_t ARC_mfu; +static arc_state_t ARC_mfu_ghost; + +typedef struct arc_stats { + kstat_named_t arcstat_hits; + kstat_named_t arcstat_misses; + kstat_named_t arcstat_demand_data_hits; + kstat_named_t arcstat_demand_data_misses; + kstat_named_t arcstat_demand_metadata_hits; + kstat_named_t arcstat_demand_metadata_misses; + kstat_named_t arcstat_prefetch_data_hits; + kstat_named_t arcstat_prefetch_data_misses; + kstat_named_t arcstat_prefetch_metadata_hits; + kstat_named_t arcstat_prefetch_metadata_misses; + kstat_named_t arcstat_mru_hits; + kstat_named_t arcstat_mru_ghost_hits; + kstat_named_t arcstat_mfu_hits; + kstat_named_t arcstat_mfu_ghost_hits; + kstat_named_t arcstat_deleted; + kstat_named_t arcstat_recycle_miss; + kstat_named_t arcstat_mutex_miss; + kstat_named_t arcstat_evict_skip; + kstat_named_t arcstat_hash_elements; + kstat_named_t arcstat_hash_elements_max; + kstat_named_t arcstat_hash_collisions; + kstat_named_t arcstat_hash_chains; + kstat_named_t arcstat_hash_chain_max; + kstat_named_t arcstat_p; + kstat_named_t arcstat_c; + kstat_named_t arcstat_c_min; + kstat_named_t arcstat_c_max; + kstat_named_t arcstat_size; +} arc_stats_t; + +static arc_stats_t arc_stats = { + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "demand_data_hits", KSTAT_DATA_UINT64 }, + { "demand_data_misses", KSTAT_DATA_UINT64 }, + { "demand_metadata_hits", KSTAT_DATA_UINT64 }, + { "demand_metadata_misses", KSTAT_DATA_UINT64 }, + { "prefetch_data_hits", KSTAT_DATA_UINT64 }, + { "prefetch_data_misses", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, + { "mru_hits", KSTAT_DATA_UINT64 }, + { "mru_ghost_hits", KSTAT_DATA_UINT64 }, + { "mfu_hits", KSTAT_DATA_UINT64 }, + { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, + { "deleted", KSTAT_DATA_UINT64 }, + { "recycle_miss", KSTAT_DATA_UINT64 }, + { "mutex_miss", KSTAT_DATA_UINT64 }, + { "evict_skip", KSTAT_DATA_UINT64 }, + { "hash_elements", KSTAT_DATA_UINT64 }, + { "hash_elements_max", KSTAT_DATA_UINT64 }, + { "hash_collisions", KSTAT_DATA_UINT64 }, + { "hash_chains", KSTAT_DATA_UINT64 }, + { "hash_chain_max", KSTAT_DATA_UINT64 }, + { "p", KSTAT_DATA_UINT64 }, + { "c", KSTAT_DATA_UINT64 }, + { "c_min", KSTAT_DATA_UINT64 }, + { "c_max", KSTAT_DATA_UINT64 }, + { "size", KSTAT_DATA_UINT64 } +}; + +#define ARCSTAT(stat) (arc_stats.stat.value.ui64) + +#define ARCSTAT_INCR(stat, val) \ + atomic_add_64(&arc_stats.stat.value.ui64, (val)); + +#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) +#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) + +#define ARCSTAT_MAX(stat, val) { \ + uint64_t m; \ + while ((val) > (m = arc_stats.stat.value.ui64) && \ + (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ + continue; \ +} + +#define ARCSTAT_MAXSTAT(stat) \ + ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) + +/* + * We define a macro to allow ARC hits/misses to be easily broken down by + * two separate conditions, giving a total of four different subtypes for + * each of hits and misses (so eight statistics total). + */ +#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ + if (cond1) { \ + if (cond2) { \ + ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ + } else { \ + ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ + } \ + } else { \ + if (cond2) { \ + ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ + } else { \ + ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ + } \ + } + +kstat_t *arc_ksp; +static arc_state_t *arc_anon; +static arc_state_t *arc_mru; +static arc_state_t *arc_mru_ghost; +static arc_state_t *arc_mfu; +static arc_state_t *arc_mfu_ghost; + +/* + * There are several ARC variables that are critical to export as kstats -- + * but we don't want to have to grovel around in the kstat whenever we wish to + * manipulate them. For these variables, we therefore define them to be in + * terms of the statistic variable. This assures that we are not introducing + * the possibility of inconsistency by having shadow copies of the variables, + * while still allowing the code to be readable. + */ +#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ +#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ +#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ +#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ +#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ + +static int arc_no_grow; /* Don't try to grow cache size */ +static uint64_t arc_tempreserve; + +typedef struct arc_callback arc_callback_t; + +struct arc_callback { + void *acb_private; + arc_done_func_t *acb_done; + arc_byteswap_func_t *acb_byteswap; + arc_buf_t *acb_buf; + zio_t *acb_zio_dummy; + arc_callback_t *acb_next; +}; + +typedef struct arc_write_callback arc_write_callback_t; + +struct arc_write_callback { + void *awcb_private; + arc_done_func_t *awcb_ready; + arc_done_func_t *awcb_done; + arc_buf_t *awcb_buf; +}; + +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + uint64_t b_cksum0; + + kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; + + arc_buf_hdr_t *b_hash_next; + arc_buf_t *b_buf; + uint32_t b_flags; + uint32_t b_datacnt; + + arc_callback_t *b_acb; + kcondvar_t b_cv; + + /* immutable */ + arc_buf_contents_t b_type; + uint64_t b_size; + spa_t *b_spa; + + /* protected by arc state mutex */ + arc_state_t *b_state; + list_node_t b_arc_node; + + /* updated atomically */ + clock_t b_arc_access; + + /* self protecting */ + refcount_t b_refcnt; +}; + +static arc_buf_t *arc_eviction_list; +static kmutex_t arc_eviction_mtx; +static arc_buf_hdr_t arc_eviction_hdr; +static void arc_get_data_buf(arc_buf_t *buf); +static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); + +#define GHOST_STATE(state) \ + ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) + +/* + * Private ARC flags. These flags are private ARC only flags that will show up + * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can + * be passed in as arc_flags in things like arc_read. However, these flags + * should never be passed and should only be set by ARC code. When adding new + * public flags, make sure not to smash the private ones. + */ + +#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ +#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ +#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ +#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ +#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ +#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ + +#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) +#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) +#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) +#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) +#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) + +/* + * Hash table routines + */ + +#define HT_LOCK_PAD 128 + +struct ht_lock { + kmutex_t ht_lock; +#ifdef _KERNEL + unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; +#endif +}; + +#define BUF_LOCKS 256 +typedef struct buf_hash_table { + uint64_t ht_mask; + arc_buf_hdr_t **ht_table; + struct ht_lock ht_locks[BUF_LOCKS]; +} buf_hash_table_t; + +static buf_hash_table_t buf_hash_table; + +#define BUF_HASH_INDEX(spa, dva, birth) \ + (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) +#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) +#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) +#define HDR_LOCK(buf) \ + (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) + +uint64_t zfs_crc64_table[256]; + +static uint64_t +buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) +{ + uintptr_t spav = (uintptr_t)spa; + uint8_t *vdva = (uint8_t *)dva; + uint64_t crc = -1ULL; + int i; + + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + + for (i = 0; i < sizeof (dva_t); i++) + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; + + crc ^= (spav>>8) ^ birth; + + return (crc); +} + +#define BUF_EMPTY(buf) \ + ((buf)->b_dva.dva_word[0] == 0 && \ + (buf)->b_dva.dva_word[1] == 0 && \ + (buf)->b_birth == 0) + +#define BUF_EQUAL(spa, dva, birth, buf) \ + ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ + ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ + ((buf)->b_birth == birth) && ((buf)->b_spa == spa) + +static arc_buf_hdr_t * +buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) +{ + uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *buf; + + mutex_enter(hash_lock); + for (buf = buf_hash_table.ht_table[idx]; buf != NULL; + buf = buf->b_hash_next) { + if (BUF_EQUAL(spa, dva, birth, buf)) { + *lockp = hash_lock; + return (buf); + } + } + mutex_exit(hash_lock); + *lockp = NULL; + return (NULL); +} + +/* + * Insert an entry into the hash table. If there is already an element + * equal to elem in the hash table, then the already existing element + * will be returned and the new element will not be inserted. + * Otherwise returns NULL. + */ +static arc_buf_hdr_t * +buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) +{ + uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *fbuf; + uint32_t i; + + ASSERT(!HDR_IN_HASH_TABLE(buf)); + *lockp = hash_lock; + mutex_enter(hash_lock); + for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; + fbuf = fbuf->b_hash_next, i++) { + if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) + return (fbuf); + } + + buf->b_hash_next = buf_hash_table.ht_table[idx]; + buf_hash_table.ht_table[idx] = buf; + buf->b_flags |= ARC_IN_HASH_TABLE; + + /* collect some hash table performance data */ + if (i > 0) { + ARCSTAT_BUMP(arcstat_hash_collisions); + if (i == 1) + ARCSTAT_BUMP(arcstat_hash_chains); + + ARCSTAT_MAX(arcstat_hash_chain_max, i); + } + + ARCSTAT_BUMP(arcstat_hash_elements); + ARCSTAT_MAXSTAT(arcstat_hash_elements); + + return (NULL); +} + +static void +buf_hash_remove(arc_buf_hdr_t *buf) +{ + arc_buf_hdr_t *fbuf, **bufp; + uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + + ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); + ASSERT(HDR_IN_HASH_TABLE(buf)); + + bufp = &buf_hash_table.ht_table[idx]; + while ((fbuf = *bufp) != buf) { + ASSERT(fbuf != NULL); + bufp = &fbuf->b_hash_next; + } + *bufp = buf->b_hash_next; + buf->b_hash_next = NULL; + buf->b_flags &= ~ARC_IN_HASH_TABLE; + + /* collect some hash table performance data */ + ARCSTAT_BUMPDOWN(arcstat_hash_elements); + + if (buf_hash_table.ht_table[idx] && + buf_hash_table.ht_table[idx]->b_hash_next == NULL) + ARCSTAT_BUMPDOWN(arcstat_hash_chains); +} + +/* + * Global data structures and functions for the buf kmem cache. + */ +static kmem_cache_t *hdr_cache; +static kmem_cache_t *buf_cache; + +static void +buf_fini(void) +{ + int i; + + kmem_free(buf_hash_table.ht_table, + (buf_hash_table.ht_mask + 1) * sizeof (void *)); + for (i = 0; i < BUF_LOCKS; i++) + mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); + kmem_cache_destroy(hdr_cache); + kmem_cache_destroy(buf_cache); +} + +/* + * Constructor callback - called when the cache is empty + * and a new buf is requested. + */ +/* ARGSUSED */ +static int +hdr_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *buf = vbuf; + + bzero(buf, sizeof (arc_buf_hdr_t)); + refcount_create(&buf->b_refcnt); + cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); + return (0); +} + +/* + * Destructor callback - called when a cached buf is + * no longer required. + */ +/* ARGSUSED */ +static void +hdr_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *buf = vbuf; + + refcount_destroy(&buf->b_refcnt); + cv_destroy(&buf->b_cv); +} + +/* + * Reclaim callback -- invoked when memory is low. + */ +/* ARGSUSED */ +static void +hdr_recl(void *unused) +{ + dprintf("hdr_recl called\n"); + /* + * umem calls the reclaim func when we destroy the buf cache, + * which is after we do arc_fini(). + */ + if (!arc_dead) + cv_signal(&arc_reclaim_thr_cv); +} + +static void +buf_init(void) +{ + uint64_t *ct; + uint64_t hsize = 1ULL << 12; + int i, j; + + /* + * The hash table is big enough to fill all of physical memory + * with an average 64K block size. The table will take up + * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). + */ + while (hsize * 65536 < (uint64_t)physmem * PAGESIZE) + hsize <<= 1; +retry: + buf_hash_table.ht_mask = hsize - 1; + buf_hash_table.ht_table = + kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); + if (buf_hash_table.ht_table == NULL) { + ASSERT(hsize > (1ULL << 8)); + hsize >>= 1; + goto retry; + } + + hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), + 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); + buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + + for (i = 0; i < 256; i++) + for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) + *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); + + for (i = 0; i < BUF_LOCKS; i++) { + mutex_init(&buf_hash_table.ht_locks[i].ht_lock, + NULL, MUTEX_DEFAULT, NULL); + } +} + +#define ARC_MINTIME (hz>>4) /* 62 ms */ + +static void +arc_cksum_verify(arc_buf_t *buf) +{ + zio_cksum_t zc; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum == NULL || + (buf->b_hdr->b_flags & ARC_IO_ERROR)) { + mutex_exit(&buf->b_hdr->b_freeze_lock); + return; + } + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) + panic("buffer modified while frozen!"); + mutex_exit(&buf->b_hdr->b_freeze_lock); +} + +static void +arc_cksum_compute(arc_buf_t *buf) +{ + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum != NULL) { + mutex_exit(&buf->b_hdr->b_freeze_lock); + return; + } + buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, + buf->b_hdr->b_freeze_cksum); + mutex_exit(&buf->b_hdr->b_freeze_lock); +} + +void +arc_buf_thaw(arc_buf_t *buf) +{ + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + if (buf->b_hdr->b_state != arc_anon) + panic("modifying non-anon buffer!"); + if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) + panic("modifying buffer while i/o in progress!"); + arc_cksum_verify(buf); + mutex_enter(&buf->b_hdr->b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum != NULL) { + kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + buf->b_hdr->b_freeze_cksum = NULL; + } + mutex_exit(&buf->b_hdr->b_freeze_lock); +} + +void +arc_buf_freeze(arc_buf_t *buf) +{ + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + ASSERT(buf->b_hdr->b_freeze_cksum != NULL || + buf->b_hdr->b_state == arc_anon); + arc_cksum_compute(buf); +} + +static void +add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +{ + ASSERT(MUTEX_HELD(hash_lock)); + + if ((refcount_add(&ab->b_refcnt, tag) == 1) && + (ab->b_state != arc_anon)) { + uint64_t delta = ab->b_size * ab->b_datacnt; + + ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); + mutex_enter(&ab->b_state->arcs_mtx); + ASSERT(list_link_active(&ab->b_arc_node)); + list_remove(&ab->b_state->arcs_list, ab); + if (GHOST_STATE(ab->b_state)) { + ASSERT3U(ab->b_datacnt, ==, 0); + ASSERT3P(ab->b_buf, ==, NULL); + delta = ab->b_size; + } + ASSERT(delta > 0); + ASSERT3U(ab->b_state->arcs_lsize, >=, delta); + atomic_add_64(&ab->b_state->arcs_lsize, -delta); + mutex_exit(&ab->b_state->arcs_mtx); + /* remove the prefetch flag is we get a reference */ + if (ab->b_flags & ARC_PREFETCH) + ab->b_flags &= ~ARC_PREFETCH; + } +} + +static int +remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +{ + int cnt; + arc_state_t *state = ab->b_state; + + ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); + ASSERT(!GHOST_STATE(state)); + + if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && + (state != arc_anon)) { + ASSERT(!MUTEX_HELD(&state->arcs_mtx)); + mutex_enter(&state->arcs_mtx); + ASSERT(!list_link_active(&ab->b_arc_node)); + list_insert_head(&state->arcs_list, ab); + ASSERT(ab->b_datacnt > 0); + atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt); + ASSERT3U(state->arcs_size, >=, state->arcs_lsize); + mutex_exit(&state->arcs_mtx); + } + return (cnt); +} + +/* + * Move the supplied buffer to the indicated state. The mutex + * for the buffer must be held by the caller. + */ +static void +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) +{ + arc_state_t *old_state = ab->b_state; + int64_t refcnt = refcount_count(&ab->b_refcnt); + uint64_t from_delta, to_delta; + + ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(new_state != old_state); + ASSERT(refcnt == 0 || ab->b_datacnt > 0); + ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); + + from_delta = to_delta = ab->b_datacnt * ab->b_size; + + /* + * If this buffer is evictable, transfer it from the + * old state list to the new state list. + */ + if (refcnt == 0) { + if (old_state != arc_anon) { + int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); + + if (use_mutex) + mutex_enter(&old_state->arcs_mtx); + + ASSERT(list_link_active(&ab->b_arc_node)); + list_remove(&old_state->arcs_list, ab); + + /* + * If prefetching out of the ghost cache, + * we will have a non-null datacnt. + */ + if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { + /* ghost elements have a ghost size */ + ASSERT(ab->b_buf == NULL); + from_delta = ab->b_size; + } + ASSERT3U(old_state->arcs_lsize, >=, from_delta); + atomic_add_64(&old_state->arcs_lsize, -from_delta); + + if (use_mutex) + mutex_exit(&old_state->arcs_mtx); + } + if (new_state != arc_anon) { + int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); + + if (use_mutex) + mutex_enter(&new_state->arcs_mtx); + + list_insert_head(&new_state->arcs_list, ab); + + /* ghost elements have a ghost size */ + if (GHOST_STATE(new_state)) { + ASSERT(ab->b_datacnt == 0); + ASSERT(ab->b_buf == NULL); + to_delta = ab->b_size; + } + atomic_add_64(&new_state->arcs_lsize, to_delta); + ASSERT3U(new_state->arcs_size + to_delta, >=, + new_state->arcs_lsize); + + if (use_mutex) + mutex_exit(&new_state->arcs_mtx); + } + } + + ASSERT(!BUF_EMPTY(ab)); + if (new_state == arc_anon && old_state != arc_anon) { + buf_hash_remove(ab); + } + + /* adjust state sizes */ + if (to_delta) + atomic_add_64(&new_state->arcs_size, to_delta); + if (from_delta) { + ASSERT3U(old_state->arcs_size, >=, from_delta); + atomic_add_64(&old_state->arcs_size, -from_delta); + } + ab->b_state = new_state; +} + +arc_buf_t * +arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + + ASSERT3U(size, >, 0); + hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); + ASSERT(BUF_EMPTY(hdr)); + hdr->b_size = size; + hdr->b_type = type; + hdr->b_spa = spa; + hdr->b_state = arc_anon; + hdr->b_arc_access = 0; + mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_next = NULL; + hdr->b_buf = buf; + arc_get_data_buf(buf); + hdr->b_datacnt = 1; + hdr->b_flags = 0; + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + (void) refcount_add(&hdr->b_refcnt, tag); + + return (buf); +} + +static arc_buf_t * +arc_buf_clone(arc_buf_t *from) +{ + arc_buf_t *buf; + arc_buf_hdr_t *hdr = from->b_hdr; + uint64_t size = hdr->b_size; + + buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_next = hdr->b_buf; + hdr->b_buf = buf; + arc_get_data_buf(buf); + bcopy(from->b_data, buf->b_data, size); + hdr->b_datacnt += 1; + return (buf); +} + +void +arc_buf_add_ref(arc_buf_t *buf, void* tag) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + + /* + * Check to see if this buffer is currently being evicted via + * arc_do_user_evicts(). + */ + mutex_enter(&arc_eviction_mtx); + hdr = buf->b_hdr; + if (hdr == NULL) { + mutex_exit(&arc_eviction_mtx); + return; + } + hash_lock = HDR_LOCK(hdr); + mutex_exit(&arc_eviction_mtx); + + mutex_enter(hash_lock); + if (buf->b_data == NULL) { + /* + * This buffer is evicted. + */ + mutex_exit(hash_lock); + return; + } + + ASSERT(buf->b_hdr == hdr); + ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + add_reference(hdr, hash_lock, tag); + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_hits); + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + data, metadata, hits); +} + +static void +arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) +{ + arc_buf_t **bufp; + + /* free up data associated with the buf */ + if (buf->b_data) { + arc_state_t *state = buf->b_hdr->b_state; + uint64_t size = buf->b_hdr->b_size; + arc_buf_contents_t type = buf->b_hdr->b_type; + + arc_cksum_verify(buf); + if (!recycle) { + if (type == ARC_BUFC_METADATA) { + zio_buf_free(buf->b_data, size); + } else { + ASSERT(type == ARC_BUFC_DATA); + zio_data_buf_free(buf->b_data, size); + } + atomic_add_64(&arc_size, -size); + } + if (list_link_active(&buf->b_hdr->b_arc_node)) { + ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); + ASSERT(state != arc_anon); + ASSERT3U(state->arcs_lsize, >=, size); + atomic_add_64(&state->arcs_lsize, -size); + } + ASSERT3U(state->arcs_size, >=, size); + atomic_add_64(&state->arcs_size, -size); + buf->b_data = NULL; + ASSERT(buf->b_hdr->b_datacnt > 0); + buf->b_hdr->b_datacnt -= 1; + } + + /* only remove the buf if requested */ + if (!all) + return; + + /* remove the buf from the hdr list */ + for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) + continue; + *bufp = buf->b_next; + + ASSERT(buf->b_efunc == NULL); + + /* clean up the buf */ + buf->b_hdr = NULL; + kmem_cache_free(buf_cache, buf); +} + +static void +arc_hdr_destroy(arc_buf_hdr_t *hdr) +{ + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + ASSERT3P(hdr->b_state, ==, arc_anon); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + + if (!BUF_EMPTY(hdr)) { + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + bzero(&hdr->b_dva, sizeof (dva_t)); + hdr->b_birth = 0; + hdr->b_cksum0 = 0; + } + while (hdr->b_buf) { + arc_buf_t *buf = hdr->b_buf; + + if (buf->b_efunc) { + mutex_enter(&arc_eviction_mtx); + ASSERT(buf->b_hdr != NULL); + arc_buf_destroy(hdr->b_buf, FALSE, FALSE); + hdr->b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + mutex_exit(&arc_eviction_mtx); + } else { + arc_buf_destroy(hdr->b_buf, FALSE, TRUE); + } + } + if (hdr->b_freeze_cksum != NULL) { + kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_freeze_cksum = NULL; + } + mutex_destroy(&hdr->b_freeze_lock); + + ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT3P(hdr->b_hash_next, ==, NULL); + ASSERT3P(hdr->b_acb, ==, NULL); + kmem_cache_free(hdr_cache, hdr); +} + +void +arc_buf_free(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + int hashed = hdr->b_state != arc_anon; + + ASSERT(buf->b_efunc == NULL); + ASSERT(buf->b_data != NULL); + + if (hashed) { + kmutex_t *hash_lock = HDR_LOCK(hdr); + + mutex_enter(hash_lock); + (void) remove_reference(hdr, hash_lock, tag); + if (hdr->b_datacnt > 1) + arc_buf_destroy(buf, FALSE, TRUE); + else + hdr->b_flags |= ARC_BUF_AVAILABLE; + mutex_exit(hash_lock); + } else if (HDR_IO_IN_PROGRESS(hdr)) { + int destroy_hdr; + /* + * We are in the middle of an async write. Don't destroy + * this buffer unless the write completes before we finish + * decrementing the reference count. + */ + mutex_enter(&arc_eviction_mtx); + (void) remove_reference(hdr, NULL, tag); + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); + mutex_exit(&arc_eviction_mtx); + if (destroy_hdr) + arc_hdr_destroy(hdr); + } else { + if (remove_reference(hdr, NULL, tag) > 0) { + ASSERT(HDR_IO_ERROR(hdr)); + arc_buf_destroy(buf, FALSE, TRUE); + } else { + arc_hdr_destroy(hdr); + } + } +} + +int +arc_buf_remove_ref(arc_buf_t *buf, void* tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + kmutex_t *hash_lock = HDR_LOCK(hdr); + int no_callback = (buf->b_efunc == NULL); + + if (hdr->b_state == arc_anon) { + arc_buf_free(buf, tag); + return (no_callback); + } + + mutex_enter(hash_lock); + ASSERT(hdr->b_state != arc_anon); + ASSERT(buf->b_data != NULL); + + (void) remove_reference(hdr, hash_lock, tag); + if (hdr->b_datacnt > 1) { + if (no_callback) + arc_buf_destroy(buf, FALSE, TRUE); + } else if (no_callback) { + ASSERT(hdr->b_buf == buf && buf->b_next == NULL); + hdr->b_flags |= ARC_BUF_AVAILABLE; + } + ASSERT(no_callback || hdr->b_datacnt > 1 || + refcount_is_zero(&hdr->b_refcnt)); + mutex_exit(hash_lock); + return (no_callback); +} + +int +arc_buf_size(arc_buf_t *buf) +{ + return (buf->b_hdr->b_size); +} + +/* + * Evict buffers from list until we've removed the specified number of + * bytes. Move the removed buffers to the appropriate evict state. + * If the recycle flag is set, then attempt to "recycle" a buffer: + * - look for a buffer to evict that is `bytes' long. + * - return the data block from this buffer rather than freeing it. + * This flag is used by callers that are trying to make space for a + * new buffer in a full arc cache. + */ +static void * +arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, + arc_buf_contents_t type) +{ + arc_state_t *evicted_state; + uint64_t bytes_evicted = 0, skipped = 0, missed = 0; + arc_buf_hdr_t *ab, *ab_prev = NULL; + kmutex_t *hash_lock; + boolean_t have_lock; + void *stolen = NULL; + + ASSERT(state == arc_mru || state == arc_mfu); + + evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; + + mutex_enter(&state->arcs_mtx); + mutex_enter(&evicted_state->arcs_mtx); + + for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { + ab_prev = list_prev(&state->arcs_list, ab); + /* prefetch buffers have a minimum lifespan */ + if (HDR_IO_IN_PROGRESS(ab) || + (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && + lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { + skipped++; + continue; + } + /* "lookahead" for better eviction candidate */ + if (recycle && ab->b_size != bytes && + ab_prev && ab_prev->b_size == bytes) + continue; + hash_lock = HDR_LOCK(ab); + have_lock = MUTEX_HELD(hash_lock); + if (have_lock || mutex_tryenter(hash_lock)) { + ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); + ASSERT(ab->b_datacnt > 0); + while (ab->b_buf) { + arc_buf_t *buf = ab->b_buf; + if (buf->b_data) { + bytes_evicted += ab->b_size; + if (recycle && ab->b_type == type && + ab->b_size == bytes) { + stolen = buf->b_data; + recycle = FALSE; + } + } + if (buf->b_efunc) { + mutex_enter(&arc_eviction_mtx); + arc_buf_destroy(buf, + buf->b_data == stolen, FALSE); + ab->b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + mutex_exit(&arc_eviction_mtx); + } else { + arc_buf_destroy(buf, + buf->b_data == stolen, TRUE); + } + } + ASSERT(ab->b_datacnt == 0); + arc_change_state(evicted_state, ab, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(ab)); + ab->b_flags = ARC_IN_HASH_TABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + if (!have_lock) + mutex_exit(hash_lock); + if (bytes >= 0 && bytes_evicted >= bytes) + break; + } else { + missed += 1; + } + } + + mutex_exit(&evicted_state->arcs_mtx); + mutex_exit(&state->arcs_mtx); + + if (bytes_evicted < bytes) + dprintf("only evicted %lld bytes from %x", + (longlong_t)bytes_evicted, state); + + if (skipped) + ARCSTAT_INCR(arcstat_evict_skip, skipped); + + if (missed) + ARCSTAT_INCR(arcstat_mutex_miss, missed); + + return (stolen); +} + +/* + * Remove buffers from list until we've removed the specified number of + * bytes. Destroy the buffers that are removed. + */ +static void +arc_evict_ghost(arc_state_t *state, int64_t bytes) +{ + arc_buf_hdr_t *ab, *ab_prev; + kmutex_t *hash_lock; + uint64_t bytes_deleted = 0; + uint64_t bufs_skipped = 0; + + ASSERT(GHOST_STATE(state)); +top: + mutex_enter(&state->arcs_mtx); + for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { + ab_prev = list_prev(&state->arcs_list, ab); + hash_lock = HDR_LOCK(ab); + if (mutex_tryenter(hash_lock)) { + ASSERT(!HDR_IO_IN_PROGRESS(ab)); + ASSERT(ab->b_buf == NULL); + arc_change_state(arc_anon, ab, hash_lock); + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_deleted); + bytes_deleted += ab->b_size; + arc_hdr_destroy(ab); + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); + if (bytes >= 0 && bytes_deleted >= bytes) + break; + } else { + if (bytes < 0) { + mutex_exit(&state->arcs_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; + } + bufs_skipped += 1; + } + } + mutex_exit(&state->arcs_mtx); + + if (bufs_skipped) { + ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); + ASSERT(bytes >= 0); + } + + if (bytes_deleted < bytes) + dprintf("only deleted %lld bytes from %p", + (longlong_t)bytes_deleted, state); +} + +static void +arc_adjust(void) +{ + int64_t top_sz, mru_over, arc_over, todelete; + + top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + + if (top_sz > arc_p && arc_mru->arcs_lsize > 0) { + int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p); + (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF); + top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + } + + mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; + + if (mru_over > 0) { + if (arc_mru_ghost->arcs_lsize > 0) { + todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over); + arc_evict_ghost(arc_mru_ghost, todelete); + } + } + + if ((arc_over = arc_size - arc_c) > 0) { + int64_t tbl_over; + + if (arc_mfu->arcs_lsize > 0) { + int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over); + (void) arc_evict(arc_mfu, toevict, FALSE, + ARC_BUFC_UNDEF); + } + + tbl_over = arc_size + arc_mru_ghost->arcs_lsize + + arc_mfu_ghost->arcs_lsize - arc_c*2; + + if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) { + todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over); + arc_evict_ghost(arc_mfu_ghost, todelete); + } + } +} + +static void +arc_do_user_evicts(void) +{ + mutex_enter(&arc_eviction_mtx); + while (arc_eviction_list != NULL) { + arc_buf_t *buf = arc_eviction_list; + arc_eviction_list = buf->b_next; + buf->b_hdr = NULL; + mutex_exit(&arc_eviction_mtx); + + if (buf->b_efunc != NULL) + VERIFY(buf->b_efunc(buf) == 0); + + buf->b_efunc = NULL; + buf->b_private = NULL; + kmem_cache_free(buf_cache, buf); + mutex_enter(&arc_eviction_mtx); + } + mutex_exit(&arc_eviction_mtx); +} + +/* + * Flush all *evictable* data from the cache. + * NOTE: this will not touch "active" (i.e. referenced) data. + */ +void +arc_flush(void) +{ + while (list_head(&arc_mru->arcs_list)) + (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF); + while (list_head(&arc_mfu->arcs_list)) + (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF); + + arc_evict_ghost(arc_mru_ghost, -1); + arc_evict_ghost(arc_mfu_ghost, -1); + + mutex_enter(&arc_reclaim_thr_lock); + arc_do_user_evicts(); + mutex_exit(&arc_reclaim_thr_lock); + ASSERT(arc_eviction_list == NULL); +} + +int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ + +void +arc_shrink(void) +{ + if (arc_c > arc_c_min) { + uint64_t to_free; + +#ifdef _KERNEL + to_free = arc_c >> arc_shrink_shift; +#else + to_free = arc_c >> arc_shrink_shift; +#endif + if (arc_c > arc_c_min + to_free) + atomic_add_64(&arc_c, -to_free); + else + arc_c = arc_c_min; + + atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); + if (arc_c > arc_size) + arc_c = MAX(arc_size, arc_c_min); + if (arc_p > arc_c) + arc_p = (arc_c >> 1); + ASSERT(arc_c >= arc_c_min); + ASSERT((int64_t)arc_p >= 0); + } + + if (arc_size > arc_c) + arc_adjust(); +} + +static int zfs_needfree = 0; + +static int +arc_reclaim_needed(void) +{ +#if 0 + uint64_t extra; +#endif + +#ifdef _KERNEL + + if (zfs_needfree) + return (1); + +#if 0 + /* + * check to make sure that swapfs has enough space so that anon + * reservations can still succeeed. anon_resvmem() checks that the + * availrmem is greater than swapfs_minfree, and the number of reserved + * swap pages. We also add a bit of extra here just to prevent + * circumstances from getting really dire. + */ + if (availrmem < swapfs_minfree + swapfs_reserve + extra) + return (1); + + /* + * If zio data pages are being allocated out of a separate heap segment, + * then check that the size of available vmem for this area remains + * above 1/4th free. This needs to be done when the size of the + * non-default segment is smaller than physical memory, so we could + * conceivably run out of VA in that segment before running out of + * physical memory. + */ + if (zio_arena != NULL) { + size_t arc_ziosize = + btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC)); + + if ((physmem > arc_ziosize) && + (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2)) + return (1); + } + +#if defined(__i386) + /* + * If we're on an i386 platform, it's possible that we'll exhaust the + * kernel heap space before we ever run out of available physical + * memory. Most checks of the size of the heap_area compare against + * tune.t_minarmem, which is the minimum available real memory that we + * can have in the system. However, this is generally fixed at 25 pages + * which is so low that it's useless. In this comparison, we seek to + * calculate the total heap-size, and reclaim if more than 3/4ths of the + * heap is allocated. (Or, in the caclulation, if less than 1/4th is + * free) + */ + if (btop(vmem_size(heap_arena, VMEM_FREE)) < + (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) + return (1); +#endif +#else + if (kmem_used() > kmem_size() / 2) + return (1); +#endif + +#else + if (spa_get_random(100) == 0) + return (1); +#endif + return (0); +} + +static void +arc_kmem_reap_now(arc_reclaim_strategy_t strat) +{ +#ifdef ZIO_USE_UMA + size_t i; + kmem_cache_t *prev_cache = NULL; + kmem_cache_t *prev_data_cache = NULL; + extern kmem_cache_t *zio_buf_cache[]; + extern kmem_cache_t *zio_data_buf_cache[]; +#endif + +#ifdef _KERNEL + /* + * First purge some DNLC entries, in case the DNLC is using + * up too much memory. + */ + dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); + +#if defined(__i386) + /* + * Reclaim unused memory from all kmem caches. + */ + kmem_reap(); +#endif +#endif + + /* + * An agressive reclamation will shrink the cache size as well as + * reap free buffers from the arc kmem caches. + */ + if (strat == ARC_RECLAIM_AGGR) + arc_shrink(); + +#ifdef ZIO_USE_UMA + for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { + if (zio_buf_cache[i] != prev_cache) { + prev_cache = zio_buf_cache[i]; + kmem_cache_reap_now(zio_buf_cache[i]); + } + if (zio_data_buf_cache[i] != prev_data_cache) { + prev_data_cache = zio_data_buf_cache[i]; + kmem_cache_reap_now(zio_data_buf_cache[i]); + } + } +#endif + kmem_cache_reap_now(buf_cache); + kmem_cache_reap_now(hdr_cache); +} + +static void +arc_reclaim_thread(void *dummy __unused) +{ + clock_t growtime = 0; + arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); + + mutex_enter(&arc_reclaim_thr_lock); + while (arc_thread_exit == 0) { + if (arc_reclaim_needed()) { + + if (arc_no_grow) { + if (last_reclaim == ARC_RECLAIM_CONS) { + last_reclaim = ARC_RECLAIM_AGGR; + } else { + last_reclaim = ARC_RECLAIM_CONS; + } + } else { + arc_no_grow = TRUE; + last_reclaim = ARC_RECLAIM_AGGR; + membar_producer(); + } + + /* reset the growth delay for every reclaim */ + growtime = lbolt + (arc_grow_retry * hz); + ASSERT(growtime > 0); + + if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) { + /* + * If zfs_needfree is TRUE our vm_lowmem hook + * was called and in that case we must free some + * memory, so switch to aggressive mode. + */ + arc_no_grow = TRUE; + last_reclaim = ARC_RECLAIM_AGGR; + } + arc_kmem_reap_now(last_reclaim); + } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) { + arc_no_grow = FALSE; + } + + if (zfs_needfree || + (2 * arc_c < arc_size + + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)) + arc_adjust(); + + if (arc_eviction_list != NULL) + arc_do_user_evicts(); + + if (arc_reclaim_needed()) { + zfs_needfree = 0; +#ifdef _KERNEL + wakeup(&zfs_needfree); +#endif + } + + /* block until needed, or one second, whichever is shorter */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait(&arc_reclaim_thr_cv, + &arc_reclaim_thr_lock, hz); + CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); + } + + arc_thread_exit = 0; + cv_broadcast(&arc_reclaim_thr_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ + thread_exit(); +} + +/* + * Adapt arc info given the number of bytes we are trying to add and + * the state that we are comming from. This function is only called + * when we are adding new content to the cache. + */ +static void +arc_adapt(int bytes, arc_state_t *state) +{ + int mult; + + ASSERT(bytes > 0); + /* + * Adapt the target size of the MRU list: + * - if we just hit in the MRU ghost list, then increase + * the target size of the MRU list. + * - if we just hit in the MFU ghost list, then increase + * the target size of the MFU list by decreasing the + * target size of the MRU list. + */ + if (state == arc_mru_ghost) { + mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? + 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); + + arc_p = MIN(arc_c, arc_p + bytes * mult); + } else if (state == arc_mfu_ghost) { + mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? + 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); + + arc_p = MAX(0, (int64_t)arc_p - bytes * mult); + } + ASSERT((int64_t)arc_p >= 0); + + if (arc_reclaim_needed()) { + cv_signal(&arc_reclaim_thr_cv); + return; + } + + if (arc_no_grow) + return; + + if (arc_c >= arc_c_max) + return; + + /* + * If we're within (2 * maxblocksize) bytes of the target + * cache size, increment the target cache size + */ + if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { + atomic_add_64(&arc_c, (int64_t)bytes); + if (arc_c > arc_c_max) + arc_c = arc_c_max; + else if (state == arc_anon) + atomic_add_64(&arc_p, (int64_t)bytes); + if (arc_p > arc_c) + arc_p = arc_c; + } + ASSERT((int64_t)arc_p >= 0); +} + +/* + * Check if the cache has reached its limits and eviction is required + * prior to insert. + */ +static int +arc_evict_needed() +{ + if (arc_reclaim_needed()) + return (1); + + return (arc_size > arc_c); +} + +/* + * The buffer, supplied as the first argument, needs a data block. + * So, if we are at cache max, determine which cache should be victimized. + * We have the following cases: + * + * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> + * In this situation if we're out of space, but the resident size of the MFU is + * under the limit, victimize the MFU cache to satisfy this insertion request. + * + * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> + * Here, we've used up all of the available space for the MRU, so we need to + * evict from our own cache instead. Evict from the set of resident MRU + * entries. + * + * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> + * c minus p represents the MFU space in the cache, since p is the size of the + * cache that is dedicated to the MRU. In this situation there's still space on + * the MFU side, so the MRU side needs to be victimized. + * + * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> + * MFU's resident set is consuming more space than it has been allotted. In + * this situation, we must victimize our own cache, the MFU, for this insertion. + */ +static void +arc_get_data_buf(arc_buf_t *buf) +{ + arc_state_t *state = buf->b_hdr->b_state; + uint64_t size = buf->b_hdr->b_size; + arc_buf_contents_t type = buf->b_hdr->b_type; + + arc_adapt(size, state); + + /* + * We have not yet reached cache maximum size, + * just allocate a new buffer. + */ + if (!arc_evict_needed()) { + if (type == ARC_BUFC_METADATA) { + buf->b_data = zio_buf_alloc(size); + } else { + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = zio_data_buf_alloc(size); + } + atomic_add_64(&arc_size, size); + goto out; + } + + /* + * If we are prefetching from the mfu ghost list, this buffer + * will end up on the mru list; so steal space from there. + */ + if (state == arc_mfu_ghost) + state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; + else if (state == arc_mru_ghost) + state = arc_mru; + + if (state == arc_mru || state == arc_anon) { + uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; + state = (arc_p > mru_used) ? arc_mfu : arc_mru; + } else { + /* MFU cases */ + uint64_t mfu_space = arc_c - arc_p; + state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; + } + if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { + if (type == ARC_BUFC_METADATA) { + buf->b_data = zio_buf_alloc(size); + } else { + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = zio_data_buf_alloc(size); + } + atomic_add_64(&arc_size, size); + ARCSTAT_BUMP(arcstat_recycle_miss); + } + ASSERT(buf->b_data != NULL); +out: + /* + * Update the state size. Note that ghost states have a + * "ghost size" and so don't need to be updated. + */ + if (!GHOST_STATE(buf->b_hdr->b_state)) { + arc_buf_hdr_t *hdr = buf->b_hdr; + + atomic_add_64(&hdr->b_state->arcs_size, size); + if (list_link_active(&hdr->b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + atomic_add_64(&hdr->b_state->arcs_lsize, size); + } + /* + * If we are growing the cache, and we are adding anonymous + * data, and we have outgrown arc_p, update arc_p + */ + if (arc_size < arc_c && hdr->b_state == arc_anon && + arc_anon->arcs_size + arc_mru->arcs_size > arc_p) + arc_p = MIN(arc_c, arc_p + size); + } +} + +/* + * This routine is called whenever a buffer is accessed. + * NOTE: the hash lock is dropped in this function. + */ +static void +arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) +{ + ASSERT(MUTEX_HELD(hash_lock)); + + if (buf->b_state == arc_anon) { + /* + * This buffer is not in the cache, and does not + * appear in our "ghost" list. Add the new buffer + * to the MRU state. + */ + + ASSERT(buf->b_arc_access == 0); + buf->b_arc_access = lbolt; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); + arc_change_state(arc_mru, buf, hash_lock); + + } else if (buf->b_state == arc_mru) { + /* + * If this buffer is here because of a prefetch, then either: + * - clear the flag if this is a "referencing" read + * (any subsequent access will bump this into the MFU state). + * or + * - move the buffer to the head of the list if this is + * another prefetch (to make it less likely to be evicted). + */ + if ((buf->b_flags & ARC_PREFETCH) != 0) { + if (refcount_count(&buf->b_refcnt) == 0) { + ASSERT(list_link_active(&buf->b_arc_node)); + mutex_enter(&arc_mru->arcs_mtx); + list_remove(&arc_mru->arcs_list, buf); + list_insert_head(&arc_mru->arcs_list, buf); + mutex_exit(&arc_mru->arcs_mtx); + } else { + buf->b_flags &= ~ARC_PREFETCH; + ARCSTAT_BUMP(arcstat_mru_hits); + } + buf->b_arc_access = lbolt; + return; + } + + /* + * This buffer has been "accessed" only once so far, + * but it is still in the cache. Move it to the MFU + * state. + */ + if (lbolt > buf->b_arc_access + ARC_MINTIME) { + /* + * More than 125ms have passed since we + * instantiated this buffer. Move it to the + * most frequently used state. + */ + buf->b_arc_access = lbolt; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(arc_mfu, buf, hash_lock); + } + ARCSTAT_BUMP(arcstat_mru_hits); + } else if (buf->b_state == arc_mru_ghost) { + arc_state_t *new_state; + /* + * This buffer has been "accessed" recently, but + * was evicted from the cache. Move it to the + * MFU state. + */ + + if (buf->b_flags & ARC_PREFETCH) { + new_state = arc_mru; + if (refcount_count(&buf->b_refcnt) > 0) + buf->b_flags &= ~ARC_PREFETCH; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); + } else { + new_state = arc_mfu; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + } + + buf->b_arc_access = lbolt; + arc_change_state(new_state, buf, hash_lock); + + ARCSTAT_BUMP(arcstat_mru_ghost_hits); + } else if (buf->b_state == arc_mfu) { + /* + * This buffer has been accessed more than once and is + * still in the cache. Keep it in the MFU state. + * + * NOTE: an add_reference() that occurred when we did + * the arc_read() will have kicked this off the list. + * If it was a prefetch, we will explicitly move it to + * the head of the list now. + */ + if ((buf->b_flags & ARC_PREFETCH) != 0) { + ASSERT(refcount_count(&buf->b_refcnt) == 0); + ASSERT(list_link_active(&buf->b_arc_node)); + mutex_enter(&arc_mfu->arcs_mtx); + list_remove(&arc_mfu->arcs_list, buf); + list_insert_head(&arc_mfu->arcs_list, buf); + mutex_exit(&arc_mfu->arcs_mtx); + } + ARCSTAT_BUMP(arcstat_mfu_hits); + buf->b_arc_access = lbolt; + } else if (buf->b_state == arc_mfu_ghost) { + arc_state_t *new_state = arc_mfu; + /* + * This buffer has been accessed more than once but has + * been evicted from the cache. Move it back to the + * MFU state. + */ + + if (buf->b_flags & ARC_PREFETCH) { + /* + * This is a prefetch access... + * move this block back to the MRU state. + */ + ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); + new_state = arc_mru; + } + + buf->b_arc_access = lbolt; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(new_state, buf, hash_lock); + + ARCSTAT_BUMP(arcstat_mfu_ghost_hits); + } else { + ASSERT(!"invalid arc state"); + } +} + +/* a generic arc_done_func_t which you can use */ +/* ARGSUSED */ +void +arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) +{ + bcopy(buf->b_data, arg, buf->b_hdr->b_size); + VERIFY(arc_buf_remove_ref(buf, arg) == 1); +} + +/* a generic arc_done_func_t which you can use */ +void +arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) +{ + arc_buf_t **bufp = arg; + if (zio && zio->io_error) { + VERIFY(arc_buf_remove_ref(buf, arg) == 1); + *bufp = NULL; + } else { + *bufp = buf; + } +} + +static void +arc_read_done(zio_t *zio) +{ + arc_buf_hdr_t *hdr, *found; + arc_buf_t *buf; + arc_buf_t *abuf; /* buffer we're assigning to callback */ + kmutex_t *hash_lock; + arc_callback_t *callback_list, *acb; + int freeable = FALSE; + + buf = zio->io_private; + hdr = buf->b_hdr; + + /* + * The hdr was inserted into hash-table and removed from lists + * prior to starting I/O. We should find this header, since + * it's in the hash table, and it should be legit since it's + * not possible to evict it during the I/O. The only possible + * reason for it not to be found is if we were freed during the + * read. + */ + found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, + &hash_lock); + + ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || + (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); + + /* byteswap if necessary */ + callback_list = hdr->b_acb; + ASSERT(callback_list != NULL); + if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) + callback_list->acb_byteswap(buf->b_data, hdr->b_size); + + arc_cksum_compute(buf); + + /* create copies of the data buffer for the callers */ + abuf = buf; + for (acb = callback_list; acb; acb = acb->acb_next) { + if (acb->acb_done) { + if (abuf == NULL) + abuf = arc_buf_clone(buf); + acb->acb_buf = abuf; + abuf = NULL; + } + } + hdr->b_acb = NULL; + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + ASSERT(!HDR_BUF_AVAILABLE(hdr)); + if (abuf == buf) + hdr->b_flags |= ARC_BUF_AVAILABLE; + + ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); + + if (zio->io_error != 0) { + hdr->b_flags |= ARC_IO_ERROR; + if (hdr->b_state != arc_anon) + arc_change_state(arc_anon, hdr, hash_lock); + if (HDR_IN_HASH_TABLE(hdr)) + buf_hash_remove(hdr); + freeable = refcount_is_zero(&hdr->b_refcnt); + /* convert checksum errors into IO errors */ + if (zio->io_error == ECKSUM) + zio->io_error = EIO; + } + + /* + * Broadcast before we drop the hash_lock to avoid the possibility + * that the hdr (and hence the cv) might be freed before we get to + * the cv_broadcast(). + */ + cv_broadcast(&hdr->b_cv); + + if (hash_lock) { + /* + * Only call arc_access on anonymous buffers. This is because + * if we've issued an I/O for an evicted buffer, we've already + * called arc_access (to prevent any simultaneous readers from + * getting confused). + */ + if (zio->io_error == 0 && hdr->b_state == arc_anon) + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + } else { + /* + * This block was freed while we waited for the read to + * complete. It has been removed from the hash table and + * moved to the anonymous state (so that it won't show up + * in the cache). + */ + ASSERT3P(hdr->b_state, ==, arc_anon); + freeable = refcount_is_zero(&hdr->b_refcnt); + } + + /* execute each callback and free its structure */ + while ((acb = callback_list) != NULL) { + if (acb->acb_done) + acb->acb_done(zio, acb->acb_buf, acb->acb_private); + + if (acb->acb_zio_dummy != NULL) { + acb->acb_zio_dummy->io_error = zio->io_error; + zio_nowait(acb->acb_zio_dummy); + } + + callback_list = acb->acb_next; + kmem_free(acb, sizeof (arc_callback_t)); + } + + if (freeable) + arc_hdr_destroy(hdr); +} + +/* + * "Read" the block block at the specified DVA (in bp) via the + * cache. If the block is found in the cache, invoke the provided + * callback immediately and return. Note that the `zio' parameter + * in the callback will be NULL in this case, since no IO was + * required. If the block is not in the cache pass the read request + * on to the spa with a substitute callback function, so that the + * requested block will be added to the cache. + * + * If a read request arrives for a block that has a read in-progress, + * either wait for the in-progress read to complete (and return the + * results); or, if this is a read with a "done" func, add a record + * to the read to invoke the "done" func when the read completes, + * and return; or just return. + * + * arc_read_done() will invoke all the requested "done" functions + * for readers of this block. + */ +int +arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, + arc_done_func_t *done, void *private, int priority, int flags, + uint32_t *arc_flags, zbookmark_t *zb) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + kmutex_t *hash_lock; + zio_t *rzio; + +top: + hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + if (hdr && hdr->b_datacnt > 0) { + + *arc_flags |= ARC_CACHED; + + if (HDR_IO_IN_PROGRESS(hdr)) { + + if (*arc_flags & ARC_WAIT) { + cv_wait(&hdr->b_cv, hash_lock); + mutex_exit(hash_lock); + goto top; + } + ASSERT(*arc_flags & ARC_NOWAIT); + + if (done) { + arc_callback_t *acb = NULL; + + acb = kmem_zalloc(sizeof (arc_callback_t), + KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + acb->acb_byteswap = swap; + if (pio != NULL) + acb->acb_zio_dummy = zio_null(pio, + spa, NULL, NULL, flags); + + ASSERT(acb->acb_done != NULL); + acb->acb_next = hdr->b_acb; + hdr->b_acb = acb; + add_reference(hdr, hash_lock, private); + mutex_exit(hash_lock); + return (0); + } + mutex_exit(hash_lock); + return (0); + } + + ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + + if (done) { + add_reference(hdr, hash_lock, private); + /* + * If this block is already in use, create a new + * copy of the data so that we will be guaranteed + * that arc_release() will always succeed. + */ + buf = hdr->b_buf; + ASSERT(buf); + ASSERT(buf->b_data); + if (HDR_BUF_AVAILABLE(hdr)) { + ASSERT(buf->b_efunc == NULL); + hdr->b_flags &= ~ARC_BUF_AVAILABLE; + } else { + buf = arc_buf_clone(buf); + } + } else if (*arc_flags & ARC_PREFETCH && + refcount_count(&hdr->b_refcnt) == 0) { + hdr->b_flags |= ARC_PREFETCH; + } + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_hits); + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + data, metadata, hits); + + if (done) + done(NULL, buf, private); + } else { + uint64_t size = BP_GET_LSIZE(bp); + arc_callback_t *acb; + + if (hdr == NULL) { + /* this block is not in the cache */ + arc_buf_hdr_t *exists; + arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); + buf = arc_buf_alloc(spa, size, private, type); + hdr = buf->b_hdr; + hdr->b_dva = *BP_IDENTITY(bp); + hdr->b_birth = bp->blk_birth; + hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* somebody beat us to the hash insert */ + mutex_exit(hash_lock); + bzero(&hdr->b_dva, sizeof (dva_t)); + hdr->b_birth = 0; + hdr->b_cksum0 = 0; + (void) arc_buf_remove_ref(buf, private); + goto top; /* restart the IO request */ + } + /* if this is a prefetch, we don't have a reference */ + if (*arc_flags & ARC_PREFETCH) { + (void) remove_reference(hdr, hash_lock, + private); + hdr->b_flags |= ARC_PREFETCH; + } + if (BP_GET_LEVEL(bp) > 0) + hdr->b_flags |= ARC_INDIRECT; + } else { + /* this block is in the ghost cache */ + ASSERT(GHOST_STATE(hdr->b_state)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); + ASSERT(hdr->b_buf == NULL); + + /* if this is a prefetch, we don't have a reference */ + if (*arc_flags & ARC_PREFETCH) + hdr->b_flags |= ARC_PREFETCH; + else + add_reference(hdr, hash_lock, private); + buf = kmem_cache_alloc(buf_cache, KM_SLEEP); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_next = NULL; + hdr->b_buf = buf; + arc_get_data_buf(buf); + ASSERT(hdr->b_datacnt == 0); + hdr->b_datacnt = 1; + + } + + acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + acb->acb_byteswap = swap; + + ASSERT(hdr->b_acb == NULL); + hdr->b_acb = acb; + hdr->b_flags |= ARC_IO_IN_PROGRESS; + + /* + * If the buffer has been evicted, migrate it to a present state + * before issuing the I/O. Once we drop the hash-table lock, + * the header will be marked as I/O in progress and have an + * attached buffer. At this point, anybody who finds this + * buffer ought to notice that it's legit but has a pending I/O. + */ + + if (GHOST_STATE(hdr->b_state)) + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + + ASSERT3U(hdr->b_size, ==, size); + DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, + zbookmark_t *, zb); + ARCSTAT_BUMP(arcstat_misses); + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + data, metadata, misses); + + rzio = zio_read(pio, spa, bp, buf->b_data, size, + arc_read_done, buf, priority, flags, zb); + + if (*arc_flags & ARC_WAIT) + return (zio_wait(rzio)); + + ASSERT(*arc_flags & ARC_NOWAIT); + zio_nowait(rzio); + } + return (0); +} + +/* + * arc_read() variant to support pool traversal. If the block is already + * in the ARC, make a copy of it; otherwise, the caller will do the I/O. + * The idea is that we don't want pool traversal filling up memory, but + * if the ARC already has the data anyway, we shouldn't pay for the I/O. + */ +int +arc_tryread(spa_t *spa, blkptr_t *bp, void *data) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_mtx; + int rc = 0; + + hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); + + if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { + arc_buf_t *buf = hdr->b_buf; + + ASSERT(buf); + while (buf->b_data == NULL) { + buf = buf->b_next; + ASSERT(buf); + } + bcopy(buf->b_data, data, hdr->b_size); + } else { + rc = ENOENT; + } + + if (hash_mtx) + mutex_exit(hash_mtx); + + return (rc); +} + +void +arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) +{ + ASSERT(buf->b_hdr != NULL); + ASSERT(buf->b_hdr->b_state != arc_anon); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); + buf->b_efunc = func; + buf->b_private = private; +} + +/* + * This is used by the DMU to let the ARC know that a buffer is + * being evicted, so the ARC should clean up. If this arc buf + * is not yet in the evicted state, it will be put there. + */ +int +arc_buf_evict(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + arc_buf_t **bufp; + + mutex_enter(&arc_eviction_mtx); + hdr = buf->b_hdr; + if (hdr == NULL) { + /* + * We are in arc_do_user_evicts(). + */ + ASSERT(buf->b_data == NULL); + mutex_exit(&arc_eviction_mtx); + return (0); + } + hash_lock = HDR_LOCK(hdr); + mutex_exit(&arc_eviction_mtx); + + mutex_enter(hash_lock); + + if (buf->b_data == NULL) { + /* + * We are on the eviction list. + */ + mutex_exit(hash_lock); + mutex_enter(&arc_eviction_mtx); + if (buf->b_hdr == NULL) { + /* + * We are already in arc_do_user_evicts(). + */ + mutex_exit(&arc_eviction_mtx); + return (0); + } else { + arc_buf_t copy = *buf; /* structure assignment */ + /* + * Process this buffer now + * but let arc_do_user_evicts() do the reaping. + */ + buf->b_efunc = NULL; + mutex_exit(&arc_eviction_mtx); + VERIFY(copy.b_efunc(©) == 0); + return (1); + } + } + + ASSERT(buf->b_hdr == hdr); + ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); + ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + + /* + * Pull this buffer off of the hdr + */ + bufp = &hdr->b_buf; + while (*bufp != buf) + bufp = &(*bufp)->b_next; + *bufp = buf->b_next; + + ASSERT(buf->b_data != NULL); + arc_buf_destroy(buf, FALSE, FALSE); + + if (hdr->b_datacnt == 0) { + arc_state_t *old_state = hdr->b_state; + arc_state_t *evicted_state; + + ASSERT(refcount_is_zero(&hdr->b_refcnt)); + + evicted_state = + (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; + + mutex_enter(&old_state->arcs_mtx); + mutex_enter(&evicted_state->arcs_mtx); + + arc_change_state(evicted_state, hdr, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + hdr->b_flags = ARC_IN_HASH_TABLE; + + mutex_exit(&evicted_state->arcs_mtx); + mutex_exit(&old_state->arcs_mtx); + } + mutex_exit(hash_lock); + + VERIFY(buf->b_efunc(buf) == 0); + buf->b_efunc = NULL; + buf->b_private = NULL; + buf->b_hdr = NULL; + kmem_cache_free(buf_cache, buf); + return (1); +} + +/* + * Release this buffer from the cache. This must be done + * after a read and prior to modifying the buffer contents. + * If the buffer has more than one reference, we must make + * make a new hdr for the buffer. + */ +void +arc_release(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + kmutex_t *hash_lock = HDR_LOCK(hdr); + + /* this buffer is not on any list */ + ASSERT(refcount_count(&hdr->b_refcnt) > 0); + + if (hdr->b_state == arc_anon) { + /* this buffer is already released */ + ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); + ASSERT(BUF_EMPTY(hdr)); + ASSERT(buf->b_efunc == NULL); + arc_buf_thaw(buf); + return; + } + + mutex_enter(hash_lock); + + /* + * Do we have more than one buf? + */ + if (hdr->b_buf != buf || buf->b_next != NULL) { + arc_buf_hdr_t *nhdr; + arc_buf_t **bufp; + uint64_t blksz = hdr->b_size; + spa_t *spa = hdr->b_spa; + arc_buf_contents_t type = hdr->b_type; + + ASSERT(hdr->b_datacnt > 1); + /* + * Pull the data off of this buf and attach it to + * a new anonymous buf. + */ + (void) remove_reference(hdr, hash_lock, tag); + bufp = &hdr->b_buf; + while (*bufp != buf) + bufp = &(*bufp)->b_next; + *bufp = (*bufp)->b_next; + buf->b_next = NULL; + + ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); + atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); + if (refcount_is_zero(&hdr->b_refcnt)) { + ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size); + atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size); + } + hdr->b_datacnt -= 1; + arc_cksum_verify(buf); + + mutex_exit(hash_lock); + + nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); + nhdr->b_size = blksz; + nhdr->b_spa = spa; + nhdr->b_type = type; + nhdr->b_buf = buf; + nhdr->b_state = arc_anon; + nhdr->b_arc_access = 0; + nhdr->b_flags = 0; + nhdr->b_datacnt = 1; + nhdr->b_freeze_cksum = NULL; + mutex_init(&nhdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + (void) refcount_add(&nhdr->b_refcnt, tag); + buf->b_hdr = nhdr; + atomic_add_64(&arc_anon->arcs_size, blksz); + + hdr = nhdr; + } else { + ASSERT(refcount_count(&hdr->b_refcnt) == 1); + ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + arc_change_state(arc_anon, hdr, hash_lock); + hdr->b_arc_access = 0; + mutex_exit(hash_lock); + bzero(&hdr->b_dva, sizeof (dva_t)); + hdr->b_birth = 0; + hdr->b_cksum0 = 0; + arc_buf_thaw(buf); + } + buf->b_efunc = NULL; + buf->b_private = NULL; +} + +int +arc_released(arc_buf_t *buf) +{ + return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); +} + +int +arc_has_callback(arc_buf_t *buf) +{ + return (buf->b_efunc != NULL); +} + +#ifdef ZFS_DEBUG +int +arc_referenced(arc_buf_t *buf) +{ + return (refcount_count(&buf->b_hdr->b_refcnt)); +} +#endif + +static void +arc_write_ready(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + + if (callback->awcb_ready) { + ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); + callback->awcb_ready(zio, buf, callback->awcb_private); + } + arc_cksum_compute(buf); +} + +static void +arc_write_done(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; + + hdr->b_acb = NULL; + + /* this buffer is on no lists and is not in the hash table */ + ASSERT3P(hdr->b_state, ==, arc_anon); + + hdr->b_dva = *BP_IDENTITY(zio->io_bp); + hdr->b_birth = zio->io_bp->blk_birth; + hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; + /* + * If the block to be written was all-zero, we may have + * compressed it away. In this case no write was performed + * so there will be no dva/birth-date/checksum. The buffer + * must therefor remain anonymous (and uncached). + */ + if (!BUF_EMPTY(hdr)) { + arc_buf_hdr_t *exists; + kmutex_t *hash_lock; + + arc_cksum_verify(buf); + + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* + * This can only happen if we overwrite for + * sync-to-convergence, because we remove + * buffers from the hash table when we arc_free(). + */ + ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), + BP_IDENTITY(zio->io_bp))); + ASSERT3U(zio->io_bp_orig.blk_birth, ==, + zio->io_bp->blk_birth); + + ASSERT(refcount_is_zero(&exists->b_refcnt)); + arc_change_state(arc_anon, exists, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(exists); + exists = buf_hash_insert(hdr, &hash_lock); + ASSERT3P(exists, ==, NULL); + } + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + } else if (callback->awcb_done == NULL) { + int destroy_hdr; + /* + * This is an anonymous buffer with no user callback, + * destroy it if there are no active references. + */ + mutex_enter(&arc_eviction_mtx); + destroy_hdr = refcount_is_zero(&hdr->b_refcnt); + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + mutex_exit(&arc_eviction_mtx); + if (destroy_hdr) + arc_hdr_destroy(hdr); + } else { + hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + } + + if (callback->awcb_done) { + ASSERT(!refcount_is_zero(&hdr->b_refcnt)); + callback->awcb_done(zio, buf, callback->awcb_private); + } + + kmem_free(callback, sizeof (arc_write_callback_t)); +} + +zio_t * +arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, + uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, + int flags, zbookmark_t *zb) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + arc_write_callback_t *callback; + zio_t *zio; + + /* this is a private buffer - no locking required */ + ASSERT3P(hdr->b_state, ==, arc_anon); + ASSERT(BUF_EMPTY(hdr)); + ASSERT(!HDR_IO_ERROR(hdr)); + ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); + ASSERT(hdr->b_acb == 0); + callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); + callback->awcb_ready = ready; + callback->awcb_done = done; + callback->awcb_private = private; + callback->awcb_buf = buf; + hdr->b_flags |= ARC_IO_IN_PROGRESS; + zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, + buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, + priority, flags, zb); + + return (zio); +} + +int +arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + zio_done_func_t *done, void *private, uint32_t arc_flags) +{ + arc_buf_hdr_t *ab; + kmutex_t *hash_lock; + zio_t *zio; + + /* + * If this buffer is in the cache, release it, so it + * can be re-used. + */ + ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + if (ab != NULL) { + /* + * The checksum of blocks to free is not always + * preserved (eg. on the deadlist). However, if it is + * nonzero, it should match what we have in the cache. + */ + ASSERT(bp->blk_cksum.zc_word[0] == 0 || + ab->b_cksum0 == bp->blk_cksum.zc_word[0]); + if (ab->b_state != arc_anon) + arc_change_state(arc_anon, ab, hash_lock); + if (HDR_IO_IN_PROGRESS(ab)) { + /* + * This should only happen when we prefetch. + */ + ASSERT(ab->b_flags & ARC_PREFETCH); + ASSERT3U(ab->b_datacnt, ==, 1); + ab->b_flags |= ARC_FREED_IN_READ; + if (HDR_IN_HASH_TABLE(ab)) + buf_hash_remove(ab); + ab->b_arc_access = 0; + bzero(&ab->b_dva, sizeof (dva_t)); + ab->b_birth = 0; + ab->b_cksum0 = 0; + ab->b_buf->b_efunc = NULL; + ab->b_buf->b_private = NULL; + mutex_exit(hash_lock); + } else if (refcount_is_zero(&ab->b_refcnt)) { + mutex_exit(hash_lock); + arc_hdr_destroy(ab); + ARCSTAT_BUMP(arcstat_deleted); + } else { + /* + * We still have an active reference on this + * buffer. This can happen, e.g., from + * dbuf_unoverride(). + */ + ASSERT(!HDR_IN_HASH_TABLE(ab)); + ab->b_arc_access = 0; + bzero(&ab->b_dva, sizeof (dva_t)); + ab->b_birth = 0; + ab->b_cksum0 = 0; + ab->b_buf->b_efunc = NULL; + ab->b_buf->b_private = NULL; + mutex_exit(hash_lock); + } + } + + zio = zio_free(pio, spa, txg, bp, done, private); + + if (arc_flags & ARC_WAIT) + return (zio_wait(zio)); + + ASSERT(arc_flags & ARC_NOWAIT); + zio_nowait(zio); + + return (0); +} + +void +arc_tempreserve_clear(uint64_t tempreserve) +{ + atomic_add_64(&arc_tempreserve, -tempreserve); + ASSERT((int64_t)arc_tempreserve >= 0); +} + +int +arc_tempreserve_space(uint64_t tempreserve) +{ +#ifdef ZFS_DEBUG + /* + * Once in a while, fail for no reason. Everything should cope. + */ + if (spa_get_random(10000) == 0) { + dprintf("forcing random failure\n"); + return (ERESTART); + } +#endif + if (tempreserve > arc_c/4 && !arc_no_grow) + arc_c = MIN(arc_c_max, tempreserve * 4); + if (tempreserve > arc_c) + return (ENOMEM); + + /* + * Throttle writes when the amount of dirty data in the cache + * gets too large. We try to keep the cache less than half full + * of dirty blocks so that our sync times don't grow too large. + * Note: if two requests come in concurrently, we might let them + * both succeed, when one of them should fail. Not a huge deal. + * + * XXX The limit should be adjusted dynamically to keep the time + * to sync a dataset fixed (around 1-5 seconds?). + */ + + if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && + arc_tempreserve + arc_anon->arcs_size > arc_c / 4) { + dprintf("failing, arc_tempreserve=%lluK anon=%lluK " + "tempreserve=%lluK arc_c=%lluK\n", + arc_tempreserve>>10, arc_anon->arcs_lsize>>10, + tempreserve>>10, arc_c>>10); + return (ERESTART); + } + atomic_add_64(&arc_tempreserve, tempreserve); + return (0); +} + +static kmutex_t arc_lowmem_lock; +#ifdef _KERNEL +static eventhandler_tag arc_event_lowmem = NULL; + +static void +arc_lowmem(void *arg __unused, int howto __unused) +{ + + /* Serialize access via arc_lowmem_lock. */ + mutex_enter(&arc_lowmem_lock); + zfs_needfree = 1; + cv_signal(&arc_reclaim_thr_cv); + while (zfs_needfree) + tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5); + mutex_exit(&arc_lowmem_lock); +} +#endif + +void +arc_init(void) +{ + mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); + + /* Convert seconds to clock ticks */ + arc_min_prefetch_lifespan = 1 * hz; + + /* Start out with 1/8 of all memory */ + arc_c = kmem_size() / 8; +#if 0 +#ifdef _KERNEL + /* + * On architectures where the physical memory can be larger + * than the addressable space (intel in 32-bit mode), we may + * need to limit the cache to 1/8 of VM size. + */ + arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); +#endif +#endif + /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ + arc_c_min = MAX(arc_c / 4, 64<<18); + /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ + if (arc_c * 8 >= 1<<30) + arc_c_max = (arc_c * 8) - (1<<30); + else + arc_c_max = arc_c_min; + arc_c_max = MAX(arc_c * 4, arc_c_max); +#ifdef _KERNEL + /* + * Allow the tunables to override our calculations if they are + * reasonable (ie. over 16MB) + */ + if (zfs_arc_max >= 64<<18 && zfs_arc_max < kmem_size()) + arc_c_max = zfs_arc_max; + if (zfs_arc_min >= 64<<18 && zfs_arc_min <= arc_c_max) + arc_c_min = zfs_arc_min; +#endif + arc_c = arc_c_max; + arc_p = (arc_c >> 1); + + /* if kmem_flags are set, lets try to use less memory */ + if (kmem_debugging()) + arc_c = arc_c / 2; + if (arc_c < arc_c_min) + arc_c = arc_c_min; + + zfs_arc_min = arc_c_min; + zfs_arc_max = arc_c_max; + + arc_anon = &ARC_anon; + arc_mru = &ARC_mru; + arc_mru_ghost = &ARC_mru_ghost; + arc_mfu = &ARC_mfu; + arc_mfu_ghost = &ARC_mfu_ghost; + arc_size = 0; + + mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + + list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_arc_node)); + + buf_init(); + + arc_thread_exit = 0; + arc_eviction_list = NULL; + mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); + bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); + + arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, + sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + + if (arc_ksp != NULL) { + arc_ksp->ks_data = &arc_stats; + kstat_install(arc_ksp); + } + + (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); + +#ifdef _KERNEL + arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, + EVENTHANDLER_PRI_FIRST); +#endif + + arc_dead = FALSE; + +#ifdef _KERNEL + /* Warn about ZFS memory requirements. */ + if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { + printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " + "expect unstable behavior.\n"); + } else if (kmem_size() < 256 * (1 << 20)) { + printf("ZFS WARNING: Recommended minimum kmem_size is 256MB; " + "expect unstable behavior.\n"); + printf(" Consider tuning vm.kmem_size or " + "vm.kmem_size_min\n"); + printf(" in /boot/loader.conf.\n"); + } +#endif +} + +void +arc_fini(void) +{ + mutex_enter(&arc_reclaim_thr_lock); + arc_thread_exit = 1; + cv_signal(&arc_reclaim_thr_cv); + while (arc_thread_exit != 0) + cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); + mutex_exit(&arc_reclaim_thr_lock); + + arc_flush(); + + arc_dead = TRUE; + + if (arc_ksp != NULL) { + kstat_delete(arc_ksp); + arc_ksp = NULL; + } + + mutex_destroy(&arc_eviction_mtx); + mutex_destroy(&arc_reclaim_thr_lock); + cv_destroy(&arc_reclaim_thr_cv); + + list_destroy(&arc_mru->arcs_list); + list_destroy(&arc_mru_ghost->arcs_list); + list_destroy(&arc_mfu->arcs_list); + list_destroy(&arc_mfu_ghost->arcs_list); + + mutex_destroy(&arc_anon->arcs_mtx); + mutex_destroy(&arc_mru->arcs_mtx); + mutex_destroy(&arc_mru_ghost->arcs_mtx); + mutex_destroy(&arc_mfu->arcs_mtx); + mutex_destroy(&arc_mfu_ghost->arcs_mtx); + + buf_fini(); + + mutex_destroy(&arc_lowmem_lock); +#ifdef _KERNEL + if (arc_event_lowmem != NULL) + EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); +#endif +} |