summaryrefslogtreecommitdiffstats
path: root/sys/cddl
diff options
context:
space:
mode:
authormav <mav@FreeBSD.org>2015-10-03 07:27:58 +0000
committermav <mav@FreeBSD.org>2015-10-03 07:27:58 +0000
commitbcd238ca436a2c4776cf0bdb90fa7b0986f00483 (patch)
treebea05b621e09e97a546ddb15fa3d01e6fff88a84 /sys/cddl
parent21d16f9dcd16d9ed664deee897d47b49f9091be0 (diff)
downloadFreeBSD-src-bcd238ca436a2c4776cf0bdb90fa7b0986f00483.zip
FreeBSD-src-bcd238ca436a2c4776cf0bdb90fa7b0986f00483.tar.gz
MFC r286570: 5408 managing ZFS cache devices requires lots of RAM
Reviewed by: Christopher Siden <christopher.siden@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Don Brady <dev.fs.zfs@gmail.com> Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com> Approved by: Garrett D'Amore <garrett@damore.org> Author: Chris Williamson <Chris.Williamson@delphix.com> illumos/illumos-gate@89c86e32293a30cdd7af530c38b2073fee01411c Currently, every buffer cached in the L2ARC is accompanied by a 240-byte header in memory, leading to very high memory consumption when using very large cache devices. These changes significantly reduce this overhead. Currently: L1-only header = 176 bytes L1 + L2 or L2-only header = 176 bytes + 32 byte checksum + 32 byte l2hdr = 240 bytes Memory-optimized: L1-only header = 176 bytes L1 + L2 header = 176 bytes + 32 byte checksum = 208 bytes L2-only header = 96 bytes + 32 byte checksum = 128 bytes So overall: Trunk Optimized +-----------------+ L1-only | 176 B | 176 B | (same) +-----------------+ L1 & L2 | 240 B | 208 B | (saved 32 bytes) +-----------------+ L2-only | 240 B | 128 B | (saved 116 bytes) +-----------------+ For an average blocksize of 8KB, this means that for the L2ARC, the ratio of metadata to data has gone down from about 2.92% to 1.56%. For a 'storage optimized' EC2 instance with 1600GB of SSD and 60GB of RAM, this means that we expect a completely full L2ARC to use (1600 GB * 0.0156) / 60GB = 41% of the available memory, down from 78%. Relnotes: yes
Diffstat (limited to 'sys/cddl')
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c1457
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h27
2 files changed, 903 insertions, 581 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index e581485..6d5d5f2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -111,7 +111,7 @@
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*
- * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
+ * The L2ARC uses the l2ad_mtx on each vdev for the following:
*
* - L2ARC buflist creation
* - L2ARC buflist eviction
@@ -402,6 +402,7 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_writes_hdr_miss;
kstat_named_t arcstat_l2_evict_lock_retry;
kstat_named_t arcstat_l2_evict_reading;
+ kstat_named_t arcstat_l2_evict_l1cached;
kstat_named_t arcstat_l2_free_on_write;
kstat_named_t arcstat_l2_cdata_free_on_write;
kstat_named_t arcstat_l2_abort_lowmem;
@@ -484,6 +485,7 @@ static arc_stats_t arc_stats = {
{ "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
{ "l2_evict_reading", KSTAT_DATA_UINT64 },
+ { "l2_evict_l1cached", KSTAT_DATA_UINT64 },
{ "l2_free_on_write", KSTAT_DATA_UINT64 },
{ "l2_cdata_free_on_write", KSTAT_DATA_UINT64 },
{ "l2_abort_lowmem", KSTAT_DATA_UINT64 },
@@ -588,8 +590,6 @@ static int arc_no_grow; /* Don't try to grow cache size */
static uint64_t arc_tempreserve;
static uint64_t arc_loaned_bytes;
-typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
-
typedef struct arc_callback arc_callback_t;
struct arc_callback {
@@ -610,29 +610,53 @@ struct arc_write_callback {
arc_buf_t *awcb_buf;
};
-struct arc_buf_hdr {
- /* protected by hash lock */
- dva_t b_dva;
- uint64_t b_birth;
- uint64_t b_cksum0;
-
+/*
+ * ARC buffers are separated into multiple structs as a memory saving measure:
+ * - Common fields struct, always defined, and embedded within it:
+ * - L2-only fields, always allocated but undefined when not in L2ARC
+ * - L1-only fields, only allocated when in L1ARC
+ *
+ * Buffer in L1 Buffer only in L2
+ * +------------------------+ +------------------------+
+ * | arc_buf_hdr_t | | arc_buf_hdr_t |
+ * | | | |
+ * | | | |
+ * | | | |
+ * +------------------------+ +------------------------+
+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
+ * | (undefined if L1-only) | | |
+ * +------------------------+ +------------------------+
+ * | l1arc_buf_hdr_t |
+ * | |
+ * | |
+ * | |
+ * | |
+ * +------------------------+
+ *
+ * Because it's possible for the L2ARC to become extremely large, we can wind
+ * up eating a lot of memory in L2ARC buffer headers, so the size of a header
+ * is minimized by only allocating the fields necessary for an L1-cached buffer
+ * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
+ * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
+ * words in pointers. arc_hdr_realloc() is used to switch a header between
+ * these two allocation states.
+ */
+typedef struct l1arc_buf_hdr {
kmutex_t b_freeze_lock;
- zio_cksum_t *b_freeze_cksum;
+#ifdef ZFS_DEBUG
+ /*
+ * used for debugging wtih kmem_flags - by allocating and freeing
+ * b_thawed when the buffer is thawed, we get a record of the stack
+ * trace that thawed it.
+ */
void *b_thawed;
+#endif
- arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
- arc_flags_t b_flags;
uint32_t b_datacnt;
-
- arc_callback_t *b_acb;
+ /* for waiting on writes to complete */
kcondvar_t b_cv;
- /* immutable */
- arc_buf_contents_t b_type;
- uint64_t b_size;
- uint64_t b_spa;
-
/* protected by arc state mutex */
arc_state_t *b_state;
list_node_t b_arc_node;
@@ -643,8 +667,46 @@ struct arc_buf_hdr {
/* self protecting */
refcount_t b_refcnt;
- l2arc_buf_hdr_t *b_l2hdr;
+ arc_callback_t *b_acb;
+ /* temporary buffer holder for in-flight compressed data */
+ void *b_tmp_cdata;
+} l1arc_buf_hdr_t;
+
+typedef struct l2arc_dev l2arc_dev_t;
+
+typedef struct l2arc_buf_hdr {
+ /* protected by arc_buf_hdr mutex */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ uint64_t b_daddr; /* disk address, offset byte */
+ /* real alloc'd buffer size depending on b_compress applied */
+ int32_t b_asize;
+
list_node_t b_l2node;
+} l2arc_buf_hdr_t;
+
+struct arc_buf_hdr {
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+ /*
+ * Even though this checksum is only set/verified when a buffer is in
+ * the L1 cache, it needs to be in the set of common fields because it
+ * must be preserved from the time before a buffer is written out to
+ * L2ARC until after it is read back in.
+ */
+ zio_cksum_t *b_freeze_cksum;
+
+ arc_buf_hdr_t *b_hash_next;
+ arc_flags_t b_flags;
+
+ /* immutable */
+ int32_t b_size;
+ uint64_t b_spa;
+
+ /* L2ARC fields. Undefined when not in L2ARC. */
+ l2arc_buf_hdr_t b_l2hdr;
+ /* L1ARC fields. Undefined when in l2arc_only state */
+ l1arc_buf_hdr_t b_l1hdr;
};
#ifdef _KERNEL
@@ -681,22 +743,38 @@ static arc_buf_hdr_t arc_eviction_hdr;
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
-#define HDR_FREE_IN_PROGRESS(hdr) \
- ((hdr)->b_flags & ARC_FLAG_FREE_IN_PROGRESS)
+
#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
+#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
#define HDR_L2_READING(hdr) \
- ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS && \
- (hdr)->b_l2hdr != NULL)
+ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
+ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
+#define HDR_ISTYPE_METADATA(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
+#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))
+
+#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
+#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
+
+/* For storing compression mode in b_flags */
+#define HDR_COMPRESS_OFFSET 24
+#define HDR_COMPRESS_NBITS 7
+
+#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \
+ HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
+#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \
+ HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))
+
/*
* Other sizes
*/
-#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
-#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
+#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
/*
* Hash table routines
@@ -820,7 +898,7 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
/*
* L2ARC Internals
*/
-typedef struct l2arc_dev {
+struct l2arc_dev {
vdev_t *l2ad_vdev; /* vdev */
spa_t *l2ad_spa; /* spa */
uint64_t l2ad_hand; /* next write location */
@@ -829,15 +907,15 @@ typedef struct l2arc_dev {
uint64_t l2ad_evict; /* last addr eviction reached */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
- list_t *l2ad_buflist; /* buffer list */
+ kmutex_t l2ad_mtx; /* lock for buffer list */
+ list_t l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
-} l2arc_dev_t;
+};
static list_t L2ARC_dev_list; /* device list */
static list_t *l2arc_dev_list; /* device list pointer */
static kmutex_t l2arc_dev_mtx; /* device list mutex */
static l2arc_dev_t *l2arc_dev_last; /* last device used */
-static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
static list_t L2ARC_free_on_write; /* free after write buf list */
static list_t *l2arc_free_on_write; /* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
@@ -857,18 +935,6 @@ typedef struct l2arc_write_callback {
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
} l2arc_write_callback_t;
-struct l2arc_buf_hdr {
- /* protected by arc_buf_hdr mutex */
- l2arc_dev_t *b_dev; /* L2ARC device */
- uint64_t b_daddr; /* disk address, offset byte */
- /* compression applied to buffer data */
- enum zio_compress b_compress;
- /* real alloc'd buffer size depending on b_compress applied */
- int b_asize;
- /* temporary buffer holder for in-flight compressed data */
- void *b_tmp_cdata;
-};
-
typedef struct l2arc_data_free {
/* protected by l2arc_free_on_write_mtx */
void *l2df_data;
@@ -887,12 +953,13 @@ static int arc_evict_needed(arc_buf_contents_t);
static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
static void arc_buf_watch(arc_buf_t *);
+static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
+static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
+
static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);
-static void l2arc_hdr_stat_add(void);
-static void l2arc_hdr_stat_remove(void);
-static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *);
+static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
@@ -915,8 +982,7 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
#define BUF_EMPTY(buf) \
((buf)->b_dva.dva_word[0] == 0 && \
- (buf)->b_dva.dva_word[1] == 0 && \
- (buf)->b_cksum0 == 0)
+ (buf)->b_dva.dva_word[1] == 0)
#define BUF_EQUAL(spa, dva, birth, buf) \
((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
@@ -929,7 +995,6 @@ buf_discard_identity(arc_buf_hdr_t *hdr)
hdr->b_dva.dva_word[0] = 0;
hdr->b_dva.dva_word[1] = 0;
hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
}
static arc_buf_hdr_t *
@@ -959,6 +1024,7 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
* equal to elem in the hash table, then the already existing element
* will be returned and the new element will not be inserted.
* Otherwise returns NULL.
+ * If lockp == NULL, the caller is assumed to already hold the hash lock.
*/
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
@@ -971,8 +1037,14 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
ASSERT(hdr->b_birth != 0);
ASSERT(!HDR_IN_HASH_TABLE(hdr));
- *lockp = hash_lock;
- mutex_enter(hash_lock);
+
+ if (lockp != NULL) {
+ *lockp = hash_lock;
+ mutex_enter(hash_lock);
+ } else {
+ ASSERT(MUTEX_HELD(hash_lock));
+ }
+
for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
fhdr = fhdr->b_hash_next, i++) {
if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
@@ -1027,7 +1099,8 @@ buf_hash_remove(arc_buf_hdr_t *hdr)
/*
* Global data structures and functions for the buf kmem cache.
*/
-static kmem_cache_t *hdr_cache;
+static kmem_cache_t *hdr_full_cache;
+static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;
static void
@@ -1039,7 +1112,8 @@ buf_fini(void)
(buf_hash_table.ht_mask + 1) * sizeof (void *));
for (i = 0; i < BUF_LOCKS; i++)
mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
- kmem_cache_destroy(hdr_cache);
+ kmem_cache_destroy(hdr_full_cache);
+ kmem_cache_destroy(hdr_l2only_cache);
kmem_cache_destroy(buf_cache);
}
@@ -1049,15 +1123,27 @@ buf_fini(void)
*/
/* ARGSUSED */
static int
-hdr_cons(void *vbuf, void *unused, int kmflag)
+hdr_full_cons(void *vbuf, void *unused, int kmflag)
{
arc_buf_hdr_t *hdr = vbuf;
- bzero(hdr, sizeof (arc_buf_hdr_t));
- refcount_create(&hdr->b_refcnt);
- cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
- arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+ bzero(hdr, HDR_FULL_SIZE);
+ cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
+ refcount_create(&hdr->b_l1hdr.b_refcnt);
+ mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ bzero(hdr, HDR_L2ONLY_SIZE);
+ arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
return (0);
}
@@ -1081,15 +1167,25 @@ buf_cons(void *vbuf, void *unused, int kmflag)
*/
/* ARGSUSED */
static void
-hdr_dest(void *vbuf, void *unused)
+hdr_full_dest(void *vbuf, void *unused)
{
arc_buf_hdr_t *hdr = vbuf;
ASSERT(BUF_EMPTY(hdr));
- refcount_destroy(&hdr->b_refcnt);
- cv_destroy(&hdr->b_cv);
- mutex_destroy(&hdr->b_freeze_lock);
- arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+ cv_destroy(&hdr->b_l1hdr.b_cv);
+ refcount_destroy(&hdr->b_l1hdr.b_refcnt);
+ mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+ arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+}
+
+/* ARGSUSED */
+static void
+hdr_l2only_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ ASSERT(BUF_EMPTY(hdr));
+ arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
}
/* ARGSUSED */
@@ -1143,8 +1239,11 @@ retry:
goto retry;
}
- hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
- 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+ hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
+ 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
+ hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
+ HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
+ NULL, NULL, 0);
buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
@@ -1158,6 +1257,81 @@ retry:
}
}
+/*
+ * Transition between the two allocation states for the arc_buf_hdr struct.
+ * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
+ * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
+ * version is used when a cache buffer is only in the L2ARC in order to reduce
+ * memory usage.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
+{
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ arc_buf_hdr_t *nhdr;
+ l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+
+ ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
+ (old == hdr_l2only_cache && new == hdr_full_cache));
+
+ nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
+
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+ buf_hash_remove(hdr);
+
+ bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+ if (new == hdr_full_cache) {
+ nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
+ /*
+ * arc_access and arc_change_state need to be aware that a
+ * header has just come out of L2ARC, so we set its state to
+ * l2c_only even though it's about to change.
+ */
+ nhdr->b_l1hdr.b_state = arc_l2c_only;
+ } else {
+ ASSERT(hdr->b_l1hdr.b_buf == NULL);
+ ASSERT0(hdr->b_l1hdr.b_datacnt);
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+ /*
+ * We might be removing the L1hdr of a buffer which was just
+ * written out to L2ARC. If such a buffer is compressed then we
+ * need to free its b_tmp_cdata before destroying the header.
+ */
+ if (hdr->b_l1hdr.b_tmp_cdata != NULL &&
+ HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
+ l2arc_release_cdata_buf(hdr);
+ nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
+ }
+ /*
+ * The header has been reallocated so we need to re-insert it into any
+ * lists it was on.
+ */
+ (void) buf_hash_insert(nhdr, NULL);
+
+ ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
+
+ mutex_enter(&dev->l2ad_mtx);
+
+ /*
+ * We must place the realloc'ed header back into the list at
+ * the same spot. Otherwise, if it's placed earlier in the list,
+ * l2arc_write_buffers() could find it during the function's
+ * write phase, and try to write it out to the l2arc.
+ */
+ list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
+ list_remove(&dev->l2ad_buflist, hdr);
+
+ mutex_exit(&dev->l2ad_mtx);
+
+ buf_discard_identity(hdr);
+ hdr->b_freeze_cksum = NULL;
+ kmem_cache_free(old, hdr);
+
+ return (nhdr);
+}
+
+
#define ARC_MINTIME (hz>>4) /* 62 ms */
static void
@@ -1168,16 +1342,15 @@ arc_cksum_verify(arc_buf_t *buf)
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
- mutex_enter(&buf->b_hdr->b_freeze_lock);
- if (buf->b_hdr->b_freeze_cksum == NULL ||
- (buf->b_hdr->b_flags & ARC_FLAG_IO_ERROR)) {
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+ if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
return;
}
fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
panic("buffer modified while frozen!");
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
}
static int
@@ -1186,10 +1359,10 @@ arc_cksum_equal(arc_buf_t *buf)
zio_cksum_t zc;
int equal;
- mutex_enter(&buf->b_hdr->b_freeze_lock);
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
return (equal);
}
@@ -1200,15 +1373,15 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force)
if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
return;
- mutex_enter(&buf->b_hdr->b_freeze_lock);
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
if (buf->b_hdr->b_freeze_cksum != NULL) {
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
return;
}
buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
buf->b_hdr->b_freeze_cksum);
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
#ifdef illumos
arc_buf_watch(buf);
#endif /* illumos */
@@ -1259,30 +1432,58 @@ arc_buf_watch(arc_buf_t *buf)
}
#endif /* illumos */
+static arc_buf_contents_t
+arc_buf_type(arc_buf_hdr_t *hdr)
+{
+ if (HDR_ISTYPE_METADATA(hdr)) {
+ return (ARC_BUFC_METADATA);
+ } else {
+ return (ARC_BUFC_DATA);
+ }
+}
+
+static uint32_t
+arc_bufc_to_flags(arc_buf_contents_t type)
+{
+ switch (type) {
+ case ARC_BUFC_DATA:
+ /* metadata field is 0 if buffer contains normal data */
+ return (0);
+ case ARC_BUFC_METADATA:
+ return (ARC_FLAG_BUFC_METADATA);
+ default:
+ break;
+ }
+ panic("undefined ARC buffer type!");
+ return ((uint32_t)-1);
+}
+
void
arc_buf_thaw(arc_buf_t *buf)
{
if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (buf->b_hdr->b_state != arc_anon)
+ if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
panic("modifying non-anon buffer!");
- if (buf->b_hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS)
+ if (HDR_IO_IN_PROGRESS(buf->b_hdr))
panic("modifying buffer while i/o in progress!");
arc_cksum_verify(buf);
}
- mutex_enter(&buf->b_hdr->b_freeze_lock);
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
if (buf->b_hdr->b_freeze_cksum != NULL) {
kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
buf->b_hdr->b_freeze_cksum = NULL;
}
+#ifdef ZFS_DEBUG
if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (buf->b_hdr->b_thawed)
- kmem_free(buf->b_hdr->b_thawed, 1);
- buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
+ if (buf->b_hdr->b_l1hdr.b_thawed != NULL)
+ kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1);
+ buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
}
+#endif
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
#ifdef illumos
arc_buf_unwatch(buf);
@@ -1301,7 +1502,7 @@ arc_buf_freeze(arc_buf_t *buf)
mutex_enter(hash_lock);
ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
- buf->b_hdr->b_state == arc_anon);
+ buf->b_hdr->b_l1hdr.b_state == arc_anon);
arc_cksum_compute(buf, B_FALSE);
mutex_exit(hash_lock);
@@ -1312,7 +1513,7 @@ get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **l
{
uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
- if (hdr->b_type == ARC_BUFC_METADATA)
+ if (arc_buf_type(hdr) == ARC_BUFC_METADATA)
buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
else {
buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
@@ -1327,32 +1528,36 @@ get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **l
static void
add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
+ ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(MUTEX_HELD(hash_lock));
+ arc_state_t *state = hdr->b_l1hdr.b_state;
- if ((refcount_add(&hdr->b_refcnt, tag) == 1) &&
- (hdr->b_state != arc_anon)) {
- uint64_t delta = hdr->b_size * hdr->b_datacnt;
- uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
- list_t *list;
- kmutex_t *lock;
-
- get_buf_info(hdr, hdr->b_state, &list, &lock);
- ASSERT(!MUTEX_HELD(lock));
- mutex_enter(lock);
- ASSERT(list_link_active(&hdr->b_arc_node));
- list_remove(list, hdr);
- if (GHOST_STATE(hdr->b_state)) {
- ASSERT0(hdr->b_datacnt);
- ASSERT3P(hdr->b_buf, ==, NULL);
- delta = hdr->b_size;
+ if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
+ (state != arc_anon)) {
+ /* We don't use the L2-only state list. */
+ if (state != arc_l2c_only) {
+ uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
+ uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
+ list_t *list;
+ kmutex_t *lock;
+
+ get_buf_info(hdr, state, &list, &lock);
+ ASSERT(!MUTEX_HELD(lock));
+ mutex_enter(lock);
+ ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
+ list_remove(list, hdr);
+ if (GHOST_STATE(state)) {
+ ASSERT0(hdr->b_l1hdr.b_datacnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ delta = hdr->b_size;
+ }
+ ASSERT(delta > 0);
+ ASSERT3U(*size, >=, delta);
+ atomic_add_64(size, -delta);
+ mutex_exit(lock);
}
- ASSERT(delta > 0);
- ASSERT3U(*size, >=, delta);
- atomic_add_64(size, -delta);
- mutex_exit(lock);
/* remove the prefetch flag if we get a reference */
- if (hdr->b_flags & ARC_FLAG_PREFETCH)
- hdr->b_flags &= ~ARC_FLAG_PREFETCH;
+ hdr->b_flags &= ~ARC_FLAG_PREFETCH;
}
}
@@ -1360,24 +1565,30 @@ static int
remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
int cnt;
- arc_state_t *state = hdr->b_state;
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
ASSERT(!GHOST_STATE(state));
- if (((cnt = refcount_remove(&hdr->b_refcnt, tag)) == 0) &&
+ /*
+ * arc_l2c_only counts as a ghost state so we don't need to explicitly
+ * check to prevent usage of the arc_l2c_only list.
+ */
+ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
(state != arc_anon)) {
- uint64_t *size = &state->arcs_lsize[hdr->b_type];
+ uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
list_t *list;
kmutex_t *lock;
get_buf_info(hdr, state, &list, &lock);
ASSERT(!MUTEX_HELD(lock));
mutex_enter(lock);
- ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
list_insert_head(list, hdr);
- ASSERT(hdr->b_datacnt > 0);
- atomic_add_64(size, hdr->b_size * hdr->b_datacnt);
+ ASSERT(hdr->b_l1hdr.b_datacnt > 0);
+ atomic_add_64(size, hdr->b_size *
+ hdr->b_l1hdr.b_datacnt);
mutex_exit(lock);
}
return (cnt);
@@ -1391,44 +1602,64 @@ static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
kmutex_t *hash_lock)
{
- arc_state_t *old_state = hdr->b_state;
- int64_t refcnt = refcount_count(&hdr->b_refcnt);
+ arc_state_t *old_state;
+ int64_t refcnt;
+ uint32_t datacnt;
uint64_t from_delta, to_delta;
+ arc_buf_contents_t buftype = arc_buf_type(hdr);
list_t *list;
kmutex_t *lock;
+ /*
+ * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
+ * in arc_read() when bringing a buffer out of the L2ARC. However, the
+ * L1 hdr doesn't always exist when we change state to arc_anon before
+ * destroying a header, in which case reallocating to add the L1 hdr is
+ * pointless.
+ */
+ if (HDR_HAS_L1HDR(hdr)) {
+ old_state = hdr->b_l1hdr.b_state;
+ refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
+ datacnt = hdr->b_l1hdr.b_datacnt;
+ } else {
+ old_state = arc_l2c_only;
+ refcnt = 0;
+ datacnt = 0;
+ }
+
ASSERT(MUTEX_HELD(hash_lock));
ASSERT3P(new_state, !=, old_state);
- ASSERT(refcnt == 0 || hdr->b_datacnt > 0);
- ASSERT(hdr->b_datacnt == 0 || !GHOST_STATE(new_state));
- ASSERT(hdr->b_datacnt <= 1 || old_state != arc_anon);
+ ASSERT(refcnt == 0 || datacnt > 0);
+ ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
+ ASSERT(old_state != arc_anon || datacnt <= 1);
- from_delta = to_delta = hdr->b_datacnt * hdr->b_size;
+ from_delta = to_delta = datacnt * hdr->b_size;
/*
* If this buffer is evictable, transfer it from the
* old state list to the new state list.
*/
if (refcnt == 0) {
- if (old_state != arc_anon) {
+ if (old_state != arc_anon && old_state != arc_l2c_only) {
int use_mutex;
- uint64_t *size = &old_state->arcs_lsize[hdr->b_type];
+ uint64_t *size = &old_state->arcs_lsize[buftype];
get_buf_info(hdr, old_state, &list, &lock);
use_mutex = !MUTEX_HELD(lock);
if (use_mutex)
mutex_enter(lock);
- ASSERT(list_link_active(&hdr->b_arc_node));
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
list_remove(list, hdr);
/*
* If prefetching out of the ghost cache,
* we will have a non-zero datacnt.
*/
- if (GHOST_STATE(old_state) && hdr->b_datacnt == 0) {
+ if (GHOST_STATE(old_state) && datacnt == 0) {
/* ghost elements have a ghost size */
- ASSERT(hdr->b_buf == NULL);
+ ASSERT(hdr->b_l1hdr.b_buf == NULL);
from_delta = hdr->b_size;
}
ASSERT3U(*size, >=, from_delta);
@@ -1437,10 +1668,17 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
if (use_mutex)
mutex_exit(lock);
}
- if (new_state != arc_anon) {
+ if (new_state != arc_anon && new_state != arc_l2c_only) {
int use_mutex;
- uint64_t *size = &new_state->arcs_lsize[hdr->b_type];
+ uint64_t *size = &new_state->arcs_lsize[buftype];
+ /*
+ * An L1 header always exists here, since if we're
+ * moving to some L1-cached state (i.e. not l2c_only or
+ * anonymous), we realloc the header to add an L1hdr
+ * beforehand.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
get_buf_info(hdr, new_state, &list, &lock);
use_mutex = !MUTEX_HELD(lock);
if (use_mutex)
@@ -1450,8 +1688,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
/* ghost elements have a ghost size */
if (GHOST_STATE(new_state)) {
- ASSERT(hdr->b_datacnt == 0);
- ASSERT(hdr->b_buf == NULL);
+ ASSERT(datacnt == 0);
+ ASSERT(hdr->b_l1hdr.b_buf == NULL);
to_delta = hdr->b_size;
}
atomic_add_64(size, to_delta);
@@ -1465,20 +1703,22 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
buf_hash_remove(hdr);
- /* adjust state sizes */
- if (to_delta)
+ /* adjust state sizes (ignore arc_l2c_only) */
+ if (to_delta && new_state != arc_l2c_only)
atomic_add_64(&new_state->arcs_size, to_delta);
- if (from_delta) {
+ if (from_delta && old_state != arc_l2c_only) {
ASSERT3U(old_state->arcs_size, >=, from_delta);
atomic_add_64(&old_state->arcs_size, -from_delta);
}
- hdr->b_state = new_state;
+ if (HDR_HAS_L1HDR(hdr))
+ hdr->b_l1hdr.b_state = new_state;
- /* adjust l2arc hdr stats */
- if (new_state == arc_l2c_only)
- l2arc_hdr_stat_add();
- else if (old_state == arc_l2c_only)
- l2arc_hdr_stat_remove();
+ /*
+ * L2 headers should never be on the L2 state list since they don't
+ * have L1 headers allocated.
+ */
+ ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+ list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}
void
@@ -1534,31 +1774,36 @@ arc_space_return(uint64_t space, arc_space_type_t type)
}
arc_buf_t *
-arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
+arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf;
ASSERT3U(size, >, 0);
- hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
ASSERT(BUF_EMPTY(hdr));
+ ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
hdr->b_size = size;
- hdr->b_type = type;
hdr->b_spa = spa_load_guid(spa);
- hdr->b_state = arc_anon;
- hdr->b_arc_access = 0;
+
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_efunc = NULL;
buf->b_private = NULL;
buf->b_next = NULL;
- hdr->b_buf = buf;
+
+ hdr->b_flags = arc_bufc_to_flags(type);
+ hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
+
+ hdr->b_l1hdr.b_buf = buf;
+ hdr->b_l1hdr.b_state = arc_anon;
+ hdr->b_l1hdr.b_arc_access = 0;
+ hdr->b_l1hdr.b_datacnt = 1;
+
arc_get_data_buf(buf);
- hdr->b_datacnt = 1;
- hdr->b_flags = 0;
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
- (void) refcount_add(&hdr->b_refcnt, tag);
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
return (buf);
}
@@ -1591,8 +1836,9 @@ arc_return_buf(arc_buf_t *buf, void *tag)
arc_buf_hdr_t *hdr = buf->b_hdr;
ASSERT(buf->b_data != NULL);
- (void) refcount_add(&hdr->b_refcnt, tag);
- (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
+ (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
}
@@ -1601,12 +1847,12 @@ arc_return_buf(arc_buf_t *buf, void *tag)
void
arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
{
- arc_buf_hdr_t *hdr;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
ASSERT(buf->b_data != NULL);
- hdr = buf->b_hdr;
- (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
- (void) refcount_remove(&hdr->b_refcnt, tag);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
+ (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
buf->b_efunc = NULL;
buf->b_private = NULL;
@@ -1620,15 +1866,16 @@ arc_buf_clone(arc_buf_t *from)
arc_buf_hdr_t *hdr = from->b_hdr;
uint64_t size = hdr->b_size;
- ASSERT(hdr->b_state != arc_anon);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(hdr->b_l1hdr.b_state != arc_anon);
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_efunc = NULL;
buf->b_private = NULL;
- buf->b_next = hdr->b_buf;
- hdr->b_buf = buf;
+ buf->b_next = hdr->b_l1hdr.b_buf;
+ hdr->b_l1hdr.b_buf = buf;
arc_get_data_buf(buf);
bcopy(from->b_data, buf->b_data, size);
@@ -1638,11 +1885,11 @@ arc_buf_clone(arc_buf_t *from)
* then track the size and number of duplicates. These stats will be
* updated as duplicate buffers are created and destroyed.
*/
- if (hdr->b_type == ARC_BUFC_DATA) {
+ if (HDR_ISTYPE_DATA(hdr)) {
ARCSTAT_BUMP(arcstat_duplicate_buffers);
ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
}
- hdr->b_datacnt += 1;
+ hdr->b_l1hdr.b_datacnt += 1;
return (buf);
}
@@ -1665,17 +1912,20 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
hash_lock = HDR_LOCK(buf->b_hdr);
mutex_enter(hash_lock);
hdr = buf->b_hdr;
+ ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
mutex_exit(&buf->b_evict_lock);
- ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+ ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+ hdr->b_l1hdr.b_state == arc_mfu);
+
add_reference(hdr, hash_lock, tag);
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits);
- ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH),
- demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, hits);
}
@@ -1718,18 +1968,26 @@ arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
static void
arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
{
- l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+ ASSERT(HDR_HAS_L2HDR(hdr));
+ ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
- ASSERT(MUTEX_HELD(&l2arc_buflist_mtx));
+ /*
+ * The b_tmp_cdata field is linked off of the b_l1hdr, so if
+ * that doesn't exist, the header is in the arc_l2c_only state,
+ * and there isn't anything to free (it's already been freed).
+ */
+ if (!HDR_HAS_L1HDR(hdr))
+ return;
- if (l2hdr->b_tmp_cdata == NULL)
+ if (hdr->b_l1hdr.b_tmp_cdata == NULL)
return;
ASSERT(HDR_L2_WRITING(hdr));
- arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size,
+ arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size,
zio_data_buf_free);
+
ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
- l2hdr->b_tmp_cdata = NULL;
+ hdr->b_l1hdr.b_tmp_cdata = NULL;
}
static void
@@ -1738,10 +1996,10 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
arc_buf_t **bufp;
/* free up data associated with the buf */
- if (buf->b_data) {
- arc_state_t *state = buf->b_hdr->b_state;
+ if (buf->b_data != NULL) {
+ arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
uint64_t size = buf->b_hdr->b_size;
- arc_buf_contents_t type = buf->b_hdr->b_type;
+ arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
arc_cksum_verify(buf);
#ifdef illumos
@@ -1759,11 +2017,12 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
atomic_add_64(&arc_size, -size);
}
}
- if (list_link_active(&buf->b_hdr->b_arc_node)) {
+ if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
uint64_t *cnt = &state->arcs_lsize[type];
- ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
- ASSERT(state != arc_anon);
+ ASSERT(refcount_is_zero(
+ &buf->b_hdr->b_l1hdr.b_refcnt));
+ ASSERT(state != arc_anon && state != arc_l2c_only);
ASSERT3U(*cnt, >=, size);
atomic_add_64(cnt, -size);
@@ -1776,13 +2035,13 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
* If we're destroying a duplicate buffer make sure
* that the appropriate statistics are updated.
*/
- if (buf->b_hdr->b_datacnt > 1 &&
- buf->b_hdr->b_type == ARC_BUFC_DATA) {
+ if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
+ HDR_ISTYPE_DATA(buf->b_hdr)) {
ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
}
- ASSERT(buf->b_hdr->b_datacnt > 0);
- buf->b_hdr->b_datacnt -= 1;
+ ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
+ buf->b_hdr->b_l1hdr.b_datacnt -= 1;
}
/* only remove the buf if requested */
@@ -1790,7 +2049,8 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
return;
/* remove the buf from the hdr list */
- for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
+ for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
+ bufp = &(*bufp)->b_next)
continue;
*bufp = buf->b_next;
buf->b_next = NULL;
@@ -1805,89 +2065,94 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
- ASSERT3P(hdr->b_state, ==, arc_anon);
+ if (HDR_HAS_L1HDR(hdr)) {
+ ASSERT(hdr->b_l1hdr.b_buf == NULL ||
+ hdr->b_l1hdr.b_datacnt > 0);
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ }
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
+ boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx);
- if (l2hdr != NULL) {
- boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
- /*
- * To prevent arc_free() and l2arc_evict() from
- * attempting to free the same buffer at the same time,
- * a FREE_IN_PROGRESS flag is given to arc_free() to
- * give it priority. l2arc_evict() can't destroy this
- * header while we are waiting on l2arc_buflist_mtx.
- *
- * The hdr may be removed from l2ad_buflist before we
- * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
- */
if (!buflist_held) {
- mutex_enter(&l2arc_buflist_mtx);
- l2hdr = hdr->b_l2hdr;
+ mutex_enter(&l2hdr->b_dev->l2ad_mtx);
+ l2hdr = &hdr->b_l2hdr;
}
- if (l2hdr != NULL) {
- trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
- l2hdr->b_asize, 0);
- list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
- arc_buf_l2_cdata_free(hdr);
- ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
- ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
- vdev_space_update(l2hdr->b_dev->l2ad_vdev,
- -l2hdr->b_asize, 0, 0);
- kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
- if (hdr->b_state == arc_l2c_only)
- l2arc_hdr_stat_remove();
- hdr->b_l2hdr = NULL;
- }
+ trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
+ l2hdr->b_asize, 0);
+ list_remove(&l2hdr->b_dev->l2ad_buflist, hdr);
+
+ /*
+ * We don't want to leak the b_tmp_cdata buffer that was
+ * allocated in l2arc_write_buffers()
+ */
+ arc_buf_l2_cdata_free(hdr);
+
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
if (!buflist_held)
- mutex_exit(&l2arc_buflist_mtx);
+ mutex_exit(&l2hdr->b_dev->l2ad_mtx);
+
+ hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
}
- if (!BUF_EMPTY(hdr)) {
- ASSERT(!HDR_IN_HASH_TABLE(hdr));
+ if (!BUF_EMPTY(hdr))
buf_discard_identity(hdr);
- }
- while (hdr->b_buf) {
- arc_buf_t *buf = hdr->b_buf;
-
- if (buf->b_efunc) {
- mutex_enter(&arc_eviction_mtx);
- mutex_enter(&buf->b_evict_lock);
- ASSERT(buf->b_hdr != NULL);
- arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
- hdr->b_buf = buf->b_next;
- buf->b_hdr = &arc_eviction_hdr;
- buf->b_next = arc_eviction_list;
- arc_eviction_list = buf;
- mutex_exit(&buf->b_evict_lock);
- mutex_exit(&arc_eviction_mtx);
- } else {
- arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
- }
- }
if (hdr->b_freeze_cksum != NULL) {
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_freeze_cksum = NULL;
}
- if (hdr->b_thawed) {
- kmem_free(hdr->b_thawed, 1);
- hdr->b_thawed = NULL;
+
+ if (HDR_HAS_L1HDR(hdr)) {
+ while (hdr->b_l1hdr.b_buf) {
+ arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+
+ if (buf->b_efunc != NULL) {
+ mutex_enter(&arc_eviction_mtx);
+ mutex_enter(&buf->b_evict_lock);
+ ASSERT(buf->b_hdr != NULL);
+ arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
+ FALSE);
+ hdr->b_l1hdr.b_buf = buf->b_next;
+ buf->b_hdr = &arc_eviction_hdr;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ mutex_exit(&buf->b_evict_lock);
+ mutex_exit(&arc_eviction_mtx);
+ } else {
+ arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
+ TRUE);
+ }
+ }
+#ifdef ZFS_DEBUG
+ if (hdr->b_l1hdr.b_thawed != NULL) {
+ kmem_free(hdr->b_l1hdr.b_thawed, 1);
+ hdr->b_l1hdr.b_thawed = NULL;
+ }
+#endif
}
- ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
- ASSERT3P(hdr->b_acb, ==, NULL);
- kmem_cache_free(hdr_cache, hdr);
+ if (HDR_HAS_L1HDR(hdr)) {
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+ kmem_cache_free(hdr_full_cache, hdr);
+ } else {
+ kmem_cache_free(hdr_l2only_cache, hdr);
+ }
}
void
arc_buf_free(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- int hashed = hdr->b_state != arc_anon;
+ int hashed = hdr->b_l1hdr.b_state != arc_anon;
ASSERT(buf->b_efunc == NULL);
ASSERT(buf->b_data != NULL);
@@ -1900,10 +2165,10 @@ arc_buf_free(arc_buf_t *buf, void *tag)
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
(void) remove_reference(hdr, hash_lock, tag);
- if (hdr->b_datacnt > 1) {
+ if (hdr->b_l1hdr.b_datacnt > 1) {
arc_buf_destroy(buf, FALSE, TRUE);
} else {
- ASSERT(buf == hdr->b_buf);
+ ASSERT(buf == hdr->b_l1hdr.b_buf);
ASSERT(buf->b_efunc == NULL);
hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
}
@@ -1917,7 +2182,7 @@ arc_buf_free(arc_buf_t *buf, void *tag)
*/
mutex_enter(&arc_eviction_mtx);
(void) remove_reference(hdr, NULL, tag);
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
mutex_exit(&arc_eviction_mtx);
if (destroy_hdr)
@@ -1937,34 +2202,35 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
kmutex_t *hash_lock = HDR_LOCK(hdr);
boolean_t no_callback = (buf->b_efunc == NULL);
- if (hdr->b_state == arc_anon) {
- ASSERT(hdr->b_datacnt == 1);
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ ASSERT(hdr->b_l1hdr.b_datacnt == 1);
arc_buf_free(buf, tag);
return (no_callback);
}
mutex_enter(hash_lock);
hdr = buf->b_hdr;
+ ASSERT(hdr->b_l1hdr.b_datacnt > 0);
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- ASSERT(hdr->b_state != arc_anon);
+ ASSERT(hdr->b_l1hdr.b_state != arc_anon);
ASSERT(buf->b_data != NULL);
(void) remove_reference(hdr, hash_lock, tag);
- if (hdr->b_datacnt > 1) {
+ if (hdr->b_l1hdr.b_datacnt > 1) {
if (no_callback)
arc_buf_destroy(buf, FALSE, TRUE);
} else if (no_callback) {
- ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+ ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
ASSERT(buf->b_efunc == NULL);
hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
}
- ASSERT(no_callback || hdr->b_datacnt > 1 ||
- refcount_is_zero(&hdr->b_refcnt));
+ ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
+ refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
mutex_exit(hash_lock);
return (no_callback);
}
-int
+int32_t
arc_buf_size(arc_buf_t *buf)
{
return (buf->b_hdr->b_size);
@@ -2005,7 +2271,7 @@ arc_buf_eviction_needed(arc_buf_t *buf)
return (B_TRUE);
}
- if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
+ if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
evict_needed = B_TRUE;
mutex_exit(&buf->b_evict_lock);
@@ -2065,15 +2331,16 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
realtype = ARC_BUFC_METADATA;
} else if (arc_meta_used <= arc_meta_min) {
realtype = ARC_BUFC_DATA;
- } else {
#ifdef illumos
- if (data_hdr->b_arc_access <
- metadata_hdr->b_arc_access) {
- realtype = ARC_BUFC_DATA;
- } else {
- realtype = ARC_BUFC_METADATA;
- }
+ } else if (HDR_HAS_L1HDR(data_hdr) &&
+ HDR_HAS_L1HDR(metadata_hdr) &&
+ data_hdr->b_l1hdr.b_arc_access <
+ metadata_hdr->b_l1hdr.b_arc_access) {
+ realtype = ARC_BUFC_DATA;
+ } else {
+ realtype = ARC_BUFC_METADATA;
#else
+ } else {
/* TODO */
realtype = type;
#endif
@@ -2112,17 +2379,32 @@ evict_start:
lock = ARCS_LOCK(state, (offset + idx));
evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
- mutex_enter(lock);
+ /*
+ * The ghost list lock must be acquired first in order to prevent
+ * a 3 party deadlock:
+ *
+ * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
+ * l2ad_mtx in arc_hdr_realloc
+ * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
+ * - arc_evict acquires arc_*_ghost->arcs_mtx, followed by
+ * arc_*_ghost->arcs_mtx and forms a deadlock cycle.
+ *
+ * This situation is avoided by acquiring the ghost list lock first.
+ */
mutex_enter(evicted_lock);
+ mutex_enter(lock);
for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
hdr_prev = list_prev(list, hdr);
- bytes_remaining -= (hdr->b_size * hdr->b_datacnt);
+ if (HDR_HAS_L1HDR(hdr)) {
+ bytes_remaining -=
+ (hdr->b_size * hdr->b_l1hdr.b_datacnt);
+ }
/* prefetch buffers have a minimum lifespan */
if (HDR_IO_IN_PROGRESS(hdr) ||
(spa && hdr->b_spa != spa) ||
- (hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT) &&
- ddi_get_lbolt() - hdr->b_arc_access <
+ ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+ ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
arc_min_prefetch_lifespan)) {
skipped++;
continue;
@@ -2147,11 +2429,11 @@ evict_start:
*/
if (!recycle && count++ > arc_evict_iterations) {
list_insert_after(list, hdr, &marker);
- mutex_exit(evicted_lock);
mutex_exit(lock);
+ mutex_exit(evicted_lock);
kpreempt(KPREEMPT_SYNC);
- mutex_enter(lock);
mutex_enter(evicted_lock);
+ mutex_enter(lock);
hdr_prev = list_prev(list, &marker);
list_remove(list, &marker);
count = 0;
@@ -2161,28 +2443,29 @@ evict_start:
hash_lock = HDR_LOCK(hdr);
have_lock = MUTEX_HELD(hash_lock);
if (have_lock || mutex_tryenter(hash_lock)) {
- ASSERT0(refcount_count(&hdr->b_refcnt));
- ASSERT(hdr->b_datacnt > 0);
- while (hdr->b_buf) {
- arc_buf_t *buf = hdr->b_buf;
+ ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
+ while (hdr->b_l1hdr.b_buf) {
+ arc_buf_t *buf = hdr->b_l1hdr.b_buf;
if (!mutex_tryenter(&buf->b_evict_lock)) {
missed += 1;
break;
}
- if (buf->b_data) {
+ if (buf->b_data != NULL) {
bytes_evicted += hdr->b_size;
- if (recycle && hdr->b_type == type &&
+ if (recycle &&
+ arc_buf_type(hdr) == type &&
hdr->b_size == bytes &&
!HDR_L2_WRITING(hdr)) {
stolen = buf->b_data;
recycle = FALSE;
}
}
- if (buf->b_efunc) {
+ if (buf->b_efunc != NULL) {
mutex_enter(&arc_eviction_mtx);
arc_buf_destroy(buf,
buf->b_data == stolen, FALSE);
- hdr->b_buf = buf->b_next;
+ hdr->b_l1hdr.b_buf = buf->b_next;
buf->b_hdr = &arc_eviction_hdr;
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
@@ -2195,7 +2478,7 @@ evict_start:
}
}
- if (hdr->b_l2hdr) {
+ if (HDR_HAS_L2HDR(hdr)) {
ARCSTAT_INCR(arcstat_evict_l2_cached,
hdr->b_size);
} else {
@@ -2209,7 +2492,7 @@ evict_start:
}
}
- if (hdr->b_datacnt == 0) {
+ if (hdr->b_l1hdr.b_datacnt == 0) {
arc_change_state(evicted_state, hdr, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(hdr));
hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
@@ -2232,8 +2515,8 @@ evict_start:
}
}
- mutex_exit(evicted_lock);
mutex_exit(lock);
+ mutex_exit(evicted_lock);
idx = ((idx + 1) & (list_count - 1));
lists++;
@@ -2302,7 +2585,7 @@ evict_start:
mutex_enter(lock);
for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
hdr_prev = list_prev(list, hdr);
- if (hdr->b_type > ARC_BUFC_NUMTYPES)
+ if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES)
panic("invalid hdr=%p", (void *)hdr);
if (spa && hdr->b_spa != spa)
continue;
@@ -2334,16 +2617,23 @@ evict_start:
}
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT(hdr->b_buf == NULL);
+ ASSERT(!HDR_HAS_L1HDR(hdr) ||
+ hdr->b_l1hdr.b_buf == NULL);
ARCSTAT_BUMP(arcstat_deleted);
bytes_deleted += hdr->b_size;
- if (hdr->b_l2hdr != NULL) {
+ if (HDR_HAS_L2HDR(hdr)) {
/*
* This buffer is cached on the 2nd Level ARC;
* don't destroy the header.
*/
arc_change_state(arc_l2c_only, hdr, hash_lock);
+ /*
+ * dropping from L1+L2 cached to L2-only,
+ * realloc to remove the L1 header.
+ */
+ hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+ hdr_l2only_cache);
mutex_exit(hash_lock);
} else {
arc_change_state(arc_anon, hdr, hash_lock);
@@ -2504,27 +2794,27 @@ arc_flush(spa_t *spa)
{
uint64_t guid = 0;
- if (spa)
+ if (spa != NULL)
guid = spa_load_guid(spa);
while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
- if (spa)
+ if (spa != NULL)
break;
}
while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
- if (spa)
+ if (spa != NULL)
break;
}
while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
- if (spa)
+ if (spa != NULL)
break;
}
while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
- if (spa)
+ if (spa != NULL)
break;
}
@@ -2736,7 +3026,8 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
}
}
kmem_cache_reap_now(buf_cache);
- kmem_cache_reap_now(hdr_cache);
+ kmem_cache_reap_now(hdr_full_cache);
+ kmem_cache_reap_now(hdr_l2only_cache);
kmem_cache_reap_now(range_seg_cache);
#ifdef sun
@@ -2932,9 +3223,9 @@ arc_evict_needed(arc_buf_contents_t type)
static void
arc_get_data_buf(arc_buf_t *buf)
{
- arc_state_t *state = buf->b_hdr->b_state;
+ arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
uint64_t size = buf->b_hdr->b_size;
- arc_buf_contents_t type = buf->b_hdr->b_type;
+ arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
arc_adapt(size, state);
@@ -2960,8 +3251,7 @@ arc_get_data_buf(arc_buf_t *buf)
* will end up on the mru list; so steal space from there.
*/
if (state == arc_mfu_ghost)
- state = buf->b_hdr->b_flags & ARC_FLAG_PREFETCH ?
- arc_mru : arc_mfu;
+ state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu;
else if (state == arc_mru_ghost)
state = arc_mru;
@@ -2993,19 +3283,20 @@ out:
* Update the state size. Note that ghost states have a
* "ghost size" and so don't need to be updated.
*/
- if (!GHOST_STATE(buf->b_hdr->b_state)) {
+ if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
arc_buf_hdr_t *hdr = buf->b_hdr;
- atomic_add_64(&hdr->b_state->arcs_size, size);
- if (list_link_active(&hdr->b_arc_node)) {
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
- atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
+ atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size);
+ if (list_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
+ size);
}
/*
* If we are growing the cache, and we are adding anonymous
* data, and we have outgrown arc_p, update arc_p
*/
- if (arc_size < arc_c && hdr->b_state == arc_anon &&
+ if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
arc_p = MIN(arc_c, arc_p + size);
}
@@ -3022,20 +3313,21 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
clock_t now;
ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(HDR_HAS_L1HDR(hdr));
- if (hdr->b_state == arc_anon) {
+ if (hdr->b_l1hdr.b_state == arc_anon) {
/*
* This buffer is not in the cache, and does not
* appear in our "ghost" list. Add the new buffer
* to the MRU state.
*/
- ASSERT(hdr->b_arc_access == 0);
- hdr->b_arc_access = ddi_get_lbolt();
+ ASSERT0(hdr->b_l1hdr.b_arc_access);
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
arc_change_state(arc_mru, hdr, hash_lock);
- } else if (hdr->b_state == arc_mru) {
+ } else if (hdr->b_l1hdr.b_state == arc_mru) {
now = ddi_get_lbolt();
/*
@@ -3046,14 +3338,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* - move the buffer to the head of the list if this is
* another prefetch (to make it less likely to be evicted).
*/
- if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) {
- if (refcount_count(&hdr->b_refcnt) == 0) {
- ASSERT(list_link_active(&hdr->b_arc_node));
+ if (HDR_PREFETCH(hdr)) {
+ if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+ ASSERT(list_link_active(
+ &hdr->b_l1hdr.b_arc_node));
} else {
hdr->b_flags &= ~ARC_FLAG_PREFETCH;
ARCSTAT_BUMP(arcstat_mru_hits);
}
- hdr->b_arc_access = now;
+ hdr->b_l1hdr.b_arc_access = now;
return;
}
@@ -3062,18 +3355,18 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* but it is still in the cache. Move it to the MFU
* state.
*/
- if (now > hdr->b_arc_access + ARC_MINTIME) {
+ if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
/*
* More than 125ms have passed since we
* instantiated this buffer. Move it to the
* most frequently used state.
*/
- hdr->b_arc_access = now;
+ hdr->b_l1hdr.b_arc_access = now;
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
arc_change_state(arc_mfu, hdr, hash_lock);
}
ARCSTAT_BUMP(arcstat_mru_hits);
- } else if (hdr->b_state == arc_mru_ghost) {
+ } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
arc_state_t *new_state;
/*
* This buffer has been "accessed" recently, but
@@ -3081,9 +3374,9 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* MFU state.
*/
- if (hdr->b_flags & ARC_FLAG_PREFETCH) {
+ if (HDR_PREFETCH(hdr)) {
new_state = arc_mru;
- if (refcount_count(&hdr->b_refcnt) > 0)
+ if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
hdr->b_flags &= ~ARC_FLAG_PREFETCH;
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
} else {
@@ -3091,11 +3384,11 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
}
- hdr->b_arc_access = ddi_get_lbolt();
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
arc_change_state(new_state, hdr, hash_lock);
ARCSTAT_BUMP(arcstat_mru_ghost_hits);
- } else if (hdr->b_state == arc_mfu) {
+ } else if (hdr->b_l1hdr.b_state == arc_mfu) {
/*
* This buffer has been accessed more than once and is
* still in the cache. Keep it in the MFU state.
@@ -3105,13 +3398,13 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* If it was a prefetch, we will explicitly move it to
* the head of the list now.
*/
- if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) {
- ASSERT(refcount_count(&hdr->b_refcnt) == 0);
- ASSERT(list_link_active(&hdr->b_arc_node));
+ if ((HDR_PREFETCH(hdr)) != 0) {
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
}
ARCSTAT_BUMP(arcstat_mfu_hits);
- hdr->b_arc_access = ddi_get_lbolt();
- } else if (hdr->b_state == arc_mfu_ghost) {
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
arc_state_t *new_state = arc_mfu;
/*
* This buffer has been accessed more than once but has
@@ -3119,26 +3412,26 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* MFU state.
*/
- if (hdr->b_flags & ARC_FLAG_PREFETCH) {
+ if (HDR_PREFETCH(hdr)) {
/*
* This is a prefetch access...
* move this block back to the MRU state.
*/
- ASSERT0(refcount_count(&hdr->b_refcnt));
+ ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
new_state = arc_mru;
}
- hdr->b_arc_access = ddi_get_lbolt();
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
arc_change_state(new_state, hdr, hash_lock);
ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
- } else if (hdr->b_state == arc_l2c_only) {
+ } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
/*
* This buffer is on the 2nd Level ARC.
*/
- hdr->b_arc_access = ddi_get_lbolt();
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
arc_change_state(arc_mfu, hdr, hash_lock);
} else {
@@ -3209,11 +3502,11 @@ arc_read_done(zio_t *zio)
}
hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
- if (l2arc_noprefetch && (hdr->b_flags & ARC_FLAG_PREFETCH))
+ if (l2arc_noprefetch && HDR_PREFETCH(hdr))
hdr->b_flags &= ~ARC_FLAG_L2CACHE;
/* byteswap if necessary */
- callback_list = hdr->b_acb;
+ callback_list = hdr->b_l1hdr.b_acb;
ASSERT(callback_list != NULL);
if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
dmu_object_byteswap_t bswap =
@@ -3229,7 +3522,8 @@ arc_read_done(zio_t *zio)
arc_buf_watch(buf);
#endif /* illumos */
- if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+ if (hash_lock && zio->io_error == 0 &&
+ hdr->b_l1hdr.b_state == arc_anon) {
/*
* Only call arc_access on anonymous buffers. This is because
* if we've issued an I/O for an evicted buffer, we've already
@@ -3251,24 +3545,25 @@ arc_read_done(zio_t *zio)
abuf = NULL;
}
}
- hdr->b_acb = NULL;
+ hdr->b_l1hdr.b_acb = NULL;
hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
ASSERT(!HDR_BUF_AVAILABLE(hdr));
if (abuf == buf) {
ASSERT(buf->b_efunc == NULL);
- ASSERT(hdr->b_datacnt == 1);
+ ASSERT(hdr->b_l1hdr.b_datacnt == 1);
hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
}
- ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
+ callback_list != NULL);
if (zio->io_error != 0) {
hdr->b_flags |= ARC_FLAG_IO_ERROR;
- if (hdr->b_state != arc_anon)
+ if (hdr->b_l1hdr.b_state != arc_anon)
arc_change_state(arc_anon, hdr, hash_lock);
if (HDR_IN_HASH_TABLE(hdr))
buf_hash_remove(hdr);
- freeable = refcount_is_zero(&hdr->b_refcnt);
+ freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
}
/*
@@ -3276,9 +3571,9 @@ arc_read_done(zio_t *zio)
* that the hdr (and hence the cv) might be freed before we get to
* the cv_broadcast().
*/
- cv_broadcast(&hdr->b_cv);
+ cv_broadcast(&hdr->b_l1hdr.b_cv);
- if (hash_lock) {
+ if (hash_lock != NULL) {
mutex_exit(hash_lock);
} else {
/*
@@ -3287,8 +3582,8 @@ arc_read_done(zio_t *zio)
* moved to the anonymous state (so that it won't show up
* in the cache).
*/
- ASSERT3P(hdr->b_state, ==, arc_anon);
- freeable = refcount_is_zero(&hdr->b_refcnt);
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
}
/* execute each callback and free its structure */
@@ -3350,14 +3645,14 @@ top:
hdr = buf_hash_find(guid, bp, &hash_lock);
}
- if (hdr != NULL && hdr->b_datacnt > 0) {
+ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
*arc_flags |= ARC_FLAG_CACHED;
if (HDR_IO_IN_PROGRESS(hdr)) {
if (*arc_flags & ARC_FLAG_WAIT) {
- cv_wait(&hdr->b_cv, hash_lock);
+ cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
mutex_exit(hash_lock);
goto top;
}
@@ -3375,8 +3670,8 @@ top:
spa, NULL, NULL, NULL, zio_flags);
ASSERT(acb->acb_done != NULL);
- acb->acb_next = hdr->b_acb;
- hdr->b_acb = acb;
+ acb->acb_next = hdr->b_l1hdr.b_acb;
+ hdr->b_l1hdr.b_acb = acb;
add_reference(hdr, hash_lock, private);
mutex_exit(hash_lock);
return (0);
@@ -3385,7 +3680,8 @@ top:
return (0);
}
- ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+ ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+ hdr->b_l1hdr.b_state == arc_mfu);
if (done) {
add_reference(hdr, hash_lock, private);
@@ -3394,7 +3690,7 @@ top:
* copy of the data so that we will be guaranteed
* that arc_release() will always succeed.
*/
- buf = hdr->b_buf;
+ buf = hdr->b_l1hdr.b_buf;
ASSERT(buf);
ASSERT(buf->b_data);
if (HDR_BUF_AVAILABLE(hdr)) {
@@ -3405,7 +3701,7 @@ top:
}
} else if (*arc_flags & ARC_FLAG_PREFETCH &&
- refcount_count(&hdr->b_refcnt) == 0) {
+ refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
hdr->b_flags |= ARC_FLAG_PREFETCH;
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
@@ -3416,8 +3712,8 @@ top:
hdr->b_flags |= ARC_FLAG_L2COMPRESS;
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits);
- ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH),
- demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, hits);
if (done)
@@ -3429,7 +3725,7 @@ top:
uint64_t addr = 0;
boolean_t devw = B_FALSE;
enum zio_compress b_compress = ZIO_COMPRESS_OFF;
- uint64_t b_asize = 0;
+ int32_t b_asize = 0;
if (hdr == NULL) {
/* this block is not in the cache */
@@ -3440,7 +3736,6 @@ top:
if (!BP_IS_EMBEDDED(bp)) {
hdr->b_dva = *BP_IDENTITY(bp);
hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
- hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
exists = buf_hash_insert(hdr, &hash_lock);
}
if (exists != NULL) {
@@ -3464,11 +3759,20 @@ top:
if (BP_GET_LEVEL(bp) > 0)
hdr->b_flags |= ARC_FLAG_INDIRECT;
} else {
- /* this block is in the ghost cache */
- ASSERT(GHOST_STATE(hdr->b_state));
+ /*
+ * This block is in the ghost cache. If it was L2-only
+ * (and thus didn't have an L1 hdr), we realloc the
+ * header to add an L1 hdr.
+ */
+ if (!HDR_HAS_L1HDR(hdr)) {
+ hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
+ hdr_full_cache);
+ }
+
+ ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT0(refcount_count(&hdr->b_refcnt));
- ASSERT(hdr->b_buf == NULL);
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT(hdr->b_l1hdr.b_buf == NULL);
/* if this is a prefetch, we don't have a reference */
if (*arc_flags & ARC_FLAG_PREFETCH)
@@ -3485,29 +3789,29 @@ top:
buf->b_efunc = NULL;
buf->b_private = NULL;
buf->b_next = NULL;
- hdr->b_buf = buf;
- ASSERT(hdr->b_datacnt == 0);
- hdr->b_datacnt = 1;
+ hdr->b_l1hdr.b_buf = buf;
+ ASSERT0(hdr->b_l1hdr.b_datacnt);
+ hdr->b_l1hdr.b_datacnt = 1;
arc_get_data_buf(buf);
arc_access(hdr, hash_lock);
}
- ASSERT(!GHOST_STATE(hdr->b_state));
+ ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
- ASSERT(hdr->b_acb == NULL);
- hdr->b_acb = acb;
+ ASSERT(hdr->b_l1hdr.b_acb == NULL);
+ hdr->b_l1hdr.b_acb = acb;
hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
- if (hdr->b_l2hdr != NULL &&
- (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
- devw = hdr->b_l2hdr->b_dev->l2ad_writing;
- addr = hdr->b_l2hdr->b_daddr;
- b_compress = hdr->b_l2hdr->b_compress;
- b_asize = hdr->b_l2hdr->b_asize;
+ if (HDR_HAS_L2HDR(hdr) &&
+ (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
+ devw = hdr->b_l2hdr.b_dev->l2ad_writing;
+ addr = hdr->b_l2hdr.b_daddr;
+ b_compress = HDR_GET_COMPRESS(hdr);
+ b_asize = hdr->b_l2hdr.b_asize;
/*
* Lock out device removal.
*/
@@ -3527,8 +3831,8 @@ top:
DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
uint64_t, size, zbookmark_phys_t *, zb);
ARCSTAT_BUMP(arcstat_misses);
- ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH),
- demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, misses);
#ifdef _KERNEL
curthread->td_ru.ru_inblock++;
@@ -3544,7 +3848,7 @@ top:
* also have invalidated the vdev.
* 5. This isn't prefetch and l2arc_noprefetch is set.
*/
- if (hdr->b_l2hdr != NULL &&
+ if (HDR_HAS_L2HDR(hdr) &&
!HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
!(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
l2arc_read_callback_t *cb;
@@ -3636,8 +3940,9 @@ void
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
{
ASSERT(buf->b_hdr != NULL);
- ASSERT(buf->b_hdr->b_state != arc_anon);
- ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+ ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
+ func == NULL);
ASSERT(buf->b_efunc == NULL);
ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
@@ -3661,7 +3966,7 @@ arc_freed(spa_t *spa, const blkptr_t *bp)
if (hdr == NULL)
return;
if (HDR_BUF_AVAILABLE(hdr)) {
- arc_buf_t *buf = hdr->b_buf;
+ arc_buf_t *buf = hdr->b_l1hdr.b_buf;
add_reference(hdr, hash_lock, FTAG);
hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
mutex_exit(hash_lock);
@@ -3721,17 +4026,19 @@ arc_clear_callback(arc_buf_t *buf)
hdr = buf->b_hdr;
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
- ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+ ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
+ hdr->b_l1hdr.b_datacnt);
+ ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+ hdr->b_l1hdr.b_state == arc_mfu);
buf->b_efunc = NULL;
buf->b_private = NULL;
- if (hdr->b_datacnt > 1) {
+ if (hdr->b_l1hdr.b_datacnt > 1) {
mutex_exit(&buf->b_evict_lock);
arc_buf_destroy(buf, FALSE, TRUE);
} else {
- ASSERT(buf == hdr->b_buf);
+ ASSERT(buf == hdr->b_l1hdr.b_buf);
hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
mutex_exit(&buf->b_evict_lock);
}
@@ -3750,10 +4057,7 @@ arc_clear_callback(arc_buf_t *buf)
void
arc_release(arc_buf_t *buf, void *tag)
{
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock = NULL;
- l2arc_buf_hdr_t *l2hdr;
- uint64_t buf_size;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
/*
* It would be nice to assert that if it's DMU metadata (level >
@@ -3762,57 +4066,94 @@ arc_release(arc_buf_t *buf, void *tag)
*/
mutex_enter(&buf->b_evict_lock);
- hdr = buf->b_hdr;
+ /*
+ * We don't grab the hash lock prior to this check, because if
+ * the buffer's header is in the arc_anon state, it won't be
+ * linked into the hash table.
+ */
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ mutex_exit(&buf->b_evict_lock);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+ ASSERT(!HDR_HAS_L2HDR(hdr));
+ ASSERT(BUF_EMPTY(hdr));
+ ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
+ ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
- /* this buffer is not on any list */
- ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+ ASSERT3P(buf->b_efunc, ==, NULL);
+ ASSERT3P(buf->b_private, ==, NULL);
- if (hdr->b_state == arc_anon) {
- /* this buffer is already released */
- ASSERT(buf->b_efunc == NULL);
- } else {
- hash_lock = HDR_LOCK(hdr);
- mutex_enter(hash_lock);
- hdr = buf->b_hdr;
- ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ hdr->b_l1hdr.b_arc_access = 0;
+ arc_buf_thaw(buf);
+
+ return;
}
- l2hdr = hdr->b_l2hdr;
- if (l2hdr) {
- mutex_enter(&l2arc_buflist_mtx);
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ /*
+ * This assignment is only valid as long as the hash_lock is
+ * held, we must be careful not to reference state or the
+ * b_state field after dropping the lock.
+ */
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ ASSERT3P(state, !=, arc_anon);
+
+ /* this buffer is not on any list */
+ ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+
+ mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+ trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
+ hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0);
+ list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr);
+
+ /*
+ * We don't want to leak the b_tmp_cdata buffer that was
+ * allocated in l2arc_write_buffers()
+ */
arc_buf_l2_cdata_free(hdr);
- hdr->b_l2hdr = NULL;
- list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+
+ mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+
+ hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
}
- buf_size = hdr->b_size;
/*
* Do we have more than one buf?
*/
- if (hdr->b_datacnt > 1) {
+ if (hdr->b_l1hdr.b_datacnt > 1) {
arc_buf_hdr_t *nhdr;
arc_buf_t **bufp;
uint64_t blksz = hdr->b_size;
uint64_t spa = hdr->b_spa;
- arc_buf_contents_t type = hdr->b_type;
+ arc_buf_contents_t type = arc_buf_type(hdr);
uint32_t flags = hdr->b_flags;
- ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
+ ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
/*
* Pull the data off of this hdr and attach it to
* a new anonymous hdr.
*/
(void) remove_reference(hdr, hash_lock, tag);
- bufp = &hdr->b_buf;
+ bufp = &hdr->b_l1hdr.b_buf;
while (*bufp != buf)
bufp = &(*bufp)->b_next;
*bufp = buf->b_next;
buf->b_next = NULL;
- ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
- atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
- if (refcount_is_zero(&hdr->b_refcnt)) {
- uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
+ ASSERT3P(state, !=, arc_l2c_only);
+ ASSERT3U(state->arcs_size, >=, hdr->b_size);
+ atomic_add_64(&state->arcs_size, -hdr->b_size);
+ if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ ASSERT3P(state, !=, arc_l2c_only);
+ uint64_t *size = &state->arcs_lsize[type];
ASSERT3U(*size, >=, hdr->b_size);
atomic_add_64(size, -hdr->b_size);
}
@@ -3821,12 +4162,12 @@ arc_release(arc_buf_t *buf, void *tag)
* We're releasing a duplicate user data buffer, update
* our statistics accordingly.
*/
- if (hdr->b_type == ARC_BUFC_DATA) {
+ if (HDR_ISTYPE_DATA(hdr)) {
ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
ARCSTAT_INCR(arcstat_duplicate_buffers_size,
-hdr->b_size);
}
- hdr->b_datacnt -= 1;
+ hdr->b_l1hdr.b_datacnt -= 1;
arc_cksum_verify(buf);
#ifdef illumos
arc_buf_unwatch(buf);
@@ -3834,48 +4175,39 @@ arc_release(arc_buf_t *buf, void *tag)
mutex_exit(hash_lock);
- nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
nhdr->b_size = blksz;
nhdr->b_spa = spa;
- nhdr->b_type = type;
- nhdr->b_buf = buf;
- nhdr->b_state = arc_anon;
- nhdr->b_arc_access = 0;
+
nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
- nhdr->b_l2hdr = NULL;
- nhdr->b_datacnt = 1;
+ nhdr->b_flags |= arc_bufc_to_flags(type);
+ nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
+
+ nhdr->b_l1hdr.b_buf = buf;
+ nhdr->b_l1hdr.b_datacnt = 1;
+ nhdr->b_l1hdr.b_state = arc_anon;
+ nhdr->b_l1hdr.b_arc_access = 0;
nhdr->b_freeze_cksum = NULL;
- (void) refcount_add(&nhdr->b_refcnt, tag);
+
+ (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
buf->b_hdr = nhdr;
mutex_exit(&buf->b_evict_lock);
atomic_add_64(&arc_anon->arcs_size, blksz);
} else {
mutex_exit(&buf->b_evict_lock);
- ASSERT(refcount_count(&hdr->b_refcnt) == 1);
- ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
+ /* protected by hash lock */
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- if (hdr->b_state != arc_anon)
- arc_change_state(arc_anon, hdr, hash_lock);
- hdr->b_arc_access = 0;
- if (hash_lock)
- mutex_exit(hash_lock);
+ arc_change_state(arc_anon, hdr, hash_lock);
+ hdr->b_l1hdr.b_arc_access = 0;
+ mutex_exit(hash_lock);
buf_discard_identity(hdr);
arc_buf_thaw(buf);
}
buf->b_efunc = NULL;
buf->b_private = NULL;
-
- if (l2hdr) {
- ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
- vdev_space_update(l2hdr->b_dev->l2ad_vdev,
- -l2hdr->b_asize, 0, 0);
- trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
- l2hdr->b_asize, 0);
- kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
- ARCSTAT_INCR(arcstat_l2_size, -buf_size);
- mutex_exit(&l2arc_buflist_mtx);
- }
}
int
@@ -3884,7 +4216,8 @@ arc_released(arc_buf_t *buf)
int released;
mutex_enter(&buf->b_evict_lock);
- released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
+ released = (buf->b_data != NULL &&
+ buf->b_hdr->b_l1hdr.b_state == arc_anon);
mutex_exit(&buf->b_evict_lock);
return (released);
}
@@ -3896,7 +4229,7 @@ arc_referenced(arc_buf_t *buf)
int referenced;
mutex_enter(&buf->b_evict_lock);
- referenced = (refcount_count(&buf->b_hdr->b_refcnt));
+ referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
mutex_exit(&buf->b_evict_lock);
return (referenced);
}
@@ -3909,7 +4242,9 @@ arc_write_ready(zio_t *zio)
arc_buf_t *buf = callback->awcb_buf;
arc_buf_hdr_t *hdr = buf->b_hdr;
- ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
+ ASSERT(hdr->b_l1hdr.b_datacnt > 0);
callback->awcb_ready(zio, buf, callback->awcb_private);
/*
@@ -3919,12 +4254,12 @@ arc_write_ready(zio_t *zio)
* accounting for any re-write attempt.
*/
if (HDR_IO_IN_PROGRESS(hdr)) {
- mutex_enter(&hdr->b_freeze_lock);
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
if (hdr->b_freeze_cksum != NULL) {
kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
hdr->b_freeze_cksum = NULL;
}
- mutex_exit(&hdr->b_freeze_lock);
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
}
arc_cksum_compute(buf, B_FALSE);
hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
@@ -3949,7 +4284,7 @@ arc_write_done(zio_t *zio)
arc_buf_t *buf = callback->awcb_buf;
arc_buf_hdr_t *hdr = buf->b_hdr;
- ASSERT(hdr->b_acb == NULL);
+ ASSERT(hdr->b_l1hdr.b_acb == NULL);
if (zio->io_error == 0) {
if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
@@ -3957,7 +4292,6 @@ arc_write_done(zio_t *zio)
} else {
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
- hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
}
} else {
ASSERT(BUF_EMPTY(hdr));
@@ -3978,7 +4312,7 @@ arc_write_done(zio_t *zio)
arc_cksum_verify(buf);
exists = buf_hash_insert(hdr, &hash_lock);
- if (exists) {
+ if (exists != NULL) {
/*
* This can only happen if we overwrite for
* sync-to-convergence, because we remove
@@ -3988,7 +4322,8 @@ arc_write_done(zio_t *zio)
if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
panic("bad overwrite, hdr=%p exists=%p",
(void *)hdr, (void *)exists);
- ASSERT(refcount_is_zero(&exists->b_refcnt));
+ ASSERT(refcount_is_zero(
+ &exists->b_l1hdr.b_refcnt));
arc_change_state(arc_anon, exists, hash_lock);
mutex_exit(hash_lock);
arc_hdr_destroy(exists);
@@ -4002,22 +4337,22 @@ arc_write_done(zio_t *zio)
(void *)hdr, (void *)exists);
} else {
/* Dedup */
- ASSERT(hdr->b_datacnt == 1);
- ASSERT(hdr->b_state == arc_anon);
+ ASSERT(hdr->b_l1hdr.b_datacnt == 1);
+ ASSERT(hdr->b_l1hdr.b_state == arc_anon);
ASSERT(BP_GET_DEDUP(zio->io_bp));
ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
}
}
hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
/* if it's not anon, we are doing a scrub */
- if (!exists && hdr->b_state == arc_anon)
+ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
} else {
hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
}
- ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+ ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
callback->awcb_done(zio, buf, callback->awcb_private);
kmem_free(callback, sizeof (arc_write_callback_t));
@@ -4037,8 +4372,9 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
ASSERT(ready != NULL);
ASSERT(done != NULL);
ASSERT(!HDR_IO_ERROR(hdr));
- ASSERT((hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) == 0);
- ASSERT(hdr->b_acb == NULL);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(hdr->b_l1hdr.b_acb == NULL);
+ ASSERT(hdr->b_l1hdr.b_datacnt > 0);
if (l2arc)
hdr->b_flags |= ARC_FLAG_L2CACHE;
if (l2arc_compress)
@@ -4290,17 +4626,23 @@ arc_init(void)
NULL, MUTEX_DEFAULT, NULL);
list_create(&arc_mru->arcs_lists[i],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
list_create(&arc_mru_ghost->arcs_lists[i],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
list_create(&arc_mfu->arcs_lists[i],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
list_create(&arc_mfu_ghost->arcs_lists[i],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
list_create(&arc_mfu_ghost->arcs_lists[i],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
list_create(&arc_l2c_only->arcs_lists[i],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
}
buf_init();
@@ -4423,7 +4765,7 @@ arc_fini(void)
buf_fini();
- ASSERT(arc_loaned_bytes == 0);
+ ASSERT0(arc_loaned_bytes);
mutex_destroy(&arc_lowmem_lock);
#ifdef _KERNEL
@@ -4591,7 +4933,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
return (B_FALSE);
}
- if (hdr->b_l2hdr != NULL) {
+ if (HDR_HAS_L2HDR(hdr)) {
ARCSTAT_BUMP(arcstat_l2_write_in_l2);
return (B_FALSE);
}
@@ -4653,20 +4995,6 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
return (next);
}
-static void
-l2arc_hdr_stat_add(void)
-{
- ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
- ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
-}
-
-static void
-l2arc_hdr_stat_remove(void)
-{
- ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
- ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
-}
-
/*
* Cycle through L2ARC devices. This is how L2ARC load balances.
* If a device is returned, this also returns holding the spa config lock.
@@ -4763,7 +5091,6 @@ l2arc_write_done(zio_t *zio)
l2arc_dev_t *dev;
list_t *buflist;
arc_buf_hdr_t *head, *hdr, *hdr_prev;
- l2arc_buf_hdr_t *abl2;
kmutex_t *hash_lock;
int64_t bytes_dropped = 0;
@@ -4773,7 +5100,7 @@ l2arc_write_done(zio_t *zio)
ASSERT(dev != NULL);
head = cb->l2wcb_head;
ASSERT(head != NULL);
- buflist = dev->l2ad_buflist;
+ buflist = &dev->l2ad_buflist;
ASSERT(buflist != NULL);
DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
l2arc_write_callback_t *, cb);
@@ -4781,43 +5108,44 @@ l2arc_write_done(zio_t *zio)
if (zio->io_error != 0)
ARCSTAT_BUMP(arcstat_l2_writes_error);
- mutex_enter(&l2arc_buflist_mtx);
+ mutex_enter(&dev->l2ad_mtx);
/*
* All writes completed, or an error was hit.
*/
for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
hdr_prev = list_prev(buflist, hdr);
- abl2 = hdr->b_l2hdr;
-
- /*
- * Release the temporary compressed buffer as soon as possible.
- */
- if (abl2->b_compress != ZIO_COMPRESS_OFF)
- l2arc_release_cdata_buf(hdr);
hash_lock = HDR_LOCK(hdr);
if (!mutex_tryenter(hash_lock)) {
/*
* This buffer misses out. It may be in a stage
- * of eviction. Its ARC_L2_WRITING flag will be
+ * of eviction. Its ARC_FLAG_L2_WRITING flag will be
* left set, denying reads to this buffer.
*/
ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
continue;
}
+ /*
+ * It's possible that this buffer got evicted from the L1 cache
+ * before we grabbed the vdev + hash locks, in which case
+ * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated.
+ * Only free the buffer if we still have an L1 hdr.
+ */
+ if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL &&
+ HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
+ l2arc_release_cdata_buf(hdr);
+
if (zio->io_error != 0) {
/*
* Error - drop L2ARC entry.
*/
- list_remove(buflist, hdr);
- ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
- bytes_dropped += abl2->b_asize;
- hdr->b_l2hdr = NULL;
- trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr,
- abl2->b_asize, 0);
- kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
+ hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0);
+ hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
+
+ ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
}
@@ -4831,8 +5159,9 @@ l2arc_write_done(zio_t *zio)
atomic_inc_64(&l2arc_writes_done);
list_remove(buflist, head);
- kmem_cache_free(hdr_cache, head);
- mutex_exit(&l2arc_buflist_mtx);
+ ASSERT(!HDR_HAS_L1HDR(head));
+ kmem_cache_free(hdr_l2only_cache, head);
+ mutex_exit(&dev->l2ad_mtx);
vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
@@ -4971,16 +5300,12 @@ static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
list_t *buflist;
- l2arc_buf_hdr_t *abl2;
arc_buf_hdr_t *hdr, *hdr_prev;
kmutex_t *hash_lock;
uint64_t taddr;
int64_t bytes_evicted = 0;
- buflist = dev->l2ad_buflist;
-
- if (buflist == NULL)
- return;
+ buflist = &dev->l2ad_buflist;
if (!all && dev->l2ad_first) {
/*
@@ -5003,7 +5328,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
uint64_t, taddr, boolean_t, all);
top:
- mutex_enter(&l2arc_buflist_mtx);
+ mutex_enter(&dev->l2ad_mtx);
for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
hdr_prev = list_prev(buflist, hdr);
@@ -5013,7 +5338,7 @@ top:
* Missed the hash lock. Retry.
*/
ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
- mutex_exit(&l2arc_buflist_mtx);
+ mutex_exit(&dev->l2ad_mtx);
mutex_enter(hash_lock);
mutex_exit(hash_lock);
goto top;
@@ -5029,9 +5354,9 @@ top:
continue;
}
- if (!all && hdr->b_l2hdr != NULL &&
- (hdr->b_l2hdr->b_daddr > taddr ||
- hdr->b_l2hdr->b_daddr < dev->l2ad_hand)) {
+ if (!all && HDR_HAS_L2HDR(hdr) &&
+ (hdr->b_l2hdr.b_daddr > taddr ||
+ hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
/*
* We've evicted to the target address,
* or the end of the device.
@@ -5040,15 +5365,8 @@ top:
break;
}
- if (HDR_FREE_IN_PROGRESS(hdr)) {
- /*
- * Already on the path to destruction.
- */
- mutex_exit(hash_lock);
- continue;
- }
-
- if (hdr->b_state == arc_l2c_only) {
+ ASSERT(HDR_HAS_L2HDR(hdr));
+ if (!HDR_HAS_L1HDR(hdr)) {
ASSERT(!HDR_L2_READING(hdr));
/*
* This doesn't exist in the ARC. Destroy.
@@ -5058,6 +5376,8 @@ top:
arc_change_state(arc_anon, hdr, hash_lock);
arc_hdr_destroy(hdr);
} else {
+ ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
+ ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
/*
* Invalidate issued or about to be issued
* reads, since we may be about to write
@@ -5068,33 +5388,18 @@ top:
hdr->b_flags |= ARC_FLAG_L2_EVICTED;
}
- /*
- * Tell ARC this no longer exists in L2ARC.
- */
- if (hdr->b_l2hdr != NULL) {
- abl2 = hdr->b_l2hdr;
- ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
- bytes_evicted += abl2->b_asize;
- hdr->b_l2hdr = NULL;
- /*
- * We are destroying l2hdr, so ensure that
- * its compressed buffer, if any, is not leaked.
- */
- ASSERT(abl2->b_tmp_cdata == NULL);
- kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
- ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
- }
+ /* Tell ARC this no longer exists in L2ARC. */
+ ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
list_remove(buflist, hdr);
- /*
- * This may have been leftover after a
- * failed write.
- */
+ /* This may have been leftover after a failed write. */
hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
}
mutex_exit(hash_lock);
}
- mutex_exit(&l2arc_buflist_mtx);
+ mutex_exit(&dev->l2ad_mtx);
vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
dev->l2ad_evict = taddr;
@@ -5135,8 +5440,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
pio = NULL;
write_sz = write_asize = 0;
full = B_FALSE;
- head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
+ head->b_flags |= ARC_FLAG_HAS_L2HDR;
ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
/*
@@ -5148,7 +5454,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
/*
* Copy buffers for L2ARC writing.
*/
- mutex_enter(&l2arc_buflist_mtx);
+ mutex_enter(&dev->l2ad_mtx);
for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
uint64_t passed_sz = 0;
@@ -5173,7 +5479,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
headroom = (headroom * l2arc_headroom_boost) / 100;
for (; hdr; hdr = hdr_prev) {
- l2arc_buf_hdr_t *l2hdr;
kmutex_t *hash_lock;
uint64_t buf_sz;
uint64_t buf_a_sz;
@@ -5229,7 +5534,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
* l2arc_write_done() can find where the
* write buffers begin without searching.
*/
- list_insert_head(dev->l2ad_buflist, head);
+ list_insert_head(&dev->l2ad_buflist, head);
cb = kmem_alloc(
sizeof (l2arc_write_callback_t), KM_SLEEP);
@@ -5243,32 +5548,30 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
/*
* Create and add a new L2ARC header.
*/
- l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
- l2hdr->b_dev = dev;
+ hdr->b_l2hdr.b_dev = dev;
hdr->b_flags |= ARC_FLAG_L2_WRITING;
-
/*
* Temporarily stash the data buffer in b_tmp_cdata.
* The subsequent write step will pick it up from
- * there. This is because can't access hdr->b_buf
+ * there. This is because can't access b_l1hdr.b_buf
* without holding the hash_lock, which we in turn
* can't access without holding the ARC list locks
* (which we want to avoid during compression/writing).
*/
- l2hdr->b_compress = ZIO_COMPRESS_OFF;
- l2hdr->b_asize = hdr->b_size;
- l2hdr->b_tmp_cdata = hdr->b_buf->b_data;
+ HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
+ hdr->b_l2hdr.b_asize = hdr->b_size;
+ hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
- hdr->b_l2hdr = l2hdr;
+ hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
- list_insert_head(dev->l2ad_buflist, hdr);
+ list_insert_head(&dev->l2ad_buflist, hdr);
/*
* Compute and store the buffer cksum before
* writing. On debug the cksum is verified first.
*/
- arc_cksum_verify(hdr->b_buf);
- arc_cksum_compute(hdr->b_buf, B_TRUE);
+ arc_cksum_verify(hdr->b_l1hdr.b_buf);
+ arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
mutex_exit(hash_lock);
@@ -5285,8 +5588,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
/* No buffers selected for writing? */
if (pio == NULL) {
ASSERT0(write_sz);
- mutex_exit(&l2arc_buflist_mtx);
- kmem_cache_free(hdr_cache, head);
+ mutex_exit(&dev->l2ad_mtx);
+ ASSERT(!HDR_HAS_L1HDR(head));
+ kmem_cache_free(hdr_l2only_cache, head);
return (0);
}
@@ -5308,24 +5612,22 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
* and work backwards, retracing the course of the buffer selector
* loop above.
*/
- for (hdr = list_prev(dev->l2ad_buflist, head); hdr;
- hdr = list_prev(dev->l2ad_buflist, hdr)) {
- l2arc_buf_hdr_t *l2hdr;
+ for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
+ hdr = list_prev(&dev->l2ad_buflist, hdr)) {
uint64_t buf_sz;
/*
* We shouldn't need to lock the buffer here, since we flagged
* it as ARC_FLAG_L2_WRITING in the previous step, but we must
* take care to only access its L2 cache parameters. In
- * particular, hdr->b_buf may be invalid by now due to
+ * particular, hdr->l1hdr.b_buf may be invalid by now due to
* ARC eviction.
*/
- l2hdr = hdr->b_l2hdr;
- l2hdr->b_daddr = dev->l2ad_hand;
+ hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
- if ((hdr->b_flags & ARC_FLAG_L2COMPRESS) &&
- l2hdr->b_asize >= buf_compress_minsz) {
- if (l2arc_compress_buf(l2hdr)) {
+ if ((HDR_L2COMPRESS(hdr)) &&
+ hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
+ if (l2arc_compress_buf(hdr)) {
/*
* If compression succeeded, enable headroom
* boost on the next scan cycle.
@@ -5338,16 +5640,16 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
* Pick up the buffer data we had previously stashed away
* (and now potentially also compressed).
*/
- buf_data = l2hdr->b_tmp_cdata;
- buf_sz = l2hdr->b_asize;
+ buf_data = hdr->b_l1hdr.b_tmp_cdata;
+ buf_sz = hdr->b_l2hdr.b_asize;
/*
* If the data has not been compressed, then clear b_tmp_cdata
* to make sure that it points only to a temporary compression
* buffer.
*/
- if (!L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress))
- l2hdr->b_tmp_cdata = NULL;
+ if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)))
+ hdr->b_l1hdr.b_tmp_cdata = NULL;
/* Compression may have squashed the buffer to zero length. */
if (buf_sz != 0) {
@@ -5372,7 +5674,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
}
}
- mutex_exit(&l2arc_buflist_mtx);
+ mutex_exit(&dev->l2ad_mtx);
ASSERT3U(write_asize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
@@ -5400,7 +5702,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
/*
* Compresses an L2ARC buffer.
- * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
+ * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
* size in l2hdr->b_asize. This routine tries to compress the data and
* depending on the compression result there are three possible outcomes:
* *) The buffer was incompressible. The original l2hdr contents were left
@@ -5418,25 +5720,29 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
* buffer was incompressible).
*/
static boolean_t
-l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
+l2arc_compress_buf(arc_buf_hdr_t *hdr)
{
void *cdata;
size_t csize, len, rounded;
+ ASSERT(HDR_HAS_L2HDR(hdr));
+ l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
- ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
- ASSERT(l2hdr->b_tmp_cdata != NULL);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF);
+ ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
len = l2hdr->b_asize;
cdata = zio_data_buf_alloc(len);
- csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
+ ASSERT3P(cdata, !=, NULL);
+ csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
cdata, l2hdr->b_asize);
if (csize == 0) {
/* zero block, indicate that there's nothing to write */
zio_data_buf_free(cdata, len);
- l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
+ HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY);
l2hdr->b_asize = 0;
- l2hdr->b_tmp_cdata = NULL;
+ hdr->b_l1hdr.b_tmp_cdata = NULL;
ARCSTAT_BUMP(arcstat_l2_compress_zeros);
return (B_TRUE);
}
@@ -5452,9 +5758,9 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
bzero((char *)cdata + csize, rounded - csize);
csize = rounded;
}
- l2hdr->b_compress = ZIO_COMPRESS_LZ4;
+ HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
l2hdr->b_asize = csize;
- l2hdr->b_tmp_cdata = cdata;
+ hdr->b_l1hdr.b_tmp_cdata = cdata;
ARCSTAT_BUMP(arcstat_l2_compress_successes);
return (B_TRUE);
} else {
@@ -5499,9 +5805,9 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
* need to fill its io_data after we're done restoring the
* buffer's contents.
*/
- ASSERT(hdr->b_buf != NULL);
- bzero(hdr->b_buf->b_data, hdr->b_size);
- zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
+ ASSERT(hdr->b_l1hdr.b_buf != NULL);
+ bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
+ zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
} else {
ASSERT(zio->io_data != NULL);
/*
@@ -5539,19 +5845,18 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
static void
l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
{
- l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
-
- ASSERT(L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress));
- if (l2hdr->b_compress != ZIO_COMPRESS_EMPTY) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) {
/*
* If the data was compressed, then we've allocated a
* temporary buffer for it, so now we need to release it.
*/
- ASSERT(l2hdr->b_tmp_cdata != NULL);
- zio_data_buf_free(l2hdr->b_tmp_cdata, hdr->b_size);
- l2hdr->b_tmp_cdata = NULL;
+ ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
+ zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
+ hdr->b_size);
+ hdr->b_l1hdr.b_tmp_cdata = NULL;
} else {
- ASSERT(l2hdr->b_tmp_cdata == NULL);
+ ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL);
}
}
@@ -5695,13 +6000,13 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
+ mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
/*
* This is a list of all ARC buffers that are still valid on the
* device.
*/
- adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
- list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l2node));
+ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
@@ -5747,8 +6052,8 @@ l2arc_remove_vdev(vdev_t *vd)
* Clear all buflists and ARC references. L2ARC device flush.
*/
l2arc_evict(remdev, 0, B_TRUE);
- list_destroy(remdev->l2ad_buflist);
- kmem_free(remdev->l2ad_buflist, sizeof (list_t));
+ list_destroy(&remdev->l2ad_buflist);
+ mutex_destroy(&remdev->l2ad_mtx);
kmem_free(remdev, sizeof (l2arc_dev_t));
}
@@ -5763,7 +6068,6 @@ l2arc_init(void)
mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
l2arc_dev_list = &L2ARC_dev_list;
@@ -5788,7 +6092,6 @@ l2arc_fini(void)
mutex_destroy(&l2arc_feed_thr_lock);
cv_destroy(&l2arc_feed_thr_cv);
mutex_destroy(&l2arc_dev_mtx);
- mutex_destroy(&l2arc_buflist_mtx);
mutex_destroy(&l2arc_free_on_write_mtx);
list_destroy(l2arc_dev_list);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
index eb21867..8205d5c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
@@ -70,10 +70,29 @@ typedef enum arc_flags
ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */
ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */
ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */
- ARC_FLAG_FREE_IN_PROGRESS = 1 << 13, /* about to be freed */
- ARC_FLAG_L2_WRITING = 1 << 14, /* write in progress */
- ARC_FLAG_L2_EVICTED = 1 << 15, /* evicted during I/O */
- ARC_FLAG_L2_WRITE_HEAD = 1 << 16, /* head of write list */
+ ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */
+ ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */
+ ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */
+ /* indicates that the buffer contains metadata (otherwise, data) */
+ ARC_FLAG_BUFC_METADATA = 1 << 16,
+
+ /* Flags specifying whether optional hdr struct fields are defined */
+ ARC_FLAG_HAS_L1HDR = 1 << 17,
+ ARC_FLAG_HAS_L2HDR = 1 << 18,
+
+ /*
+ * The arc buffer's compression mode is stored in the top 7 bits of the
+ * flags field, so these dummy flags are included so that MDB can
+ * interpret the enum properly.
+ */
+ ARC_FLAG_COMPRESS_0 = 1 << 24,
+ ARC_FLAG_COMPRESS_1 = 1 << 25,
+ ARC_FLAG_COMPRESS_2 = 1 << 26,
+ ARC_FLAG_COMPRESS_3 = 1 << 27,
+ ARC_FLAG_COMPRESS_4 = 1 << 28,
+ ARC_FLAG_COMPRESS_5 = 1 << 29,
+ ARC_FLAG_COMPRESS_6 = 1 << 30
+
} arc_flags_t;
struct arc_buf {
OpenPOWER on IntegriCloud