summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authormav <mav@FreeBSD.org>2015-08-14 09:39:23 +0000
committermav <mav@FreeBSD.org>2015-08-14 09:39:23 +0000
commit3cc7e59c98c7cc13260aa495a033595c744cbafb (patch)
tree85fd3d65ca4da1c77cd3809beaf0d06be507062c
parent14f3f345285922d562a078398cfc4381097986bd (diff)
downloadFreeBSD-src-3cc7e59c98c7cc13260aa495a033595c744cbafb.zip
FreeBSD-src-3cc7e59c98c7cc13260aa495a033595c744cbafb.tar.gz
MFV r286765: 5817 change type of arcs_size from uint64_t to refcount_t
Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Paul Dagnelie <paul.dagnelie@delphix.com> Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Alex Reece <alex@delphix.com> Reviewed by: Richard Elling <richard.elling@richardelling.com> Approved by: Garrett D'Amore <garrett@damore.org> Author: Prakash Surya <prakash.surya@delphix.com> illumos/illumos-gate@2fd872a734cf486007a8dba532cec52bfb4d40e5 As a way to make it more difficult to introduce bugs into the ARC, and to make it easier to diagnose issues when bugs do creep in, it would be beneficial to change the type of the arc_state_t's arcs_size field to be a refcount_t instead of a uint64_t. This would allow us to make stricter checks when incrementing and decrementing the value with debugging enabled, but still fallback to simple, fast atomic operations when debugging is disabled.
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c133
1 files changed, 108 insertions, 25 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 0f2817e..e3f28b4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
@@ -347,7 +347,7 @@ typedef struct arc_state {
* total amount of data in this state; this includes: evictable,
* non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
*/
- uint64_t arcs_size;
+ refcount_t arcs_size;
} arc_state_t;
/* The 6 states: */
@@ -1860,12 +1860,73 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
buf_hash_remove(hdr);
/* adjust state sizes (ignore arc_l2c_only) */
- if (to_delta && new_state != arc_l2c_only)
- atomic_add_64(&new_state->arcs_size, to_delta);
+
+ if (to_delta && new_state != arc_l2c_only) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (GHOST_STATE(new_state)) {
+ ASSERT0(datacnt);
+
+ /*
+ * We moving a header to a ghost state, we first
+ * remove all arc buffers. Thus, we'll have a
+ * datacnt of zero, and no arc buffer to use for
+ * the reference. As a result, we use the arc
+ * header pointer for the reference.
+ */
+ (void) refcount_add_many(&new_state->arcs_size,
+ hdr->b_size, hdr);
+ } else {
+ ASSERT3U(datacnt, !=, 0);
+
+ /*
+ * Each individual buffer holds a unique reference,
+ * thus we must remove each of these references one
+ * at a time.
+ */
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ (void) refcount_add_many(&new_state->arcs_size,
+ hdr->b_size, buf);
+ }
+ }
+ }
+
if (from_delta && old_state != arc_l2c_only) {
- ASSERT3U(old_state->arcs_size, >=, from_delta);
- atomic_add_64(&old_state->arcs_size, -from_delta);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (GHOST_STATE(old_state)) {
+ /*
+ * When moving a header off of a ghost state,
+ * there's the possibility for datacnt to be
+ * non-zero. This is because we first add the
+ * arc buffer to the header prior to changing
+ * the header's state. Since we used the header
+ * for the reference when putting the header on
+ * the ghost state, we must balance that and use
+ * the header when removing off the ghost state
+ * (even though datacnt is non zero).
+ */
+
+ IMPLY(datacnt == 0, new_state == arc_anon ||
+ new_state == arc_l2c_only);
+
+ (void) refcount_remove_many(&old_state->arcs_size,
+ hdr->b_size, hdr);
+ } else {
+ ASSERT3P(datacnt, !=, 0);
+
+ /*
+ * Each individual buffer holds a unique reference,
+ * thus we must remove each of these references one
+ * at a time.
+ */
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ (void) refcount_remove_many(
+ &old_state->arcs_size, hdr->b_size, buf);
+ }
+ }
}
+
if (HDR_HAS_L1HDR(hdr))
hdr->b_l1hdr.b_state = new_state;
@@ -2224,8 +2285,8 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
ASSERT3U(*cnt, >=, size);
atomic_add_64(cnt, -size);
}
- ASSERT3U(state->arcs_size, >=, size);
- atomic_add_64(&state->arcs_size, -size);
+
+ (void) refcount_remove_many(&state->arcs_size, size, buf);
buf->b_data = NULL;
/*
@@ -2949,7 +3010,8 @@ arc_adjust_meta(void)
* evict some from the MRU here, and some from the MFU below.
*/
target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
- (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p));
+ (int64_t)(refcount_count(&arc_anon->arcs_size) +
+ refcount_count(&arc_mru->arcs_size) - arc_p));
total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
@@ -2959,7 +3021,7 @@ arc_adjust_meta(void)
* space alloted to the MFU (which is defined as arc_c - arc_p).
*/
target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
- (int64_t)(arc_mfu->arcs_size - (arc_c - arc_p)));
+ (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
@@ -3068,8 +3130,8 @@ arc_adjust(void)
* arc_p here, and then evict more from the MFU below.
*/
target = MIN((int64_t)(arc_size - arc_c),
- (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
- arc_p));
+ (int64_t)(refcount_count(&arc_anon->arcs_size) +
+ refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
/*
* If we're below arc_meta_min, always prefer to evict data.
@@ -3153,7 +3215,8 @@ arc_adjust(void)
* cache. The following logic enforces these limits on the ghost
* caches, and evicts from them as needed.
*/
- target = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
+ target = refcount_count(&arc_mru->arcs_size) +
+ refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
total_evicted += bytes;
@@ -3171,7 +3234,8 @@ arc_adjust(void)
* mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
* mru ghost + mfu ghost <= arc_c
*/
- target = arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
+ target = refcount_count(&arc_mru_ghost->arcs_size) +
+ refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
total_evicted += bytes;
@@ -3665,6 +3729,8 @@ arc_adapt(int bytes, arc_state_t *state)
{
int mult;
uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+ int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
+ int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
if (state == arc_l2c_only)
return;
@@ -3679,16 +3745,14 @@ arc_adapt(int bytes, arc_state_t *state)
* target size of the MRU list.
*/
if (state == arc_mru_ghost) {
- mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
- 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
+ mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
} else if (state == arc_mfu_ghost) {
uint64_t delta;
- mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
- 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
+ mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
mult = MIN(mult, 10);
delta = MIN(bytes * mult, arc_p);
@@ -3805,8 +3869,9 @@ arc_get_data_buf(arc_buf_t *buf)
*/
if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
arc_buf_hdr_t *hdr = buf->b_hdr;
+ arc_state_t *state = hdr->b_l1hdr.b_state;
- atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size);
+ (void) refcount_add_many(&state->arcs_size, size, buf);
/*
* If this is reached via arc_read, the link is
@@ -3827,7 +3892,8 @@ arc_get_data_buf(arc_buf_t *buf)
* data, and we have outgrown arc_p, update arc_p
*/
if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
- arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
+ (refcount_count(&arc_anon->arcs_size) +
+ refcount_count(&arc_mru->arcs_size) > arc_p))
arc_p = MIN(arc_c, arc_p + size);
}
ARCSTAT_BUMP(arcstat_allocated);
@@ -4683,8 +4749,10 @@ arc_release(arc_buf_t *buf, void *tag)
buf->b_next = NULL;
ASSERT3P(state, !=, arc_l2c_only);
- ASSERT3U(state->arcs_size, >=, hdr->b_size);
- atomic_add_64(&state->arcs_size, -hdr->b_size);
+
+ (void) refcount_remove_many(
+ &state->arcs_size, hdr->b_size, buf);
+
if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
ASSERT3P(state, !=, arc_l2c_only);
uint64_t *size = &state->arcs_lsize[type];
@@ -4727,7 +4795,7 @@ arc_release(arc_buf_t *buf, void *tag)
(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
buf->b_hdr = nhdr;
mutex_exit(&buf->b_evict_lock);
- atomic_add_64(&arc_anon->arcs_size, blksz);
+ (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf);
} else {
mutex_exit(&buf->b_evict_lock);
ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
@@ -4994,7 +5062,8 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
* network delays from blocking transactions that are ready to be
* assigned to a txg.
*/
- anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
+ anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
+ arc_loaned_bytes), 0);
/*
* Writes will, almost always, require additional memory allocations
@@ -5031,7 +5100,7 @@ static void
arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
kstat_named_t *evict_data, kstat_named_t *evict_metadata)
{
- size->value.ui64 = state->arcs_size;
+ size->value.ui64 = refcount_count(&state->arcs_size);
evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
}
@@ -5271,6 +5340,13 @@ arc_init(void)
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ refcount_create(&arc_anon->arcs_size);
+ refcount_create(&arc_mru->arcs_size);
+ refcount_create(&arc_mru_ghost->arcs_size);
+ refcount_create(&arc_mfu->arcs_size);
+ refcount_create(&arc_mfu_ghost->arcs_size);
+ refcount_create(&arc_l2c_only->arcs_size);
+
buf_init();
arc_reclaim_thread_exit = FALSE;
@@ -5397,6 +5473,13 @@ arc_fini(void)
mutex_destroy(&arc_user_evicts_lock);
cv_destroy(&arc_user_evicts_cv);
+ refcount_destroy(&arc_anon->arcs_size);
+ refcount_destroy(&arc_mru->arcs_size);
+ refcount_destroy(&arc_mru_ghost->arcs_size);
+ refcount_destroy(&arc_mfu->arcs_size);
+ refcount_destroy(&arc_mfu_ghost->arcs_size);
+ refcount_destroy(&arc_l2c_only->arcs_size);
+
multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
OpenPOWER on IntegriCloud