-rw-r--r--	lib/libc/stdlib/malloc.c	355
1 file changed, 297 insertions, 58 deletions
diff --git a/lib/libc/stdlib/malloc.c b/lib/libc/stdlib/malloc.c
index 3ad27f5..4d37e12 100644
--- a/lib/libc/stdlib/malloc.c
+++ b/lib/libc/stdlib/malloc.c
@@ -171,6 +171,7 @@ __FBSDID("$FreeBSD$");
# define QUANTUM_2POW_MIN 4
# define SIZEOF_PTR_2POW 2
# define USE_BRK
+# define CPU_SPINWAIT __asm__ volatile("pause")
#endif
#ifdef __ia64__
# define QUANTUM_2POW_MIN 4
@@ -189,6 +190,7 @@ __FBSDID("$FreeBSD$");
#ifdef __amd64__
# define QUANTUM_2POW_MIN 4
# define SIZEOF_PTR_2POW 3
+# define CPU_SPINWAIT __asm__ volatile("pause")
#endif
#ifdef __arm__
# define QUANTUM_2POW_MIN 3
@@ -270,11 +272,51 @@ __FBSDID("$FreeBSD$");
*/
#define LAZY_FREE_NPROBES 5
+/*
+ * Hyper-threaded CPUs may need a special instruction inside spin loops in
+ * order to yield to another virtual CPU. If no such instruction is defined
+ * above, make CPU_SPINWAIT a no-op.
+ */
+#ifndef CPU_SPINWAIT
+# define CPU_SPINWAIT
+#endif
+
+/*
+ * Adaptive spinning must eventually switch to blocking, in order to avoid the
+ * potential for priority inversion deadlock. Backing off past a certain point
+ * can actually waste time.
+ */
+#define SPIN_LIMIT_2POW 11
+
+/*
+ * Conversion from spinning to blocking is expensive; we use (1U <<
+ * BLOCK_COST_2POW) to estimate how many times more costly blocking is than
+ * worst-case spinning.
+ */
+#define BLOCK_COST_2POW 4
+
+/*
+ * We use an exponential moving average to track recent lock contention, where
+ * the size of the history window is N, and alpha=2/(N+1).
+ *
+ * Due to integer math rounding, very small values here can cause substantial
+ * degradation in accuracy, thus making the moving average decay faster than it
+ * would with precise calculation.
+ */
+#define BALANCE_ALPHA_INV_2POW 9
+
+/*
+ * Threshold value for the exponential moving contention average at which to
+ * re-assign a thread.
+ */
+#define BALANCE_THRESHOLD_DEFAULT (1U << (SPIN_LIMIT_2POW-4))
+
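
For orientation, here is a small standalone sketch (not part of the patch) that just prints the figures implied by these definitions: the worst-case number of busy iterations malloc_spin_lock() performs before blocking, the contention value charged when it does block, and the default threshold at which a thread is reassigned. The constants are copied from above; the derived numbers are computed, not quoted.

#include <stdio.h>

#define SPIN_LIMIT_2POW			11
#define BLOCK_COST_2POW			4
#define BALANCE_THRESHOLD_DEFAULT	(1U << (SPIN_LIMIT_2POW-4))

int
main(void)
{
	unsigned i, worst_spin = 0;

	/* Worst-case busy iterations before malloc_spin_lock() gives up. */
	for (i = 1; i <= SPIN_LIMIT_2POW; i++)
		worst_spin += 1U << i;

	printf("worst-case spin count:     %u\n", worst_spin);
	printf("cost charged for blocking: %u\n",
	    worst_spin << BLOCK_COST_2POW);
	printf("default balance threshold: %u\n", BALANCE_THRESHOLD_DEFAULT);
	return (0);
}
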
/******************************************************************************/
/*
- * Mutexes based on spinlocks. We can't use normal pthread mutexes, because
- * they require malloc()ed memory.
+ * Mutexes based on spinlocks. We can't use normal pthread spinlocks in all
+ * places, because they require malloc()ed memory, which causes bootstrapping
+ * issues in some cases.
*/
typedef struct {
spinlock_t lock;
@@ -330,6 +372,11 @@ struct arena_stats_s {
size_t allocated_large;
uint64_t nmalloc_large;
uint64_t ndalloc_large;
+
+#ifndef NO_TLS
+ /* Number of times this arena reassigned a thread due to contention. */
+ uint64_t nbalance;
+#endif
};
typedef struct chunk_stats_s chunk_stats_t;
@@ -517,8 +564,8 @@ struct arena_s {
# define ARENA_MAGIC 0x947d3d24
#endif
- /* All operations on this arena require that mtx be locked. */
- malloc_mutex_t mtx;
+ /* All operations on this arena require that lock be locked. */
+ pthread_mutex_t lock;
#ifdef MALLOC_STATS
arena_stats_t stats;
@@ -542,6 +589,12 @@ struct arena_s {
#ifndef NO_TLS
/*
+ * The arena load balancing machinery needs to keep track of how much
+ * lock contention there is. This value is exponentially averaged.
+ */
+ uint32_t contention;
+
+ /*
* Deallocation of small objects can be lazy, in which case free_cache
* stores pointers to those objects that have not yet been deallocated.
* In order to avoid lock contention, slots are chosen randomly. Empty
@@ -681,9 +734,9 @@ static size_t base_mapped;
static arena_t **arenas;
static unsigned narenas;
#ifndef NO_TLS
-static unsigned next_arena;
+static unsigned narenas_2pow;
#endif
-static malloc_mutex_t arenas_mtx; /* Protects arenas initialization. */
+static pthread_mutex_t arenas_lock; /* Protects arenas initialization. */
#ifndef NO_TLS
/*
@@ -714,6 +767,7 @@ static bool opt_junk = false;
static bool opt_hint = false;
#ifndef NO_TLS
static int opt_lazy_free_2pow = LAZY_FREE_2POW_DEFAULT;
+static uint64_t opt_balance_threshold = BALANCE_THRESHOLD_DEFAULT;
#endif
static bool opt_print_stats = false;
static size_t opt_quantum_2pow = QUANTUM_2POW_MIN;
@@ -743,6 +797,7 @@ typedef struct {
*/
static void malloc_mutex_init(malloc_mutex_t *a_mutex);
+static bool malloc_spin_init(pthread_mutex_t *lock);
static void wrtmessage(const char *p1, const char *p2, const char *p3,
const char *p4);
#ifdef MALLOC_STATS
@@ -751,6 +806,7 @@ static void malloc_printf(const char *format, ...);
static char *umax2s(uintmax_t x, char *s);
static bool base_pages_alloc(size_t minsize);
static void *base_alloc(size_t size);
+static void *base_calloc(size_t number, size_t size);
static chunk_node_t *base_chunk_node_alloc(void);
static void base_chunk_node_dealloc(chunk_node_t *node);
#ifdef MALLOC_STATS
@@ -798,7 +854,9 @@ static bool malloc_init_hard(void);
*/
/******************************************************************************/
/*
- * Begin mutex.
+ * Begin mutex. We can't use normal pthread mutexes in all places, because
+ * they require malloc()ed memory, which causes bootstrapping issues in some
+ * cases.
*/
static void
@@ -830,6 +888,86 @@ malloc_mutex_unlock(malloc_mutex_t *a_mutex)
*/
/******************************************************************************/
/*
+ * Begin spin lock. Spin locks here are actually adaptive mutexes that block
+ * after a period of spinning, because unbounded spinning would allow for
+ * priority inversion.
+ */
+
+/*
+ * We use an unpublished interface to initialize pthread mutexes with an
+ * allocation callback, in order to avoid infinite recursion.
+ */
+int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex,
+ void *(calloc_cb)(size_t, size_t));
+
+__weak_reference(_pthread_mutex_init_calloc_cb_stub,
+ _pthread_mutex_init_calloc_cb);
+
+int
+_pthread_mutex_init_calloc_cb_stub(pthread_mutex_t *mutex,
+ void *(calloc_cb)(size_t, size_t))
+{
+
+ return (0);
+}
+
+static bool
+malloc_spin_init(pthread_mutex_t *lock)
+{
+
+ if (_pthread_mutex_init_calloc_cb(lock, base_calloc) != 0)
+ return (true);
+
+ return (false);
+}
+
+static inline unsigned
+malloc_spin_lock(pthread_mutex_t *lock)
+{
+ unsigned ret = 0;
+
+ if (__isthreaded) {
+ if (_pthread_mutex_trylock(lock) != 0) {
+ unsigned i;
+ volatile unsigned j;
+
+ /* Exponentially back off. */
+ for (i = 1; i <= SPIN_LIMIT_2POW; i++) {
+ for (j = 0; j < (1U << i); j++)
+ ret++;
+
+ CPU_SPINWAIT;
+ if (_pthread_mutex_trylock(lock) == 0)
+ return (ret);
+ }
+
+ /*
+ * Spinning failed. Block until the lock becomes
+ * available, in order to avoid indefinite priority
+ * inversion.
+ */
+ _pthread_mutex_lock(lock);
+ assert((ret << BLOCK_COST_2POW) != 0);
+ return (ret << BLOCK_COST_2POW);
+ }
+ }
+
+ return (ret);
+}
+
+static inline void
+malloc_spin_unlock(pthread_mutex_t *lock)
+{
+
+ if (__isthreaded)
+ _pthread_mutex_unlock(lock);
+}
+
+/*
+ * End spin lock.
+ */
+/******************************************************************************/
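
The same strategy can be demonstrated outside libc. The sketch below mirrors the backoff logic above using an ordinary pthread mutex; all identifiers are illustrative, CPU_SPINWAIT is omitted, and the return value plays the same role as the spin count later consumed by arena_lock_balance().

#include <pthread.h>

#define SPIN_LIMIT_2POW	11
#define BLOCK_COST_2POW	4

static unsigned
adaptive_lock(pthread_mutex_t *lock)
{
	unsigned i, ret = 0;
	volatile unsigned j;

	if (pthread_mutex_trylock(lock) == 0)
		return (ret);

	/* Exponentially longer busy-waits between trylock attempts. */
	for (i = 1; i <= SPIN_LIMIT_2POW; i++) {
		for (j = 0; j < (1U << i); j++)
			ret++;
		if (pthread_mutex_trylock(lock) == 0)
			return (ret);
	}

	/* Stop spinning and block, so priority inversion cannot spin forever. */
	pthread_mutex_lock(lock);
	return (ret << BLOCK_COST_2POW);
}

int
main(void)
{
	pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

	(void)adaptive_lock(&m);	/* uncontended: returns 0 */
	pthread_mutex_unlock(&m);
	return (0);
}

Link with -lpthread. A return of 0 means the lock was uncontended; larger values mean the caller spent correspondingly longer acquiring it, which is exactly the signal the contention average tracks.
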
+/*
* Begin Utility functions/macros.
*/
@@ -926,6 +1064,10 @@ prn_##suffix(uint32_t lg_range) \
/* Define the per-thread PRNG used for lazy deallocation. */
static __thread uint32_t lazy_free_x;
PRN_DEFINE(lazy_free, lazy_free_x, 12345, 12347)
+
+/* Define the PRNG used for arena assignment. */
+static __thread uint32_t balance_x;
+PRN_DEFINE(balance, balance_x, 1297, 1301)
#endif
static void
@@ -1083,6 +1225,17 @@ RETURN:
return (ret);
}
+static void *
+base_calloc(size_t number, size_t size)
+{
+ void *ret;
+
+ ret = base_alloc(number * size);
+ memset(ret, 0, number * size);
+
+ return (ret);
+}
+
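
base_calloc() trusts its callers: there is no overflow check on number * size and no NULL check on the result of base_alloc(). That is defensible here, since its only caller in this patch is the pthread mutex allocation callback with small, fixed sizes, but a more defensive variant could look like the following sketch (it assumes the surrounding file's base_alloc() plus <stdint.h> and <string.h>, and the name is made up):

static void *
base_calloc_checked(size_t number, size_t size)
{
	void *ret;

	/* Reject number * size overflow before allocating. */
	if (size != 0 && number > SIZE_MAX / size)
		return (NULL);

	ret = base_alloc(number * size);
	if (ret != NULL)
		memset(ret, 0, number * size);

	return (ret);
}
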
static chunk_node_t *
base_chunk_node_alloc(void)
{
@@ -1574,12 +1727,12 @@ choose_arena(void)
* Avoid races with another thread that may have already
* initialized arenas[ind].
*/
- malloc_mutex_lock(&arenas_mtx);
+ malloc_spin_lock(&arenas_lock);
if (arenas[ind] == NULL)
ret = arenas_extend((unsigned)ind);
else
ret = arenas[ind];
- malloc_mutex_unlock(&arenas_mtx);
+ malloc_spin_unlock(&arenas_lock);
}
} else
ret = arenas[0];
@@ -1613,21 +1766,27 @@ choose_arena_hard(void)
*/
SPRN(lazy_free, (uint32_t)(uintptr_t)(_pthread_self()));
- /* Assign one of the arenas to this thread, in a round-robin fashion. */
- malloc_mutex_lock(&arenas_mtx);
- ret = arenas[next_arena];
- if (ret == NULL)
- ret = arenas_extend(next_arena);
- if (ret == NULL) {
- /*
- * Make sure that this function never returns NULL, so that
- * choose_arena() doesn't have to check for a NULL return
- * value.
- */
+ /*
+ * Seed the PRNG used for arena load balancing. We can get away with
+ * using the same seed here as for the lazy_free PRNG without
+ * introducing autocorrelation because the PRNG parameters are
+ * distinct.
+ */
+ SPRN(balance, (uint32_t)(uintptr_t)(_pthread_self()));
+
+ if (narenas > 1) {
+ unsigned ind;
+
+ ind = PRN(balance, narenas_2pow);
+ if ((ret = arenas[ind]) == NULL) {
+ malloc_spin_lock(&arenas_lock);
+ if ((ret = arenas[ind]) == NULL)
+ ret = arenas_extend(ind);
+ malloc_spin_unlock(&arenas_lock);
+ }
+ } else
ret = arenas[0];
- }
- next_arena = (next_arena + 1) % narenas;
- malloc_mutex_unlock(&arenas_mtx);
+
arenas_map = ret;
return (ret);
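
Two consequences of replacing round-robin assignment with a random pick are worth noting. First, within this hunk the function can now return NULL if arenas_extend() fails; the deleted code explicitly guaranteed a non-NULL result so that choose_arena() need not check. Second, initial placement is probabilistic, so some skew across arenas is expected and is left to the contention-driven rebalancing below to correct. The toy simulation that follows illustrates the second point; rand() merely stands in for the balance PRNG and all numbers are illustrative.

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	enum { NARENAS_2POW = 2, NTHREADS = 32 };
	unsigned counts[1U << NARENAS_2POW] = {0};
	unsigned i;

	srand(42);
	/* Each "thread" picks a random arena index in [0, 1 << NARENAS_2POW). */
	for (i = 0; i < NTHREADS; i++)
		counts[rand() & ((1U << NARENAS_2POW) - 1)]++;

	for (i = 0; i < (1U << NARENAS_2POW); i++)
		printf("arenas[%u]: %u threads\n", i, counts[i]);
	return (0);
}
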
@@ -2325,6 +2484,48 @@ arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size)
return (good_run_size);
}
+static inline void
+arena_lock_balance(arena_t *arena)
+{
+#ifndef NO_TLS
+ unsigned contention;
+
+ contention = malloc_spin_lock(&arena->lock);
+ if (narenas > 1) {
+ /*
+ * Calculate the exponentially averaged contention for this
+ * arena. Due to integer math always rounding down, this value
+		 * decays somewhat faster than normal.
+ */
+ arena->contention = (((uint64_t)arena->contention
+ * (uint64_t)((1U << BALANCE_ALPHA_INV_2POW)-1))
+ + (uint64_t)contention) >> BALANCE_ALPHA_INV_2POW;
+ if (arena->contention >= opt_balance_threshold) {
+ uint32_t ind;
+
+ arena->contention = 0;
+#ifdef MALLOC_STATS
+ arena->stats.nbalance++;
+#endif
+ ind = PRN(balance, narenas_2pow);
+ if (arenas[ind] != NULL)
+ arenas_map = arenas[ind];
+ else {
+ malloc_spin_lock(&arenas_lock);
+ if (arenas[ind] != NULL)
+ arenas_map = arenas[ind];
+ else
+ arenas_map = arenas_extend(ind);
+ malloc_spin_unlock(&arenas_lock);
+ }
+ }
+ }
+#else
+ malloc_spin_lock(&arena->lock);
+#endif
+
+}
+
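
To get a feel for the tuning, the standalone sketch below replays the averaging formula above with a worst-case spin sample on every acquisition and reports how many acquisitions it takes an idle arena to cross the default threshold; blocked acquisitions, whose samples are scaled by 1U << BLOCK_COST_2POW, get there far sooner. The constants are copied from the patch; everything else is illustrative.

#include <stdint.h>
#include <stdio.h>

#define SPIN_LIMIT_2POW			11
#define BLOCK_COST_2POW			4
#define BALANCE_ALPHA_INV_2POW		9
#define BALANCE_THRESHOLD_DEFAULT	(1U << (SPIN_LIMIT_2POW-4))

int
main(void)
{
	uint32_t contention = 0;
	/* The largest value malloc_spin_lock() can return without blocking. */
	uint32_t sample = (1U << (SPIN_LIMIT_2POW + 1)) - 2;
	unsigned n;

	for (n = 0; contention < BALANCE_THRESHOLD_DEFAULT; n++) {
		/* Same exponentially averaged update as arena_lock_balance(). */
		contention = (((uint64_t)contention
		    * (uint64_t)((1U << BALANCE_ALPHA_INV_2POW) - 1))
		    + (uint64_t)sample) >> BALANCE_ALPHA_INV_2POW;
	}
	printf("threshold of %u reached after %u contended acquisitions\n",
	    BALANCE_THRESHOLD_DEFAULT, n);
	return (0);
}
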
static void *
arena_malloc(arena_t *arena, size_t size, bool zero)
{
@@ -2368,14 +2569,14 @@ arena_malloc(arena_t *arena, size_t size, bool zero)
}
assert(size == bin->reg_size);
- malloc_mutex_lock(&arena->mtx);
+ arena_lock_balance(arena);
if ((run = bin->runcur) != NULL && run->nfree > 0)
ret = arena_bin_malloc_easy(arena, bin, run);
else
ret = arena_bin_malloc_hard(arena, bin);
if (ret == NULL) {
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
return (NULL);
}
@@ -2384,7 +2585,7 @@ arena_malloc(arena_t *arena, size_t size, bool zero)
arena->stats.nmalloc_small++;
arena->stats.allocated_small += size;
#endif
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
if (zero == false) {
if (opt_junk)
@@ -2396,17 +2597,17 @@ arena_malloc(arena_t *arena, size_t size, bool zero)
} else {
/* Large allocation. */
size = PAGE_CEILING(size);
- malloc_mutex_lock(&arena->mtx);
+ arena_lock_balance(arena);
ret = (void *)arena_run_alloc(arena, size, true); // XXX zero?
if (ret == NULL) {
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
return (NULL);
}
#ifdef MALLOC_STATS
arena->stats.nmalloc_large++;
arena->stats.allocated_large += size;
#endif
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
if (zero == false) {
if (opt_junk)
@@ -2453,10 +2654,10 @@ arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size)
npages = size >> pagesize_2pow;
- malloc_mutex_lock(&arena->mtx);
+ arena_lock_balance(arena);
ret = (void *)arena_run_alloc(arena, alloc_size, false);
if (ret == NULL) {
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
return (NULL);
}
@@ -2509,7 +2710,7 @@ arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size)
arena->stats.nmalloc_large++;
arena->stats.allocated_large += size;
#endif
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
if (opt_junk)
memset(ret, 0xa5, size);
@@ -2675,9 +2876,9 @@ arena_dalloc_lazy(arena_t *arena, arena_chunk_t *chunk, void *ptr,
unsigned i, slot;
if (!__isthreaded || opt_lazy_free_2pow < 0) {
- malloc_mutex_lock(&arena->mtx);
+ malloc_spin_lock(&arena->lock);
arena_dalloc_small(arena, chunk, ptr, pageind, mapelm);
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
return;
}
@@ -2689,7 +2890,7 @@ arena_dalloc_lazy(arena_t *arena, arena_chunk_t *chunk, void *ptr,
}
}
- malloc_mutex_lock(&arena->mtx);
+ malloc_spin_lock(&arena->lock);
arena_dalloc_small(arena, chunk, ptr, pageind, mapelm);
/*
@@ -2736,7 +2937,7 @@ arena_dalloc_lazy(arena_t *arena, arena_chunk_t *chunk, void *ptr,
}
}
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
}
#endif
@@ -2760,9 +2961,9 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
#ifndef NO_TLS
arena_dalloc_lazy(arena, chunk, ptr, pageind, mapelm);
#else
- malloc_mutex_lock(&arena->mtx);
+ malloc_spin_lock(&arena->lock);
arena_dalloc_small(arena, chunk, ptr, pageind, mapelm);
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
#endif
} else {
size_t size;
@@ -2775,13 +2976,13 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
if (opt_junk)
memset(ptr, 0x5a, size);
- malloc_mutex_lock(&arena->mtx);
+ malloc_spin_lock(&arena->lock);
arena_run_dalloc(arena, (arena_run_t *)ptr, size);
#ifdef MALLOC_STATS
arena->stats.allocated_large -= size;
arena->stats.ndalloc_large++;
#endif
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
}
}
@@ -2792,7 +2993,8 @@ arena_new(arena_t *arena)
arena_bin_t *bin;
size_t pow2_size, prev_run_size;
- malloc_mutex_init(&arena->mtx);
+ if (malloc_spin_init(&arena->lock))
+ return (true);
#ifdef MALLOC_STATS
memset(&arena->stats, 0, sizeof(arena_stats_t));
@@ -2803,6 +3005,7 @@ arena_new(arena_t *arena)
arena->spare = NULL;
#ifndef NO_TLS
+ arena->contention = 0;
if (opt_lazy_free_2pow >= 0) {
arena->free_cache = (void **) base_alloc(sizeof(void *)
* (1U << opt_lazy_free_2pow));
@@ -3332,6 +3535,8 @@ malloc_print_stats(void)
umax2s(1U << opt_lazy_free_2pow, s), "\n", "");
} else
_malloc_message("Lazy free slots: 0\n", "", "", "");
+ _malloc_message("Arena balance threshold: ",
+ umax2s(opt_balance_threshold, s), "\n", "");
#endif
_malloc_message("Pointer size: ", umax2s(sizeof(void *), s),
"\n", "");
@@ -3345,6 +3550,9 @@ malloc_print_stats(void)
#ifdef MALLOC_STATS
{
size_t allocated, mapped;
+#ifndef NO_TLS
+ uint64_t nbalance = 0;
+#endif
unsigned i;
arena_t *arena;
@@ -3353,12 +3561,15 @@ malloc_print_stats(void)
/* arenas. */
for (i = 0, allocated = 0; i < narenas; i++) {
if (arenas[i] != NULL) {
- malloc_mutex_lock(&arenas[i]->mtx);
+ malloc_spin_lock(&arenas[i]->lock);
allocated +=
arenas[i]->stats.allocated_small;
allocated +=
arenas[i]->stats.allocated_large;
- malloc_mutex_unlock(&arenas[i]->mtx);
+#ifndef NO_TLS
+ nbalance += arenas[i]->stats.nbalance;
+#endif
+ malloc_spin_unlock(&arenas[i]->lock);
}
}
@@ -3375,6 +3586,11 @@ malloc_print_stats(void)
malloc_printf("Allocated: %zu, mapped: %zu\n",
allocated, mapped);
+#ifndef NO_TLS
+ malloc_printf("Arena balance reassignments: %llu\n",
+ nbalance);
+#endif
+
/* Print chunk stats. */
{
chunk_stats_t chunks_stats;
@@ -3403,9 +3619,9 @@ malloc_print_stats(void)
if (arena != NULL) {
malloc_printf(
"\narenas[%u]:\n", i);
- malloc_mutex_lock(&arena->mtx);
+ malloc_spin_lock(&arena->lock);
stats_print(arena);
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
}
}
}
@@ -3540,6 +3756,20 @@ malloc_init_hard(void)
case 'A':
opt_abort = true;
break;
+ case 'b':
+#ifndef NO_TLS
+ opt_balance_threshold >>= 1;
+#endif
+ break;
+ case 'B':
+#ifndef NO_TLS
+ if (opt_balance_threshold == 0)
+ opt_balance_threshold = 1;
+ else if ((opt_balance_threshold << 1)
+ > opt_balance_threshold)
+ opt_balance_threshold <<= 1;
+#endif
+ break;
case 'h':
opt_hint = false;
break;
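
The new 'b' and 'B' option letters halve and double opt_balance_threshold, with 'B' saturating rather than wrapping. Assuming this allocator's usual option plumbing (the MALLOC_OPTIONS environment variable or the _malloc_options symbol, neither of which appears in this hunk), an application could request a higher threshold, i.e. less eager rebalancing, roughly like this:

#include <stdlib.h>

/* Assumed hook; see malloc(3) for the option sources the allocator reads. */
extern const char *_malloc_options;

int
main(void)
{
	_malloc_options = "BB";		/* double the balance threshold twice */
	free(malloc(1));		/* options take effect at first use */
	return (0);
}
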
@@ -3565,8 +3795,8 @@ malloc_init_hard(void)
/*
* There must be fewer pages in a chunk than
* can be recorded by the pos field of
- * arena_chunk_map_t, in order to make POS_FREE
- * special.
+ * arena_chunk_map_t, in order to make
+ * POS_EMPTY/POS_FREE special.
*/
if (opt_chunk_2pow - pagesize_2pow
< (sizeof(uint32_t) << 3) - 1)
@@ -3770,6 +4000,12 @@ malloc_init_hard(void)
if (narenas == 0)
narenas = 1;
}
+#ifndef NO_TLS
+ assert(narenas != 0);
+ for (narenas_2pow = 0;
+ (narenas >> (narenas_2pow + 1)) != 0;
+ narenas_2pow++);
+#endif
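
The loop above computes narenas_2pow = floor(log2(narenas)), which bounds the index range that PRN(balance, narenas_2pow) can produce. A small check of that behaviour, and of the fact that when narenas is not a power of two the top arenas are never picked by the random-assignment paths shown in this patch, might look like this (illustrative only; the helper name is made up):

#include <assert.h>
#include <stdio.h>

static unsigned
floor_log2(unsigned n)
{
	unsigned lg;

	/* Same loop shape as the one added to malloc_init_hard(). */
	for (lg = 0; (n >> (lg + 1)) != 0; lg++)
		;
	return (lg);
}

int
main(void)
{
	assert(floor_log2(1) == 0);
	assert(floor_log2(4) == 2);
	assert(floor_log2(6) == 2);	/* narenas need not be a power of two */

	/*
	 * With narenas == 6, random assignment draws indices in [0, 4), so
	 * arenas[4] and arenas[5] are not selected by those paths.
	 */
	printf("narenas=6 -> index range [0, %u)\n", 1U << floor_log2(6));
	return (0);
}
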
#ifdef NO_TLS
if (narenas > 1) {
@@ -3798,10 +4034,6 @@ malloc_init_hard(void)
}
#endif
-#ifndef NO_TLS
- next_arena = 0;
-#endif
-
/* Allocate and initialize arenas. */
arenas = (arena_t **)base_alloc(sizeof(arena_t *) * narenas);
if (arenas == NULL) {
@@ -3825,13 +4057,20 @@ malloc_init_hard(void)
}
#ifndef NO_TLS
/*
+ * Assign the initial arena to the initial thread, in order to avoid
+ * spurious creation of an extra arena if the application switches to
+ * threaded mode.
+ */
+ arenas_map = arenas[0];
+ /*
* Seed here for the initial thread, since choose_arena_hard() is only
* called for other threads. The seed values don't really matter.
*/
SPRN(lazy_free, 42);
+ SPRN(balance, 42);
#endif
- malloc_mutex_init(&arenas_mtx);
+ malloc_spin_init(&arenas_lock);
malloc_initialized = true;
malloc_mutex_unlock(&init_lock);
@@ -4075,12 +4314,12 @@ _malloc_prefork(void)
/* Acquire all mutexes in a safe order. */
- malloc_mutex_lock(&arenas_mtx);
+ malloc_spin_lock(&arenas_lock);
for (i = 0; i < narenas; i++) {
if (arenas[i] != NULL)
- malloc_mutex_lock(&arenas[i]->mtx);
+ malloc_spin_lock(&arenas[i]->lock);
}
- malloc_mutex_unlock(&arenas_mtx);
+ malloc_spin_unlock(&arenas_lock);
malloc_mutex_lock(&base_mtx);
@@ -4098,12 +4337,12 @@ _malloc_postfork(void)
malloc_mutex_unlock(&base_mtx);
- malloc_mutex_lock(&arenas_mtx);
+ malloc_spin_lock(&arenas_lock);
for (i = 0; i < narenas; i++) {
if (arenas[i] != NULL)
- malloc_mutex_unlock(&arenas[i]->mtx);
+ malloc_spin_unlock(&arenas[i]->lock);
}
- malloc_mutex_unlock(&arenas_mtx);
+ malloc_spin_unlock(&arenas_lock);
}
/*