-rw-r--r-- | lib/libc/stdlib/malloc.c | 355 |
1 file changed, 297 insertions, 58 deletions
diff --git a/lib/libc/stdlib/malloc.c b/lib/libc/stdlib/malloc.c
index 3ad27f5..4d37e12 100644
--- a/lib/libc/stdlib/malloc.c
+++ b/lib/libc/stdlib/malloc.c
@@ -171,6 +171,7 @@ __FBSDID("$FreeBSD$");
 # define QUANTUM_2POW_MIN 4
 # define SIZEOF_PTR_2POW 2
 # define USE_BRK
+# define CPU_SPINWAIT __asm__ volatile("pause")
 #endif
 #ifdef __ia64__
 # define QUANTUM_2POW_MIN 4
@@ -189,6 +190,7 @@ __FBSDID("$FreeBSD$");
 #ifdef __amd64__
 # define QUANTUM_2POW_MIN 4
 # define SIZEOF_PTR_2POW 3
+# define CPU_SPINWAIT __asm__ volatile("pause")
 #endif
 #ifdef __arm__
 # define QUANTUM_2POW_MIN 3
@@ -270,11 +272,51 @@ __FBSDID("$FreeBSD$");
  */
 #define LAZY_FREE_NPROBES 5

+/*
+ * Hyper-threaded CPUs may need a special instruction inside spin loops in
+ * order to yield to another virtual CPU. If no such instruction is defined
+ * above, make CPU_SPINWAIT a no-op.
+ */
+#ifndef CPU_SPINWAIT
+# define CPU_SPINWAIT
+#endif
+
+/*
+ * Adaptive spinning must eventually switch to blocking, in order to avoid the
+ * potential for priority inversion deadlock. Backing off past a certain point
+ * can actually waste time.
+ */
+#define SPIN_LIMIT_2POW 11
+
+/*
+ * Conversion from spinning to blocking is expensive; we use (1U <<
+ * BLOCK_COST_2POW) to estimate how many more times costly blocking is than
+ * worst-case spinning.
+ */
+#define BLOCK_COST_2POW 4
+
+/*
+ * We use an exponential moving average to track recent lock contention, where
+ * the size of the history window is N, and alpha=2/(N+1).
+ *
+ * Due to integer math rounding, very small values here can cause substantial
+ * degradation in accuracy, thus making the moving average decay faster than it
+ * would with precise calculation.
+ */
+#define BALANCE_ALPHA_INV_2POW 9
+
+/*
+ * Threshold value for the exponential moving contention average at which to
+ * re-assign a thread.
+ */
+#define BALANCE_THRESHOLD_DEFAULT (1U << (SPIN_LIMIT_2POW-4))
+
 /******************************************************************************/
 /*
- * Mutexes based on spinlocks. We can't use normal pthread mutexes, because
- * they require malloc()ed memory.
+ * Mutexes based on spinlocks. We can't use normal pthread spinlocks in all
+ * places, because they require malloc()ed memory, which causes bootstrapping
+ * issues in some cases.
  */
 typedef struct {
 spinlock_t lock;
@@ -330,6 +372,11 @@ struct arena_stats_s {
 size_t allocated_large;
 uint64_t nmalloc_large;
 uint64_t ndalloc_large;
+
+#ifndef NO_TLS
+ /* Number of times this arena reassigned a thread due to contention. */
+ uint64_t nbalance;
+#endif
 };

 typedef struct chunk_stats_s chunk_stats_t;
@@ -517,8 +564,8 @@ struct arena_s {
 # define ARENA_MAGIC 0x947d3d24
 #endif

- /* All operations on this arena require that mtx be locked. */
- malloc_mutex_t mtx;
+ /* All operations on this arena require that lock be locked. */
+ pthread_mutex_t lock;

 #ifdef MALLOC_STATS
 arena_stats_t stats;
@@ -542,6 +589,12 @@ struct arena_s {

 #ifndef NO_TLS
 /*
+ * The arena load balancing machinery needs to keep track of how much
+ * lock contention there is. This value is exponentially averaged.
+ */
+ uint32_t contention;
+
+ /*
 * Deallocation of small objects can be lazy, in which case free_cache
 * stores pointers to those objects that have not yet been deallocated.
 * In order to avoid lock contention, slots are chosen randomly.
@@ -681,9 +734,9 @@ static size_t base_mapped;
 static arena_t **arenas;
 static unsigned narenas;
 #ifndef NO_TLS
-static unsigned next_arena;
+static unsigned narenas_2pow;
 #endif
-static malloc_mutex_t arenas_mtx; /* Protects arenas initialization. */
+static pthread_mutex_t arenas_lock; /* Protects arenas initialization. */

 #ifndef NO_TLS
 /*
@@ -714,6 +767,7 @@ static bool opt_junk = false;
 static bool opt_hint = false;
 #ifndef NO_TLS
 static int opt_lazy_free_2pow = LAZY_FREE_2POW_DEFAULT;
+static uint64_t opt_balance_threshold = BALANCE_THRESHOLD_DEFAULT;
 #endif
 static bool opt_print_stats = false;
 static size_t opt_quantum_2pow = QUANTUM_2POW_MIN;
@@ -743,6 +797,7 @@ typedef struct {
  */

 static void malloc_mutex_init(malloc_mutex_t *a_mutex);
+static bool malloc_spin_init(pthread_mutex_t *lock);
 static void wrtmessage(const char *p1, const char *p2, const char *p3,
 const char *p4);
 #ifdef MALLOC_STATS
@@ -751,6 +806,7 @@ static void malloc_printf(const char *format, ...);
 static char *umax2s(uintmax_t x, char *s);
 static bool base_pages_alloc(size_t minsize);
 static void *base_alloc(size_t size);
+static void *base_calloc(size_t number, size_t size);
 static chunk_node_t *base_chunk_node_alloc(void);
 static void base_chunk_node_dealloc(chunk_node_t *node);
 #ifdef MALLOC_STATS
@@ -798,7 +854,9 @@ static bool malloc_init_hard(void);
  */
 /******************************************************************************/
 /*
- * Begin mutex.
+ * Begin mutex. We can't use normal pthread mutexes in all places, because
+ * they require malloc()ed memory, which causes bootstrapping issues in some
+ * cases.
  */

 static void
@@ -830,6 +888,86 @@ malloc_mutex_unlock(malloc_mutex_t *a_mutex)
  */
 /******************************************************************************/
 /*
+ * Begin spin lock. Spin locks here are actually adaptive mutexes that block
+ * after a period of spinning, because unbounded spinning would allow for
+ * priority inversion.
+ */
+
+/*
+ * We use an unpublished interface to initialize pthread mutexes with an
+ * allocation callback, in order to avoid infinite recursion.
+ */
+int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex,
+ void *(calloc_cb)(size_t, size_t));
+
+__weak_reference(_pthread_mutex_init_calloc_cb_stub,
+ _pthread_mutex_init_calloc_cb);
+
+int
+_pthread_mutex_init_calloc_cb_stub(pthread_mutex_t *mutex,
+ void *(calloc_cb)(size_t, size_t))
+{
+
+ return (0);
+}
+
+static bool
+malloc_spin_init(pthread_mutex_t *lock)
+{
+
+ if (_pthread_mutex_init_calloc_cb(lock, base_calloc) != 0)
+ return (true);
+
+ return (false);
+}
+
+static inline unsigned
+malloc_spin_lock(pthread_mutex_t *lock)
+{
+ unsigned ret = 0;
+
+ if (__isthreaded) {
+ if (_pthread_mutex_trylock(lock) != 0) {
+ unsigned i;
+ volatile unsigned j;
+
+ /* Exponentially back off. */
+ for (i = 1; i <= SPIN_LIMIT_2POW; i++) {
+ for (j = 0; j < (1U << i); j++)
+ ret++;
+
+ CPU_SPINWAIT;
+ if (_pthread_mutex_trylock(lock) == 0)
+ return (ret);
+ }
+
+ /*
+ * Spinning failed. Block until the lock becomes
+ * available, in order to avoid indefinite priority
+ * inversion.
+ */
+ _pthread_mutex_lock(lock);
+ assert((ret << BLOCK_COST_2POW) != 0);
+ return (ret << BLOCK_COST_2POW);
+ }
+ }
+
+ return (ret);
+}
+
+static inline void
+malloc_spin_unlock(pthread_mutex_t *lock)
+{
+
+ if (__isthreaded)
+ _pthread_mutex_unlock(lock);
+}
+
+/*
+ * End spin lock.
+ */
+/******************************************************************************/
+/*
 * Begin Utility functions/macros.
 */
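The block added above replaces the old spinlock-based arena mutex with an adaptive lock: try the lock, spin with exponentially growing backoff for up to roughly 2^SPIN_LIMIT_2POW probes, and only then block, scaling the reported wait by 2^BLOCK_COST_2POW so that a blocking acquisition weighs heavily in the contention statistics. The stand-alone sketch below illustrates the same pattern using only the public pthread API; it deliberately omits libc's __isthreaded check and the private _pthread_mutex_init_calloc_cb bootstrap hook, so it is an approximation of the idea, not the committed code.

    #include <assert.h>
    #include <pthread.h>
    #include <stdio.h>

    #define SPIN_LIMIT_2POW  11  /* stop spinning after ~2^11 probes */
    #define BLOCK_COST_2POW   4  /* treat blocking as 16x a worst-case spin */

    #if defined(__i386__) || defined(__amd64__)
    #define CPU_SPINWAIT __asm__ volatile("pause")
    #else
    #define CPU_SPINWAIT         /* no-op on other CPUs */
    #endif

    /* Returns a rough count of how long the caller had to wait. */
    static unsigned
    spin_lock(pthread_mutex_t *lock)
    {
        unsigned ret = 0;

        if (pthread_mutex_trylock(lock) != 0) {
            unsigned i;
            volatile unsigned j;

            /* Exponentially back off between trylock attempts. */
            for (i = 1; i <= SPIN_LIMIT_2POW; i++) {
                for (j = 0; j < (1U << i); j++)
                    ret++;
                CPU_SPINWAIT;
                if (pthread_mutex_trylock(lock) == 0)
                    return (ret);
            }

            /*
             * Spinning failed; block so that priority inversion cannot
             * deadlock us, and report a scaled-up cost.
             */
            pthread_mutex_lock(lock);
            assert((ret << BLOCK_COST_2POW) != 0);
            return (ret << BLOCK_COST_2POW);
        }
        return (ret);
    }

    static void
    spin_unlock(pthread_mutex_t *lock)
    {
        pthread_mutex_unlock(lock);
    }

    int
    main(void)
    {
        pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
        unsigned cost;

        cost = spin_lock(&m);           /* uncontended: cost is 0 */
        spin_unlock(&m);
        printf("uncontended acquisition cost: %u\n", cost);
        return (0);
    }

The return value is the interesting part: it is the per-acquisition contention sample that arena_lock_balance() later feeds into each arena's moving average.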
@@ -926,6 +1064,10 @@ prn_##suffix(uint32_t lg_range) \
 /* Define the per-thread PRNG used for lazy deallocation. */
 static __thread uint32_t lazy_free_x;
 PRN_DEFINE(lazy_free, lazy_free_x, 12345, 12347)
+
+/* Define the PRNG used for arena assignment. */
+static __thread uint32_t balance_x;
+PRN_DEFINE(balance, balance_x, 1297, 1301);
 #endif

 static void
@@ -1083,6 +1225,17 @@ RETURN:
 return (ret);
 }

+static void *
+base_calloc(size_t number, size_t size)
+{
+ void *ret;
+
+ ret = base_alloc(number * size);
+ memset(ret, 0, number * size);
+
+ return (ret);
+}
+
 static chunk_node_t *
 base_chunk_node_alloc(void)
 {
@@ -1574,12 +1727,12 @@ choose_arena(void)
 * Avoid races with another thread that may have already
 * initialized arenas[ind].
 */
- malloc_mutex_lock(&arenas_mtx);
+ malloc_spin_lock(&arenas_lock);
 if (arenas[ind] == NULL)
 ret = arenas_extend((unsigned)ind);
 else
 ret = arenas[ind];
- malloc_mutex_unlock(&arenas_mtx);
+ malloc_spin_unlock(&arenas_lock);
 }
 } else
 ret = arenas[0];
@@ -1613,21 +1766,27 @@ choose_arena_hard(void)
 */
 SPRN(lazy_free, (uint32_t)(uintptr_t)(_pthread_self()));

- /* Assign one of the arenas to this thread, in a round-robin fashion. */
- malloc_mutex_lock(&arenas_mtx);
- ret = arenas[next_arena];
- if (ret == NULL)
- ret = arenas_extend(next_arena);
- if (ret == NULL) {
- /*
- * Make sure that this function never returns NULL, so that
- * choose_arena() doesn't have to check for a NULL return
- * value.
- */
+ /*
+ * Seed the PRNG used for arena load balancing. We can get away with
+ * using the same seed here as for the lazy_free PRNG without
+ * introducing autocorrelation because the PRNG parameters are
+ * distinct.
+ */
+ SPRN(balance, (uint32_t)(uintptr_t)(_pthread_self()));
+
+ if (narenas > 1) {
+ unsigned ind;
+
+ ind = PRN(balance, narenas_2pow);
+ if ((ret = arenas[ind]) == NULL) {
+ malloc_spin_lock(&arenas_lock);
+ if ((ret = arenas[ind]) == NULL)
+ ret = arenas_extend(ind);
+ malloc_spin_unlock(&arenas_lock);
+ }
+ } else
 ret = arenas[0];
- }
- next_arena = (next_arena + 1) % narenas;
- malloc_mutex_unlock(&arenas_mtx);
+ arenas_map = ret;

 return (ret);
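With the hunk above, choose_arena_hard() no longer hands out arenas round-robin: each new thread draws an index from a per-thread PRNG, and the chosen arena is created lazily, re-checking the slot under arenas_lock only when the unlocked read saw NULL. The diff shows PRN_DEFINE(balance, balance_x, 1297, 1301) but not the macro body, so the generator in the sketch below is an assumption — a 32-bit linear congruential generator with those constants whose top lg_range bits become the arena index.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t balance_x;          /* per-thread (__thread) in libc */

    static void
    balance_seed(uint32_t seed)
    {
        balance_x = seed;               /* the patch seeds from _pthread_self() */
    }

    /* Hypothetical stand-in for PRN(balance, lg_range). */
    static uint32_t
    balance_prn(uint32_t lg_range)
    {
        assert(lg_range > 0 && lg_range <= 32);
        balance_x = (balance_x * 1297) + 1301;
        return (balance_x >> (32 - lg_range));
    }

    int
    main(void)
    {
        uint32_t narenas_2pow = 2;      /* e.g. four arenas */
        unsigned i;

        balance_seed(42);
        for (i = 0; i < 8; i++)
            printf("thread would start on arenas[%u]\n",
                (unsigned)balance_prn(narenas_2pow));
        return (0);
    }

Reading arenas[ind] once without the lock and again after taking arenas_lock means an arena is created at most once per slot while the common path stays lock-free.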
@@ -2325,6 +2484,48 @@ arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size)
 return (good_run_size);
 }

+static inline void
+arena_lock_balance(arena_t *arena)
+{
+#ifndef NO_TLS
+ unsigned contention;
+
+ contention = malloc_spin_lock(&arena->lock);
+ if (narenas > 1) {
+ /*
+ * Calculate the exponentially averaged contention for this
+ * arena. Due to integer math always rounding down, this value
+ * decays somewhat faster then normal.
+ */
+ arena->contention = (((uint64_t)arena->contention
+ * (uint64_t)((1U << BALANCE_ALPHA_INV_2POW)-1))
+ + (uint64_t)contention) >> BALANCE_ALPHA_INV_2POW;
+ if (arena->contention >= opt_balance_threshold) {
+ uint32_t ind;
+
+ arena->contention = 0;
+#ifdef MALLOC_STATS
+ arena->stats.nbalance++;
+#endif
+ ind = PRN(balance, narenas_2pow);
+ if (arenas[ind] != NULL)
+ arenas_map = arenas[ind];
+ else {
+ malloc_spin_lock(&arenas_lock);
+ if (arenas[ind] != NULL)
+ arenas_map = arenas[ind];
+ else
+ arenas_map = arenas_extend(ind);
+ malloc_spin_unlock(&arenas_lock);
+ }
+ }
+ }
+#else
+ malloc_spin_lock(&arena->lock);
+#endif
+
+}
+
 static void *
 arena_malloc(arena_t *arena, size_t size, bool zero)
 {
@@ -2368,14 +2569,14 @@ arena_malloc(arena_t *arena, size_t size, bool zero)
 }
 assert(size == bin->reg_size);

- malloc_mutex_lock(&arena->mtx);
+ arena_lock_balance(arena);
 if ((run = bin->runcur) != NULL && run->nfree > 0)
 ret = arena_bin_malloc_easy(arena, bin, run);
 else
 ret = arena_bin_malloc_hard(arena, bin);

 if (ret == NULL) {
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
 return (NULL);
 }
@@ -2384,7 +2585,7 @@ arena_malloc(arena_t *arena, size_t size, bool zero)
 arena->stats.nmalloc_small++;
 arena->stats.allocated_small += size;
 #endif
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);

 if (zero == false) {
 if (opt_junk)
@@ -2396,17 +2597,17 @@ arena_malloc(arena_t *arena, size_t size, bool zero)
 } else {
 /* Large allocation. */
 size = PAGE_CEILING(size);
- malloc_mutex_lock(&arena->mtx);
+ arena_lock_balance(arena);
 ret = (void *)arena_run_alloc(arena, size, true); // XXX zero?
 if (ret == NULL) {
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
 return (NULL);
 }
 #ifdef MALLOC_STATS
 arena->stats.nmalloc_large++;
 arena->stats.allocated_large += size;
 #endif
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);

 if (zero == false) {
 if (opt_junk)
@@ -2453,10 +2654,10 @@ arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size)

 npages = size >> pagesize_2pow;

- malloc_mutex_lock(&arena->mtx);
+ arena_lock_balance(arena);
 ret = (void *)arena_run_alloc(arena, alloc_size, false);
 if (ret == NULL) {
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
 return (NULL);
 }
@@ -2509,7 +2710,7 @@ arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size)
 arena->stats.nmalloc_large++;
 arena->stats.allocated_large += size;
 #endif
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);

 if (opt_junk)
 memset(ret, 0xa5, size);
@@ -2675,9 +2876,9 @@ arena_dalloc_lazy(arena_t *arena, arena_chunk_t *chunk, void *ptr,
 unsigned i, slot;

 if (!__isthreaded || opt_lazy_free_2pow < 0) {
- malloc_mutex_lock(&arena->mtx);
+ malloc_spin_lock(&arena->lock);
 arena_dalloc_small(arena, chunk, ptr, pageind, mapelm);
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
 return;
 }
@@ -2689,7 +2890,7 @@ arena_dalloc_lazy(arena_t *arena, arena_chunk_t *chunk, void *ptr,
 }
 }

- malloc_mutex_lock(&arena->mtx);
+ malloc_spin_lock(&arena->lock);
 arena_dalloc_small(arena, chunk, ptr, pageind, mapelm);

 /*
@@ -2736,7 +2937,7 @@ arena_dalloc_lazy(arena_t *arena, arena_chunk_t *chunk, void *ptr,
 }
 }

- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
 }
 #endif
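arena_lock_balance(), added at the start of this hunk, folds each acquisition's spin cost into an exponentially weighted moving average with alpha = 1/2^BALANCE_ALPHA_INV_2POW and, once the average crosses opt_balance_threshold, resets it and moves the calling thread to a randomly chosen arena. The arithmetic is easy to verify in isolation; the sketch below reuses the same update expression with invented sample values to show the average climbing under contention and decaying once acquisitions become cheap again.

    #include <stdint.h>
    #include <stdio.h>

    #define BALANCE_ALPHA_INV_2POW 9
    #define SPIN_LIMIT_2POW 11
    #define BALANCE_THRESHOLD_DEFAULT (1U << (SPIN_LIMIT_2POW - 4))

    /* avg' = (avg * (2^k - 1) + sample) / 2^k, k = BALANCE_ALPHA_INV_2POW. */
    static uint32_t
    contention_ewma(uint32_t avg, uint32_t sample)
    {
        return (uint32_t)((((uint64_t)avg *
            (uint64_t)((1U << BALANCE_ALPHA_INV_2POW) - 1)) +
            (uint64_t)sample) >> BALANCE_ALPHA_INV_2POW);
    }

    int
    main(void)
    {
        uint32_t avg = 0;
        unsigned i;

        /* A burst of expensive acquisitions pushes the average up... */
        for (i = 0; i < 64; i++)
            avg = contention_ewma(avg, 5000);
        printf("after burst: %u (threshold %u)\n",
            (unsigned)avg, (unsigned)BALANCE_THRESHOLD_DEFAULT);

        /* ...and uncontended ones let it decay back toward zero. */
        for (i = 0; i < 4096; i++)
            avg = contention_ewma(avg, 0);
        printf("after quiet period: %u\n", (unsigned)avg);
        return (0);
    }

Because the integer division rounds down at every step, the average decays slightly faster than the exact formula would, which is the caveat the comment in the hunk points out.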
@@ -2760,9 +2961,9 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)

 #ifndef NO_TLS
 arena_dalloc_lazy(arena, chunk, ptr, pageind, mapelm);
 #else
- malloc_mutex_lock(&arena->mtx);
+ malloc_spin_lock(&arena->lock);
 arena_dalloc_small(arena, chunk, ptr, pageind, mapelm);
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
 #endif
 } else {
 size_t size;
@@ -2775,13 +2976,13 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
 if (opt_junk)
 memset(ptr, 0x5a, size);

- malloc_mutex_lock(&arena->mtx);
+ malloc_spin_lock(&arena->lock);
 arena_run_dalloc(arena, (arena_run_t *)ptr, size);
 #ifdef MALLOC_STATS
 arena->stats.allocated_large -= size;
 arena->stats.ndalloc_large++;
 #endif
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
 }
 }
@@ -2792,7 +2993,8 @@ arena_new(arena_t *arena)
 arena_bin_t *bin;
 size_t pow2_size, prev_run_size;

- malloc_mutex_init(&arena->mtx);
+ if (malloc_spin_init(&arena->lock))
+ return (true);

 #ifdef MALLOC_STATS
 memset(&arena->stats, 0, sizeof(arena_stats_t));
@@ -2803,6 +3005,7 @@ arena_new(arena_t *arena)
 arena->spare = NULL;

 #ifndef NO_TLS
+ arena->contention = 0;
 if (opt_lazy_free_2pow >= 0) {
 arena->free_cache = (void **) base_alloc(sizeof(void *) *
 (1U << opt_lazy_free_2pow));
@@ -3332,6 +3535,8 @@ malloc_print_stats(void)
 umax2s(1U << opt_lazy_free_2pow, s), "\n", "");
 } else
 _malloc_message("Lazy free slots: 0\n", "", "", "");
+ _malloc_message("Arena balance threshold: ",
+ umax2s(opt_balance_threshold, s), "\n", "");
 #endif
 _malloc_message("Pointer size: ", umax2s(sizeof(void *), s),
 "\n", "");
@@ -3345,6 +3550,9 @@ malloc_print_stats(void)
 #ifdef MALLOC_STATS
 {
 size_t allocated, mapped;
+#ifndef NO_TLS
+ uint64_t nbalance = 0;
+#endif
 unsigned i;
 arena_t *arena;
@@ -3353,12 +3561,15 @@ malloc_print_stats(void)

 /* arenas. */
 for (i = 0, allocated = 0; i < narenas; i++) {
 if (arenas[i] != NULL) {
- malloc_mutex_lock(&arenas[i]->mtx);
+ malloc_spin_lock(&arenas[i]->lock);
 allocated += arenas[i]->stats.allocated_small;
 allocated += arenas[i]->stats.allocated_large;
- malloc_mutex_unlock(&arenas[i]->mtx);
+#ifndef NO_TLS
+ nbalance += arenas[i]->stats.nbalance;
+#endif
+ malloc_spin_unlock(&arenas[i]->lock);
 }
 }
@@ -3375,6 +3586,11 @@ malloc_print_stats(void)
 malloc_printf("Allocated: %zu, mapped: %zu\n",
 allocated, mapped);

+#ifndef NO_TLS
+ malloc_printf("Arena balance reassignments: %llu\n",
+ nbalance);
+#endif
+
 /* Print chunk stats. */
 {
 chunk_stats_t chunks_stats;
@@ -3403,9 +3619,9 @@ malloc_print_stats(void)
 if (arena != NULL) {
 malloc_printf(
 "\narenas[%u]:\n", i);
- malloc_mutex_lock(&arena->mtx);
+ malloc_spin_lock(&arena->lock);
 stats_print(arena);
- malloc_mutex_unlock(&arena->mtx);
+ malloc_spin_unlock(&arena->lock);
 }
 }
 }
@@ -3540,6 +3756,20 @@ malloc_init_hard(void)
 case 'A':
 opt_abort = true;
 break;
+ case 'b':
+#ifndef NO_TLS
+ opt_balance_threshold >>= 1;
+#endif
+ break;
+ case 'B':
+#ifndef NO_TLS
+ if (opt_balance_threshold == 0)
+ opt_balance_threshold = 1;
+ else if ((opt_balance_threshold << 1)
+ > opt_balance_threshold)
+ opt_balance_threshold <<= 1;
+#endif
+ break;
 case 'h':
 opt_hint = false;
 break;
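The new 'b' and 'B' malloc options adjust opt_balance_threshold at startup: each 'b' halves it and each 'B' doubles it, with the 'B' branch doubling only while the shift does not wrap, so repeated flags saturate rather than overflowing to zero. A minimal sketch of just that option handling (the surrounding flag parser is omitted):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t opt_balance_threshold = 1U << (11 - 4);    /* default: 128 */

    static void
    balance_opt(char opt)
    {
        switch (opt) {
        case 'b':               /* halve the reassignment threshold */
            opt_balance_threshold >>= 1;
            break;
        case 'B':               /* double it, guarding against overflow */
            if (opt_balance_threshold == 0)
                opt_balance_threshold = 1;
            else if ((opt_balance_threshold << 1) > opt_balance_threshold)
                opt_balance_threshold <<= 1;
            break;
        }
    }

    int
    main(void)
    {
        balance_opt('B');
        balance_opt('B');
        balance_opt('b');
        printf("threshold = %llu\n",
            (unsigned long long)opt_balance_threshold);         /* prints 256 */
        return (0);
    }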
@@ -3565,8 +3795,8 @@ malloc_init_hard(void)
 /*
 * There must be fewer pages in a chunk than
 * can be recorded by the pos field of
- * arena_chunk_map_t, in order to make POS_FREE
- * special.
+ * arena_chunk_map_t, in order to make
+ * POS_EMPTY/POS_FREE special.
 */
 if (opt_chunk_2pow - pagesize_2pow <
 (sizeof(uint32_t) << 3) - 1)
@@ -3770,6 +4000,12 @@ malloc_init_hard(void)
 if (narenas == 0)
 narenas = 1;
 }
+#ifndef NO_TLS
+ assert(narenas != 0);
+ for (narenas_2pow = 0;
+ (narenas >> (narenas_2pow + 1)) != 0;
+ narenas_2pow++);
+#endif

 #ifdef NO_TLS
 if (narenas > 1) {
@@ -3798,10 +4034,6 @@ malloc_init_hard(void)
 }
 #endif

-#ifndef NO_TLS
- next_arena = 0;
-#endif
-
 /* Allocate and initialize arenas. */
 arenas = (arena_t **)base_alloc(sizeof(arena_t *) * narenas);
 if (arenas == NULL) {
@@ -3825,13 +4057,20 @@ malloc_init_hard(void)
 }

 #ifndef NO_TLS
 /*
+ * Assign the initial arena to the initial thread, in order to avoid
+ * spurious creation of an extra arena if the application switches to
+ * threaded mode.
+ */
+ arenas_map = arenas[0];
+
+ /*
 * Seed here for the initial thread, since choose_arena_hard() is only
 * called for other threads. The seed values don't really matter.
 */
 SPRN(lazy_free, 42);
+ SPRN(balance, 42);
 #endif

- malloc_mutex_init(&arenas_mtx);
+ malloc_spin_init(&arenas_lock);
 malloc_initialized = true;
 malloc_mutex_unlock(&init_lock);
@@ -4075,12 +4314,12 @@ _malloc_prefork(void)

 /* Acquire all mutexes in a safe order. */

- malloc_mutex_lock(&arenas_mtx);
+ malloc_spin_lock(&arenas_lock);
 for (i = 0; i < narenas; i++) {
 if (arenas[i] != NULL)
- malloc_mutex_lock(&arenas[i]->mtx);
+ malloc_spin_lock(&arenas[i]->lock);
 }
- malloc_mutex_unlock(&arenas_mtx);
+ malloc_spin_unlock(&arenas_lock);

 malloc_mutex_lock(&base_mtx);
@@ -4098,12 +4337,12 @@ _malloc_postfork(void)

 malloc_mutex_unlock(&base_mtx);

- malloc_mutex_lock(&arenas_mtx);
+ malloc_spin_lock(&arenas_lock);
 for (i = 0; i < narenas; i++) {
 if (arenas[i] != NULL)
- malloc_mutex_unlock(&arenas[i]->mtx);
+ malloc_spin_unlock(&arenas[i]->lock);
 }
- malloc_mutex_unlock(&arenas_mtx);
+ malloc_spin_unlock(&arenas_lock);
 }

 /*
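malloc_init_hard() also computes narenas_2pow, the floor of log2(narenas), with the shift loop shown in the hunks above; this is the lg_range later passed to PRN(balance, ...). The loop is reproduced verbatim in the small demo below:

    #include <assert.h>
    #include <stdio.h>

    int
    main(void)
    {
        unsigned narenas, narenas_2pow;

        for (narenas = 1; narenas <= 9; narenas++) {
            /* Same computation as in malloc_init_hard(). */
            assert(narenas != 0);
            for (narenas_2pow = 0;
                (narenas >> (narenas_2pow + 1)) != 0;
                narenas_2pow++);
            printf("narenas=%u -> narenas_2pow=%u\n",
                narenas, narenas_2pow);
        }
        return (0);
    }

One consequence, at least as far as these hunks show, is that the balancer only ever assigns threads among the first 2^narenas_2pow arenas when narenas is not itself a power of two.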