From ecc57500104c134a8f529cf4a64988e635ba5dc9 Mon Sep 17 00:00:00 2001 From: jasone Date: Tue, 4 Apr 2006 03:51:47 +0000 Subject: Refactor per-run bitmap manipulation functions so that bitmap offsets only have to be calculated once per allocator operation. Make nil const. Update various comments. Remove/avoid division where possible. For the one division operation that remains in the critical path, add a switch statement that has a case for each small size class, and do division with a constant divisor in each case. This allows the compiler to generate optimized code that does not use hardware division [1]. Obtained from: peter [1] --- lib/libc/stdlib/malloc.c | 200 +++++++++++++++++++++++++++++++---------------- 1 file changed, 131 insertions(+), 69 deletions(-) (limited to 'lib/libc/stdlib') diff --git a/lib/libc/stdlib/malloc.c b/lib/libc/stdlib/malloc.c index a2a83ce..86c22fe 100644 --- a/lib/libc/stdlib/malloc.c +++ b/lib/libc/stdlib/malloc.c @@ -66,7 +66,7 @@ * | | Sub-page | 1 kB | * | | | 2 kB | * |====================================| - * | Medium | 4 kB | + * | Large | 4 kB | * | | 8 kB | * | | 16 kB | * | | ... | @@ -74,7 +74,7 @@ * | | 512 kB | * | | 1 MB | * |====================================| - * | Large | 2 MB | + * | Huge | 2 MB | * | | 4 MB | * | | 6 MB | * | | ... | @@ -85,11 +85,11 @@ * Small : Each size class is segregated into its own set of runs. Each run * maintains a bitmap of which regions are free/allocated. * - * Medium : Each allocation is backed by a dedicated run. Metadata are stored - * in the associated arena chunk header maps. + * Large : Each allocation is backed by a dedicated run. Metadata are stored + * in the associated arena chunk header maps. * - * Large : Each allocation is backed by a dedicated contiguous set of chunks. - * Metadata are stored in a separate red-black tree. + * Huge : Each allocation is backed by a dedicated contiguous set of chunks. + * Metadata are stored in a separate red-black tree. * ******************************************************************************* */ @@ -143,8 +143,10 @@ (a_qr_b)->a_field.qre_prev = t; \ } while (0) -/* qr_meld() and qr_split() are functionally equivalent, so there's no need to - * have two copies of the code. */ +/* + * qr_meld() and qr_split() are functionally equivalent, so there's no need to + * have two copies of the code. + */ #define qr_split(a_qr_a, a_qr_b, a_type, a_field) \ qr_meld((a_qr_a), (a_qr_b), a_type, a_field) @@ -173,7 +175,7 @@ /* * In order to disable various extra features that may have negative - * performance impacts, (assertions, expanded statistics, redzones), define + * performance impacts, (assertions, expanded statistics), define * NO_MALLOC_EXTRAS. */ /* #define NO_MALLOC_EXTRAS */ @@ -654,7 +656,7 @@ struct arena_s { */ /* Used as a special "nil" return value for malloc(0). */ -static int nil; +static const int nil; /* Number of CPUs. */ static unsigned ncpus; @@ -1498,52 +1500,29 @@ arena_chunk_comp(arena_chunk_t *a, arena_chunk_t *b) /* Generate red-black tree code for arena chunks. 
*/ RB_GENERATE_STATIC(arena_chunk_tree_s, arena_chunk_s, link, arena_chunk_comp); -static inline void -arena_run_mask_free_set(arena_run_t *run, unsigned reg) -{ - unsigned elm, bit; - - assert(run->magic == ARENA_RUN_MAGIC); - assert(reg < run->bin->nregs); - - elm = reg >> (SIZEOF_INT_2POW + 3); - if (elm < run->regs_minelm) - run->regs_minelm = elm; - bit = reg - (elm << (SIZEOF_INT_2POW + 3)); - assert((run->regs_mask[elm] & (1 << bit)) == 0); - run->regs_mask[elm] |= (1 << bit); -} - -static inline void -arena_run_mask_free_unset(arena_run_t *run, unsigned reg) -{ - unsigned elm, bit; - - assert(run->magic == ARENA_RUN_MAGIC); - assert(reg < run->bin->nregs); - - elm = reg >> (SIZEOF_INT_2POW + 3); - bit = reg - (elm << (SIZEOF_INT_2POW + 3)); - assert((run->regs_mask[elm] & (1 << bit)) != 0); - run->regs_mask[elm] ^= (1 << bit); -} - -static inline unsigned -arena_run_search(arena_run_t *run) +static inline void * +arena_run_reg_alloc(arena_run_t *run, arena_bin_t *bin) { - unsigned i, mask, bit; + void *ret; + unsigned i, mask, bit, regind; assert(run->magic == ARENA_RUN_MAGIC); for (i = run->regs_minelm; i < REGS_MASK_NELMS; i++) { mask = run->regs_mask[i]; if (mask != 0) { - bit = ffs(mask); - if (bit != 0) { - /* Usable allocation found. */ - return ((i << (SIZEOF_INT_2POW + 3)) - + bit - 1); - } + /* Usable allocation found. */ + bit = ffs(mask) - 1; + + regind = ((i << (SIZEOF_INT_2POW + 3)) + bit); + ret = (void *)&((char *)run)[bin->reg0_offset + + (bin->reg_size * regind)]; + + /* Clear bit. */ + mask ^= (1 << bit); + run->regs_mask[i] = mask; + + return (ret); } else { /* * Make a note that nothing before this element @@ -1552,8 +1531,94 @@ arena_run_search(arena_run_t *run) run->regs_minelm = i + 1; } } + /* Not reached. */ + assert(0); +} + +static inline void +arena_run_reg_dalloc(arena_run_t *run, arena_bin_t *bin, void *ptr, size_t size) +{ + unsigned diff, regind, elm, bit; + + assert(run->magic == ARENA_RUN_MAGIC); + + /* + * Avoid doing division with a variable divisor if possible. This + * single operation can reduce allocator throughput by around 20%! + */ +#define POW2_CASE(p) \ + case (1 << (p)): \ + regind = diff >> (p); \ + break; +#define QUANTUM_CASE(n) \ + case ((n) << QUANTUM_2POW_MIN): \ + regind = diff / ((n) << QUANTUM_2POW_MIN); \ + break; + + /* + * These assertions make sure that the switch statement matches + * compile-time configuration. + */ + assert(tiny_min_2pow >= 1); + assert(QUANTUM_2POW_MIN >= 3 && QUANTUM_2POW_MIN <= 4); + assert(SMALL_MAX_2POW_DEFAULT == 9); + + diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset); + switch (size) { + POW2_CASE(1) + POW2_CASE(2) +#if (QUANTUM_2POW_MIN > 3) + POW2_CASE(3) +#endif + QUANTUM_CASE(1) + QUANTUM_CASE(2) + QUANTUM_CASE(3) + QUANTUM_CASE(4) + QUANTUM_CASE(5) + QUANTUM_CASE(6) + QUANTUM_CASE(7) + QUANTUM_CASE(8) + QUANTUM_CASE(9) + QUANTUM_CASE(10) + QUANTUM_CASE(11) + QUANTUM_CASE(12) + QUANTUM_CASE(13) + QUANTUM_CASE(14) + QUANTUM_CASE(15) + QUANTUM_CASE(16) + QUANTUM_CASE(17) + QUANTUM_CASE(18) + QUANTUM_CASE(19) + QUANTUM_CASE(20) + QUANTUM_CASE(21) + QUANTUM_CASE(22) + QUANTUM_CASE(23) + QUANTUM_CASE(24) + QUANTUM_CASE(25) + QUANTUM_CASE(26) + QUANTUM_CASE(27) + QUANTUM_CASE(28) + QUANTUM_CASE(29) + QUANTUM_CASE(30) + QUANTUM_CASE(31) + QUANTUM_CASE(32) + + POW2_CASE(10) + POW2_CASE(11) + POW2_CASE(12) /* Handle up to 8 kB pages. 
*/ + default: + regind = diff / size; + } +#undef POW2_CASE +#undef QUANTUM_CASE + assert(regind < bin->nregs); - return (UINT_MAX); + elm = regind >> (SIZEOF_INT_2POW + 3); + if (elm < run->regs_minelm) + run->regs_minelm = elm; + bit = regind - (elm << (SIZEOF_INT_2POW + 3)); + assert((run->regs_mask[elm] & (1 << bit)) == 0); + run->regs_mask[elm] |= (1 << bit); } static void @@ -1638,7 +1703,7 @@ arena_chunk_alloc(arena_t *arena) header_size += pagesize - (header_size % pagesize); } - header_npages = header_size / pagesize; + header_npages = header_size >> pagesize_2pow; pow2_header_npages = pow2_ceil(header_npages); /* @@ -1665,7 +1730,7 @@ arena_chunk_alloc(arena_t *arena) * the beginning of the chunk, which we just took care of by * "allocating" the leading pages. */ - while (map_offset < (chunk_size / pagesize)) { + while (map_offset < (chunk_size >> pagesize_2pow)) { log2_run_pages = ffs(map_offset) - 1; run_pages = (1 << log2_run_pages); for (i = 0; i < run_pages; i++) { @@ -1858,7 +1923,7 @@ AGAIN: * large enough. Look for a precise fit, but do not pass up a chunk * that has a run which is large enough to split. */ - min_ind = ffs(size / pagesize) - 1; + min_ind = ffs(size >> pagesize_2pow) - 1; RB_FOREACH(chunk, arena_chunk_tree_s, &arena->chunks) { for (i = min_ind; i < (opt_chunk_2pow - pagesize_2pow); @@ -2044,18 +2109,12 @@ arena_bin_malloc_easy(arena_t *arena, arena_bin_t *bin, arena_run_t *run, size_t size) { void *ret; - unsigned regind; assert(run->magic == ARENA_RUN_MAGIC); assert(run->nfree > 0); - regind = arena_run_search(run); - assert(regind != UINT_MAX); - assert(regind < bin->nregs); - - ret = (void *)&((char *)run)[bin->reg0_offset + (bin->reg_size - * regind)]; - arena_run_mask_free_unset(run, regind); + ret = arena_run_reg_alloc(run, bin); + assert(ret != NULL); run->nfree--; if (run->nfree < run->free_min) { /* Promote run to higher fullness quartile. */ @@ -2256,7 +2315,6 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) if (mapelm.large == false) { arena_run_t *run; arena_bin_t *bin; - unsigned regind; /* Small allocation. */ @@ -2270,10 +2328,8 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr) if (opt_junk) memset(ptr, 0x5a, size); - regind = (unsigned)(((uintptr_t)ptr - (uintptr_t)run - - bin->reg0_offset) / size); malloc_mutex_lock(&arena->mtx); - arena_run_mask_free_set(run, regind); + arena_run_reg_dalloc(run, bin, ptr, size); run->nfree++; if (run->nfree > run->free_max) { /* Demote run to lower fullness quartile. */ @@ -3325,7 +3381,7 @@ malloc(size_t size) if (size == 0) { if (opt_sysv == false) - ret = &nil; + ret = (void *)&nil; else ret = NULL; goto RETURN; @@ -3407,11 +3463,17 @@ calloc(size_t num, size_t size) num_size = num * size; if (num_size == 0) { if (opt_sysv == false) - ret = &nil; + ret = (void *)&nil; else ret = NULL; goto RETURN; - } else if (num_size / size != num) { + /* + * Try to avoid division here. We know that it isn't possible to + * overflow during multiplication if neither operand uses any of the + * most significant half of the bits in a size_t. + */ + } else if (((num | size) & (SIZE_T_MAX << (sizeof(size_t) << 2))) + && (num_size / size != num)) { /* size_t overflow. */ ret = NULL; goto RETURN; @@ -3474,7 +3536,7 @@ realloc(void *ptr, size_t size) if (ptr != &nil && ptr != NULL) idalloc(ptr); - ret = &nil; + ret = (void *)&nil; } UTRACE(ptr, size, ret); -- cgit v1.1
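
The constant-divisor dispatch added to arena_run_reg_dalloc() above can be studied in isolation with the small sketch below. This is only a sketch under assumed parameters (a fixed 16-byte quantum and a handful of size classes); the real function covers every small size class and keys its cases to QUANTUM_2POW_MIN and SMALL_MAX_2POW_DEFAULT. The function name region_index() is made up for the example; the POW2_CASE/QUANTUM_CASE macro names mirror the ones in the patch.

#include <assert.h>
#include <stddef.h>

#define	QUANTUM_2POW	4		/* Assume a 16-byte quantum. */

/*
 * Map a byte offset within a run to a region index.  Every non-default case
 * divides by a compile-time constant, so the compiler can turn the division
 * into a multiply/shift sequence instead of a hardware divide.
 */
static unsigned
region_index(unsigned diff, size_t size)
{
#define	POW2_CASE(p)							\
	case (1U << (p)):						\
		return (diff >> (p));
#define	QUANTUM_CASE(n)							\
	case ((n) << QUANTUM_2POW):					\
		return (diff / ((n) << QUANTUM_2POW));

	switch (size) {
	POW2_CASE(1)			/* 2-byte regions. */
	POW2_CASE(2)			/* 4 bytes. */
	POW2_CASE(3)			/* 8 bytes. */
	QUANTUM_CASE(1)			/* 16 bytes. */
	QUANTUM_CASE(2)			/* 32 bytes. */
	QUANTUM_CASE(3)			/* 48 bytes. */
	QUANTUM_CASE(4)			/* 64 bytes. */
	POW2_CASE(10)			/* 1 kB sub-page class. */
	POW2_CASE(11)			/* 2 kB sub-page class. */
	default:
		/* Sizes with no case fall back to a real division. */
		return (diff / (unsigned)size);
	}
#undef POW2_CASE
#undef QUANTUM_CASE
}

int
main(void)
{

	/* Offset 144 in a run of 48-byte regions is region 3. */
	assert(region_index(144, 48) == 3);
	/* Power-of-two classes reduce to a plain shift. */
	assert(region_index(6144, 2048) == 3);
	return (0);
}

Because every non-default case divides by a compile-time constant, the compiler can strength-reduce the divide into a multiply/shift (reciprocal) sequence; only the default arm ever reaches the hardware divider, which is what the commit message means by avoiding hardware division in the critical path.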
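
The overflow shortcut added to calloc() above rests on a simple observation: if neither operand has any bits set in the most significant half of a size_t, their product cannot overflow, so the exact division-based check can be skipped. The helper below is an illustrative, standalone restatement of that test; the name mul_overflows() and the explicit size != 0 guard are not part of the patch (calloc() has already handled the zero case before this check, and the patch uses BSD's SIZE_T_MAX rather than the standard SIZE_MAX).

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Return true if num * size overflows a size_t, storing the (possibly
 * wrapped) product in *prod.
 */
static bool
mul_overflows(size_t num, size_t size, size_t *prod)
{

	*prod = num * size;

	/*
	 * If neither operand uses any of the most significant half of the
	 * bits in a size_t, the product cannot have overflowed, and the
	 * division check is skipped.
	 */
	if (((num | size) & (SIZE_MAX << (sizeof(size_t) << 2))) == 0)
		return (false);

	/* Otherwise do the exact, but slower, division-based check. */
	return (size != 0 && *prod / size != num);
}

int
main(void)
{
	size_t prod;

	assert(!mul_overflows(1000, 1000, &prod) && prod == 1000000);
	assert(mul_overflows(SIZE_MAX, 2, &prod));
	return (0);
}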