summaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorjasone <jasone@FreeBSD.org>2006-07-01 16:51:10 +0000
committerjasone <jasone@FreeBSD.org>2006-07-01 16:51:10 +0000
commit64896f5bfe82d13c4ef89639922bcbe4bf176ae2 (patch)
tree7ab664d386fc26dc09adecc8061149dea6000e7d /lib
parentcfa2c683aec5a35f84bceee2684770146d0eb553 (diff)
downloadFreeBSD-src-64896f5bfe82d13c4ef89639922bcbe4bf176ae2.zip
FreeBSD-src-64896f5bfe82d13c4ef89639922bcbe4bf176ae2.tar.gz
Use some math tricks in arena_run_reg_dalloc() to avoid actual division, as
well as avoiding a switch statement. This change has no significant impact to performance when branch prediction is successful at predicting the sizes of objects passed to free(), but in the case that the object sizes are semi-random, this change has the potential to prevent many branch prediction misses, thus improving performance substantially. Take advantage of alignment guarantees in ipalloc(), and pad object sizes to something less than a power of two when possible. This has the potential to substantially reduce internal fragmentation for objects allocated via posix_memalign(). Avoid an unnecessary pow2_ceil() call in arena_ralloc(). Submitted by: djam8193ah@hotmail.com
Diffstat (limited to 'lib')
-rw-r--r--lib/libc/stdlib/malloc.c173
1 files changed, 90 insertions, 83 deletions
diff --git a/lib/libc/stdlib/malloc.c b/lib/libc/stdlib/malloc.c
index 313fad5..7154ee4 100644
--- a/lib/libc/stdlib/malloc.c
+++ b/lib/libc/stdlib/malloc.c
@@ -1562,84 +1562,83 @@ arena_run_reg_alloc(arena_run_t *run, arena_bin_t *bin)
static inline void
arena_run_reg_dalloc(arena_run_t *run, arena_bin_t *bin, void *ptr, size_t size)
{
+ /*
+ * To divide by a number D that is not a power of two we multiply
+ * by (2^21 / D) and then right shift by 21 positions.
+ *
+ * X / D
+ *
+ * becomes
+ *
+ * (size_invs[(D >> QUANTUM_2POW_MIN) - 3] * D) >> SIZE_INV_SHIFT
+ */
+#define SIZE_INV_SHIFT 21
+#define SIZE_INV(s) (((1 << SIZE_INV_SHIFT) / (s << QUANTUM_2POW_MIN)) + 1)
+ static const unsigned size_invs[] = {
+ SIZE_INV(3),
+ SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7),
+ SIZE_INV(8), SIZE_INV(9), SIZE_INV(10), SIZE_INV(11),
+ SIZE_INV(12),SIZE_INV(13), SIZE_INV(14), SIZE_INV(15),
+ SIZE_INV(16),SIZE_INV(17), SIZE_INV(18), SIZE_INV(19),
+ SIZE_INV(20),SIZE_INV(21), SIZE_INV(22), SIZE_INV(23),
+ SIZE_INV(24),SIZE_INV(25), SIZE_INV(26), SIZE_INV(27),
+ SIZE_INV(28),SIZE_INV(29), SIZE_INV(30), SIZE_INV(31)
+ };
unsigned diff, regind, elm, bit;
assert(run->magic == ARENA_RUN_MAGIC);
+ assert(((sizeof(size_invs)) / sizeof(unsigned)) + 3
+ >= (SMALL_MAX_DEFAULT >> QUANTUM_2POW_MIN));
/*
- * Avoid doing division with a variable divisor if possible. This
- * single operation can reduce allocator throughput by around 20%!
+ * Avoid doing division with a variable divisor if possible. Using
+ * actual division here can reduce allocator throughput by over 20%!
*/
-#define POW2_CASE(p) \
- case (1 << (p)): \
- regind = diff >> (p); \
- break;
-#define QUANTUM_CASE(n) \
- case ((n) << QUANTUM_2POW_MIN): \
- regind = diff / ((n) << QUANTUM_2POW_MIN); \
- break;
-
- /*
- * These assertions make sure that the switch statement matches
- * compile-time configuration.
- */
- assert(tiny_min_2pow >= 1);
- assert(QUANTUM_2POW_MIN >= 3 && QUANTUM_2POW_MIN <= 4);
- assert(SMALL_MAX_2POW_DEFAULT == 9);
-
diff = (unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->reg0_offset);
- switch (size) {
- POW2_CASE(1)
- POW2_CASE(2)
-#if (QUANTUM_2POW_MIN > 3)
- POW2_CASE(3)
-#endif
- QUANTUM_CASE(1)
- QUANTUM_CASE(2)
- QUANTUM_CASE(3)
- QUANTUM_CASE(4)
- QUANTUM_CASE(5)
- QUANTUM_CASE(6)
- QUANTUM_CASE(7)
- QUANTUM_CASE(8)
- QUANTUM_CASE(9)
- QUANTUM_CASE(10)
- QUANTUM_CASE(11)
- QUANTUM_CASE(12)
- QUANTUM_CASE(13)
- QUANTUM_CASE(14)
- QUANTUM_CASE(15)
- QUANTUM_CASE(16)
- QUANTUM_CASE(17)
- QUANTUM_CASE(18)
- QUANTUM_CASE(19)
- QUANTUM_CASE(20)
- QUANTUM_CASE(21)
- QUANTUM_CASE(22)
- QUANTUM_CASE(23)
- QUANTUM_CASE(24)
- QUANTUM_CASE(25)
- QUANTUM_CASE(26)
- QUANTUM_CASE(27)
- QUANTUM_CASE(28)
- QUANTUM_CASE(29)
- QUANTUM_CASE(30)
- QUANTUM_CASE(31)
- QUANTUM_CASE(32)
-
-#if (QUANTUM_2POW_MIN <= 3)
- POW2_CASE(9)
-#endif
- POW2_CASE(10)
- POW2_CASE(11)
- POW2_CASE(12) /* Handle up to 8 kB pages. */
- default:
+ if ((size & (size - 1)) == 0) {
+ /*
+ * log2_table allows fast division of a power of two in the
+ * [1..128] range.
+ *
+ * (x / divisor) becomes (x >> log2_table[divisor - 1]).
+ */
+ static const unsigned char log2_table[] = {
+ 0, 1, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+ };
+
+ if (size <= 128)
+ regind = (diff >> log2_table[size - 1]);
+ else if (size <= 32768)
+ regind = diff >> (8 + log2_table[(size >> 8) - 1]);
+ else {
+ /*
+ * The page size is too large for us to use the lookup
+ * table. Use real division.
+ */
regind = diff / size;
- }
-#undef POW2_CASE
-#undef QUANTUM_CASE
+ }
+ } else if (size <= ((sizeof(size_invs) / sizeof(unsigned))
+ << QUANTUM_2POW_MIN) + 2) {
+ regind = size_invs[(size >> QUANTUM_2POW_MIN) - 3] * diff;
+ regind >>= SIZE_INV_SHIFT;
+ } else {
+ /*
+ * size_invs isn't large enough to handle this size class, so
+ * calculate regind using actual division. This only happens
+ * if the user increases small_max via the 'S' runtime
+ * configuration option.
+ */
+ regind = diff / size;
+ };
+ assert(regind == diff / size);
assert(regind < bin->nregs);
- assert(regind * size == diff);
elm = regind >> (SIZEOF_INT_2POW + 3);
if (elm < run->regs_minelm)
@@ -1647,6 +1646,8 @@ arena_run_reg_dalloc(arena_run_t *run, arena_bin_t *bin, void *ptr, size_t size)
bit = regind - (elm << (SIZEOF_INT_2POW + 3));
assert((run->regs_mask[elm] & (1 << bit)) == 0);
run->regs_mask[elm] |= (1 << bit);
+#undef SIZE_INV
+#undef SIZE_INV_SHIFT
}
static void
@@ -2278,8 +2279,7 @@ arena_ralloc(void *ptr, size_t size, size_t oldsize)
== (QUANTUM_CEILING(oldsize) >> opt_quantum_2pow))
goto IN_PLACE;
} else {
- if (oldsize > small_max &&
- pow2_ceil(size) == pow2_ceil(oldsize))
+ if (oldsize > small_max && pow2_ceil(size) == oldsize)
goto IN_PLACE;
}
@@ -2663,29 +2663,36 @@ static void *
ipalloc(size_t alignment, size_t size)
{
void *ret;
- size_t pow2_size;
+ size_t alloc_size;
/*
- * Round up to the nearest power of two that is >= alignment and
- * >= size.
+ * Take advantage of the fact that for each size class, every object is
+ * aligned at the smallest power of two that is non-zero in the base
+ * two representation of the size. For example:
+ *
+ * Size | Base 2 | Minimum alignment
+ * -----+----------+------------------
+ * 96 | 1100000 | 32
+ * 144 | 10100000 | 32
+ * 192 | 11000000 | 64
+ *
+ * Depending on runtime settings, it is possible that arena_malloc()
+ * will further round up to a power of two, but that never causes
+ * correctness issues.
*/
- if (size > alignment)
- pow2_size = pow2_ceil(size);
- else
- pow2_size = alignment;
- pow2_size = QUANTUM_CEILING(pow2_size);
- if (pow2_size < size) {
+ alloc_size = (size + (alignment - 1)) & (-alignment);
+ if (alloc_size < size) {
/* size_t overflow. */
return (NULL);
}
- if (pow2_size <= arena_maxclass)
- ret = arena_malloc(choose_arena(), pow2_size);
+ if (alloc_size <= arena_maxclass)
+ ret = arena_malloc(choose_arena(), alloc_size);
else {
if (alignment <= chunk_size)
ret = huge_malloc(size);
else {
- size_t chunksize, alloc_size, offset;
+ size_t chunksize, offset;
chunk_node_t *node;
/*
OpenPOWER on IntegriCloud