author    jasone <jasone@FreeBSD.org>    2008-08-27 02:00:53 +0000
committer jasone <jasone@FreeBSD.org>    2008-08-27 02:00:53 +0000
commit    a734052e9cc1d5fcb1f68103dcaf5e4c9324994d (patch)
tree      0e5399c773ca24c377f8a89207f8b7bf90716b12 /lib
parent    cad1568d07cb3426f863ebdd3ef86f0befecc497 (diff)
Add thread-specific caching for small size classes, based on magazines.
This caching allows for completely lock-free allocation/deallocation in the
steady state, at the expense of likely increased memory use and fragmentation.

Reduce the default number of arenas to 2*ncpus, since thread-specific caching
typically reduces arena contention.

Modify size class spacing to include ranges of 2^n-spaced, quantum-spaced,
cacheline-spaced, and subpage-spaced size classes.  The advantages are: fewer
size classes, reduced false cacheline sharing, and reduced internal
fragmentation for allocations that are slightly over 512, 1024, etc.

Increase RUN_MAX_SMALL, in order to limit fragmentation for the
subpage-spaced size classes.

Add a size-->bin lookup table for small sizes to simplify translating sizes
to size classes.  Include a hard-coded constant table that is used unless
custom size class spacing is specified at run time.

Add the ability to disable tiny size classes at compile time via MALLOC_TINY.
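As an editorial aside, here is a minimal standalone sketch of the four-tier
size-class rounding described above, assuming the patch's default constants
for a 64-bit system with 4 kB pages; the helper name small_size_class is
hypothetical and not part of the patch:

	#include <stddef.h>

	/* Default tier boundaries from the patch (64-bit, 4 kB pages). */
	#define	QUANTUM		16	/* 1 << QUANTUM_2POW */
	#define	QSPACE_MAX	128	/* 1 << QSPACE_MAX_2POW_DEFAULT */
	#define	CACHELINE	64	/* 1 << CACHELINE_2POW */
	#define	CSPACE_MAX	512	/* 1 << CSPACE_MAX_2POW_DEFAULT */
	#define	SUBPAGE		256	/* 1 << SUBPAGE_2POW */

	/* Hypothetical helper: round a small request up to its size class. */
	static size_t
	small_size_class(size_t size)
	{
		size_t ret;

		if (size <= QUANTUM / 2) {
			/* Tiny: nearest power of two (2, 4, or 8). */
			for (ret = 2; ret < size; ret <<= 1)
				;
			return (ret);
		}
		if (size <= QSPACE_MAX)	/* Quantum-spaced: 16..128. */
			return ((size + QUANTUM - 1) & ~(size_t)(QUANTUM - 1));
		if (size <= CSPACE_MAX)	/* Cacheline-spaced: 192..512. */
			return ((size + CACHELINE - 1) &
			    ~(size_t)(CACHELINE - 1));
		/* Subpage-spaced: 768..3840. */
		return ((size + SUBPAGE - 1) & ~(size_t)(SUBPAGE - 1));
	}

Under this scheme a 600-byte request lands in the 768-byte class, whereas the
previous power-of-two spacing would have rounded it up to 1024.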
Diffstat (limited to 'lib')
-rw-r--r--  lib/libc/include/libc_private.h  |    6
-rw-r--r--  lib/libc/stdlib/Symbol.map       |    1
-rw-r--r--  lib/libc/stdlib/malloc.3         |   77
-rw-r--r--  lib/libc/stdlib/malloc.c         | 1311
-rw-r--r--  lib/libthr/thread/thr_exit.c     |    4
5 files changed, 1143 insertions(+), 256 deletions(-)
diff --git a/lib/libc/include/libc_private.h b/lib/libc/include/libc_private.h
index 70e1ef8..50903f3 100644
--- a/lib/libc/include/libc_private.h
+++ b/lib/libc/include/libc_private.h
@@ -158,6 +158,12 @@ void _set_tp(void *tp);
extern const char *__progname;
/*
+ * This function is used by the threading libraries to notify malloc that a
+ * thread is exiting.
+ */
+void _malloc_thread_cleanup(void);
+
+/*
* These functions are used by the threading libraries in order to protect
* malloc across fork().
*/
diff --git a/lib/libc/stdlib/Symbol.map b/lib/libc/stdlib/Symbol.map
index 10dff7e..d159bed 100644
--- a/lib/libc/stdlib/Symbol.map
+++ b/lib/libc/stdlib/Symbol.map
@@ -93,6 +93,7 @@ FBSD_1.0 {
};
FBSDprivate_1.0 {
+ _malloc_thread_cleanup;
_malloc_prefork;
_malloc_postfork;
__system;
diff --git a/lib/libc/stdlib/malloc.3 b/lib/libc/stdlib/malloc.3
index 7cd04de..994038d 100644
--- a/lib/libc/stdlib/malloc.3
+++ b/lib/libc/stdlib/malloc.3
@@ -32,7 +32,7 @@
.\" @(#)malloc.3 8.1 (Berkeley) 6/4/93
.\" $FreeBSD$
.\"
-.Dd February 17, 2008
+.Dd August 26, 2008
.Dt MALLOC 3
.Os
.Sh NAME
@@ -154,7 +154,7 @@ should not be depended on, since such behavior is entirely
implementation-dependent.
.Sh TUNING
Once, when the first call is made to one of these memory allocation
-routines, various flags will be set or reset, which affect the
+routines, various flags will be set or reset, which affects the
workings of this allocator implementation.
.Pp
The
@@ -196,6 +196,11 @@ it should be to contention over arenas.
Therefore, some applications may benefit from increasing or decreasing this
threshold parameter.
This option is not available for some configurations (non-PIC).
+.It C
+Double/halve the size of the maximum size class that is a multiple of the
+cacheline size (64).
+Above this size, subpage spacing (256 bytes) is used for size classes.
+The default value is 512 bytes.
.It D
Use
.Xr sbrk 2
@@ -214,6 +219,16 @@ physical memory becomes scarce and the pages remain unused.
The default is 512 pages per arena;
.Ev MALLOC_OPTIONS=10f
will prevent any dirty unused pages from accumulating.
+.It G
+When there are multiple threads, use thread-specific caching for objects that
+are smaller than one page.
+This option is enabled by default.
+Thread-specific caching allows many allocations to be satisfied without
+performing any thread synchronization, at the cost of increased memory use.
+See the
+.Dq R
+option for related tuning information.
+This option is not available for some configurations (non-PIC).
.It J
Each byte of new memory allocated by
.Fn malloc ,
@@ -248,7 +263,7 @@ option is implicitly enabled in order to assure that there is a method for
acquiring memory.
.It N
Double/halve the number of arenas.
-The default number of arenas is four times the number of CPUs, or one if there
+The default number of arenas is two times the number of CPUs, or one if there
is a single CPU.
.It P
Various statistics are printed at program exit via an
@@ -259,14 +274,18 @@ while one or more threads are executing in the memory allocation functions.
Therefore, this option should only be used with care; it is primarily intended
as a performance tuning aid during application development.
.It Q
-Double/halve the size of the allocation quantum.
-The default quantum is the minimum allowed by the architecture (typically 8 or
-16 bytes).
-.It S
Double/halve the size of the maximum size class that is a multiple of the
-quantum.
-Above this size, power-of-two spacing is used for size classes.
-The default value is 512 bytes.
+quantum (8 or 16 bytes, depending on architecture).
+Above this size, cacheline spacing is used for size classes.
+The default value is 128 bytes.
+.It R
+Double/halve magazine size, which approximately doubles/halves the number of
+rounds in each magazine.
+Magazines are used by the thread-specific caching machinery to acquire and
+release objects in bulk.
+Increasing the magazine size decreases locking overhead, at the expense of
+increased memory usage.
+This option is not available for some configurations (non-PIC).
.It U
Generate
.Dq utrace
@@ -358,6 +377,13 @@ improve performance, mainly due to reduced cache performance.
However, it may make sense to reduce the number of arenas if an application
does not make much use of the allocation functions.
.Pp
+In addition to multiple arenas, this allocator supports thread-specific
+caching for small objects (smaller than one page), in order to make it
+possible to completely avoid synchronization for most small allocation requests.
+Such caching allows very fast allocation in the common case, but it increases
+memory usage and fragmentation, since a bounded number of objects can remain
+allocated in each thread cache.
+.Pp
Memory is conceptually broken into equal-sized chunks, where the chunk size is
a power of two that is greater than the page size.
Chunks are always aligned to multiples of the chunk size.
@@ -366,7 +392,7 @@ quickly.
.Pp
User objects are broken into three categories according to size: small, large,
and huge.
-Small objects are no larger than one half of a page.
+Small objects are smaller than one page.
Large objects are smaller than the chunk size.
Huge objects are a multiple of the chunk size.
Small and large objects are managed by arenas; huge objects are managed
@@ -378,23 +404,24 @@ Each chunk that is managed by an arena tracks its contents as runs of
contiguous pages (unused, backing a set of small objects, or backing one large
object).
The combination of chunk alignment and chunk page maps makes it possible to
-determine all metadata regarding small and large allocations in
-constant and logarithmic time, respectively.
+determine all metadata regarding small and large allocations in constant time.
.Pp
Small objects are managed in groups by page runs.
Each run maintains a bitmap that tracks which regions are in use.
-Allocation requests that are no more than half the quantum (see the
-.Dq Q
-option) are rounded up to the nearest power of two (typically 2, 4, or 8).
+Allocation requests that are no more than half the quantum (8 or 16, depending
+on architecture) are rounded up to the nearest power of two.
Allocation requests that are more than half the quantum, but no more than the
-maximum quantum-multiple size class (see the
-.Dq S
+minimum cacheline-multiple size class (see the
+.Dq Q
option) are rounded up to the nearest multiple of the quantum.
-Allocation requests that are larger than the maximum quantum-multiple size
-class, but no larger than one half of a page, are rounded up to the nearest
-power of two.
-Allocation requests that are larger than half of a page, but small enough to
-fit in an arena-managed chunk (see the
+Allocation requests that are more than the minimum cacheline-multiple size
+class, but no more than the minimum subpage-multiple size class (see the
+.Dq C
+option) are rounded up to the nearest multiple of the cacheline size (64).
+Allocation requests that are more than the minimum subpage-multiple size class
+are rounded up to the nearest multiple of the subpage size (256).
+Allocation requests that are more than one page, but small enough to fit in
+an arena-managed chunk (see the
.Dq K
option), are rounded up to the nearest run size.
Allocation requests that are too large to fit in an arena-managed chunk are
@@ -402,8 +429,8 @@ rounded up to the nearest multiple of the chunk size.
.Pp
Allocations are packed tightly together, which can be an issue for
multi-threaded applications.
-If you need to assure that allocations do not suffer from cache line sharing,
-round your allocation requests up to the nearest multiple of the cache line
+If you need to assure that allocations do not suffer from cacheline sharing,
+round your allocation requests up to the nearest multiple of the cacheline
size.
.Sh DEBUGGING MALLOC PROBLEMS
The first thing to do is to set the
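To see the documented rounding in action, a small test program (an editorial
illustration, not part of the patch) can compare request sizes against
malloc_usable_size(3); with the default tunables one would expect output
along the lines of 100 -> 112 (quantum-spaced), 200 -> 256
(cacheline-spaced), and 600 -> 768 (subpage-spaced), though the exact classes
depend on the options above:

	#include <stdio.h>
	#include <stdlib.h>

	#include <malloc_np.h>	/* FreeBSD header for malloc_usable_size(). */

	int
	main(void)
	{
		size_t sizes[] = { 7, 100, 200, 600, 5000 };
		size_t i;

		for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
			void *p = malloc(sizes[i]);

			if (p == NULL)
				return (1);
			printf("request %4zu -> usable %4zu\n", sizes[i],
			    malloc_usable_size(p));
			free(p);
		}
		return (0);
	}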
diff --git a/lib/libc/stdlib/malloc.c b/lib/libc/stdlib/malloc.c
index b36dbc7..a32ffa7 100644
--- a/lib/libc/stdlib/malloc.c
+++ b/lib/libc/stdlib/malloc.c
@@ -35,6 +35,9 @@
* + Multiple arenas are used if there are multiple CPUs, which reduces lock
* contention and cache sloshing.
*
+ * + Thread-specific caching is used if there are multiple threads, which
+ * reduces the amount of locking.
+ *
* + Cache line sharing between arenas is avoided for internal data
* structures.
*
@@ -48,37 +51,49 @@
* and a 16 byte quantum on a 32-bit system, the size classes in each category
* are as follows:
*
- * |=====================================|
- * | Category | Subcategory | Size |
- * |=====================================|
- * | Small | Tiny | 2 |
- * | | | 4 |
- * | | | 8 |
- * | |----------------+---------|
- * | | Quantum-spaced | 16 |
- * | | | 32 |
- * | | | 48 |
- * | | | ... |
- * | | | 480 |
- * | | | 496 |
- * | | | 512 |
- * | |----------------+---------|
- * | | Sub-page | 1 kB |
- * | | | 2 kB |
- * |=====================================|
- * | Large | 4 kB |
- * | | 8 kB |
- * | | 12 kB |
- * | | ... |
- * | | 1012 kB |
- * | | 1016 kB |
- * | | 1020 kB |
- * |=====================================|
- * | Huge | 1 MB |
- * | | 2 MB |
- * | | 3 MB |
- * | | ... |
- * |=====================================|
+ * |=======================================|
+ * | Category | Subcategory | Size |
+ * |=======================================|
+ * | Small | Tiny | 2 |
+ * | | | 4 |
+ * | | | 8 |
+ * | |------------------+---------|
+ * | | Quantum-spaced | 16 |
+ * | | | 32 |
+ * | | | 48 |
+ * | | | ... |
+ * | | | 96 |
+ * | | | 112 |
+ * | | | 128 |
+ * | |------------------+---------|
+ * | | Cacheline-spaced | 192 |
+ * | | | 256 |
+ * | | | 320 |
+ * | | | 384 |
+ * | | | 448 |
+ * | | | 512 |
+ * | |------------------+---------|
+ * | | Sub-page | 768 |
+ * | | | 1024 |
+ * | | | 1280 |
+ * | | | ... |
+ * | | | 3328 |
+ * | | | 3584 |
+ * | | | 3840 |
+ * |=======================================|
+ * | Large | 4 kB |
+ * | | 8 kB |
+ * | | 12 kB |
+ * | | ... |
+ * | | 1012 kB |
+ * | | 1016 kB |
+ * | | 1020 kB |
+ * |=======================================|
+ * | Huge | 1 MB |
+ * | | 2 MB |
+ * | | 3 MB |
+ * | | ... |
+ * |=======================================|
*
* A different mechanism is used for each category:
*
@@ -113,6 +128,19 @@
#endif
/*
+ * MALLOC_TINY enables support for tiny objects, which are smaller than one
+ * quantum.
+ */
+#define MALLOC_TINY
+
+/*
+ * MALLOC_MAG enables a magazine-based thread-specific caching layer for small
+ * objects. This makes it possible to allocate/deallocate objects without any
+ * locking when the cache is in the steady state.
+ */
+#define MALLOC_MAG
+
+/*
* MALLOC_BALANCE enables monitoring of arena lock contention and dynamically
* re-balances arena load if exponentially averaged contention exceeds a
* certain threshold.
@@ -184,46 +212,63 @@ __FBSDID("$FreeBSD$");
/* Size of stack-allocated buffer passed to strerror_r(). */
#define STRERROR_BUF 64
-/* Minimum alignment of allocations is 2^QUANTUM_2POW_MIN bytes. */
+/*
+ * The const_size2bin table is sized according to PAGESIZE_2POW, but for
+ * correctness reasons, we never assume that
+ * (pagesize == (1U << PAGESIZE_2POW)).
+ *
+ * Minimum alignment of allocations is 2^QUANTUM_2POW bytes.
+ */
#ifdef __i386__
-# define QUANTUM_2POW_MIN 4
+# define PAGESIZE_2POW 12
+# define QUANTUM_2POW 4
# define SIZEOF_PTR_2POW 2
# define CPU_SPINWAIT __asm__ volatile("pause")
#endif
#ifdef __ia64__
-# define QUANTUM_2POW_MIN 4
+# define PAGESIZE_2POW 12
+# define QUANTUM_2POW 4
# define SIZEOF_PTR_2POW 3
#endif
#ifdef __alpha__
-# define QUANTUM_2POW_MIN 4
+# define PAGESIZE_2POW 13
+# define QUANTUM_2POW 4
# define SIZEOF_PTR_2POW 3
# define NO_TLS
#endif
#ifdef __sparc64__
-# define QUANTUM_2POW_MIN 4
+# define PAGESIZE_2POW 13
+# define QUANTUM_2POW 4
# define SIZEOF_PTR_2POW 3
# define NO_TLS
#endif
#ifdef __amd64__
-# define QUANTUM_2POW_MIN 4
+# define PAGESIZE_2POW 12
+# define QUANTUM_2POW 4
# define SIZEOF_PTR_2POW 3
# define CPU_SPINWAIT __asm__ volatile("pause")
#endif
#ifdef __arm__
-# define QUANTUM_2POW_MIN 3
+# define PAGESIZE_2POW 12
+# define QUANTUM_2POW 3
# define SIZEOF_PTR_2POW 2
# define NO_TLS
#endif
#ifdef __mips__
-# define QUANTUM_2POW_MIN 3
+# define PAGESIZE_2POW 12
+# define QUANTUM_2POW 3
# define SIZEOF_PTR_2POW 2
# define NO_TLS
#endif
#ifdef __powerpc__
-# define QUANTUM_2POW_MIN 4
+# define PAGESIZE_2POW 12
+# define QUANTUM_2POW 4
# define SIZEOF_PTR_2POW 2
#endif
+#define QUANTUM ((size_t)(1U << QUANTUM_2POW))
+#define QUANTUM_MASK (QUANTUM - 1)
+
#define SIZEOF_PTR (1U << SIZEOF_PTR_2POW)
/* sizeof(int) == (1U << SIZEOF_INT_2POW). */
@@ -237,6 +282,10 @@ __FBSDID("$FreeBSD$");
#endif
#ifdef NO_TLS
+ /* MALLOC_MAG requires TLS. */
+# ifdef MALLOC_MAG
+# undef MALLOC_MAG
+# endif
/* MALLOC_BALANCE requires TLS. */
# ifdef MALLOC_BALANCE
# undef MALLOC_BALANCE
@@ -253,23 +302,42 @@ __FBSDID("$FreeBSD$");
#define DIRTY_MAX_DEFAULT (1U << 9)
/*
- * Maximum size of L1 cache line. This is used to avoid cache line aliasing,
- * so over-estimates are okay (up to a point), but under-estimates will
- * negatively affect performance.
+ * Maximum size of L1 cache line. This is used to avoid cache line aliasing.
+ * In addition, this controls the spacing of cacheline-spaced size classes.
*/
#define CACHELINE_2POW 6
#define CACHELINE ((size_t)(1U << CACHELINE_2POW))
+#define CACHELINE_MASK (CACHELINE - 1)
-/* Smallest size class to support. */
-#define TINY_MIN_2POW 1
+/*
+ * Subpages are an artificially designated partitioning of pages. Their only
+ * purpose is to support subpage-spaced size classes.
+ *
+ * There must be at least 4 subpages per page, due to the way size classes are
+ * handled.
+ */
+#define SUBPAGE_2POW 8
+#define SUBPAGE ((size_t)(1U << SUBPAGE_2POW))
+#define SUBPAGE_MASK (SUBPAGE - 1)
+
+#ifdef MALLOC_TINY
+ /* Smallest size class to support. */
+# define TINY_MIN_2POW 1
+#endif
/*
* Maximum size class that is a multiple of the quantum, but not (necessarily)
* a power of 2. Above this size, allocations are rounded up to the nearest
* power of 2.
*/
-#define SMALL_MAX_2POW_DEFAULT 9
-#define SMALL_MAX_DEFAULT (1U << SMALL_MAX_2POW_DEFAULT)
+#define QSPACE_MAX_2POW_DEFAULT 7
+
+/*
+ * Maximum size class that is a multiple of the cacheline, but not (necessarily)
+ * a power of 2. Above this size, allocations are rounded up to the nearest
+ * power of 2.
+ */
+#define CSPACE_MAX_2POW_DEFAULT 9
/*
* RUN_MAX_OVRHD indicates maximum desired run header overhead. Runs are sized
@@ -293,8 +361,7 @@ __FBSDID("$FreeBSD$");
#define RUN_MAX_OVRHD_RELAX 0x00001800U
/* Put a cap on small object run size. This overrides RUN_MAX_OVRHD. */
-#define RUN_MAX_SMALL_2POW 15
-#define RUN_MAX_SMALL (1U << RUN_MAX_SMALL_2POW)
+#define RUN_MAX_SMALL (12 * pagesize)
/*
* Hyper-threaded CPUs may need a special instruction inside spin loops in
@@ -319,6 +386,15 @@ __FBSDID("$FreeBSD$");
*/
#define BLOCK_COST_2POW 4
+#ifdef MALLOC_MAG
+ /*
+ * Default magazine size, in bytes. max_rounds is calculated to make
+ * optimal use of the space, leaving just enough room for the magazine
+ * header.
+ */
+# define MAG_SIZE_2POW_DEFAULT 9
+#endif
+
#ifdef MALLOC_BALANCE
/*
* We use an exponential moving average to track recent lock contention,
@@ -369,6 +445,11 @@ struct malloc_bin_stats_s {
*/
uint64_t nrequests;
+#ifdef MALLOC_MAG
+ /* Number of magazine reloads from this bin. */
+ uint64_t nmags;
+#endif
+
/* Total number of runs created for this bin's size class. */
uint64_t nruns;
@@ -678,6 +759,35 @@ struct arena_s {
/******************************************************************************/
/*
+ * Magazine data structures.
+ */
+
+#ifdef MALLOC_MAG
+typedef struct mag_s mag_t;
+struct mag_s {
+ size_t binind; /* Index of associated bin. */
+ size_t nrounds;
+ void *rounds[1]; /* Dynamically sized. */
+};
+
+/*
+ * Magazines are lazily allocated, but once created, they remain until the
+ * associated mag_rack is destroyed.
+ */
+typedef struct bin_mags_s bin_mags_t;
+struct bin_mags_s {
+ mag_t *curmag;
+ mag_t *sparemag;
+};
+
+typedef struct mag_rack_s mag_rack_t;
+struct mag_rack_s {
+ bin_mags_t bin_mags[1]; /* Dynamically sized. */
+};
+#endif
+
+/******************************************************************************/
+/*
* Data.
*/
@@ -690,16 +800,147 @@ static size_t pagesize_mask;
static size_t pagesize_2pow;
/* Various bin-related settings. */
-static size_t bin_maxclass; /* Max size class for bins. */
-static unsigned ntbins; /* Number of (2^n)-spaced tiny bins. */
+#ifdef MALLOC_TINY /* Number of (2^n)-spaced tiny bins. */
+# define ntbins ((unsigned)(QUANTUM_2POW - TINY_MIN_2POW))
+#else
+# define ntbins 0
+#endif
static unsigned nqbins; /* Number of quantum-spaced bins. */
-static unsigned nsbins; /* Number of (2^n)-spaced sub-page bins. */
-static size_t small_min;
-static size_t small_max;
-
-/* Various quantum-related settings. */
-static size_t quantum;
-static size_t quantum_mask; /* (quantum - 1). */
+static unsigned ncbins; /* Number of cacheline-spaced bins. */
+static unsigned nsbins; /* Number of subpage-spaced bins. */
+static unsigned nbins;
+#ifdef MALLOC_TINY
+# define tspace_max ((size_t)(QUANTUM >> 1))
+#endif
+#define qspace_min QUANTUM
+static size_t qspace_max;
+static size_t cspace_min;
+static size_t cspace_max;
+static size_t sspace_min;
+static size_t sspace_max;
+#define bin_maxclass sspace_max
+
+static uint8_t const *size2bin;
+/*
+ * const_size2bin is a static constant lookup table that in the common case can
+ * be used as-is for size2bin. For dynamically linked programs, this avoids
+ * a page of memory overhead per process.
+ */
+#define S2B_1(i) i,
+#define S2B_2(i) S2B_1(i) S2B_1(i)
+#define S2B_4(i) S2B_2(i) S2B_2(i)
+#define S2B_8(i) S2B_4(i) S2B_4(i)
+#define S2B_16(i) S2B_8(i) S2B_8(i)
+#define S2B_32(i) S2B_16(i) S2B_16(i)
+#define S2B_64(i) S2B_32(i) S2B_32(i)
+#define S2B_128(i) S2B_64(i) S2B_64(i)
+#define S2B_256(i) S2B_128(i) S2B_128(i)
+static const uint8_t const_size2bin[(1U << PAGESIZE_2POW) - 255] = {
+ S2B_1(0xffU) /* 0 */
+#if (QUANTUM_2POW == 4)
+/* 64-bit system ************************/
+# ifdef MALLOC_TINY
+ S2B_2(0) /* 2 */
+ S2B_2(1) /* 4 */
+ S2B_4(2) /* 8 */
+ S2B_8(3) /* 16 */
+# define S2B_QMIN 3
+# else
+ S2B_16(0) /* 16 */
+# define S2B_QMIN 0
+# endif
+ S2B_16(S2B_QMIN + 1) /* 32 */
+ S2B_16(S2B_QMIN + 2) /* 48 */
+ S2B_16(S2B_QMIN + 3) /* 64 */
+ S2B_16(S2B_QMIN + 4) /* 80 */
+ S2B_16(S2B_QMIN + 5) /* 96 */
+ S2B_16(S2B_QMIN + 6) /* 112 */
+ S2B_16(S2B_QMIN + 7) /* 128 */
+# define S2B_CMIN (S2B_QMIN + 8)
+#else
+/* 32-bit system ************************/
+# ifdef MALLOC_TINY
+ S2B_2(0) /* 2 */
+ S2B_2(1) /* 4 */
+ S2B_4(2) /* 8 */
+# define S2B_QMIN 2
+# else
+ S2B_8(0) /* 8 */
+# define S2B_QMIN 0
+# endif
+ S2B_8(S2B_QMIN + 1) /* 16 */
+ S2B_8(S2B_QMIN + 2) /* 24 */
+ S2B_8(S2B_QMIN + 3) /* 32 */
+ S2B_8(S2B_QMIN + 4) /* 40 */
+ S2B_8(S2B_QMIN + 5) /* 48 */
+ S2B_8(S2B_QMIN + 6) /* 56 */
+ S2B_8(S2B_QMIN + 7) /* 64 */
+ S2B_8(S2B_QMIN + 8) /* 72 */
+ S2B_8(S2B_QMIN + 9) /* 80 */
+ S2B_8(S2B_QMIN + 10) /* 88 */
+ S2B_8(S2B_QMIN + 11) /* 96 */
+ S2B_8(S2B_QMIN + 12) /* 104 */
+ S2B_8(S2B_QMIN + 13) /* 112 */
+ S2B_8(S2B_QMIN + 14) /* 120 */
+ S2B_8(S2B_QMIN + 15) /* 128 */
+# define S2B_CMIN (S2B_QMIN + 16)
+#endif
+/****************************************/
+ S2B_64(S2B_CMIN + 0) /* 192 */
+ S2B_64(S2B_CMIN + 1) /* 256 */
+ S2B_64(S2B_CMIN + 2) /* 320 */
+ S2B_64(S2B_CMIN + 3) /* 384 */
+ S2B_64(S2B_CMIN + 4) /* 448 */
+ S2B_64(S2B_CMIN + 5) /* 512 */
+# define S2B_SMIN (S2B_CMIN + 6)
+ S2B_256(S2B_SMIN + 0) /* 768 */
+ S2B_256(S2B_SMIN + 1) /* 1024 */
+ S2B_256(S2B_SMIN + 2) /* 1280 */
+ S2B_256(S2B_SMIN + 3) /* 1536 */
+ S2B_256(S2B_SMIN + 4) /* 1792 */
+ S2B_256(S2B_SMIN + 5) /* 2048 */
+ S2B_256(S2B_SMIN + 6) /* 2304 */
+ S2B_256(S2B_SMIN + 7) /* 2560 */
+ S2B_256(S2B_SMIN + 8) /* 2816 */
+ S2B_256(S2B_SMIN + 9) /* 3072 */
+ S2B_256(S2B_SMIN + 10) /* 3328 */
+ S2B_256(S2B_SMIN + 11) /* 3584 */
+ S2B_256(S2B_SMIN + 12) /* 3840 */
+#if (PAGESIZE_2POW == 13)
+ S2B_256(S2B_SMIN + 13) /* 4096 */
+ S2B_256(S2B_SMIN + 14) /* 4352 */
+ S2B_256(S2B_SMIN + 15) /* 4608 */
+ S2B_256(S2B_SMIN + 16) /* 4864 */
+ S2B_256(S2B_SMIN + 17) /* 5120 */
+ S2B_256(S2B_SMIN + 18) /* 5376 */
+ S2B_256(S2B_SMIN + 19) /* 5632 */
+ S2B_256(S2B_SMIN + 20) /* 5888 */
+ S2B_256(S2B_SMIN + 21) /* 6144 */
+ S2B_256(S2B_SMIN + 22) /* 6400 */
+ S2B_256(S2B_SMIN + 23) /* 6656 */
+ S2B_256(S2B_SMIN + 24) /* 6912 */
+ S2B_256(S2B_SMIN + 25) /* 7168 */
+ S2B_256(S2B_SMIN + 26) /* 7424 */
+ S2B_256(S2B_SMIN + 27) /* 7680 */
+ S2B_256(S2B_SMIN + 28) /* 7936 */
+#endif
+};
+#undef S2B_1
+#undef S2B_2
+#undef S2B_4
+#undef S2B_8
+#undef S2B_16
+#undef S2B_32
+#undef S2B_64
+#undef S2B_128
+#undef S2B_256
+#undef S2B_QMIN
+#undef S2B_CMIN
+#undef S2B_SMIN
+
+#ifdef MALLOC_MAG
+static size_t max_rounds;
+#endif
/* Various chunk-related settings. */
static size_t chunksize;
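An aside on reading the const_size2bin table above: each S2B_n(i) macro
simply repeats the bin index i n times, so every byte size up to bin_maxclass
maps directly to its bin and a lookup is a single load. A self-contained
sketch of one expansion (editorial, mirroring the tiny/quantum entries for
the 64-bit layout with MALLOC_TINY):

	#include <assert.h>
	#include <stdint.h>

	#define	S2B_1(i)	i,
	#define	S2B_2(i)	S2B_1(i) S2B_1(i)
	#define	S2B_4(i)	S2B_2(i) S2B_2(i)
	#define	S2B_8(i)	S2B_4(i) S2B_4(i)

	static const uint8_t demo[] = {
		S2B_1(0xffU)	/* size 0: invalid */
		S2B_2(0)	/* sizes 1..2  -> bin 0 (2 bytes) */
		S2B_2(1)	/* sizes 3..4  -> bin 1 (4 bytes) */
		S2B_4(2)	/* sizes 5..8  -> bin 2 (8 bytes) */
		S2B_8(3)	/* sizes 9..16 -> bin 3 (16 bytes) */
	};

	int
	main(void)
	{
		/* A size-to-bin lookup is a single table load. */
		assert(demo[10] == 3 && demo[16] == 3);
		assert(sizeof(demo) == 17);
		return (0);
	}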
@@ -796,6 +1037,14 @@ static pthread_mutex_t arenas_lock; /* Protects arenas initialization. */
static __thread arena_t *arenas_map;
#endif
+#ifdef MALLOC_MAG
+/*
+ * Map of thread-specific magazine racks, used for thread-specific object
+ * caching.
+ */
+static __thread mag_rack_t *mag_rack;
+#endif
+
#ifdef MALLOC_STATS
/* Chunk statistics. */
static chunk_stats_t stats_chunks;
@@ -818,13 +1067,17 @@ static bool opt_junk = false;
static bool opt_dss = true;
static bool opt_mmap = true;
#endif
+#ifdef MALLOC_MAG
+static bool opt_mag = true;
+static size_t opt_mag_size_2pow = MAG_SIZE_2POW_DEFAULT;
+#endif
static size_t opt_dirty_max = DIRTY_MAX_DEFAULT;
#ifdef MALLOC_BALANCE
static uint64_t opt_balance_threshold = BALANCE_THRESHOLD_DEFAULT;
#endif
static bool opt_print_stats = false;
-static size_t opt_quantum_2pow = QUANTUM_2POW_MIN;
-static size_t opt_small_max_2pow = SMALL_MAX_2POW_DEFAULT;
+static size_t opt_qspace_max_2pow = QSPACE_MAX_2POW_DEFAULT;
+static size_t opt_cspace_max_2pow = CSPACE_MAX_2POW_DEFAULT;
static size_t opt_chunk_2pow = CHUNK_2POW_DEFAULT;
static bool opt_utrace = false;
static bool opt_sysv = false;
@@ -902,15 +1155,21 @@ static void arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk,
static void arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk,
arena_run_t *run, size_t oldsize, size_t newsize, bool dirty);
static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin);
-static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin);
-static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size);
+static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin);
+static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size);
#ifdef MALLOC_BALANCE
static void arena_lock_balance_hard(arena_t *arena);
#endif
+#ifdef MALLOC_MAG
+static void mag_load(mag_t *mag);
+#endif
static void *arena_malloc_large(arena_t *arena, size_t size, bool zero);
static void *arena_palloc(arena_t *arena, size_t alignment, size_t size,
size_t alloc_size);
static size_t arena_salloc(const void *ptr);
+#ifdef MALLOC_MAG
+static void mag_unload(mag_t *mag);
+#endif
static void arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk,
void *ptr);
static void arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk,
@@ -921,11 +1180,22 @@ static bool arena_ralloc_large(void *ptr, size_t size, size_t oldsize);
static void *arena_ralloc(void *ptr, size_t size, size_t oldsize);
static bool arena_new(arena_t *arena);
static arena_t *arenas_extend(unsigned ind);
+#ifdef MALLOC_MAG
+static mag_t *mag_create(arena_t *arena, size_t binind);
+static void mag_destroy(mag_t *mag);
+static mag_rack_t *mag_rack_create(arena_t *arena);
+static void mag_rack_destroy(mag_rack_t *rack);
+#endif
static void *huge_malloc(size_t size, bool zero);
static void *huge_palloc(size_t alignment, size_t size);
static void *huge_ralloc(void *ptr, size_t size, size_t oldsize);
static void huge_dalloc(void *ptr);
static void malloc_print_stats(void);
+#ifdef MALLOC_DEBUG
+static void size2bin_validate(void);
+#endif
+static bool size2bin_init(void);
+static bool size2bin_init_hard(void);
static bool malloc_init_hard(void);
/*
@@ -1063,18 +1333,23 @@ malloc_spin_unlock(pthread_mutex_t *lock)
#define CHUNK_CEILING(s) \
(((s) + chunksize_mask) & ~chunksize_mask)
+/* Return the smallest quantum multiple that is >= a. */
+#define QUANTUM_CEILING(a) \
+ (((a) + QUANTUM_MASK) & ~QUANTUM_MASK)
+
/* Return the smallest cacheline multiple that is >= s. */
#define CACHELINE_CEILING(s) \
- (((s) + (CACHELINE - 1)) & ~(CACHELINE - 1))
+ (((s) + CACHELINE_MASK) & ~CACHELINE_MASK)
-/* Return the smallest quantum multiple that is >= a. */
-#define QUANTUM_CEILING(a) \
- (((a) + quantum_mask) & ~quantum_mask)
+/* Return the smallest subpage multiple that is >= s. */
+#define SUBPAGE_CEILING(s) \
+ (((s) + SUBPAGE_MASK) & ~SUBPAGE_MASK)
/* Return the smallest pagesize multiple that is >= s. */
#define PAGE_CEILING(s) \
(((s) + pagesize_mask) & ~pagesize_mask)
+#ifdef MALLOC_TINY
/* Compute the smallest power of 2 that is >= x. */
static inline size_t
pow2_ceil(size_t x)
@@ -1092,6 +1367,7 @@ pow2_ceil(size_t x)
x++;
return (x);
}
+#endif
#ifdef MALLOC_BALANCE
/*
@@ -1382,10 +1658,19 @@ stats_print(arena_t *arena)
arena->stats.ndalloc_small + arena->stats.ndalloc_large);
malloc_printf("mapped: %12zu\n", arena->stats.mapped);
- malloc_printf("bins: bin size regs pgs requests newruns"
- " reruns maxruns curruns\n");
- for (i = 0, gap_start = UINT_MAX; i < ntbins + nqbins + nsbins; i++) {
- if (arena->bins[i].stats.nrequests == 0) {
+#ifdef MALLOC_MAG
+ if (__isthreaded && opt_mag) {
+ malloc_printf("bins: bin size regs pgs mags "
+ "newruns reruns maxruns curruns\n");
+ } else {
+#endif
+ malloc_printf("bins: bin size regs pgs requests "
+ "newruns reruns maxruns curruns\n");
+#ifdef MALLOC_MAG
+ }
+#endif
+ for (i = 0, gap_start = UINT_MAX; i < nbins; i++) {
+ if (arena->bins[i].stats.nruns == 0) {
if (gap_start == UINT_MAX)
gap_start = i;
} else {
@@ -1404,10 +1689,15 @@ stats_print(arena_t *arena)
"%13u %1s %4u %4u %3u %9llu %9llu"
" %9llu %7lu %7lu\n",
i,
- i < ntbins ? "T" : i < ntbins + nqbins ? "Q" : "S",
+ i < ntbins ? "T" : i < ntbins + nqbins ? "Q" :
+ i < ntbins + nqbins + ncbins ? "C" : "S",
arena->bins[i].reg_size,
arena->bins[i].nregs,
arena->bins[i].run_size >> pagesize_2pow,
+#ifdef MALLOC_MAG
+ (__isthreaded && opt_mag) ?
+ arena->bins[i].stats.nmags :
+#endif
arena->bins[i].stats.nrequests,
arena->bins[i].stats.nruns,
arena->bins[i].stats.reruns,
@@ -2137,44 +2427,9 @@ arena_run_reg_alloc(arena_run_t *run, arena_bin_t *bin)
static inline void
arena_run_reg_dalloc(arena_run_t *run, arena_bin_t *bin, void *ptr, size_t size)
{
- /*
- * To divide by a number D that is not a power of two we multiply
- * by (2^21 / D) and then right shift by 21 positions.
- *
- * X / D
- *
- * becomes
- *
- * (X * size_invs[(D >> QUANTUM_2POW_MIN) - 3]) >> SIZE_INV_SHIFT
- */
-#define SIZE_INV_SHIFT 21
-#define SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s << QUANTUM_2POW_MIN)) + 1)
- static const unsigned size_invs[] = {
- SIZE_INV(3),
- SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7),
- SIZE_INV(8), SIZE_INV(9), SIZE_INV(10), SIZE_INV(11),
- SIZE_INV(12),SIZE_INV(13), SIZE_INV(14), SIZE_INV(15),
- SIZE_INV(16),SIZE_INV(17), SIZE_INV(18), SIZE_INV(19),
- SIZE_INV(20),SIZE_INV(21), SIZE_INV(22), SIZE_INV(23),
- SIZE_INV(24),SIZE_INV(25), SIZE_INV(26), SIZE_INV(27),
- SIZE_INV(28),SIZE_INV(29), SIZE_INV(30), SIZE_INV(31)
-#if (QUANTUM_2POW_MIN < 4)
- ,
- SIZE_INV(32), SIZE_INV(33), SIZE_INV(34), SIZE_INV(35),
- SIZE_INV(36), SIZE_INV(37), SIZE_INV(38), SIZE_INV(39),
- SIZE_INV(40), SIZE_INV(41), SIZE_INV(42), SIZE_INV(43),
- SIZE_INV(44), SIZE_INV(45), SIZE_INV(46), SIZE_INV(47),
- SIZE_INV(48), SIZE_INV(49), SIZE_INV(50), SIZE_INV(51),
- SIZE_INV(52), SIZE_INV(53), SIZE_INV(54), SIZE_INV(55),
- SIZE_INV(56), SIZE_INV(57), SIZE_INV(58), SIZE_INV(59),
- SIZE_INV(60), SIZE_INV(61), SIZE_INV(62), SIZE_INV(63)
-#endif
- };
unsigned diff, regind, elm, bit;
assert(run->magic == ARENA_RUN_MAGIC);
- assert(((sizeof(size_invs)) / sizeof(unsigned)) + 3
- >= (SMALL_MAX_DEFAULT >> QUANTUM_2POW_MIN));
/*
* Avoid doing division with a variable divisor if possible. Using
@@ -2203,26 +2458,89 @@ arena_run_reg_dalloc(arena_run_t *run, arena_bin_t *bin, void *ptr, size_t size)
regind = (diff >> log2_table[size - 1]);
else if (size <= 32768)
regind = diff >> (8 + log2_table[(size >> 8) - 1]);
- else {
- /*
- * The run size is too large for us to use the lookup
- * table. Use real division.
- */
+ else
regind = diff / size;
- }
- } else if (size <= (((sizeof(size_invs) / sizeof(unsigned)) + 2)
- << QUANTUM_2POW_MIN)) {
- regind = size_invs[(size >> QUANTUM_2POW_MIN) - 3] * diff;
- regind >>= SIZE_INV_SHIFT;
- } else {
+ } else if (size < qspace_max) {
/*
- * size_invs isn't large enough to handle this size class, so
- * calculate regind using actual division. This only happens
- * if the user increases small_max via the 'S' runtime
- * configuration option.
+ * To divide by a number D that is not a power of two we
+ * multiply by (2^21 / D) and then right shift by 21 positions.
+ *
+ * X / D
+ *
+ * becomes
+ *
+ * (X * qsize_invs[(D >> QUANTUM_2POW) - 3])
+ * >> SIZE_INV_SHIFT
+ *
+ * We can omit the first three elements, because we never
+ * divide by 0, and QUANTUM and 2*QUANTUM are both powers of
+ * two, which are handled above.
*/
- regind = diff / size;
- };
+#define SIZE_INV_SHIFT 21
+#define QSIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s << QUANTUM_2POW)) + 1)
+ static const unsigned qsize_invs[] = {
+ QSIZE_INV(3),
+ QSIZE_INV(4), QSIZE_INV(5), QSIZE_INV(6), QSIZE_INV(7)
+#if (QUANTUM_2POW < 4)
+ ,
+ QSIZE_INV(8), QSIZE_INV(9), QSIZE_INV(10), QSIZE_INV(11),
+ QSIZE_INV(12),QSIZE_INV(13), QSIZE_INV(14), QSIZE_INV(15)
+#endif
+ };
+ assert(QUANTUM * (((sizeof(qsize_invs)) / sizeof(unsigned)) + 3)
+ >= (1U << QSPACE_MAX_2POW_DEFAULT));
+
+ if (size <= (((sizeof(qsize_invs) / sizeof(unsigned)) + 2) <<
+ QUANTUM_2POW)) {
+ regind = qsize_invs[(size >> QUANTUM_2POW) - 3] * diff;
+ regind >>= SIZE_INV_SHIFT;
+ } else
+ regind = diff / size;
+#undef QSIZE_INV
+ } else if (size < cspace_max) {
+#define CSIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s << CACHELINE_2POW)) + 1)
+ static const unsigned csize_invs[] = {
+ CSIZE_INV(3),
+ CSIZE_INV(4), CSIZE_INV(5), CSIZE_INV(6), CSIZE_INV(7)
+ };
+ assert(CACHELINE * (((sizeof(csize_invs)) / sizeof(unsigned)) +
+ 3) >= (1U << CSPACE_MAX_2POW_DEFAULT));
+
+ if (size <= (((sizeof(csize_invs) / sizeof(unsigned)) + 2) <<
+ CACHELINE_2POW)) {
+ regind = csize_invs[(size >> CACHELINE_2POW) - 3] *
+ diff;
+ regind >>= SIZE_INV_SHIFT;
+ } else
+ regind = diff / size;
+#undef CSIZE_INV
+ } else {
+#define SSIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s << SUBPAGE_2POW)) + 1)
+ static const unsigned ssize_invs[] = {
+ SSIZE_INV(3),
+ SSIZE_INV(4), SSIZE_INV(5), SSIZE_INV(6), SSIZE_INV(7),
+ SSIZE_INV(8), SSIZE_INV(9), SSIZE_INV(10), SSIZE_INV(11),
+ SSIZE_INV(12), SSIZE_INV(13), SSIZE_INV(14), SSIZE_INV(15)
+#if (PAGESIZE_2POW == 13)
+ ,
+ SSIZE_INV(16), SSIZE_INV(17), SSIZE_INV(18), SSIZE_INV(19),
+ SSIZE_INV(20), SSIZE_INV(21), SSIZE_INV(22), SSIZE_INV(23),
+ SSIZE_INV(24), SSIZE_INV(25), SSIZE_INV(26), SSIZE_INV(27),
+ SSIZE_INV(28), SSIZE_INV(29), SSIZE_INV(30), SSIZE_INV(31)
+#endif
+ };
+ assert(SUBPAGE * (((sizeof(ssize_invs)) / sizeof(unsigned)) + 3)
+ >= (1U << PAGESIZE_2POW));
+
+ if (size < (((sizeof(ssize_invs) / sizeof(unsigned)) + 2) <<
+ SUBPAGE_2POW)) {
+ regind = ssize_invs[(size >> SUBPAGE_2POW) - 3] * diff;
+ regind >>= SIZE_INV_SHIFT;
+ } else
+ regind = diff / size;
+#undef SSIZE_INV
+ }
+#undef SIZE_INV_SHIFT
assert(diff == regind * size);
assert(regind < bin->nregs);
@@ -2232,8 +2550,6 @@ arena_run_reg_dalloc(arena_run_t *run, arena_bin_t *bin, void *ptr, size_t size)
bit = regind - (elm << (SIZEOF_INT_2POW + 3));
assert((run->regs_mask[elm] & (1U << bit)) == 0);
run->regs_mask[elm] |= (1U << bit);
-#undef SIZE_INV
-#undef SIZE_INV_SHIFT
}
static void
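An aside on the multiply-and-shift trick used above: for a divisor D that is
not a power of two, (diff * inv) >> 21 with inv = (2^21 / D) + 1 yields exact
quotients for every offset that can occur within a run, because the error
term grows too slowly to reach 2^21. A self-contained check for the 48-byte
quantum-spaced class (editorial, not part of the patch):

	#include <assert.h>

	int
	main(void)
	{
		unsigned shift = 21;	/* SIZE_INV_SHIFT */
		unsigned size = 48;	/* quantum-spaced class: 3 << 4 */
		unsigned inv = ((1U << shift) / size) + 1;	/* QSIZE_INV(3) */
		unsigned diff;

		/*
		 * 48 * inv == 2^21 + 16, so the error term 16 * (diff / 48)
		 * stays below 2^21 far beyond any 48-byte run's extent.
		 */
		for (diff = 0; diff < 48 * 1000; diff += 48)
			assert(((diff * inv) >> shift) == diff / size);
		return (0);
	}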
@@ -2813,7 +3129,7 @@ arena_lock_balance(arena_t *arena)
/*
* Calculate the exponentially averaged contention for this
* arena. Due to integer math always rounding down, this value
- * decays somewhat faster then normal.
+ * decays somewhat faster than normal.
*/
arena->contention = (((uint64_t)arena->contention
* (uint64_t)((1U << BALANCE_ALPHA_INV_2POW)-1))
@@ -2846,39 +3162,125 @@ arena_lock_balance_hard(arena_t *arena)
}
#endif
+#ifdef MALLOC_MAG
+static inline void *
+mag_alloc(mag_t *mag)
+{
+
+ if (mag->nrounds == 0)
+ return (NULL);
+ mag->nrounds--;
+
+ return (mag->rounds[mag->nrounds]);
+}
+
+static void
+mag_load(mag_t *mag)
+{
+ arena_t *arena;
+ arena_bin_t *bin;
+ arena_run_t *run;
+ void *round;
+ size_t i;
+
+ arena = choose_arena();
+ bin = &arena->bins[mag->binind];
+#ifdef MALLOC_BALANCE
+ arena_lock_balance(arena);
+#else
+ malloc_spin_lock(&arena->lock);
+#endif
+ for (i = mag->nrounds; i < max_rounds; i++) {
+ if ((run = bin->runcur) != NULL && run->nfree > 0)
+ round = arena_bin_malloc_easy(arena, bin, run);
+ else
+ round = arena_bin_malloc_hard(arena, bin);
+ if (round == NULL)
+ break;
+ mag->rounds[i] = round;
+ }
+#ifdef MALLOC_STATS
+ bin->stats.nmags++;
+ arena->stats.nmalloc_small += (i - mag->nrounds);
+ arena->stats.allocated_small += (i - mag->nrounds) * bin->reg_size;
+#endif
+ malloc_spin_unlock(&arena->lock);
+ mag->nrounds = i;
+}
+
+static inline void *
+mag_rack_alloc(mag_rack_t *rack, size_t size, bool zero)
+{
+ void *ret;
+ bin_mags_t *bin_mags;
+ mag_t *mag;
+ size_t binind;
+
+ binind = size2bin[size];
+ assert(binind < nbins);
+ bin_mags = &rack->bin_mags[binind];
+
+ mag = bin_mags->curmag;
+ if (mag == NULL) {
+ /* Create an initial magazine for this size class. */
+ assert(bin_mags->sparemag == NULL);
+ mag = mag_create(choose_arena(), binind);
+ if (mag == NULL)
+ return (NULL);
+ bin_mags->curmag = mag;
+ mag_load(mag);
+ }
+
+ ret = mag_alloc(mag);
+ if (ret == NULL) {
+ if (bin_mags->sparemag != NULL) {
+ if (bin_mags->sparemag->nrounds > 0) {
+ /* Swap magazines. */
+ bin_mags->curmag = bin_mags->sparemag;
+ bin_mags->sparemag = mag;
+ mag = bin_mags->curmag;
+ } else {
+ /* Reload the current magazine. */
+ mag_load(mag);
+ }
+ } else {
+ /* Create a second magazine. */
+ mag = mag_create(choose_arena(), binind);
+ if (mag == NULL)
+ return (NULL);
+ mag_load(mag);
+ bin_mags->sparemag = bin_mags->curmag;
+ bin_mags->curmag = mag;
+ }
+ ret = mag_alloc(mag);
+ if (ret == NULL)
+ return (NULL);
+ }
+
+ if (zero == false) {
+ if (opt_junk)
+ memset(ret, 0xa5, size);
+ else if (opt_zero)
+ memset(ret, 0, size);
+ } else
+ memset(ret, 0, size);
+
+ return (ret);
+}
+#endif
+
static inline void *
arena_malloc_small(arena_t *arena, size_t size, bool zero)
{
void *ret;
arena_bin_t *bin;
arena_run_t *run;
+ size_t binind;
- if (size < small_min) {
- /* Tiny. */
- size = pow2_ceil(size);
- bin = &arena->bins[ffs((int)(size >> (TINY_MIN_2POW +
- 1)))];
-#if (!defined(NDEBUG) || defined(MALLOC_STATS))
- /*
- * Bin calculation is always correct, but we may need
- * to fix size for the purposes of assertions and/or
- * stats accuracy.
- */
- if (size < (1U << TINY_MIN_2POW))
- size = (1U << TINY_MIN_2POW);
-#endif
- } else if (size <= small_max) {
- /* Quantum-spaced. */
- size = QUANTUM_CEILING(size);
- bin = &arena->bins[ntbins + (size >> opt_quantum_2pow)
- - 1];
- } else {
- /* Sub-page. */
- size = pow2_ceil(size);
- bin = &arena->bins[ntbins + nqbins
- + (ffs((int)(size >> opt_small_max_2pow)) - 2)];
- }
- assert(size == bin->reg_size);
+ binind = size2bin[size];
+ assert(binind < nbins);
+ bin = &arena->bins[binind];
+ size = bin->reg_size;
#ifdef MALLOC_BALANCE
arena_lock_balance(arena);
@@ -2956,7 +3358,19 @@ arena_malloc(arena_t *arena, size_t size, bool zero)
assert(QUANTUM_CEILING(size) <= arena_maxclass);
if (size <= bin_maxclass) {
- return (arena_malloc_small(arena, size, zero));
+#ifdef MALLOC_MAG
+ if (__isthreaded && opt_mag) {
+ mag_rack_t *rack = mag_rack;
+ if (rack == NULL) {
+ rack = mag_rack_create(arena);
+ if (rack == NULL)
+ return (NULL);
+ mag_rack = rack;
+ }
+ return (mag_rack_alloc(rack, size, zero));
+ } else
+#endif
+ return (arena_malloc_small(arena, size, zero));
} else
return (arena_malloc_large(arena, size, zero));
}
@@ -3085,7 +3499,7 @@ ipalloc(size_t alignment, size_t size)
size_t run_size;
/*
- * We can't achieve sub-page alignment, so round up alignment
+ * We can't achieve subpage alignment, so round up alignment
* permanently; it makes later calculations simpler.
*/
alignment = PAGE_CEILING(alignment);
@@ -3282,6 +3696,122 @@ arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr,
#endif
}
+#ifdef MALLOC_MAG
+static void
+mag_unload(mag_t *mag)
+{
+ arena_chunk_t *chunk;
+ arena_t *arena;
+ void *round;
+ size_t i, ndeferred, nrounds;
+
+ for (ndeferred = mag->nrounds; ndeferred > 0;) {
+ nrounds = ndeferred;
+ /* Lock the arena associated with the first round. */
+ chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(mag->rounds[0]);
+ arena = chunk->arena;
+#ifdef MALLOC_BALANCE
+ arena_lock_balance(arena);
+#else
+ malloc_spin_lock(&arena->lock);
+#endif
+ /* Deallocate every round that belongs to the locked arena. */
+ for (i = ndeferred = 0; i < nrounds; i++) {
+ round = mag->rounds[i];
+ chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(round);
+ if (chunk->arena == arena) {
+ size_t pageind = (((uintptr_t)round -
+ (uintptr_t)chunk) >> pagesize_2pow);
+ arena_chunk_map_t *mapelm =
+ &chunk->map[pageind];
+ arena_dalloc_small(arena, chunk, round, mapelm);
+ } else {
+ /*
+ * This round was allocated via a different
+ * arena than the one that is currently locked.
+ * Stash the round, so that it can be handled
+ * in a future pass.
+ */
+ mag->rounds[ndeferred] = round;
+ ndeferred++;
+ }
+ }
+ malloc_spin_unlock(&arena->lock);
+ }
+
+ mag->nrounds = 0;
+}
+
+static inline void
+mag_rack_dalloc(mag_rack_t *rack, void *ptr)
+{
+ arena_t *arena;
+ arena_chunk_t *chunk;
+ arena_run_t *run;
+ arena_bin_t *bin;
+ bin_mags_t *bin_mags;
+ mag_t *mag;
+ size_t pageind, binind;
+ arena_chunk_map_t *mapelm;
+
+ chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+ arena = chunk->arena;
+ pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> pagesize_2pow);
+ mapelm = &chunk->map[pageind];
+ run = (arena_run_t *)(mapelm->bits & ~pagesize_mask);
+ assert(run->magic == ARENA_RUN_MAGIC);
+ bin = run->bin;
+ binind = ((uintptr_t)bin - (uintptr_t)&arena->bins) /
+ sizeof(arena_bin_t);
+ assert(binind < nbins);
+
+ if (opt_junk)
+ memset(ptr, 0x5a, arena->bins[binind].reg_size);
+
+ bin_mags = &rack->bin_mags[binind];
+ mag = bin_mags->curmag;
+ if (mag == NULL) {
+ /* Create an initial magazine for this size class. */
+ assert(bin_mags->sparemag == NULL);
+ mag = mag_create(choose_arena(), binind);
+ if (mag == NULL) {
+ malloc_spin_lock(&arena->lock);
+ arena_dalloc_small(arena, chunk, ptr, mapelm);
+ malloc_spin_unlock(&arena->lock);
+ return;
+ }
+ bin_mags->curmag = mag;
+ }
+
+ if (mag->nrounds == max_rounds) {
+ if (bin_mags->sparemag != NULL) {
+ if (bin_mags->sparemag->nrounds < max_rounds) {
+ /* Swap magazines. */
+ bin_mags->curmag = bin_mags->sparemag;
+ bin_mags->sparemag = mag;
+ mag = bin_mags->curmag;
+ } else {
+ /* Unload the current magazine. */
+ mag_unload(mag);
+ }
+ } else {
+ /* Create a second magazine. */
+ mag = mag_create(choose_arena(), binind);
+ if (mag == NULL) {
+ mag = rack->bin_mags[binind].curmag;
+ mag_unload(mag);
+ } else {
+ bin_mags->sparemag = bin_mags->curmag;
+ bin_mags->curmag = mag;
+ }
+ }
+ assert(mag->nrounds < max_rounds);
+ }
+ mag->rounds[mag->nrounds] = ptr;
+ mag->nrounds++;
+}
+#endif
+
static void
arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr)
{
@@ -3329,9 +3859,28 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
assert((mapelm->bits & CHUNK_MAP_ALLOCATED) != 0);
if ((mapelm->bits & CHUNK_MAP_LARGE) == 0) {
/* Small allocation. */
- malloc_spin_lock(&arena->lock);
- arena_dalloc_small(arena, chunk, ptr, mapelm);
- malloc_spin_unlock(&arena->lock);
+#ifdef MALLOC_MAG
+ if (__isthreaded && opt_mag) {
+ mag_rack_t *rack = mag_rack;
+ if (rack == NULL) {
+ rack = mag_rack_create(arena);
+ if (rack == NULL) {
+ malloc_spin_lock(&arena->lock);
+ arena_dalloc_small(arena, chunk, ptr,
+ mapelm);
+ malloc_spin_unlock(&arena->lock);
+ return;
+ }
+ mag_rack = rack;
+ }
+ mag_rack_dalloc(rack, ptr);
+ } else {
+#endif
+ malloc_spin_lock(&arena->lock);
+ arena_dalloc_small(arena, chunk, ptr, mapelm);
+ malloc_spin_unlock(&arena->lock);
+#ifdef MALLOC_MAG
+ }
+#endif
} else
arena_dalloc_large(arena, chunk, ptr);
}
@@ -3471,24 +4020,16 @@ arena_ralloc(void *ptr, size_t size, size_t oldsize)
size_t copysize;
/* Try to avoid moving the allocation. */
- if (size < small_min) {
- if (oldsize < small_min &&
- ffs((int)(pow2_ceil(size) >> (TINY_MIN_2POW + 1)))
- == ffs((int)(pow2_ceil(oldsize) >> (TINY_MIN_2POW + 1))))
- goto IN_PLACE; /* Same size class. */
- } else if (size <= small_max) {
- if (oldsize >= small_min && oldsize <= small_max &&
- (QUANTUM_CEILING(size) >> opt_quantum_2pow)
- == (QUANTUM_CEILING(oldsize) >> opt_quantum_2pow))
- goto IN_PLACE; /* Same size class. */
- } else if (size <= bin_maxclass) {
- if (oldsize > small_max && oldsize <= bin_maxclass &&
- pow2_ceil(size) == pow2_ceil(oldsize))
- goto IN_PLACE; /* Same size class. */
- } else if (oldsize > bin_maxclass && oldsize <= arena_maxclass) {
- assert(size > bin_maxclass);
- if (arena_ralloc_large(ptr, size, oldsize) == false)
- return (ptr);
+ if (size <= bin_maxclass) {
+ if (oldsize <= bin_maxclass && size2bin[size] ==
+ size2bin[oldsize])
+ goto IN_PLACE;
+ } else {
+ if (oldsize > bin_maxclass && oldsize <= arena_maxclass) {
+ assert(size > bin_maxclass);
+ if (arena_ralloc_large(ptr, size, oldsize) == false)
+ return (ptr);
+ }
}
/*
@@ -3558,8 +4099,10 @@ arena_new(arena_t *arena)
/* Initialize bins. */
prev_run_size = pagesize;
+ i = 0;
+#ifdef MALLOC_TINY
/* (2^n)-spaced tiny bins. */
- for (i = 0; i < ntbins; i++) {
+ for (; i < ntbins; i++) {
bin = &arena->bins[i];
bin->runcur = NULL;
arena_run_tree_new(&bin->runs);
@@ -3572,6 +4115,7 @@ arena_new(arena_t *arena)
memset(&bin->stats, 0, sizeof(malloc_bin_stats_t));
#endif
}
+#endif
/* Quantum-spaced bins. */
for (; i < ntbins + nqbins; i++) {
@@ -3579,7 +4123,7 @@ arena_new(arena_t *arena)
bin->runcur = NULL;
arena_run_tree_new(&bin->runs);
- bin->reg_size = quantum * (i - ntbins + 1);
+ bin->reg_size = (i - ntbins + 1) << QUANTUM_2POW;
prev_run_size = arena_bin_run_size_calc(bin, prev_run_size);
@@ -3588,13 +4132,30 @@ arena_new(arena_t *arena)
#endif
}
- /* (2^n)-spaced sub-page bins. */
- for (; i < ntbins + nqbins + nsbins; i++) {
+ /* Cacheline-spaced bins. */
+ for (; i < ntbins + nqbins + ncbins; i++) {
bin = &arena->bins[i];
bin->runcur = NULL;
arena_run_tree_new(&bin->runs);
- bin->reg_size = (small_max << (i - (ntbins + nqbins) + 1));
+ bin->reg_size = cspace_min + ((i - (ntbins + nqbins)) <<
+ CACHELINE_2POW);
+
+ prev_run_size = arena_bin_run_size_calc(bin, prev_run_size);
+
+#ifdef MALLOC_STATS
+ memset(&bin->stats, 0, sizeof(malloc_bin_stats_t));
+#endif
+ }
+
+ /* Subpage-spaced bins. */
+ for (; i < nbins; i++) {
+ bin = &arena->bins[i];
+ bin->runcur = NULL;
+ arena_run_tree_new(&bin->runs);
+
+ bin->reg_size = sspace_min + ((i - (ntbins + nqbins + ncbins))
+ << SUBPAGE_2POW);
prev_run_size = arena_bin_run_size_calc(bin, prev_run_size);
@@ -3618,7 +4179,7 @@ arenas_extend(unsigned ind)
/* Allocate enough space for trailing bins. */
ret = (arena_t *)base_alloc(sizeof(arena_t)
- + (sizeof(arena_bin_t) * (ntbins + nqbins + nsbins - 1)));
+ + (sizeof(arena_bin_t) * (nbins - 1)));
if (ret != NULL && arena_new(ret) == false) {
arenas[ind] = ret;
return (ret);
@@ -3639,6 +4200,95 @@ arenas_extend(unsigned ind)
return (arenas[0]);
}
+#ifdef MALLOC_MAG
+static mag_t *
+mag_create(arena_t *arena, size_t binind)
+{
+ mag_t *ret;
+
+ if (sizeof(mag_t) + (sizeof(void *) * (max_rounds - 1)) <=
+ bin_maxclass) {
+ ret = arena_malloc_small(arena, sizeof(mag_t) + (sizeof(void *)
+ * (max_rounds - 1)), false);
+ } else {
+ ret = imalloc(sizeof(mag_t) + (sizeof(void *) * (max_rounds -
+ 1)));
+ }
+ if (ret == NULL)
+ return (NULL);
+ ret->binind = binind;
+ ret->nrounds = 0;
+
+ return (ret);
+}
+
+static void
+mag_destroy(mag_t *mag)
+{
+ arena_t *arena;
+ arena_chunk_t *chunk;
+ size_t pageind;
+ arena_chunk_map_t *mapelm;
+
+ chunk = CHUNK_ADDR2BASE(mag);
+ arena = chunk->arena;
+ pageind = (((uintptr_t)mag - (uintptr_t)chunk) >> pagesize_2pow);
+ mapelm = &chunk->map[pageind];
+
+ assert(mag->nrounds == 0);
+ if (sizeof(mag_t) + (sizeof(void *) * (max_rounds - 1)) <=
+ bin_maxclass) {
+ malloc_spin_lock(&arena->lock);
+ arena_dalloc_small(arena, chunk, mag, mapelm);
+ malloc_spin_unlock(&arena->lock);
+ } else
+ idalloc(mag);
+}
+
+static mag_rack_t *
+mag_rack_create(arena_t *arena)
+{
+
+ assert(sizeof(mag_rack_t) + (sizeof(bin_mags_t) * (nbins - 1)) <=
+ bin_maxclass);
+ return (arena_malloc_small(arena, sizeof(mag_rack_t) +
+ (sizeof(bin_mags_t) * (nbins - 1)), true));
+}
+
+static void
+mag_rack_destroy(mag_rack_t *rack)
+{
+ arena_t *arena;
+ arena_chunk_t *chunk;
+ bin_mags_t *bin_mags;
+ size_t i, pageind;
+ arena_chunk_map_t *mapelm;
+
+ for (i = 0; i < nbins; i++) {
+ bin_mags = &rack->bin_mags[i];
+ if (bin_mags->curmag != NULL) {
+ assert(bin_mags->curmag->binind == i);
+ mag_unload(bin_mags->curmag);
+ mag_destroy(bin_mags->curmag);
+ }
+ if (bin_mags->sparemag != NULL) {
+ assert(bin_mags->sparemag->binind == i);
+ mag_unload(bin_mags->sparemag);
+ mag_destroy(bin_mags->sparemag);
+ }
+ }
+
+ chunk = CHUNK_ADDR2BASE(rack);
+ arena = chunk->arena;
+ pageind = (((uintptr_t)rack - (uintptr_t)chunk) >> pagesize_2pow);
+ mapelm = &chunk->map[pageind];
+
+ malloc_spin_lock(&arena->lock);
+ arena_dalloc_small(arena, chunk, rack, mapelm);
+ malloc_spin_unlock(&arena->lock);
+}
+#endif
+
/*
* End arena.
*/
@@ -3860,6 +4510,9 @@ malloc_print_stats(void)
#ifdef MALLOC_DSS
_malloc_message(opt_dss ? "D" : "d", "", "", "");
#endif
+#ifdef MALLOC_MAG
+ _malloc_message(opt_mag ? "G" : "g", "", "", "");
+#endif
_malloc_message(opt_junk ? "J" : "j", "", "", "");
#ifdef MALLOC_DSS
_malloc_message(opt_mmap ? "M" : "m", "", "", "");
@@ -3877,9 +4530,27 @@ malloc_print_stats(void)
#endif
_malloc_message("Pointer size: ", umax2s(sizeof(void *), s),
"\n", "");
- _malloc_message("Quantum size: ", umax2s(quantum, s), "\n", "");
- _malloc_message("Max small size: ", umax2s(small_max, s), "\n",
- "");
+ _malloc_message("Quantum size: ", umax2s(QUANTUM, s), "\n", "");
+ _malloc_message("Cacheline size (assumed): ", umax2s(CACHELINE,
+ s), "\n", "");
+#ifdef MALLOC_TINY
+ _malloc_message("Tiny 2^n-spaced sizes: [", umax2s((1U <<
+ TINY_MIN_2POW), s), "..", "");
+ _malloc_message(umax2s((qspace_min >> 1), s), "]\n", "", "");
+#endif
+ _malloc_message("Quantum-spaced sizes: [", umax2s(qspace_min,
+ s), "..", "");
+ _malloc_message(umax2s(qspace_max, s), "]\n", "", "");
+ _malloc_message("Cacheline-spaced sizes: [", umax2s(cspace_min,
+ s), "..", "");
+ _malloc_message(umax2s(cspace_max, s), "]\n", "", "");
+ _malloc_message("Subpage-spaced sizes: [", umax2s(sspace_min,
+ s), "..", "");
+ _malloc_message(umax2s(sspace_max, s), "]\n", "", "");
+#ifdef MALLOC_MAG
+ _malloc_message("Rounds per magazine: ", umax2s(max_rounds, s),
+ "\n", "");
+#endif
_malloc_message("Max dirty pages per arena: ",
umax2s(opt_dirty_max, s), "\n", "");
@@ -3969,6 +4640,122 @@ malloc_print_stats(void)
}
}
+#ifdef MALLOC_DEBUG
+static void
+size2bin_validate(void)
+{
+ size_t i, size, binind;
+
+ assert(size2bin[0] == 0xffU);
+ i = 1;
+# ifdef MALLOC_TINY
+ /* Tiny. */
+ for (; i < (1U << TINY_MIN_2POW); i++) {
+ size = pow2_ceil(1U << TINY_MIN_2POW);
+ binind = ffs((int)(size >> (TINY_MIN_2POW + 1)));
+ assert(size2bin[i] == binind);
+ }
+ for (; i < qspace_min; i++) {
+ size = pow2_ceil(i);
+ binind = ffs((int)(size >> (TINY_MIN_2POW + 1)));
+ assert(size2bin[i] == binind);
+ }
+# endif
+ /* Quantum-spaced. */
+ for (; i <= qspace_max; i++) {
+ size = QUANTUM_CEILING(i);
+ binind = ntbins + (size >> QUANTUM_2POW) - 1;
+ assert(size2bin[i] == binind);
+ }
+ /* Cacheline-spaced. */
+ for (; i <= cspace_max; i++) {
+ size = CACHELINE_CEILING(i);
+ binind = ntbins + nqbins + ((size - cspace_min) >>
+ CACHELINE_2POW);
+ assert(size2bin[i] == binind);
+ }
+ /* Sub-page. */
+ for (; i <= sspace_max; i++) {
+ size = SUBPAGE_CEILING(i);
+ binind = ntbins + nqbins + ncbins + ((size - sspace_min)
+ >> SUBPAGE_2POW);
+ assert(size2bin[i] == binind);
+ }
+}
+#endif
+
+static bool
+size2bin_init(void)
+{
+
+ if (opt_qspace_max_2pow != QSPACE_MAX_2POW_DEFAULT
+ || opt_cspace_max_2pow != CSPACE_MAX_2POW_DEFAULT)
+ return (size2bin_init_hard());
+
+ size2bin = const_size2bin;
+#ifdef MALLOC_DEBUG
+ assert(sizeof(const_size2bin) == bin_maxclass + 1);
+ size2bin_validate();
+#endif
+ return (false);
+}
+
+static bool
+size2bin_init_hard(void)
+{
+ size_t i, size, binind;
+ uint8_t *custom_size2bin;
+
+ assert(opt_qspace_max_2pow != QSPACE_MAX_2POW_DEFAULT
+ || opt_cspace_max_2pow != CSPACE_MAX_2POW_DEFAULT);
+
+ custom_size2bin = (uint8_t *)base_alloc(bin_maxclass + 1);
+ if (custom_size2bin == NULL)
+ return (true);
+
+ custom_size2bin[0] = 0xffU;
+ i = 1;
+#ifdef MALLOC_TINY
+ /* Tiny. */
+ for (; i < (1U << TINY_MIN_2POW); i++) {
+ size = pow2_ceil(1U << TINY_MIN_2POW);
+ binind = ffs((int)(size >> (TINY_MIN_2POW + 1)));
+ custom_size2bin[i] = binind;
+ }
+ for (; i < qspace_min; i++) {
+ size = pow2_ceil(i);
+ binind = ffs((int)(size >> (TINY_MIN_2POW + 1)));
+ custom_size2bin[i] = binind;
+ }
+#endif
+ /* Quantum-spaced. */
+ for (; i <= qspace_max; i++) {
+ size = QUANTUM_CEILING(i);
+ binind = ntbins + (size >> QUANTUM_2POW) - 1;
+ custom_size2bin[i] = binind;
+ }
+ /* Cacheline-spaced. */
+ for (; i <= cspace_max; i++) {
+ size = CACHELINE_CEILING(i);
+ binind = ntbins + nqbins + ((size - cspace_min) >>
+ CACHELINE_2POW);
+ custom_size2bin[i] = binind;
+ }
+ /* Sub-page. */
+ for (; i <= sspace_max; i++) {
+ size = SUBPAGE_CEILING(i);
+ binind = ntbins + nqbins + ncbins + ((size - sspace_min) >>
+ SUBPAGE_2POW);
+ custom_size2bin[i] = binind;
+ }
+
+ size2bin = custom_size2bin;
+#ifdef MALLOC_DEBUG
+ size2bin_validate();
+#endif
+ return (false);
+}
+
/*
* FreeBSD's pthreads implementation calls malloc(3), so the malloc
* implementation has to take pains to avoid infinite recursion during
@@ -4128,6 +4915,18 @@ MALLOC_OUT:
opt_balance_threshold <<= 1;
#endif
break;
+ case 'c':
+ if (opt_cspace_max_2pow - 1 >
+ opt_qspace_max_2pow &&
+ opt_cspace_max_2pow >
+ CACHELINE_2POW)
+ opt_cspace_max_2pow--;
+ break;
+ case 'C':
+ if (opt_cspace_max_2pow < pagesize_2pow
+ - 1)
+ opt_cspace_max_2pow++;
+ break;
case 'd':
#ifdef MALLOC_DSS
opt_dss = false;
@@ -4147,6 +4946,14 @@ MALLOC_OUT:
else if ((opt_dirty_max << 1) != 0)
opt_dirty_max <<= 1;
break;
+#ifdef MALLOC_MAG
+ case 'g':
+ opt_mag = false;
+ break;
+ case 'G':
+ opt_mag = true;
+ break;
+#endif
case 'j':
opt_junk = false;
break;
@@ -4190,24 +4997,30 @@ MALLOC_OUT:
opt_print_stats = true;
break;
case 'q':
- if (opt_quantum_2pow > QUANTUM_2POW_MIN)
- opt_quantum_2pow--;
+ if (opt_qspace_max_2pow > QUANTUM_2POW)
+ opt_qspace_max_2pow--;
break;
case 'Q':
- if (opt_quantum_2pow < pagesize_2pow -
- 1)
- opt_quantum_2pow++;
+ if (opt_qspace_max_2pow + 1 <
+ opt_cspace_max_2pow)
+ opt_qspace_max_2pow++;
break;
- case 's':
- if (opt_small_max_2pow >
- QUANTUM_2POW_MIN)
- opt_small_max_2pow--;
+#ifdef MALLOC_MAG
+ case 'R':
+ if (opt_mag_size_2pow + 1 < (8U <<
+ SIZEOF_PTR_2POW))
+ opt_mag_size_2pow++;
break;
- case 'S':
- if (opt_small_max_2pow < pagesize_2pow
- - 1)
- opt_small_max_2pow++;
+ case 'r':
+ /*
+ * Make sure there's always at least
+ * one round per magazine.
+ */
+ if ((1U << (opt_mag_size_2pow-1)) >=
+ sizeof(mag_t))
+ opt_mag_size_2pow--;
break;
+#endif
case 'u':
opt_utrace = false;
break;
@@ -4259,27 +5072,40 @@ MALLOC_OUT:
atexit(malloc_print_stats);
}
- /* Set variables according to the value of opt_small_max_2pow. */
- if (opt_small_max_2pow < opt_quantum_2pow)
- opt_small_max_2pow = opt_quantum_2pow;
- small_max = (1U << opt_small_max_2pow);
-
- /* Set bin-related variables. */
- bin_maxclass = (pagesize >> 1);
- assert(opt_quantum_2pow >= TINY_MIN_2POW);
- ntbins = opt_quantum_2pow - TINY_MIN_2POW;
- assert(ntbins <= opt_quantum_2pow);
- nqbins = (small_max >> opt_quantum_2pow);
- nsbins = pagesize_2pow - opt_small_max_2pow - 1;
-
- /* Set variables according to the value of opt_quantum_2pow. */
- quantum = (1U << opt_quantum_2pow);
- quantum_mask = quantum - 1;
- if (ntbins > 0)
- small_min = (quantum >> 1) + 1;
- else
- small_min = 1;
- assert(small_min <= quantum);
+#ifdef MALLOC_MAG
+ /*
+ * Calculate the actual number of rounds per magazine, taking into
+ * account header overhead.
+ */
+ max_rounds = (1LLU << (opt_mag_size_2pow - SIZEOF_PTR_2POW)) -
+ (sizeof(mag_t) >> SIZEOF_PTR_2POW) + 1;
+#endif
+
+ /* Set variables according to the value of opt_[qc]space_max_2pow. */
+ qspace_max = (1U << opt_qspace_max_2pow);
+ cspace_min = CACHELINE_CEILING(qspace_max);
+ if (cspace_min == qspace_max)
+ cspace_min += CACHELINE;
+ cspace_max = (1U << opt_cspace_max_2pow);
+ sspace_min = SUBPAGE_CEILING(cspace_max);
+ if (sspace_min == cspace_max)
+ sspace_min += SUBPAGE;
+ assert(sspace_min < pagesize);
+ sspace_max = pagesize - SUBPAGE;
+
+#ifdef MALLOC_TINY
+ assert(QUANTUM_2POW >= TINY_MIN_2POW);
+#endif
+ assert(ntbins <= QUANTUM_2POW);
+ nqbins = qspace_max >> QUANTUM_2POW;
+ ncbins = ((cspace_max - cspace_min) >> CACHELINE_2POW) + 1;
+ nsbins = ((sspace_max - sspace_min) >> SUBPAGE_2POW) + 1;
+ nbins = ntbins + nqbins + ncbins + nsbins;
+
+ if (size2bin_init()) {
+ malloc_mutex_unlock(&init_lock);
+ return (true);
+ }
/* Set variables according to the value of opt_chunk_2pow. */
chunksize = (1LU << opt_chunk_2pow);
@@ -4289,11 +5115,8 @@ MALLOC_OUT:
size_t header_size;
/*
- * Compute the header size such that it is large
- * enough to contain the page map and enough nodes for the
- * worst case: one node per non-header page plus one extra for
- * situations where we briefly have one more node allocated
- * than we will need.
+ * Compute the header size such that it is large enough to
+ * contain the page map.
*/
header_size = sizeof(arena_chunk_t) +
(sizeof(arena_chunk_map_t) * (chunk_npages - 1));
@@ -4310,10 +5133,7 @@ MALLOC_OUT:
#endif
/* Various sanity checks that regard configuration. */
- assert(quantum >= sizeof(void *));
- assert(quantum <= pagesize);
assert(chunksize >= pagesize);
- assert(quantum * 4 <= chunksize);
/* Initialize chunks data. */
malloc_mutex_init(&huge_mtx);
@@ -4350,10 +5170,10 @@ MALLOC_OUT:
if (ncpus > 1) {
/*
- * For SMP systems, create four times as many arenas as there
- * are CPUs by default.
+ * For SMP systems, create twice as many arenas as there are
+ * CPUs by default.
*/
- opt_narenas_lshift += 2;
+ opt_narenas_lshift++;
}
/* Determine how many arenas to use. */
@@ -4682,8 +5502,37 @@ malloc_usable_size(const void *ptr)
*/
/******************************************************************************/
/*
- * Begin library-private functions, used by threading libraries for protection
- * of malloc during fork(). These functions are only called if the program is
+ * Begin library-private functions.
+ */
+
+/******************************************************************************/
+/*
+ * Begin thread cache.
+ */
+
+/*
+ * We provide an unpublished interface in order to receive notifications from
+ * the pthreads library whenever a thread exits. This allows us to clean up
+ * thread caches.
+ */
+void
+_malloc_thread_cleanup(void)
+{
+
+#ifdef MALLOC_MAG
+ if (mag_rack != NULL) {
+ assert(mag_rack != (void *)-1);
+ mag_rack_destroy(mag_rack);
+#ifdef MALLOC_DEBUG
+ mag_rack = (void *)-1;
+#endif
+ }
+#endif
+}
+
+/*
+ * The following functions are used by threading libraries for protection of
+ * malloc during fork(). These functions are only called if the program is
* running in threaded mode, so there is no need to check whether the program
* is threaded here.
*/
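Before moving on to the libthr hookup, it may help to make the max_rounds
computation above concrete. With the default MAG_SIZE_2POW_DEFAULT of 9
(512-byte magazines) and 8-byte pointers, the formula leaves room for 62
rounds after the mag_t header; a standalone check of that arithmetic
(editorial sketch, mirroring the struct from the diff, assuming a 64-bit
ABI):

	#include <assert.h>
	#include <stddef.h>

	/* Mirrors the mag_t layout in the diff; 24 bytes on a 64-bit ABI. */
	typedef struct {
		size_t binind;
		size_t nrounds;
		void *rounds[1];
	} mag_t;

	int
	main(void)
	{
		size_t opt_mag_size_2pow = 9;	/* MAG_SIZE_2POW_DEFAULT */
		size_t sizeof_ptr_2pow = 3;	/* 8-byte pointers */
		size_t max_rounds =
		    (1LLU << (opt_mag_size_2pow - sizeof_ptr_2pow)) -
		    (sizeof(mag_t) >> sizeof_ptr_2pow) + 1;

		/*
		 * 64 pointer slots, minus 3 for the header, plus the
		 * rounds[1] slot already counted inside mag_t: 62 rounds.
		 */
		assert(max_rounds == 62);
		/* Header plus rounds fills the 512-byte magazine exactly. */
		assert(sizeof(mag_t) + (max_rounds - 1) * sizeof(void *) ==
		    512);
		return (0);
	}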
diff --git a/lib/libthr/thread/thr_exit.c b/lib/libthr/thread/thr_exit.c
index 4b31a60..4e14311 100644
--- a/lib/libthr/thread/thr_exit.c
+++ b/lib/libthr/thread/thr_exit.c
@@ -36,6 +36,7 @@
#include <pthread.h>
#include "un-namespace.h"
+#include "libc_private.h"
#include "thr_private.h"
void _pthread_exit(void *status);
@@ -95,6 +96,9 @@ _pthread_exit(void *status)
_thread_cleanupspecific();
}
+ /* Tell malloc that the thread is exiting. */
+ _malloc_thread_cleanup();
+
if (!_thr_isthreaded())
exit(0);