summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2008-07-28 11:32:33 -0700
committer: Ingo Molnar <mingo@elte.hu> 2008-07-28 22:20:41 +0200
commit: e56b3bc7942982ac2589c942fb345e38bc7a341a (patch)
tree: 8130492904f5bb9cff061f62ebb1c5d6eed3308b /kernel
parent: 414f746d232d41ed6ae8632c4495ae795373c44b (diff)
download: op-kernel-dev-e56b3bc7942982ac2589c942fb345e38bc7a341a.zip
op-kernel-dev-e56b3bc7942982ac2589c942fb345e38bc7a341a.tar.gz
cpu masks: optimize and clean up cpumask_of_cpu()
Clean up and optimize cpumask_of_cpu(), by sharing all the zero words. Instead of stupidly generating all possible i=0...NR_CPUS 2^i patterns creating a huge array of constant bitmasks, realize that the zero words can be shared. In other words, on a 64-bit architecture, we only ever need 64 of these arrays - with a different bit set in one single word (with enough zero words around it so that we can create any bitmask by just offsetting in that big array). And then we just put enough zeroes around it that we can point every single cpumask to be one of those things. So when we have 4k CPU's, instead of having 4k arrays (of 4k bits each, with one bit set in each array - 2MB memory total), we have exactly 64 arrays instead, each 8k bits in size (64kB total). And then we just point cpumask(n) to the right position (which we can calculate dynamically). Once we have the right arrays, getting "cpumask(n)" ends up being: static inline const cpumask_t *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return (const cpumask_t *)p; } This brings other advantages and simplifications as well: - we are not wasting memory that is just filled with a single bit in various different places - we don't need all those games to re-create the arrays in some dense format, because they're already going to be dense enough. if we compile a kernel for up to 4k CPU's, "wasting" that 64kB of memory is a non-issue (especially since by doing this "overlapping" trick we probably get better cache behaviour anyway). [ mingo@elte.hu: Converted Linus's mails into a commit. See: http://lkml.org/lkml/2008/7/27/156 http://lkml.org/lkml/2008/7/28/320 Also applied a family filter - which also has the side-effect of leaving out the bits where Linus calls me an idio... 
Oh, never mind ;-) ] Signed-off-by: Ingo Molnar <mingo@elte.hu> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Al Viro <viro@ZenIV.linux.org.uk> Cc: Mike Travis <travis@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/cpu.c 128
1 files changed, 20 insertions, 108 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a35d899..06a8358 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -462,115 +462,27 @@ out:
#endif /* CONFIG_SMP */
-/* 64 bits of zeros, for initializers. */
-#if BITS_PER_LONG == 32
-#define Z64 0, 0
-#else
-#define Z64 0
-#endif
+/*
+ * cpu_bit_bitmap[] is a special, "compressed" data structure that
+ * represents all NR_CPUS bits binary values of 1<<nr.
+ *
+ * It is used by cpumask_of_cpu() to get a constant address to a CPU
+ * mask value that has a single bit set only.
+ */
-/* Initializer macros. */
-#define CMI0(n) { .bits = { 1UL << (n) } }
-#define CMI(n, ...) { .bits = { __VA_ARGS__, 1UL << ((n) % BITS_PER_LONG) } }
-
-#define CMI8(n, ...) \
- CMI((n), __VA_ARGS__), CMI((n)+1, __VA_ARGS__), \
- CMI((n)+2, __VA_ARGS__), CMI((n)+3, __VA_ARGS__), \
- CMI((n)+4, __VA_ARGS__), CMI((n)+5, __VA_ARGS__), \
- CMI((n)+6, __VA_ARGS__), CMI((n)+7, __VA_ARGS__)
-
-#if BITS_PER_LONG == 32
-#define CMI64(n, ...) \
- CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__), \
- CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__), \
- CMI8((n)+32, 0, __VA_ARGS__), CMI8((n)+40, 0, __VA_ARGS__), \
- CMI8((n)+48, 0, __VA_ARGS__), CMI8((n)+56, 0, __VA_ARGS__)
-#else
-#define CMI64(n, ...) \
- CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__), \
- CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__), \
- CMI8((n)+32, __VA_ARGS__), CMI8((n)+40, __VA_ARGS__), \
- CMI8((n)+48, __VA_ARGS__), CMI8((n)+56, __VA_ARGS__)
-#endif
+/* cpu_bit_bitmap[0] is empty - so we can back into it */
+#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x)
+#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
+#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
+#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
-#define CMI256(n, ...) \
- CMI64((n), __VA_ARGS__), CMI64((n)+64, Z64, __VA_ARGS__), \
- CMI64((n)+128, Z64, Z64, __VA_ARGS__), \
- CMI64((n)+192, Z64, Z64, Z64, __VA_ARGS__)
-#define Z256 Z64, Z64, Z64, Z64
-
-#define CMI1024(n, ...) \
- CMI256((n), __VA_ARGS__), \
- CMI256((n)+256, Z256, __VA_ARGS__), \
- CMI256((n)+512, Z256, Z256, __VA_ARGS__), \
- CMI256((n)+768, Z256, Z256, Z256, __VA_ARGS__)
-#define Z1024 Z256, Z256, Z256, Z256
-
-/* We want this statically initialized, just to be safe. We try not
- * to waste too much space, either. */
-static const cpumask_t cpumask_map[]
-#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
-__initdata
-#endif
-= {
- CMI0(0), CMI0(1), CMI0(2), CMI0(3),
-#if NR_CPUS > 4
- CMI0(4), CMI0(5), CMI0(6), CMI0(7),
-#endif
-#if NR_CPUS > 8
- CMI0(8), CMI0(9), CMI0(10), CMI0(11),
- CMI0(12), CMI0(13), CMI0(14), CMI0(15),
-#endif
-#if NR_CPUS > 16
- CMI0(16), CMI0(17), CMI0(18), CMI0(19),
- CMI0(20), CMI0(21), CMI0(22), CMI0(23),
- CMI0(24), CMI0(25), CMI0(26), CMI0(27),
- CMI0(28), CMI0(29), CMI0(30), CMI0(31),
-#endif
-#if NR_CPUS > 32
-#if BITS_PER_LONG == 32
- CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0),
- CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0),
- CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0),
- CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0),
- CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0),
- CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0),
- CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0),
- CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0),
-#else
- CMI0(32), CMI0(33), CMI0(34), CMI0(35),
- CMI0(36), CMI0(37), CMI0(38), CMI0(39),
- CMI0(40), CMI0(41), CMI0(42), CMI0(43),
- CMI0(44), CMI0(45), CMI0(46), CMI0(47),
- CMI0(48), CMI0(49), CMI0(50), CMI0(51),
- CMI0(52), CMI0(53), CMI0(54), CMI0(55),
- CMI0(56), CMI0(57), CMI0(58), CMI0(59),
- CMI0(60), CMI0(61), CMI0(62), CMI0(63),
-#endif /* BITS_PER_LONG == 64 */
-#endif
-#if NR_CPUS > 64
- CMI64(64, Z64),
-#endif
-#if NR_CPUS > 128
- CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64),
-#endif
-#if NR_CPUS > 256
- CMI256(256, Z256),
-#endif
-#if NR_CPUS > 512
- CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256),
-#endif
-#if NR_CPUS > 1024
- CMI1024(1024, Z1024),
-#endif
-#if NR_CPUS > 2048
- CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024),
-#endif
-#if NR_CPUS > 4096
-#error NR_CPUS too big. Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP
+const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
+
+ MASK_DECLARE_8(0), MASK_DECLARE_8(8),
+ MASK_DECLARE_8(16), MASK_DECLARE_8(24),
+#if BITS_PER_LONG > 32
+ MASK_DECLARE_8(32), MASK_DECLARE_8(40),
+ MASK_DECLARE_8(48), MASK_DECLARE_8(56),
#endif
};
-
-const cpumask_t *cpumask_of_cpu_map = cpumask_map;
-
-EXPORT_SYMBOL_GPL(cpumask_of_cpu_map);
+EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
OpenPOWER on IntegriCloud