From e7c8d5c9955a4d2e88e36b640563f5d6d5aba48a Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Tue, 21 Jun 2005 17:14:47 -0700
Subject: [PATCH] node local per-cpu-pages

This patch modifies the way pagesets in struct zone are managed.

Each zone has a per-cpu array of pagesets, so any particular CPU owns some
memory in every zone structure, even when that CPU is not local to that
zone.  This patch relocates the pageset for each CPU to the node nearest
to that CPU, instead of allocating the pagesets in the (possibly remote)
target zone.  As a result, the operations that manage pages in a remote
zone can be done with information that is available locally.

We play a macro trick so that non-NUMA machines avoid the additional
pointer chase on the page allocator fast path (see the simplified sketch
after the patch).

AIM7 benchmark on a 32 CPU SGI Altix:

w/o patches:
Tasks    jobs/min  jti  jobs/min/task      real       cpu
    1      484.68  100       484.6769     12.01      1.97   Fri Mar 25 11:01:42 2005
  100    27140.46   89       271.4046     21.44    148.71   Fri Mar 25 11:02:04 2005
  200    30792.02   82       153.9601     37.80    296.72   Fri Mar 25 11:02:42 2005
  300    32209.27   81       107.3642     54.21    451.34   Fri Mar 25 11:03:37 2005
  400    34962.83   78        87.4071     66.59    588.97   Fri Mar 25 11:04:44 2005
  500    31676.92   75        63.3538     91.87    742.71   Fri Mar 25 11:06:16 2005
  600    36032.69   73        60.0545     96.91    885.44   Fri Mar 25 11:07:54 2005
  700    35540.43   77        50.7720    114.63   1024.28   Fri Mar 25 11:09:49 2005
  800    33906.70   74        42.3834    137.32   1181.65   Fri Mar 25 11:12:06 2005
  900    34120.67   73        37.9119    153.51   1325.26   Fri Mar 25 11:14:41 2005
 1000    34802.37   74        34.8024    167.23   1465.26   Fri Mar 25 11:17:28 2005

with slab API changes and pageset patch:
Tasks    jobs/min  jti  jobs/min/task      real       cpu
    1      485.00  100       485.0000     12.00      1.96   Fri Mar 25 11:46:18 2005
  100    28000.96   89       280.0096     20.79    150.45   Fri Mar 25 11:46:39 2005
  200    32285.80   79       161.4290     36.05    293.37   Fri Mar 25 11:47:16 2005
  300    40424.15   84       134.7472     43.19    438.42   Fri Mar 25 11:47:59 2005
  400    39155.01   79        97.8875     59.46    590.05   Fri Mar 25 11:48:59 2005
  500    37881.25   82        75.7625     76.82    730.19   Fri Mar 25 11:50:16 2005
  600    39083.14   78        65.1386     89.35    872.79   Fri Mar 25 11:51:46 2005
  700    38627.83   77        55.1826    105.47   1022.46   Fri Mar 25 11:53:32 2005
  800    39631.94   78        49.5399    117.48   1169.94   Fri Mar 25 11:55:30 2005
  900    36903.70   79        41.0041    141.94   1310.78   Fri Mar 25 11:57:53 2005
 1000    36201.23   77        36.2012    160.77   1458.31   Fri Mar 25 12:00:34 2005

Signed-off-by: Christoph Lameter
Signed-off-by: Shobhit Dayal
Signed-off-by: Shai Fultheim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/base/node.c    |   2 +-
 include/linux/mm.h     |   6 ++
 include/linux/mmzone.h |  11 ++-
 init/main.c            |   1 +
 mm/mempolicy.c         |   2 +-
 mm/page_alloc.c        | 211 +++++++++++++++++++++++++++++++++++++++++--------
 6 files changed, 195 insertions(+), 38 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 5d4517c..904b27c 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -87,7 +87,7 @@ static ssize_t node_read_numastat(struct sys_device * dev, char * buf)
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		struct zone *z = &pg->node_zones[i];
 		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			struct per_cpu_pageset *ps = &z->pageset[cpu];
+			struct per_cpu_pageset *ps = zone_pcp(z,cpu);
 			numa_hit += ps->numa_hit;
 			numa_miss += ps->numa_miss;
 			numa_foreign += ps->numa_foreign;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 17518fe..1813b16 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -691,6 +691,12 @@ extern void show_mem(void);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
 
+#ifdef CONFIG_NUMA
+extern void setup_per_cpu_pageset(void);
+#else
+static inline void setup_per_cpu_pageset(void) {}
+#endif
+
 /* prio_tree.c */
 void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
 void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 18fed8b..4733d35 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -63,6 +63,12 @@ struct per_cpu_pageset {
 #endif
 } ____cacheline_aligned_in_smp;
 
+#ifdef CONFIG_NUMA
+#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
+#else
+#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
+#endif
+
 #define ZONE_DMA		0
 #define ZONE_NORMAL		1
 #define ZONE_HIGHMEM		2
@@ -122,8 +128,11 @@ struct zone {
 	 */
 	unsigned long		lowmem_reserve[MAX_NR_ZONES];
 
+#ifdef CONFIG_NUMA
+	struct per_cpu_pageset	*pageset[NR_CPUS];
+#else
 	struct per_cpu_pageset	pageset[NR_CPUS];
-
+#endif
 	/*
 	 * free areas of different sizes
 	 */
diff --git a/init/main.c b/init/main.c
index 40bf367..d324801 100644
--- a/init/main.c
+++ b/init/main.c
@@ -490,6 +490,7 @@ asmlinkage void __init start_kernel(void)
 	vfs_caches_init_early();
 	mem_init();
 	kmem_cache_init();
+	setup_per_cpu_pageset();
 	numa_policy_init();
 	if (late_time_init)
 		late_time_init();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08c41da..39252c7 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -721,7 +721,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
 	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
 	page = __alloc_pages(gfp, order, zl);
 	if (page && page_zone(page) == zl->zones[0]) {
-		zl->zones[0]->pageset[get_cpu()].interleave_hit++;
+		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
 		put_cpu();
 	}
 	return page;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2019c1b..95cbd30 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -71,6 +71,11 @@ EXPORT_SYMBOL(nr_swap_pages);
 struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
 EXPORT_SYMBOL(zone_table);
 
+#ifdef CONFIG_NUMA
+static struct per_cpu_pageset
+	pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata;
+#endif
+
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
@@ -520,7 +525,7 @@ static void __drain_pages(unsigned int cpu)
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset;
 
-		pset = &zone->pageset[cpu];
+		pset = zone_pcp(zone, cpu);
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
@@ -583,12 +588,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
 	local_irq_save(flags);
 	cpu = smp_processor_id();
-	p = &z->pageset[cpu];
+	p = zone_pcp(z,cpu);
 	if (pg == orig) {
-		z->pageset[cpu].numa_hit++;
+		p->numa_hit++;
 	} else {
 		p->numa_miss++;
-		zonelist->zones[0]->pageset[cpu].numa_foreign++;
+		zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
 	}
 	if (pg == NODE_DATA(numa_node_id()))
 		p->local_node++;
@@ -615,7 +620,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 	if (PageAnon(page))
 		page->mapping = NULL;
 	free_pages_check(__FUNCTION__, page);
-	pcp = &zone->pageset[get_cpu()].pcp[cold];
+	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
 	if (pcp->count >= pcp->high)
 		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -659,7 +664,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
 	if (order == 0) {
 		struct per_cpu_pages *pcp;
 
-		pcp = &zone->pageset[get_cpu()].pcp[cold];
+		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 		local_irq_save(flags);
 		if (pcp->count <= pcp->low)
 			pcp->count += rmqueue_bulk(zone, 0,
@@ -1262,7 +1267,7 @@ void show_free_areas(void)
 		if (!cpu_possible(cpu))
 			continue;
 
-		pageset = zone->pageset + cpu;
+		pageset = zone_pcp(zone, cpu);
 
 		for (temperature = 0; temperature < 2; temperature++)
 			printk("cpu %d %s: low %d, high %d, batch %d\n",
@@ -1645,6 +1650,157 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
 #endif
 
+static int __devinit zone_batchsize(struct zone *zone)
+{
+	int batch;
+
+	/*
+	 * The per-cpu-pages pools are set to around 1000th of the
+	 * size of the zone.  But no more than 1/4 of a meg - there's
+	 * no point in going beyond the size of L2 cache.
+	 *
+	 * OK, so we don't know how big the cache is.  So guess.
+	 */
+	batch = zone->present_pages / 1024;
+	if (batch * PAGE_SIZE > 256 * 1024)
+		batch = (256 * 1024) / PAGE_SIZE;
+	batch /= 4;		/* We effectively *= 4 below */
+	if (batch < 1)
+		batch = 1;
+
+	/*
+	 * Clamp the batch to a 2^n - 1 value. Having a power
+	 * of 2 value was found to be more likely to have
+	 * suboptimal cache aliasing properties in some cases.
+	 *
+	 * For example if 2 tasks are alternately allocating
+	 * batches of pages, one task can end up with a lot
+	 * of pages of one half of the possible page colors
+	 * and the other with pages of the other colors.
+	 */
+	batch = (1 << fls(batch + batch/2)) - 1;
+	return batch;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __devinit process_zones(int cpu)
+{
+	struct zone *zone, *dzone;
+	int i;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *npageset = NULL;
+
+		npageset = kmalloc_node(sizeof(struct per_cpu_pageset),
+					GFP_KERNEL, cpu_to_node(cpu));
+		if (!npageset) {
+			zone->pageset[cpu] = NULL;
+			goto bad;
+		}
+
+		if (zone->pageset[cpu]) {
+			memcpy(npageset, zone->pageset[cpu],
+					sizeof(struct per_cpu_pageset));
+
+			/* Relocate lists */
+			for (i = 0; i < 2; i++) {
+				INIT_LIST_HEAD(&npageset->pcp[i].list);
+				list_splice(&zone->pageset[cpu]->pcp[i].list,
+					&npageset->pcp[i].list);
+			}
+		} else {
+			struct per_cpu_pages *pcp;
+			unsigned long batch;
+
+			batch = zone_batchsize(zone);
+
+			pcp = &npageset->pcp[0];	/* hot */
+			pcp->count = 0;
+			pcp->low = 2 * batch;
+			pcp->high = 6 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+
+			pcp = &npageset->pcp[1];	/* cold */
+			pcp->count = 0;
+			pcp->low = 0;
+			pcp->high = 2 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+		}
+		zone->pageset[cpu] = npageset;
+	}
+
+	return 0;
+bad:
+	for_each_zone(dzone) {
+		if (dzone == zone)
+			break;
+		kfree(dzone->pageset[cpu]);
+		dzone->pageset[cpu] = NULL;
+	}
+	return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+#ifdef CONFIG_NUMA
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+		zone_pcp(zone, cpu) = NULL;
+		kfree(pset);
+	}
+#endif
+}
+
+static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	int cpu = (long)hcpu;
+	int ret = NOTIFY_OK;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		if (process_zones(cpu))
+			ret = NOTIFY_BAD;
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		free_zone_pagesets(cpu);
+		break;
+#endif
+	default:
+		break;
+	}
+	return ret;
+}
+
+static struct notifier_block pageset_notifier =
+	{ &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset()
+{
+	int err;
+
+	/* Initialize per_cpu_pageset for cpu 0.
+	 * A cpuup callback will do this for every cpu
+	 * as it comes online
+	 */
+	err = process_zones(smp_processor_id());
+	BUG_ON(err);
+	register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
 /*
  * Set up the zone data structures:
  *  - mark all pages reserved
@@ -1687,43 +1843,28 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
 		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 
-		/*
-		 * The per-cpu-pages pools are set to around 1000th of the
-		 * size of the zone.  But no more than 1/4 of a meg - there's
-		 * no point in going beyond the size of L2 cache.
-		 *
-		 * OK, so we don't know how big the cache is.  So guess.
-		 */
-		batch = zone->present_pages / 1024;
-		if (batch * PAGE_SIZE > 256 * 1024)
-			batch = (256 * 1024) / PAGE_SIZE;
-		batch /= 4;		/* We effectively *= 4 below */
-		if (batch < 1)
-			batch = 1;
-
-		/*
-		 * Clamp the batch to a 2^n - 1 value. Having a power
-		 * of 2 value was found to be more likely to have
-		 * suboptimal cache aliasing properties in some cases.
-		 *
-		 * For example if 2 tasks are alternately allocating
-		 * batches of pages, one task can end up with a lot
-		 * of pages of one half of the possible page colors
-		 * and the other with pages of the other colors.
-		 */
-		batch = (1 << fls(batch + batch/2)) - 1;
+		batch = zone_batchsize(zone);
 
 		for (cpu = 0; cpu < NR_CPUS; cpu++) {
 			struct per_cpu_pages *pcp;
+#ifdef CONFIG_NUMA
+			struct per_cpu_pageset *pgset;
+			pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS +
+					(j * NR_CPUS) + cpu];
+
+			zone->pageset[cpu] = pgset;
+#else
+			struct per_cpu_pageset *pgset = zone_pcp(zone, cpu);
+#endif
 
-			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
+			pcp = &pgset->pcp[0];			/* hot */
 			pcp->count = 0;
 			pcp->low = 2 * batch;
 			pcp->high = 6 * batch;
 			pcp->batch = 1 * batch;
 			INIT_LIST_HEAD(&pcp->list);
 
-			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
+			pcp = &pgset->pcp[1];			/* cold */
 			pcp->count = 0;
 			pcp->low = 0;
 			pcp->high = 2 * batch;
@@ -1929,7 +2070,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 			struct per_cpu_pageset *pageset;
 			int j;
 
-			pageset = &zone->pageset[i];
+			pageset = zone_pcp(zone, i);
 			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
 				if (pageset->pcp[j].count)
 					break;
-- 
cgit v1.1
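
To make the zone_pcp() macro trick from the changelog easier to see in
isolation, here is a minimal, self-contained userspace C sketch.  This is
not kernel code: the struct contents, the NR_CPUS value of 4, and the
calloc() call standing in for kmalloc_node(..., cpu_to_node(cpu)) are
simplifications added for illustration.  On a NUMA build the zone carries
per-CPU pointers so each pageset can be placed on the owning CPU's node;
on a non-NUMA build the pagesets stay embedded in the zone and the same
accessor compiles to a plain address-of, so the fast path pays no extra
pointer chase.

/*
 * Standalone sketch of the zone_pcp() accessor from the patch above.
 * Build UMA variant:   cc sketch.c
 * Build NUMA variant:  cc -DCONFIG_NUMA sketch.c
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct per_cpu_pageset {
	int count;			/* pages cached for this CPU */
};

#ifdef CONFIG_NUMA
struct zone {
	/* per-CPU pointers: each pageset can live on the CPU's own node */
	struct per_cpu_pageset *pageset[NR_CPUS];
};
#define zone_pcp(z, cpu) ((z)->pageset[(cpu)])
#else
struct zone {
	/* pagesets embedded in the zone: no extra pointer chase */
	struct per_cpu_pageset pageset[NR_CPUS];
};
#define zone_pcp(z, cpu) (&(z)->pageset[(cpu)])
#endif

int main(void)
{
	static struct zone zone;
	int cpu;

#ifdef CONFIG_NUMA
	/* in the kernel this allocation is done per CPU by process_zones() */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		zone.pageset[cpu] = calloc(1, sizeof(struct per_cpu_pageset));
#endif

	/* the call site is identical in both configurations */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		zone_pcp(&zone, cpu)->count = cpu;

	printf("cpu 3 pageset count: %d\n", zone_pcp(&zone, 3)->count);
	return 0;
}

Both variants print the same result through the identical zone_pcp() call
site; only the NUMA build pays the extra dereference, which is the cost the
patch confines to NUMA kernels.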