diff options
-rw-r--r-- | include/linux/mmzone.h | 1 | ||||
-rw-r--r-- | mm/vmstat.c | 139 |
2 files changed, 120 insertions, 20 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 656b588..f45163c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -77,6 +77,7 @@ struct per_cpu_pages { struct per_cpu_pageset { struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ #ifdef CONFIG_SMP + s8 stat_threshold; s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; #endif } ____cacheline_aligned_in_smp; diff --git a/mm/vmstat.c b/mm/vmstat.c index 3799a0f..c1b5f41 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -12,6 +12,7 @@ #include <linux/config.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/cpu.h> void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat) @@ -114,17 +115,72 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP -#define STAT_THRESHOLD 32 +static int calculate_threshold(struct zone *zone) +{ + int threshold; + int mem; /* memory in 128 MB units */ + + /* + * The threshold scales with the number of processors and the amount + * of memory per zone. More memory means that we can defer updates for + * longer, more processors could lead to more contention. + * fls() is used to have a cheap way of logarithmic scaling. + * + * Some sample thresholds: + * + * Threshold Processors (fls) Zonesize fls(mem+1) + * ------------------------------------------------------------------ + * 8 1 1 0.9-1 GB 4 + * 16 2 2 0.9-1 GB 4 + * 20 2 2 1-2 GB 5 + * 24 2 2 2-4 GB 6 + * 28 2 2 4-8 GB 7 + * 32 2 2 8-16 GB 8 + * 4 2 2 <128M 1 + * 30 4 3 2-4 GB 5 + * 48 4 3 8-16 GB 8 + * 32 8 4 1-2 GB 4 + * 32 8 4 0.9-1GB 4 + * 10 16 5 <128M 1 + * 40 16 5 900M 4 + * 70 64 7 2-4 GB 5 + * 84 64 7 4-8 GB 6 + * 108 512 9 4-8 GB 6 + * 125 1024 10 8-16 GB 8 + * 125 1024 10 16-32 GB 9 + */ + + mem = zone->present_pages >> (27 - PAGE_SHIFT); + + threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} /* - * Determine pointer to currently valid differential byte given a zone and - * the item number. - * - * Preemption must be off + * Refresh the thresholds for each zone. */ -static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) +static void refresh_zone_stat_thresholds(void) { - return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item]; + struct zone *zone; + int cpu; + int threshold; + + for_each_zone(zone) { + + if (!zone->present_pages) + continue; + + threshold = calculate_threshold(zone); + + for_each_online_cpu(cpu) + zone_pcp(zone, cpu)->stat_threshold = threshold; + } } /* @@ -133,17 +189,16 @@ static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - s8 *p; + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; long x; - p = diff_pointer(zone, item); x = delta + *p; - if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) { + if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { zone_page_state_add(x, zone, item); x = 0; } - *p = x; } EXPORT_SYMBOL(__mod_zone_page_state); @@ -172,10 +227,12 @@ EXPORT_SYMBOL(mod_zone_page_state); * No overflow check is necessary and therefore the differential can be * incremented or decremented in place which may allow the compilers to * generate better code. - * * The increment or decrement is known and therefore one boundary check can * be omitted. * + * NOTE: These functions are very performance sensitive. Change only + * with care. + * * Some processors have inc/dec instructions that are atomic vs an interrupt. * However, the code must first determine the differential location in a zone * based on the processor number and then inc/dec the counter. There is no @@ -185,13 +242,16 @@ EXPORT_SYMBOL(mod_zone_page_state); */ static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - s8 *p = diff_pointer(zone, item); + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; (*p)++; - if (unlikely(*p > STAT_THRESHOLD)) { - zone_page_state_add(*p + STAT_THRESHOLD / 2, zone, item); - *p = -STAT_THRESHOLD / 2; + if (unlikely(*p > pcp->stat_threshold)) { + int overstep = pcp->stat_threshold / 2; + + zone_page_state_add(*p + overstep, zone, item); + *p = -overstep; } } @@ -204,13 +264,16 @@ EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { struct zone *zone = page_zone(page); - s8 *p = diff_pointer(zone, item); + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; (*p)--; - if (unlikely(*p < -STAT_THRESHOLD)) { - zone_page_state_add(*p - STAT_THRESHOLD / 2, zone, item); - *p = STAT_THRESHOLD /2; + if (unlikely(*p < - pcp->stat_threshold)) { + int overstep = pcp->stat_threshold / 2; + + zone_page_state_add(*p - overstep, zone, item); + *p = overstep; } } EXPORT_SYMBOL(__dec_zone_page_state); @@ -515,6 +578,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) pageset->pcp[j].high, pageset->pcp[j].batch); } +#ifdef CONFIG_SMP + seq_printf(m, "\n vm stats threshold: %d", + pageset->stat_threshold); +#endif } seq_printf(m, "\n all_unreclaimable: %u" @@ -603,3 +670,35 @@ struct seq_operations vmstat_op = { #endif /* CONFIG_PROC_FS */ +#ifdef CONFIG_SMP +/* + * Use the cpu notifier to insure that the thresholds are recalculated + * when necessary. + */ +static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_CANCELED: + case CPU_DEAD: + refresh_zone_stat_thresholds(); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata vmstat_notifier = + { &vmstat_cpuup_callback, NULL, 0 }; + +int __init setup_vmstat(void) +{ + refresh_zone_stat_thresholds(); + register_cpu_notifier(&vmstat_notifier); + return 0; +} +module_init(setup_vmstat) +#endif |