diff options
author | Christoph Lameter <clameter@sgi.com> | 2006-06-30 01:55:45 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-06-30 11:25:36 -0700 |
commit | f8891e5e1f93a128c3900f82035e8541357896a7 (patch) | |
tree | 97b078ac97970962b17c85d39fd64cb48dc01168 /mm | |
parent | ca889e6c45e0b112cb2ca9d35afc66297519b5d5 (diff) | |
download | op-kernel-dev-f8891e5e1f93a128c3900f82035e8541357896a7.zip op-kernel-dev-f8891e5e1f93a128c3900f82035e8541357896a7.tar.gz |
[PATCH] Light weight event counters
The remaining counters in page_state after the zoned VM counter patches
have been applied are all just for show in /proc/vmstat. They have no
essential function for the VM.
We use a simple increment of per cpu variables. In order to avoid the most
severe races we disable preempt. Preempt does not prevent the race between
an increment and an interrupt handler incrementing the same statistics
counter. However, that race is exceedingly rare, we may only loose one
increment or so and there is no requirement (at least not in kernel) that
the vm event counters have to be accurate.
In the non preempt case this results in a simple increment for each
counter. For many architectures this will be reduced by the compiler to a
single instruction. This single instruction is atomic for i386 and x86_64.
And therefore even the rare race condition in an interrupt is avoided for
both architectures in most cases.
The patchset also adds an off switch for embedded systems that allows a
building of linux kernels without these counters.
The implementation of these counters is through inline code that hopefully
results in only a single instruction increment instruction being emitted
(i386, x86_64) or in the increment being hidden though instruction
concurrency (EPIC architectures such as ia64 can get that done).
Benefits:
- VM event counter operations usually reduce to a single inline instruction
on i386 and x86_64.
- No interrupt disable, only preempt disable for the preempt case.
Preempt disable can also be avoided by moving the counter into a spinlock.
- Handling is similar to zoned VM counters.
- Simple and easily extendable.
- Can be omitted to reduce memory use for embedded use.
References:
RFC http://marc.theaimsgroup.com/?l=linux-kernel&m=113512330605497&w=2
RFC http://marc.theaimsgroup.com/?l=linux-kernel&m=114988082814934&w=2
local_t http://marc.theaimsgroup.com/?l=linux-kernel&m=114991748606690&w=2
V2 http://marc.theaimsgroup.com/?t=115014808400007&r=1&w=2
V3 http://marc.theaimsgroup.com/?l=linux-kernel&m=115024767022346&w=2
V4 http://marc.theaimsgroup.com/?l=linux-kernel&m=115047968808926&w=2
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 4 | ||||
-rw-r--r-- | mm/memory.c | 4 | ||||
-rw-r--r-- | mm/page_alloc.c | 21 | ||||
-rw-r--r-- | mm/page_io.c | 4 | ||||
-rw-r--r-- | mm/shmem.c | 4 | ||||
-rw-r--r-- | mm/swap.c | 4 | ||||
-rw-r--r-- | mm/vmscan.c | 23 | ||||
-rw-r--r-- | mm/vmstat.c | 171 |
8 files changed, 114 insertions, 121 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 87d62c4..796a547 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1416,7 +1416,7 @@ retry_find: */ if (!did_readaround) { majmin = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); + count_vm_event(PGMAJFAULT); } did_readaround = 1; ra_pages = max_sane_readahead(file->f_ra.ra_pages); @@ -1487,7 +1487,7 @@ no_cached_page: page_not_uptodate: if (!did_readaround) { majmin = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); + count_vm_event(PGMAJFAULT); } lock_page(page); diff --git a/mm/memory.c b/mm/memory.c index 1a78791..7e2a4b1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1951,7 +1951,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); + count_vm_event(PGMAJFAULT); grab_swap_token(); } @@ -2324,7 +2324,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, __set_current_state(TASK_RUNNING); - inc_page_state(pgfault); + count_vm_event(PGFAULT); if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, write_access); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d616712..30b0b97 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -456,7 +456,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) kernel_map_pages(page, 1 << order, 0); local_irq_save(flags); - __mod_page_state(pgfree, 1 << order); + __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, order); local_irq_restore(flags); } @@ -729,7 +729,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); - __inc_page_state(pgfree); + __count_vm_event(PGFREE); list_add(&page->lru, &pcp->list); pcp->count++; if (pcp->count >= pcp->high) { @@ -805,7 +805,7 @@ again: goto failed; } - __mod_page_state_zone(zone, pgalloc, 1 << order); + __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(zonelist, zone); local_irq_restore(flags); put_cpu(); @@ -2101,24 +2101,11 @@ static int page_alloc_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { int cpu = (unsigned long)hcpu; - unsigned long *src, *dest; if (action == CPU_DEAD) { - int i; - local_irq_disable(); __drain_pages(cpu); - - /* Add dead cpu's page_states to our own. */ - dest = (unsigned long *)&__get_cpu_var(page_states); - src = (unsigned long *)&per_cpu(page_states, cpu); - - for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); - i++) { - dest[i] += src[i]; - src[i] = 0; - } - + vm_events_fold_cpu(cpu); local_irq_enable(); refresh_cpu_vm_stats(cpu); } diff --git a/mm/page_io.c b/mm/page_io.c index bb2b0d5..8802994 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -101,7 +101,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) } if (wbc->sync_mode == WB_SYNC_ALL) rw |= (1 << BIO_RW_SYNC); - inc_page_state(pswpout); + count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); submit_bio(rw, bio); @@ -123,7 +123,7 @@ int swap_readpage(struct file *file, struct page *page) ret = -ENOMEM; goto out; } - inc_page_state(pswpin); + count_vm_event(PSWPIN); submit_bio(READ, bio); out: return ret; @@ -1045,12 +1045,12 @@ repeat: swappage = lookup_swap_cache(swap); if (!swappage) { shmem_swp_unmap(entry); - spin_unlock(&info->lock); /* here we actually do the io */ if (type && *type == VM_FAULT_MINOR) { - inc_page_state(pgmajfault); + __count_vm_event(PGMAJFAULT); *type = VM_FAULT_MAJOR; } + spin_unlock(&info->lock); swappage = shmem_swapin(info, swap, idx); if (!swappage) { spin_lock(&info->lock); @@ -87,7 +87,7 @@ int rotate_reclaimable_page(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); if (PageLRU(page) && !PageActive(page)) { list_move_tail(&page->lru, &zone->inactive_list); - inc_page_state(pgrotated); + __count_vm_event(PGROTATED); } if (!test_clear_page_writeback(page)) BUG(); @@ -107,7 +107,7 @@ void fastcall activate_page(struct page *page) del_page_from_inactive_list(zone, page); SetPageActive(page); add_page_to_active_list(zone, page); - inc_page_state(pgactivate); + __count_vm_event(PGACTIVATE); } spin_unlock_irq(&zone->lru_lock); } diff --git a/mm/vmscan.c b/mm/vmscan.c index d694243..ff2ebe9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -215,7 +215,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, break; if (shrink_ret < nr_before) ret += nr_before - shrink_ret; - mod_page_state(slabs_scanned, this_scan); + count_vm_events(SLABS_SCANNED, this_scan); total_scan -= this_scan; cond_resched(); @@ -569,7 +569,7 @@ keep: list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) __pagevec_release_nonlru(&freed_pvec); - mod_page_state(pgactivate, pgactivate); + count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } @@ -659,11 +659,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, nr_reclaimed += nr_freed; local_irq_disable(); if (current_is_kswapd()) { - __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); - __mod_page_state(kswapd_steal, nr_freed); + __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); + __count_vm_events(KSWAPD_STEAL, nr_freed); } else - __mod_page_state_zone(zone, pgscan_direct, nr_scan); - __mod_page_state_zone(zone, pgsteal, nr_freed); + __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); + __count_vm_events(PGACTIVATE, nr_freed); if (nr_taken == 0) goto done; @@ -841,11 +841,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } } zone->nr_active += pgmoved; - spin_unlock(&zone->lru_lock); - __mod_page_state_zone(zone, pgrefill, pgscanned); - __mod_page_state(pgdeactivate, pgdeactivate); - local_irq_enable(); + __count_zone_vm_events(PGREFILL, zone, pgscanned); + __count_vm_events(PGDEACTIVATE, pgdeactivate); + spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); } @@ -977,7 +976,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) .swappiness = vm_swappiness, }; - inc_page_state(allocstall); + count_vm_event(ALLOCSTALL); for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; @@ -1074,7 +1073,7 @@ loop_again: total_scanned = 0; nr_reclaimed = 0; sc.may_writepage = !laptop_mode; - inc_page_state(pageoutrun); + count_vm_event(PAGEOUTRUN); for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; diff --git a/mm/vmstat.c b/mm/vmstat.c index ee7f896..73b83d6 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -13,66 +13,6 @@ #include <linux/mm.h> #include <linux/module.h> -/* - * Accumulate the page_state information across all CPUs. - * The result is unavoidably approximate - it can change - * during and after execution of this function. - */ -DEFINE_PER_CPU(struct page_state, page_states) = {0}; - -static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) -{ - unsigned cpu; - - memset(ret, 0, nr * sizeof(unsigned long)); - cpus_and(*cpumask, *cpumask, cpu_online_map); - - for_each_cpu_mask(cpu, *cpumask) { - unsigned long *in; - unsigned long *out; - unsigned off; - unsigned next_cpu; - - in = (unsigned long *)&per_cpu(page_states, cpu); - - next_cpu = next_cpu(cpu, *cpumask); - if (likely(next_cpu < NR_CPUS)) - prefetch(&per_cpu(page_states, next_cpu)); - - out = (unsigned long *)ret; - for (off = 0; off < nr; off++) - *out++ += *in++; - } -} - -void get_full_page_state(struct page_state *ret) -{ - cpumask_t mask = CPU_MASK_ALL; - - __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); -} - -void __mod_page_state_offset(unsigned long offset, unsigned long delta) -{ - void *ptr; - - ptr = &__get_cpu_var(page_states); - *(unsigned long *)(ptr + offset) += delta; -} -EXPORT_SYMBOL(__mod_page_state_offset); - -void mod_page_state_offset(unsigned long offset, unsigned long delta) -{ - unsigned long flags; - void *ptr; - - local_irq_save(flags); - ptr = &__get_cpu_var(page_states); - *(unsigned long *)(ptr + offset) += delta; - local_irq_restore(flags); -} -EXPORT_SYMBOL(mod_page_state_offset); - void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat) { @@ -106,6 +46,63 @@ void get_zone_counts(unsigned long *active, } } +#ifdef CONFIG_VM_EVENT_COUNTERS +DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; +EXPORT_PER_CPU_SYMBOL(vm_event_states); + +static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) +{ + int cpu = 0; + int i; + + memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); + + cpu = first_cpu(*cpumask); + while (cpu < NR_CPUS) { + struct vm_event_state *this = &per_cpu(vm_event_states, cpu); + + cpu = next_cpu(cpu, *cpumask); + + if (cpu < NR_CPUS) + prefetch(&per_cpu(vm_event_states, cpu)); + + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + ret[i] += this->event[i]; + } +} + +/* + * Accumulate the vm event counters across all CPUs. + * The result is unavoidably approximate - it can change + * during and after execution of this function. +*/ +void all_vm_events(unsigned long *ret) +{ + sum_vm_events(ret, &cpu_online_map); +} + +#ifdef CONFIG_HOTPLUG +/* + * Fold the foreign cpu events into our own. + * + * This is adding to the events on one processor + * but keeps the global counts constant. + */ +void vm_events_fold_cpu(int cpu) +{ + struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu); + int i; + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { + count_vm_events(i, fold_state->event[i]); + fold_state->event[i] = 0; + } +} +#endif /* CONFIG_HOTPLUG */ + +#endif /* CONFIG_VM_EVENT_COUNTERS */ + /* * Manage combined zone based / global counters * @@ -405,16 +402,16 @@ static char *vmstat_text[] = { "numa_other", #endif - /* Event counters */ +#ifdef CONFIG_VM_EVENT_COUNTERS "pgpgin", "pgpgout", "pswpin", "pswpout", - "pgalloc_high", - "pgalloc_normal", - "pgalloc_dma32", "pgalloc_dma", + "pgalloc_dma32", + "pgalloc_normal", + "pgalloc_high", "pgfree", "pgactivate", @@ -423,25 +420,25 @@ static char *vmstat_text[] = { "pgfault", "pgmajfault", - "pgrefill_high", - "pgrefill_normal", - "pgrefill_dma32", "pgrefill_dma", + "pgrefill_dma32", + "pgrefill_normal", + "pgrefill_high", - "pgsteal_high", - "pgsteal_normal", - "pgsteal_dma32", "pgsteal_dma", + "pgsteal_dma32", + "pgsteal_normal", + "pgsteal_high", - "pgscan_kswapd_high", - "pgscan_kswapd_normal", - "pgscan_kswapd_dma32", "pgscan_kswapd_dma", + "pgscan_kswapd_dma32", + "pgscan_kswapd_normal", + "pgscan_kswapd_high", - "pgscan_direct_high", - "pgscan_direct_normal", - "pgscan_direct_dma32", "pgscan_direct_dma", + "pgscan_direct_dma32", + "pgscan_direct_normal", + "pgscan_direct_high", "pginodesteal", "slabs_scanned", @@ -451,6 +448,7 @@ static char *vmstat_text[] = { "allocstall", "pgrotated", +#endif }; /* @@ -553,23 +551,32 @@ struct seq_operations zoneinfo_op = { static void *vmstat_start(struct seq_file *m, loff_t *pos) { unsigned long *v; - struct page_state *ps; +#ifdef CONFIG_VM_EVENT_COUNTERS + unsigned long *e; +#endif int i; if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; +#ifdef CONFIG_VM_EVENT_COUNTERS v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) - + sizeof(*ps), GFP_KERNEL); + + sizeof(struct vm_event_state), GFP_KERNEL); +#else + v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), + GFP_KERNEL); +#endif m->private = v; if (!v) return ERR_PTR(-ENOMEM); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) v[i] = global_page_state(i); - ps = (struct page_state *)(v + NR_VM_ZONE_STAT_ITEMS); - get_full_page_state(ps); - ps->pgpgin /= 2; /* sectors -> kbytes */ - ps->pgpgout /= 2; +#ifdef CONFIG_VM_EVENT_COUNTERS + e = v + NR_VM_ZONE_STAT_ITEMS; + all_vm_events(e); + e[PGPGIN] /= 2; /* sectors -> kbytes */ + e[PGPGOUT] /= 2; +#endif return v + *pos; } |