diff options
author | Hugh Dickins <hugh@veritas.com> | 2005-10-29 18:16:18 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2005-10-29 21:40:39 -0700 |
commit | 365e9c87a982c03d0af3886e29d877f581b59611 (patch) | |
tree | d06c1918ca9fe6677d7e4e869555e095004274f7 /mm | |
parent | 861f2fb8e796022b4928cab9c74fca6681a1c557 (diff) | |
download | op-kernel-dev-365e9c87a982c03d0af3886e29d877f581b59611.zip op-kernel-dev-365e9c87a982c03d0af3886e29d877f581b59611.tar.gz |
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/fremap.c | 4 | ||||
-rw-r--r-- | mm/hugetlb.c | 3 | ||||
-rw-r--r-- | mm/memory.c | 17 | ||||
-rw-r--r-- | mm/mmap.c | 4 | ||||
-rw-r--r-- | mm/mremap.c | 12 | ||||
-rw-r--r-- | mm/nommu.c | 15 | ||||
-rw-r--r-- | mm/rmap.c | 6 |
7 files changed, 29 insertions, 32 deletions
diff --git a/mm/fremap.c b/mm/fremap.c index 7f08d10..49719a3 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -143,8 +143,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte) goto err_unlock; - if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) + if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { + update_hiwater_rss(mm); dec_mm_counter(mm, file_rss); + } set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); pte_val = *pte; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 094455b..ac5f044 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -310,6 +310,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, BUG_ON(start & ~HPAGE_MASK); BUG_ON(end & ~HPAGE_MASK); + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + for (address = start; address < end; address += HPAGE_SIZE) { ptep = huge_pte_offset(mm, address); if (! ptep) diff --git a/mm/memory.c b/mm/memory.c index a25ee1d..692ad81 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -820,6 +820,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, lru_add_drain(); spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); + update_hiwater_rss(mm); end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); tlb_finish_mmu(tlb, address, end); spin_unlock(&mm->page_table_lock); @@ -2225,22 +2226,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr) EXPORT_SYMBOL(vmalloc_to_pfn); -/* - * update_mem_hiwater - * - update per process rss and vm high water data - */ -void update_mem_hiwater(struct task_struct *tsk) -{ - if (tsk->mm) { - unsigned long rss = get_mm_rss(tsk->mm); - - if (tsk->mm->hiwater_rss < rss) - tsk->mm->hiwater_rss = rss; - if (tsk->mm->hiwater_vm < tsk->mm->total_vm) - tsk->mm->hiwater_vm = tsk->mm->total_vm; - } -} - #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) @@ -1640,6 +1640,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) */ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) { + /* Update high watermark before we lower total_vm */ + update_hiwater_vm(mm); do { long nrpages = vma_pages(vma); @@ -1668,6 +1670,7 @@ static void unmap_region(struct mm_struct *mm, lru_add_drain(); spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); + update_hiwater_rss(mm); unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, @@ -1953,6 +1956,7 @@ void exit_mmap(struct mm_struct *mm) flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); + /* Don't update_hiwater_rss(mm) here, do_exit already did */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); diff --git a/mm/mremap.c b/mm/mremap.c index 318eea5..ccf4564 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -167,6 +167,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long new_pgoff; unsigned long moved_len; unsigned long excess = 0; + unsigned long hiwater_vm; int split = 0; /* @@ -205,9 +206,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, } /* - * if we failed to move page tables we still do total_vm increment - * since do_munmap() will decrement it by old_len == new_len + * If we failed to move page tables we still do total_vm increment + * since do_munmap() will decrement it by old_len == new_len. + * + * Since total_vm is about to be raised artificially high for a + * moment, we need to restore high watermark afterwards: if stats + * are taken meanwhile, total_vm and hiwater_vm appear too high. + * If this were a serious issue, we'd add a flag to do_munmap(). */ + hiwater_vm = mm->hiwater_vm; mm->total_vm += new_len >> PAGE_SHIFT; vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); @@ -216,6 +223,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, vm_unacct_memory(excess >> PAGE_SHIFT); excess = 0; } + mm->hiwater_vm = hiwater_vm; /* Restore VM_ACCOUNT if one or two pieces of vma left */ if (excess) { @@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) realalloc -= kobjsize(vml); askedalloc -= sizeof(*vml); kfree(vml); + + update_hiwater_vm(mm); mm->total_vm -= len >> PAGE_SHIFT; #ifdef DEBUG @@ -1078,19 +1080,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr) { } -void update_mem_hiwater(struct task_struct *tsk) -{ - unsigned long rss; - - if (likely(tsk->mm)) { - rss = get_mm_rss(tsk->mm); - if (tsk->mm->hiwater_rss < rss) - tsk->mm->hiwater_rss = rss; - if (tsk->mm->hiwater_vm < tsk->mm->total_vm) - tsk->mm->hiwater_vm = tsk->mm->total_vm; - } -} - void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) @@ -538,6 +538,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) if (pte_dirty(pteval)) set_page_dirty(page); + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + if (PageAnon(page)) { swp_entry_t entry = { .val = page->private }; /* @@ -628,6 +631,9 @@ static void try_to_unmap_cluster(unsigned long cursor, if (!pmd_present(*pmd)) goto out_unlock; + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + for (original_pte = pte = pte_offset_map(pmd, address); address < end; pte++, address += PAGE_SIZE) { |