From c654345924f7cce87bb221b89db91cba890421ba Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 8 Oct 2012 16:28:21 -0700 Subject: mm: remove __GFP_NO_KSWAPD When transparent huge pages were introduced, memory compaction and swap storms were an issue, and the kernel had to be careful to not make THP allocations cause pageout or compaction. Now that we have working compaction deferral, kswapd is smart enough to invoke compaction and the quadratic behaviour around isolate_free_pages has been fixed, it should be safe to remove __GFP_NO_KSWAPD. [minchan@kernel.org: Comment fix] [mgorman@suse.de: Avoid direct reclaim for deferred compaction] Cc: Andrea Arcangeli Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c13ea75..5e92698 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2362,9 +2362,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background @@ -2441,7 +2440,7 @@ rebalance: * system then fail the allocation instead of entering direct reclaim. */ if ((deferred_compaction || contended_compaction) && - (gfp_mask & __GFP_NO_KSWAPD)) + (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) goto nopage; /* Try direct reclaim and then allocating */ -- cgit v1.1 From 5180da410db6369d1f95c9014da1c9bc33fb043e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 8 Oct 2012 16:28:29 -0700 Subject: x86, pat: separate the pfn attribute tracking for remap_pfn_range and vm_insert_pfn With PAT enabled, vm_insert_pfn() looks up the existing pfn memory attribute and uses it. Expectation is that the driver reserves the memory attributes for the pfn before calling vm_insert_pfn(). remap_pfn_range() (when called for the whole vma) will setup a new attribute (based on the prot argument) for the specified pfn range. This addresses the legacy usage which typically calls remap_pfn_range() with a desired memory attribute. For ranges smaller than the vma size (which is typically not the case), remap_pfn_range() will use the existing memory attribute for the pfn range. Expose two different API's for these different behaviors. track_pfn_insert() for tracking the pfn attribute set by vm_insert_pfn() and track_pfn_remap() for the remap_pfn_range(). This cleanup also prepares the ground for the track/untrack pfn vma routines to take over the ownership of setting PAT specific vm_flag in the 'vma'. [khlebnikov@openvz.org: Clear checks in track_pfn_remap()] [akpm@linux-foundation.org: tweak a few comments] Signed-off-by: Suresh Siddha Signed-off-by: Konstantin Khlebnikov Cc: Venkatesh Pallipadi Cc: H. Peter Anvin Cc: Nick Piggin Cc: Ingo Molnar Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Hugh Dickins Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Konstantin Khlebnikov Cc: Matt Helsley Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 5736170..6bef278 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1060,7 +1060,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * We do not free on error cases below as remove_vma * gets called on error from higher level routine */ - ret = track_pfn_vma_copy(vma); + ret = track_pfn_copy(vma); if (ret) return ret; } @@ -1328,7 +1328,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, uprobe_munmap(vma, start, end); if (unlikely(is_pfn_mapping(vma))) - untrack_pfn_vma(vma, 0, 0); + untrack_pfn(vma, 0, 0); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { @@ -2162,14 +2162,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) + if (track_pfn_insert(vma, &pgprot, pfn)) return -EINVAL; ret = insert_pfn(vma, addr, pfn, pgprot); - if (ret) - untrack_pfn_vma(vma, pfn, PAGE_SIZE); - return ret; } EXPORT_SYMBOL(vm_insert_pfn); @@ -2311,7 +2308,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; - err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); + err = track_pfn_remap(vma, &prot, pfn, PAGE_ALIGN(size)); if (err) { /* * To indicate that track_pfn related cleanup is not @@ -2335,7 +2332,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } while (pgd++, addr = next, addr != end); if (err) - untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); + untrack_pfn(vma, pfn, PAGE_ALIGN(size)); return err; } -- cgit v1.1 From b3b9c2932c32e0692018ed5f12f3fd8c70eea8ce Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:34 -0700 Subject: mm, x86, pat: rework linear pfn-mmap tracking Replace the generic vma-flag VM_PFN_AT_MMAP with x86-only VM_PAT. We can toss mapping address from remap_pfn_range() into track_pfn_vma_new(), and collect all PAT-related logic together in arch/x86/. This patch also restores orignal frustration-free is_cow_mapping() check in remap_pfn_range(), as it was before commit v2.6.28-rc8-88-g3c8bb73 ("x86: PAT: store vm_pgoff for all linear_over_vma_region mappings - v3") is_linear_pfn_mapping() checks can be removed from mm/huge_memory.c, because it already handled by VM_PFNMAP in VM_NO_THP bit-mask. [suresh.b.siddha@intel.com: Reset the VM_PAT flag as part of untrack_pfn_vma()] Signed-off-by: Konstantin Khlebnikov Signed-off-by: Suresh Siddha Cc: Venkatesh Pallipadi Cc: H. Peter Anvin Cc: Nick Piggin Cc: Ingo Molnar Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Hugh Dickins Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 19 +++---------------- mm/memory.c | 26 ++++++++++---------------- 2 files changed, 13 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 141dbb6..73cb22e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1655,11 +1655,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) if (vma->vm_ops) /* khugepaged not yet working on file or special mappings */ return 0; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() must be - * true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -1912,11 +1908,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out; if (is_vma_temporary_stack(vma)) goto out; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() must be - * true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -2154,12 +2146,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, goto skip; if (is_vma_temporary_stack(vma)) goto skip; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() - * must be true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || - vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; diff --git a/mm/memory.c b/mm/memory.c index 6bef278..655e142 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1055,7 +1055,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); - if (unlikely(is_pfn_mapping(vma))) { + if (unlikely(vma->vm_flags & VM_PFNMAP)) { /* * We do not free on error cases below as remove_vma * gets called on error from higher level routine @@ -1327,7 +1327,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) uprobe_munmap(vma, start, end); - if (unlikely(is_pfn_mapping(vma))) + if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn(vma, 0, 0); if (start != end) { @@ -2299,26 +2299,20 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". + * See vm_normal_page() for details. */ - if (addr == vma->vm_start && end == vma->vm_end) { + if (is_cow_mapping(vma->vm_flags)) { + if (addr != vma->vm_start || end != vma->vm_end) + return -EINVAL; vma->vm_pgoff = pfn; - vma->vm_flags |= VM_PFN_AT_MMAP; - } else if (is_cow_mapping(vma->vm_flags)) + } + + err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); + if (err) return -EINVAL; vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; - err = track_pfn_remap(vma, &prot, pfn, PAGE_ALIGN(size)); - if (err) { - /* - * To indicate that track_pfn related cleanup is not - * needed from higher level routine calling unmap_vmas - */ - vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); - vma->vm_flags &= ~VM_PFN_AT_MMAP; - return -EINVAL; - } - BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); -- cgit v1.1 From cc2383ec06be093789469852e1fe96e1148e9a2c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:37 -0700 Subject: mm: introduce arch-specific vma flag VM_ARCH_1 Combine several arch-specific vma flags into one. before patch: 0x00000200 0x01000000 0x20000000 0x40000000 x86 VM_NOHUGEPAGE VM_HUGEPAGE - VM_PAT powerpc - - VM_SAO - parisc VM_GROWSUP - - - ia64 VM_GROWSUP - - - nommu - VM_MAPPED_COPY - - others - - - - after patch: 0x00000200 0x01000000 0x20000000 0x40000000 x86 - VM_PAT VM_HUGEPAGE VM_NOHUGEPAGE powerpc - VM_SAO - - parisc - VM_GROWSUP - - ia64 - VM_GROWSUP - - nommu - VM_MAPPED_COPY - - others - VM_ARCH_1 - - And voila! One completely free bit. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- mm/ksm.c | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 73cb22e..4720669 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1491,7 +1491,7 @@ out: return ret; } -#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ +#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP| \ VM_HUGETLB|VM_SHARED|VM_MAYSHARE) int hugepage_madvise(struct vm_area_struct *vma, diff --git a/mm/ksm.c b/mm/ksm.c index 47c8853..d1cbe2a 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1470,9 +1470,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) + VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ +#ifdef VM_SAO + if (*vm_flags & VM_SAO) + return 0; +#endif + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) -- cgit v1.1 From 4b6e1e37026ec7dae9b23d78ffcebdd5ddb1bfa1 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:40 -0700 Subject: mm: kill vma flag VM_INSERTPAGE Merge VM_INSERTPAGE into VM_MIXEDMAP. VM_MIXEDMAP VMA can mix pure-pfn ptes, special ptes and normal ptes. Now copy_page_range() always copies VM_MIXEDMAP VMA on fork like VM_PFNMAP. If driver populates whole VMA at mmap() it probably not expects page-faults. This patch removes special check from vma_wants_writenotify() which disables pages write tracking for VMA populated via vm_instert_page(). BDI below mapped file should not use dirty-accounting, moreover do_wp_page() can handle this. vm_insert_page() still marks vma after first usage. Usually it is called from f_op->mmap() handler under mm->mmap_sem write-lock, so it able to change vma->vm_flags. Caller must set VM_MIXEDMAP at mmap time if it wants to call this function from other places, for example from page-fault handler. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 3 +-- mm/ksm.c | 2 +- mm/memory.c | 14 ++++++++++++-- mm/mmap.c | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4720669..9b72d12 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1491,8 +1491,7 @@ out: return ret; } -#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP| \ - VM_HUGETLB|VM_SHARED|VM_MAYSHARE) +#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) diff --git a/mm/ksm.c b/mm/ksm.c index d1cbe2a..f9ccb16 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1469,7 +1469,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_RESERVED | VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ diff --git a/mm/memory.c b/mm/memory.c index 655e142..7b1e4fe 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1047,7 +1047,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { + if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | + VM_PFNMAP | VM_MIXEDMAP))) { if (!vma->anon_vma) return 0; } @@ -2085,6 +2086,11 @@ out: * ask for a shared writable mapping! * * The page does not need to be reserved. + * + * Usually this function is called from f_op->mmap() handler + * under mm->mmap_sem write-lock, so it can change vma->vm_flags. + * Caller must set VM_MIXEDMAP on vma if it wants to call this + * function from other places, for example from page-fault handler. */ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) @@ -2093,7 +2099,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, return -EFAULT; if (!page_count(page)) return -EINVAL; - vma->vm_flags |= VM_INSERTPAGE; + if (!(vma->vm_flags & VM_MIXEDMAP)) { + BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); + BUG_ON(vma->vm_flags & VM_PFNMAP); + vma->vm_flags |= VM_MIXEDMAP; + } return insert_page(vma, addr, page, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_page); diff --git a/mm/mmap.c b/mm/mmap.c index 872441e..b0989f4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1190,7 +1190,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma) return 0; /* Specialty mapping? */ - if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) + if (vm_flags & VM_PFNMAP) return 0; /* Can the mapping track the dirty pages? */ -- cgit v1.1 From 0b173bc4daa8f8ec03a85abf5e47b23502ff80af Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:46 -0700 Subject: mm: kill vma flag VM_CAN_NONLINEAR Move actual pte filling for non-linear file mappings into the new special vma operation: ->remap_pages(). Filesystems must implement this method to get non-linear mapping support, if it uses filemap_fault() then generic_file_remap_pages() can be used. Now device drivers can implement this method and obtain nonlinear vma support. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf #arch/tile Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- mm/filemap_xip.c | 3 ++- mm/fremap.c | 14 ++++++++------ mm/mmap.c | 3 +-- mm/nommu.c | 8 ++++++++ mm/shmem.c | 3 +-- 6 files changed, 21 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 3843445..a9827b4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1737,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite); const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = filemap_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; /* This is used for a general mmap of a disk file */ @@ -1749,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 13e013b..9175022 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -305,6 +305,7 @@ out: static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, .page_mkwrite = filemap_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int xip_file_mmap(struct file * file, struct vm_area_struct * vma) @@ -313,7 +314,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma) file_accessed(file); vma->vm_ops = &xip_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; + vma->vm_flags |= VM_MIXEDMAP; return 0; } EXPORT_SYMBOL_GPL(xip_file_mmap); diff --git a/mm/fremap.c b/mm/fremap.c index 048659c..3d731a4 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -5,6 +5,7 @@ * * started by Ingo Molnar, Copyright (C) 2002, 2003 */ +#include #include #include #include @@ -80,9 +81,10 @@ out: return err; } -static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long size, pgoff_t pgoff) +int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, + unsigned long size, pgoff_t pgoff) { + struct mm_struct *mm = vma->vm_mm; int err; do { @@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, pgoff++; } while (size); - return 0; - + return 0; } +EXPORT_SYMBOL(generic_file_remap_pages); /** * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma @@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) goto out; - if (!(vma->vm_flags & VM_CAN_NONLINEAR)) + if (!vma->vm_ops->remap_pages) goto out; if (start < vma->vm_start || start + size > vma->vm_end) @@ -228,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } mmu_notifier_invalidate_range_start(mm, start, start + size); - err = populate_range(mm, vma, start, size, pgoff); + err = vma->vm_ops->remap_pages(vma, start, size, pgoff); mmu_notifier_invalidate_range_end(mm, start, start + size); if (!err && !(flags & MAP_NONBLOCK)) { if (vma->vm_flags & VM_LOCKED) { diff --git a/mm/mmap.c b/mm/mmap.c index b0989f4..d0686d3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -669,8 +669,7 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ - if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR) + if (vma->vm_flags ^ vm_flags) return 0; if (vma->vm_file != file) return 0; diff --git a/mm/nommu.c b/mm/nommu.c index dee2ff8..98318dc 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1961,6 +1961,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); +int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, + unsigned long size, pgoff_t pgoff) +{ + BUG(); + return 0; +} +EXPORT_SYMBOL(generic_file_remap_pages); + static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, int write) { diff --git a/mm/shmem.c b/mm/shmem.c index d375211..cc12072f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1339,7 +1339,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &shmem_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } @@ -2643,6 +2642,7 @@ static const struct vm_operations_struct shmem_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif + .remap_pages = generic_file_remap_pages, }; static struct dentry *shmem_mount(struct file_system_type *fs_type, @@ -2836,7 +2836,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } -- cgit v1.1 From e9714acf8c439688884234dcac2bfc38bb607d38 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:54 -0700 Subject: mm: kill vma flag VM_EXECUTABLE and mm->num_exe_file_vmas Currently the kernel sets mm->exe_file during sys_execve() and then tracks number of vmas with VM_EXECUTABLE flag in mm->num_exe_file_vmas, as soon as this counter drops to zero kernel resets mm->exe_file to NULL. Plus it resets mm->exe_file at last mmput() when mm->mm_users drops to zero. VMA with VM_EXECUTABLE flag appears after mapping file with flag MAP_EXECUTABLE, such vmas can appears only at sys_execve() or after vma splitting, because sys_mmap ignores this flag. Usually binfmt module sets mm->exe_file and mmaps executable vmas with this file, they hold mm->exe_file while task is running. comment from v2.6.25-6245-g925d1c4 ("procfs task exe symlink"), where all this stuff was introduced: > The kernel implements readlink of /proc/pid/exe by getting the file from > the first executable VMA. Then the path to the file is reconstructed and > reported as the result. > > Because of the VMA walk the code is slightly different on nommu systems. > This patch avoids separate /proc/pid/exe code on nommu systems. Instead of > walking the VMAs to find the first executable file-backed VMA we store a > reference to the exec'd file in the mm_struct. > > That reference would prevent the filesystem holding the executable file > from being unmounted even after unmapping the VMAs. So we track the number > of VM_EXECUTABLE VMAs and drop the new reference when the last one is > unmapped. This avoids pinning the mounted filesystem. exe_file's vma accounting is hooked into every file mmap/unmmap and vma split/merge just to fix some hypothetical pinning fs from umounting by mm, which already unmapped all its executable files, but still alive. Seems like currently nobody depends on this behaviour. We can try to remove this logic and keep mm->exe_file until final mmput(). mm->exe_file is still protected with mm->mmap_sem, because we want to change it via new sys_prctl(PR_SET_MM_EXE_FILE). Also via this syscall task can change its mm->exe_file and unpin mountpoint explicitly. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 25 ++++--------------------- mm/nommu.c | 11 +---------- 2 files changed, 5 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index d0686d3..c1ad2e78 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -231,11 +231,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) { + if (vma->vm_file) fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); - } mpol_put(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); return next; @@ -636,8 +633,6 @@ again: remove_next = 1 + (end > next->vm_end); if (file) { uprobe_munmap(next, next->vm_start, next->vm_end); fput(file); - if (next->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); } if (next->anon_vma) anon_vma_merge(vma, next); @@ -1304,8 +1299,6 @@ munmap_back: error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; - if (vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); /* Can addr have changed?? * @@ -1987,11 +1980,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, if (anon_vma_clone(new, vma)) goto out_free_mpol; - if (new->vm_file) { + if (new->vm_file) get_file(new->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); - } if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); @@ -2009,11 +1999,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, /* Clean everything up if vma_adjust failed. */ if (new->vm_ops && new->vm_ops->close) new->vm_ops->close(new); - if (new->vm_file) { - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); + if (new->vm_file) fput(new->vm_file); - } unlink_anon_vmas(new); out_free_mpol: mpol_put(pol); @@ -2408,12 +2395,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma->vm_start = addr; new_vma->vm_end = addr + len; new_vma->vm_pgoff = pgoff; - if (new_vma->vm_file) { + if (new_vma->vm_file) get_file(new_vma->vm_file); - - if (vma->vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); - } if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); diff --git a/mm/nommu.c b/mm/nommu.c index 98318dc..9c4a7b6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) kenter("%p", vma); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) { + if (vma->vm_file) fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); - } put_nommu_region(vma->vm_region); kmem_cache_free(vm_area_cachep, vma); } @@ -1284,10 +1281,6 @@ unsigned long do_mmap_pgoff(struct file *file, if (file) { region->vm_file = get_file(file); vma->vm_file = get_file(file); - if (vm_flags & VM_EXECUTABLE) { - added_exe_file_vma(current->mm); - vma->vm_mm = current->mm; - } } down_write(&nommu_region_sem); @@ -1440,8 +1433,6 @@ error: kmem_cache_free(vm_region_jar, region); if (vma->vm_file) fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); kmem_cache_free(vm_area_cachep, vma); kleave(" = %d", ret); return ret; -- cgit v1.1 From 0103bd16fb90bc741c7a03fd1ea4e8a505abad23 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:59 -0700 Subject: mm: prepare VM_DONTDUMP for using in drivers Rename VM_NODUMP into VM_DONTDUMP: this name matches other negative flags: VM_DONTEXPAND, VM_DONTCOPY. Currently this flag used only for sys_madvise. The next patch will use it for replacing the outdated flag VM_RESERVED. Also forbid madvise(MADV_DODUMP) for special kernel mappings VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index 14d260f..03dfa5c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma, new_flags &= ~VM_DONTCOPY; break; case MADV_DONTDUMP: - new_flags |= VM_NODUMP; + new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: - new_flags &= ~VM_NODUMP; + if (new_flags & VM_SPECIAL) { + error = -EINVAL; + goto out; + } + new_flags &= ~VM_DONTDUMP; break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: -- cgit v1.1 From 314e51b9851b4f4e8ab302243ff5a6fc6147f379 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:29:02 -0700 Subject: mm: kill vma flag VM_RESERVED and mm->reserved_vm counter A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA, currently it lost original meaning but still has some effects: | effect | alternative flags -+------------------------+--------------------------------------------- 1| account as reserved_vm | VM_IO 2| skip in core dump | VM_IO, VM_DONTDUMP 3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP 4| do not mlock | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP This patch removes reserved_vm counter from mm_struct. Seems like nobody cares about it, it does not exported into userspace directly, it only reduces total_vm showed in proc. Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP. remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP. remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP. [akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup] Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 3 +-- mm/memory.c | 11 +++++------ mm/mlock.c | 2 +- mm/mmap.c | 2 -- mm/nommu.c | 2 +- mm/vmalloc.c | 3 +-- 6 files changed, 9 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index f9ccb16..9638620 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1469,8 +1469,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | - VM_NONLINEAR | VM_MIXEDMAP)) + VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ #ifdef VM_SAO diff --git a/mm/memory.c b/mm/memory.c index 7b1e4fe..e09c048 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2297,14 +2297,13 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). - * VM_RESERVED is specified all over the place, because - * in 2.4 it kept swapout's vma scan off this vma; but - * in 2.6 the LRU scan won't even find its pages, so this - * flag means no more than count its pages in reserved_vm, - * and omit it from core dump, even when VM_IO turned off. * VM_PFNMAP tells the core MM that the base pages are just * raw PFN mappings, and do not have a "struct page" associated * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). + * VM_DONTDUMP + * Omit vma from core dump, even when VM_IO turned off. * * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. We mark the "original" @@ -2321,7 +2320,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (err) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; diff --git a/mm/mlock.c b/mm/mlock.c index ef726e8..a948be4 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & (VM_IO | VM_PFNMAP)) goto no_mlock; - if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + if (!((vma->vm_flags & VM_DONTEXPAND) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))) { diff --git a/mm/mmap.c b/mm/mmap.c index c1ad2e78..a76042d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -945,8 +945,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, mm->exec_vm += pages; } else if (flags & stack_flags) mm->stack_vm += pages; - if (flags & (VM_RESERVED|VM_IO)) - mm->reserved_vm += pages; } #endif /* CONFIG_PROC_FS */ diff --git a/mm/nommu.c b/mm/nommu.c index 9c4a7b6..12e84e6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1811,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (addr != (pfn << PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; return 0; } EXPORT_SYMBOL(remap_pfn_range); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2bb90b1..8de7046 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, usize -= PAGE_SIZE; } while (usize > 0); - /* Prevent "things" like memory migration? VM_flags need a cleanup... */ - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } -- cgit v1.1 From d741c9cdeee6a569dae0dbbaf028065402955b59 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Mon, 8 Oct 2012 16:29:05 -0700 Subject: mm: fix nonuniform page status when writing new file with small buffer When writing a new file with 2048 bytes buffer, such as write(fd, buffer, 2048), it will call generic_perform_write() twice for every page: write_begin mark_page_accessed(page) write_end write_begin mark_page_accessed(page) write_end Pages 1-13 will be added to lru-pvecs in write_begin() and will *NOT* be added to active_list even they have be accessed twice because they are not PageLRU(page). But when page 14th comes, all pages in lru-pvecs will be moved to inactive_list (by __lru_cache_add() ) in first write_begin(), now page 14th *is* PageLRU(page). And after second write_end() only page 14th will be in active_list. In Hadoop environment, we do comes to this situation: after writing a file, we find out that only 14th, 28th, 42th... page are in active_list and others in inactive_list. Now kswapd works, shrinks the inactive_list, the file only have 14th, 28th...pages in memory, the readahead request size will be broken to only 52k (13*4k), system's performance falls dramatically. This problem can also replay by below steps (the machine has 8G memory): 1. dd if=/dev/zero of=/test/file.out bs=1024 count=1048576 2. cat another 7.5G file to /dev/null 3. vmtouch -m 1G -v /test/file.out, it will show: /test/file.out [oooooooooooooooooooOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO] 187847/262144 the 'o' means same pages are in memory but same are not. The solution for this problem is simple: the 14th page should be added to lru_add_pvecs before mark_page_accessed() just as other pages. [akpm@linux-foundation.org: tweak comment] [akpm@linux-foundation.org: grab better comment from the v3 patch] Signed-off-by: Robin Dong Reviewed-by: Minchan Kim Cc: KOSAKI Motohiro Reviewed-by: Johannes Weiner Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 7782588..f76c76c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); +/* + * Order of operations is important: flush the pagevec when it's already + * full, not when adding the last page, to make sure that last page is + * not added to the LRU directly when passed to this function. Because + * mark_page_accessed() (called after this when writing) only activates + * pages that are on the LRU, linear writes in subpage chunks would see + * every PAGEVEC_SIZE page activated, which is unexpected. + */ void __lru_cache_add(struct page *page, enum lru_list lru) { struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; page_cache_get(page); - if (!pagevec_add(pvec, page)) + if (!pagevec_space(pvec)) __pagevec_lru_add(pvec, lru); + pagevec_add(pvec, page); put_cpu_var(lru_add_pvecs); } EXPORT_SYMBOL(__lru_cache_add); -- cgit v1.1 From 6597d783397aebb793fb529474cce5089aa4c67f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:29:07 -0700 Subject: mm/mmap.c: replace find_vma_prepare() with clearer find_vma_links() People get confused by find_vma_prepare(), because it doesn't care about what it returns in its output args, when its callers won't be interested. Clarify by passing in end-of-range address too, and returning failure if any existing vma overlaps the new range: instead of returning an ambiguous vma which most callers then must check. find_vma_links() is a clearer name. This does revert 2.6.27's dfe195fb79e88 ("mm: fix uninitialized variables for find_vma_prepare callers"), but it looks like gcc 4.3.0 was one of those releases too eager to shout about uninitialized variables: only copy_vma() warns with 4.5.1 and 4.7.1, which a BUG on error silences. [hughd@google.com: fix warning, remove BUG()] Signed-off-by: Hugh Dickins Cc: Benny Halevy Acked-by: Hillf Danton Signed-off-by: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index a76042d..2ba2743 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -353,17 +353,14 @@ void validate_mm(struct mm_struct *mm) #define validate_mm(mm) do { } while (0) #endif -static struct vm_area_struct * -find_vma_prepare(struct mm_struct *mm, unsigned long addr, - struct vm_area_struct **pprev, struct rb_node ***rb_link, - struct rb_node ** rb_parent) +static int find_vma_links(struct mm_struct *mm, unsigned long addr, + unsigned long end, struct vm_area_struct **pprev, + struct rb_node ***rb_link, struct rb_node **rb_parent) { - struct vm_area_struct * vma; - struct rb_node ** __rb_link, * __rb_parent, * rb_prev; + struct rb_node **__rb_link, *__rb_parent, *rb_prev; __rb_link = &mm->mm_rb.rb_node; rb_prev = __rb_parent = NULL; - vma = NULL; while (*__rb_link) { struct vm_area_struct *vma_tmp; @@ -372,9 +369,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; + /* Fail if an existing vma overlaps the area */ + if (vma_tmp->vm_start < end) + return -ENOMEM; __rb_link = &__rb_parent->rb_left; } else { rb_prev = __rb_parent; @@ -387,7 +384,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); *rb_link = __rb_link; *rb_parent = __rb_parent; - return vma; + return 0; } void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, @@ -456,11 +453,12 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, */ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *__vma, *prev; + struct vm_area_struct *prev; struct rb_node **rb_link, *rb_parent; - __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); - BUG_ON(__vma && __vma->vm_start < vma->vm_end); + if (find_vma_links(mm, vma->vm_start, vma->vm_end, + &prev, &rb_link, &rb_parent)) + BUG(); __vma_link(mm, vma, prev, rb_link, rb_parent); mm->map_count++; } @@ -1221,8 +1219,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Clear old maps */ error = -ENOMEM; munmap_back: - vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - if (vma && vma->vm_start < addr + len) { + if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { if (do_munmap(mm, addr, len)) return -ENOMEM; goto munmap_back; @@ -2183,8 +2180,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) * Clear old maps. this also does some error checking for us */ munmap_back: - vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - if (vma && vma->vm_start < addr + len) { + if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { if (do_munmap(mm, addr, len)) return -ENOMEM; goto munmap_back; @@ -2298,10 +2294,10 @@ void exit_mmap(struct mm_struct *mm) * and into the inode's i_mmap tree. If vm_file is non-NULL * then i_mmap_mutex is taken here. */ -int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct * __vma, * prev; - struct rb_node ** rb_link, * rb_parent; + struct vm_area_struct *prev; + struct rb_node **rb_link, *rb_parent; /* * The vm_pgoff of a purely anonymous vma should be irrelevant @@ -2319,8 +2315,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) BUG_ON(vma->anon_vma); vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); - if (__vma && __vma->vm_start < vma->vm_end) + if (find_vma_links(mm, vma->vm_start, vma->vm_end, + &prev, &rb_link, &rb_parent)) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, vma_pages(vma))) @@ -2354,7 +2350,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } - find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) + return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); if (new_vma) { -- cgit v1.1 From 4ffb6335da87b51c17e7ff6495170785f21558dd Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:09 -0700 Subject: mm: compaction: update comment in try_to_compact_pages Allocation success rates have been far lower since 3.4 due to commit fe2c2a106663 ("vmscan: reclaim at order 0 when compaction is enabled"). This commit was introduced for good reasons and it was known in advance that the success rates would suffer but it was justified on the grounds that the high allocation success rates were achieved by aggressive reclaim. Success rates are expected to suffer even more in 3.6 due to commit 7db8889ab05b ("mm: have order > 0 compaction start off where it left") which testing has shown to severely reduce allocation success rates under load - to 0% in one case. This series aims to improve the allocation success rates without regressing the benefits of commit fe2c2a106663. The series is based on latest mmotm and takes into account the __GFP_NO_KSWAPD flag is going away. Patch 1 updates a stale comment seeing as I was in the general area. Patch 2 updates reclaim/compaction to reclaim pages scaled on the number of recent failures. Patch 3 captures suitable high-order pages freed by compaction to reduce races with parallel allocation requests. Patch 4 fixes the upstream commit [7db8889a: mm: have order > 0 compaction start off where it left] to enable compaction again Patch 5 identifies when compacion is taking too long due to contention and aborts. STRESS-HIGHALLOC 3.6-rc1-akpm full-series Pass 1 36.00 ( 0.00%) 51.00 (15.00%) Pass 2 42.00 ( 0.00%) 63.00 (21.00%) while Rested 86.00 ( 0.00%) 86.00 ( 0.00%) From http://www.csn.ul.ie/~mel/postings/mmtests-20120424/global-dhp__stress-highalloc-performance-ext3/hydra/comparison.html I know that the allocation success rates in 3.3.6 was 78% in comparison to 36% in in the current akpm tree. With the full series applied, the success rates are up to around 51% with some variability in the results. This is not as high a success rate but it does not reclaim excessively which is a key point. MMTests Statistics: vmstat Page Ins 3050912 3078892 Page Outs 8033528 8039096 Swap Ins 0 0 Swap Outs 0 0 Note that swap in/out rates remain at 0. In 3.3.6 with 78% success rates there were 71881 pages swapped out. Direct pages scanned 70942 122976 Kswapd pages scanned 1366300 1520122 Kswapd pages reclaimed 1366214 1484629 Direct pages reclaimed 70936 105716 Kswapd efficiency 99% 97% Kswapd velocity 1072.550 1182.615 Direct efficiency 99% 85% Direct velocity 55.690 95.672 The kswapd velocity changes very little as expected. kswapd velocity is around the 1000 pages/sec mark where as in kernel 3.3.6 with the high allocation success rates it was 8140 pages/second. Direct velocity is higher as a result of patch 2 of the series but this is expected and is acceptable. The direct reclaim and kswapd velocities change very little. If these get accepted for merging then there is a difficulty in how they should be handled. 7db8889a ("mm: have order > 0 compaction start off where it left") is broken but it is already in 3.6-rc1 and needs to be fixed. However, if just patch 4 from this series is applied then Jim Schutt's workload is known to break again as his workload also requires patch 5. While it would be preferred to have all these patches in 3.6 to improve compaction in general, it would at least be acceptable if just patches 4 and 5 were merged to 3.6 to fix a known problem without breaking compaction completely. On the face of it, that would force __GFP_NO_KSWAPD patches to be merged at the same time but I can do a version of this series with __GFP_NO_KSWAPD change reverted and then rebase it on top of this series. That might be best overall because I note that the __GFP_NO_KSWAPD patch should have removed deferred_compaction from page_alloc.c but it didn't but fixing that causes collisions with this series. This patch: The comment about order applied when the check was order > PAGE_ALLOC_COSTLY_ORDER which has not been the case since c5a73c3d ("thp: use compaction for all allocation orders"). Fixing the comment while I'm in the general area. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 7fcd3a5..7168edc 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -869,11 +869,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zone *zone; int rc = COMPACT_SKIPPED; - /* - * Check whether it is worth even starting compaction. The order check is - * made because an assumption is made that the page allocator can satisfy - * the "cheaper" orders without taking special steps - */ + /* Check if the GFP flags allow compaction */ if (!order || !may_enter_fs || !may_perform_io) return rc; -- cgit v1.1 From 83fde0f22872aa8c1d46f775cc7bdfa864499e65 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:11 -0700 Subject: mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures If allocation fails after compaction then compaction may be deferred for a number of allocation attempts. If there are subsequent failures, compact_defer_shift is increased to defer for longer periods. This patch uses that information to scale the number of pages reclaimed with compact_defer_shift until allocations succeed again. The rationale is that reclaiming the normal number of pages still allowed compaction to fail and its success depends on the number of pages. If it's failing, reclaim more pages until it succeeds again. Note that this is not implying that VM reclaim is not reclaiming enough pages or that its logic is broken. try_to_free_pages() always asks for SWAP_CLUSTER_MAX pages to be reclaimed regardless of order and that is what it does. Direct reclaim stops normally with this check. if (sc->nr_reclaimed >= sc->nr_to_reclaim) goto out; should_continue_reclaim delays when that check is made until a minimum number of pages for reclaim/compaction are reclaimed. It is possible that this patch could instead set nr_to_reclaim in try_to_free_pages() and drive it from there but that's behaves differently and not necessarily for the better. If driven from do_try_to_free_pages(), it is also possible that priorities will rise. When they reach DEF_PRIORITY-2, it will also start stalling and setting pages for immediate reclaim which is more disruptive than not desirable in this case. That is a more wide-reaching change that could cause another regression related to THP requests causing interactive jitter. [akpm@linux-foundation.org: fix build] Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 99b434b..f6fdae7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1729,6 +1729,28 @@ static bool in_reclaim_compaction(struct scan_control *sc) return false; } +#ifdef CONFIG_COMPACTION +/* + * If compaction is deferred for sc->order then scale the number of pages + * reclaimed based on the number of consecutive allocation failures + */ +static unsigned long scale_for_compaction(unsigned long pages_for_compaction, + struct lruvec *lruvec, struct scan_control *sc) +{ + struct zone *zone = lruvec_zone(lruvec); + + if (zone->compact_order_failed <= sc->order) + pages_for_compaction <<= zone->compact_defer_shift; + return pages_for_compaction; +} +#else +static unsigned long scale_for_compaction(unsigned long pages_for_compaction, + struct lruvec *lruvec, struct scan_control *sc) +{ + return pages_for_compaction; +} +#endif + /* * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns @@ -1776,6 +1798,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); + + pages_for_compaction = scale_for_compaction(pages_for_compaction, + lruvec, sc); inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); if (nr_swap_pages > 0) inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); -- cgit v1.1 From 1fb3f8ca0e9222535a39b884cb67a34628411b9f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:12 -0700 Subject: mm: compaction: capture a suitable high-order page immediately when it is made available While compaction is migrating pages to free up large contiguous blocks for allocation it races with other allocation requests that may steal these blocks or break them up. This patch alters direct compaction to capture a suitable free page as soon as it becomes available to reduce this race. It uses similar logic to split_free_page() to ensure that watermarks are still obeyed. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++------- mm/internal.h | 1 + mm/page_alloc.c | 63 ++++++++++++++++++++++++++++++---------- 3 files changed, 127 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 7168edc..0fbc6b7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -91,6 +91,60 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, return compact_checklock_irqsave(lock, flags, false, cc); } +static void compact_capture_page(struct compact_control *cc) +{ + unsigned long flags; + int mtype, mtype_low, mtype_high; + + if (!cc->page || *cc->page) + return; + + /* + * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP + * regardless of the migratetype of the freelist is is captured from. + * This is fine because the order for a high-order MIGRATE_MOVABLE + * allocation is typically at least a pageblock size and overall + * fragmentation is not impaired. Other allocation types must + * capture pages from their own migratelist because otherwise they + * could pollute other pageblocks like MIGRATE_MOVABLE with + * difficult to move pages and making fragmentation worse overall. + */ + if (cc->migratetype == MIGRATE_MOVABLE) { + mtype_low = 0; + mtype_high = MIGRATE_PCPTYPES; + } else { + mtype_low = cc->migratetype; + mtype_high = cc->migratetype + 1; + } + + /* Speculatively examine the free lists without zone lock */ + for (mtype = mtype_low; mtype < mtype_high; mtype++) { + int order; + for (order = cc->order; order < MAX_ORDER; order++) { + struct page *page; + struct free_area *area; + area = &(cc->zone->free_area[order]); + if (list_empty(&area->free_list[mtype])) + continue; + + /* Take the lock and attempt capture of the page */ + if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) + return; + if (!list_empty(&area->free_list[mtype])) { + page = list_entry(area->free_list[mtype].next, + struct page, lru); + if (capture_free_page(page, cc->order, mtype)) { + spin_unlock_irqrestore(&cc->zone->lock, + flags); + *cc->page = page; + return; + } + } + spin_unlock_irqrestore(&cc->zone->lock, flags); + } + } +} + /* * Isolate free pages onto a private freelist. Caller must hold zone->lock. * If @strict is true, will abort returning 0 on any invalid PFNs or non-free @@ -645,7 +699,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, static int compact_finished(struct zone *zone, struct compact_control *cc) { - unsigned int order; unsigned long watermark; if (fatal_signal_pending(current)) @@ -688,14 +741,22 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ - for (order = cc->order; order < MAX_ORDER; order++) { - /* Job done if page is free of the right migratetype */ - if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) - return COMPACT_PARTIAL; - - /* Job done if allocation would set block type */ - if (order >= pageblock_order && zone->free_area[order].nr_free) + if (cc->page) { + /* Was a suitable page captured? */ + if (*cc->page) return COMPACT_PARTIAL; + } else { + unsigned int order; + for (order = cc->order; order < MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[cc->order]; + /* Job done if page is free of the right migratetype */ + if (!list_empty(&area->free_list[cc->migratetype])) + return COMPACT_PARTIAL; + + /* Job done if allocation would set block type */ + if (cc->order >= pageblock_order && area->nr_free) + return COMPACT_PARTIAL; + } } return COMPACT_CONTINUE; @@ -817,6 +878,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) goto out; } } + + /* Capture a page now if it is a suitable size */ + compact_capture_page(cc); } out: @@ -829,7 +893,8 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync, bool *contended) + bool sync, bool *contended, + struct page **page) { struct compact_control cc = { .nr_freepages = 0, @@ -839,6 +904,7 @@ static unsigned long compact_zone_order(struct zone *zone, .zone = zone, .sync = sync, .contended = contended, + .page = page, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -860,7 +926,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended) + bool sync, bool *contended, struct page **page) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -881,7 +947,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, int status; status = compact_zone_order(zone, order, gfp_mask, sync, - contended); + contended, page); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -936,6 +1002,7 @@ int compact_pgdat(pg_data_t *pgdat, int order) struct compact_control cc = { .order = order, .sync = false, + .page = NULL, }; return __compact_pgdat(pgdat, &cc); @@ -946,6 +1013,7 @@ static int compact_node(int nid) struct compact_control cc = { .order = -1, .sync = true, + .page = NULL, }; return __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/internal.h b/mm/internal.h index b8c91b3..e549a7f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -131,6 +131,7 @@ struct compact_control { int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; bool *contended; /* True if a lock was contended */ + struct page **page; /* Page captured of requested size */ }; unsigned long diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5e92698..cfd565db 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1380,16 +1380,11 @@ void split_page(struct page *page, unsigned int order) } /* - * Similar to split_page except the page is already free. As this is only - * being used for migration, the migratetype of the block also changes. - * As this is called with interrupts disabled, the caller is responsible - * for calling arch_alloc_page() and kernel_map_page() after interrupts - * are enabled. - * - * Note: this is probably too low level an operation for use in drivers. - * Please consult with lkml before using this in your driver. + * Similar to the split_page family of functions except that the page + * required at the given order and being isolated now to prevent races + * with parallel allocators */ -int split_free_page(struct page *page) +int capture_free_page(struct page *page, int alloc_order, int migratetype) { unsigned int order; unsigned long watermark; @@ -1411,10 +1406,11 @@ int split_free_page(struct page *page) rmv_page_order(page); __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); - /* Split into individual pages */ - set_page_refcounted(page); - split_page(page, order); + if (alloc_order != order) + expand(zone, page, alloc_order, order, + &zone->free_area[order], migratetype); + /* Set the pageblock if the captured page is at least a pageblock */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { @@ -1425,7 +1421,35 @@ int split_free_page(struct page *page) } } - return 1 << order; + return 1UL << order; +} + +/* + * Similar to split_page except the page is already free. As this is only + * being used for migration, the migratetype of the block also changes. + * As this is called with interrupts disabled, the caller is responsible + * for calling arch_alloc_page() and kernel_map_page() after interrupts + * are enabled. + * + * Note: this is probably too low level an operation for use in drivers. + * Please consult with lkml before using this in your driver. + */ +int split_free_page(struct page *page) +{ + unsigned int order; + int nr_pages; + + BUG_ON(!PageBuddy(page)); + order = page_order(page); + + nr_pages = capture_free_page(page, order, 0); + if (!nr_pages) + return 0; + + /* Split into individual pages */ + set_page_refcounted(page); + split_page(page, order); + return nr_pages; } /* @@ -2105,7 +2129,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { - struct page *page; + struct page *page = NULL; if (!order) return NULL; @@ -2118,10 +2142,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration, - contended_compaction); + contended_compaction, &page); current->flags &= ~PF_MEMALLOC; - if (*did_some_progress != COMPACT_SKIPPED) { + /* If compaction captured a page, prep and use it */ + if (page) { + prep_new_page(page, order, gfp_mask); + goto got_page; + } + + if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ drain_pages(get_cpu()); put_cpu(); @@ -2131,6 +2161,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, alloc_flags & ~ALLOC_NO_WATERMARKS, preferred_zone, migratetype); if (page) { +got_page: preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; if (order >= preferred_zone->compact_order_failed) -- cgit v1.1 From 8d34694c1abf29df1f3c7317936b7e3e2e308d9b Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 8 Oct 2012 16:29:14 -0700 Subject: revert "mm: mempolicy: Let vma_merge and vma_split handle vma->vm_policy linkages" Commit 05f144a0d5c2 ("mm: mempolicy: Let vma_merge and vma_split handle vma->vm_policy linkages") removed vma->vm_policy updates code but it is the purpose of mbind_range(). Now, mbind_range() is virtually a no-op and while it does not allow memory corruption it is not the right fix. This patch is a revert. [mgorman@suse.de: Edited changelog] Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Cc: Christoph Lameter Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ada3be..92daa26 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -607,6 +607,27 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return first; } +/* Apply policy to a single VMA */ +static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) +{ + int err = 0; + struct mempolicy *old = vma->vm_policy; + + pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_ops, vma->vm_file, + vma->vm_ops ? vma->vm_ops->set_policy : NULL); + + if (vma->vm_ops && vma->vm_ops->set_policy) + err = vma->vm_ops->set_policy(vma, new); + if (!err) { + mpol_get(new); + vma->vm_policy = new; + mpol_put(old); + } + return err; +} + /* Step 2: apply policy to a range and do splits. */ static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) @@ -655,23 +676,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (err) goto out; } - - /* - * Apply policy to a single VMA. The reference counting of - * policy for vma_policy linkages has already been handled by - * vma_merge and split_vma as necessary. If this is a shared - * policy then ->set_policy will increment the reference count - * for an sp node. - */ - pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", - vma->vm_start, vma->vm_end, vma->vm_pgoff, - vma->vm_ops, vma->vm_file, - vma->vm_ops ? vma->vm_ops->set_policy : NULL); - if (vma->vm_ops && vma->vm_ops->set_policy) { - err = vma->vm_ops->set_policy(vma, new_pol); - if (err) - goto out; - } + err = policy_vma(vma, new_pol); + if (err) + goto out; } out: -- cgit v1.1 From 869833f2c5c6e4dd09a5378cfc665ffb4615e5d2 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 8 Oct 2012 16:29:16 -0700 Subject: mempolicy: remove mempolicy sharing Dave Jones' system call fuzz testing tool "trinity" triggered the following bug error with slab debugging enabled ============================================================================= BUG numa_policy (Not tainted): Poison overwritten ----------------------------------------------------------------------------- INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154 __slab_alloc+0x3d3/0x445 kmem_cache_alloc+0x29d/0x2b0 mpol_new+0xa3/0x140 sys_mbind+0x142/0x620 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154 __slab_free+0x2e/0x1de kmem_cache_free+0x25a/0x260 __mpol_put+0x27/0x30 remove_vma+0x68/0x90 exit_mmap+0x118/0x140 mmput+0x73/0x110 exit_mm+0x108/0x130 do_exit+0x162/0xb90 do_group_exit+0x4f/0xc0 sys_exit_group+0x17/0x20 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x (null) flags=0x20000000004080 INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0 The problem is that the structure is being prematurely freed due to a reference count imbalance. In the following case mbind(addr, len) should replace the memory policies of both vma1 and vma2 and thus they will become to share the same mempolicy and the new mempolicy will have the MPOL_F_SHARED flag. +-------------------+-------------------+ | vma1 | vma2(shmem) | +-------------------+-------------------+ | | addr addr+len alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for maintaining the mempolicy reference count. The current rule is that get_vma_policy() only increments refcount for shmem VMA and mpol_conf_put() only decrements refcount if the policy has MPOL_F_SHARED. In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED! The reference count will be decreased even though was not increased whenever alloc_page_vma() is called. This has been broken since commit [52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008. There is another serious bug with the sharing of memory policies. Currently, mempolicy rebind logic (it is called from cpuset rebinding) ignores a refcount of mempolicy and override it forcibly. Thus, any mempolicy sharing may cause mempolicy corruption. The bug was introduced by commit [68860ec1: cpusets: automatic numa mempolicy rebinding]. Ideally, the shared policy handling would be rewritten to either properly handle COW of the policy structures or at least reference count MPOL_F_SHARED based exclusively on information within the policy. However, this patch takes the easier approach of disabling any policy sharing between VMAs. Each new range allocated with sp_alloc will allocate a new policy, set the reference count to 1 and drop the reference count of the old policy. This increases the memory footprint but is not expected to be a major problem as mbind() is unlikely to be used for fine-grained ranges. It is also inefficient because it means we allocate a new policy even in cases where mbind_range() could use the new_policy passed to it. However, it is more straight-forward and the change should be invisible to the user. [mgorman@suse.de: Edited changelog] Reported-by: Dave Jones , Cc: Christoph Lameter , Reviewed-by: Christoph Lameter Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 52 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 92daa26..f0728ae 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -607,24 +607,39 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return first; } -/* Apply policy to a single VMA */ -static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) +/* + * Apply policy to a single VMA + * This must be called with the mmap_sem held for writing. + */ +static int vma_replace_policy(struct vm_area_struct *vma, + struct mempolicy *pol) { - int err = 0; - struct mempolicy *old = vma->vm_policy; + int err; + struct mempolicy *old; + struct mempolicy *new; pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", vma->vm_start, vma->vm_end, vma->vm_pgoff, vma->vm_ops, vma->vm_file, vma->vm_ops ? vma->vm_ops->set_policy : NULL); - if (vma->vm_ops && vma->vm_ops->set_policy) + new = mpol_dup(pol); + if (IS_ERR(new)) + return PTR_ERR(new); + + if (vma->vm_ops && vma->vm_ops->set_policy) { err = vma->vm_ops->set_policy(vma, new); - if (!err) { - mpol_get(new); - vma->vm_policy = new; - mpol_put(old); + if (err) + goto err_out; } + + old = vma->vm_policy; + vma->vm_policy = new; /* protected by mmap_sem */ + mpol_put(old); + + return 0; + err_out: + mpol_put(new); return err; } @@ -676,7 +691,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (err) goto out; } - err = policy_vma(vma, new_pol); + err = vma_replace_policy(vma, new_pol); if (err) goto out; } @@ -2153,15 +2168,24 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { - struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); + struct sp_node *n; + struct mempolicy *newpol; + n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; + + newpol = mpol_dup(pol); + if (IS_ERR(newpol)) { + kmem_cache_free(sn_cache, n); + return NULL; + } + newpol->flags |= MPOL_F_SHARED; + n->start = start; n->end = end; - mpol_get(pol); - pol->flags |= MPOL_F_SHARED; /* for unref */ - n->policy = pol; + n->policy = newpol; + return n; } -- cgit v1.1 From b22d127a39ddd10d93deee3d96e643657ad53a49 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:17 -0700 Subject: mempolicy: fix a race in shared_policy_replace() shared_policy_replace() use of sp_alloc() is unsafe. 1) sp_node cannot be dereferenced if sp->lock is not held and 2) another thread can modify sp_node between spin_unlock for allocating a new sp node and next spin_lock. The bug was introduced before 2.6.12-rc2. Kosaki's original patch for this problem was to allocate an sp node and policy within shared_policy_replace and initialise it when the lock is reacquired. I was not keen on this approach because it partially duplicates sp_alloc(). As the paths were sp->lock is taken are not that performance critical this patch converts sp->lock to sp->mutex so it can sleep when calling sp_alloc(). [kosaki.motohiro@jp.fujitsu.com: Original patch] Signed-off-by: Mel Gorman Acked-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f0728ae..b2f12ec 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2083,7 +2083,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) */ /* lookup first element intersecting start-end */ -/* Caller holds sp->lock */ +/* Caller holds sp->mutex */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { @@ -2147,13 +2147,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) if (!sp->root.rb_node) return NULL; - spin_lock(&sp->lock); + mutex_lock(&sp->mutex); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - spin_unlock(&sp->lock); + mutex_unlock(&sp->mutex); return pol; } @@ -2193,10 +2193,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, static int shared_policy_replace(struct shared_policy *sp, unsigned long start, unsigned long end, struct sp_node *new) { - struct sp_node *n, *new2 = NULL; + struct sp_node *n; + int ret = 0; -restart: - spin_lock(&sp->lock); + mutex_lock(&sp->mutex); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { @@ -2209,16 +2209,14 @@ restart: } else { /* Old policy spanning whole new range. */ if (n->end > end) { + struct sp_node *new2; + new2 = sp_alloc(end, n->end, n->policy); if (!new2) { - spin_unlock(&sp->lock); - new2 = sp_alloc(end, n->end, n->policy); - if (!new2) - return -ENOMEM; - goto restart; + ret = -ENOMEM; + goto out; } n->end = start; sp_insert(sp, new2); - new2 = NULL; break; } else n->end = start; @@ -2229,12 +2227,9 @@ restart: } if (new) sp_insert(sp, new); - spin_unlock(&sp->lock); - if (new2) { - mpol_put(new2->policy); - kmem_cache_free(sn_cache, new2); - } - return 0; +out: + mutex_unlock(&sp->mutex); + return ret; } /** @@ -2252,7 +2247,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ - spin_lock_init(&sp->lock); + mutex_init(&sp->mutex); if (mpol) { struct vm_area_struct pvma; @@ -2318,7 +2313,7 @@ void mpol_free_shared_policy(struct shared_policy *p) if (!p->root.rb_node) return; - spin_lock(&p->lock); + mutex_lock(&p->mutex); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); @@ -2327,7 +2322,7 @@ void mpol_free_shared_policy(struct shared_policy *p) mpol_put(n->policy); kmem_cache_free(sn_cache, n); } - spin_unlock(&p->lock); + mutex_unlock(&p->mutex); } /* assumes fs == KERNEL_DS */ -- cgit v1.1 From 63f74ca21f1fad36d075e063f06dcc6d39fe86b2 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 8 Oct 2012 16:29:19 -0700 Subject: mempolicy: fix refcount leak in mpol_set_shared_policy() When shared_policy_replace() fails to allocate new->policy is not freed correctly by mpol_set_shared_policy(). The problem is that shared mempolicy code directly call kmem_cache_free() in multiple places where it is easy to make a mistake. This patch creates an sp_free wrapper function and uses it. The bug was introduced pre-git age (IOW, before 2.6.12-rc2). [mgorman@suse.de: Editted changelog] Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b2f12ec..1763418 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2157,12 +2157,17 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) return pol; } +static void sp_free(struct sp_node *n) +{ + mpol_put(n->policy); + kmem_cache_free(sn_cache, n); +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); - mpol_put(n->policy); - kmem_cache_free(sn_cache, n); + sp_free(n); } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, @@ -2301,7 +2306,7 @@ int mpol_set_shared_policy(struct shared_policy *info, } err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); if (err && new) - kmem_cache_free(sn_cache, new); + sp_free(new); return err; } @@ -2318,9 +2323,7 @@ void mpol_free_shared_policy(struct shared_policy *p) while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); - rb_erase(&n->nd, &p->root); - mpol_put(n->policy); - kmem_cache_free(sn_cache, n); + sp_delete(p, n); } mutex_unlock(&p->mutex); } -- cgit v1.1 From 00442ad04a5eac08a98255697c510e708f6082e2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:29:20 -0700 Subject: mempolicy: fix a memory corruption by refcount imbalance in alloc_pages_vma() Commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a potential memory corruption. shmem_alloc_page() uses a pseudo vma and it has one significant unique combination, vma->vm_ops=NULL and vma->policy->flags & MPOL_F_SHARED. get_vma_policy() does NOT increase a policy ref when vma->vm_ops=NULL and mpol_cond_put() DOES decrease a policy ref when a policy has MPOL_F_SHARED. Therefore, when a cpuset update race occurs, alloc_pages_vma() falls in 'goto retry_cpuset' path, decrements the reference count and frees the policy prematurely. Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1763418..3d64b36 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1552,8 +1552,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task, addr); if (vpol) pol = vpol; - } else if (vma->vm_policy) + } else if (vma->vm_policy) { pol = vma->vm_policy; + + /* + * shmem_alloc_page() passes MPOL_F_SHARED policy with + * a pseudo vma whose vma->vm_ops=NULL. Take a reference + * count on these policies which will be dropped by + * mpol_cond_put() later + */ + if (mpol_needs_cond_ref(pol)) + mpol_get(pol); + } } if (!pol) pol = &default_policy; -- cgit v1.1 From 21a92735f660eaecf69a6f2e777f18463760ec32 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 8 Oct 2012 16:29:24 -0700 Subject: mm: mmu_notifier: have mmu_notifiers use a global SRCU so they may safely schedule With an RCU based mmu_notifier implementation, any callout to mmu_notifier_invalidate_range_{start,end}() or mmu_notifier_invalidate_page() would not be allowed to call schedule() as that could potentially allow a modification to the mmu_notifier structure while it is currently being used. Since srcu allocs 4 machine words per instance per cpu, we may end up with memory exhaustion if we use srcu per mm. So all mms share a global srcu. Note that during large mmu_notifier activity exit & unregister paths might hang for longer periods, but it is tolerable for current mmu_notifier clients. Signed-off-by: Sagi Grimberg Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Haggai Eran Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 73 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 862b608..35ff447 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -14,10 +14,14 @@ #include #include #include +#include #include #include #include +/* global SRCU for all MMs */ +struct srcu_struct srcu; + /* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap @@ -25,8 +29,8 @@ * in parallel despite there being no task using this mm any more, * through the vmas outside of the exit_mmap context, such as with * vmtruncate. This serializes against mmu_notifier_unregister with - * the mmu_notifier_mm->lock in addition to RCU and it serializes - * against the other mmu notifiers with RCU. struct mmu_notifier_mm + * the mmu_notifier_mm->lock in addition to SRCU and it serializes + * against the other mmu notifiers with SRCU. struct mmu_notifier_mm * can't go away from under us as exit_mmap holds an mm_count pin * itself. */ @@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm) { struct mmu_notifier *mn; struct hlist_node *n; + int id; /* * RCU here will block mmu_notifier_unregister until * ->release returns. */ - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) /* * if ->release runs before mmu_notifier_unregister it @@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm) */ if (mn->ops->release) mn->ops->release(mn, mm); - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { @@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm) spin_unlock(&mm->mmu_notifier_mm->lock); /* - * synchronize_rcu here prevents mmu_notifier_release to + * synchronize_srcu here prevents mmu_notifier_release to * return to exit_mmap (which would proceed freeing all pages * in the mm) until the ->release method returns, if it was * invoked by mmu_notifier_unregister. @@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm) * The mmu_notifier_mm can't go away from under us because one * mm_count is hold by exit_mmap. */ - synchronize_rcu(); + synchronize_srcu(&srcu); } /* @@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; - int young = 0; + int young = 0, id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->clear_flush_young) young |= mn->ops->clear_flush_young(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); return young; } @@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; - int young = 0; + int young = 0, id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->test_young) { young = mn->ops->test_young(mn, mm, address); @@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm, break; } } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); return young; } @@ -126,8 +131,9 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->change_pte) mn->ops->change_pte(mn, mm, address, pte); @@ -138,7 +144,7 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, else if (mn->ops->invalidate_page) mn->ops->invalidate_page(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_page(struct mm_struct *mm, @@ -146,13 +152,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_page) mn->ops->invalidate_page(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, @@ -160,13 +167,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_start) mn->ops->invalidate_range_start(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, @@ -174,13 +182,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_end) mn->ops->invalidate_range_end(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } static int do_mmu_notifier_register(struct mmu_notifier *mn, @@ -192,6 +201,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, BUG_ON(atomic_read(&mm->mm_users) <= 0); + /* + * Verify that mmu_notifier_init() already run and the global srcu is + * initialized. + */ + BUG_ON(!srcu.per_cpu_ref); + ret = -ENOMEM; mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); if (unlikely(!mmu_notifier_mm)) @@ -274,8 +289,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm) /* * This releases the mm_count pin automatically and frees the mm * structure if it was the last user of it. It serializes against - * running mmu notifiers with RCU and against mmu_notifier_unregister - * with the unregister lock + RCU. All sptes must be dropped before + * running mmu notifiers with SRCU and against mmu_notifier_unregister + * with the unregister lock + SRCU. All sptes must be dropped before * calling mmu_notifier_unregister. ->release or any other notifier * method may be invoked concurrently with mmu_notifier_unregister, * and only after mmu_notifier_unregister returned we're guaranteed @@ -290,8 +305,9 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) * RCU here will force exit_mmap to wait ->release to finish * before freeing the pages. */ - rcu_read_lock(); + int id; + id = srcu_read_lock(&srcu); /* * exit_mmap will block in mmu_notifier_release to * guarantee ->release is called before freeing the @@ -299,7 +315,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) */ if (mn->ops->release) mn->ops->release(mn, mm); - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); spin_lock(&mm->mmu_notifier_mm->lock); hlist_del_rcu(&mn->hlist); @@ -310,10 +326,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) * Wait any running method to finish, of course including * ->release if it was run by mmu_notifier_relase instead of us. */ - synchronize_rcu(); + synchronize_srcu(&srcu); BUG_ON(atomic_read(&mm->mm_count) <= 0); mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister); + +static int __init mmu_notifier_init(void) +{ + return init_srcu_struct(&srcu); +} + +module_init(mmu_notifier_init); -- cgit v1.1 From e0f3c3f78da29b114e7c1c68019036559f715948 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 8 Oct 2012 16:29:26 -0700 Subject: mm/mmu_notifier: init notifier if necessary While registering MMU notifier, new instance of MMU notifier_mm will be allocated and later free'd if currrent mm_struct's MMU notifier_mm has been initialized. That causes some overhead. The patch tries to elominate that. Signed-off-by: Gavin Shan Signed-off-by: Wanpeng Li Cc: Andrea Arcangeli Cc: Avi Kivity Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Xiao Guangrong Cc: Sagi Grimberg Cc: Haggai Eran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 35ff447..947df83 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -207,22 +207,23 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, */ BUG_ON(!srcu.per_cpu_ref); - ret = -ENOMEM; - mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); - if (unlikely(!mmu_notifier_mm)) - goto out; - if (take_mmap_sem) down_write(&mm->mmap_sem); ret = mm_take_all_locks(mm); if (unlikely(ret)) - goto out_cleanup; + goto out; if (!mm_has_notifiers(mm)) { + mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), + GFP_KERNEL); + if (unlikely(!mmu_notifier_mm)) { + ret = -ENOMEM; + goto out_of_mem; + } INIT_HLIST_HEAD(&mmu_notifier_mm->list); spin_lock_init(&mmu_notifier_mm->lock); + mm->mmu_notifier_mm = mmu_notifier_mm; - mmu_notifier_mm = NULL; } atomic_inc(&mm->mm_count); @@ -238,13 +239,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); spin_unlock(&mm->mmu_notifier_mm->lock); +out_of_mem: mm_drop_all_locks(mm); -out_cleanup: +out: if (take_mmap_sem) up_write(&mm->mmap_sem); - /* kfree() does nothing if mmu_notifier_mm is NULL */ - kfree(mmu_notifier_mm); -out: + BUG_ON(atomic_read(&mm->mm_users) <= 0); return ret; } -- cgit v1.1 From d5dc0ad928fb9e972001e552597fd0b794863f34 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 8 Oct 2012 16:29:27 -0700 Subject: mm/vmscan: fix error number for failed kthread Fix the return value while failing to create the kswapd kernel thread. Also, the error message is prioritized as KERN_ERR. Signed-off-by: Gavin Shan Signed-off-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index f6fdae7..d16bf5a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3126,9 +3126,9 @@ int kswapd_run(int nid) if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ BUG_ON(system_state == SYSTEM_BOOTING); - printk("Failed to start kswapd on node %d\n",nid); pgdat->kswapd = NULL; - ret = -1; + pr_err("Failed to start kswapd on node %d\n", nid); + ret = PTR_ERR(pgdat->kswapd); } return ret; } -- cgit v1.1 From 01dc52ebdf472f77cca623ca693ca24cfc0f1bbe Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Oct 2012 16:29:30 -0700 Subject: oom: remove deprecated oom_adj The deprecated /proc//oom_adj is scheduled for removal this month. Signed-off-by: Davidlohr Bueso Acked-by: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1986008..79e0f3e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_adj=%d, oom_score_adj=%d\n", - current->comm, gfp_mask, order, current->signal->oom_adj, + "oom_score_adj=%d\n", + current->comm, gfp_mask, order, current->signal->oom_score_adj); cpuset_print_task_mems_allowed(current); task_unlock(current); -- cgit v1.1 From 5d3a551c28c6669dc43be40d8fafafbc2ec8f42b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 8 Oct 2012 16:29:32 -0700 Subject: mm: hugetlb: add arch hook for clearing page flags before entering pool The core page allocator ensures that page flags are zeroed when freeing pages via free_pages_check. A number of architectures (ARM, PPC, MIPS) rely on this property to treat new pages as dirty with respect to the data cache and perform the appropriate flushing before mapping the pages into userspace. This can lead to cache synchronisation problems when using hugepages, since the allocator keeps its own pool of pages above the usual page allocator and does not reset the page flags when freeing a page into the pool. This patch adds a new architecture hook, arch_clear_hugepage_flags, so that architectures which rely on the page flags being in a particular state for fresh allocations can adjust the flags accordingly when a page is freed into the pool. Signed-off-by: Will Deacon Cc: Michal Hocko Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc72712..f1bb534 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -637,6 +637,7 @@ static void free_huge_page(struct page *page) h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; } else { + arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); -- cgit v1.1 From db971418824381d3583c73751181fec76c743bf6 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:29:34 -0700 Subject: mm: adjust final #endif position in mm/internal.h Make sure the #endif that terminates the standard #ifndef / #define / #endif construct gets labeled, and gets positioned at the end of the file as is normally the case. Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index e549a7f..bbd7b34 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -341,7 +341,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #define ZONE_RECLAIM_FULL -1 #define ZONE_RECLAIM_SOME 0 #define ZONE_RECLAIM_SUCCESS 1 -#endif extern int hwpoison_filter(struct page *p); @@ -357,3 +356,5 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); + +#endif /* __MM_INTERNAL_H */ -- cgit v1.1 From 65b3c07b43f7f8a5cbf8923011bd4e6650e3d1dc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:35 -0700 Subject: thp: fix the count of THP_COLLAPSE_ALLOC THP_COLLAPSE_ALLOC is double counted if NUMA is disabled since it has already been calculated in khugepaged_alloc_hugepage Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9b72d12..860ea91 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1874,9 +1874,9 @@ static void collapse_huge_page(struct mm_struct *mm, *hpage = ERR_PTR(-ENOMEM); return; } + count_vm_event(THP_COLLAPSE_ALLOC); #endif - count_vm_event(THP_COLLAPSE_ALLOC); if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { #ifdef CONFIG_NUMA put_page(new_page); -- cgit v1.1 From 637e3a27ec2c84f7ecd083fa6943da2f19eb5e9f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:38 -0700 Subject: thp: remove unnecessary check in start_khugepaged The check is unnecessary since if mm_slot_cache or mm_slots_hash initialize failed, no sysfs interface will be created Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 860ea91..9833d8e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -140,10 +140,7 @@ static int start_khugepaged(void) int err = 0; if (khugepaged_enabled()) { int wakeup; - if (unlikely(!mm_slot_cache || !mm_slots_hash)) { - err = -ENOMEM; - goto out; - } + mutex_lock(&khugepaged_mutex); if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, @@ -163,7 +160,7 @@ static int start_khugepaged(void) } else /* wakeup to exit */ wake_up_interruptible(&khugepaged_wait); -out: + return err; } -- cgit v1.1 From 911891afe1c3104adf0f802189909868239ebbfd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:41 -0700 Subject: thp: move khugepaged_mutex out of khugepaged Currently, hugepaged_mutex is used really complexly and hard to understand, actually, it is just used to serialize start_khugepaged and khugepaged for these reasons: - khugepaged_thread is shared between them - the thp disable path (echo never > transparent_hugepage/enabled) is nonblocking, so we need to protect khugepaged_thread to get a stable running state These can be avoided by: - use the lock to serialize the thread creation and cancel - thp disable path can not finised until the thread exits Then khugepaged_thread is fully controlled by start_khugepaged, khugepaged will be happy without the lock Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9833d8e..0931b2b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -139,9 +139,6 @@ static int start_khugepaged(void) { int err = 0; if (khugepaged_enabled()) { - int wakeup; - - mutex_lock(&khugepaged_mutex); if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); @@ -151,15 +148,17 @@ static int start_khugepaged(void) err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; } - wakeup = !list_empty(&khugepaged_scan.mm_head); - mutex_unlock(&khugepaged_mutex); - if (wakeup) + + if (!list_empty(&khugepaged_scan.mm_head)) wake_up_interruptible(&khugepaged_wait); set_recommended_min_free_kbytes(); - } else + } else if (khugepaged_thread) { /* wakeup to exit */ wake_up_interruptible(&khugepaged_wait); + kthread_stop(khugepaged_thread); + khugepaged_thread = NULL; + } return err; } @@ -221,7 +220,12 @@ static ssize_t enabled_store(struct kobject *kobj, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); if (ret > 0) { - int err = start_khugepaged(); + int err; + + mutex_lock(&khugepaged_mutex); + err = start_khugepaged(); + mutex_unlock(&khugepaged_mutex); + if (err) ret = err; } @@ -2329,20 +2333,10 @@ static int khugepaged(void *none) set_freezable(); set_user_nice(current, 19); - /* serialize with start_khugepaged() */ - mutex_lock(&khugepaged_mutex); - - for (;;) { - mutex_unlock(&khugepaged_mutex); + while (!kthread_should_stop()) { VM_BUG_ON(khugepaged_thread != current); khugepaged_loop(); VM_BUG_ON(khugepaged_thread != current); - - mutex_lock(&khugepaged_mutex); - if (!khugepaged_enabled()) - break; - if (unlikely(kthread_should_stop())) - break; } spin_lock(&khugepaged_mm_lock); @@ -2351,10 +2345,6 @@ static int khugepaged(void *none) if (mm_slot) collect_mm_slot(mm_slot); spin_unlock(&khugepaged_mm_lock); - - khugepaged_thread = NULL; - mutex_unlock(&khugepaged_mutex); - return 0; } -- cgit v1.1 From e060f0e0139b83f05bb90fa05563d14179b9a7ff Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:42 -0700 Subject: thp: remove unnecessary khugepaged_thread check Now, khugepaged creation and cancel are completely serial under the protection of khugepaged_mutex, it is impossible that many khugepaged entities are running Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0931b2b..46e3f15 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2333,11 +2333,8 @@ static int khugepaged(void *none) set_freezable(); set_user_nice(current, 19); - while (!kthread_should_stop()) { - VM_BUG_ON(khugepaged_thread != current); + while (!kthread_should_stop()) khugepaged_loop(); - VM_BUG_ON(khugepaged_thread != current); - } spin_lock(&khugepaged_mm_lock); mm_slot = khugepaged_scan.mm_slot; -- cgit v1.1 From 2017c0bff8ba79ea527361adbe19471e174775d6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:44 -0700 Subject: thp: remove wake_up_interruptible in the exit path Add the check of kthread_should_stop() to the conditions which are used to wakeup on khugepaged_wait, then kthread_stop is enough to let the thread exit Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 46e3f15..a94c07a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -154,8 +154,6 @@ static int start_khugepaged(void) set_recommended_min_free_kbytes(); } else if (khugepaged_thread) { - /* wakeup to exit */ - wake_up_interruptible(&khugepaged_wait); kthread_stop(khugepaged_thread); khugepaged_thread = NULL; } @@ -2221,7 +2219,7 @@ static int khugepaged_has_work(void) static int khugepaged_wait_event(void) { return !list_empty(&khugepaged_scan.mm_head) || - !khugepaged_enabled(); + kthread_should_stop(); } static void khugepaged_do_scan(struct page **hpage) @@ -2288,6 +2286,24 @@ static struct page *khugepaged_alloc_hugepage(void) } #endif +static void khugepaged_wait_work(void) +{ + try_to_freeze(); + + if (khugepaged_has_work()) { + if (!khugepaged_scan_sleep_millisecs) + return; + + wait_event_freezable_timeout(khugepaged_wait, + kthread_should_stop(), + msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); + return; + } + + if (khugepaged_enabled()) + wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); +} + static void khugepaged_loop(void) { struct page *hpage; @@ -2312,17 +2328,8 @@ static void khugepaged_loop(void) if (hpage) put_page(hpage); #endif - try_to_freeze(); - if (unlikely(kthread_should_stop())) - break; - if (khugepaged_has_work()) { - if (!khugepaged_scan_sleep_millisecs) - continue; - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); - } else if (khugepaged_enabled()) - wait_event_freezable(khugepaged_wait, - khugepaged_wait_event()); + + khugepaged_wait_work(); } } -- cgit v1.1 From 9817626e722a5e5699cf38f5d3a4c9851e054436 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:46 -0700 Subject: thp: remove some code depend on CONFIG_NUMA If NUMA is disabled, hpage is used as page pre-alloc, so there are two cases for hpage: - it is !NULL, means the page is not consumed otherwise, - the page has been consumed If NUMA is enabled, hpage is just used as alloc-fail indicator which is not a real page, NULL means not fail triggered. So, we can release the page only if !IS_ERR_OR_NULL Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a94c07a..1e21b4c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2306,11 +2306,8 @@ static void khugepaged_wait_work(void) static void khugepaged_loop(void) { - struct page *hpage; + struct page *hpage = NULL; -#ifdef CONFIG_NUMA - hpage = NULL; -#endif while (likely(khugepaged_enabled())) { #ifndef CONFIG_NUMA hpage = khugepaged_alloc_hugepage(); @@ -2324,10 +2321,9 @@ static void khugepaged_loop(void) #endif khugepaged_do_scan(&hpage); -#ifndef CONFIG_NUMA - if (hpage) + + if (!IS_ERR_OR_NULL(hpage)) put_page(hpage); -#endif khugepaged_wait_work(); } -- cgit v1.1 From d516904bd239fe2c9f1bd46cf146bb4b8831321c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:48 -0700 Subject: thp: merge page pre-alloc in khugepaged_loop into khugepaged_do_scan There are two pre-alloc operations in these two function, the different is: - it allows to sleep if page alloc fail in khugepaged_loop - it exits immediately if page alloc fail in khugepaged_do_scan Actually, in khugepaged_do_scan, we can allow the pre-alloc to sleep on the first failure, then the operation in khugepaged_loop can be removed Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 97 ++++++++++++++++++++++++++------------------------------ 1 file changed, 45 insertions(+), 52 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1e21b4c..d5b5fcc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2222,10 +2222,40 @@ static int khugepaged_wait_event(void) kthread_should_stop(); } -static void khugepaged_do_scan(struct page **hpage) +static void khugepaged_alloc_sleep(void) +{ + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); +} + +#ifndef CONFIG_NUMA +static struct page *khugepaged_alloc_hugepage(bool *wait) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + if (!*wait) + return NULL; + + *wait = false; + khugepaged_alloc_sleep(); + } else + count_vm_event(THP_COLLAPSE_ALLOC); + } while (unlikely(!hpage) && likely(khugepaged_enabled())); + + return hpage; +} +#endif + +static void khugepaged_do_scan(void) { + struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; unsigned int pages = khugepaged_pages_to_scan; + bool wait = true; barrier(); /* write khugepaged_pages_to_scan to local stack */ @@ -2233,17 +2263,18 @@ static void khugepaged_do_scan(struct page **hpage) cond_resched(); #ifndef CONFIG_NUMA - if (!*hpage) { - *hpage = alloc_hugepage(khugepaged_defrag()); - if (unlikely(!*hpage)) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + if (!hpage) + hpage = khugepaged_alloc_hugepage(&wait); + + if (unlikely(!hpage)) + break; +#else + if (IS_ERR(hpage)) { + if (!wait) break; - } - count_vm_event(THP_COLLAPSE_ALLOC); + wait = false; + khugepaged_alloc_sleep(); } -#else - if (IS_ERR(*hpage)) - break; #endif if (unlikely(kthread_should_stop() || freezing(current))) @@ -2255,37 +2286,16 @@ static void khugepaged_do_scan(struct page **hpage) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - hpage); + &hpage); else progress = pages; spin_unlock(&khugepaged_mm_lock); } -} -static void khugepaged_alloc_sleep(void) -{ - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + if (!IS_ERR_OR_NULL(hpage)) + put_page(hpage); } -#ifndef CONFIG_NUMA -static struct page *khugepaged_alloc_hugepage(void) -{ - struct page *hpage; - - do { - hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && - likely(khugepaged_enabled())); - return hpage; -} -#endif - static void khugepaged_wait_work(void) { try_to_freeze(); @@ -2306,25 +2316,8 @@ static void khugepaged_wait_work(void) static void khugepaged_loop(void) { - struct page *hpage = NULL; - while (likely(khugepaged_enabled())) { -#ifndef CONFIG_NUMA - hpage = khugepaged_alloc_hugepage(); - if (unlikely(!hpage)) - break; -#else - if (IS_ERR(hpage)) { - khugepaged_alloc_sleep(); - hpage = NULL; - } -#endif - - khugepaged_do_scan(&hpage); - - if (!IS_ERR_OR_NULL(hpage)) - put_page(hpage); - + khugepaged_do_scan(); khugepaged_wait_work(); } } -- cgit v1.1 From 420256ef02660af0acf28c12fe4b7d514ca88a4d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:49 -0700 Subject: thp: release page in page pre-alloc path If NUMA is enabled, we can release the page in the page pre-alloc operation, then the CONFIG_NUMA dependent code can be reduced Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d5b5fcc..9c4390f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1873,15 +1873,12 @@ static void collapse_huge_page(struct mm_struct *mm, *hpage = ERR_PTR(-ENOMEM); return; } + *hpage = new_page; count_vm_event(THP_COLLAPSE_ALLOC); #endif - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { -#ifdef CONFIG_NUMA - put_page(new_page); -#endif + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) return; - } /* * Prevent all access to pagetables with the exception of @@ -1982,9 +1979,8 @@ static void collapse_huge_page(struct mm_struct *mm, prepare_pmd_huge_pte(pgtable, mm); spin_unlock(&mm->page_table_lock); -#ifndef CONFIG_NUMA *hpage = NULL; -#endif + khugepaged_pages_collapsed++; out_up_write: up_write(&mm->mmap_sem); @@ -1992,9 +1988,6 @@ out_up_write: out: mem_cgroup_uncharge_page(new_page); -#ifdef CONFIG_NUMA - put_page(new_page); -#endif goto out_up_write; } @@ -2260,8 +2253,6 @@ static void khugepaged_do_scan(void) barrier(); /* write khugepaged_pages_to_scan to local stack */ while (progress < pages) { - cond_resched(); - #ifndef CONFIG_NUMA if (!hpage) hpage = khugepaged_alloc_hugepage(&wait); @@ -2274,8 +2265,12 @@ static void khugepaged_do_scan(void) break; wait = false; khugepaged_alloc_sleep(); + } else if (hpage) { + put_page(hpage); + hpage = NULL; } #endif + cond_resched(); if (unlikely(kthread_should_stop() || freezing(current))) break; -- cgit v1.1 From 26234f36ef3ec7efcfa9acb181427849c1f9db7c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:51 -0700 Subject: thp: introduce khugepaged_prealloc_page and khugepaged_alloc_page They are used to abstract the difference between NUMA enabled and NUMA disabled to make the code more readable Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 166 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 98 insertions(+), 68 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9c4390f..f0e9993 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1827,28 +1827,34 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } } -static void collapse_huge_page(struct mm_struct *mm, - unsigned long address, - struct page **hpage, - struct vm_area_struct *vma, - int node) +static void khugepaged_alloc_sleep(void) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd, _pmd; - pte_t *pte; - pgtable_t pgtable; - struct page *new_page; - spinlock_t *ptl; - int isolated; - unsigned long hstart, hend; + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); +} - VM_BUG_ON(address & ~HPAGE_PMD_MASK); -#ifndef CONFIG_NUMA - up_read(&mm->mmap_sem); - VM_BUG_ON(!*hpage); - new_page = *hpage; -#else +#ifdef CONFIG_NUMA +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (IS_ERR(*hpage)) { + if (!*wait) + return false; + + *wait = false; + khugepaged_alloc_sleep(); + } else if (*hpage) { + put_page(*hpage); + *hpage = NULL; + } + + return true; +} + +static struct page +*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + int node) +{ VM_BUG_ON(*hpage); /* * Allocate the page while the vma is still valid and under @@ -1860,7 +1866,7 @@ static void collapse_huge_page(struct mm_struct *mm, * mmap_sem in read mode is good idea also to allow greater * scalability. */ - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, + *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, node, __GFP_OTHER_NODE); /* @@ -1868,15 +1874,81 @@ static void collapse_huge_page(struct mm_struct *mm, * preparation for taking it in write mode. */ up_read(&mm->mmap_sem); - if (unlikely(!new_page)) { + if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); - return; + return NULL; } - *hpage = new_page; + count_vm_event(THP_COLLAPSE_ALLOC); + return *hpage; +} +#else +static struct page *khugepaged_alloc_hugepage(bool *wait) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + if (!*wait) + return NULL; + + *wait = false; + khugepaged_alloc_sleep(); + } else + count_vm_event(THP_COLLAPSE_ALLOC); + } while (unlikely(!hpage) && likely(khugepaged_enabled())); + + return hpage; +} + +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (!*hpage) + *hpage = khugepaged_alloc_hugepage(wait); + + if (unlikely(!*hpage)) + return false; + + return true; +} + +static struct page +*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + int node) +{ + up_read(&mm->mmap_sem); + VM_BUG_ON(!*hpage); + return *hpage; +} #endif +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage, + struct vm_area_struct *vma, + int node) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *ptl; + int isolated; + unsigned long hstart, hend; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + /* release the mmap_sem read lock. */ + new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); + if (!new_page) + return; + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) return; @@ -2215,34 +2287,6 @@ static int khugepaged_wait_event(void) kthread_should_stop(); } -static void khugepaged_alloc_sleep(void) -{ - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); -} - -#ifndef CONFIG_NUMA -static struct page *khugepaged_alloc_hugepage(bool *wait) -{ - struct page *hpage; - - do { - hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - if (!*wait) - return NULL; - - *wait = false; - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && likely(khugepaged_enabled())); - - return hpage; -} -#endif - static void khugepaged_do_scan(void) { struct page *hpage = NULL; @@ -2253,23 +2297,9 @@ static void khugepaged_do_scan(void) barrier(); /* write khugepaged_pages_to_scan to local stack */ while (progress < pages) { -#ifndef CONFIG_NUMA - if (!hpage) - hpage = khugepaged_alloc_hugepage(&wait); - - if (unlikely(!hpage)) + if (!khugepaged_prealloc_page(&hpage, &wait)) break; -#else - if (IS_ERR(hpage)) { - if (!wait) - break; - wait = false; - khugepaged_alloc_sleep(); - } else if (hpage) { - put_page(hpage); - hpage = NULL; - } -#endif + cond_resched(); if (unlikely(kthread_should_stop() || freezing(current))) -- cgit v1.1 From b7231789b0224e73af4efc7973f8bcf17fc16edd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:54 -0700 Subject: thp: remove khugepaged_loop Merge khugepaged_loop into khugepaged Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f0e9993..77b470b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2339,14 +2339,6 @@ static void khugepaged_wait_work(void) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } -static void khugepaged_loop(void) -{ - while (likely(khugepaged_enabled())) { - khugepaged_do_scan(); - khugepaged_wait_work(); - } -} - static int khugepaged(void *none) { struct mm_slot *mm_slot; @@ -2354,8 +2346,10 @@ static int khugepaged(void *none) set_freezable(); set_user_nice(current, 19); - while (!kthread_should_stop()) - khugepaged_loop(); + while (!kthread_should_stop()) { + khugepaged_do_scan(); + khugepaged_wait_work(); + } spin_lock(&khugepaged_mm_lock); mm_slot = khugepaged_scan.mm_slot; -- cgit v1.1 From 17c230afa58a6d013a4949d5c04b823a281d40fa Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:56 -0700 Subject: thp: use khugepaged_enabled to remove duplicate code Use khugepaged_enabled to see whether thp is enabled Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 77b470b..a169c4d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -102,10 +102,7 @@ static int set_recommended_min_free_kbytes(void) unsigned long recommended_min; extern int min_free_kbytes; - if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, - &transparent_hugepage_flags) && - !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - &transparent_hugepage_flags)) + if (!khugepaged_enabled()) return 0; for_each_populated_zone(zone) @@ -228,11 +225,7 @@ static ssize_t enabled_store(struct kobject *kobj, ret = err; } - if (ret > 0 && - (test_bit(TRANSPARENT_HUGEPAGE_FLAG, - &transparent_hugepage_flags) || - test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - &transparent_hugepage_flags))) + if (ret > 0 && khugepaged_enabled()) set_recommended_min_free_kbytes(); return ret; -- cgit v1.1 From 227e4047488d3ee0173a914275a7fd207ad51e5b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:29:59 -0700 Subject: thp: remove unnecessary set_recommended_min_free_kbytes Since it is called in start_khugepaged Signed-off-by: Xiao Guangrong Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a169c4d..e19cc42 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -225,9 +225,6 @@ static ssize_t enabled_store(struct kobject *kobj, ret = err; } - if (ret > 0 && khugepaged_enabled()) - set_recommended_min_free_kbytes(); - return ret; } static struct kobj_attribute enabled_attr = @@ -562,8 +559,6 @@ static int __init hugepage_init(void) start_khugepaged(); - set_recommended_min_free_kbytes(); - return 0; out: hugepage_exit_sysfs(hugepage_kobj); -- cgit v1.1 From ca42b26ab285edc5ee3f9faa48379d258db53c35 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:01 -0700 Subject: mm: fix potential anon_vma locking issue in mprotect() Fix an anon_vma locking issue in the following situation: - vma has no anon_vma - next has an anon_vma - vma is being shrunk / next is being expanded, due to an mprotect call We need to take next's anon_vma lock to avoid races with rmap users (such as page migration) while next is being expanded. Signed-off-by: Michel Lespinasse Reviewed-by: Andrea Arcangeli Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 2ba2743..e3c365f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -573,8 +573,12 @@ again: remove_next = 1 + (end > next->vm_end); */ if (vma->anon_vma && (importer || start != vma->vm_start)) { anon_vma = vma->anon_vma; + VM_BUG_ON(adjust_next && next->anon_vma && + anon_vma != next->anon_vma); + } else if (adjust_next && next->anon_vma) + anon_vma = next->anon_vma; + if (anon_vma) anon_vma_lock(anon_vma); - } if (root) { flush_dcache_mmap_lock(mapping); -- cgit v1.1 From 15626062f4a98279c59a2a5208c496cf65cbf8c0 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:04 -0700 Subject: thp, x86: introduce HAVE_ARCH_TRANSPARENT_HUGEPAGE Cleanup patch in preparation for transparent hugepage support on s390. Adding new architectures to the TRANSPARENT_HUGEPAGE config option can make the "depends" line rather ugly, like "depends on (X86 || (S390 && 64BIT)) && MMU". This patch adds a HAVE_ARCH_TRANSPARENT_HUGEPAGE instead. x86 already has MMU "def_bool y", so the MMU check is superfluous there and HAVE_ARCH_TRANSPARENT_HUGEPAGE can be selected in arch/x86/Kconfig. Signed-off-by: Gerald Schaefer Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index d5c8019..3322342 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -318,7 +318,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" - depends on X86 && MMU + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE select COMPACTION help Transparent Hugepages allows the kernel to use huge pages and -- cgit v1.1 From e3ebcf64381188a2744a9829a4eb5c2b60f1974c Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:07 -0700 Subject: thp: remove assumptions on pgtable_t type The thp page table pre-allocation code currently assumes that pgtable_t is of type "struct page *". This may not be true for all architectures, so this patch removes that assumption by replacing the functions prepare_pmd_huge_pte() and get_pmd_huge_pte() with two new functions that can be defined architecture-specific. It also removes two VM_BUG_ON checks for page_count() and page_mapcount() operating on a pgtable_t. Apart from the VM_BUG_ON removal, there will be no functional change introduced by this patch. Signed-off-by: Gerald Schaefer Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 50 ++++++++------------------------------------------ mm/pgtable-generic.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 42 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e19cc42..9ea6d19 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -598,19 +598,6 @@ out: } __setup("transparent_hugepage=", setup_transparent_hugepage); -static void prepare_pmd_huge_pte(pgtable_t pgtable, - struct mm_struct *mm) -{ - assert_spin_locked(&mm->page_table_lock); - - /* FIFO */ - if (!mm->pmd_huge_pte) - INIT_LIST_HEAD(&pgtable->lru); - else - list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); - mm->pmd_huge_pte = pgtable; -} - static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) @@ -652,7 +639,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, */ page_add_new_anon_rmap(page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - prepare_pmd_huge_pte(pgtable, mm); + pgtable_trans_huge_deposit(mm, pgtable); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); mm->nr_ptes++; spin_unlock(&mm->page_table_lock); @@ -778,7 +765,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - prepare_pmd_huge_pte(pgtable, dst_mm); + pgtable_trans_huge_deposit(dst_mm, pgtable); dst_mm->nr_ptes++; ret = 0; @@ -789,25 +776,6 @@ out: return ret; } -/* no "address" argument so destroys page coloring of some arch */ -pgtable_t get_pmd_huge_pte(struct mm_struct *mm) -{ - pgtable_t pgtable; - - assert_spin_locked(&mm->page_table_lock); - - /* FIFO */ - pgtable = mm->pmd_huge_pte; - if (list_empty(&pgtable->lru)) - mm->pmd_huge_pte = NULL; - else { - mm->pmd_huge_pte = list_entry(pgtable->lru.next, - struct page, lru); - list_del(&pgtable->lru); - } - return pgtable; -} - static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -863,7 +831,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pmdp_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = get_pmd_huge_pte(mm); + pgtable = pgtable_trans_huge_withdraw(mm); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1028,7 +996,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (__pmd_trans_huge_lock(pmd, vma) == 1) { struct page *page; pgtable_t pgtable; - pgtable = get_pmd_huge_pte(tlb->mm); + pgtable = pgtable_trans_huge_withdraw(tlb->mm); page = pmd_page(*pmd); pmd_clear(pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); @@ -1345,11 +1313,11 @@ static int __split_huge_page_map(struct page *page, pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); if (pmd) { - pgtable = get_pmd_huge_pte(mm); + pgtable = pgtable_trans_huge_withdraw(mm); pmd_populate(mm, &_pmd, pgtable); - for (i = 0, haddr = address; i < HPAGE_PMD_NR; - i++, haddr += PAGE_SIZE) { + haddr = address; + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { pte_t *pte, entry; BUG_ON(PageCompound(page+i)); entry = mk_pte(page + i, vma->vm_page_prot); @@ -2017,8 +1985,6 @@ static void collapse_huge_page(struct mm_struct *mm, pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); - VM_BUG_ON(page_count(pgtable) != 1); - VM_BUG_ON(page_mapcount(pgtable) != 0); _pmd = mk_pmd(new_page, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); @@ -2036,7 +2002,7 @@ static void collapse_huge_page(struct mm_struct *mm, page_add_new_anon_rmap(new_page, vma, address); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache(vma, address, _pmd); - prepare_pmd_huge_pte(pgtable, mm); + pgtable_trans_huge_deposit(mm, pgtable); spin_unlock(&mm->page_table_lock); *hpage = NULL; diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 74c0dda..29867e0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -120,3 +120,42 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif + +#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +{ + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + if (!mm->pmd_huge_pte) + INIT_LIST_HEAD(&pgtable->lru); + else + list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); + mm->pmd_huge_pte = pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* no "address" argument so destroys page coloring of some arch */ +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +{ + pgtable_t pgtable; + + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + if (list_empty(&pgtable->lru)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = list_entry(pgtable->lru.next, + struct page, lru); + list_del(&pgtable->lru); + } + return pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif -- cgit v1.1 From 46dcde735c9d8953bbd8d105ca6779e5b5300c28 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:09 -0700 Subject: thp: introduce pmdp_invalidate() On s390, a valid page table entry must not be changed while it is attached to any CPU. So instead of pmd_mknotpresent() and set_pmd_at(), an IDTE operation would be necessary there. This patch introduces the pmdp_invalidate() function, to allow architecture-specific implementations. Signed-off-by: Gerald Schaefer Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 3 +-- mm/pgtable-generic.c | 11 +++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9ea6d19..bec6243 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1361,8 +1361,7 @@ static int __split_huge_page_map(struct page *page, * SMP TLB and finally we write the non-huge version * of the pmd entry with pmd_populate. */ - set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + pmdp_invalidate(vma, address, pmd); pmd_populate(mm, pmd, pgtable); ret = 1; } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 29867e0..e642627 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -159,3 +159,14 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif + +#ifndef __HAVE_ARCH_PMDP_INVALIDATE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif -- cgit v1.1 From 8e72033f2a489b6c98c4e3c7cc281b1afd6cb85c Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Oct 2012 16:30:12 -0700 Subject: thp: make MADV_HUGEPAGE check for mm->def_flags This adds a check to hugepage_madvise(), to refuse MADV_HUGEPAGE if VM_NOHUGEPAGE is set in mm->def_flags. On s390, the VM_NOHUGEPAGE flag will be set in mm->def_flags for kvm processes, to prevent any future thp mappings. In order to also prevent MADV_HUGEPAGE on such an mm, hugepage_madvise() should check mm->def_flags. Signed-off-by: Gerald Schaefer Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Hugh Dickins Cc: Hillf Danton Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bec6243..010d329 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1450,6 +1450,8 @@ out: int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { + struct mm_struct *mm = vma->vm_mm; + switch (advice) { case MADV_HUGEPAGE: /* @@ -1457,6 +1459,8 @@ int hugepage_madvise(struct vm_area_struct *vma, */ if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) return -EINVAL; + if (mm->def_flags & VM_NOHUGEPAGE) + return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* -- cgit v1.1 From 6b2dbba8b6ac4df26f72eda1e5ea7bab9f950e08 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:25 -0700 Subject: mm: replace vma prio_tree with an interval tree Implement an interval tree as a replacement for the VMA prio_tree. The algorithms are similar to lib/interval_tree.c; however that code can't be directly reused as the interval endpoints are not explicitly stored in the VMA. So instead, the common algorithm is moved into a template and the details (node type, how to get interval endpoints from the node, etc) are filled in using the C preprocessor. Once the interval tree functions are available, using them as a replacement to the VMA prio tree is a relatively simple, mechanical job. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 4 +- mm/filemap_xip.c | 3 +- mm/fremap.c | 2 +- mm/hugetlb.c | 3 +- mm/interval_tree.c | 61 +++++++++++++++ mm/memory-failure.c | 3 +- mm/memory.c | 9 +-- mm/mmap.c | 22 +++--- mm/nommu.c | 12 ++- mm/prio_tree.c | 208 ---------------------------------------------------- mm/rmap.c | 18 ++--- 11 files changed, 94 insertions(+), 251 deletions(-) create mode 100644 mm/interval_tree.c delete mode 100644 mm/prio_tree.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 92753e2..6b025f8 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -14,9 +14,9 @@ endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ - prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ + util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o $(mmu-y) + compaction.o interval_tree.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 9175022..a52daee 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping, { struct vm_area_struct *vma; struct mm_struct *mm; - struct prio_tree_iter iter; unsigned long address; pte_t *pte; pte_t pteval; @@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping, retry: mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); diff --git a/mm/fremap.c b/mm/fremap.c index 3d731a4..3899a86 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -214,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f1bb534..c9b40e3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2474,7 +2474,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; - struct prio_tree_iter iter; pgoff_t pgoff; /* @@ -2491,7 +2490,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * __unmap_hugepage_range() is called as the lock is already held */ mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) continue; diff --git a/mm/interval_tree.c b/mm/interval_tree.c new file mode 100644 index 0000000..7dc56566 --- /dev/null +++ b/mm/interval_tree.c @@ -0,0 +1,61 @@ +/* + * mm/interval_tree.c - interval tree for mapping->i_mmap + * + * Copyright (C) 2012, Michel Lespinasse + * + * This file is released under the GPL v2. + */ + +#include +#include + +#define ITSTRUCT struct vm_area_struct +#define ITRB shared.linear.rb +#define ITTYPE unsigned long +#define ITSUBTREE shared.linear.rb_subtree_last +#define ITSTART(n) ((n)->vm_pgoff) +#define ITLAST(n) ((n)->vm_pgoff + \ + (((n)->vm_end - (n)->vm_start) >> PAGE_SHIFT) - 1) +#define ITSTATIC +#define ITPREFIX vma_interval_tree + +#include + +/* Insert old immediately after vma in the interval tree */ +void vma_interval_tree_add(struct vm_area_struct *vma, + struct vm_area_struct *old, + struct address_space *mapping) +{ + struct rb_node **link; + struct vm_area_struct *parent; + unsigned long last; + + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + list_add(&vma->shared.nonlinear, &old->shared.nonlinear); + return; + } + + last = ITLAST(vma); + + if (!old->shared.linear.rb.rb_right) { + parent = old; + link = &old->shared.linear.rb.rb_right; + } else { + parent = rb_entry(old->shared.linear.rb.rb_right, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + while (parent->shared.linear.rb.rb_left) { + parent = rb_entry(parent->shared.linear.rb.rb_left, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + } + link = &parent->shared.linear.rb.rb_left; + } + + vma->shared.linear.rb_subtree_last = last; + rb_link_node(&vma->shared.linear.rb, &parent->shared.linear.rb, link); + rb_insert_augmented(&vma->shared.linear.rb, &mapping->i_mmap, + &vma_interval_tree_augment_callbacks); +} diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a6e2141..c38a625 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -431,7 +431,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, { struct vm_area_struct *vma; struct task_struct *tsk; - struct prio_tree_iter iter; struct address_space *mapping = page->mapping; mutex_lock(&mapping->i_mmap_mutex); @@ -442,7 +441,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, if (!task_early_kill(tsk)) continue; - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* * Send early kill signal to tasks where a vma covers diff --git a/mm/memory.c b/mm/memory.c index e09c048..d205e43 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2801,14 +2801,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } -static inline void unmap_mapping_range_tree(struct prio_tree_root *root, +static inline void unmap_mapping_range_tree(struct rb_root *root, struct zap_details *details) { struct vm_area_struct *vma; - struct prio_tree_iter iter; pgoff_t vba, vea, zba, zea; - vma_prio_tree_foreach(vma, &iter, root, + vma_interval_tree_foreach(vma, root, details->first_index, details->last_index) { vba = vma->vm_pgoff; @@ -2839,7 +2838,7 @@ static inline void unmap_mapping_range_list(struct list_head *head, * across *all* the pages in each nonlinear VMA, not just the pages * whose virtual address lies outside the file truncation point. */ - list_for_each_entry(vma, head, shared.vm_set.list) { + list_for_each_entry(vma, head, shared.nonlinear) { details->nonlinear_vma = vma; unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); } @@ -2883,7 +2882,7 @@ void unmap_mapping_range(struct address_space *mapping, mutex_lock(&mapping->i_mmap_mutex); - if (unlikely(!prio_tree_empty(&mapping->i_mmap))) + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); diff --git a/mm/mmap.c b/mm/mmap.c index e3c365f..5ac533f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -199,14 +199,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.vm_set.list); + list_del_init(&vma->shared.nonlinear); else - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* - * Unlink a file-based vm structure from its prio_tree, to hide + * Unlink a file-based vm structure from its interval tree, to hide * vma from rmap and vmtruncate before freeing its page tables. */ void unlink_file_vma(struct vm_area_struct *vma) @@ -411,7 +411,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (unlikely(vma->vm_flags & VM_NONLINEAR)) vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); else - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } } @@ -449,7 +449,7 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, /* * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and rbtree. It has already been inserted into the prio_tree. + * mm's list and rbtree. It has already been inserted into the interval tree. */ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { @@ -491,7 +491,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *next = vma->vm_next; struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; - struct prio_tree_root *root = NULL; + struct rb_root *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; long adjust_next = 0; @@ -554,7 +554,7 @@ again: remove_next = 1 + (end > next->vm_end); mutex_lock(&mapping->i_mmap_mutex); if (insert) { /* - * Put into prio_tree now, so instantiated pages + * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. @@ -582,9 +582,9 @@ again: remove_next = 1 + (end > next->vm_end); if (root) { flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, root); + vma_interval_tree_remove(vma, root); if (adjust_next) - vma_prio_tree_remove(next, root); + vma_interval_tree_remove(next, root); } vma->vm_start = start; @@ -597,8 +597,8 @@ again: remove_next = 1 + (end > next->vm_end); if (root) { if (adjust_next) - vma_prio_tree_insert(next, root); - vma_prio_tree_insert(vma, root); + vma_interval_tree_insert(next, root); + vma_interval_tree_insert(vma, root); flush_dcache_mmap_unlock(mapping); } diff --git a/mm/nommu.c b/mm/nommu.c index 12e84e6..45131b4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -2044,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, size_t newsize) { struct vm_area_struct *vma; - struct prio_tree_iter iter; struct vm_region *region; pgoff_t low, high; size_t r_size, r_top; @@ -2056,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, mutex_lock(&inode->i_mapping->i_mmap_mutex); /* search for VMAs that fall within the dead zone */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - low, high) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { /* found one - only interested if it's shared out of the page * cache */ if (vma->vm_flags & VM_SHARED) { @@ -2073,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, * we don't check for any regions that start beyond the EOF as there * shouldn't be any */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - 0, ULONG_MAX) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, + 0, ULONG_MAX) { if (!(vma->vm_flags & VM_SHARED)) continue; diff --git a/mm/prio_tree.c b/mm/prio_tree.c deleted file mode 100644 index 799dcfd..0000000 --- a/mm/prio_tree.c +++ /dev/null @@ -1,208 +0,0 @@ -/* - * mm/prio_tree.c - priority search tree for mapping->i_mmap - * - * Copyright (C) 2004, Rajesh Venkatasubramanian - * - * This file is released under the GPL v2. - * - * Based on the radix priority search tree proposed by Edward M. McCreight - * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 - * - * 02Feb2004 Initial version - */ - -#include -#include -#include - -/* - * See lib/prio_tree.c for details on the general radix priority search tree - * code. - */ - -/* - * The following #defines are mirrored from lib/prio_tree.c. They're only used - * for debugging, and should be removed (along with the debugging code using - * them) when switching also VMAs to the regular prio_tree code. - */ - -#define RADIX_INDEX(vma) ((vma)->vm_pgoff) -#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) -/* avoid overflow */ -#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) - -/* - * Radix priority search tree for address_space->i_mmap - * - * For each vma that map a unique set of file pages i.e., unique [radix_index, - * heap_index] value, we have a corresponding priority search tree node. If - * multiple vmas have identical [radix_index, heap_index] value, then one of - * them is used as a tree node and others are stored in a vm_set list. The tree - * node points to the first vma (head) of the list using vm_set.head. - * - * prio_tree_root - * | - * A vm_set.head - * / \ / - * L R -> H-I-J-K-M-N-O-P-Q-S - * ^ ^ <-- vm_set.list --> - * tree nodes - * - * We need some way to identify whether a vma is a tree node, head of a vm_set - * list, or just a member of a vm_set list. We cannot use vm_flags to store - * such information. The reason is, in the above figure, it is possible that - * vm_flags' of R and H are covered by the different mmap_sems. When R is - * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold - * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. - * That's why some trick involving shared.vm_set.parent is used for identifying - * tree nodes and list head nodes. - * - * vma radix priority search tree node rules: - * - * vma->shared.vm_set.parent != NULL ==> a tree node - * vma->shared.vm_set.head != NULL ==> list of others mapping same range - * vma->shared.vm_set.head == NULL ==> no others map the same range - * - * vma->shared.vm_set.parent == NULL - * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range - * vma->shared.vm_set.head == NULL ==> a list node - */ - -/* - * Add a new vma known to map the same set of pages as the old vma: - * useful for fork's dup_mmap as well as vma_prio_tree_insert below. - * Note that it just happens to work correctly on i_mmap_nonlinear too. - */ -void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) -{ - /* Leave these BUG_ONs till prio_tree patch stabilizes */ - BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); - BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); - - vma->shared.vm_set.head = NULL; - vma->shared.vm_set.parent = NULL; - - if (!old->shared.vm_set.parent) - list_add(&vma->shared.vm_set.list, - &old->shared.vm_set.list); - else if (old->shared.vm_set.head) - list_add_tail(&vma->shared.vm_set.list, - &old->shared.vm_set.head->shared.vm_set.list); - else { - INIT_LIST_HEAD(&vma->shared.vm_set.list); - vma->shared.vm_set.head = old; - old->shared.vm_set.head = vma; - } -} - -void vma_prio_tree_insert(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *old; - - vma->shared.vm_set.head = NULL; - - ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node); - if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) { - old = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - vma_prio_tree_add(vma, old); - } -} - -void vma_prio_tree_remove(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct vm_area_struct *node, *head, *new_head; - - if (!vma->shared.vm_set.head) { - if (!vma->shared.vm_set.parent) - list_del_init(&vma->shared.vm_set.list); - else - raw_prio_tree_remove(root, &vma->shared.prio_tree_node); - } else { - /* Leave this BUG_ON till prio_tree patch stabilizes */ - BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); - if (vma->shared.vm_set.parent) { - head = vma->shared.vm_set.head; - if (!list_empty(&head->shared.vm_set.list)) { - new_head = list_entry( - head->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&head->shared.vm_set.list); - } else - new_head = NULL; - - raw_prio_tree_replace(root, &vma->shared.prio_tree_node, - &head->shared.prio_tree_node); - head->shared.vm_set.head = new_head; - if (new_head) - new_head->shared.vm_set.head = head; - - } else { - node = vma->shared.vm_set.head; - if (!list_empty(&vma->shared.vm_set.list)) { - new_head = list_entry( - vma->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&vma->shared.vm_set.list); - node->shared.vm_set.head = new_head; - new_head->shared.vm_set.head = node; - } else - node->shared.vm_set.head = NULL; - } - } -} - -/* - * Helper function to enumerate vmas that map a given file page or a set of - * contiguous file pages. The function returns vmas that at least map a single - * page in the given range of contiguous file pages. - */ -struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, - struct prio_tree_iter *iter) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *next; - - if (!vma) { - /* - * First call is with NULL vma - */ - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; - } - - if (vma->shared.vm_set.parent) { - if (vma->shared.vm_set.head) { - next = vma->shared.vm_set.head; - prefetch(next->shared.vm_set.list.next); - return next; - } - } else { - next = list_entry(vma->shared.vm_set.list.next, - struct vm_area_struct, shared.vm_set.list); - if (!next->shared.vm_set.head) { - prefetch(next->shared.vm_set.list.next); - return next; - } - } - - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; -} diff --git a/mm/rmap.c b/mm/rmap.c index 0f3b7cd..7b5b51d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -820,7 +820,6 @@ static int page_referenced_file(struct page *page, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int referenced = 0; /* @@ -846,7 +845,7 @@ static int page_referenced_file(struct page *page, */ mapcount = page_mapcount(page); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -945,13 +944,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = 0; BUG_ON(PageAnon(page)); mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1547,7 +1545,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; unsigned long cursor; unsigned long max_nl_cursor = 0; @@ -1555,7 +1552,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned int mapcount; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -1576,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) goto out; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) max_nl_cursor = cursor; @@ -1608,7 +1605,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { @@ -1631,7 +1628,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) * in locked vmas). Reset cursor on all unreserved nonlinear * vmas, now forgetting on which ones it had fallen behind. */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) vma->vm_private_data = NULL; out: mutex_unlock(&mapping->i_mmap_mutex); @@ -1748,13 +1745,12 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; if (!mapping) return ret; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; -- cgit v1.1 From 85d3a316c714197f94e75c1e5b2d37607d66e5de Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:27 -0700 Subject: kmemleak: use rbtree instead of prio tree kmemleak uses a tree where each node represents an allocated memory object in order to quickly find out what object a given address is part of. However, the objects don't overlap, so rbtrees are a better choice than prio tree for this use. They are both faster and have lower memory overhead. Tested by booting a kernel with kmemleak enabled, loading the kmemleak_test module, and looking for the expected messages. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: David Woodhouse Acked-by: Catalin Marinas Tested-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 100 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 51 insertions(+), 49 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 0de83b4..a217cc5 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -29,7 +29,7 @@ * - kmemleak_lock (rwlock): protects the object_list modifications and * accesses to the object_tree_root. The object_list is the main list * holding the metadata (struct kmemleak_object) for the allocated memory - * blocks. The object_tree_root is a priority search tree used to look-up + * blocks. The object_tree_root is a red black tree used to look-up * metadata based on a pointer to the corresponding memory block. The * kmemleak_object structures are added to the object_list and * object_tree_root in the create_object() function called from the @@ -71,7 +71,7 @@ #include #include #include -#include +#include #include #include #include @@ -132,7 +132,7 @@ struct kmemleak_scan_area { * Structure holding the metadata for each allocated memory block. * Modifications to such objects should be made while holding the * object->lock. Insertions or deletions from object_list, gray_list or - * tree_node are already protected by the corresponding locks or mutex (see + * rb_node are already protected by the corresponding locks or mutex (see * the notes on locking above). These objects are reference-counted * (use_count) and freed using the RCU mechanism. */ @@ -141,7 +141,7 @@ struct kmemleak_object { unsigned long flags; /* object status flags */ struct list_head object_list; struct list_head gray_list; - struct prio_tree_node tree_node; + struct rb_node rb_node; struct rcu_head rcu; /* object_list lockless traversal */ /* object usage count; object freed when use_count == 0 */ atomic_t use_count; @@ -182,9 +182,9 @@ struct kmemleak_object { static LIST_HEAD(object_list); /* the list of gray-colored objects (see color_gray comment below) */ static LIST_HEAD(gray_list); -/* prio search tree for object boundaries */ -static struct prio_tree_root object_tree_root; -/* rw_lock protecting the access to object_list and prio_tree_root */ +/* search tree for object boundaries */ +static struct rb_root object_tree_root = RB_ROOT; +/* rw_lock protecting the access to object_list and object_tree_root */ static DEFINE_RWLOCK(kmemleak_lock); /* allocation caches for kmemleak internal data */ @@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object) trace.entries = object->trace; pr_notice("Object 0x%08lx (size %zu):\n", - object->tree_node.start, object->size); + object->pointer, object->size); pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); @@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object) } /* - * Look-up a memory block metadata (kmemleak_object) in the priority search + * Look-up a memory block metadata (kmemleak_object) in the object search * tree based on a pointer value. If alias is 0, only values pointing to the * beginning of the memory block are allowed. The kmemleak_lock must be held * when calling this function. */ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) { - struct prio_tree_node *node; - struct prio_tree_iter iter; - struct kmemleak_object *object; - - prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); - node = prio_tree_next(&iter); - if (node) { - object = prio_tree_entry(node, struct kmemleak_object, - tree_node); - if (!alias && object->pointer != ptr) { + struct rb_node *rb = object_tree_root.rb_node; + + while (rb) { + struct kmemleak_object *object = + rb_entry(rb, struct kmemleak_object, rb_node); + if (ptr < object->pointer) + rb = object->rb_node.rb_left; + else if (object->pointer + object->size <= ptr) + rb = object->rb_node.rb_right; + else if (object->pointer == ptr || alias) + return object; + else { kmemleak_warn("Found object by alias at 0x%08lx\n", ptr); dump_object_info(object); - object = NULL; + break; } - } else - object = NULL; - - return object; + } + return NULL; } /* @@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object) } /* - * Look up an object in the prio search tree and increase its use_count. + * Look up an object in the object search tree and increase its use_count. */ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) { @@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, int min_count, gfp_t gfp) { unsigned long flags; - struct kmemleak_object *object; - struct prio_tree_node *node; + struct kmemleak_object *object, *parent; + struct rb_node **link, *rb_parent; object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { @@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, /* kernel backtrace */ object->trace_len = __save_stack_trace(object->trace); - INIT_PRIO_TREE_NODE(&object->tree_node); - object->tree_node.start = ptr; - object->tree_node.last = ptr + size - 1; - write_lock_irqsave(&kmemleak_lock, flags); min_addr = min(min_addr, ptr); max_addr = max(max_addr, ptr + size); - node = prio_tree_insert(&object_tree_root, &object->tree_node); - /* - * The code calling the kernel does not yet have the pointer to the - * memory block to be able to free it. However, we still hold the - * kmemleak_lock here in case parts of the kernel started freeing - * random memory blocks. - */ - if (node != &object->tree_node) { - kmemleak_stop("Cannot insert 0x%lx into the object search tree " - "(already existing)\n", ptr); - object = lookup_object(ptr, 1); - spin_lock(&object->lock); - dump_object_info(object); - spin_unlock(&object->lock); - - goto out; + link = &object_tree_root.rb_node; + rb_parent = NULL; + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, struct kmemleak_object, rb_node); + if (ptr + size <= parent->pointer) + link = &parent->rb_node.rb_left; + else if (parent->pointer + parent->size <= ptr) + link = &parent->rb_node.rb_right; + else { + kmemleak_stop("Cannot insert 0x%lx into the object " + "search tree (overlaps existing)\n", + ptr); + kmem_cache_free(object_cache, object); + object = parent; + spin_lock(&object->lock); + dump_object_info(object); + spin_unlock(&object->lock); + goto out; + } } + rb_link_node(&object->rb_node, rb_parent, link); + rb_insert_color(&object->rb_node, &object_tree_root); + list_add_tail_rcu(&object->object_list, &object_list); out: write_unlock_irqrestore(&kmemleak_lock, flags); @@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object) unsigned long flags; write_lock_irqsave(&kmemleak_lock, flags); - prio_tree_remove(&object_tree_root, &object->tree_node); + rb_erase(&object->rb_node, &object_tree_root); list_del_rcu(&object->object_list); write_unlock_irqrestore(&kmemleak_lock, flags); @@ -1766,7 +1769,6 @@ void __init kmemleak_init(void) object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); - INIT_PRIO_TREE_ROOT(&object_tree_root); if (crt_early_log >= ARRAY_SIZE(early_log)) pr_warning("Early log buffer exceeded (%d), please increase " -- cgit v1.1 From 9826a516ff77c5820e591211e4f3e58ff36f46be Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:35 -0700 Subject: mm: interval tree updates Update the generic interval tree code that was introduced in "mm: replace vma prio_tree with an interval tree". Changes: - fixed 'endpoing' typo noticed by Andrew Morton - replaced include/linux/interval_tree_tmpl.h, which was used as a template (including it automatically defined the interval tree functions) with include/linux/interval_tree_generic.h, which only defines a preprocessor macro INTERVAL_TREE_DEFINE(), which itself defines the interval tree functions when invoked. Now that is a very long macro which is unfortunate, but it does make the usage sites (lib/interval_tree.c and mm/interval_tree.c) a bit nicer than previously. - make use of RB_DECLARE_CALLBACKS() in the INTERVAL_TREE_DEFINE() macro, instead of duplicating that code in the interval tree template. - replaced vma_interval_tree_add(), which was actually handling the nonlinear and interval tree cases, with vma_interval_tree_insert_after() which handles only the interval tree case and has an API that is more consistent with the other interval tree handling functions. The nonlinear case is now handled explicitly in kernel/fork.c dup_mmap(). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/interval_tree.c | 60 ++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 7dc56566..4ab7b9e 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -8,40 +8,38 @@ #include #include +#include -#define ITSTRUCT struct vm_area_struct -#define ITRB shared.linear.rb -#define ITTYPE unsigned long -#define ITSUBTREE shared.linear.rb_subtree_last -#define ITSTART(n) ((n)->vm_pgoff) -#define ITLAST(n) ((n)->vm_pgoff + \ - (((n)->vm_end - (n)->vm_start) >> PAGE_SHIFT) - 1) -#define ITSTATIC -#define ITPREFIX vma_interval_tree - -#include - -/* Insert old immediately after vma in the interval tree */ -void vma_interval_tree_add(struct vm_area_struct *vma, - struct vm_area_struct *old, - struct address_space *mapping) +static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff; +} + +static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; +} + +INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, + unsigned long, shared.linear.rb_subtree_last, + vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) + +/* Insert node immediately after prev in the interval tree */ +void vma_interval_tree_insert_after(struct vm_area_struct *node, + struct vm_area_struct *prev, + struct rb_root *root) { struct rb_node **link; struct vm_area_struct *parent; - unsigned long last; - - if (unlikely(vma->vm_flags & VM_NONLINEAR)) { - list_add(&vma->shared.nonlinear, &old->shared.nonlinear); - return; - } + unsigned long last = vma_last_pgoff(node); - last = ITLAST(vma); + VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); - if (!old->shared.linear.rb.rb_right) { - parent = old; - link = &old->shared.linear.rb.rb_right; + if (!prev->shared.linear.rb.rb_right) { + parent = prev; + link = &prev->shared.linear.rb.rb_right; } else { - parent = rb_entry(old->shared.linear.rb.rb_right, + parent = rb_entry(prev->shared.linear.rb.rb_right, struct vm_area_struct, shared.linear.rb); if (parent->shared.linear.rb_subtree_last < last) parent->shared.linear.rb_subtree_last = last; @@ -54,8 +52,8 @@ void vma_interval_tree_add(struct vm_area_struct *vma, link = &parent->shared.linear.rb.rb_left; } - vma->shared.linear.rb_subtree_last = last; - rb_link_node(&vma->shared.linear.rb, &parent->shared.linear.rb, link); - rb_insert_augmented(&vma->shared.linear.rb, &mapping->i_mmap, - &vma_interval_tree_augment_callbacks); + node->shared.linear.rb_subtree_last = last; + rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); + rb_insert_augmented(&node->shared.linear.rb, root, + &vma_interval_tree_augment); } -- cgit v1.1 From 108d6642ad81bb1d62b401490a334d2c12397517 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:36 -0700 Subject: mm anon rmap: remove anon_vma_moveto_tail mremap() had a clever optimization where move_ptes() did not take the anon_vma lock to avoid a race with anon rmap users such as page migration. Instead, the avc's were ordered in such a way that the origin vma was always visited by rmap before the destination. This ordering and the use of page table locks rmap usage safe. However, we want to replace the use of linked lists in anon rmap with an interval tree, and this will make it harder to impose such ordering as the interval tree will always be sorted by the avc->vma->vm_pgoff value. For now, let's replace the anon_vma_moveto_tail() ordering function with proper anon_vma locking in move_ptes(). Once we have the anon interval tree in place, we will re-introduce an optimization to avoid taking these locks in the most common cases. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 3 +-- mm/mremap.c | 14 +++++--------- mm/rmap.c | 45 --------------------------------------------- 3 files changed, 6 insertions(+), 56 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 5ac533f..66984aa 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2378,8 +2378,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, */ VM_BUG_ON(faulted_in_anon_vma); *vmap = new_vma; - } else - anon_vma_moveto_tail(new_vma); + } } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { diff --git a/mm/mremap.c b/mm/mremap.c index cc06d0e..5588bb6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -74,6 +74,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long new_addr) { struct address_space *mapping = NULL; + struct anon_vma *anon_vma = vma->anon_vma; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; @@ -88,6 +89,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, mapping = vma->vm_file->f_mapping; mutex_lock(&mapping->i_mmap_mutex); } + if (anon_vma) + anon_vma_lock(anon_vma); /* * We don't have to worry about the ordering of src and dst @@ -114,6 +117,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, spin_unlock(new_ptl); pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); + if (anon_vma) + anon_vma_unlock(anon_vma); if (mapping) mutex_unlock(&mapping->i_mmap_mutex); } @@ -221,15 +226,6 @@ static unsigned long move_vma(struct vm_area_struct *vma, moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); if (moved_len < old_len) { /* - * Before moving the page tables from the new vma to - * the old vma, we need to be sure the old vma is - * queued after new vma in the same_anon_vma list to - * prevent SMP races with rmap_walk (that could lead - * rmap_walk to miss some page table). - */ - anon_vma_moveto_tail(vma); - - /* * On error, move entries back from new area to old, * which will succeed since page tables still there, * and then proceed to unmap new area instead of old. diff --git a/mm/rmap.c b/mm/rmap.c index 7b5b51d..8cbd62f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -269,51 +269,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) } /* - * Some rmap walk that needs to find all ptes/hugepmds without false - * negatives (like migrate and split_huge_page) running concurrent - * with operations that copy or move pagetables (like mremap() and - * fork()) to be safe. They depend on the anon_vma "same_anon_vma" - * list to be in a certain order: the dst_vma must be placed after the - * src_vma in the list. This is always guaranteed by fork() but - * mremap() needs to call this function to enforce it in case the - * dst_vma isn't newly allocated and chained with the anon_vma_clone() - * function but just an extension of a pre-existing vma through - * vma_merge. - * - * NOTE: the same_anon_vma list can still be changed by other - * processes while mremap runs because mremap doesn't hold the - * anon_vma mutex to prevent modifications to the list while it - * runs. All we need to enforce is that the relative order of this - * process vmas isn't changing (we don't care about other vmas - * order). Each vma corresponds to an anon_vma_chain structure so - * there's no risk that other processes calling anon_vma_moveto_tail() - * and changing the same_anon_vma list under mremap() will screw with - * the relative order of this process vmas in the list, because we - * they can't alter the order of any vma that belongs to this - * process. And there can't be another anon_vma_moveto_tail() running - * concurrently with mremap() coming from this process because we hold - * the mmap_sem for the whole mremap(). fork() ordering dependency - * also shouldn't be affected because fork() only cares that the - * parent vmas are placed in the list before the child vmas and - * anon_vma_moveto_tail() won't reorder vmas from either the fork() - * parent or child. - */ -void anon_vma_moveto_tail(struct vm_area_struct *dst) -{ - struct anon_vma_chain *pavc; - struct anon_vma *root = NULL; - - list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { - struct anon_vma *anon_vma = pavc->anon_vma; - VM_BUG_ON(pavc->vma != dst); - root = lock_anon_vma_root(root, anon_vma); - list_del(&pavc->same_anon_vma); - list_add_tail(&pavc->same_anon_vma, &anon_vma->head); - } - unlock_anon_vma_root(root); -} - -/* * Attach vma to its own anon_vma, as well as to the anon_vmas that * the corresponding VMA in the parent process is attached to. * Returns 0 on success, non-zero on failure. -- cgit v1.1 From bf181b9f9d8dfbba58b23441ad60d0bc33806d64 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:39 -0700 Subject: mm anon rmap: replace same_anon_vma linked list with an interval tree. When a large VMA (anon or private file mapping) is first touched, which will populate its anon_vma field, and then split into many regions through the use of mprotect(), the original anon_vma ends up linking all of the vmas on a linked list. This can cause rmap to become inefficient, as we have to walk potentially thousands of irrelevent vmas before finding the one a given anon page might fall into. By replacing the same_anon_vma linked list with an interval tree (where each avc's interval is determined by its vma's start and last pgoffs), we can make rmap efficient for this use case again. While the change is large, all of its pieces are fairly simple. Most places that were walking the same_anon_vma list were looking for a known pgoff, so they can just use the anon_vma_interval_tree_foreach() interval tree iterator instead. The exception here is ksm, where the page's index is not known. It would probably be possible to rework ksm so that the index would be known, but for now I have decided to keep things simple and just walk the entirety of the interval tree there. When updating vma's that already have an anon_vma assigned, we must take care to re-index the corresponding avc's on their interval tree. This is done through the use of anon_vma_interval_tree_pre_update_vma() and anon_vma_interval_tree_post_update_vma(), which remove the avc's from their interval tree before the update and re-insert them after the update. The anon_vma stays locked during the update, so there is no chance that rmap would miss the vmas that are being updated. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 ++-- mm/interval_tree.c | 14 ++++++++++ mm/ksm.c | 9 ++++--- mm/memory-failure.c | 5 +++- mm/mmap.c | 73 ++++++++++++++++++++++++++++++++++++++++------------- mm/rmap.c | 24 +++++++++--------- 6 files changed, 94 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 010d329..ce59ada 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1375,13 +1375,14 @@ static void __split_huge_page(struct page *page, struct anon_vma *anon_vma) { int mapcount, mapcount2; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct anon_vma_chain *avc; BUG_ON(!PageHead(page)); BUG_ON(PageTail(page)); mapcount = 0; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); @@ -1407,7 +1408,7 @@ static void __split_huge_page(struct page *page, __split_huge_page_refcount(page); mapcount2 = 0; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 4ab7b9e..f7c72cd 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -8,6 +8,7 @@ #include #include +#include #include static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) @@ -57,3 +58,16 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, rb_insert_augmented(&node->shared.linear.rb, root, &vma_interval_tree_augment); } + +static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc) +{ + return vma_start_pgoff(avc->vma); +} + +static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) +{ + return vma_last_pgoff(avc->vma); +} + +INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, + avc_start_pgoff, avc_last_pgoff,, anon_vma_interval_tree) diff --git a/mm/ksm.c b/mm/ksm.c index 9638620..14ee5cf 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1618,7 +1618,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) @@ -1671,7 +1672,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) @@ -1723,7 +1725,8 @@ again: struct vm_area_struct *vma; anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c38a625..6c5899b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct vm_area_struct *vma; struct task_struct *tsk; struct anon_vma *av; + pgoff_t pgoff; av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ return; + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; if (!task_early_kill(tsk)) continue; - list_for_each_entry(vmac, &av->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &av->rb_root, + pgoff, pgoff) { vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; diff --git a/mm/mmap.c b/mm/mmap.c index 66984aa..2e580ed 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -353,6 +353,38 @@ void validate_mm(struct mm_struct *mm) #define validate_mm(mm) do { } while (0) #endif +/* + * vma has some anon_vma assigned, and is already inserted on that + * anon_vma's interval trees. + * + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the + * vma must be removed from the anon_vma's interval trees using + * anon_vma_interval_tree_pre_update_vma(). + * + * After the update, the vma will be reinserted using + * anon_vma_interval_tree_post_update_vma(). + * + * The entire update must be protected by exclusive mmap_sem and by + * the root anon_vma's mutex. + */ +static inline void +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); +} + +static inline void +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); +} + static int find_vma_links(struct mm_struct *mm, unsigned long addr, unsigned long end, struct vm_area_struct **pprev, struct rb_node ***rb_link, struct rb_node **rb_parent) @@ -565,20 +597,17 @@ again: remove_next = 1 + (end > next->vm_end); vma_adjust_trans_huge(vma, start, end, adjust_next); - /* - * When changing only vma->vm_end, we don't really need anon_vma - * lock. This is a fairly rare case by itself, but the anon_vma - * lock may be shared between many sibling processes. Skipping - * the lock for brk adjustments makes a difference sometimes. - */ - if (vma->anon_vma && (importer || start != vma->vm_start)) { - anon_vma = vma->anon_vma; + anon_vma = vma->anon_vma; + if (!anon_vma && adjust_next) + anon_vma = next->anon_vma; + if (anon_vma) { VM_BUG_ON(adjust_next && next->anon_vma && anon_vma != next->anon_vma); - } else if (adjust_next && next->anon_vma) - anon_vma = next->anon_vma; - if (anon_vma) anon_vma_lock(anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + if (adjust_next) + anon_vma_interval_tree_pre_update_vma(next); + } if (root) { flush_dcache_mmap_lock(mapping); @@ -619,8 +648,12 @@ again: remove_next = 1 + (end > next->vm_end); __insert_vm_struct(mm, insert); } - if (anon_vma) + if (anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + if (adjust_next) + anon_vma_interval_tree_post_update_vma(next); anon_vma_unlock(anon_vma); + } if (mapping) mutex_unlock(&mapping->i_mmap_mutex); @@ -1748,7 +1781,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { + anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; + anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } @@ -1798,8 +1833,10 @@ int expand_downwards(struct vm_area_struct *vma, if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { + anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; + anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } @@ -2515,7 +2552,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { - if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. @@ -2531,7 +2568,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * anon_vma->root->mutex. */ if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->root->head.next)) + &anon_vma->root->rb_root.rb_node)) BUG(); } } @@ -2572,7 +2609,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * A single task can't take more than one mm_take_all_locks() in a row * or it would deadlock. * - * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in + * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in * mapping->flags avoid to take the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * @@ -2619,13 +2656,13 @@ out_unlock: static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { - if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. * * We must however clear the bitflag before unlocking - * the vma so the users using the anon_vma->head will + * the vma so the users using the anon_vma->rb_root will * never see our bitflag. * * No need of atomic instructions here, head.next @@ -2633,7 +2670,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * anon_vma->root->mutex. */ if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->root->head.next)) + &anon_vma->root->rb_root.rb_node)) BUG(); anon_vma_unlock(anon_vma); } diff --git a/mm/rmap.c b/mm/rmap.c index 8cbd62f..9c61bf3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -127,12 +127,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, avc->vma = vma; avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); - - /* - * It's critical to add new vmas to the tail of the anon_vma, - * see comment in huge_memory.c:__split_huge_page(). - */ - list_add_tail(&avc->same_anon_vma, &anon_vma->head); + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); } /** @@ -336,13 +331,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma) struct anon_vma *anon_vma = avc->anon_vma; root = lock_anon_vma_root(root, anon_vma); - list_del(&avc->same_anon_vma); + anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); /* * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ - if (list_empty(&anon_vma->head)) + if (RB_EMPTY_ROOT(&anon_vma->rb_root)) continue; list_del(&avc->same_vma); @@ -371,7 +366,7 @@ static void anon_vma_ctor(void *data) mutex_init(&anon_vma->mutex); atomic_set(&anon_vma->refcount, 0); - INIT_LIST_HEAD(&anon_vma->head); + anon_vma->rb_root = RB_ROOT; } void __init anon_vma_init(void) @@ -724,6 +719,7 @@ static int page_referenced_anon(struct page *page, { unsigned int mapcount; struct anon_vma *anon_vma; + pgoff_t pgoff; struct anon_vma_chain *avc; int referenced = 0; @@ -732,7 +728,8 @@ static int page_referenced_anon(struct page *page, return referenced; mapcount = page_mapcount(page); - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1445,6 +1442,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma) static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) { struct anon_vma *anon_vma; + pgoff_t pgoff; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1452,7 +1450,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) if (!anon_vma) return ret; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address; @@ -1668,6 +1667,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, struct vm_area_struct *, unsigned long, void *), void *arg) { struct anon_vma *anon_vma; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1681,7 +1681,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (!anon_vma) return ret; anon_vma_lock(anon_vma); - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); if (address == -EFAULT) -- cgit v1.1 From 86c2ad19956f84f2191e062fcb979367b6365871 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:42 -0700 Subject: mm rmap: remove vma_address check for address inside vma In file and anon rmap, we use interval trees to find potentially relevant vmas and then call vma_address() to find the virtual address the given page might be found at in these vmas. vma_address() used to include a check that the returned address falls within the limits of the vma, but this check isn't necessary now that we always use interval trees in rmap: the interval tree just doesn't return any vmas which this check would find to be irrelevant. As a result, we can replace the use of -EFAULT error code (which then needed to be checked in every call site) with a VM_BUG_ON(). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ---- mm/rmap.c | 48 +++++++++++++++++++++--------------------------- 2 files changed, 21 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ce59ada..7cf8b0e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1386,8 +1386,6 @@ static void __split_huge_page(struct page *page, struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); - if (addr == -EFAULT) - continue; mapcount += __split_huge_page_splitting(page, vma, addr); } /* @@ -1412,8 +1410,6 @@ static void __split_huge_page(struct page *page, struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); - if (addr == -EFAULT) - continue; mapcount2 += __split_huge_page_map(page, vma, addr); } if (mapcount != mapcount2) diff --git a/mm/rmap.c b/mm/rmap.c index 9c61bf3..2877741 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -510,22 +510,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) /* * At what user virtual address is page expected in @vma? - * Returns virtual address or -EFAULT if page's index/offset is not - * within the range mapped the @vma. */ -inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) +static inline unsigned long +__vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - unsigned long address; if (unlikely(is_vm_hugetlb_page(vma))) pgoff = page->index << huge_page_order(page_hstate(page)); - address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { - /* page should be within @vma mapping range */ - return -EFAULT; - } + + return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); +} + +inline unsigned long +vma_address(struct page *page, struct vm_area_struct *vma) +{ + unsigned long address = __vma_address(page, vma); + + /* page should be within @vma mapping range */ + VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + return address; } @@ -535,6 +539,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { + unsigned long address; if (PageAnon(page)) { struct anon_vma *page__anon_vma = page_anon_vma(page); /* @@ -550,7 +555,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return -EFAULT; } else return -EFAULT; - return vma_address(page, vma); + address = __vma_address(page, vma); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + return -EFAULT; + return address; } /* @@ -624,8 +632,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) pte_t *pte; spinlock_t *ptl; - address = vma_address(page, vma); - if (address == -EFAULT) /* out of vma range */ + address = __vma_address(page, vma); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) return 0; pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); if (!pte) /* the page is not in this mm */ @@ -732,8 +740,6 @@ static int page_referenced_anon(struct page *page, anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -799,8 +805,6 @@ static int page_referenced_file(struct page *page, vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -904,8 +908,6 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret += page_mkclean_one(page, vma, address); } } @@ -1468,8 +1470,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) continue; address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = try_to_unmap_one(page, vma, address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) break; @@ -1508,8 +1508,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = try_to_unmap_one(page, vma, address, flags); if (ret != SWAP_AGAIN || !page_mapped(page)) goto out; @@ -1684,8 +1682,6 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = rmap_one(page, vma, address, arg); if (ret != SWAP_AGAIN) break; @@ -1707,8 +1703,6 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, mutex_lock(&mapping->i_mmap_mutex); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; ret = rmap_one(page, vma, address, arg); if (ret != SWAP_AGAIN) break; -- cgit v1.1 From ed8ea8150182f8d715fceb3b175ef0a9ebacd872 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:45 -0700 Subject: mm: add CONFIG_DEBUG_VM_RB build option Add a CONFIG_DEBUG_VM_RB build option for the previously existing DEBUG_MM_RB code. Now that Andi Kleen modified it to avoid using recursive algorithms, we can expose it a bit more. Also extend this code to validate_mm() after stack expansion, and to check that the vma's start and last pgoffs have not changed since the nodes were inserted on the anon vma interval tree (as it is important that the nodes be reindexed after each such update). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/interval_tree.c | 41 ++++++++++++++++++++++++++++++++++++++++- mm/mmap.c | 19 +++++++++---------- 2 files changed, 49 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/interval_tree.c b/mm/interval_tree.c index f7c72cd..4a5822a 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -70,4 +70,43 @@ static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) } INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, - avc_start_pgoff, avc_last_pgoff,, anon_vma_interval_tree) + avc_start_pgoff, avc_last_pgoff, + static inline, __anon_vma_interval_tree) + +void anon_vma_interval_tree_insert(struct anon_vma_chain *node, + struct rb_root *root) +{ +#ifdef CONFIG_DEBUG_VM_RB + node->cached_vma_start = avc_start_pgoff(node); + node->cached_vma_last = avc_last_pgoff(node); +#endif + __anon_vma_interval_tree_insert(node, root); +} + +void anon_vma_interval_tree_remove(struct anon_vma_chain *node, + struct rb_root *root) +{ + __anon_vma_interval_tree_remove(node, root); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_first(struct rb_root *root, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_first(root, first, last); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_next(struct anon_vma_chain *node, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_next(node, first, last); +} + +#ifdef CONFIG_DEBUG_VM_RB +void anon_vma_interval_tree_verify(struct anon_vma_chain *node) +{ + WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node)); + WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node)); +} +#endif diff --git a/mm/mmap.c b/mm/mmap.c index 2e580ed..deb422c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); -/* - * WARNING: the debugging will use recursive algorithms so never enable this - * unless you know what you are doing. - */ -#undef DEBUG_MM_RB - /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: @@ -303,7 +297,7 @@ out: return retval; } -#ifdef DEBUG_MM_RB +#ifdef CONFIG_DEBUG_VM_RB static int browse_rb(struct rb_root *root) { int i = 0, j; @@ -337,9 +331,12 @@ void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; - struct vm_area_struct *tmp = mm->mmap; - while (tmp) { - tmp = tmp->vm_next; + struct vm_area_struct *vma = mm->mmap; + while (vma) { + struct anon_vma_chain *avc; + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + vma = vma->vm_next; i++; } if (i != mm->map_count) @@ -1790,6 +1787,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma); + validate_mm(vma->vm_mm); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1843,6 +1841,7 @@ int expand_downwards(struct vm_area_struct *vma, } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma); + validate_mm(vma->vm_mm); return error; } -- cgit v1.1 From 523d4e2008fd4a68b1a164e63e8c75b7b20f07e0 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:48 -0700 Subject: mm anon rmap: in mremap, set the new vma's position before anon_vma_clone() anon_vma_clone() expects new_vma->vm_{start,end,pgoff} to be correctly set so that the new vma can be indexed on the anon interval tree. copy_vma() was failing to do that, which broke mremap(). Signed-off-by: Michel Lespinasse Cc: Jiri Slaby Cc: Hugh Dickins Tested-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index deb422c..8124899 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2419,16 +2419,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { *new_vma = *vma; + new_vma->vm_start = addr; + new_vma->vm_end = addr + len; + new_vma->vm_pgoff = pgoff; pol = mpol_dup(vma_policy(vma)); if (IS_ERR(pol)) goto out_free_vma; + vma_set_policy(new_vma, pol); INIT_LIST_HEAD(&new_vma->anon_vma_chain); if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; - vma_set_policy(new_vma, pol); - new_vma->vm_start = addr; - new_vma->vm_end = addr + len; - new_vma->vm_pgoff = pgoff; if (new_vma->vm_file) get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) -- cgit v1.1 From 38a76013ad809beb0b52f60d365c960d035bd83c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:50 -0700 Subject: mm: avoid taking rmap locks in move_ptes() During mremap(), the destination VMA is generally placed after the original vma in rmap traversal order: in move_vma(), we always have new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >= vma->vm_pgoff unless vma_merge() merged the new vma with an adjacent one. When the destination VMA is placed after the original in rmap traversal order, we can avoid taking the rmap locks in move_ptes(). Essentially, this reintroduces the optimization that had been disabled in "mm anon rmap: remove anon_vma_moveto_tail". The difference is that we don't try to impose the rmap traversal order; instead we just rely on things being in the desired order in the common case and fall back to taking locks in the uncommon case. Also we skip the i_mmap_mutex in addition to the anon_vma lock: in both cases, the vmas are traversed in increasing vm_pgoff order with ties resolved in tree insertion order. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 7 +++++-- mm/mremap.c | 57 +++++++++++++++++++++++++++++++++++++++------------------ 2 files changed, 44 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 8124899..2d94235 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2371,7 +2371,8 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) * prior to moving page table entries, to effect an mremap move. */ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, - unsigned long addr, unsigned long len, pgoff_t pgoff) + unsigned long addr, unsigned long len, pgoff_t pgoff, + bool *need_rmap_locks) { struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; @@ -2413,8 +2414,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, * linear if there are no pages mapped yet. */ VM_BUG_ON(faulted_in_anon_vma); - *vmap = new_vma; + *vmap = vma = new_vma; } + *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { @@ -2434,6 +2436,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); + *need_rmap_locks = false; } } return new_vma; diff --git a/mm/mremap.c b/mm/mremap.c index 5588bb6..3b639a4 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -71,26 +71,42 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long old_addr, unsigned long old_end, struct vm_area_struct *new_vma, pmd_t *new_pmd, - unsigned long new_addr) + unsigned long new_addr, bool need_rmap_locks) { struct address_space *mapping = NULL; - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma *anon_vma = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; - if (vma->vm_file) { - /* - * Subtle point from Rajesh Venkatasubramanian: before - * moving file-based ptes, we must lock truncate_pagecache - * out, since it might clean the dst vma before the src vma, - * and we propagate stale pages into the dst afterward. - */ - mapping = vma->vm_file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + /* + * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma + * locks to ensure that rmap will always observe either the old or the + * new ptes. This is the easiest way to avoid races with + * truncate_pagecache(), page migration, etc... + * + * When need_rmap_locks is false, we use other ways to avoid + * such races: + * + * - During exec() shift_arg_pages(), we use a specially tagged vma + * which rmap call sites look for using is_vma_temporary_stack(). + * + * - During mremap(), new_vma is often known to be placed after vma + * in rmap traversal order. This ensures rmap will always observe + * either the old pte, or the new pte, or both (the page table locks + * serialize access to individual ptes, but only rmap traversal + * order guarantees that we won't miss both the old and new ptes). + */ + if (need_rmap_locks) { + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + mutex_lock(&mapping->i_mmap_mutex); + } + if (vma->anon_vma) { + anon_vma = vma->anon_vma; + anon_vma_lock(anon_vma); + } } - if (anon_vma) - anon_vma_lock(anon_vma); /* * We don't have to worry about the ordering of src and dst @@ -127,7 +143,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len) + unsigned long new_addr, unsigned long len, + bool need_rmap_locks) { unsigned long extent, next, old_end; pmd_t *old_pmd, *new_pmd; @@ -174,7 +191,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (extent > LATENCY_LIMIT) extent = LATENCY_LIMIT; move_ptes(vma, old_pmd, old_addr, old_addr + extent, - new_vma, new_pmd, new_addr); + new_vma, new_pmd, new_addr, need_rmap_locks); need_flush = true; } if (likely(need_flush)) @@ -198,6 +215,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long hiwater_vm; int split = 0; int err; + bool need_rmap_locks; /* * We'd prefer to avoid failure later on in do_munmap: @@ -219,18 +237,21 @@ static unsigned long move_vma(struct vm_area_struct *vma, return err; new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); - new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, + &need_rmap_locks); if (!new_vma) return -ENOMEM; - moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); + moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, + need_rmap_locks); if (moved_len < old_len) { /* * On error, move entries back from new area to old, * which will succeed since page tables still there, * and then proceed to unmap new area instead of old. */ - move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); + move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, + true); vma = new_vma; old_len = new_len; old_addr = new_addr; -- cgit v1.1 From 1e8537baacd59e96bbe5f8d3d32feafd11f509fe Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Mon, 8 Oct 2012 16:31:51 -0700 Subject: memory-hotplug: build zonelists when offlining pages online_pages() does build_all_zonelists() and zone_pcp_update(), I think offline_pages() should do it too. When the zone has no memory to allocate, remove it from other nodes' zonelists. zone_batchsize() depends on zone's present pages, if zone's present pages are changed, zone's pcp should be updated. Signed-off-by: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6a5b90d..b3501615 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -970,8 +970,13 @@ repeat: init_per_zone_wmark_min(); - if (!populated_zone(zone)) + if (!populated_zone(zone)) { zone_pcp_reset(zone); + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL, NULL); + mutex_unlock(&zonelists_mutex); + } else + zone_pcp_update(zone); if (!node_present_pages(node)) { node_clear_state(node, N_HIGH_MEMORY); -- cgit v1.1 From 70400303ce0c4ced3139499c676d5c79636b0c72 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 8 Oct 2012 16:31:52 -0700 Subject: mm: mmu_notifier: make the mmu_notifier srcu static The variable must be static especially given the variable name. s/RCU/SRCU/ over a few comments. Signed-off-by: Andrea Arcangeli Cc: Xiao Guangrong Cc: Sagi Grimberg Cc: Peter Zijlstra Cc: Haggai Eran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 947df83..c297142 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -20,7 +20,7 @@ #include /* global SRCU for all MMs */ -struct srcu_struct srcu; +static struct srcu_struct srcu; /* * This function can't run concurrently against mmu_notifier_register @@ -41,7 +41,7 @@ void __mmu_notifier_release(struct mm_struct *mm) int id; /* - * RCU here will block mmu_notifier_unregister until + * SRCU here will block mmu_notifier_unregister until * ->release returns. */ id = srcu_read_lock(&srcu); @@ -302,7 +302,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) if (!hlist_unhashed(&mn->hlist)) { /* - * RCU here will force exit_mmap to wait ->release to finish + * SRCU here will force exit_mmap to wait ->release to finish * before freeing the pages. */ int id; -- cgit v1.1 From 02c6de8d757cb32c0829a45d81c3dfcbcafd998b Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:31:55 -0700 Subject: mm: cma: discard clean pages during contiguous allocation instead of migration Drop clean cache pages instead of migration during alloc_contig_range() to minimise allocation latency by reducing the amount of migration that is necessary. It's useful for CMA because latency of migration is more important than evicting the background process's working set. In addition, as pages are reclaimed then fewer free pages for migration targets are required so it avoids memory reclaiming to get free pages, which is a contributory factor to increased latency. I measured elapsed time of __alloc_contig_migrate_range() which migrates 10M in 40M movable zone in QEMU machine. Before - 146ms, After - 7ms [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Mel Gorman Signed-off-by: Minchan Kim Reviewed-by: Mel Gorman Cc: Marek Szyprowski Acked-by: Michal Nazarewicz Cc: Rik van Riel Tested-by: Kyungmin Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 ++- mm/page_alloc.c | 2 ++ mm/vmscan.c | 43 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 41 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index bbd7b34..8312d4f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -356,5 +356,6 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); - +unsigned long reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *page_list); #endif /* __MM_INTERNAL_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cfd565db..cefd14e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5700,6 +5700,8 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) break; } + reclaim_clean_pages_from_list(cc.zone, &cc.migratepages); + ret = migrate_pages(&cc.migratepages, __alloc_contig_migrate_alloc, 0, false, MIGRATE_SYNC); diff --git a/mm/vmscan.c b/mm/vmscan.c index d16bf5a..1ee4b69 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page, static unsigned long shrink_page_list(struct list_head *page_list, struct zone *zone, struct scan_control *sc, + enum ttu_flags ttu_flags, unsigned long *ret_nr_dirty, - unsigned long *ret_nr_writeback) + unsigned long *ret_nr_writeback, + bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { - enum page_references references; struct address_space *mapping; struct page *page; int may_enter_fs; + enum page_references references = PAGEREF_RECLAIM_CLEAN; cond_resched(); @@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, wait_on_page_writeback(page); } - references = page_check_references(page, sc); + if (!force_reclaim) + references = page_check_references(page, sc); + switch (references) { case PAGEREF_ACTIVATE: goto activate_locked; @@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, TTU_UNMAP)) { + switch (try_to_unmap(page, ttu_flags)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -960,6 +964,33 @@ keep: return nr_reclaimed; } +unsigned long reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *page_list) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_unmap = 1, + }; + unsigned long ret, dummy1, dummy2; + struct page *page, *next; + LIST_HEAD(clean_pages); + + list_for_each_entry_safe(page, next, page_list, lru) { + if (page_is_file_cache(page) && !PageDirty(page)) { + ClearPageActive(page); + list_move(&page->lru, &clean_pages); + } + } + + ret = shrink_page_list(&clean_pages, zone, &sc, + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, true); + list_splice(&clean_pages, page_list); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + return ret; +} + /* * Attempt to remove the specified page from its LRU. Only take this page * if it is of the appropriate PageActive status. Pages which are being @@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, zone, sc, - &nr_dirty, &nr_writeback); + nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, + &nr_dirty, &nr_writeback, false); spin_lock_irq(&zone->lru_lock); -- cgit v1.1 From 770c8aaaf6f04a87e6765f24d497132de9152a46 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 8 Oct 2012 16:31:57 -0700 Subject: mm: fix tracing in free_pcppages_bulk() page->private gets re-used in __free_one_page() to store page order (so trace_mm_page_pcpu_drain() may print order instead of migratetype) thus migratetype value must be cached locally. Fixes regression introduced in commit a7016235a61d ("mm: fix migratetype bug which slowed swapping"). This caused incorrect data to be attached to the mm_page_pcpu_drain trace event. [akpm@linux-foundation.org: add comment] Cc: Marek Szyprowski Cc: Michal Nazarewicz Acked-by: Minchan Kim Acked-by: Mel Gorman Cc: Hugh Dickins Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cefd14e..3f18a14 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -668,12 +668,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free = to_free; do { + int mt; /* migratetype of the to-be-freed page */ + page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); + mt = page_private(page); /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, zone, 0, page_private(page)); - trace_mm_page_pcpu_drain(page, 0, page_private(page)); + __free_one_page(page, zone, 0, mt); + trace_mm_page_pcpu_drain(page, 0, mt); } while (--to_free && --batch_free && !list_empty(list)); } __mod_zone_page_state(zone, NR_FREE_PAGES, count); -- cgit v1.1 From 2139cbe627b8910ded55148f87ee10f7485408ed Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 8 Oct 2012 16:32:00 -0700 Subject: cma: fix counting of isolated pages Isolated free pages shouldn't be accounted to NR_FREE_PAGES counter. Fix it by properly decreasing/increasing NR_FREE_PAGES counter in set_migratetype_isolate()/unset_migratetype_isolate() and removing counter adjustment for isolated pages from free_one_page() and split_free_page(). Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 +++++++-- mm/page_isolation.c | 12 +++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3f18a14..d259cc2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -691,7 +691,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order, zone->pages_scanned = 0; __free_one_page(page, zone, order, migratetype); - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + if (unlikely(migratetype != MIGRATE_ISOLATE)) + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); spin_unlock(&zone->lock); } @@ -1392,6 +1393,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) unsigned int order; unsigned long watermark; struct zone *zone; + int mt; BUG_ON(!PageBuddy(page)); @@ -1407,7 +1409,10 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) list_del(&page->lru); zone->free_area[order].nr_free--; rmv_page_order(page); - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); + + mt = get_pageblock_migratetype(page); + if (unlikely(mt != MIGRATE_ISOLATE)) + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); if (alloc_order != order) expand(zone, page, alloc_order, order, diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 247d1f1..3ca1716 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -76,8 +76,12 @@ int set_migratetype_isolate(struct page *page) out: if (!ret) { + unsigned long nr_pages; + set_pageblock_isolate(page); - move_freepages_block(zone, page, MIGRATE_ISOLATE); + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); + + __mod_zone_page_state(zone, NR_FREE_PAGES, -nr_pages); } spin_unlock_irqrestore(&zone->lock, flags); @@ -89,12 +93,14 @@ out: void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; - unsigned long flags; + unsigned long flags, nr_pages; + zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; - move_freepages_block(zone, page, migratetype); + nr_pages = move_freepages_block(zone, page, migratetype); + __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); restore_pageblock_isolate(page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); -- cgit v1.1 From d1ce749a0db12202b711d1aba1d29e823034648d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 8 Oct 2012 16:32:02 -0700 Subject: cma: count free CMA pages Add NR_FREE_CMA_PAGES counter to be later used for checking watermark in __zone_watermark_ok(). For simplicity and to avoid #ifdef hell make this counter always available (not only when CONFIG_CMA=y). [akpm@linux-foundation.org: use conventional migratetype naming] Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 26 +++++++++++++++++++------- mm/page_isolation.c | 5 +++-- mm/vmstat.c | 1 + 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d259cc2..6969a8a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -558,7 +558,8 @@ static inline void __free_one_page(struct page *page, if (page_is_guard(buddy)) { clear_page_guard_flag(buddy); set_page_private(page, 0); - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + __mod_zone_freepage_state(zone, 1 << order, + migratetype); } else { list_del(&buddy->lru); zone->free_area[order].nr_free--; @@ -677,6 +678,8 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); + if (is_migrate_cma(mt)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); } while (--to_free && --batch_free && !list_empty(list)); } __mod_zone_page_state(zone, NR_FREE_PAGES, count); @@ -692,7 +695,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order, __free_one_page(page, zone, order, migratetype); if (unlikely(migratetype != MIGRATE_ISOLATE)) - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); } @@ -815,7 +818,8 @@ static inline void expand(struct zone *zone, struct page *page, set_page_guard_flag(&page[size]); set_page_private(&page[size], high); /* Guard pages are not available for any usage */ - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); + __mod_zone_freepage_state(zone, -(1 << high), + migratetype); continue; } #endif @@ -1141,6 +1145,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, } set_page_private(page, mt); list = &page->lru; + if (is_migrate_cma(mt)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, + -(1 << order)); } __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); @@ -1412,7 +1419,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) mt = get_pageblock_migratetype(page); if (unlikely(mt != MIGRATE_ISOLATE)) - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); + __mod_zone_freepage_state(zone, -(1UL << order), mt); if (alloc_order != order) expand(zone, page, alloc_order, order, @@ -1516,7 +1523,8 @@ again: spin_unlock(&zone->lock); if (!page) goto failed; - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); + __mod_zone_freepage_state(zone, -(1 << order), + get_pageblock_migratetype(page)); } __count_zone_vm_events(PGALLOC, zone, 1 << order); @@ -2890,7 +2898,8 @@ void show_free_areas(unsigned int filter) " unevictable:%lu" " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" - " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", + " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" + " free_cma:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_ISOLATED_ANON), @@ -2907,7 +2916,8 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FILE_MAPPED), global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), - global_page_state(NR_BOUNCE)); + global_page_state(NR_BOUNCE), + global_page_state(NR_FREE_CMA_PAGES)); for_each_populated_zone(zone) { int i; @@ -2939,6 +2949,7 @@ void show_free_areas(unsigned int filter) " pagetables:%lukB" " unstable:%lukB" " bounce:%lukB" + " free_cma:%lukB" " writeback_tmp:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -2968,6 +2979,7 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), + K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, (zone->all_unreclaimable ? "yes" : "no") diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 3ca1716..345643b 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -77,11 +77,12 @@ int set_migratetype_isolate(struct page *page) out: if (!ret) { unsigned long nr_pages; + int migratetype = get_pageblock_migratetype(page); set_pageblock_isolate(page); nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); - __mod_zone_page_state(zone, NR_FREE_PAGES, -nr_pages); + __mod_zone_freepage_state(zone, -nr_pages, migratetype); } spin_unlock_irqrestore(&zone->lock, flags); @@ -100,7 +101,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; nr_pages = move_freepages_block(zone, page, migratetype); - __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); + __mod_zone_freepage_state(zone, nr_pages, migratetype); restore_pageblock_isolate(page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); diff --git a/mm/vmstat.c b/mm/vmstat.c index b3e3b9d..acbd85c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -722,6 +722,7 @@ const char * const vmstat_text[] = { "numa_other", #endif "nr_anon_transparent_hugepages", + "nr_free_cma", "nr_dirty_threshold", "nr_dirty_background_threshold", -- cgit v1.1 From d95ea5d18e699515468368415c93ed49b1a3221b Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 8 Oct 2012 16:32:05 -0700 Subject: cma: fix watermark checking * Add ALLOC_CMA alloc flag and pass it to [__]zone_watermark_ok() (from Minchan Kim). * During watermark check decrease available free pages number by free CMA pages number if necessary (unmovable allocations cannot use pages from CMA areas). Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 8 +++++++- mm/internal.h | 14 ++++++++++++++ mm/page_alloc.c | 31 +++++++++++++++---------------- 3 files changed, 36 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 0fbc6b7..1f61bcb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -934,6 +934,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; + int alloc_flags = 0; /* Check if the GFP flags allow compaction */ if (!order || !may_enter_fs || !may_perform_io) @@ -941,6 +942,10 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, count_vm_event(COMPACTSTALL); +#ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { @@ -951,7 +956,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, + alloc_flags)) break; } diff --git a/mm/internal.h b/mm/internal.h index 8312d4f..96cda4c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -358,4 +358,18 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, extern void set_pageblock_order(void); unsigned long reclaim_clean_pages_from_list(struct zone *zone, struct list_head *page_list); +/* The ALLOC_WMARK bits are used as an index to zone->watermark */ +#define ALLOC_WMARK_MIN WMARK_MIN +#define ALLOC_WMARK_LOW WMARK_LOW +#define ALLOC_WMARK_HIGH WMARK_HIGH +#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ + +/* Mask to get the watermark bits */ +#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) + +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ +#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ + #endif /* __MM_INTERNAL_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6969a8a..f2c7cc6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1541,19 +1541,6 @@ failed: return NULL; } -/* The ALLOC_WMARK bits are used as an index to zone->watermark */ -#define ALLOC_WMARK_MIN WMARK_MIN -#define ALLOC_WMARK_LOW WMARK_LOW -#define ALLOC_WMARK_HIGH WMARK_HIGH -#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ - -/* Mask to get the watermark bits */ -#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) - -#define ALLOC_HARDER 0x10 /* try to alloc harder */ -#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ -#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ - #ifdef CONFIG_FAIL_PAGE_ALLOC static struct { @@ -1648,7 +1635,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, min -= min / 2; if (alloc_flags & ALLOC_HARDER) min -= min / 4; - +#ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ + if (!(alloc_flags & ALLOC_CMA)) + free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); +#endif if (free_pages <= min + lowmem_reserve) return false; for (o = 0; o < order; o++) { @@ -2362,7 +2353,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask) unlikely(test_thread_flag(TIF_MEMDIE)))) alloc_flags |= ALLOC_NO_WATERMARKS; } - +#ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif return alloc_flags; } @@ -2587,6 +2581,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; + int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; gfp_mask &= gfp_allowed_mask; @@ -2615,9 +2610,13 @@ retry_cpuset: if (!preferred_zone) goto out; +#ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, - zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, + zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); if (unlikely(!page)) page = __alloc_pages_slowpath(gfp_mask, order, -- cgit v1.1 From b12c4ad14ee0232ad47c2bef404b6d42a3578332 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:08 -0700 Subject: mm: page_alloc: use get_freepage_migratetype() instead of page_private() The page allocator uses set_page_private and page_private for handling migratetype when it frees page. Let's replace them with [set|get] _freepage_migratetype to make it more clear. Signed-off-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Mel Gorman Cc: Xishi Qiu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- mm/page_isolation.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f2c7cc6..6aa0a8e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -674,7 +674,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - mt = page_private(page); + mt = get_freepage_migratetype(page); /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); @@ -1143,7 +1143,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) mt = migratetype; } - set_page_private(page, mt); + set_freepage_migratetype(page, mt); list = &page->lru; if (is_migrate_cma(mt)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, @@ -1313,7 +1313,7 @@ void free_hot_cold_page(struct page *page, int cold) return; migratetype = get_pageblock_migratetype(page); - set_page_private(page, migratetype); + set_freepage_migratetype(page, migratetype); local_irq_save(flags); if (unlikely(wasMlocked)) free_page_mlock(page); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 345643b..9c03dca 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -203,7 +203,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) if (PageBuddy(page)) pfn += 1 << page_order(page); else if (page_count(page) == 0 && - page_private(page) == MIGRATE_ISOLATE) + get_freepage_migratetype(page) == MIGRATE_ISOLATE) pfn += 1; else break; -- cgit v1.1 From 95e3441248053fc06bbb1dbbd34409a84211619e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:11 -0700 Subject: mm: remain migratetype in freed page The page allocator caches the pageblock information in page->private while it is in the PCP freelists but this is overwritten with the order of the page when freed to the buddy allocator. This patch stores the migratetype of the page in the page->index field so that it is available at all times when the page remain in free_list. This patch adds a new call site in __free_pages_ok so it might be overhead a bit but it's for high order allocation. So I believe damage isn't hurt. Signed-off-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Mel Gorman Cc: Xishi Qiu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6aa0a8e..94fd283 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -729,6 +729,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int wasMlocked = __TestClearPageMlocked(page); + int migratetype; if (!free_pages_prepare(page, order)) return; @@ -737,8 +738,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) if (unlikely(wasMlocked)) free_page_mlock(page); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, order, - get_pageblock_migratetype(page)); + migratetype = get_pageblock_migratetype(page); + set_freepage_migratetype(page, migratetype); + free_one_page(page_zone(page), page, order, migratetype); local_irq_restore(flags); } @@ -959,6 +961,7 @@ static int move_freepages(struct zone *zone, order = page_order(page); list_move(&page->lru, &zone->free_area[order].free_list[migratetype]); + set_freepage_migratetype(page, migratetype); page += 1 << order; pages_moved += 1 << order; } -- cgit v1.1 From 41d575ad4a511b71a4a41c8313004212f5c229b1 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:14 -0700 Subject: memory-hotplug: bug fix race between isolation and allocation Like below, memory-hotplug makes race between page-isolation and page-allocation so it can hit BUG_ON in __offline_isolated_pages. CPU A CPU B start_isolate_page_range set_migratetype_isolate spin_lock_irqsave(zone->lock) free_hot_cold_page(Page A) /* without zone->lock */ migratetype = get_pageblock_migratetype(Page A); /* * Page could be moved into MIGRATE_MOVABLE * of per_cpu_pages */ list_add_tail(&page->lru, &pcp->lists[migratetype]); set_pageblock_isolate move_freepages_block drain_all_pages /* Page A could be in MIGRATE_MOVABLE of free_list. */ check_pages_isolated __test_page_isolated_in_pageblock /* * We can't catch freed page which * is free_list[MIGRATE_MOVABLE] */ if (PageBuddy(page A)) pfn += 1 << page_order(page A); /* So, Page A could be allocated */ __offline_isolated_pages /* * BUG_ON hit or offline page * which is used by someone */ BUG_ON(!PageBuddy(page A)); This patch checks page's migratetype in freelist in __test_page_isolated_in_pageblock. So now __test_page_isolated_in_pageblock can check the page caused by above race and can fail of memory offlining. Signed-off-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Mel Gorman Cc: Xishi Qiu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 9c03dca..6744235 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -200,8 +200,11 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) continue; } page = pfn_to_page(pfn); - if (PageBuddy(page)) + if (PageBuddy(page)) { + if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) + break; pfn += 1 << page_order(page); + } else if (page_count(page) == 0 && get_freepage_migratetype(page) == MIGRATE_ISOLATE) pfn += 1; -- cgit v1.1 From 435b405c06119d93333738172b8060b0ed12af41 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:16 -0700 Subject: memory-hotplug: fix pages missed by race rather than failing If race between allocation and isolation in memory-hotplug offline happens, some pages could be in MIGRATE_MOVABLE of free_list although the pageblock's migratetype of the page is MIGRATE_ISOLATE. The race could be detected by get_freepage_migratetype in __test_page_isolated_in_pageblock. If it is detected, now EBUSY gets bubbled all the way up and the hotplug operations fails. But better idea is instead of returning and failing memory-hotremove, move the free page to the correct list at the time it is detected. It could enhance memory-hotremove operation success ratio although the race is really rare. Suggested by Mel Gorman. [akpm@linux-foundation.org: small cleanup] Signed-off-by: Minchan Kim Cc: KAMEZAWA Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Mel Gorman Cc: Xishi Qiu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/page_isolation.c | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94fd283..82f0b2f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -925,7 +925,7 @@ static int fallbacks[MIGRATE_TYPES][4] = { * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ -static int move_freepages(struct zone *zone, +int move_freepages(struct zone *zone, struct page *start_page, struct page *end_page, int migratetype) { diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 6744235..5f34a90 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -201,8 +201,20 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) } page = pfn_to_page(pfn); if (PageBuddy(page)) { - if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) - break; + /* + * If race between isolatation and allocation happens, + * some free pages could be in MIGRATE_MOVABLE list + * although pageblock's migratation type of the page + * is MIGRATE_ISOLATE. Catch it and move the page into + * MIGRATE_ISOLATE list. + */ + if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) { + struct page *end_page; + + end_page = page + (1 << page_order(page)) - 1; + move_freepages(page_zone(page), page, end_page, + MIGRATE_ISOLATE); + } pfn += 1 << page_order(page); } else if (page_count(page) == 0 && -- cgit v1.1 From 45cac65b0fcd287ebb877b141d40ba9bbe8e5da7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 8 Oct 2012 16:32:19 -0700 Subject: readahead: fault retry breaks mmap file read random detection .fault now can retry. The retry can break state machine of .fault. In filemap_fault, if page is miss, ra->mmap_miss is increased. In the second try, since the page is in page cache now, ra->mmap_miss is decreased. And these are done in one fault, so we can't detect random mmap file access. Add a new flag to indicate .fault is tried once. In the second try, skip ra->mmap_miss decreasing. The filemap_fault state machine is ok with it. I only tested x86, didn't test other archs, but looks the change for other archs is obvious, but who knows :) Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index a9827b4..83efee7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Do we have something in the page cache already? */ page = find_get_page(mapping, offset); - if (likely(page)) { + if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { /* * We found the page, so try async readahead before * waiting for the lock. */ do_async_mmap_readahead(vma, ra, file, page, offset); - } else { + } else if (!page) { /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); -- cgit v1.1 From e9d24ad30fc5c4c601824fb39712350b053ca812 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Oct 2012 16:32:21 -0700 Subject: mm/memblock: use existing interface to set nid Use the existing interface function to set the NUMA node ID (NID) for the regions, either memory or reserved region. Signed-off-by: Wanpeng Li Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Gavin Shan Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 82aa349..d809f17 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -756,7 +756,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, return ret; for (i = start_rgn; i < end_rgn; i++) - type->regions[i].nid = nid; + memblock_set_region_node(&type->regions[i], nid); memblock_merge_regions(type); return 0; -- cgit v1.1 From f2d52fe51c8c0a18cf5fbe583bad51090d12c146 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Oct 2012 16:32:24 -0700 Subject: mm/memblock: cleanup early_node_map[] related comments Commit 0ee332c14518 ("memblock: Kill early_node_map[]") removed early_node_map[]. Clean up the comments to comply with that change. Signed-off-by: Wanpeng Li Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Gavin Shan Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 2 -- mm/page_alloc.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 4055730..bd82f6b 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -162,8 +162,6 @@ unsigned long __init free_all_bootmem(void) * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id * because in some case like Node0 doesn't have RAM installed * low ram will be on Node1 - * Use MAX_NUMNODES will make sure all ranges in early_node_map[] - * will be used instead of only Node0 related */ return free_low_memory_core_early(MAX_NUMNODES); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 82f0b2f..ca002b3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4931,7 +4931,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) zone_movable_pfn[i] << PAGE_SHIFT); } - /* Print out the early_node_map[] */ + /* Print out the early node map */ printk("Early memory node ranges\n"); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, -- cgit v1.1 From e64c5237cf6ff474cb2f3f832f48f2b441dd9979 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 8 Oct 2012 16:32:27 -0700 Subject: mm: compaction: abort compaction loop if lock is contended or run too long isolate_migratepages_range() might isolate no pages if for example when zone->lru_lock is contended and running asynchronous compaction. In this case, we should abort compaction, otherwise, compact_zone will run a useless loop and make zone->lru_lock is even contended. An additional check is added to ensure that cc.migratepages and cc.freepages get properly drained whan compaction is aborted. [minchan@kernel.org: Putback pages isolated for migration if aborting] [akpm@linux-foundation.org: compact_zone_order requires non-NULL arg contended] [akpm@linux-foundation.org: make compact_zone_order() require non-NULL arg `contended'] [minchan@kernel.org: Putback pages isolated for migration if aborting] Signed-off-by: Andrea Arcangeli Signed-off-by: Shaohua Li Signed-off-by: Mel Gorman Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 17 ++++++++++++----- mm/internal.h | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 1f61bcb..0649cc1 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -70,8 +70,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, /* async aborts if taking too long or contended */ if (!cc->sync) { - if (cc->contended) - *cc->contended = true; + cc->contended = true; return false; } @@ -688,7 +687,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* Perform the isolation */ low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); - if (!low_pfn) + if (!low_pfn || cc->contended) return ISOLATE_ABORT; cc->migrate_pfn = low_pfn; @@ -848,6 +847,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: ret = COMPACT_PARTIAL; + putback_lru_pages(&cc->migratepages); + cc->nr_migratepages = 0; goto out; case ISOLATE_NONE: continue; @@ -896,6 +897,7 @@ static unsigned long compact_zone_order(struct zone *zone, bool sync, bool *contended, struct page **page) { + unsigned long ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, @@ -903,13 +905,18 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .contended = contended, .page = page, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - return compact_zone(zone, &cc); + ret = compact_zone(zone, &cc); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + + *contended = cc.contended; + return ret; } int sysctl_extfrag_threshold = 500; diff --git a/mm/internal.h b/mm/internal.h index 96cda4c..97664be 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -130,7 +130,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - bool *contended; /* True if a lock was contended */ + bool contended; /* True if a lock was contended */ struct page **page; /* Page captured of requested size */ }; -- cgit v1.1 From 3cc668f4e30fbd97b3c0574d8cac7a83903c9bc7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:30 -0700 Subject: mm: compaction: move fatal signal check out of compact_checklock_irqsave Commit c67fe3752abe ("mm: compaction: Abort async compaction if locks are contended or taking too long") addressed a lock contention problem in compaction by introducing compact_checklock_irqsave() that effecively aborting async compaction in the event of compaction. To preserve existing behaviour it also moved a fatal_signal_pending() check into compact_checklock_irqsave() but that is very misleading. It "hides" the check within a locking function but has nothing to do with locking as such. It just happens to work in a desirable fashion. This patch moves the fatal_signal_pending() check to isolate_migratepages_range() where it belongs. Arguably the same check should also happen when isolating pages for freeing but it's overkill. Signed-off-by: Mel Gorman Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: Shaohua Li Cc: Minchan Kim Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 0649cc1..78075a2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -75,8 +75,6 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, } cond_resched(); - if (fatal_signal_pending(current)) - return false; } if (!locked) @@ -363,7 +361,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Check if it is ok to still hold the lock */ locked = compact_checklock_irqsave(&zone->lru_lock, &flags, locked, cc); - if (!locked) + if (!locked || fatal_signal_pending(current)) break; /* -- cgit v1.1 From 661c4cb9b829110cb68c18ea05a56be39f75a4d2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:31 -0700 Subject: mm: compaction: Update try_to_compact_pages()kerneldoc comment Parameters were added without documentation, tut tut. Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 78075a2..b16dd38 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -926,6 +926,8 @@ int sysctl_extfrag_threshold = 500; * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from * @sync: Whether migration is synchronous or not + * @contended: Return value that is true if compaction was aborted due to lock contention + * @page: Optionally capture a free page of the requested order during compaction * * This is the main entry point for direct page compaction. */ -- cgit v1.1 From 2a1402aa044b55c2d30ab0ed9405693ef06fb07c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:33 -0700 Subject: mm: compaction: acquire the zone->lru_lock as late as possible Richard Davies and Shaohua Li have both reported lock contention problems in compaction on the zone and LRU locks as well as significant amounts of time being spent in compaction. This series aims to reduce lock contention and scanning rates to reduce that CPU usage. Richard reported at https://lkml.org/lkml/2012/9/21/91 that this series made a big different to a problem he reported in August: http://marc.info/?l=kvm&m=134511507015614&w=2 Patch 1 defers acquiring the zone->lru_lock as long as possible. Patch 2 defers acquiring the zone->lock as lock as possible. Patch 3 reverts Rik's "skip-free" patches as the core concept gets reimplemented later and the remaining patches are easier to understand if this is reverted first. Patch 4 adds a pageblock-skip bit to the pageblock flags to cache what pageblocks should be skipped by the migrate and free scanners. This drastically reduces the amount of scanning compaction has to do. Patch 5 reimplements something similar to Rik's idea except it uses the pageblock-skip information to decide where the scanners should restart from and does not need to wrap around. I tested this on 3.6-rc6 + linux-next/akpm. Kernels tested were akpm-20120920 3.6-rc6 + linux-next/akpm as of Septeber 20th, 2012 lesslock Patches 1-6 revert Patches 1-7 cachefail Patches 1-8 skipuseless Patches 1-9 Stress high-order allocation tests looked ok. Success rates are more or less the same with the full series applied but there is an expectation that there is less opportunity to race with other allocation requests if there is less scanning. The time to complete the tests did not vary that much and are uninteresting as were the vmstat statistics so I will not present them here. Using ftrace I recorded how much scanning was done by compaction and got this 3.6.0-rc6 3.6.0-rc6 3.6.0-rc6 3.6.0-rc6 3.6.0-rc6 akpm-20120920 lockless revert-v2r2 cachefail skipuseless Total free scanned 360753976 515414028 565479007 17103281 18916589 Total free isolated 2852429 3597369 4048601 670493 727840 Total free efficiency 0.0079% 0.0070% 0.0072% 0.0392% 0.0385% Total migrate scanned 247728664 822729112 1004645830 17946827 14118903 Total migrate isolated 2555324 3245937 3437501 616359 658616 Total migrate efficiency 0.0103% 0.0039% 0.0034% 0.0343% 0.0466% The efficiency is worthless because of the nature of the test and the number of failures. The really interesting point as far as this patch series is concerned is the number of pages scanned. Note that reverting Rik's patches massively increases the number of pages scanned indicating that those patches really did make a difference to CPU usage. However, caching what pageblocks should be skipped has a much higher impact. With patches 1-8 applied, free page and migrate page scanning are both reduced by 95% in comparison to the akpm kernel. If the basic concept of Rik's patches are implemened on top then scanning then the free scanner barely changed but migrate scanning was further reduced. That said, tests on 3.6-rc5 indicated that the last patch had greater impact than what was measured here so it is a bit variable. One way or the other, this series has a large impact on the amount of scanning compaction does when there is a storm of THP allocations. This patch: Compaction's migrate scanner acquires the zone->lru_lock when scanning a range of pages looking for LRU pages to acquire. It does this even if there are no LRU pages in the range. If multiple processes are compacting then this can cause severe locking contention. To make matters worse commit b2eef8c0 ("mm: compaction: minimise the time IRQs are disabled while isolating pages for migration") releases the lru_lock every SWAP_CLUSTER_MAX pages that are scanned. This patch makes two changes to how the migrate scanner acquires the LRU lock. First, it only releases the LRU lock every SWAP_CLUSTER_MAX pages if the lock is contended. This reduces the number of times it unnecessarily disables and re-enables IRQs. The second is that it defers acquiring the LRU lock for as long as possible. If there are no LRU pages or the only LRU pages are transhuge then the LRU lock will not be acquired at all which reduces contention on zone->lru_lock. [minchan@kernel.org: augment comment] [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 65 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index b16dd38..832c4183 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,11 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } +static inline bool should_release_lock(spinlock_t *lock) +{ + return need_resched() || spin_is_contended(lock); +} + /* * Compaction requires the taking of some coarse locks that are potentially * very heavily contended. Check if the process needs to be scheduled or @@ -62,7 +67,7 @@ static inline bool migrate_async_suitable(int migratetype) static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, bool locked, struct compact_control *cc) { - if (need_resched() || spin_is_contended(lock)) { + if (should_release_lock(lock)) { if (locked) { spin_unlock_irqrestore(lock, *flags); locked = false; @@ -327,7 +332,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, isolate_mode_t mode = 0; struct lruvec *lruvec; unsigned long flags; - bool locked; + bool locked = false; /* * Ensure that there are not too many pages isolated from the LRU @@ -347,23 +352,17 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ cond_resched(); - spin_lock_irqsave(&zone->lru_lock, flags); - locked = true; for (; low_pfn < end_pfn; low_pfn++) { struct page *page; /* give a chance to irqs before checking need_resched() */ - if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - locked = false; + if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { + if (should_release_lock(&zone->lru_lock)) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + locked = false; + } } - /* Check if it is ok to still hold the lock */ - locked = compact_checklock_irqsave(&zone->lru_lock, &flags, - locked, cc); - if (!locked || fatal_signal_pending(current)) - break; - /* * migrate_pfn does not necessarily start aligned to a * pageblock. Ensure that pfn_valid is called when moving @@ -403,21 +402,40 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, pageblock_nr = low_pfn >> pageblock_order; if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { - low_pfn += pageblock_nr_pages; - low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; - last_pageblock_nr = pageblock_nr; - continue; + goto next_pageblock; } + /* Check may be lockless but that's ok as we recheck later */ if (!PageLRU(page)) continue; /* - * PageLRU is set, and lru_lock excludes isolation, - * splitting and collapsing (collapsing has already - * happened if PageLRU is set). + * PageLRU is set. lru_lock normally excludes isolation + * splitting and collapsing (collapsing has already happened + * if PageLRU is set) but the lock is not necessarily taken + * here and it is wasteful to take it just to check transhuge. + * Check TransHuge without lock and skip the whole pageblock if + * it's either a transhuge or hugetlbfs page, as calling + * compound_order() without preventing THP from splitting the + * page underneath us may return surprising results. */ if (PageTransHuge(page)) { + if (!locked) + goto next_pageblock; + low_pfn += (1 << compound_order(page)) - 1; + continue; + } + + /* Check if it is ok to still hold the lock */ + locked = compact_checklock_irqsave(&zone->lru_lock, &flags, + locked, cc); + if (!locked || fatal_signal_pending(current)) + break; + + /* Recheck PageLRU and PageTransHuge under lock */ + if (!PageLRU(page)) + continue; + if (PageTransHuge(page)) { low_pfn += (1 << compound_order(page)) - 1; continue; } @@ -444,6 +462,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, ++low_pfn; break; } + + continue; + +next_pageblock: + low_pfn += pageblock_nr_pages; + low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; + last_pageblock_nr = pageblock_nr; } acct_isolated(zone, locked, cc); -- cgit v1.1 From f40d1e42bb988d2a26e8e111ea4c4c7bac819b7e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:36 -0700 Subject: mm: compaction: acquire the zone->lock as late as possible Compaction's free scanner acquires the zone->lock when checking for PageBuddy pages and isolating them. It does this even if there are no PageBuddy pages in the range. This patch defers acquiring the zone lock for as long as possible. In the event there are no free pages in the pageblock then the lock will not be acquired at all which reduces contention on zone->lock. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Acked-by: Rafael Aquini Acked-by: Minchan Kim Tested-by: Peter Ujfalusi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 140 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 76 insertions(+), 64 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 832c4183..bdf6e13 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -93,6 +93,27 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, return compact_checklock_irqsave(lock, flags, false, cc); } +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + int migratetype = get_pageblock_migratetype(page); + + /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ + if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) + return false; + + /* If the page is a large free page, then allow migration */ + if (PageBuddy(page) && page_order(page) >= pageblock_order) + return true; + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ + if (migrate_async_suitable(migratetype)) + return true; + + /* Otherwise skip the block */ + return false; +} + static void compact_capture_page(struct compact_control *cc) { unsigned long flags; @@ -153,38 +174,56 @@ static void compact_capture_page(struct compact_control *cc) * pages inside of the pageblock (even though it may still end up isolating * some pages). */ -static unsigned long isolate_freepages_block(unsigned long blockpfn, +static unsigned long isolate_freepages_block(struct compact_control *cc, + unsigned long blockpfn, unsigned long end_pfn, struct list_head *freelist, bool strict) { int nr_scanned = 0, total_isolated = 0; struct page *cursor; + unsigned long nr_strict_required = end_pfn - blockpfn; + unsigned long flags; + bool locked = false; cursor = pfn_to_page(blockpfn); - /* Isolate free pages. This assumes the block is valid */ + /* Isolate free pages. */ for (; blockpfn < end_pfn; blockpfn++, cursor++) { int isolated, i; struct page *page = cursor; - if (!pfn_valid_within(blockpfn)) { - if (strict) - return 0; - continue; - } nr_scanned++; + if (!pfn_valid_within(blockpfn)) + continue; + if (!PageBuddy(page)) + continue; + + /* + * The zone lock must be held to isolate freepages. + * Unfortunately this is a very coarse lock and can be + * heavily contended if there are parallel allocations + * or parallel compactions. For async compaction do not + * spin on the lock and we acquire the lock as late as + * possible. + */ + locked = compact_checklock_irqsave(&cc->zone->lock, &flags, + locked, cc); + if (!locked) + break; + + /* Recheck this is a suitable migration target under lock */ + if (!strict && !suitable_migration_target(page)) + break; - if (!PageBuddy(page)) { - if (strict) - return 0; + /* Recheck this is a buddy page under lock */ + if (!PageBuddy(page)) continue; - } /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); if (!isolated && strict) - return 0; + break; total_isolated += isolated; for (i = 0; i < isolated; i++) { list_add(&page->lru, freelist); @@ -199,6 +238,18 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, } trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); + + /* + * If strict isolation is requested by CMA then check that all the + * pages requested were isolated. If there were any failures, 0 is + * returned and CMA will fail. + */ + if (strict && nr_strict_required != total_isolated) + total_isolated = 0; + + if (locked) + spin_unlock_irqrestore(&cc->zone->lock, flags); + return total_isolated; } @@ -218,12 +269,17 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, unsigned long isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long isolated, pfn, block_end_pfn, flags; + unsigned long isolated, pfn, block_end_pfn; struct zone *zone = NULL; LIST_HEAD(freelist); + /* cc needed for isolate_freepages_block to acquire zone->lock */ + struct compact_control cc = { + .sync = true, + }; + if (pfn_valid(start_pfn)) - zone = page_zone(pfn_to_page(start_pfn)); + cc.zone = zone = page_zone(pfn_to_page(start_pfn)); for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) @@ -236,10 +292,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); - spin_lock_irqsave(&zone->lock, flags); - isolated = isolate_freepages_block(pfn, block_end_pfn, + isolated = isolate_freepages_block(&cc, pfn, block_end_pfn, &freelist, true); - spin_unlock_irqrestore(&zone->lock, flags); /* * In strict mode, isolate_freepages_block() returns 0 if @@ -483,29 +537,6 @@ next_pageblock: #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION - -/* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct page *page) -{ - - int migratetype = get_pageblock_migratetype(page); - - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ - if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) - return false; - - /* If the page is a large free page, then allow migration */ - if (PageBuddy(page) && page_order(page) >= pageblock_order) - return true; - - /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(migratetype)) - return true; - - /* Otherwise skip the block */ - return false; -} - /* * Returns the start pfn of the last page block in a zone. This is the starting * point for full compaction of a zone. Compaction searches for free pages from @@ -529,7 +560,6 @@ static void isolate_freepages(struct zone *zone, { struct page *page; unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; - unsigned long flags; int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; @@ -577,30 +607,12 @@ static void isolate_freepages(struct zone *zone, if (!suitable_migration_target(page)) continue; - /* - * Found a block suitable for isolating free pages from. Now - * we disabled interrupts, double check things are ok and - * isolate the pages. This is to minimise the time IRQs - * are disabled - */ + /* Found a block suitable for isolating free pages from */ isolated = 0; - - /* - * The zone lock must be held to isolate freepages. This - * unfortunately this is a very coarse lock and can be - * heavily contended if there are parallel allocations - * or parallel compactions. For async compaction do not - * spin on the lock - */ - if (!compact_trylock_irqsave(&zone->lock, &flags, cc)) - break; - if (suitable_migration_target(page)) { - end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); - isolated = isolate_freepages_block(pfn, end_pfn, - freelist, false); - nr_freepages += isolated; - } - spin_unlock_irqrestore(&zone->lock, flags); + end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); + isolated = isolate_freepages_block(cc, pfn, end_pfn, + freelist, false); + nr_freepages += isolated; /* * Record the highest PFN we isolated pages from. When next -- cgit v1.1 From 753341a4b85ff337487b9959c71c529f522004f4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:40 -0700 Subject: revert "mm: have order > 0 compaction start off where it left" This reverts commit 7db8889ab05b ("mm: have order > 0 compaction start off where it left") and commit de74f1cc ("mm: have order > 0 compaction start near a pageblock with free pages"). These patches were a good idea and tests confirmed that they massively reduced the amount of scanning but the implementation is complex and tricky to understand. A later patch will cache what pageblocks should be skipped and reimplements the concept of compact_cached_free_pfn on top for both migration and free scanners. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Acked-by: Rafael Aquini Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 65 +++++---------------------------------------------------- mm/internal.h | 6 ------ mm/page_alloc.c | 5 ----- 3 files changed, 5 insertions(+), 71 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index bdf6e13..db76361 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -538,20 +538,6 @@ next_pageblock: #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION /* - * Returns the start pfn of the last page block in a zone. This is the starting - * point for full compaction of a zone. Compaction searches for free pages from - * the end of each zone, while isolate_freepages_block scans forward inside each - * page block. - */ -static unsigned long start_free_pfn(struct zone *zone) -{ - unsigned long free_pfn; - free_pfn = zone->zone_start_pfn + zone->spanned_pages; - free_pfn &= ~(pageblock_nr_pages-1); - return free_pfn; -} - -/* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. */ @@ -619,19 +605,8 @@ static void isolate_freepages(struct zone *zone, * looking for free pages, the search will restart here as * page migration may have returned some pages to the allocator */ - if (isolated) { + if (isolated) high_pfn = max(high_pfn, pfn); - - /* - * If the free scanner has wrapped, update - * compact_cached_free_pfn to point to the highest - * pageblock with free pages. This reduces excessive - * scanning of full pageblocks near the end of the - * zone - */ - if (cc->order > 0 && cc->wrapped) - zone->compact_cached_free_pfn = high_pfn; - } } /* split_free_page does not map the pages */ @@ -639,11 +614,6 @@ static void isolate_freepages(struct zone *zone, cc->free_pfn = high_pfn; cc->nr_freepages = nr_freepages; - - /* If compact_cached_free_pfn is reset then set it now */ - if (cc->order > 0 && !cc->wrapped && - zone->compact_cached_free_pfn == start_free_pfn(zone)) - zone->compact_cached_free_pfn = high_pfn; } /* @@ -738,26 +708,8 @@ static int compact_finished(struct zone *zone, if (fatal_signal_pending(current)) return COMPACT_PARTIAL; - /* - * A full (order == -1) compaction run starts at the beginning and - * end of a zone; it completes when the migrate and free scanner meet. - * A partial (order > 0) compaction can start with the free scanner - * at a random point in the zone, and may have to restart. - */ - if (cc->free_pfn <= cc->migrate_pfn) { - if (cc->order > 0 && !cc->wrapped) { - /* We started partway through; restart at the end. */ - unsigned long free_pfn = start_free_pfn(zone); - zone->compact_cached_free_pfn = free_pfn; - cc->free_pfn = free_pfn; - cc->wrapped = 1; - return COMPACT_CONTINUE; - } - return COMPACT_COMPLETE; - } - - /* We wrapped around and ended up where we started. */ - if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn) + /* Compaction run completes if the migrate and free scanner meet */ + if (cc->free_pfn <= cc->migrate_pfn) return COMPACT_COMPLETE; /* @@ -863,15 +815,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) /* Setup to move all movable pages to the end of the zone */ cc->migrate_pfn = zone->zone_start_pfn; - - if (cc->order > 0) { - /* Incremental compaction. Start where the last one stopped. */ - cc->free_pfn = zone->compact_cached_free_pfn; - cc->start_free_pfn = cc->free_pfn; - } else { - /* Order == -1 starts at the end of the zone. */ - cc->free_pfn = start_free_pfn(zone); - } + cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; + cc->free_pfn &= ~(pageblock_nr_pages-1); migrate_prep_local(); diff --git a/mm/internal.h b/mm/internal.h index 97664be..6f6bb9a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -118,14 +118,8 @@ struct compact_control { unsigned long nr_freepages; /* Number of isolated free pages */ unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ - unsigned long start_free_pfn; /* where we started the search */ unsigned long migrate_pfn; /* isolate_migratepages search base */ bool sync; /* Synchronous migration */ - bool wrapped; /* Order > 0 compactions are - incremental, once free_pfn - and migrate_pfn meet, we restart - from the top of the zone; - remember we wrapped around. */ int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ca002b3..628968c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4490,11 +4490,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->spanned_pages = size; zone->present_pages = realsize; -#if defined CONFIG_COMPACTION || defined CONFIG_CMA - zone->compact_cached_free_pfn = zone->zone_start_pfn + - zone->spanned_pages; - zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); -#endif #ifdef CONFIG_NUMA zone->node = nid; zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) -- cgit v1.1 From bb13ffeb9f6bfeb301443994dfbf29f91117dfb3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:41 -0700 Subject: mm: compaction: cache if a pageblock was scanned and no pages were isolated When compaction was implemented it was known that scanning could potentially be excessive. The ideal was that a counter be maintained for each pageblock but maintaining this information would incur a severe penalty due to a shared writable cache line. It has reached the point where the scanning costs are a serious problem, particularly on long-lived systems where a large process starts and allocates a large number of THPs at the same time. Instead of using a shared counter, this patch adds another bit to the pageblock flags called PG_migrate_skip. If a pageblock is scanned by either migrate or free scanner and 0 pages were isolated, the pageblock is marked to be skipped in the future. When scanning, this bit is checked before any scanning takes place and the block skipped if set. The main difficulty with a patch like this is "when to ignore the cached information?" If it's ignored too often, the scanning rates will still be excessive. If the information is too stale then allocations will fail that might have otherwise succeeded. In this patch o CMA always ignores the information o If the migrate and free scanner meet then the cached information will be discarded if it's at least 5 seconds since the last time the cache was discarded o If there are a large number of allocation failures, discard the cache. The time-based heuristic is very clumsy but there are few choices for a better event. Depending solely on multiple allocation failures still allows excessive scanning when THP allocations are failing in quick succession due to memory pressure. Waiting until memory pressure is relieved would cause compaction to continually fail instead of using reclaim/compaction to try allocate the page. The time-based mechanism is clumsy but a better option is not obvious. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Acked-by: Rafael Aquini Cc: Fengguang Wu Cc: Michal Nazarewicz Cc: Bartlomiej Zolnierkiewicz Cc: Kyungmin Park Cc: Mark Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++-------- mm/internal.h | 4 +- mm/page_alloc.c | 38 +++++++++-------- 3 files changed, 131 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index db76361..d9dbb97 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,79 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } +#ifdef CONFIG_COMPACTION +/* Returns true if the pageblock should be scanned for pages to isolate. */ +static inline bool isolation_suitable(struct compact_control *cc, + struct page *page) +{ + if (cc->ignore_skip_hint) + return true; + + return !get_pageblock_skip(page); +} + +/* + * This function is called to clear all cached information on pageblocks that + * should be skipped for page isolation when the migrate and free page scanner + * meet. + */ +static void reset_isolation_suitable(struct zone *zone) +{ + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; + unsigned long pfn; + + /* + * Do not reset more than once every five seconds. If allocations are + * failing sufficiently quickly to allow this to happen then continually + * scanning for compaction is not going to help. The choice of five + * seconds is arbitrary but will mitigate excessive scanning. + */ + if (time_before(jiffies, zone->compact_blockskip_expire)) + return; + zone->compact_blockskip_expire = jiffies + (HZ * 5); + + /* Walk the zone and mark every pageblock as suitable for isolation */ + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + struct page *page; + + cond_resched(); + + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + if (zone != page_zone(page)) + continue; + + clear_pageblock_skip(page); + } +} + +/* + * If no pages were isolated then mark this pageblock to be skipped in the + * future. The information is later cleared by reset_isolation_suitable(). + */ +static void update_pageblock_skip(struct page *page, unsigned long nr_isolated) +{ + if (!page) + return; + + if (!nr_isolated) + set_pageblock_skip(page); +} +#else +static inline bool isolation_suitable(struct compact_control *cc, + struct page *page) +{ + return true; +} + +static void update_pageblock_skip(struct page *page, unsigned long nr_isolated) +{ +} +#endif /* CONFIG_COMPACTION */ + static inline bool should_release_lock(spinlock_t *lock) { return need_resched() || spin_is_contended(lock); @@ -181,7 +254,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, bool strict) { int nr_scanned = 0, total_isolated = 0; - struct page *cursor; + struct page *cursor, *valid_page = NULL; unsigned long nr_strict_required = end_pfn - blockpfn; unsigned long flags; bool locked = false; @@ -196,6 +269,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, nr_scanned++; if (!pfn_valid_within(blockpfn)) continue; + if (!valid_page) + valid_page = page; if (!PageBuddy(page)) continue; @@ -250,6 +325,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (locked) spin_unlock_irqrestore(&cc->zone->lock, flags); + /* Update the pageblock-skip if the whole pageblock was scanned */ + if (blockpfn == end_pfn) + update_pageblock_skip(valid_page, total_isolated); + return total_isolated; } @@ -267,22 +346,14 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * a free page). */ unsigned long -isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) +isolate_freepages_range(struct compact_control *cc, + unsigned long start_pfn, unsigned long end_pfn) { unsigned long isolated, pfn, block_end_pfn; - struct zone *zone = NULL; LIST_HEAD(freelist); - /* cc needed for isolate_freepages_block to acquire zone->lock */ - struct compact_control cc = { - .sync = true, - }; - - if (pfn_valid(start_pfn)) - cc.zone = zone = page_zone(pfn_to_page(start_pfn)); - for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { - if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) + if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn))) break; /* @@ -292,7 +363,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); - isolated = isolate_freepages_block(&cc, pfn, block_end_pfn, + isolated = isolate_freepages_block(cc, pfn, block_end_pfn, &freelist, true); /* @@ -387,6 +458,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, struct lruvec *lruvec; unsigned long flags; bool locked = false; + struct page *page = NULL, *valid_page = NULL; /* * Ensure that there are not too many pages isolated from the LRU @@ -407,8 +479,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ cond_resched(); for (; low_pfn < end_pfn; low_pfn++) { - struct page *page; - /* give a chance to irqs before checking need_resched() */ if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { if (should_release_lock(&zone->lru_lock)) { @@ -444,6 +514,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (page_zone(page) != zone) continue; + if (!valid_page) + valid_page = page; + + /* If isolation recently failed, do not retry */ + pageblock_nr = low_pfn >> pageblock_order; + if (!isolation_suitable(cc, page)) + goto next_pageblock; + /* Skip if free */ if (PageBuddy(page)) continue; @@ -453,7 +531,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, * migration is optimistic to see if the minimum amount of work * satisfies the allocation */ - pageblock_nr = low_pfn >> pageblock_order; if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { goto next_pageblock; @@ -530,6 +607,10 @@ next_pageblock: if (locked) spin_unlock_irqrestore(&zone->lru_lock, flags); + /* Update the pageblock-skip if the whole pageblock was scanned */ + if (low_pfn == end_pfn) + update_pageblock_skip(valid_page, nr_isolated); + trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); return low_pfn; @@ -593,6 +674,10 @@ static void isolate_freepages(struct zone *zone, if (!suitable_migration_target(page)) continue; + /* If isolation recently failed, do not retry */ + if (!isolation_suitable(cc, page)) + continue; + /* Found a block suitable for isolating free pages from */ isolated = 0; end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); @@ -709,8 +794,10 @@ static int compact_finished(struct zone *zone, return COMPACT_PARTIAL; /* Compaction run completes if the migrate and free scanner meet */ - if (cc->free_pfn <= cc->migrate_pfn) + if (cc->free_pfn <= cc->migrate_pfn) { + reset_isolation_suitable(cc->zone); return COMPACT_COMPLETE; + } /* * order == -1 is expected when compacting via @@ -818,6 +905,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; cc->free_pfn &= ~(pageblock_nr_pages-1); + /* Clear pageblock skip if there are numerous alloc failures */ + if (zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT) + reset_isolation_suitable(zone); + migrate_prep_local(); while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { diff --git a/mm/internal.h b/mm/internal.h index 6f6bb9a..7ba56ac 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -120,6 +120,7 @@ struct compact_control { unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ bool sync; /* Synchronous migration */ + bool ignore_skip_hint; /* Scan blocks even if marked skip */ int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ @@ -129,7 +130,8 @@ struct compact_control { }; unsigned long -isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); +isolate_freepages_range(struct compact_control *cc, + unsigned long start_pfn, unsigned long end_pfn); unsigned long isolate_migratepages_range(struct zone *zone, struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 628968c..44c5604 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5679,7 +5679,8 @@ __alloc_contig_migrate_alloc(struct page *page, unsigned long private, } /* [start, end) must belong to a single zone. */ -static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) +static int __alloc_contig_migrate_range(struct compact_control *cc, + unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. */ @@ -5687,25 +5688,17 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) unsigned int tries = 0; int ret = 0; - struct compact_control cc = { - .nr_migratepages = 0, - .order = -1, - .zone = page_zone(pfn_to_page(start)), - .sync = true, - }; - INIT_LIST_HEAD(&cc.migratepages); - migrate_prep_local(); - while (pfn < end || !list_empty(&cc.migratepages)) { + while (pfn < end || !list_empty(&cc->migratepages)) { if (fatal_signal_pending(current)) { ret = -EINTR; break; } - if (list_empty(&cc.migratepages)) { - cc.nr_migratepages = 0; - pfn = isolate_migratepages_range(cc.zone, &cc, + if (list_empty(&cc->migratepages)) { + cc->nr_migratepages = 0; + pfn = isolate_migratepages_range(cc->zone, cc, pfn, end); if (!pfn) { ret = -EINTR; @@ -5717,14 +5710,14 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) break; } - reclaim_clean_pages_from_list(cc.zone, &cc.migratepages); + reclaim_clean_pages_from_list(cc->zone, &cc->migratepages); - ret = migrate_pages(&cc.migratepages, + ret = migrate_pages(&cc->migratepages, __alloc_contig_migrate_alloc, 0, false, MIGRATE_SYNC); } - putback_lru_pages(&cc.migratepages); + putback_lru_pages(&cc->migratepages); return ret > 0 ? 0 : ret; } @@ -5803,6 +5796,15 @@ int alloc_contig_range(unsigned long start, unsigned long end, unsigned long outer_start, outer_end; int ret = 0, order; + struct compact_control cc = { + .nr_migratepages = 0, + .order = -1, + .zone = page_zone(pfn_to_page(start)), + .sync = true, + .ignore_skip_hint = true, + }; + INIT_LIST_HEAD(&cc.migratepages); + /* * What we do here is we mark all pageblocks in range as * MIGRATE_ISOLATE. Because pageblock and max order pages may @@ -5832,7 +5834,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, if (ret) goto done; - ret = __alloc_contig_migrate_range(start, end); + ret = __alloc_contig_migrate_range(&cc, start, end); if (ret) goto done; @@ -5881,7 +5883,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); /* Grab isolated pages from freelists. */ - outer_end = isolate_freepages_range(outer_start, end); + outer_end = isolate_freepages_range(&cc, outer_start, end); if (!outer_end) { ret = -EBUSY; goto done; -- cgit v1.1 From c89511ab2f8fe2b47585e60da8af7fd213ec877e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:45 -0700 Subject: mm: compaction: Restart compaction from near where it left off This is almost entirely based on Rik's previous patches and discussions with him about how this might be implemented. Order > 0 compaction stops when enough free pages of the correct page order have been coalesced. When doing subsequent higher order allocations, it is possible for compaction to be invoked many times. However, the compaction code always starts out looking for things to compact at the start of the zone, and for free pages to compact things to at the end of the zone. This can cause quadratic behaviour, with isolate_freepages starting at the end of the zone each time, even though previous invocations of the compaction code already filled up all free memory on that end of the zone. This can cause isolate_freepages to take enormous amounts of CPU with certain workloads on larger memory systems. This patch caches where the migration and free scanner should start from on subsequent compaction invocations using the pageblock-skip information. When compaction starts it begins from the cached restart points and will update the cached restart points until a page is isolated or a pageblock is skipped that would have been scanned by synchronous compaction. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Acked-by: Rafael Aquini Cc: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++---------- mm/internal.h | 4 ++++ 2 files changed, 52 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index d9dbb97..f94cbc0 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -80,6 +80,9 @@ static void reset_isolation_suitable(struct zone *zone) */ if (time_before(jiffies, zone->compact_blockskip_expire)) return; + + zone->compact_cached_migrate_pfn = start_pfn; + zone->compact_cached_free_pfn = end_pfn; zone->compact_blockskip_expire = jiffies + (HZ * 5); /* Walk the zone and mark every pageblock as suitable for isolation */ @@ -103,13 +106,29 @@ static void reset_isolation_suitable(struct zone *zone) * If no pages were isolated then mark this pageblock to be skipped in the * future. The information is later cleared by reset_isolation_suitable(). */ -static void update_pageblock_skip(struct page *page, unsigned long nr_isolated) +static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, + bool migrate_scanner) { + struct zone *zone = cc->zone; if (!page) return; - if (!nr_isolated) + if (!nr_isolated) { + unsigned long pfn = page_to_pfn(page); set_pageblock_skip(page); + + /* Update where compaction should restart */ + if (migrate_scanner) { + if (!cc->finished_update_migrate && + pfn > zone->compact_cached_migrate_pfn) + zone->compact_cached_migrate_pfn = pfn; + } else { + if (!cc->finished_update_free && + pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; + } + } } #else static inline bool isolation_suitable(struct compact_control *cc, @@ -118,7 +137,9 @@ static inline bool isolation_suitable(struct compact_control *cc, return true; } -static void update_pageblock_skip(struct page *page, unsigned long nr_isolated) +static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, + bool migrate_scanner) { } #endif /* CONFIG_COMPACTION */ @@ -327,7 +348,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) - update_pageblock_skip(valid_page, total_isolated); + update_pageblock_skip(cc, valid_page, total_isolated, false); return total_isolated; } @@ -533,6 +554,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { + cc->finished_update_migrate = true; goto next_pageblock; } @@ -583,6 +605,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, VM_BUG_ON(PageTransCompound(page)); /* Successfully isolated */ + cc->finished_update_migrate = true; del_page_from_lru_list(page, lruvec, page_lru(page)); list_add(&page->lru, migratelist); cc->nr_migratepages++; @@ -609,7 +632,7 @@ next_pageblock: /* Update the pageblock-skip if the whole pageblock was scanned */ if (low_pfn == end_pfn) - update_pageblock_skip(valid_page, nr_isolated); + update_pageblock_skip(cc, valid_page, nr_isolated, true); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -690,8 +713,10 @@ static void isolate_freepages(struct zone *zone, * looking for free pages, the search will restart here as * page migration may have returned some pages to the allocator */ - if (isolated) + if (isolated) { + cc->finished_update_free = true; high_pfn = max(high_pfn, pfn); + } } /* split_free_page does not map the pages */ @@ -888,6 +913,8 @@ unsigned long compaction_suitable(struct zone *zone, int order) static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; ret = compaction_suitable(zone, cc->order); switch (ret) { @@ -900,10 +927,21 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } - /* Setup to move all movable pages to the end of the zone */ - cc->migrate_pfn = zone->zone_start_pfn; - cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; - cc->free_pfn &= ~(pageblock_nr_pages-1); + /* + * Setup to move all movable pages to the end of the zone. Used cached + * information on where the scanners should start but check that it + * is initialised by ensuring the values are within zone boundaries. + */ + cc->migrate_pfn = zone->compact_cached_migrate_pfn; + cc->free_pfn = zone->compact_cached_free_pfn; + if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { + cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); + zone->compact_cached_free_pfn = cc->free_pfn; + } + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { + cc->migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn = cc->migrate_pfn; + } /* Clear pageblock skip if there are numerous alloc failures */ if (zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT) diff --git a/mm/internal.h b/mm/internal.h index 7ba56ac..7f72f24 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -121,6 +121,10 @@ struct compact_control { unsigned long migrate_pfn; /* isolate_migratepages search base */ bool sync; /* Synchronous migration */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool finished_update_free; /* True when the zone cached pfns are + * no longer being updated + */ + bool finished_update_migrate; int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ -- cgit v1.1 From 62997027ca5b3d4618198ed8b1aba40b61b1137b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Oct 2012 16:32:47 -0700 Subject: mm: compaction: clear PG_migrate_skip based on compaction and reclaim activity Compaction caches if a pageblock was scanned and no pages were isolated so that the pageblocks can be skipped in the future to reduce scanning. This information is not cleared by the page allocator based on activity due to the impact it would have to the page allocator fast paths. Hence there is a requirement that something clear the cache or pageblocks will be skipped forever. Currently the cache is cleared if there were a number of recent allocation failures and it has not been cleared within the last 5 seconds. Time-based decisions like this are terrible as they have no relationship to VM activity and is basically a big hammer. Unfortunately, accurate heuristics would add cost to some hot paths so this patch implements a rough heuristic. There are two cases where the cache is cleared. 1. If a !kswapd process completes a compaction cycle (migrate and free scanner meet), the zone is marked compact_blockskip_flush. When kswapd goes to sleep, it will clear the cache. This is expected to be the common case where the cache is cleared. It does not really matter if kswapd happens to be asleep or going to sleep when the flag is set as it will be woken on the next allocation request. 2. If there have been multiple failures recently and compaction just finished being deferred then a process will clear the cache and start a full scan. This situation happens if there are multiple high-order allocation requests under heavy memory pressure. The clearing of the PG_migrate_skip bits and other scans is inherently racy but the race is harmless. For allocations that can fail such as THP, they will simply fail. For requests that cannot fail, they will retry the allocation. Tests indicated that scanning rates were roughly similar to when the time-based heuristic was used and the allocation success rates were similar. Signed-off-by: Mel Gorman Cc: Rik van Riel Cc: Richard Davies Cc: Shaohua Li Cc: Avi Kivity Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 50 ++++++++++++++++++++++++++++++++++---------------- mm/page_alloc.c | 1 + mm/vmscan.c | 8 ++++++++ 3 files changed, 43 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index f94cbc0..d8187f9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -66,24 +66,15 @@ static inline bool isolation_suitable(struct compact_control *cc, * should be skipped for page isolation when the migrate and free page scanner * meet. */ -static void reset_isolation_suitable(struct zone *zone) +static void __reset_isolation_suitable(struct zone *zone) { unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; unsigned long pfn; - /* - * Do not reset more than once every five seconds. If allocations are - * failing sufficiently quickly to allow this to happen then continually - * scanning for compaction is not going to help. The choice of five - * seconds is arbitrary but will mitigate excessive scanning. - */ - if (time_before(jiffies, zone->compact_blockskip_expire)) - return; - zone->compact_cached_migrate_pfn = start_pfn; zone->compact_cached_free_pfn = end_pfn; - zone->compact_blockskip_expire = jiffies + (HZ * 5); + zone->compact_blockskip_flush = false; /* Walk the zone and mark every pageblock as suitable for isolation */ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { @@ -102,9 +93,24 @@ static void reset_isolation_suitable(struct zone *zone) } } +void reset_isolation_suitable(pg_data_t *pgdat) +{ + int zoneid; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + /* Only flush if a full compaction finished recently */ + if (zone->compact_blockskip_flush) + __reset_isolation_suitable(zone); + } +} + /* * If no pages were isolated then mark this pageblock to be skipped in the - * future. The information is later cleared by reset_isolation_suitable(). + * future. The information is later cleared by __reset_isolation_suitable(). */ static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, @@ -820,7 +826,15 @@ static int compact_finished(struct zone *zone, /* Compaction run completes if the migrate and free scanner meet */ if (cc->free_pfn <= cc->migrate_pfn) { - reset_isolation_suitable(cc->zone); + /* + * Mark that the PG_migrate_skip information should be cleared + * by kswapd when it goes to sleep. kswapd does not set the + * flag itself as the decision to be clear should be directly + * based on an allocation request. + */ + if (!current_is_kswapd()) + zone->compact_blockskip_flush = true; + return COMPACT_COMPLETE; } @@ -943,9 +957,13 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) zone->compact_cached_migrate_pfn = cc->migrate_pfn; } - /* Clear pageblock skip if there are numerous alloc failures */ - if (zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT) - reset_isolation_suitable(zone); + /* + * Clear pageblock skip if there were failures recently and compaction + * is about to be retried after being deferred. kswapd does not do + * this reset as it'll reset the cached information when going to sleep. + */ + if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + __reset_isolation_suitable(zone); migrate_prep_local(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 44c5604..b97cf12 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2172,6 +2172,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (page) { got_page: + preferred_zone->compact_blockskip_flush = false; preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; if (order >= preferred_zone->compact_order_failed) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1ee4b69..b010efc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2895,6 +2895,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); + /* + * Compaction records what page blocks it recently failed to + * isolate pages from and skips them in the future scanning. + * When kswapd is going to sleep, it is reasonable to assume + * that pages and compaction may succeed so reset the cache. + */ + reset_isolation_suitable(pgdat); + if (!kthread_should_stop()) schedule(); -- cgit v1.1 From 3f6d4caeb9a9d8f7e5bbf3f49f1fd71e1414ff64 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Mon, 8 Oct 2012 16:32:50 -0700 Subject: mm/hugetlb.c: remove duplicate inclusion of header file Signed-off-by: Sachin Kamat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c9b40e3..8536741 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -30,7 +30,6 @@ #include #include #include -#include #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -- cgit v1.1 From 723a0644a7255f532575fd43245f9ef976491328 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:52 -0700 Subject: mm/page_alloc: refactor out __alloc_contig_migrate_alloc() __alloc_contig_migrate_alloc() can be used by memory-hotplug so refactor it out (move + rename as a common name) into page_isolation.c. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Minchan Kim Cc: Kamezawa Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Michal Nazarewicz Cc: Marek Szyprowski Cc: Wen Congyang Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +------------- mm/page_isolation.c | 11 +++++++++++ 2 files changed, 12 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b97cf12..8ac5938 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5667,18 +5667,6 @@ static unsigned long pfn_max_align_up(unsigned long pfn) pageblock_nr_pages)); } -static struct page * -__alloc_contig_migrate_alloc(struct page *page, unsigned long private, - int **resultp) -{ - gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; - - if (PageHighMem(page)) - gfp_mask |= __GFP_HIGHMEM; - - return alloc_page(gfp_mask); -} - /* [start, end) must belong to a single zone. */ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) @@ -5714,7 +5702,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, reclaim_clean_pages_from_list(cc->zone, &cc->migratepages); ret = migrate_pages(&cc->migratepages, - __alloc_contig_migrate_alloc, + alloc_migrate_target, 0, false, MIGRATE_SYNC); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5f34a90..f2f5b48 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -255,3 +255,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) spin_unlock_irqrestore(&zone->lock, flags); return ret ? 0 : -EBUSY; } + +struct page *alloc_migrate_target(struct page *page, unsigned long private, + int **resultp) +{ + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + + return alloc_page(gfp_mask); +} -- cgit v1.1 From 74c08f982674cfd5dfeb2702d631db9bcdabf788 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:32:54 -0700 Subject: memory-hotplug: don't replace lowmem pages with highmem The changelog for commit 6a6dccba2fdc ("mm: cma: don't replace lowmem pages with highmem") mentioned that lowmem pages can be replaced by highmem pages during CMA migration. 6a6dccba2fdc fixed that issue. Quote from that changelog: : The filesystem layer expects pages in the block device's mapping to not : be in highmem (the mapping's gfp mask is set in bdget()), but CMA can : currently replace lowmem pages with highmem pages, leading to crashes in : filesystem code such as the one below: : : Unable to handle kernel NULL pointer dereference at virtual address 00000400 : pgd = c0c98000 : [00000400] *pgd=00c91831, *pte=00000000, *ppte=00000000 : Internal error: Oops: 817 [#1] PREEMPT SMP ARM : CPU: 0 Not tainted (3.5.0-rc5+ #80) : PC is at __memzero+0x24/0x80 : ... : Process fsstress (pid: 323, stack limit = 0xc0cbc2f0) : Backtrace: : [] (ext4_getblk+0x0/0x180) from [] (ext4_bread+0x1c/0x98) : [] (ext4_bread+0x0/0x98) from [] (ext4_mkdir+0x160/0x3bc) : r4:c15337f0 : [] (ext4_mkdir+0x0/0x3bc) from [] (vfs_mkdir+0x8c/0x98) : [] (vfs_mkdir+0x0/0x98) from [] (sys_mkdirat+0x74/0xac) : r6:00000000 r5:c152eb40 r4:000001ff r3:c14b43f0 : [] (sys_mkdirat+0x0/0xac) from [] (sys_mkdir+0x20/0x24) : r6:beccdcf0 r5:00074000 r4:beccdbbc : [] (sys_mkdir+0x0/0x24) from [] (ret_fast_syscall+0x0/0x30) Memory-hotplug has same problem as CMA has so the same fix can be applied to memory-hotplug as well. Fix it by reusing. Signed-off-by: Minchan Kim Cc: Kamezawa Hiroyuki Reviewed-by: Yasuaki Ishimatsu Acked-by: Michal Nazarewicz Cc: Marek Szyprowski Cc: Wen Congyang Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b3501615..f9ac095 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -756,13 +756,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end) return 0; } -static struct page * -hotremove_migrate_alloc(struct page *page, unsigned long private, int **x) -{ - /* This should be improooooved!! */ - return alloc_page(GFP_HIGHUSER_MOVABLE); -} - #define NR_OFFLINE_AT_ONCE_PAGES (256) static int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) @@ -813,8 +806,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) putback_lru_pages(&source); goto out; } - /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, + + /* + * alloc_migrate_target should be improooooved!! + * migrate_pages returns # of failed pages. + */ + ret = migrate_pages(&source, alloc_migrate_target, 0, true, MIGRATE_SYNC); if (ret) putback_lru_pages(&source); -- cgit v1.1 From e3b4126c556ca3a07699adf202d44bed3f453638 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 8 Oct 2012 16:32:57 -0700 Subject: thp: khugepaged_prealloc_page() forgot to reset the page alloc indicator If NUMA is enabled, the indicator is not reset if the previous page request failed, ausing us to trigger the BUG_ON() in khugepaged_alloc_page(). Signed-off-by: Xiao Guangrong Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Michel Lespinasse Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7cf8b0e..7153e0d1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1797,6 +1797,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) return false; *wait = false; + *hpage = NULL; khugepaged_alloc_sleep(); } else if (*hpage) { put_page(*hpage); -- cgit v1.1 From eab1eef9911c36966b5d5934e6970581b3316013 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 8 Oct 2012 16:33:01 -0700 Subject: mm: thp: fix the update_mmu_cache() last argument passing in mm/huge_memory.c The update_mmu_cache() takes a pointer (to pte_t by default) as the last argument but the huge_memory.c passes a pmd_t value. The patch changes the argument to the pmd_t * pointer. Signed-off-by: Catalin Marinas Signed-off-by: Steve Capper Signed-off-by: Will Deacon Cc: Arnd Bergmann Reviewed-by: Kirill A. Shutemov Cc: Michal Hocko Cc: Gerald Schaefer Reviewed-by: Andrea Arcangeli Cc: Chris Metcalf Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7153e0d1..0e77409 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -889,7 +889,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } @@ -941,7 +941,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pmdp_clear_flush_notify(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, pmd); page_remove_rmap(page); put_page(page); ret |= VM_FAULT_WRITE; @@ -2002,7 +2002,7 @@ static void collapse_huge_page(struct mm_struct *mm, BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache(vma, address, _pmd); + update_mmu_cache(vma, address, pmd); pgtable_trans_huge_deposit(mm, pgtable); spin_unlock(&mm->page_table_lock); -- cgit v1.1 From 05106e6a54aed321191b4bb5c9ee09538cbad3b1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 8 Oct 2012 16:33:03 -0700 Subject: mm: enable CONFIG_COMPACTION by default Now that lumpy reclaim has been removed, compaction is the only way left to free up contiguous memory areas. It is time to just enable CONFIG_COMPACTION by default. Signed-off-by: Rik van Riel Cc: Mel Gorman Acked-by: Rafael Aquini Acked-by: Johannes Weiner Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 3322342..a3f8ddd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -191,6 +191,7 @@ config SPLIT_PTLOCK_CPUS # support for memory compaction config COMPACTION bool "Allow for memory compaction" + def_bool y select MIGRATION depends on MMU help -- cgit v1.1 From 7f1290f2f2a4d2c3f1b7ce8e87256e052ca23125 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Mon, 8 Oct 2012 16:33:06 -0700 Subject: mm: fix-up zone present pages I think zone->present_pages indicates pages that buddy system can management, it should be: zone->present_pages = spanned pages - absent pages - bootmem pages, but is now: zone->present_pages = spanned pages - absent pages - memmap pages. spanned pages: total size, including holes. absent pages: holes. bootmem pages: pages used in system boot, managed by bootmem allocator. memmap pages: pages used by page structs. This may cause zone->present_pages less than it should be. For example, numa node 1 has ZONE_NORMAL and ZONE_MOVABLE, it's memmap and other bootmem will be allocated from ZONE_MOVABLE, so ZONE_NORMAL's present_pages should be spanned pages - absent pages, but now it also minus memmap pages(free_area_init_core), which are actually allocated from ZONE_MOVABLE. When offlining all memory of a zone, this will cause zone->present_pages less than 0, because present_pages is unsigned long type, it is actually a very large integer, it indirectly caused zone->watermark[WMARK_MIN] becomes a large integer(setup_per_zone_wmarks()), than cause totalreserve_pages become a large integer(calculate_totalreserve_pages()), and finally cause memory allocating failure when fork process(__vm_enough_memory()). [root@localhost ~]# dmesg -bash: fork: Cannot allocate memory I think the bug described in http://marc.info/?l=linux-mm&m=134502182714186&w=2 is also caused by wrong zone present pages. This patch intends to fix-up zone->present_pages when memory are freed to buddy system on x86_64 and IA64 platforms. Signed-off-by: Jianguo Wu Signed-off-by: Jiang Liu Reported-by: Petr Tesarik Tested-by: Petr Tesarik Cc: "Luck, Tony" Cc: Mel Gorman Cc: Yinghai Lu Cc: Minchan Kim Cc: Johannes Weiner Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 10 +++++++++- mm/memory_hotplug.c | 7 +++++++ mm/nobootmem.c | 3 +++ mm/page_alloc.c | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index f468185..434be4a 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) int order = ilog2(BITS_PER_LONG); __free_pages_bootmem(pfn_to_page(start), order); + fixup_zone_present_pages(page_to_nid(pfn_to_page(start)), + start, start + BITS_PER_LONG); count += BITS_PER_LONG; start += BITS_PER_LONG; } else { @@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) if (vec & 1) { page = pfn_to_page(start + off); __free_pages_bootmem(page, 0); + fixup_zone_present_pages( + page_to_nid(page), + start + off, start + off + 1); count++; } vec >>= 1; @@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) pages = bdata->node_low_pfn - bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages); count += pages; - while (pages--) + while (pages--) { + fixup_zone_present_pages(page_to_nid(page), + page_to_pfn(page), page_to_pfn(page) + 1); __free_pages_bootmem(page++, 0); + } bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f9ac095..ce690a9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page, void __ref put_page_bootmem(struct page *page) { unsigned long type; + struct zone *zone; type = (unsigned long) page->lru.next; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || @@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page) set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); + + zone = page_zone(page); + zone_span_writelock(zone); + zone->present_pages++; + zone_span_writeunlock(zone); + totalram_pages++; } } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bd82f6b..714d5d6 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start, return 0; __free_pages_memory(start_pfn, end_pfn); + fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT), + start_pfn, end_pfn); return end_pfn - start_pfn; } @@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid) phys_addr_t start, end, size; u64 i; + reset_zone_present_pages(); for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) count += __free_memory_core(start, end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8ac5938..00750bc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6087,3 +6087,37 @@ void dump_page(struct page *page) dump_page_flags(page->flags); mem_cgroup_print_bad_page(page); } + +/* reset zone->present_pages */ +void reset_zone_present_pages(void) +{ + struct zone *z; + int i, nid; + + for_each_node_state(nid, N_HIGH_MEMORY) { + for (i = 0; i < MAX_NR_ZONES; i++) { + z = NODE_DATA(nid)->node_zones + i; + z->present_pages = 0; + } + } +} + +/* calculate zone's present pages in buddy system */ +void fixup_zone_present_pages(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + struct zone *z; + unsigned long zone_start_pfn, zone_end_pfn; + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) { + z = NODE_DATA(nid)->node_zones + i; + zone_start_pfn = z->zone_start_pfn; + zone_end_pfn = zone_start_pfn + z->spanned_pages; + + /* if the two regions intersect */ + if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn)) + z->present_pages += min(end_pfn, zone_end_pfn) - + max(start_pfn, zone_start_pfn); + } +} -- cgit v1.1 From 4bd2c1ee4b439d926e437a4841e8145230df98c9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Oct 2012 16:33:10 -0700 Subject: memcg: cleanup kmem tcp ifdefs TCP kmem accounting is currently guarded by CONFIG_MEMCG_KMEM ifdefs but the code is not used if !CONFIG_INET so we should rather test for both. The same applies to net/sock.h, net/ip.h and net/tcp_memcontrol.h but let's keep those outside of any ifdefs because it is considered safer wrt. future maintainability. Tested with - CONFIG_INET && CONFIG_MEMCG_KMEM - !CONFIG_INET && CONFIG_MEMCG_KMEM - CONFIG_INET && !CONFIG_MEMCG_KMEM - !CONFIG_INET && !CONFIG_MEMCG_KMEM Signed-off-by: Sachin Kamat Signed-off-by: Michal Hocko Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a72f2ff..4f5f936 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -51,6 +51,7 @@ #include #include "internal.h" #include +#include #include #include @@ -326,7 +327,7 @@ struct mem_cgroup { struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; -#ifdef CONFIG_INET +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct tcp_memcontrol tcp_mem; #endif }; @@ -412,9 +413,7 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) } /* Writing them here to avoid exposing memcg's inner layout */ -#ifdef CONFIG_MEMCG_KMEM -#include -#include +#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) static bool mem_cgroup_is_root(struct mem_cgroup *memcg); void sock_update_memcg(struct sock *sk) @@ -461,7 +460,6 @@ void sock_release_memcg(struct sock *sk) } } -#ifdef CONFIG_INET struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) { if (!memcg || mem_cgroup_is_root(memcg)) @@ -470,10 +468,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) return &memcg->tcp_mem.cg_proto; } EXPORT_SYMBOL(tcp_proto_cgroup); -#endif /* CONFIG_INET */ -#endif /* CONFIG_MEMCG_KMEM */ -#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) static void disarm_sock_keys(struct mem_cgroup *memcg) { if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) -- cgit v1.1 From 7ffc0edc49d0df5dac077c1830e2533b27d3a4ed Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Oct 2012 16:33:13 -0700 Subject: memcg: move mem_cgroup_is_root upwards kmem code uses this function and it is better to not use forward declarations for static inline functions as some (older) compilers don't like it: gcc version 4.3.4 [gcc-4_3-branch revision 152973] (SUSE Linux) mm/memcontrol.c:421: warning: `mem_cgroup_is_root' declared inline after being called mm/memcontrol.c:421: warning: previous declaration of `mem_cgroup_is_root' was here Signed-off-by: Michal Hocko Cc: Glauber Costa Cc: Sachin Kamat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4f5f936..7acf43b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -412,10 +412,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) return container_of(s, struct mem_cgroup, css); } +static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) +{ + return (memcg == root_mem_cgroup); +} + /* Writing them here to avoid exposing memcg's inner layout */ #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) -static bool mem_cgroup_is_root(struct mem_cgroup *memcg); void sock_update_memcg(struct sock *sk) { if (mem_cgroup_sockets_enabled) { @@ -1011,11 +1015,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) -{ - return (memcg == root_mem_cgroup); -} - void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { struct mem_cgroup *memcg; -- cgit v1.1 From ec4d9f626d5908b6052c2973f37992f1db52e967 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:14 -0700 Subject: mm: fix invalidate_complete_page2() lock ordering In fuzzing with trinity, lockdep protested "possible irq lock inversion dependency detected" when isolate_lru_page() reenabled interrupts while still holding the supposedly irq-safe tree_lock: invalidate_inode_pages2 invalidate_complete_page2 spin_lock_irq(&mapping->tree_lock) clear_page_mlock isolate_lru_page spin_unlock_irq(&zone->lru_lock) isolate_lru_page() is correct to enable interrupts unconditionally: invalidate_complete_page2() is incorrect to call clear_page_mlock() while holding tree_lock, which is supposed to nest inside lru_lock. Both truncate_complete_page() and invalidate_complete_page() call clear_page_mlock() before taking tree_lock to remove page from radix_tree. I guess invalidate_complete_page2() preferred to test PageDirty (again) under tree_lock before committing to the munlock; but since the page has already been unmapped, its state is already somewhat inconsistent, and no worse if clear_page_mlock() moved up. Reported-by: Sasha Levin Deciphered-by: Andrew Morton Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Cc: Michel Lespinasse Cc: Ying Han Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 75801ac..f38055c 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -394,11 +394,12 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; + clear_page_mlock(page); + spin_lock_irq(&mapping->tree_lock); if (PageDirty(page)) goto failed; - clear_page_mlock(page); BUG_ON(page_has_private(page)); __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); -- cgit v1.1 From 39b5f29ac1f988c1615fbc9c69f6651ab0d0c3c7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:18 -0700 Subject: mm: remove vma arg from page_evictable page_evictable(page, vma) is an irritant: almost all its callers pass NULL for vma. Remove the vma arg and use mlocked_vma_newpage(vma, page) explicitly in the couple of places it's needed. But in those places we don't even need page_evictable() itself! They're dealing with a freshly allocated anonymous page, which has no "mapping" and cannot be mlocked yet. Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Michel Lespinasse Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 5 ++--- mm/ksm.c | 2 +- mm/rmap.c | 2 +- mm/swap.c | 2 +- mm/vmscan.c | 27 +++++++++------------------ 5 files changed, 14 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 7f72f24..78f25d6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -168,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } /* - * Called only in fault path via page_evictable() for a new page - * to determine if it's being mapped into a LOCKED vma. - * If so, mark page as mlocked. + * Called only in fault path, to determine if a new page is being + * mapped into a LOCKED vma. If it is, mark page as mlocked. */ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, struct page *page) diff --git a/mm/ksm.c b/mm/ksm.c index 14ee5cf..ecbc090 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1586,7 +1586,7 @@ struct page *ksm_does_need_to_copy(struct page *page, SetPageSwapBacked(new_page); __set_page_locked(new_page); - if (page_evictable(new_page, vma)) + if (!mlocked_vma_newpage(vma, new_page)) lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); else add_page_to_unevictable_list(new_page); diff --git a/mm/rmap.c b/mm/rmap.c index 2877741..0d86433 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1080,7 +1080,7 @@ void page_add_new_anon_rmap(struct page *page, else __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); - if (page_evictable(page, vma)) + if (!mlocked_vma_newpage(vma, page)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); else add_page_to_unevictable_list(page); diff --git a/mm/swap.c b/mm/swap.c index f76c76c..6310dc2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -751,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, SetPageLRU(page_tail); - if (page_evictable(page_tail, NULL)) { + if (page_evictable(page_tail)) { if (PageActive(page)) { SetPageActive(page_tail); active = 1; diff --git a/mm/vmscan.c b/mm/vmscan.c index b010efc..8b62730 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -553,7 +553,7 @@ void putback_lru_page(struct page *page) redo: ClearPageUnevictable(page); - if (page_evictable(page, NULL)) { + if (page_evictable(page)) { /* * For evictable pages, we can use the cache. * In event of a race, worst case is we end up with an @@ -587,7 +587,7 @@ redo: * page is on unevictable list, it never be freed. To avoid that, * check after we added it to the list, again. */ - if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { + if (lru == LRU_UNEVICTABLE && page_evictable(page)) { if (!isolate_lru_page(page)) { put_page(page); goto redo; @@ -709,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, sc->nr_scanned++; - if (unlikely(!page_evictable(page, NULL))) + if (unlikely(!page_evictable(page))) goto cull_mlocked; if (!sc->may_unmap && page_mapped(page)) @@ -1217,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) VM_BUG_ON(PageLRU(page)); list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { + if (unlikely(!page_evictable(page))) { spin_unlock_irq(&zone->lru_lock); putback_lru_page(page); spin_lock_irq(&zone->lru_lock); @@ -1470,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan, page = lru_to_page(&l_hold); list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { + if (unlikely(!page_evictable(page))) { putback_lru_page(page); continue; } @@ -3414,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) /* * page_evictable - test whether a page is evictable * @page: the page to test - * @vma: the VMA in which the page is or will be mapped, may be NULL * * Test whether page is evictable--i.e., should be placed on active/inactive - * lists vs unevictable list. The vma argument is !NULL when called from the - * fault path to determine how to instantate a new page. + * lists vs unevictable list. * * Reasons page might not be evictable: * (1) page's mapping marked unevictable * (2) page is part of an mlocked VMA * */ -int page_evictable(struct page *page, struct vm_area_struct *vma) +int page_evictable(struct page *page) { - - if (mapping_unevictable(page_mapping(page))) - return 0; - - if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page))) - return 0; - - return 1; + return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); } #ifdef CONFIG_SHMEM @@ -3472,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) if (!PageLRU(page) || !PageUnevictable(page)) continue; - if (page_evictable(page, NULL)) { + if (page_evictable(page)) { enum lru_list lru = page_lru_base_type(page); VM_BUG_ON(PageActive(page)); -- cgit v1.1 From e6c509f85455041d3d7c4b863bf80bc294288cc1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:19 -0700 Subject: mm: use clear_page_mlock() in page_remove_rmap() We had thought that pages could no longer get freed while still marked as mlocked; but Johannes Weiner posted this program to demonstrate that truncating an mlocked private file mapping containing COWed pages is still mishandled: #include #include #include #include #include #include #include int main(void) { char *map; int fd; system("grep mlockfreed /proc/vmstat"); fd = open("chigurh", O_CREAT|O_EXCL|O_RDWR); unlink("chigurh"); ftruncate(fd, 4096); map = mmap(NULL, 4096, PROT_WRITE, MAP_PRIVATE, fd, 0); map[0] = 11; mlock(map, sizeof(fd)); ftruncate(fd, 0); close(fd); munlock(map, sizeof(fd)); munmap(map, 4096); system("grep mlockfreed /proc/vmstat"); return 0; } The anon COWed pages are not caught by truncation's clear_page_mlock() of the pagecache pages; but unmap_mapping_range() unmaps them, so we ought to look out for them there in page_remove_rmap(). Indeed, why should truncation or invalidation be doing the clear_page_mlock() when removing from pagecache? mlock is a property of mapping in userspace, not a property of pagecache: an mlocked unmapped page is nonsensical. Reported-by: Johannes Weiner Signed-off-by: Hugh Dickins Cc: Mel Gorman Cc: Rik van Riel Cc: Michel Lespinasse Cc: Ying Han Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 7 +------ mm/memory.c | 10 +++++----- mm/mlock.c | 16 +++------------- mm/rmap.c | 4 ++++ mm/truncate.c | 4 ---- 5 files changed, 13 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 78f25d6..4dc93e2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -201,12 +201,7 @@ extern void munlock_vma_page(struct page *page); * If called for a page that is still mapped by mlocked vmas, all we do * is revert to lazy LRU behaviour -- semantics are not broken. */ -extern void __clear_page_mlock(struct page *page); -static inline void clear_page_mlock(struct page *page) -{ - if (unlikely(TestClearPageMlocked(page))) - __clear_page_mlock(page); -} +extern void clear_page_mlock(struct page *page); /* * mlock_migrate_page - called only from migrate_page_copy() to diff --git a/mm/memory.c b/mm/memory.c index d205e43..5f5d1f0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1577,12 +1577,12 @@ split_fallthrough: if (page->mapping && trylock_page(page)) { lru_add_drain(); /* push cached pages to LRU */ /* - * Because we lock page here and migration is - * blocked by the pte's page reference, we need - * only check for file-cache page truncation. + * Because we lock page here, and migration is + * blocked by the pte's page reference, and we + * know the page is still mapped, we don't even + * need to check for file-cache page truncation. */ - if (page->mapping) - mlock_vma_page(page); + mlock_vma_page(page); unlock_page(page); } } diff --git a/mm/mlock.c b/mm/mlock.c index a948be4..de73215 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -51,13 +51,10 @@ EXPORT_SYMBOL(can_do_mlock); /* * LRU accounting for clear_page_mlock() */ -void __clear_page_mlock(struct page *page) +void clear_page_mlock(struct page *page) { - VM_BUG_ON(!PageLocked(page)); - - if (!page->mapping) { /* truncated ? */ + if (!TestClearPageMlocked(page)) return; - } dec_zone_page_state(page, NR_MLOCK); count_vm_event(UNEVICTABLE_PGCLEARED); @@ -290,14 +287,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); if (page && !IS_ERR(page)) { lock_page(page); - /* - * Like in __mlock_vma_pages_range(), - * because we lock page here and migration is - * blocked by the elevated reference, we need - * only check for file-cache page truncation. - */ - if (page->mapping) - munlock_vma_page(page); + munlock_vma_page(page); unlock_page(page); put_page(page); } diff --git a/mm/rmap.c b/mm/rmap.c index 0d86433..bf03149 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1155,7 +1155,10 @@ void page_remove_rmap(struct page *page) } else { __dec_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); + mem_cgroup_end_update_page_stat(page, &locked, &flags); } + if (unlikely(PageMlocked(page))) + clear_page_mlock(page); /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap @@ -1165,6 +1168,7 @@ void page_remove_rmap(struct page *page) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ + return; out: if (!anon) mem_cgroup_end_update_page_stat(page, &locked, &flags); diff --git a/mm/truncate.c b/mm/truncate.c index f38055c..d51ce92 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) cancel_dirty_page(page, PAGE_CACHE_SIZE); - clear_page_mlock(page); ClearPageMappedToDisk(page); delete_from_page_cache(page); return 0; @@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, 0)) return 0; - clear_page_mlock(page); ret = remove_mapping(mapping, page); return ret; @@ -394,8 +392,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - clear_page_mlock(page); - spin_lock_irq(&mapping->tree_lock); if (PageDirty(page)) goto failed; -- cgit v1.1 From a0c5e813f087dffc0d9b173d2e7d3328b1482fd5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:21 -0700 Subject: mm: remove free_page_mlock We should not be seeing non-0 unevictable_pgs_mlockfreed any longer. So remove free_page_mlock() from the page freeing paths: __PG_MLOCKED is already in PAGE_FLAGS_CHECK_AT_FREE, so free_pages_check() will now be checking it, reporting "BUG: Bad page state" if it's ever found set. Comment UNEVICTABLE_MLOCKFREED and unevictable_pgs_mlockfreed always 0. Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Cc: Michel Lespinasse Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 ----------------- mm/vmstat.c | 2 +- 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 00750bc..dbb5386 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -598,17 +598,6 @@ out: zone->free_area[order].nr_free++; } -/* - * free_page_mlock() -- clean up attempts to free and mlocked() page. - * Page should not be on lru, so no need to fix that up. - * free_pages_check() will verify... - */ -static inline void free_page_mlock(struct page *page) -{ - __dec_zone_page_state(page, NR_MLOCK); - __count_vm_event(UNEVICTABLE_MLOCKFREED); -} - static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | @@ -728,15 +717,12 @@ static bool free_pages_prepare(struct page *page, unsigned int order) static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; - int wasMlocked = __TestClearPageMlocked(page); int migratetype; if (!free_pages_prepare(page, order)) return; local_irq_save(flags); - if (unlikely(wasMlocked)) - free_page_mlock(page); __count_vm_events(PGFREE, 1 << order); migratetype = get_pageblock_migratetype(page); set_freepage_migratetype(page, migratetype); @@ -1310,7 +1296,6 @@ void free_hot_cold_page(struct page *page, int cold) struct per_cpu_pages *pcp; unsigned long flags; int migratetype; - int wasMlocked = __TestClearPageMlocked(page); if (!free_pages_prepare(page, 0)) return; @@ -1318,8 +1303,6 @@ void free_hot_cold_page(struct page *page, int cold) migratetype = get_pageblock_migratetype(page); set_freepage_migratetype(page, migratetype); local_irq_save(flags); - if (unlikely(wasMlocked)) - free_page_mlock(page); __count_vm_event(PGFREE); /* diff --git a/mm/vmstat.c b/mm/vmstat.c index acbd85c..05e3a99 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -782,7 +782,7 @@ const char * const vmstat_text[] = { "unevictable_pgs_munlocked", "unevictable_pgs_cleared", "unevictable_pgs_stranded", - "unevictable_pgs_mlockfreed", + "unevictable_pgs_mlockfreed", /* no longer useful: always zero */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE "thp_fault_alloc", -- cgit v1.1 From 957f822a0ab95e88b146638bad6209bbc315bedd Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 8 Oct 2012 16:33:24 -0700 Subject: mm, numa: reclaim from all nodes within reclaim distance RECLAIM_DISTANCE represents the distance between nodes at which it is deemed too costly to allocate from; it's preferred to try to reclaim from a local zone before falling back to allocating on a remote node with such a distance. To do this, zone_reclaim_mode is set if the distance between any two nodes on the system is greather than this distance. This, however, ends up causing the page allocator to reclaim from every zone regardless of its affinity. What we really want is to reclaim only from zones that are closer than RECLAIM_DISTANCE. This patch adds a nodemask to each node that represents the set of nodes that are within this distance. During the zone iteration, if the bit for a zone's node is set for the local node, then reclaim is attempted; otherwise, the zone is skipped. [akpm@linux-foundation.org: fix CONFIG_NUMA=n build] Signed-off-by: David Rientjes Cc: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dbb5386..9b8e624 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1799,6 +1799,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); } +static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) +{ + return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); +} + +static void __paginginit init_zone_allows_reclaim(int nid) +{ + int i; + + for_each_online_node(i) + if (node_distance(nid, i) <= RECLAIM_DISTANCE) { + node_set(i, NODE_DATA(nid)->reclaim_nodes); + zone_reclaim_mode = 1; + } +} + #else /* CONFIG_NUMA */ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) @@ -1819,6 +1835,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) static void zlc_clear_zones_full(struct zonelist *zonelist) { } + +static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) +{ + return true; +} + +static inline void init_zone_allows_reclaim(int nid) +{ +} #endif /* CONFIG_NUMA */ /* @@ -1903,7 +1928,8 @@ zonelist_scan: did_zlc_setup = 1; } - if (zone_reclaim_mode == 0) + if (zone_reclaim_mode == 0 || + !zone_allows_reclaim(preferred_zone, zone)) goto this_zone_full; /* @@ -3364,21 +3390,13 @@ static void build_zonelists(pg_data_t *pgdat) j = 0; while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { - int distance = node_distance(local_node, node); - - /* - * If another node is sufficiently far away then it is better - * to reclaim pages in a zone before going off node. - */ - if (distance > RECLAIM_DISTANCE) - zone_reclaim_mode = 1; - /* * We don't want to pressure a particular node. * So adding penalty to the first node in same * distance group to make it round-robin. */ - if (distance != node_distance(local_node, prev_node)) + if (node_distance(local_node, node) != + node_distance(local_node, prev_node)) node_load[node] = load; prev_node = node; @@ -4552,6 +4570,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; + init_zone_allows_reclaim(nid); calculate_node_totalpages(pgdat, zones_size, zholes_size); alloc_node_mem_map(pgdat); -- cgit v1.1 From 36e4f20af833d1ce196e6a4ade05dc26c44652d1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Oct 2012 16:33:31 -0700 Subject: hugetlb: do not use vma_hugecache_offset() for vma_prio_tree_foreach Commit 0c176d52b0b2 ("mm: hugetlb: fix pgoff computation when unmapping page from vma") fixed pgoff calculation but it has replaced it by vma_hugecache_offset() which is not approapriate for offsets used for vma_prio_tree_foreach() because that one expects index in page units rather than in huge_page_shift. Johannes said: : The resulting index may not be too big, but it can be too small: assume : hpage size of 2M and the address to unmap to be 0x200000. This is regular : page index 512 and hpage index 1. If you have a VMA that maps the file : only starting at the second huge page, that VMAs vm_pgoff will be 512 but : you ask for offset 1 and miss it even though it does map the page of : interest. hugetlb_cow() will try to unmap, miss the vma, and retry the : cow until the allocation succeeds or the skipped vma(s) go away. Signed-off-by: Michal Hocko Acked-by: Hillf Danton Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Cc: David Rientjes Acked-by: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8536741..de5d1dc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2480,7 +2480,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from page cache lookup which is in HPAGE_SIZE units. */ address = address & huge_page_mask(h); - pgoff = vma_hugecache_offset(h, vma, address); + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; mapping = vma->vm_file->f_dentry->d_inode->i_mapping; /* -- cgit v1.1 From 2ec74c3ef2d8c58d71e0e00336fb6b891192155a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 8 Oct 2012 16:33:33 -0700 Subject: mm: move all mmu notifier invocations to be done outside the PT lock In order to allow sleeping during mmu notifier calls, we need to avoid invoking them under the page table spinlock. This patch solves the problem by calling invalidate_page notification after releasing the lock (but before freeing the page itself), or by wrapping the page invalidation with calls to invalidate_range_begin and invalidate_range_end. To prevent accidental changes to the invalidate_range_end arguments after the call to invalidate_range_begin, the patch introduces a convention of saving the arguments in consistently named locals: unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ ... mmun_start = ... mmun_end = ... mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ... mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); The patch changes code to use this convention for all calls to mmu_notifier_invalidate_range_start/end, except those where the calls are close enough so that anyone who glances at the code can see the values aren't changing. This patchset is a preliminary step towards on-demand paging design to be added to the RDMA stack. Why do we want on-demand paging for Infiniband? Applications register memory with an RDMA adapter using system calls, and subsequently post IO operations that refer to the corresponding virtual addresses directly to HW. Until now, this was achieved by pinning the memory during the registration calls. The goal of on demand paging is to avoid pinning the pages of registered memory regions (MRs). This will allow users the same flexibility they get when swapping any other part of their processes address spaces. Instead of requiring the entire MR to fit in physical memory, we can allow the MR to be larger, and only fit the current working set in physical memory. Why should anyone care? What problems are users currently experiencing? This can make programming with RDMA much simpler. Today, developers that are working with more data than their RAM can hold need either to deregister and reregister memory regions throughout their process's life, or keep a single memory region and copy the data to it. On demand paging will allow these developers to register a single MR at the beginning of their process's life, and let the operating system manage which pages needs to be fetched at a given time. In the future, we might be able to provide a single memory access key for each process that would provide the entire process's address as one large memory region, and the developers wouldn't need to register memory regions at all. Is there any prospect that any other subsystems will utilise these infrastructural changes? If so, which and how, etc? As for other subsystems, I understand that XPMEM wanted to sleep in MMU notifiers, as Christoph Lameter wrote at http://lkml.indiana.edu/hypermail/linux/kernel/0802.1/0460.html and perhaps Andrea knows about other use cases. Scheduling in mmu notifications is required since we need to sync the hardware with the secondary page tables change. A TLB flush of an IO device is inherently slower than a CPU TLB flush, so our design works by sending the invalidation request to the device, and waiting for an interrupt before exiting the mmu notifier handler. Avi said: kvm may be a buyer. kvm::mmu_lock, which serializes guest page faults, also protects long operations such as destroying large ranges. It would be good to convert it into a spinlock, but as it is used inside mmu notifiers, this cannot be done. (there are alternatives, such as keeping the spinlock and using a generation counter to do the teardown in O(1), which is what the "may" is doing up there). [akpm@linux-foundation.orgpossible speed tweak in hugetlb_cow(), cleanups] Signed-off-by: Andrea Arcangeli Signed-off-by: Sagi Grimberg Signed-off-by: Haggai Eran Cc: Peter Zijlstra Cc: Xiao Guangrong Cc: Or Gerlitz Cc: Haggai Eran Cc: Shachar Raindel Cc: Liran Liss Cc: Christoph Lameter Cc: Avi Kivity Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap_xip.c | 4 +++- mm/huge_memory.c | 42 ++++++++++++++++++++++++++++++++++++------ mm/hugetlb.c | 21 +++++++++++++-------- mm/memory.c | 28 +++++++++++++++++++--------- mm/mremap.c | 8 ++++++-- mm/rmap.c | 18 +++++++++++++++--- 6 files changed, 92 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index a52daee..a912da6 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -192,11 +192,13 @@ retry: if (pte) { /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); page_remove_rmap(page); dec_mm_counter(mm, MM_FILEPAGES); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); + /* must invalidate_page _before_ freeing the page */ + mmu_notifier_invalidate_page(mm, address); page_cache_release(page); } } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0e77409..08a943b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -787,6 +787,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pmd_t _pmd; int ret = 0, i; struct page **pages; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, GFP_KERNEL); @@ -823,12 +825,16 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, cond_resched(); } + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON(!PageHead(page)); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(mm); @@ -851,6 +857,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, page_remove_rmap(page); spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + ret |= VM_FAULT_WRITE; put_page(page); @@ -859,6 +867,7 @@ out: out_free_pages: spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { mem_cgroup_uncharge_page(pages[i]); @@ -875,6 +884,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, int ret = 0; struct page *page, *new_page; unsigned long haddr; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ VM_BUG_ON(!vma->anon_vma); spin_lock(&mm->page_table_lock); @@ -925,20 +936,24 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + spin_lock(&mm->page_table_lock); put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(&mm->page_table_lock); mem_cgroup_uncharge_page(new_page); put_page(new_page); - goto out; + goto out_mn; } else { pmd_t entry; VM_BUG_ON(!PageHead(page)); entry = mk_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); entry = pmd_mkhuge(entry); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache(vma, address, pmd); @@ -946,10 +961,14 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, put_page(page); ret |= VM_FAULT_WRITE; } -out_unlock: spin_unlock(&mm->page_table_lock); +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return ret; +out_unlock: + spin_unlock(&mm->page_table_lock); + return ret; } struct page *follow_trans_huge_pmd(struct mm_struct *mm, @@ -1162,7 +1181,11 @@ static int __split_huge_page_splitting(struct page *page, struct mm_struct *mm = vma->vm_mm; pmd_t *pmd; int ret = 0; + /* For mmu_notifiers */ + const unsigned long mmun_start = address; + const unsigned long mmun_end = address + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); @@ -1174,10 +1197,11 @@ static int __split_huge_page_splitting(struct page *page, * and it won't wait on the anon_vma->root->mutex to * serialize against split_huge_page*. */ - pmdp_splitting_flush_notify(vma, address, pmd); + pmdp_splitting_flush(vma, address, pmd); ret = 1; } spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return ret; } @@ -1898,6 +1922,8 @@ static void collapse_huge_page(struct mm_struct *mm, spinlock_t *ptl; int isolated; unsigned long hstart, hend; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1952,6 +1978,9 @@ static void collapse_huge_page(struct mm_struct *mm, pte = pte_offset_map(pmd, address); ptl = pte_lockptr(mm, pmd); + mmun_start = address; + mmun_end = address + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes @@ -1959,8 +1988,9 @@ static void collapse_huge_page(struct mm_struct *mm, * huge and small TLB entries for the same virtual address * to avoid the risk of CPU bugs in that area. */ - _pmd = pmdp_clear_flush_notify(vma, address, pmd); + _pmd = pmdp_clear_flush(vma, address, pmd); spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); spin_lock(ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de5d1dc..993f7c1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2355,13 +2355,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + const unsigned long mmun_start = start; /* For mmu_notifiers */ + const unsigned long mmun_end = end; /* For mmu_notifiers */ WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); tlb_start_vma(tlb, vma); - mmu_notifier_invalidate_range_start(mm, start, end); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); again: spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { @@ -2425,7 +2427,7 @@ again: if (address < end && !ref_page) goto again; } - mmu_notifier_invalidate_range_end(mm, start, end); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); tlb_end_vma(tlb, vma); } @@ -2525,6 +2527,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, struct page *old_page, *new_page; int avoidcopy; int outside_reserve = 0; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ old_page = pte_page(pte); @@ -2611,6 +2615,9 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); + mmun_start = address & huge_page_mask(h); + mmun_end = mmun_start + huge_page_size(h); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* * Retake the page_table_lock to check for racing updates * before the page tables are altered @@ -2619,9 +2626,6 @@ retry_avoidcopy: ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ - mmu_notifier_invalidate_range_start(mm, - address & huge_page_mask(h), - (address & huge_page_mask(h)) + huge_page_size(h)); huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); @@ -2629,10 +2633,11 @@ retry_avoidcopy: hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; - mmu_notifier_invalidate_range_end(mm, - address & huge_page_mask(h), - (address & huge_page_mask(h)) + huge_page_size(h)); } + spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); page_cache_release(new_page); page_cache_release(old_page); return 0; diff --git a/mm/memory.c b/mm/memory.c index 5f5d1f0..b03a4a2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, add_taint(TAINT_BAD_PAGE); } -static inline int is_cow_mapping(vm_flags_t flags) +static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } @@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long next; unsigned long addr = vma->vm_start; unsigned long end = vma->vm_end; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + bool is_cow; int ret; /* @@ -1072,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. */ - if (is_cow_mapping(vma->vm_flags)) - mmu_notifier_invalidate_range_start(src_mm, addr, end); + is_cow = is_cow_mapping(vma->vm_flags); + mmun_start = addr; + mmun_end = end; + if (is_cow) + mmu_notifier_invalidate_range_start(src_mm, mmun_start, + mmun_end); ret = 0; dst_pgd = pgd_offset(dst_mm, addr); @@ -1089,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, } } while (dst_pgd++, src_pgd++, addr = next, addr != end); - if (is_cow_mapping(vma->vm_flags)) - mmu_notifier_invalidate_range_end(src_mm, - vma->vm_start, end); + if (is_cow) + mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); return ret; } @@ -2516,7 +2522,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl, pte_t orig_pte) __releases(ptl) { - struct page *old_page, *new_page; + struct page *old_page, *new_page = NULL; pte_t entry; int ret = 0; int page_mkwrite = 0; @@ -2760,10 +2766,14 @@ gotten: } else mem_cgroup_uncharge_page(new_page); - if (new_page) - page_cache_release(new_page); unlock: pte_unmap_unlock(page_table, ptl); + if (new_page) { + if (new_page == old_page) + /* cow happened, notify before releasing old_page */ + mmu_notifier_invalidate_page(mm, address); + page_cache_release(new_page); + } if (old_page) { /* * Don't let another task, with possibly unlocked vma, diff --git a/mm/mremap.c b/mm/mremap.c index 3b639a4..1b61c2d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -149,11 +149,15 @@ unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long extent, next, old_end; pmd_t *old_pmd, *new_pmd; bool need_flush = false; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); - mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); + mmun_start = old_addr; + mmun_end = old_end; + mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); for (; old_addr < old_end; old_addr += extent, new_addr += extent) { cond_resched(); @@ -197,7 +201,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (likely(need_flush)) flush_tlb_range(vma, old_end-len, old_addr); - mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); return len + old_addr - old_end; /* how much done */ } diff --git a/mm/rmap.c b/mm/rmap.c index bf03149..7df7984 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -884,7 +884,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, pte_t entry; flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush_notify(vma, address, pte); + entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(mm, address, pte, entry); @@ -892,6 +892,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, } pte_unmap_unlock(pte, ptl); + + if (ret) + mmu_notifier_invalidate_page(mm, address); out: return ret; } @@ -1212,7 +1215,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) @@ -1274,6 +1277,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); + if (ret != SWAP_FAIL) + mmu_notifier_invalidate_page(mm, address); out: return ret; @@ -1338,6 +1343,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, spinlock_t *ptl; struct page *page; unsigned long address; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ unsigned long end; int ret = SWAP_AGAIN; int locked_vma = 0; @@ -1361,6 +1368,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, if (!pmd_present(*pmd)) return ret; + mmun_start = address; + mmun_end = end; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + /* * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, * keep the sem while scanning the cluster for mlocking pages. @@ -1394,7 +1405,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* If nonlinear, store the file page offset in the pte. */ if (page->index != linear_page_index(vma, address)) @@ -1410,6 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (locked_vma) up_read(&vma->vm_mm->mmap_sem); return ret; -- cgit v1.1 From 6bdb913f0a70a4dfb7f066fb15e2d6f960701d00 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Mon, 8 Oct 2012 16:33:35 -0700 Subject: mm: wrap calls to set_pte_at_notify with invalidate_range_start and invalidate_range_end In order to allow sleeping during invalidate_page mmu notifier calls, we need to avoid calling when holding the PT lock. In addition to its direct calls, invalidate_page can also be called as a substitute for a change_pte call, in case the notifier client hasn't implemented change_pte. This patch drops the invalidate_page call from change_pte, and instead wraps all calls to change_pte with invalidate_range_start and invalidate_range_end calls. Note that change_pte still cannot sleep after this patch, and that clients implementing change_pte should not take action on it in case the number of outstanding invalidate_range_start calls is larger than one, otherwise they might miss a later invalidation. Signed-off-by: Haggai Eran Cc: Andrea Arcangeli Cc: Sagi Grimberg Cc: Peter Zijlstra Cc: Xiao Guangrong Cc: Or Gerlitz Cc: Haggai Eran Cc: Shachar Raindel Cc: Liran Liss Cc: Christoph Lameter Cc: Avi Kivity Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 21 +++++++++++++++++++-- mm/memory.c | 18 ++++++++++++------ mm/mmu_notifier.c | 6 ------ 3 files changed, 31 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index ecbc090..ae539f0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, spinlock_t *ptl; int swapped; int err = -EFAULT; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ addr = page_address_in_vma(page, vma); if (addr == -EFAULT) goto out; BUG_ON(PageTransCompound(page)); + + mmun_start = addr; + mmun_end = addr + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) - goto out; + goto out_mn; if (pte_write(*ptep) || pte_dirty(*ptep)) { pte_t entry; @@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, out_unlock: pte_unmap_unlock(ptep, ptl); +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return err; } @@ -776,6 +785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, spinlock_t *ptl; unsigned long addr; int err = -EFAULT; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ addr = page_address_in_vma(page, vma); if (addr == -EFAULT) @@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd_present(*pmd)) goto out; + mmun_start = addr; + mmun_end = addr + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte_same(*ptep, orig_pte)) { pte_unmap_unlock(ptep, ptl); - goto out; + goto out_mn; } get_page(kpage); @@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pte_unmap_unlock(ptep, ptl); err = 0; +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return err; } diff --git a/mm/memory.c b/mm/memory.c index b03a4a2..01ec048 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2527,6 +2527,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, int ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + bool mmun_called = false; /* For mmu_notifiers */ old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { @@ -2704,6 +2707,11 @@ gotten: if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; + mmun_start = address & PAGE_MASK; + mmun_end = (address & PAGE_MASK) + PAGE_SIZE; + mmun_called = true; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + /* * Re-check the pte - we dropped the lock */ @@ -2766,14 +2774,12 @@ gotten: } else mem_cgroup_uncharge_page(new_page); + if (new_page) + page_cache_release(new_page); unlock: pte_unmap_unlock(page_table, ptl); - if (new_page) { - if (new_page == old_page) - /* cow happened, notify before releasing old_page */ - mmu_notifier_invalidate_page(mm, address); - page_cache_release(new_page); - } + if (mmun_called) + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* * Don't let another task, with possibly unlocked vma, diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index c297142..479a1e7 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -137,12 +137,6 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->change_pte) mn->ops->change_pte(mn, mm, address, pte); - /* - * Some drivers don't have change_pte, - * so we must call invalidate_page in that case. - */ - else if (mn->ops->invalidate_page) - mn->ops->invalidate_page(mn, mm, address); } srcu_read_unlock(&srcu, id); } -- cgit v1.1 From 082708072a4250f5c4dbc62065e7af93f5e45646 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:33:38 -0700 Subject: mm: revert 0def08e3 ("mm/mempolicy.c: check return code of check_range") Revert commit 0def08e3acc2 because check_range can't fail in migrate_to_node with considering current usecases. Quote from Johannes : I think it makes sense to revert. Not because of the semantics, but I : just don't see how check_range() could even fail for this callsite: : : 1. we pass mm->mmap->vm_start in there, so we should not fail due to : find_vma() : : 2. we pass MPOL_MF_DISCONTIG_OK, so the discontig checks do not apply : and so can not fail : : 3. we pass MPOL_MF_MOVE | MPOL_MF_MOVE_ALL, the page table loops will : continue until addr == end, so we never fail with -EIO And I added a new VM_BUG_ON for checking migrate_to_node's future usecase which might pass to MPOL_MF_STRICT. Suggested-by: Johannes Weiner Signed-off-by: Minchan Kim Acked-by: KOSAKI Motohiro Cc: Mel Gorman Cc: Christoph Lameter Cc: David Rientjes Cc: Vasiliy Kulikov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3d64b36..0b78fb9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -946,15 +946,18 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodemask_t nmask; LIST_HEAD(pagelist); int err = 0; - struct vm_area_struct *vma; nodes_clear(nmask); node_set(source, nmask); - vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + /* + * This does not "check" the range but isolates all pages that + * need migration. Between passing in the full user address + * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. + */ + VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); - if (IS_ERR(vma)) - return PTR_ERR(vma); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, -- cgit v1.1 From 5a883813845a2bb5ed2bd8c9240736c0740b156f Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:33:39 -0700 Subject: memory-hotplug: fix zone stat mismatch During memory-hotplug, I found NR_ISOLATED_[ANON|FILE] are increasing, causing the kernel to hang. When the system doesn't have enough free pages, it enters reclaim but never reclaim any pages due to too_many_isolated()==true and loops forever. The cause is that when we do memory-hotadd after memory-remove, __zone_pcp_update() clears a zone's ZONE_STAT_ITEMS in setup_pageset() although the vm_stat_diff of all CPUs still have values. In addtion, when we offline all pages of the zone, we reset them in zone_pcp_reset without draining so we loss some zone stat item. Reviewed-by: Wen Congyang Signed-off-by: Minchan Kim Cc: Kamezawa Hiroyuki Cc: Yasuaki Ishimatsu Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++++ mm/vmstat.c | 12 ++++++++++++ 2 files changed, 19 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9b8e624..5485f0e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5916,6 +5916,7 @@ static int __meminit __zone_pcp_update(void *data) local_irq_save(flags); if (pcp->count > 0) free_pcppages_bulk(zone, pcp->count, pcp); + drain_zonestat(zone, pset); setup_pageset(pset, batch); local_irq_restore(flags); } @@ -5932,10 +5933,16 @@ void __meminit zone_pcp_update(struct zone *zone) void zone_pcp_reset(struct zone *zone) { unsigned long flags; + int cpu; + struct per_cpu_pageset *pset; /* avoid races with drain_pages() */ local_irq_save(flags); if (zone->pageset != &boot_pageset) { + for_each_online_cpu(cpu) { + pset = per_cpu_ptr(zone->pageset, cpu); + drain_zonestat(zone, pset); + } free_percpu(zone->pageset); zone->pageset = &boot_pageset; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 05e3a99..2f11309 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu) atomic_long_add(global_diff[i], &vm_stat[i]); } +void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) +{ + int i; + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (pset->vm_stat_diff[i]) { + int v = pset->vm_stat_diff[i]; + pset->vm_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_stat[i]); + atomic_long_add(v, &vm_stat[i]); + } +} #endif #ifdef CONFIG_NUMA -- cgit v1.1 From 8befedfe678ff84d5bc356be4f3cb1fd84959d02 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Oct 2012 16:33:42 -0700 Subject: mm: remove unevictable_pgs_mlockfreed Simply remove UNEVICTABLE_MLOCKFREED and unevictable_pgs_mlockfreed line from /proc/vmstat: Johannes and Mel point out that it was very unlikely to have been used by any tool, and of course we can restore it easily enough if that turns out to be wrong. Signed-off-by: Hugh Dickins Cc: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Cc: Michel Lespinasse Cc: Ying Han Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 2f11309..c737057 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -794,7 +794,6 @@ const char * const vmstat_text[] = { "unevictable_pgs_munlocked", "unevictable_pgs_cleared", "unevictable_pgs_stranded", - "unevictable_pgs_mlockfreed", /* no longer useful: always zero */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE "thp_fault_alloc", -- cgit v1.1 From c462f179e4d273f0da0e7cf302e29b0edf43844e Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Mon, 8 Oct 2012 16:33:43 -0700 Subject: mm/memory.c: fix typo in comment Signed-off-by: Robert P. J. Day Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 01ec048..45bb6d2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2148,7 +2148,7 @@ out: * @addr: target user address of this page * @pfn: source kernel pfn * - * Similar to vm_inert_page, this allows drivers to insert individual pages + * Similar to vm_insert_page, this allows drivers to insert individual pages * they've allocated into a user vma. Same comments apply. * * This function should only be called from a vm_ops->fault handler, and -- cgit v1.1 From e46a28790e594c0876d1a84270926abf75460f61 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:33:48 -0700 Subject: CMA: migrate mlocked pages Presently CMA cannot migrate mlocked pages so it ends up failing to allocate contiguous memory space. This patch makes mlocked pages be migrated out. Of course, it can affect realtime processes but in CMA usecase, contiguous memory allocation failing is far worse than access latency to an mlocked page being variable while CMA is running. If someone wants to make the system realtime, he shouldn't enable CMA because stalls can still happen at random times. [akpm@linux-foundation.org: tweak comment text, per Mel] Signed-off-by: Minchan Kim Acked-by: Mel Gorman Cc: Michal Nazarewicz Cc: Bartlomiej Zolnierkiewicz Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 8 ++++++-- mm/internal.h | 2 +- mm/page_alloc.c | 2 +- mm/vmscan.c | 4 ++-- 4 files changed, 10 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index d8187f9..2c4ce17 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -461,6 +461,7 @@ static bool too_many_isolated(struct zone *zone) * @cc: Compaction control structure. * @low_pfn: The first PFN of the range. * @end_pfn: The one-past-the-last PFN of the range. + * @unevictable: true if it allows to isolate unevictable pages * * Isolate all pages that can be migrated from the range specified by * [low_pfn, end_pfn). Returns zero if there is a fatal signal @@ -476,7 +477,7 @@ static bool too_many_isolated(struct zone *zone) */ unsigned long isolate_migratepages_range(struct zone *zone, struct compact_control *cc, - unsigned long low_pfn, unsigned long end_pfn) + unsigned long low_pfn, unsigned long end_pfn, bool unevictable) { unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; @@ -602,6 +603,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (!cc->sync) mode |= ISOLATE_ASYNC_MIGRATE; + if (unevictable) + mode |= ISOLATE_UNEVICTABLE; + lruvec = mem_cgroup_page_lruvec(page, zone); /* Try isolate the page */ @@ -807,7 +811,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } /* Perform the isolation */ - low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); + low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); if (!low_pfn || cc->contended) return ISOLATE_ABORT; diff --git a/mm/internal.h b/mm/internal.h index 4dc93e2..f5f295f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -138,7 +138,7 @@ isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); unsigned long isolate_migratepages_range(struct zone *zone, struct compact_control *cc, - unsigned long low_pfn, unsigned long end_pfn); + unsigned long low_pfn, unsigned long end_pfn, bool unevictable); #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5485f0e..fd86c47 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5690,7 +5690,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, if (list_empty(&cc->migratepages)) { cc->nr_migratepages = 0; pfn = isolate_migratepages_range(cc->zone, cc, - pfn, end); + pfn, end, true); if (!pfn) { ret = -EINTR; break; diff --git a/mm/vmscan.c b/mm/vmscan.c index 8b62730..2624edc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1009,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (!PageLRU(page)) return ret; - /* Do not give back unevictable pages for compaction */ - if (PageUnevictable(page)) + /* Compaction should not handle unevictable pages but CMA can do so */ + if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) return ret; ret = -EBUSY; -- cgit v1.1 From beb51eaa88238daba698ad837222ad277d440b6d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 8 Oct 2012 16:33:51 -0700 Subject: cma: decrease cc.nr_migratepages after reclaiming pagelist reclaim_clean_pages_from_list() reclaims clean pages before migration so cc.nr_migratepages should be updated. Currently, there is no problem but it can be wrong if we try to use the value in future. Signed-off-by: Minchan Kim Acked-by: Mel Gorman Cc: Michal Nazarewicz Cc: Bartlomiej Zolnierkiewicz Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd86c47..bb90971 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5674,7 +5674,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. */ - + unsigned long nr_reclaimed; unsigned long pfn = start; unsigned int tries = 0; int ret = 0; @@ -5701,7 +5701,9 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, break; } - reclaim_clean_pages_from_list(cc->zone, &cc->migratepages); + nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, + &cc->migratepages); + cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, -- cgit v1.1 From c22331166b49b94b51424bb7fa1f83e09ad82734 Mon Sep 17 00:00:00 2001 From: Raghavendra D Prabhu Date: Mon, 8 Oct 2012 16:33:55 -0700 Subject: mm: avoid section mismatch warning for memblock_type_name Following section mismatch warning is thrown during build; WARNING: vmlinux.o(.text+0x32408f): Section mismatch in reference from the function memblock_type_name() to the variable .meminit.data:memblock The function memblock_type_name() references the variable __meminitdata memblock. This is often because memblock_type_name lacks a __meminitdata annotation or the annotation of memblock is wrong. This is because memblock_type_name makes reference to memblock variable with attribute __meminitdata. Hence, the warning (even if the function is inline). [akpm@linux-foundation.org: remove inline] Signed-off-by: Raghavendra D Prabhu Cc: Tejun Heo Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index d809f17..931eef1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; /* inline so we don't get a warning when pr_debug is compiled out */ -static inline const char *memblock_type_name(struct memblock_type *type) +static __init_memblock const char * +memblock_type_name(struct memblock_type *type) { if (type == &memblock.memory) return "memory"; -- cgit v1.1 From a16cee10c7ab994546ed98d9abfd4de74050124a Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Mon, 8 Oct 2012 16:33:58 -0700 Subject: memory-hotplug: preparation to notify memory block's state at memory hot remove remove_memory() is called in two cases: 1. echo offline >/sys/devices/system/memory/memoryXX/state 2. hot remove a memory device In the 1st case, the memory block's state is changed and the notification that memory block's state changed is sent to userland after calling remove_memory(). So user can notice memory block is changed. But in the 2nd case, the memory block's state is not changed and the notification is not also sent to userspcae even if calling remove_memory(). So user cannot notice memory block is changed. For adding the notification at memory hot remove, the patch just prepare as follows: 1st case uses offline_pages() for offlining memory. 2nd case uses remove_memory() for offlining memory and changing memory block's state and notifing the information. The patch does not implement notification to remove_memory(). Signed-off-by: Wen Congyang Signed-off-by: Yasuaki Ishimatsu Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Christoph Lameter Cc: Minchan Kim Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ce690a9..dfc0a61 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -874,7 +874,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } -static int __ref offline_pages(unsigned long start_pfn, +static int __ref __offline_pages(unsigned long start_pfn, unsigned long end_pfn, unsigned long timeout) { unsigned long pfn, nr_pages, expire; @@ -1007,15 +1007,24 @@ out: return ret; } +int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +{ + return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); +} + int remove_memory(u64 start, u64 size) { unsigned long start_pfn, end_pfn; start_pfn = PFN_DOWN(start); end_pfn = start_pfn + PFN_DOWN(size); - return offline_pages(start_pfn, end_pfn, 120 * HZ); + return __offline_pages(start_pfn, end_pfn, 120 * HZ); } #else +int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +{ + return -EINVAL; +} int remove_memory(u64 start, u64 size) { return -EINVAL; -- cgit v1.1 From e90bdb7f52f94204c78fb40b0804645defdebd71 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Mon, 8 Oct 2012 16:34:01 -0700 Subject: memory-hotplug: update memory block's state and notify userspace remove_memory() will be called when hot removing a memory device. But even if offlining memory, we cannot notice it. So the patch updates the memory block's state and sends notification to userspace. Additionally, the memory device may contain more than one memory block. If the memory block has been offlined, __offline_pages() will fail. So we should try to offline one memory block at a time. Thus remove_memory() also check each memory block's state. So there is no need to check the memory block's state before calling remove_memory(). Signed-off-by: Wen Congyang Signed-off-by: Yasuaki Ishimatsu Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Christoph Lameter Cc: Minchan Kim Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index dfc0a61..7d07974 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1014,11 +1014,42 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) int remove_memory(u64 start, u64 size) { + struct memory_block *mem = NULL; + struct mem_section *section; unsigned long start_pfn, end_pfn; + unsigned long pfn, section_nr; + int ret; start_pfn = PFN_DOWN(start); end_pfn = start_pfn + PFN_DOWN(size); - return __offline_pages(start_pfn, end_pfn, 120 * HZ); + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + section_nr = pfn_to_section_nr(pfn); + if (!present_section_nr(section_nr)) + continue; + + section = __nr_to_section(section_nr); + /* same memblock? */ + if (mem) + if ((section_nr >= mem->start_section_nr) && + (section_nr <= mem->end_section_nr)) + continue; + + mem = find_memory_block_hinted(section, mem); + if (!mem) + continue; + + ret = offline_memory_block(mem); + if (ret) { + kobject_put(&mem->dev.kobj); + return ret; + } + } + + if (mem) + kobject_put(&mem->dev.kobj); + + return 0; } #else int offline_pages(unsigned long start_pfn, unsigned long nr_pages) -- cgit v1.1 From b676b293fb48672904ee1b9828cb50b4eed01717 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 8 Oct 2012 16:34:03 -0700 Subject: mm, thp: fix mapped pages avoiding unevictable list on mlock When a transparent hugepage is mapped and it is included in an mlock() range, follow_page() incorrectly avoids setting the page's mlock bit and moving it to the unevictable lru. This is evident if you try to mlock(), munlock(), and then mlock() a range again. Currently: #define MAP_SIZE (4 << 30) /* 4GB */ void *ptr = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); mlock(ptr, MAP_SIZE); $ grep -E "Unevictable|Inactive\(anon" /proc/meminfo Inactive(anon): 6304 kB Unevictable: 4213924 kB munlock(ptr, MAP_SIZE); Inactive(anon): 4186252 kB Unevictable: 19652 kB mlock(ptr, MAP_SIZE); Inactive(anon): 4198556 kB Unevictable: 21684 kB Notice that less than 2MB was added to the unevictable list; this is because these pages in the range are not transparent hugepages since the 4GB range was allocated with mmap() and has no specific alignment. If posix_memalign() were used instead, unevictable would not have grown at all on the second mlock(). The fix is to call mlock_vma_page() so that the mlock bit is set and the page is added to the unevictable list. With this patch: mlock(ptr, MAP_SIZE); Inactive(anon): 4056 kB Unevictable: 4213940 kB munlock(ptr, MAP_SIZE); Inactive(anon): 4198268 kB Unevictable: 19636 kB mlock(ptr, MAP_SIZE); Inactive(anon): 4008 kB Unevictable: 4213940 kB Signed-off-by: David Rientjes Acked-by: Hugh Dickins Reviewed-by: Andrea Arcangeli Cc: Naoya Horiguchi Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 11 ++++++++++- mm/memory.c | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 08a943b..3a8d6b7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -971,11 +971,12 @@ out_unlock: return ret; } -struct page *follow_trans_huge_pmd(struct mm_struct *mm, +struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags) { + struct mm_struct *mm = vma->vm_mm; struct page *page = NULL; assert_spin_locked(&mm->page_table_lock); @@ -998,6 +999,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); } + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + if (page->mapping && trylock_page(page)) { + lru_add_drain(); + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } + } page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON(!PageCompound(page)); if (flags & FOLL_GET) diff --git a/mm/memory.c b/mm/memory.c index 45bb6d2..fb135ba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1528,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, spin_unlock(&mm->page_table_lock); wait_split_huge_page(vma->anon_vma, pmd); } else { - page = follow_trans_huge_pmd(mm, address, + page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(&mm->page_table_lock); goto out; -- cgit v1.1 From 8449d21fb49e9824e2736c5febd6b5d287cd2ba1 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 8 Oct 2012 16:34:06 -0700 Subject: mm, thp: fix mlock statistics NR_MLOCK is only accounted in single page units: there's no logic to handle transparent hugepages. This patch checks the appropriate number of pages to adjust the statistics by so that the correct amount of memory is reflected. Currently: $ grep Mlocked /proc/meminfo Mlocked: 19636 kB #define MAP_SIZE (4 << 30) /* 4GB */ void *ptr = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); mlock(ptr, MAP_SIZE); $ grep Mlocked /proc/meminfo Mlocked: 29844 kB munlock(ptr, MAP_SIZE); $ grep Mlocked /proc/meminfo Mlocked: 19636 kB And with this patch: $ grep Mlock /proc/meminfo Mlocked: 19636 kB mlock(ptr, MAP_SIZE); $ grep Mlock /proc/meminfo Mlocked: 4213664 kB munlock(ptr, MAP_SIZE); $ grep Mlock /proc/meminfo Mlocked: 19636 kB Signed-off-by: David Rientjes Reported-by: Hugh Dickens Acked-by: Hugh Dickins Reviewed-by: Andrea Arcangeli Cc: Naoya Horiguchi Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Reviewed-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 ++- mm/mlock.c | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index f5f295f..a4fa284 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -180,7 +180,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, return 0; if (!TestSetPageMlocked(page)) { - inc_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGMLOCKED); } return 1; diff --git a/mm/mlock.c b/mm/mlock.c index de73215..f0b9ce5 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -56,7 +56,8 @@ void clear_page_mlock(struct page *page) if (!TestClearPageMlocked(page)) return; - dec_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + -hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGCLEARED); if (!isolate_lru_page(page)) { putback_lru_page(page); @@ -78,7 +79,8 @@ void mlock_vma_page(struct page *page) BUG_ON(!PageLocked(page)); if (!TestSetPageMlocked(page)) { - inc_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGMLOCKED); if (!isolate_lru_page(page)) putback_lru_page(page); @@ -105,7 +107,8 @@ void munlock_vma_page(struct page *page) BUG_ON(!PageLocked(page)); if (TestClearPageMlocked(page)) { - dec_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + -hpage_nr_pages(page)); if (!isolate_lru_page(page)) { int ret = SWAP_AGAIN; -- cgit v1.1 From 45ec16908e84e00d847e9063231fb3a75e6366bc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 8 Oct 2012 16:34:09 -0700 Subject: mm: use %pK for /proc/vmallocinfo In the paranoid case of sysctl kernel.kptr_restrict=2, mask the kernel virtual addresses in /proc/vmallocinfo too. Signed-off-by: Kees Cook Reported-by: Brad Spengler Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8de7046..78e0830 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2571,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p) { struct vm_struct *v = p; - seq_printf(m, "0x%p-0x%p %7ld", + seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); if (v->caller) -- cgit v1.1 From 7795912c257bc068445f1db429c94d6b4b6ee604 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 8 Oct 2012 16:34:11 -0700 Subject: mm: document PageHuge somewhat Acked-by: David Rientjes Cc: Mel Gorman Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 993f7c1..59a0059 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) } } +/* + * PageHuge() only returns true for hugetlbfs pages, but not for normal or + * transparent huge pages. See the PageTransHuge() documentation for more + * details. + */ int PageHuge(struct page *page) { compound_page_dtor *dtor; -- cgit v1.1 From d760afd4d2570653891f94e13b848e97150dc5a6 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Mon, 8 Oct 2012 16:34:14 -0700 Subject: memory-hotplug: suppress "Trying to free nonexistent resource " warning When our x86 box calls __remove_pages(), release_mem_region() shows many warnings. And x86 box cannot unregister iomem_resource. "Trying to free nonexistent resource " release_mem_region() has been changed to be called in each PAGES_PER_SECTION by commit de7f0cba9678 ("memory hotplug: release memory regions in PAGES_PER_SECTION chunks"). Because powerpc registers iomem_resource in each PAGES_PER_SECTION chunk. But when I hot add memory on x86 box, iomem_resource is register in each _CRS not PAGES_PER_SECTION chunk. So x86 box unregisters iomem_resource. The patch fixes the problem. Signed-off-by: Yasuaki Ishimatsu Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christoph Lameter Cc: Minchan Kim Cc: Andrew Morton Cc: KOSAKI Motohiro Cc: Wen Congyang Cc: Dave Hansen Cc: Nathan Fontenot Cc: Badari Pulavarty Cc: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7d07974..56b758a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -369,11 +369,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); + release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); + sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; - release_mem_region(pfn << PAGE_SHIFT, - PAGES_PER_SECTION << PAGE_SHIFT); ret = __remove_section(zone, __pfn_to_section(pfn)); if (ret) break; -- cgit v1.1 From b113da65785d5f3f9ff1451ec0fe43d6d76da25b Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 8 Oct 2012 16:34:25 -0700 Subject: mm: Add and use update_mmu_cache_pmd() in transparent huge page code. The transparent huge page code passes a PMD pointer in as the third argument of update_mmu_cache(), which expects a PTE pointer. This never got noticed because X86 implements update_mmu_cache() as a macro and thus we don't get any type checking, and X86 is the only architecture which supports transparent huge pages currently. Before other architectures can support transparent huge pages properly we need to add a new interface which will take a PMD pointer as the third argument rather than a PTE pointer. [akpm@linux-foundation.org: implement update_mm_cache_pmd() for s390] Signed-off-by: David S. Miller Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Gerald Schaefer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3a8d6b7..68a3c93 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -900,7 +900,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) - update_mmu_cache(vma, address, pmd); + update_mmu_cache_pmd(vma, address, pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } @@ -956,7 +956,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache(vma, address, pmd); + update_mmu_cache_pmd(vma, address, pmd); page_remove_rmap(page); put_page(page); ret |= VM_FAULT_WRITE; @@ -2041,7 +2041,7 @@ static void collapse_huge_page(struct mm_struct *mm, BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache(vma, address, pmd); + update_mmu_cache_pmd(vma, address, pmd); pgtable_trans_huge_deposit(mm, pgtable); spin_unlock(&mm->page_table_lock); -- cgit v1.1 From f5c8ad47284ca01dafc37da5a72bb9644174d387 Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 8 Oct 2012 16:34:26 -0700 Subject: mm: thp: Use more portable PMD clearing sequenece in zap_huge_pmd(). Invalidation sequences are handled in various ways on various architectures. One way, which sparc64 uses, is to let the set_*_at() functions accumulate pending flushes into a per-cpu array. Then the flush_tlb_range() et al. calls process the pending TLB flushes. In this regime, the __tlb_remove_*tlb_entry() implementations are essentially NOPs. The canonical PTE zap in mm/memory.c is: ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); With a subsequent tlb_flush_mmu() if needed. Mirror this in the THP PMD zapping using: orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); page = pmd_page(orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); And we properly accomodate TLB flush mechanims like the one described above. Signed-off-by: David S. Miller Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Gerald Schaefer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 68a3c93..a863af2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1024,9 +1024,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (__pmd_trans_huge_lock(pmd, vma) == 1) { struct page *page; pgtable_t pgtable; + pmd_t orig_pmd; pgtable = pgtable_trans_huge_withdraw(tlb->mm); - page = pmd_page(*pmd); - pmd_clear(pmd); + orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); + page = pmd_page(orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); page_remove_rmap(page); VM_BUG_ON(page_mapcount(page) < 0); -- cgit v1.1