From 373d4d099761cb1f637bed488ab3871945882273 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 21 Jan 2013 17:17:39 +1030 Subject: taint: add explicit flag to show whether lock dep is still OK. Fix up all callers as they were before, with one change: an unsigned module taints the kernel, but doesn't turn off lockdep. Signed-off-by: Rusty Russell --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index bb1369f..bc8bec7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -716,7 +716,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", (unsigned long)vma->vm_file->f_op->mmap); dump_stack(); - add_taint(TAINT_BAD_PAGE); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } static inline bool is_cow_mapping(vm_flags_t flags) -- cgit v1.1 From af34770e55fd899c96d8d73bdc04dbc956096650 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 22 Feb 2013 16:32:20 -0800 Subject: mm: reduce rmap overhead for ex-KSM page copies created on swap faults When ex-KSM pages are faulted from swap cache, the fault handler is not capable of re-establishing anon_vma-spanning KSM pages. In this case, a copy of the page is created instead, just like during a COW break. These freshly made copies are known to be exclusive to the faulting VMA and there is no reason to go look for this page in parent and sibling processes during rmap operations. Use page_add_new_anon_rmap() for these copies. This also puts them on the proper LRU lists and marks them SwapBacked, so we can get rid of doing this ad-hoc in the KSM copy code. 
Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Acked-by: Hugh Dickins Cc: Simon Jeons Cc: Mel Gorman Cc: Michal Hocko Cc: Satoru Moriya Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index bb1369f..0abd070 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3044,7 +3044,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); - do_page_add_anon_rmap(page, vma, address, exclusive); + if (swapcache) /* ksm created a completely new copy */ + page_add_new_anon_rmap(page, vma, address); + else + do_page_add_anon_rmap(page, vma, address, exclusive); /* It's better to call commit-charge after rmap is established */ mem_cgroup_commit_charge_swapin(page, ptr); -- cgit v1.1 From cea10a19b7972a1954c4a2d05a7de8db48b444fb Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 22 Feb 2013 16:32:44 -0800 Subject: mm: directly use __mlock_vma_pages_range() in find_extend_vma() In find_extend_vma(), we don't need mlock_vma_pages_range() to verify the vma type - we know we're working with a stack. So, we can call directly into __mlock_vma_pages_range(), and remove the last make_pages_present() call site. Note that we don't use mm_populate() here, so we can't release the mmap_sem while allocating new stack pages. This is deemed acceptable, because the stack vmas grow by a bounded number of pages at a time, and these are anon pages so we don't have to read from disk to populate them. 
Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Tested-by: Andy Lutomirski Cc: Greg Ungerer Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 0abd070..7837cea 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3824,30 +3824,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -int make_pages_present(unsigned long addr, unsigned long end) -{ - int ret, len, write; - struct vm_area_struct * vma; - - vma = find_vma(current->mm, addr); - if (!vma) - return -ENOMEM; - /* - * We want to touch writable mappings with a write fault in order - * to break COW, except for shared mappings because these don't COW - * and we would not want to dirty them for nothing. - */ - write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; - BUG_ON(addr >= end); - BUG_ON(end > vma->vm_end); - len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; - ret = get_user_pages(current, current->mm, addr, - len, write, 0, NULL, NULL); - if (ret < 0) - return ret; - return ret == len ? 0 : -EFAULT; -} - #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) -- cgit v1.1 From 75980e97daccfc6babbac7e180ff118537955f5d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Feb 2013 16:34:32 -0800 Subject: mm: fold page->_last_nid into page->flags where possible page->_last_nid fits into page->flags on 64-bit. The unlikely 32-bit NUMA configuration with NUMA Balancing will still need an extra page field. As Peter notes "Completely dropping 32bit support for CONFIG_NUMA_BALANCING would simplify things, but it would also remove the warning if we grow enough 64bit only page-flags to push the last-cpu out." 
[mgorman@suse.de: minor modifications] Signed-off-by: Mel Gorman Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Ingo Molnar Cc: Simon Jeons Cc: Wanpeng Li Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 7837cea..054250e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -69,6 +69,10 @@ #include "internal.h" +#ifdef LAST_NID_NOT_IN_PAGE_FLAGS +#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. +#endif + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; -- cgit v1.1 From cbf86cfe04a66471f23b9e62e5eba4e525f38855 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 22 Feb 2013 16:35:08 -0800 Subject: ksm: remove old stable nodes more thoroughly Switching merge_across_nodes after running KSM is liable to oops on stale nodes still left over from the previous stable tree. It's not something that people will often want to do, but it would be lame to demand a reboot when they're trying to determine which merge_across_nodes setting is best. How can this happen? We only permit switching merge_across_nodes when pages_shared is 0, and usually set run 2 to force that beforehand, which ought to unmerge everything: yet oopses still occur when you then run 1. Three causes: 1. The old stable tree (built according to the inverse merge_across_nodes) has not been fully torn down. A stable node lingers until get_ksm_page() notices that the page it references no longer references it: but the page is not necessarily freed as soon as expected, particularly when swapcache. Fix this with a pass through the old stable tree, applying get_ksm_page() to each of the remaining nodes (most found stale and removed immediately), with forced removal of any left over. 
Unless the page is still mapped: I've not seen that case, it shouldn't occur, but better to WARN_ON_ONCE and EBUSY than BUG. 2. __ksm_enter() has a nice little optimization, to insert the new mm just behind ksmd's cursor, so there's a full pass for it to stabilize (or be removed) before ksmd addresses it. Nice when ksmd is running, but not so nice when we're trying to unmerge all mms: we were missing those mms forked and inserted behind the unmerge cursor. Easily fixed by inserting at the end when KSM_RUN_UNMERGE. 3. It is possible for a KSM page to be faulted back from swapcache into an mm, just after unmerge_and_remove_all_rmap_items() scanned past it. Fix this by copying on fault when KSM_RUN_UNMERGE: but that is private to ksm.c, so dissolve the distinction between ksm_might_need_to_copy() and ksm_does_need_to_copy(), doing it all in the one call into ksm.c. A long outstanding, unrelated bugfix sneaks in with that third fix: ksm_does_need_to_copy() would copy from a !PageUptodate page (implying I/O error when read in from swap) to a page which it then marks Uptodate. Fix this case by not copying, letting do_swap_page() discover the error. 
Signed-off-by: Hugh Dickins Cc: Rik van Riel Cc: Petr Holasek Cc: Andrea Arcangeli Cc: Izik Eidus Cc: Gerald Schaefer Cc: KOSAKI Motohiro Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 054250e..7bd22a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2994,17 +2994,16 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) goto out_page; - if (ksm_might_need_to_copy(page, vma, address)) { - swapcache = page; - page = ksm_does_need_to_copy(page, vma, address); - - if (unlikely(!page)) { - ret = VM_FAULT_OOM; - page = swapcache; - swapcache = NULL; - goto out_page; - } + swapcache = page; + page = ksm_might_need_to_copy(page, vma, address); + if (unlikely(!page)) { + ret = VM_FAULT_OOM; + page = swapcache; + swapcache = NULL; + goto out_page; } + if (page == swapcache) + swapcache = NULL; if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { ret = VM_FAULT_OOM; -- cgit v1.1 From 28a35716d317980ae9bc2ff2f84c33a3cda9e884 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 22 Feb 2013 16:35:55 -0800 Subject: mm: use long type for page counts in mm_populate() and get_user_pages() Use long type for page counts in mm_populate() so as to avoid integer overflow when running the following test code: int main(void) { void *p = mmap(NULL, 0x100000000000, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); printf("p: %p\n", p); mlockall(MCL_CURRENT); printf("done\n"); return 0; } Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c 
index 7bd22a6..bc929db 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1677,15 +1677,15 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas, - int *nonblocking) +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking) { - int i; + long i; unsigned long vm_flags; - if (nr_pages <= 0) + if (!nr_pages) return 0; VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); @@ -1981,9 +1981,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, * * See also get_user_pages_fast, for performance critical applications. */ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, int write, int force, - struct page **pages, struct vm_area_struct **vmas) +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, int write, + int force, struct page **pages, struct vm_area_struct **vmas) { int flags = FOLL_TOUCH; -- cgit v1.1 From 240aadeedc4a89fc44623f8ce4ca46bda73db07e Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 22 Feb 2013 16:35:56 -0800 Subject: mm: accelerate mm_populate() treatment of THP pages This change adds a follow_page_mask function which is equivalent to follow_page, but with an extra page_mask argument. follow_page_mask sets *page_mask to HPAGE_PMD_NR - 1 when it encounters a THP page, and to 0 in other cases. 
__get_user_pages() makes use of this in order to accelerate populating THP ranges - that is, when both the pages and vmas arrays are NULL, we don't need to iterate HPAGE_PMD_NR times to cover a single THP page (and we also avoid taking mm->page_table_lock that many times). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index bc929db..5d2ef12 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1462,10 +1462,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, EXPORT_SYMBOL_GPL(zap_vma_ptes); /** - * follow_page - look up a page descriptor from a user-virtual address + * follow_page_mask - look up a page descriptor from a user-virtual address * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask is set according to the size of the page * * @flags can have FOLL_ flags set, defined in <linux/mm.h> * @@ -1473,8 +1474,9 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes); * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). 
*/ -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int flags) +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) { pgd_t *pgd; pud_t *pud; @@ -1484,6 +1486,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, struct page *page; struct mm_struct *mm = vma->vm_mm; + *page_mask = 0; + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); if (!IS_ERR(page)) { BUG_ON(flags & FOLL_GET); @@ -1530,6 +1534,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(&mm->page_table_lock); + *page_mask = HPAGE_PMD_NR - 1; goto out; } } else @@ -1684,6 +1689,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, { long i; unsigned long vm_flags; + unsigned int page_mask; if (!nr_pages) return 0; @@ -1761,6 +1767,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, get_page(page); } pte_unmap(pte); + page_mask = 0; goto next_page; } @@ -1778,6 +1785,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, do { struct page *page; unsigned int foll_flags = gup_flags; + unsigned int page_increm; /* * If we have a pending SIGKILL, don't keep faulting @@ -1787,7 +1795,8 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? 
i : -ERESTARTSYS; cond_resched(); - while (!(page = follow_page(vma, start, foll_flags))) { + while (!(page = follow_page_mask(vma, start, + foll_flags, &page_mask))) { int ret; unsigned int fault_flags = 0; @@ -1861,13 +1870,19 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, flush_anon_page(vma, page, start); flush_dcache_page(page); + page_mask = 0; } next_page: - if (vmas) + if (vmas) { vmas[i] = vma; - i++; - start += PAGE_SIZE; - nr_pages--; + page_mask = 0; + } + page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + if (page_increm > nr_pages) + page_increm = nr_pages; + i += page_increm; + start += page_increm * PAGE_SIZE; + nr_pages -= page_increm; } while (nr_pages && start < vma->vm_end); } while (nr_pages); return i; -- cgit v1.1 From 5117b3b835f288314a2d4e5512bc1747e3a7c8ed Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 22 Feb 2013 16:36:07 -0800 Subject: mm,ksm: FOLL_MIGRATION do migration_entry_wait In "ksm: remove old stable nodes more thoroughly" I said that I'd never seen its WARN_ON_ONCE(page_mapped(page)). True at the time of writing, but it soon appeared once I tried fuller tests on the whole series. It turned out to be due to the KSM page migration itself: unmerge_and_remove_all_rmap_items() failed to locate and replace all the KSM pages, because of that hiatus in page migration when old pte has been replaced by migration entry, but not yet by new pte. follow_page() finds no page at that instant, but a KSM page reappears shortly after, without a fault. Add FOLL_MIGRATION flag, so follow_page() can do migration_entry_wait() for KSM's break_cow(). I'd have preferred to avoid another flag, and do it every time, in case someone else makes the same easy mistake; but did not find another transgressor (the common get_user_pages() is of course safe), and cannot be sure that every follow_page() caller is prepared to sleep - ia64's xencomm_vtop()? 
Now, THP's wait_split_huge_page() can already sleep there, since anon_vma locking was changed to mutex, but maybe that's somehow excluded. Signed-off-by: Hugh Dickins Cc: Mel Gorman Cc: Petr Holasek Cc: Andrea Arcangeli Cc: Izik Eidus Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 5d2ef12..ec8ba01 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1548,8 +1548,24 @@ split_fallthrough: ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; - if (!pte_present(pte)) - goto no_page; + if (!pte_present(pte)) { + swp_entry_t entry; + /* + * KSM's break_ksm() relies upon recognizing a ksm page + * even while it is being migrated, so for that case we + * need migration_entry_wait(). + */ + if (likely(!(flags & FOLL_MIGRATION))) + goto no_page; + if (pte_none(pte) || pte_file(pte)) + goto no_page; + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto no_page; + pte_unmap_unlock(ptep, ptl); + migration_entry_wait(mm, pmd, address); + goto split_fallthrough; + } if ((flags & FOLL_NUMA) && pte_numa(pte)) goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) -- cgit v1.1 From 56f31801ccdecb420d0d1fd2bf9f337c355214a9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 22 Feb 2013 16:36:10 -0800 Subject: mm: cleanup "swapcache" in do_swap_page I dislike the way in which "swapcache" gets used in do_swap_page(): there is always a page from swapcache there (even if maybe uncached by the time we lock it), but tests are made according to "swapcache". Rework that with "page != swapcache", as has been done in unuse_pte(). 
Signed-off-by: Hugh Dickins Cc: Mel Gorman Cc: Petr Holasek Cc: Andrea Arcangeli Cc: Izik Eidus Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index ec8ba01..705473a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2954,7 +2954,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned int flags, pte_t orig_pte) { spinlock_t *ptl; - struct page *page, *swapcache = NULL; + struct page *page, *swapcache; swp_entry_t entry; pte_t pte; int locked; @@ -3005,9 +3005,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, */ ret = VM_FAULT_HWPOISON; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + swapcache = page; goto out_release; } + swapcache = page; locked = lock_page_or_retry(page, mm, flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); @@ -3025,16 +3027,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) goto out_page; - swapcache = page; page = ksm_might_need_to_copy(page, vma, address); if (unlikely(!page)) { ret = VM_FAULT_OOM; page = swapcache; - swapcache = NULL; goto out_page; } - if (page == swapcache) - swapcache = NULL; if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { ret = VM_FAULT_OOM; @@ -3078,10 +3076,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); - if (swapcache) /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, address); - else + if (page == swapcache) do_page_add_anon_rmap(page, vma, address, exclusive); + else /* ksm created a completely new copy */ + page_add_new_anon_rmap(page, vma, address); /* It's better to call commit-charge after rmap is established */ 
mem_cgroup_commit_charge_swapin(page, ptr); @@ -3089,7 +3087,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); - if (swapcache) { + if (page != swapcache) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check @@ -3122,7 +3120,7 @@ out_page: unlock_page(page); out_release: page_cache_release(page); - if (swapcache) { + if (page != swapcache) { unlock_page(swapcache); page_cache_release(swapcache); } -- cgit v1.1