From f872f5400cc01373d8e29d9c7a5296ccfaf4ccf3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:19 -0800 Subject: mm: Add a vm_special_mapping.fault() method Requiring special mappings to give a list of struct pages is inflexible: it prevents sane use of IO memory in a special mapping, it's inefficient (it requires arch code to initialize a list of struct pages, and it requires the mm core to walk the entire list just to figure out how long it is), and it prevents arch code from doing anything fancy when a special mapping fault occurs. Add a .fault method as an alternative to filling in a .pages array. Looks-OK-to: Andrew Morton Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a26d1677c0bc7e774c33f469451a78ca31e9e6af.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar --- mm/mmap.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 2ce04a6..f717453 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3030,11 +3030,16 @@ static int special_mapping_fault(struct vm_area_struct *vma, pgoff_t pgoff; struct page **pages; - if (vma->vm_ops == &legacy_special_mapping_vmops) + if (vma->vm_ops == &legacy_special_mapping_vmops) { pages = vma->vm_private_data; - else - pages = ((struct vm_special_mapping *)vma->vm_private_data)-> - pages; + } else { + struct vm_special_mapping *sm = vma->vm_private_data; + + if (sm->fault) + return sm->fault(sm, vma, vmf); + + pages = sm->pages; + } for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; -- cgit v1.1 From 1745cbc5d0dee0749a6bc0ea8e872c5db0074061 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:20 -0800 Subject: mm: Add vm_insert_pfn_prot() The x86 vvar vma contains pages with differing cacheability flags. x86 currently implements this by manually inserting all the ptes using (io_)remap_pfn_range when the vma is set up. x86 wants to move to using .fault with VM_FAULT_NOPAGE to set up the mappings as needed. The correct API to use to insert a pfn in .fault is vm_insert_pfn(), but vm_insert_pfn() can't override the vma's cache mode, and the HPET page in particular needs to be uncached despite the fact that the rest of the VMA is cached. Add vm_insert_pfn_prot() to support varying cacheability within the same non-COW VMA in a more sane manner. x86 could alternatively use multiple VMAs, but that's messy, would break CRIU, and would create unnecessary VMAs that would waste memory. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Acked-by: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d2938d1eb37be7a5e4f86182db646551f11e45aa.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar --- mm/memory.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index c387430..a29f0b9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1564,8 +1564,29 @@ out: int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { + return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_pfn); + +/** + * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * @pgprot: pgprot flags for the inserted page + * + * This is exactly like vm_insert_pfn, except that it allows drivers to + * to override pgprot on a per-page basis. + * + * This only makes sense for IO mappings, and it makes no sense for + * cow mappings. In general, using multiple vmas is preferable; + * vm_insert_pfn_prot should only be used if using multiple VMAs is + * impractical. + */ +int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot) +{ int ret; - pgprot_t pgprot = vma->vm_page_prot; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1587,7 +1608,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, return ret; } -EXPORT_SYMBOL(vm_insert_pfn); +EXPORT_SYMBOL(vm_insert_pfn_prot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) -- cgit v1.1 From 782b86641e5d471e9eb1cf0072c012d2f758e568 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:24 +0100 Subject: xen, mm: Set IORESOURCE_SYSTEM_RAM to System RAM Set IORESOURCE_SYSTEM_RAM in struct resource.flags of "System RAM" entries. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Acked-by: David Vrabel # xen Cc: Andrew Banman Cc: Andrew Morton Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: David Rientjes Cc: Denys Vlasenko Cc: Gu Zheng Cc: H. Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Mel Gorman Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Tang Chen Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-arch@vger.kernel.org Cc: linux-mm Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1453841853-11383-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4af58a3..979b18c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -138,7 +138,7 @@ static struct resource *register_memory_resource(u64 start, u64 size) res->name = "System RAM"; res->start = start; res->end = start + size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; if (request_resource(&iomem_resource, res) < 0) { pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); -- cgit v1.1 From bdb428c82aab5d8aba6fbb29dde2cf678eadc432 Mon Sep 17 00:00:00 2001 From: Bogdan Sikora Date: Sun, 27 Dec 2015 14:58:23 +0100 Subject: lib+mm: fix few spelling mistakes All are in comments. 
Signed-off-by: Bogdan Sikora Cc: Cc: Rafael Aquini Cc: Kent Overstreet Cc: Jan Kara [jkosina@suse.cz: more fixup] Acked-by: Rafael Aquini Signed-off-by: Jiri Slaby Signed-off-by: Jiri Kosina --- mm/balloon_compaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index d3116be..59c2bc8 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -13,10 +13,10 @@ /* * balloon_page_enqueue - allocates a new page and inserts it into the balloon * page list. - * @b_dev_info: balloon device decriptor where we will insert a new page to + * @b_dev_info: balloon device descriptor where we will insert a new page to * * Driver must call it to properly allocate a new enlisted balloon page - * before definetively removing it from the guest system. + * before definitively removing it from the guest system. * This function returns the page address for the recently enqueued page or * NULL in the case we fail to allocate a new page this turn. */ -- cgit v1.1 From 1e9877902dc7e11d2be038371c6fbf2dfcd469d7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:01:54 -0800 Subject: mm/gup: Introduce get_user_pages_remote() For protection keys, we need to understand whether protections should be enforced in software or not. In general, we enforce protections when working on our own task, but not when on others. We call these "current" and "remote" operations. This patch introduces a new get_user_pages() variant: get_user_pages_remote() Which is a replacement for when get_user_pages() is called on non-current tsk/mm. We also introduce a new gup flag: FOLL_REMOTE which can be used for the "__" gup variants to get this new behavior. The uprobes is_trap_at_addr() location holds mmap_sem and calls get_user_pages(current->mm) on an instruction address. This makes it a pretty unique gup caller. Being an instruction access and also really originating from the kernel (vs. the app), I opted to consider this a 'remote' access where protection keys will not be enforced. Without protection keys, this patch should not change any behavior. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Vlastimil Babka Cc: jack@suse.cz Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210154.3F0E51EA@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- mm/gup.c | 27 ++++++++++++++++++++++----- mm/memory.c | 2 +- mm/process_vm_access.c | 11 ++++++++--- 3 files changed, 31 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 7bf19ff..36ca850 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -870,7 +870,7 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, EXPORT_SYMBOL(get_user_pages_unlocked); /* - * get_user_pages() - pin user pages in memory + * get_user_pages_remote() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. * @mm: mm_struct of target mm @@ -924,12 +924,29 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * should use get_user_pages because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. 
*/ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, int write, - int force, struct page **pages, struct vm_area_struct **vmas) +long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) { return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, vmas, NULL, false, FOLL_TOUCH); + pages, vmas, NULL, false, + FOLL_TOUCH | FOLL_REMOTE); +} +EXPORT_SYMBOL(get_user_pages_remote); + +/* + * This is the same as get_user_pages_remote() for the time + * being. + */ +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, + write, force, pages, vmas, NULL, false, + FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); diff --git a/mm/memory.c b/mm/memory.c index 38090ca..8bfbad0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3685,7 +3685,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, void *maddr; struct page *page = NULL; - ret = get_user_pages(tsk, mm, addr, 1, + ret = get_user_pages_remote(tsk, mm, addr, 1, write, 1, &page, &vma); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 5d453e5..07514d4 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -98,9 +98,14 @@ static int process_vm_rw_single_vec(unsigned long addr, int pages = min(nr_pages, max_pages_per_loop); size_t bytes; - /* Get the pages we're interested in */ - pages = get_user_pages_unlocked(task, mm, pa, pages, - vm_write, 0, process_pages); + /* + * Get the pages we're interested in. We must + * add FOLL_REMOTE because task/mm might not + * current/current->mm + */ + pages = __get_user_pages_unlocked(task, mm, pa, pages, + vm_write, 0, process_pages, + FOLL_REMOTE); if (pages <= 0) return -EFAULT; -- cgit v1.1 From cde70140fed8429acf7a14e2e2cbd3e329036653 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:01:55 -0800 Subject: mm/gup: Overload get_user_pages() functions The concept here was a suggestion from Ingo. The implementation horrors are all mine. This allows get_user_pages(), get_user_pages_unlocked(), and get_user_pages_locked() to be called with or without the leading tsk/mm arguments. We will give a compile-time warning about the old style being __deprecated and we will also WARN_ON() if the non-remote version is used for a remote-style access. Doing this, folks will get nice warnings and will not break the build. This should be nice for -next and will hopefully let developers fix up their own code instead of maintainers needing to do it at merge time. The way we do this is hideous. It uses the __VA_ARGS__ macro functionality to call different functions based on the number of arguments passed to the macro. There's an additional hack to ensure that our EXPORT_SYMBOL() of the deprecated symbols doesn't trigger a warning. We should be able to remove this mess as soon as -rc1 hits in the release after this is merged. Signed-off-by: Dave Hansen Cc: Al Viro Cc: Alexander Kuleshov Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Dan Williams Cc: Dave Hansen Cc: Dominik Dingel Cc: Geliang Tang Cc: Jan Kara Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Konstantin Khlebnikov Cc: Leon Romanovsky Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Mateusz Guzik Cc: Maxime Coquelin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oleg Nesterov Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Xie XiuQi Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210155.73222EE1@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- mm/gup.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++-------------- mm/nommu.c | 64 +++++++++++++++++++++++++++++++++++++++++++------------------- mm/util.c | 4 +--- 3 files changed, 94 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 36ca850..8a035e0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1,3 +1,4 @@ +#define __DISABLE_GUP_DEPRECATED 1 #include #include #include @@ -807,15 +808,15 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, * if (locked) * up_read(&mm->mmap_sem); */ -long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages_locked6(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, int *locked) { - return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, NULL, locked, true, FOLL_TOUCH); + return __get_user_pages_locked(current, current->mm, start, nr_pages, + write, force, pages, NULL, locked, true, + FOLL_TOUCH); } -EXPORT_SYMBOL(get_user_pages_locked); +EXPORT_SYMBOL(get_user_pages_locked6); /* * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to @@ -860,14 +861,13 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); * or if "force" shall be set to 1 (get_user_pages_fast misses the * "force" parameter). */ -long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages) { - return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, - force, pages, FOLL_TOUCH); + return __get_user_pages_unlocked(current, current->mm, start, nr_pages, + write, force, pages, FOLL_TOUCH); } -EXPORT_SYMBOL(get_user_pages_unlocked); +EXPORT_SYMBOL(get_user_pages_unlocked5); /* * get_user_pages_remote() - pin user pages in memory @@ -939,16 +939,15 @@ EXPORT_SYMBOL(get_user_pages_remote); * This is the same as get_user_pages_remote() for the time * being. */ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages6(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { - return __get_user_pages_locked(tsk, mm, start, nr_pages, + return __get_user_pages_locked(current, current->mm, start, nr_pages, write, force, pages, vmas, NULL, false, FOLL_TOUCH); } -EXPORT_SYMBOL(get_user_pages); +EXPORT_SYMBOL(get_user_pages6); /** * populate_vma_page_range() - populate a range of pages in the vma. 
@@ -1484,3 +1483,38 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, } #endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ + +long get_user_pages8(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) +{ + WARN_ONCE(tsk != current, "get_user_pages() called on remote task"); + WARN_ONCE(mm != current->mm, "get_user_pages() called on remote mm"); + + return get_user_pages6(start, nr_pages, write, force, pages, vmas); +} +EXPORT_SYMBOL(get_user_pages8); + +long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, int *locked) +{ + WARN_ONCE(tsk != current, "get_user_pages_locked() called on remote task"); + WARN_ONCE(mm != current->mm, "get_user_pages_locked() called on remote mm"); + + return get_user_pages_locked6(start, nr_pages, write, force, pages, locked); +} +EXPORT_SYMBOL(get_user_pages_locked8); + +long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + WARN_ONCE(tsk != current, "get_user_pages_unlocked() called on remote task"); + WARN_ONCE(mm != current->mm, "get_user_pages_unlocked() called on remote mm"); + + return get_user_pages_unlocked5(start, nr_pages, write, force, pages); +} +EXPORT_SYMBOL(get_user_pages_unlocked7); + diff --git a/mm/nommu.c b/mm/nommu.c index fbf6f0f1..b64d04d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -15,6 +15,8 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define __DISABLE_GUP_DEPRECATED + #include #include #include @@ -182,8 +184,7 @@ finish_or_fault: * slab page or a secondary page from a compound page * - don't permit access to VMAs that don't support it, such as I/O mappings */ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages6(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { @@ -194,20 +195,18 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, - NULL); + return __get_user_pages(current, current->mm, start, nr_pages, flags, + pages, vmas, NULL); } -EXPORT_SYMBOL(get_user_pages); +EXPORT_SYMBOL(get_user_pages6); -long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - int *locked) +long get_user_pages_locked6(unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) { - return get_user_pages(tsk, mm, start, nr_pages, write, force, - pages, NULL); + return get_user_pages6(start, nr_pages, write, force, pages, NULL); } -EXPORT_SYMBOL(get_user_pages_locked); +EXPORT_SYMBOL(get_user_pages_locked6); long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, @@ -216,21 +215,20 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, { long ret; down_read(&mm->mmap_sem); - ret = get_user_pages(tsk, mm, start, nr_pages, write, force, - pages, NULL); + ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages, + NULL, NULL); up_read(&mm->mmap_sem); return ret; } 
EXPORT_SYMBOL(__get_user_pages_unlocked); -long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages) { - return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, - force, pages, 0); + return __get_user_pages_unlocked(current, current->mm, start, nr_pages, + write, force, pages, 0); } -EXPORT_SYMBOL(get_user_pages_unlocked); +EXPORT_SYMBOL(get_user_pages_unlocked5); /** * follow_pfn - look up PFN at a user virtual address @@ -2108,3 +2106,31 @@ static int __meminit init_admin_reserve(void) return 0; } subsys_initcall(init_admin_reserve); + +long get_user_pages8(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) +{ + return get_user_pages6(start, nr_pages, write, force, pages, vmas); +} +EXPORT_SYMBOL(get_user_pages8); + +long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) +{ + return get_user_pages_locked6(start, nr_pages, write, + force, pages, locked); +} +EXPORT_SYMBOL(get_user_pages_locked8); + +long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + return get_user_pages_unlocked5(start, nr_pages, write, force, pages); +} +EXPORT_SYMBOL(get_user_pages_unlocked7); + diff --git a/mm/util.c b/mm/util.c index 4fb14ca..1e60116 100644 --- a/mm/util.c +++ b/mm/util.c @@ -283,9 +283,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - struct mm_struct *mm = current->mm; - return get_user_pages_unlocked(current, mm, start, nr_pages, - write, 0, pages); + return get_user_pages_unlocked(start, nr_pages, write, 0, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); -- cgit v1.1 From d4edcf0d56958db0aca0196314ca38a5e730ea92 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:01:56 -0800 Subject: mm/gup: Switch all callers of get_user_pages() to not pass tsk/mm We will soon modify the vanilla get_user_pages() so it can no longer be used on mm/tasks other than 'current/current->mm', which is by far the most common way it is called. For now, we allow the old-style calls, but warn when they are used. (implemented in previous patch) This patch switches all callers of: get_user_pages() get_user_pages_unlocked() get_user_pages_locked() to stop passing tsk/mm so they will no longer see the warnings. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kirill A. 
Shutemov Cc: Linus Torvalds Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Vlastimil Babka Cc: jack@suse.cz Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210156.113E9407@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- mm/frame_vector.c | 2 +- mm/gup.c | 6 ++++-- mm/ksm.c | 2 +- mm/mempolicy.c | 6 +++--- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 7cf2b71..381bb07 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -58,7 +58,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; vec->is_pfns = false; - ret = get_user_pages_locked(current, mm, start, nr_frames, + ret = get_user_pages_locked(start, nr_frames, write, force, (struct page **)(vec->ptrs), &locked); goto out; } diff --git a/mm/gup.c b/mm/gup.c index 8a035e0..de24ef4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -936,8 +936,10 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, EXPORT_SYMBOL(get_user_pages_remote); /* - * This is the same as get_user_pages_remote() for the time - * being. + * This is the same as get_user_pages_remote(), just with a + * less-flexible calling convention where we assume that the task + * and mm being operated on are the current task's. We also + * obviously don't pass FOLL_REMOTE in here. */ long get_user_pages6(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, diff --git a/mm/ksm.c b/mm/ksm.c index ca6d2a0..c2013f6 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -352,7 +352,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm) /* * We use break_ksm to break COW on a ksm page: it's a stripped down * - * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) + * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) * put_page(page); * * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4c4187c..dd0ce7f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -844,12 +844,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) } } -static int lookup_node(struct mm_struct *mm, unsigned long addr) +static int lookup_node(unsigned long addr) { struct page *p; int err; - err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); + err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL); if (err >= 0) { err = page_to_nid(p); put_page(p); @@ -904,7 +904,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { - err = lookup_node(mm, addr); + err = lookup_node(addr); if (err < 0) goto out; *policy = err; -- cgit v1.1 From 63c17fb8e5a46a16e10e82005748837fd11a2024 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:08 -0800 Subject: mm/core, x86/mm/pkeys: Store protection bits in high VMA flags vma->vm_flags is an 'unsigned long', so has space for 32 flags on 32-bit architectures. The high 32 bits are unused on 64-bit platforms. We've steered away from using the unused high VMA bits for things because we would have difficulty supporting it on 32-bit. Protection Keys are not available in 32-bit mode, so there is no concern about supporting this feature in 32-bit mode or on 32-bit CPUs. This patch carves out 4 bits from the high half of vma->vm_flags and allows architectures to set config option to make them available. 
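(The constants referred to in the next sentence live on the include/linux/mm.h side of this patch, which the mm/-only diffstat here does not show; the following is an editor's reconstruction of that pattern, and the exact bit numbers are an assumption, not part of this hunk.)

/*
 * Reconstruction of the arch-opt-in high vm_flags bits: they exist only
 * when the architecture selects ARCH_USES_HIGH_VMA_FLAGS, and only on
 * 64-bit builds where vm_flags has unused high bits.
 */
#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
#define VM_HIGH_ARCH_BIT_0	32	/* usable only on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_1	33
#define VM_HIGH_ARCH_BIT_2	34
#define VM_HIGH_ARCH_BIT_3	35
#define VM_HIGH_ARCH_0		BIT(VM_HIGH_ARCH_BIT_0)	/* BIT() yields a UL value */
#define VM_HIGH_ARCH_1		BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2		BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3		BIT(VM_HIGH_ARCH_BIT_3)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */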
Sparse complains about these constants unless we explicitly call them "UL". Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jan Kara Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Mel Gorman Cc: Michal Hocko Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sasha Levin Cc: Valentin Rothberg Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Xie XiuQi Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210208.81AF00D5@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- mm/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 03cbfa0..6cf4399 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -669,3 +669,6 @@ config ZONE_DEVICE config FRAME_VECTOR bool + +config ARCH_USES_HIGH_VMA_FLAGS + bool -- cgit v1.1 From d4925e00d59698a201231cf99dce47d8b922bb34 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:16 -0800 Subject: mm/gup: Factor out VMA fault permission checking This code matches a fault condition up with the VMA and ensures that the VMA allows the fault to be handled instead of just erroring out. We will be extending this in a moment to comprehend protection keys. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Dave Hansen Cc: Denys Vlasenko Cc: Dominik Dingel Cc: Eric B Munson Cc: H. Peter Anvin Cc: Jason Low Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sasha Levin Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210216.C3824032@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- mm/gup.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index de24ef4..b935c2c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -610,6 +610,18 @@ next_page: } EXPORT_SYMBOL(__get_user_pages); +bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) +{ + vm_flags_t vm_flags; + + vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; + + if (!(vm_flags & vma->vm_flags)) + return false; + + return true; +} + /* * fixup_user_fault() - manually resolve a user page fault * @tsk: the task_struct to use for page fault accounting, or @@ -645,7 +657,6 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, bool *unlocked) { struct vm_area_struct *vma; - vm_flags_t vm_flags; int ret, major = 0; if (unlocked) @@ -656,8 +667,7 @@ retry: if (!vma || address < vma->vm_start) return -EFAULT; - vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; - if (!(vm_flags & vma->vm_flags)) + if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; ret = handle_mm_fault(mm, vma, address, fault_flags); -- cgit v1.1 From 33a709b25a760b91184bb335cf7d7c32b8123013 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:19 -0800 Subject: mm/gup, x86/mm/pkeys: Check VMAs and PTEs for protection keys Today, for normal faults and page table walks, we check the VMA and/or PTE to ensure that it is compatible with the action. For instance, if we get a write fault on a non-writeable VMA, we SIGSEGV. We try to do the same thing for protection keys. 
Basically, we try to make sure that if a user does this: mprotect(ptr, size, PROT_NONE); *ptr = foo; they see the same effects with protection keys when they do this: mprotect(ptr, size, PROT_READ|PROT_WRITE); set_pkey(ptr, size, 4); wrpkru(0xffffff3f); // access disable pkey 4 *ptr = foo; The state to do that checking is in the VMA, but we also sometimes have to do it on the page tables only, like when doing a get_user_pages_fast() where we have no VMA. We add two functions and expose them to generic code: arch_pte_access_permitted(pte_flags, write) arch_vma_access_permitted(vma, write) These are, of course, backed up in x86 arch code with checks against the PTE or VMA's protection key. But, there are also cases where we do not want to respect protection keys. When we ptrace(), for instance, we do not want to apply the tracer's PKRU permissions to the PTEs from the process being traced. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Alexey Kardashevskiy Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boaz Harrosh Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Dave Hansen Cc: David Gibson Cc: David Hildenbrand Cc: David Vrabel Cc: Denys Vlasenko Cc: Dominik Dingel Cc: Dominik Vogt Cc: Guan Xuetao Cc: H. Peter Anvin Cc: Heiko Carstens Cc: Hugh Dickins Cc: Jason Low Cc: Jerome Marchand Cc: Juergen Gross Cc: Kirill A. Shutemov Cc: Laurent Dufour Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michael Ellerman Cc: Michal Hocko Cc: Mikulas Patocka Cc: Minchan Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sasha Levin Cc: Shachar Raindel Cc: Stephen Smalley Cc: Toshi Kani Cc: Vlastimil Babka Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-s390@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20160212210219.14D5D715@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- mm/gup.c | 18 +++++++++++++++--- mm/memory.c | 4 ++++ 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index b935c2c..e0f5f357 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -444,6 +445,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_MAYREAD)) return -EFAULT; } + if (!arch_vma_access_permitted(vma, (gup_flags & FOLL_WRITE))) + return -EFAULT; return 0; } @@ -612,13 +615,19 @@ EXPORT_SYMBOL(__get_user_pages); bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) { - vm_flags_t vm_flags; - - vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; + bool write = !!(fault_flags & FAULT_FLAG_WRITE); + vm_flags_t vm_flags = write ? 
VM_WRITE : VM_READ; if (!(vm_flags & vma->vm_flags)) return false; + /* + * The architecture might have a hardware protection + * mechanism other than read/write that can deny access + */ + if (!arch_vma_access_permitted(vma, write)) + return false; + return true; } @@ -1172,6 +1181,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, pte_protnone(pte) || (write && !pte_write(pte))) goto pte_unmap; + if (!arch_pte_access_permitted(pte, write)) + goto pte_unmap; + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); head = compound_head(page); diff --git a/mm/memory.c b/mm/memory.c index 8bfbad0..d7e84fe 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -65,6 +65,7 @@ #include #include +#include #include #include #include @@ -3378,6 +3379,9 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd; pte_t *pte; + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE)) + return VM_FAULT_SIGSEGV; + if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); -- cgit v1.1 From 1b2ee1266ea647713dbaf44825967c180dfc8d76 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:21 -0800 Subject: mm/core: Do not enforce PKEY permissions on remote mm access We try to enforce protection keys in software the same way that we do in hardware. (See long example below). But, we only want to do this when accessing our *own* process's memory. If GDB set PKRU[6].AD=1 (disable access to PKEY 6), then tried to PTRACE_POKE a target process which just happened to have some mprotect_pkey(pkey=6) memory, we do *not* want to deny the debugger access to that memory. PKRU is fundamentally a thread-local structure and we do not want to enforce it on access to _another_ thread's data. This gets especially tricky when we have workqueues or other delayed-work mechanisms that might run in a random process's context. We can check that we only enforce pkeys when operating on our *own* mm, but delayed work gets performed when a random user context is active. We might end up with a situation where a delayed-work gup fails when running randomly under its "own" task but succeeds when running under another process. We want to avoid that. To avoid that, we use the new GUP flag: FOLL_REMOTE and add a fault flag: FAULT_FLAG_REMOTE. They indicate that we are walking an mm which is not guranteed to be the same as current->mm and should not be subject to protection key enforcement. Thanks to Jerome Glisse for pointing out this scenario. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Alexey Kardashevskiy Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boaz Harrosh Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Dave Chinner Cc: Dave Hansen Cc: David Gibson Cc: Denys Vlasenko Cc: Dominik Dingel Cc: Dominik Vogt Cc: Eric B Munson Cc: Geliang Tang Cc: Guan Xuetao Cc: H. Peter Anvin Cc: Heiko Carstens Cc: Hugh Dickins Cc: Jan Kara Cc: Jason Low Cc: Jerome Marchand Cc: Joerg Roedel Cc: Kirill A. 
Shutemov Cc: Konstantin Khlebnikov Cc: Laurent Dufour Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michael Ellerman Cc: Michal Hocko Cc: Mikulas Patocka Cc: Minchan Kim Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sasha Levin Cc: Shachar Raindel Cc: Vlastimil Babka Cc: Xie XiuQi Cc: iommu@lists.linux-foundation.org Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-s390@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Ingo Molnar --- mm/gup.c | 15 ++++++++++----- mm/ksm.c | 10 ++++++++-- mm/memory.c | 3 ++- 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index e0f5f357..d276760 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -365,6 +365,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, return -ENOENT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; + if (*flags & FOLL_REMOTE) + fault_flags |= FAULT_FLAG_REMOTE; if (nonblocking) fault_flags |= FAULT_FLAG_ALLOW_RETRY; if (*flags & FOLL_NOWAIT) @@ -415,11 +417,13 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) { vm_flags_t vm_flags = vma->vm_flags; + int write = (gup_flags & FOLL_WRITE); + int foreign = (gup_flags & FOLL_REMOTE); if (vm_flags & (VM_IO | VM_PFNMAP)) return -EFAULT; - if (gup_flags & FOLL_WRITE) { + if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; @@ -445,7 +449,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_MAYREAD)) return -EFAULT; } - if (!arch_vma_access_permitted(vma, (gup_flags & FOLL_WRITE))) + if (!arch_vma_access_permitted(vma, write, foreign)) return -EFAULT; return 0; } @@ -615,7 +619,8 @@ EXPORT_SYMBOL(__get_user_pages); bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) { - bool write = !!(fault_flags & FAULT_FLAG_WRITE); + bool write = !!(fault_flags & FAULT_FLAG_WRITE); + bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); vm_flags_t vm_flags = write ? VM_WRITE : VM_READ; if (!(vm_flags & vma->vm_flags)) @@ -623,9 +628,9 @@ bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) /* * The architecture might have a hardware protection - * mechanism other than read/write that can deny access + * mechanism other than read/write that can deny access. */ - if (!arch_vma_access_permitted(vma, write)) + if (!arch_vma_access_permitted(vma, write, foreign)) return false; return true; diff --git a/mm/ksm.c b/mm/ksm.c index c2013f6..b99e8281 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -359,6 +359,10 @@ static inline bool ksm_test_exit(struct mm_struct *mm) * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. + * + * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context + * of the process that owns 'vma'. We also do not want to enforce + * protection keys here anyway. 
*/ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) { @@ -367,12 +371,14 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) do { cond_resched(); - page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION); + page = follow_page(vma, addr, + FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) ret = handle_mm_fault(vma->vm_mm, vma, addr, - FAULT_FLAG_WRITE); + FAULT_FLAG_WRITE | + FAULT_FLAG_REMOTE); else ret = VM_FAULT_WRITE; put_page(page); diff --git a/mm/memory.c b/mm/memory.c index d7e84fe..76c44e5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3379,7 +3379,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd; pte_t *pte; - if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE)) + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, + flags & FAULT_FLAG_REMOTE)) return VM_FAULT_SIGSEGV; if (unlikely(is_vm_hugetlb_page(vma))) -- cgit v1.1 From d61172b4b695b821388cdb6088a41d431bcbb93b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:24 -0800 Subject: mm/core, x86/mm/pkeys: Differentiate instruction fetches As discussed earlier, we attempt to enforce protection keys in software. However, the code checks all faults to ensure that they are not violating protection key permissions. It was assumed that all faults are either write faults where we check PKRU[key].WD (write disable) or read faults where we check the AD (access disable) bit. But, there is a third category of faults for protection keys: instruction faults. Instruction faults never run afoul of protection keys because they do not affect instruction fetches. So, plumb the PF_INSTR bit down in to the arch_vma_access_permitted() function where we do the protection key checks. We also add a new FAULT_FLAG_INSTRUCTION. This is because handle_mm_fault() is not passed the architecture-specific error_code where we keep PF_INSTR, so we need to encode the instruction fetch information in to the arch-generic fault flags. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210224.96928009@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- mm/gup.c | 11 +++++++++-- mm/memory.c | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index d276760..7f1c4fb 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -449,7 +449,11 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_MAYREAD)) return -EFAULT; } - if (!arch_vma_access_permitted(vma, write, foreign)) + /* + * gups are always data accesses, not instruction + * fetches, so execute=false here + */ + if (!arch_vma_access_permitted(vma, write, false, foreign)) return -EFAULT; return 0; } @@ -629,8 +633,11 @@ bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) /* * The architecture might have a hardware protection * mechanism other than read/write that can deny access. 
+ * + * gup always represents data access, not instruction + * fetches, so execute=false here: */ - if (!arch_vma_access_permitted(vma, write, foreign)) + if (!arch_vma_access_permitted(vma, write, false, foreign)) return false; return true; diff --git a/mm/memory.c b/mm/memory.c index 76c44e5..99e9f92 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3380,6 +3380,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *pte; if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, + flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_REMOTE)) return VM_FAULT_SIGSEGV; -- cgit v1.1 From e6bfb70959a0ca6ddedb29e779a293c6f71ed0e7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:31 -0800 Subject: mm/core, arch, powerpc: Pass a protection key in to calc_vm_flag_bits() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This plumbs a protection key through calc_vm_flag_bits(). We could have done this in calc_vm_prot_bits(), but I did not feel super strongly which way to go. It was pretty arbitrary which one to use. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arve Hjønnevåg Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Gang Cc: Dan Williams Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: Denys Vlasenko Cc: Eric W. Biederman Cc: Geliang Tang Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Leon Romanovsky Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Maxime Coquelin Cc: Mel Gorman Cc: Michael Ellerman Cc: Oleg Nesterov Cc: Paul Gortmaker Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Riley Andrews Cc: Vladimir Davydov Cc: devel@driverdev.osuosl.org Cc: linux-api@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20160212210231.E6F1F0D6@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- mm/mmap.c | 2 +- mm/mprotect.c | 2 +- mm/nommu.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index e2e9f48..784d2d6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1313,7 +1313,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. 
*/ - vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | + vm_flags |= calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) diff --git a/mm/mprotect.c b/mm/mprotect.c index f7cb3d4..3790c8b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -380,7 +380,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) prot |= PROT_EXEC; - vm_flags = calc_vm_prot_bits(prot); + vm_flags = calc_vm_prot_bits(prot, 0); down_write(¤t->mm->mmap_sem); diff --git a/mm/nommu.c b/mm/nommu.c index b64d04d..5ba39b8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1082,7 +1082,7 @@ static unsigned long determine_vm_flags(struct file *file, { unsigned long vm_flags; - vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); + vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags); /* vm_flags |= mm->def_flags; */ if (!(capabilities & NOMMU_MAP_DIRECT)) { -- cgit v1.1 From 66d375709d2c891acc639538fd3179fa0cbb0daf Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:32 -0800 Subject: mm/core, x86/mm/pkeys: Add arch_validate_pkey() The syscall-level code is passed a protection key and need to return an appropriate error code if the protection key is bogus. We will be using this in subsequent patches. Note that this also begins a series of arch-specific calls that we need to expose in otherwise arch-independent code. We create a linux/pkeys.h header where we will put *all* the stubs for these functions. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210232.774EEAAB@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- mm/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 6cf4399..2702bb6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -672,3 +672,5 @@ config FRAME_VECTOR config ARCH_USES_HIGH_VMA_FLAGS bool +config ARCH_HAS_PKEYS + bool -- cgit v1.1 From 62b5f7d013fc455b8db26cf01e421f4c0d264b92 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:40 -0800 Subject: mm/core, x86/mm/pkeys: Add execute-only protection keys support Protection keys provide new page-based protection in hardware. But, they have an interesting attribute: they only affect data accesses and never affect instruction fetches. That means that if we set up some memory which is set as "access-disabled" via protection keys, we can still execute from it. This patch uses protection keys to set up mappings to do just that. If a user calls: mmap(..., PROT_EXEC); or mprotect(ptr, sz, PROT_EXEC); (note PROT_EXEC-only without PROT_READ/WRITE), the kernel will notice this, and set a special protection key on the memory. It also sets the appropriate bits in the Protection Keys User Rights (PKRU) register so that the memory becomes unreadable and unwritable. I haven't found any userspace that does this today. With this facility in place, we expect userspace to move to use it eventually. Userspace _could_ start doing this today. Any PROT_EXEC calls get converted to PROT_READ inside the kernel, and would transparently be upgraded to "true" PROT_EXEC with this code. IOW, userspace never has to do any PROT_EXEC runtime detection. 
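(Editor's illustration, not part of the patch: assuming pkey-capable hardware and a kernel with this series applied, a data read of a PROT_EXEC-only mapping is expected to fault, while instruction fetches from it still succeed.)

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

static void handle_segv(int sig)
{
	/* Expected path: the load in main() is a data access, denied by PKRU. */
	write(STDOUT_FILENO, "read of exec-only mapping faulted\n", 34);
	_exit(0);
}

int main(void)
{
	char *p;

	signal(SIGSEGV, handle_segv);

	/*
	 * PROT_EXEC without PROT_READ/PROT_WRITE: with this series the kernel
	 * backs the mapping with the execute-only protection key instead of
	 * silently upgrading it to readable memory.
	 */
	p = mmap(NULL, 4096, PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	printf("first byte: %d\n", p[0]);	/* should not be reached on pkey hardware */
	return 0;
}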
This feature provides enhanced protection against leaking executable memory contents. This helps thwart attacks which are attempting to find ROP gadgets on the fly. But, the security provided by this approach is not comprehensive. The PKRU register which controls access permissions is a normal user register writable from unprivileged userspace. An attacker who can execute the 'wrpkru' instruction can easily disable the protection provided by this feature. The protection key that is used for execute-only support is permanently dedicated at compile time. This is fine for now because there is currently no API to set a protection key other than this one. Despite there being a constant PKRU value across the entire system, we do not set it unless this feature is in use in a process. That is to preserve the PKRU XSAVE 'init state', which can lead to faster context switches. PKRU *is* a user register and the kernel is modifying it. That means that code doing: pkru = rdpkru() pkru |= 0x100; mmap(..., PROT_EXEC); wrpkru(pkru); could lose the bits in PKRU that enforce execute-only permissions. To avoid this, we suggest avoiding ever calling mmap() or mprotect() when the PKRU value is expected to be unstable. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Gang Cc: Dan Williams Cc: Dave Chinner Cc: Dave Hansen Cc: David Hildenbrand Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Mel Gorman Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Piotr Kwapulinski Cc: Rik van Riel Cc: Stephen Smalley Cc: Vladimir Murzin Cc: Will Deacon Cc: keescook@google.com Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210240.CB4BB5CA@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- mm/mmap.c | 10 +++++++++- mm/mprotect.c | 8 ++++---- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 784d2d6..0175b7d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -1270,6 +1271,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long pgoff, unsigned long *populate) { struct mm_struct *mm = current->mm; + int pkey = 0; *populate = 0; @@ -1309,11 +1311,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (offset_in_page(addr)) return addr; + if (prot == PROT_EXEC) { + pkey = execute_only_pkey(mm); + if (pkey < 0) + pkey = 0; + } + /* Do simple checking here so the lower-level routines won't have * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. 
*/ - vm_flags |= calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags) | + vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) diff --git a/mm/mprotect.c b/mm/mprotect.c index 3790c8b..fa37c4c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -354,7 +355,7 @@ fail: SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, unsigned long, prot) { - unsigned long vm_flags, nstart, end, tmp, reqprot; + unsigned long nstart, end, tmp, reqprot; struct vm_area_struct *vma, *prev; int error = -EINVAL; const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); @@ -380,8 +381,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) prot |= PROT_EXEC; - vm_flags = calc_vm_prot_bits(prot, 0); - down_write(¤t->mm->mmap_sem); vma = find_vma(current->mm, start); @@ -411,10 +410,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, for (nstart = start ; ; ) { unsigned long newflags; + int pkey = arch_override_mprotect_pkey(vma, prot, -1); /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - newflags = vm_flags; + newflags = calc_vm_prot_bits(prot, pkey); newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ -- cgit v1.1 From ff20c2e0acc5ad7e27c68592ade135efee399549 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 1 Mar 2016 09:45:14 +0530 Subject: mm: Some arch may want to use HPAGE_PMD related values as variables With next generation power processor, we are having a new mmu model [1] that require us to maintain a different linux page table format. Inorder to support both current and future ppc64 systems with a single kernel we need to make sure kernel can select between different page table format at runtime. With the new MMU (radix MMU) added, we will have two different pmd hugepage size 16MB for hash model and 2MB for Radix model. Hence make HPAGE_PMD related values as a variable. Actual conversion of HPAGE_PMD to a variable for ppc64 happens in a followup patch. [1] http://ibm.biz/power-isa3 (Needs registration). Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- mm/huge_memory.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index aea8f7a..36c22a8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -83,7 +83,7 @@ unsigned long transparent_hugepage_flags __read_mostly = (1<= MAX_ORDER); + /* + * we use page->mapping and page->index in second tail page + * as list_head: assuming THP order >= 2 + */ + MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); + err = hugepage_init_sysfs(&hugepage_kobj); if (err) goto err_sysfs; @@ -764,7 +776,6 @@ void prep_transhuge_page(struct page *page) * we use page->mapping and page->indexlru in second tail page * as list_head: assuming THP order >= 2 */ - BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); INIT_LIST_HEAD(page_deferred_list(page)); set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); -- cgit v1.1 From 7640131032db9118a78af715ac77ba2debeeb17c Mon Sep 17 00:00:00 2001 From: Matthew Dawson Date: Fri, 11 Mar 2016 13:08:07 -0800 Subject: mm/mempool: avoid KASAN marking mempool poison checks as use-after-free When removing an element from the mempool, mark it as unpoisoned in KASAN before verifying its contents for SLUB/SLAB debugging. Otherwise KASAN will flag the reads checking the element use-after-free writes as use-after-free reads. Signed-off-by: Matthew Dawson Acked-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempool.c b/mm/mempool.c index 004d42b..7924f4f 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -135,8 +135,8 @@ static void *remove_element(mempool_t *pool) void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); - check_element(pool, element); kasan_unpoison_element(pool, element); + check_element(pool, element); return element; } -- cgit v1.1 From 376bf125ac781d32e202760ed7deb1ae4ed35d31 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:32 -0700 Subject: slub: clean up code for kmem cgroup support to kmem_cache_free_bulk This change is primarily an attempt to make it easier to realize the optimizations the compiler performs in-case CONFIG_MEMCG_KMEM is not enabled. Performance wise, even when CONFIG_MEMCG_KMEM is compiled in, the overhead is zero. This is because, as long as no process have enabled kmem cgroups accounting, the assignment is replaced by asm-NOP operations. This is possible because memcg_kmem_enabled() uses a static_key_false() construct. It also helps readability as it avoid accessing the p[] array like: p[size - 1] which "expose" that the array is processed backwards inside helper function build_detached_freelist(). Lastly this also makes the code more robust, in error case like passing NULL pointers in the array. Which were previously handled before commit 033745189b1b ("slub: add missing kmem cgroup support to kmem_cache_free_bulk"). 
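(Editor's sketch of the caller's view of the bulk API this cleans up; bulk_demo() and its cache argument are illustrative, and only kmem_cache_alloc_bulk()/kmem_cache_free_bulk() are real kernel interfaces.)

#include <linux/errno.h>
#include <linux/slab.h>

#define NR_OBJS 16

/* Illustrative caller, not from the patch. */
static int bulk_demo(struct kmem_cache *cache)
{
	void *objs[NR_OBJS];
	int got;

	got = kmem_cache_alloc_bulk(cache, GFP_KERNEL, NR_OBJS, objs);
	if (!got)
		return -ENOMEM;

	/* ... use objs[0..got-1] ... */

	/*
	 * With this change the per-object memcg cache lookup happens inside
	 * build_detached_freelist(), so passing the parent cache here stays
	 * correct even when the objects came from a memcg child cache.
	 */
	kmem_cache_free_bulk(cache, got, objs);
	return 0;
}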
Fixes: 033745189b1b ("slub: add missing kmem cgroup support to kmem_cache_free_bulk") Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index d8fbd4a..2a722e1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2815,6 +2815,7 @@ struct detached_freelist { void *tail; void *freelist; int cnt; + struct kmem_cache *s; }; /* @@ -2829,8 +2830,9 @@ struct detached_freelist { * synchronization primitive. Look ahead in the array is limited due * to performance reasons. */ -static int build_detached_freelist(struct kmem_cache *s, size_t size, - void **p, struct detached_freelist *df) +static inline +int build_detached_freelist(struct kmem_cache *s, size_t size, + void **p, struct detached_freelist *df) { size_t first_skipped_index = 0; int lookahead = 3; @@ -2846,8 +2848,11 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, if (!object) return 0; + /* Support for memcg, compiler can optimize this out */ + df->s = cache_from_obj(s, object); + /* Start new detached freelist */ - set_freepointer(s, object, NULL); + set_freepointer(df->s, object, NULL); df->page = virt_to_head_page(object); df->tail = object; df->freelist = object; @@ -2862,7 +2867,7 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, /* df->page is always set at this point */ if (df->page == virt_to_head_page(object)) { /* Opportunity build freelist */ - set_freepointer(s, object, df->freelist); + set_freepointer(df->s, object, df->freelist); df->freelist = object; df->cnt++; p[size] = NULL; /* mark object processed */ @@ -2881,25 +2886,20 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, return first_skipped_index; } - /* Note that interrupts must be enabled when calling this function. */ -void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) { if (WARN_ON(!size)) return; do { struct detached_freelist df; - struct kmem_cache *s; - - /* Support for memcg */ - s = cache_from_obj(orig_s, p[size - 1]); size = build_detached_freelist(s, size, p, &df); if (unlikely(!df.page)) continue; - slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_); + slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); } while (likely(size)); } EXPORT_SYMBOL(kmem_cache_free_bulk); -- cgit v1.1 From 11c7aec2a9b4e685bbf6a15148e7841b3525fc0c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:35 -0700 Subject: mm/slab: move SLUB alloc hooks to common mm/slab.h First step towards sharing alloc_hook's between SLUB and SLAB allocators. Move the SLUB allocators *_alloc_hook to the common mm/slab.h for internal slab definitions. 
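(Editor's sketch of how an allocator entry point pairs the now-shared hooks; example_cache_alloc() and allocate_one_object() are illustrative placeholders, while the hook signatures follow the mm/slab.h code added below.)

/*
 * Illustrative only: allocate_one_object() stands in for the
 * allocator-specific fast/slow path.
 */
static __always_inline void *example_cache_alloc(struct kmem_cache *s,
						 gfp_t gfpflags)
{
	void *object;

	/* May return a memcg child cache, or NULL when fault injection trips. */
	s = slab_pre_alloc_hook(s, gfpflags);
	if (!s)
		return NULL;

	object = allocate_one_object(s, gfpflags);	/* allocator-specific */

	/*
	 * Debugging notifications (kmemcheck/kmemleak/kasan) plus the
	 * memcg_kmem_put_cache() that pairs with the pre-hook above.
	 */
	slab_post_alloc_hook(s, gfpflags, 1, &object);

	return object;
}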
Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/slub.c | 54 ------------------------------------------------------ 2 files changed, 62 insertions(+), 54 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index 2eedace..fd231c9 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -38,6 +38,10 @@ struct kmem_cache { #endif #include +#include +#include +#include +#include /* * State of the slab allocator. @@ -321,6 +325,64 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return s; } +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifndef CONFIG_SLUB + return s->object_size; + +#else /* CONFIG_SLUB */ +# ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; +# endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +#endif +} + +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + gfp_t flags) +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(gfpflags_allow_blocking(flags)); + + if (should_failslab(s->object_size, flags, s->flags)) + return NULL; + + return memcg_kmem_get_cache(s, flags); +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, + size_t size, void **p) +{ + size_t i; + + flags &= gfp_allowed_mask; + for (i = 0; i < size; i++) { + void *object = p[i]; + + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, + s->flags, flags); + kasan_slab_alloc(s, object); + } + memcg_kmem_put_cache(s); +} + #ifndef CONFIG_SLOB /* * The slab lists for all objects. diff --git a/mm/slub.c b/mm/slub.c index 2a722e1..6dd04c0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -284,30 +284,6 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } -static inline size_t slab_ksize(const struct kmem_cache *s) -{ -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->object_size; - -#endif - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. 
- */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; -} - static inline int order_objects(int order, unsigned long size, int reserved) { return ((PAGE_SIZE << order) - reserved) / size; @@ -1281,36 +1257,6 @@ static inline void kfree_hook(const void *x) kasan_kfree_large(x); } -static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, - gfp_t flags) -{ - flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); - might_sleep_if(gfpflags_allow_blocking(flags)); - - if (should_failslab(s->object_size, flags, s->flags)) - return NULL; - - return memcg_kmem_get_cache(s, flags); -} - -static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - size_t size, void **p) -{ - size_t i; - - flags &= gfp_allowed_mask; - for (i = 0; i < size; i++) { - void *object = p[i]; - - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, - s->flags, flags); - kasan_slab_alloc(s, object); - } - memcg_kmem_put_cache(s); -} - static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); -- cgit v1.1 From fab9963a69dbd71304357dbfe4ec5345f14cebdd Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:38 -0700 Subject: mm: fault-inject take over bootstrap kmem_cache check Remove the SLAB specific function slab_should_failslab(), by moving the check against fault-injection for the bootstrap slab, into the shared function should_failslab() (used by both SLAB and SLUB). This is a step towards sharing alloc_hook's between SLUB and SLAB. This bootstrap slab "kmem_cache" is used for allocating struct kmem_cache objects to the allocator itself. 
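For readers unfamiliar with failslab, the toy userspace model below sketches the consolidated check (invented names, simplified semantics: the real code also honours __GFP_RECLAIM filtering and a runtime cache_filter toggle). The point is the ordering: the bootstrap cache is exempted first, then the no-fail request, then the per-cache opt-in, and only then may a failure be injected.

  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>

  struct cache { const char *name; bool failslab; };

  static struct cache bootstrap_cache = { "kmem_cache", true };

  static bool should_fail_sketch(struct cache *c, bool nofail)
  {
          if (c == &bootstrap_cache)  /* never inject into the bootstrap cache */
                  return false;
          if (nofail)                 /* __GFP_NOFAIL-style exemption */
                  return false;
          if (!c->failslab)           /* per-cache opt-in, like SLAB_FAILSLAB */
                  return false;
          return rand() % 4 == 0;     /* stand-in for should_fail() */
  }

  int main(void)
  {
          struct cache demo = { "demo", true };

          for (int i = 0; i < 4; i++)
                  printf("%s: %s\n", demo.name,
                         should_fail_sketch(&demo, false) ?
                         "inject failure" : "allocate");
          printf("%s: %s\n", bootstrap_cache.name,
                 should_fail_sketch(&bootstrap_cache, false) ?
                 "inject failure" : "allocate");
          return 0;
  }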
Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/failslab.c | 12 +++++++++--- mm/slab.c | 12 ++---------- mm/slab.h | 2 +- 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/failslab.c b/mm/failslab.c index 79171b4..b0fac98 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -1,5 +1,7 @@ #include #include +#include +#include "slab.h" static struct { struct fault_attr attr; @@ -11,18 +13,22 @@ static struct { .cache_filter = false, }; -bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) +bool should_failslab(struct kmem_cache *s, gfp_t gfpflags) { + /* No fault-injection for bootstrap cache */ + if (unlikely(s == kmem_cache)) + return false; + if (gfpflags & __GFP_NOFAIL) return false; if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM)) return false; - if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) + if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB)) return false; - return should_fail(&failslab.attr, size); + return should_fail(&failslab.attr, s->object_size); } static int __init setup_failslab(char *str) diff --git a/mm/slab.c b/mm/slab.c index 621fbcb..95f344d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2926,14 +2926,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) #endif -static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) -{ - if (unlikely(cachep == kmem_cache)) - return false; - - return should_failslab(cachep->object_size, flags, cachep->flags); -} - static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *objp; @@ -3155,7 +3147,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, lockdep_trace_alloc(flags); - if (slab_should_failslab(cachep, flags)) + if (should_failslab(cachep, flags)) return NULL; cachep = memcg_kmem_get_cache(cachep, flags); @@ -3243,7 +3235,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) lockdep_trace_alloc(flags); - if (slab_should_failslab(cachep, flags)) + if (should_failslab(cachep, flags)) return NULL; cachep = memcg_kmem_get_cache(cachep, flags); diff --git a/mm/slab.h b/mm/slab.h index fd231c9..6c7f16a 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -360,7 +360,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, lockdep_trace_alloc(flags); might_sleep_if(gfpflags_allow_blocking(flags)); - if (should_failslab(s->object_size, flags, s->flags)) + if (should_failslab(s, flags)) return NULL; return memcg_kmem_get_cache(s, flags); -- cgit v1.1 From 011eceaf0ad54e0916df8593ffc27b9f73d89fcf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:41 -0700 Subject: slab: use slab_pre_alloc_hook in SLAB allocator shared with SLUB Deduplicate code in SLAB allocator functions slab_alloc() and slab_alloc_node() by using the slab_pre_alloc_hook() call, which is now shared between SLUB and SLAB. 
Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 95f344d..1857a65 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3144,14 +3144,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, int slab_node = numa_mem_id(); flags &= gfp_allowed_mask; - - lockdep_trace_alloc(flags); - - if (should_failslab(cachep, flags)) + cachep = slab_pre_alloc_hook(cachep, flags); + if (unlikely(!cachep)) return NULL; - cachep = memcg_kmem_get_cache(cachep, flags); - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3232,14 +3228,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) void *objp; flags &= gfp_allowed_mask; - - lockdep_trace_alloc(flags); - - if (should_failslab(cachep, flags)) + cachep = slab_pre_alloc_hook(cachep, flags); + if (unlikely(!cachep)) return NULL; - cachep = memcg_kmem_get_cache(cachep, flags); - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); -- cgit v1.1 From 0142eae3ae15b9b9f0ae2a8e68e3c8dc347a2394 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:44 -0700 Subject: mm: kmemcheck skip object if slab allocation failed In the SLAB allocator kmemcheck_slab_alloc() is guarded against being called in case the object is NULL. In SLUB allocator this NULL pointer invocation can happen, which seems like an oversight. Move the NULL pointer check into kmemcheck code (kmemcheck_slab_alloc) so the check gets moved out of the fastpath, when not compiled with CONFIG_KMEMCHECK. This is a step towards sharing post_alloc_hook between SLUB and SLAB, because slab_post_alloc_hook() does not perform this check before calling kmemcheck_slab_alloc(). Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemcheck.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index cab58bb..6f4f424 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -60,6 +60,9 @@ void kmemcheck_free_shadow(struct page *page, int order) void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, size_t size) { + if (unlikely(!object)) /* Skip object if allocation failed */ + return; + /* * Has already been memset(), which initializes the shadow for us * as well. -- cgit v1.1 From d5e3ed66d6f260b3bb68cb5cf0fe79777e8febf0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:47 -0700 Subject: slab: use slab_post_alloc_hook in SLAB allocator shared with SLUB Reviewers notice that the order in slab_post_alloc_hook() of kmemcheck_slab_alloc() and kmemleak_alloc_recursive() gets swapped compared to slab.c / SLAB allocator. Also notice memset now occurs before calling kmemcheck_slab_alloc() and kmemleak_alloc_recursive(). I assume this reordering of kmemcheck, kmemleak and memset is okay because this is the order they are used by the SLUB allocator. This patch completes the sharing of alloc_hook's between SLUB and SLAB. 
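With the hooks shared, the SLAB-side bulk implementations in the following patches can mirror SLUB. For orientation, this is roughly how an in-kernel user is expected to drive the bulk API (a hedged usage fragment, not part of this series; demo_cache and the batch size are made up, and error handling is minimal):

  #include <linux/slab.h>
  #include <linux/kernel.h>
  #include <linux/errno.h>

  static int demo_bulk_roundtrip(struct kmem_cache *demo_cache)
  {
          void *objs[16];
          int nr;

          /* Returns the number of objects allocated, 0 on failure. */
          nr = kmem_cache_alloc_bulk(demo_cache, GFP_KERNEL, ARRAY_SIZE(objs), objs);
          if (!nr)
                  return -ENOMEM;

          /* ... use objs[0..nr-1] ... */

          /*
           * One call frees the whole batch; keep batches modest, since the
           * SLAB implementation below runs with local IRQs disabled.
           */
          kmem_cache_free_bulk(demo_cache, nr, objs);
          return 0;
  }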
Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 1857a65..f872208 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3176,16 +3176,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, out: local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, - flags); - if (likely(ptr)) { - kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); - if (unlikely(flags & __GFP_ZERO)) - memset(ptr, 0, cachep->object_size); - } + if (unlikely(flags & __GFP_ZERO) && ptr) + memset(ptr, 0, cachep->object_size); - memcg_kmem_put_cache(cachep); + slab_post_alloc_hook(cachep, flags, 1, &ptr); return ptr; } @@ -3237,17 +3232,12 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) objp = __do_cache_alloc(cachep, flags); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); - kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, - flags); prefetchw(objp); - if (likely(objp)) { - kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); - if (unlikely(flags & __GFP_ZERO)) - memset(objp, 0, cachep->object_size); - } + if (unlikely(flags & __GFP_ZERO) && objp) + memset(objp, 0, cachep->object_size); - memcg_kmem_put_cache(cachep); + slab_post_alloc_hook(cachep, flags, 1, &objp); return objp; } -- cgit v1.1 From 2a777eac173a53b33f2f7dbed2b61a1f5eb0b531 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:50 -0700 Subject: slab: implement bulk alloc in SLAB allocator This patch implements the alloc side of bulk API for the SLAB allocator. Further optimization are still possible by changing the call to __do_cache_alloc() into something that can return multiple objects. This optimization is left for later, given end results already show in the area of 80% speedup. Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index f872208..0c2e2ca 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3392,9 +3392,42 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) EXPORT_SYMBOL(kmem_cache_free_bulk); int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) + void **p) { - return __kmem_cache_alloc_bulk(s, flags, size, p); + size_t i; + + s = slab_pre_alloc_hook(s, flags); + if (!s) + return 0; + + cache_alloc_debugcheck_before(s, flags); + + local_irq_disable(); + for (i = 0; i < size; i++) { + void *objp = __do_cache_alloc(s, flags); + + /* this call could be done outside IRQ disabled section */ + objp = cache_alloc_debugcheck_after(s, flags, objp, _RET_IP_); + + if (unlikely(!objp)) + goto error; + p[i] = objp; + } + local_irq_enable(); + + /* Clear memory outside IRQ disabled section */ + if (unlikely(flags & __GFP_ZERO)) + for (i = 0; i < size; i++) + memset(p[i], 0, s->object_size); + + slab_post_alloc_hook(s, flags, size, p); + /* FIXME: Trace call missing. 
Christoph would like a bulk variant */ + return size; +error: + local_irq_enable(); + slab_post_alloc_hook(s, flags, i, p); + __kmem_cache_free_bulk(s, i, p); + return 0; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -- cgit v1.1 From 7b0501dd6b1862a83c5c1c65beadce99014b97a4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:53 -0700 Subject: slab: avoid running debug SLAB code with IRQs disabled for alloc_bulk Move the call to cache_alloc_debugcheck_after() outside the IRQ disabled section in kmem_cache_alloc_bulk(). When CONFIG_DEBUG_SLAB is disabled the compiler should remove this code. Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 0c2e2ca..207fa92 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3391,6 +3391,16 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); +static __always_inline void +cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p, unsigned long caller) +{ + size_t i; + + for (i = 0; i < size; i++) + p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller); +} + int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { @@ -3406,15 +3416,14 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, for (i = 0; i < size; i++) { void *objp = __do_cache_alloc(s, flags); - /* this call could be done outside IRQ disabled section */ - objp = cache_alloc_debugcheck_after(s, flags, objp, _RET_IP_); - if (unlikely(!objp)) goto error; p[i] = objp; } local_irq_enable(); + cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); + /* Clear memory outside IRQ disabled section */ if (unlikely(flags & __GFP_ZERO)) for (i = 0; i < size; i++) @@ -3425,6 +3434,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, return size; error: local_irq_enable(); + cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); slab_post_alloc_hook(s, flags, i, p); __kmem_cache_free_bulk(s, i, p); return 0; -- cgit v1.1 From e6cdb58d1c8307f66039b04203e378d89938281c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:53:56 -0700 Subject: slab: implement bulk free in SLAB allocator This patch implements the free side of bulk API for the SLAB allocator kmem_cache_free_bulk(), and concludes the implementation of optimized bulk API for SLAB allocator. Benchmarked[1] cost of alloc+free (obj size 256 bytes) on CPU i7-4790K @ 4.00GHz, with no debug options, no PREEMPT and CONFIG_MEMCG_KMEM=y but no active user of kmemcg. SLAB single alloc+free cost: 87 cycles(tsc) 21.814 ns with this optimized config. 
bulk- Current fallback - optimized SLAB bulk 1 - 102 cycles(tsc) 25.747 ns - 41 cycles(tsc) 10.490 ns - improved 59.8% 2 - 94 cycles(tsc) 23.546 ns - 26 cycles(tsc) 6.567 ns - improved 72.3% 3 - 92 cycles(tsc) 23.127 ns - 20 cycles(tsc) 5.244 ns - improved 78.3% 4 - 90 cycles(tsc) 22.663 ns - 18 cycles(tsc) 4.588 ns - improved 80.0% 8 - 88 cycles(tsc) 22.242 ns - 14 cycles(tsc) 3.656 ns - improved 84.1% 16 - 88 cycles(tsc) 22.010 ns - 13 cycles(tsc) 3.480 ns - improved 85.2% 30 - 89 cycles(tsc) 22.305 ns - 13 cycles(tsc) 3.303 ns - improved 85.4% 32 - 89 cycles(tsc) 22.277 ns - 13 cycles(tsc) 3.309 ns - improved 85.4% 34 - 88 cycles(tsc) 22.246 ns - 13 cycles(tsc) 3.294 ns - improved 85.2% 48 - 88 cycles(tsc) 22.121 ns - 13 cycles(tsc) 3.492 ns - improved 85.2% 64 - 88 cycles(tsc) 22.052 ns - 13 cycles(tsc) 3.411 ns - improved 85.2% 128 - 89 cycles(tsc) 22.452 ns - 15 cycles(tsc) 3.841 ns - improved 83.1% 158 - 89 cycles(tsc) 22.403 ns - 14 cycles(tsc) 3.746 ns - improved 84.3% 250 - 91 cycles(tsc) 22.775 ns - 16 cycles(tsc) 4.111 ns - improved 82.4% Notice it is not recommended to do very large bulk operation with this bulk API, because local IRQs are disabled in this period. [1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/mm/slab_bulk_test01.c Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 207fa92..2e075c0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3385,12 +3385,6 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); -void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) -{ - __kmem_cache_free_bulk(s, size, p); -} -EXPORT_SYMBOL(kmem_cache_free_bulk); - static __always_inline void cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p, unsigned long caller) @@ -3584,6 +3578,29 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) +{ + struct kmem_cache *s; + size_t i; + + local_irq_disable(); + for (i = 0; i < size; i++) { + void *objp = p[i]; + + s = cache_from_obj(orig_s, objp); + + debug_check_no_locks_freed(objp, s->object_size); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, s->object_size); + + __cache_free(s, objp, _RET_IP_); + } + local_irq_enable(); + + /* FIXME: add tracing */ +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + /** * kfree - free previously allocated memory * @objp: pointer returned by kmalloc. -- cgit v1.1 From ca257195511d536308700548de008b51729221eb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:54:00 -0700 Subject: mm: new API kfree_bulk() for SLAB+SLUB allocators This patch introduce a new API call kfree_bulk() for bulk freeing memory objects not bound to a single kmem_cache. Christoph pointed out that it is possible to implement freeing of objects, without knowing the kmem_cache pointer as that information is available from the object's page->slab_cache. Proposing to remove the kmem_cache argument from the bulk free API. Jesper demonstrated that these extra steps per object comes at a performance cost. 
It is only in the case CONFIG_MEMCG_KMEM is compiled in and activated runtime that these steps are done anyhow. The extra cost is most visible for SLAB allocator, because the SLUB allocator does the page lookup (virt_to_head_page()) anyhow. Thus, the conclusion was to keep the kmem_cache free bulk API with a kmem_cache pointer, but we can still implement a kfree_bulk() API fairly easily. Simply by handling if kmem_cache_free_bulk() gets called with a kmem_cache NULL pointer. This does increase the code size a bit, but implementing a separate kfree_bulk() call would likely increase code size even more. Below benchmarks cost of alloc+free (obj size 256 bytes) on CPU i7-4790K @ 4.00GHz, no PREEMPT and CONFIG_MEMCG_KMEM=y. Code size increase for SLAB: add/remove: 0/0 grow/shrink: 1/0 up/down: 74/0 (74) function old new delta kmem_cache_free_bulk 660 734 +74 SLAB fastpath: 87 cycles(tsc) 21.814 sz - fallback - kmem_cache_free_bulk - kfree_bulk 1 - 103 cycles 25.878 ns - 41 cycles 10.498 ns - 81 cycles 20.312 ns 2 - 94 cycles 23.673 ns - 26 cycles 6.682 ns - 42 cycles 10.649 ns 3 - 92 cycles 23.181 ns - 21 cycles 5.325 ns - 39 cycles 9.950 ns 4 - 90 cycles 22.727 ns - 18 cycles 4.673 ns - 26 cycles 6.693 ns 8 - 89 cycles 22.270 ns - 14 cycles 3.664 ns - 23 cycles 5.835 ns 16 - 88 cycles 22.038 ns - 14 cycles 3.503 ns - 22 cycles 5.543 ns 30 - 89 cycles 22.284 ns - 13 cycles 3.310 ns - 20 cycles 5.197 ns 32 - 88 cycles 22.249 ns - 13 cycles 3.420 ns - 20 cycles 5.166 ns 34 - 88 cycles 22.224 ns - 14 cycles 3.643 ns - 20 cycles 5.170 ns 48 - 88 cycles 22.088 ns - 14 cycles 3.507 ns - 20 cycles 5.203 ns 64 - 88 cycles 22.063 ns - 13 cycles 3.428 ns - 20 cycles 5.152 ns 128 - 89 cycles 22.483 ns - 15 cycles 3.891 ns - 23 cycles 5.885 ns 158 - 89 cycles 22.381 ns - 15 cycles 3.779 ns - 22 cycles 5.548 ns 250 - 91 cycles 22.798 ns - 16 cycles 4.152 ns - 23 cycles 5.967 ns SLAB when enabling MEMCG_KMEM runtime: - kmemcg fastpath: 130 cycles(tsc) 32.684 ns (step:0) 1 - 148 cycles 37.220 ns - 66 cycles 16.622 ns - 66 cycles 16.583 ns 2 - 141 cycles 35.510 ns - 51 cycles 12.820 ns - 58 cycles 14.625 ns 3 - 140 cycles 35.017 ns - 37 cycles 9.326 ns - 33 cycles 8.474 ns 4 - 137 cycles 34.507 ns - 31 cycles 7.888 ns - 33 cycles 8.300 ns 8 - 140 cycles 35.069 ns - 25 cycles 6.461 ns - 25 cycles 6.436 ns 16 - 138 cycles 34.542 ns - 23 cycles 5.945 ns - 22 cycles 5.670 ns 30 - 136 cycles 34.227 ns - 22 cycles 5.502 ns - 22 cycles 5.587 ns 32 - 136 cycles 34.253 ns - 21 cycles 5.475 ns - 21 cycles 5.324 ns 34 - 136 cycles 34.254 ns - 21 cycles 5.448 ns - 20 cycles 5.194 ns 48 - 136 cycles 34.075 ns - 21 cycles 5.458 ns - 21 cycles 5.367 ns 64 - 135 cycles 33.994 ns - 21 cycles 5.350 ns - 21 cycles 5.259 ns 128 - 137 cycles 34.446 ns - 23 cycles 5.816 ns - 22 cycles 5.688 ns 158 - 137 cycles 34.379 ns - 22 cycles 5.727 ns - 22 cycles 5.602 ns 250 - 138 cycles 34.755 ns - 24 cycles 6.093 ns - 23 cycles 5.986 ns Code size increase for SLUB: function old new delta kmem_cache_free_bulk 717 799 +82 SLUB benchmark: SLUB fastpath: 46 cycles(tsc) 11.691 ns (step:0) sz - fallback - kmem_cache_free_bulk - kfree_bulk 1 - 61 cycles 15.486 ns - 53 cycles 13.364 ns - 57 cycles 14.464 ns 2 - 54 cycles 13.703 ns - 32 cycles 8.110 ns - 33 cycles 8.482 ns 3 - 53 cycles 13.272 ns - 25 cycles 6.362 ns - 27 cycles 6.947 ns 4 - 51 cycles 12.994 ns - 24 cycles 6.087 ns - 24 cycles 6.078 ns 8 - 50 cycles 12.576 ns - 21 cycles 5.354 ns - 22 cycles 5.513 ns 16 - 49 cycles 12.368 ns - 20 cycles 5.054 ns - 20 cycles 5.042 ns 30 - 49 cycles 
12.273 ns - 18 cycles 4.748 ns - 19 cycles 4.758 ns 32 - 49 cycles 12.401 ns - 19 cycles 4.821 ns - 19 cycles 4.810 ns 34 - 98 cycles 24.519 ns - 24 cycles 6.154 ns - 24 cycles 6.157 ns 48 - 83 cycles 20.833 ns - 21 cycles 5.446 ns - 21 cycles 5.429 ns 64 - 75 cycles 18.891 ns - 20 cycles 5.247 ns - 20 cycles 5.238 ns 128 - 93 cycles 23.271 ns - 27 cycles 6.856 ns - 27 cycles 6.823 ns 158 - 102 cycles 25.581 ns - 30 cycles 7.714 ns - 30 cycles 7.695 ns 250 - 107 cycles 26.917 ns - 38 cycles 9.514 ns - 38 cycles 9.506 ns SLUB when enabling MEMCG_KMEM runtime: - kmemcg fastpath: 71 cycles(tsc) 17.897 ns (step:0) 1 - 85 cycles 21.484 ns - 78 cycles 19.569 ns - 75 cycles 18.938 ns 2 - 81 cycles 20.363 ns - 45 cycles 11.258 ns - 44 cycles 11.076 ns 3 - 78 cycles 19.709 ns - 33 cycles 8.354 ns - 32 cycles 8.044 ns 4 - 77 cycles 19.430 ns - 28 cycles 7.216 ns - 28 cycles 7.003 ns 8 - 101 cycles 25.288 ns - 23 cycles 5.849 ns - 23 cycles 5.787 ns 16 - 76 cycles 19.148 ns - 20 cycles 5.162 ns - 20 cycles 5.081 ns 30 - 76 cycles 19.067 ns - 19 cycles 4.868 ns - 19 cycles 4.821 ns 32 - 76 cycles 19.052 ns - 19 cycles 4.857 ns - 19 cycles 4.815 ns 34 - 121 cycles 30.291 ns - 25 cycles 6.333 ns - 25 cycles 6.268 ns 48 - 108 cycles 27.111 ns - 21 cycles 5.498 ns - 21 cycles 5.458 ns 64 - 100 cycles 25.164 ns - 20 cycles 5.242 ns - 20 cycles 5.229 ns 128 - 155 cycles 38.976 ns - 27 cycles 6.886 ns - 27 cycles 6.892 ns 158 - 132 cycles 33.034 ns - 30 cycles 7.711 ns - 30 cycles 7.728 ns 250 - 130 cycles 32.612 ns - 38 cycles 9.560 ns - 38 cycles 9.549 ns Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 5 ++++- mm/slab_common.c | 8 ++++++-- mm/slub.c | 21 ++++++++++++++++++--- 3 files changed, 28 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 2e075c0..b3eca03 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3587,7 +3587,10 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) for (i = 0; i < size; i++) { void *objp = p[i]; - s = cache_from_obj(orig_s, objp); + if (!orig_s) /* called via kfree_bulk */ + s = virt_to_cache(objp); + else + s = cache_from_obj(orig_s, objp); debug_check_no_locks_freed(objp, s->object_size); if (!(s->flags & SLAB_DEBUG_OBJECTS)) diff --git a/mm/slab_common.c b/mm/slab_common.c index 065b7bd..6afb226 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -109,8 +109,12 @@ void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) { size_t i; - for (i = 0; i < nr; i++) - kmem_cache_free(s, p[i]); + for (i = 0; i < nr; i++) { + if (s) + kmem_cache_free(s, p[i]); + else + kfree(p[i]); + } } int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, diff --git a/mm/slub.c b/mm/slub.c index 6dd04c0..9620815 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2783,23 +2783,38 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, size_t first_skipped_index = 0; int lookahead = 3; void *object; + struct page *page; /* Always re-init detached_freelist */ df->page = NULL; do { object = p[--size]; + /* Do we need !ZERO_OR_NULL_PTR(object) here? 
(for kfree) */ } while (!object && size); if (!object) return 0; - /* Support for memcg, compiler can optimize this out */ - df->s = cache_from_obj(s, object); + page = virt_to_head_page(object); + if (!s) { + /* Handle kalloc'ed objects */ + if (unlikely(!PageSlab(page))) { + BUG_ON(!PageCompound(page)); + kfree_hook(object); + __free_kmem_pages(page, compound_order(page)); + p[size] = NULL; /* mark object processed */ + return size; + } + /* Derive kmem_cache from object */ + df->s = page->slab_cache; + } else { + df->s = cache_from_obj(s, object); /* Support for memcg */ + } /* Start new detached freelist */ + df->page = page; set_freepointer(df->s, object, NULL); - df->page = virt_to_head_page(object); df->tail = object; df->freelist = object; p[size] = NULL; /* mark object processed */ -- cgit v1.1 From 9f706d6820d3ea776d6b3fc0c1de9f81eb0d021b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Mar 2016 14:54:03 -0700 Subject: mm: fix some spelling Fix up trivial spelling errors, noticed while reading the code. Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index 6c7f16a..e880bbe 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -172,7 +172,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, /* * Generic implementation of bulk operations * These are useful for situations in which the allocator cannot - * perform optimizations. In that case segments of the objecct listed + * perform optimizations. In that case segments of the object listed * may be allocated or freed using these operations. */ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); -- cgit v1.1 From 12c61fe9b763812adac44522100527caf534462e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:06 -0700 Subject: mm/slab: fix stale code comment This patchset implements a new freed object management way, that is, OBJFREELIST_SLAB. Purpose of it is to reduce memory overhead in SLAB. SLAB needs a array to manage freed objects in a slab. If there is leftover after objects are packed into a slab, we can use it as a management array, and, in this case, there is no memory waste. But, in the other cases, we need to allocate extra memory for a management array or utilize dedicated internal memory in a slab for it. Both cases causes memory waste so it's not good. With this patchset, freed object itself can be used for a management array. So, memory waste could be reduced. Detailed idea and numbers are described in last patch's commit description. Please refer it. In fact, I tested another idea implementing OBJFREELIST_SLAB with extendable linked array through another freed object. It can remove memory waste completely but it causes more computational overhead in critical lock path and it seems that overhead outweigh benefit. So, this patchset doesn't include it. I will attach prototype just for a reference. This patch (of 16): We use freelist_idx_t type for free object management whose size would be smaller than size of unsigned int. Fix it. 
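The comment being fixed matters because the index type directly bounds how many objects fit per slab: freelist_idx_t is one or two bytes depending on configuration, against the four bytes the old comment implied. A rough back-of-the-envelope comparison (plain userspace arithmetic, ignoring alignment, colouring and off-slab management):

  #include <stdio.h>
  #include <stdint.h>

  /* Objects per slab when each object also needs one freelist index entry. */
  static size_t objs_per_slab(size_t slab, size_t obj, size_t idx)
  {
          return slab / (obj + idx);
  }

  int main(void)
  {
          const size_t slab_size = 4096, object_size = 32;

          printf("unsigned int index : %zu objects\n",
                 objs_per_slab(slab_size, object_size, sizeof(unsigned int)));
          printf("one-byte index     : %zu objects\n",
                 objs_per_slab(slab_size, object_size, sizeof(uint8_t)));
          return 0;
  }

For 32-byte objects in a 4 KiB slab this is 124 objects instead of 113, the same 124 that appears in the size/num table later in this series.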
Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index b3eca03..23fc4b0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -537,7 +537,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * on it. For the latter case, the memory allocated for a * slab is used for: * - * - One unsigned int for each object + * - One freelist_idx_t for each object * - Padding to respect alignment of @align * - @buffer_size bytes for each object * -- cgit v1.1 From 6fb924304ac35f1ab9f3abe73527efcd5156131f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:09 -0700 Subject: mm/slab: remove useless structure define It is obsolete so remove it. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 23fc4b0..3634dc1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -225,16 +225,6 @@ static inline void clear_obj_pfmemalloc(void **objp) } /* - * bootstrap: The caches do not work without cpuarrays anymore, but the - * cpuarrays are allocated from the generic caches... - */ -#define BOOT_CPUCACHE_ENTRIES 1 -struct arraycache_init { - struct array_cache cache; - void *entries[BOOT_CPUCACHE_ENTRIES]; -}; - -/* * Need this for bootstrapping a per node allocator. */ #define NUM_INIT_LISTS (2 * MAX_NUMNODES) @@ -457,6 +447,7 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache, return reciprocal_divide(offset, cache->reciprocal_buffer_size); } +#define BOOT_CPUCACHE_ENTRIES 1 /* internal cache of cache description objs */ static struct kmem_cache kmem_cache_boot = { .batchcount = 1, -- cgit v1.1 From 260b61dd46ed07f517e4059ab9881402cbd6a385 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:12 -0700 Subject: mm/slab: remove the checks for slab implementation bug Some of "#if DEBUG" are for reporting slab implementation bug rather than user usecase bug. It's not really needed because slab is stable for a quite long time and it makes code too dirty. This patch remove it. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 3634dc1..14c3f9c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2110,8 +2110,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; #endif - if (flags & SLAB_DESTROY_BY_RCU) - BUG_ON(flags & SLAB_POISON); #endif /* @@ -2368,9 +2366,6 @@ static int drain_freelist(struct kmem_cache *cache, } page = list_entry(p, struct page, lru); -#if DEBUG - BUG_ON(page->active); -#endif list_del(&page->lru); /* * Safe to drop the lock. 
The slab is no longer linked @@ -2528,30 +2523,23 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) } } -static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, - int nodeid) +static void *slab_get_obj(struct kmem_cache *cachep, struct page *page) { void *objp; objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); page->active++; -#if DEBUG - WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); -#endif return objp; } -static void slab_put_obj(struct kmem_cache *cachep, struct page *page, - void *objp, int nodeid) +static void slab_put_obj(struct kmem_cache *cachep, + struct page *page, void *objp) { unsigned int objnr = obj_to_index(cachep, page, objp); #if DEBUG unsigned int i; - /* Verify that the slab belongs to the intended node */ - WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); - /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { @@ -2817,8 +2805,7 @@ retry: STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac_put_obj(cachep, ac, slab_get_obj(cachep, page, - node)); + ac_put_obj(cachep, ac, slab_get_obj(cachep, page)); } /* move slabp to correct slabp list: */ @@ -3101,7 +3088,7 @@ retry: BUG_ON(page->active == cachep->num); - obj = slab_get_obj(cachep, page, nodeid); + obj = slab_get_obj(cachep, page); n->free_objects--; /* move slabp to correct slabp list: */ list_del(&page->lru); @@ -3252,7 +3239,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, page = virt_to_head_page(objp); list_del(&page->lru); check_spinlock_acquired_node(cachep, node); - slab_put_obj(cachep, page, objp, node); + slab_put_obj(cachep, page, objp); STATS_DEC_ACTIVE(cachep); n->free_objects++; @@ -3282,9 +3269,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) LIST_HEAD(list); batchcount = ac->batchcount; -#if DEBUG - BUG_ON(!batchcount || batchcount > ac->avail); -#endif + check_irq_off(); n = get_node(cachep, node); spin_lock(&n->list_lock); -- cgit v1.1 From a307ebd468e0b97c203f5a99a56a6017e4d1991a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:15 -0700 Subject: mm/slab: activate debug_pagealloc in SLAB when it is actually enabled Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 14c3f9c..4807cf4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1838,7 +1838,8 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if (cachep->size % PAGE_SIZE == 0 && + if (debug_pagealloc_enabled() && + cachep->size % PAGE_SIZE == 0 && OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, 1); @@ -2176,7 +2177,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * to check size >= 256. It guarantees that all necessary small * sized slab is initialized in current slab initialization sequence. 
*/ - if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) && + if (debug_pagealloc_enabled() && + !slab_early_init && size >= kmalloc_size(INDEX_NODE) && size >= 256 && cachep->object_size > cache_line_size() && ALIGN(size, cachep->align) < PAGE_SIZE) { cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); @@ -2232,7 +2234,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * poisoning, then it's going to smash the contents of * the redzone and userword anyhow, so switch them off. */ - if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) + if (debug_pagealloc_enabled() && + size % PAGE_SIZE == 0 && flags & SLAB_POISON) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); #endif } @@ -2716,7 +2719,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, set_obj_status(page, objnr, OBJECT_FREE); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { + if (debug_pagealloc_enabled() && + (cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { store_stackinfo(cachep, objp, caller); kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, 0); @@ -2861,7 +2865,8 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, return objp; if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) + if (debug_pagealloc_enabled() && + (cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, 1); else -- cgit v1.1 From 40323278b557a5909bbecfa181c91a3af7afbbe3 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:18 -0700 Subject: mm/slab: use more appropriate condition check for debug_pagealloc debug_pagealloc debugging is related to SLAB_POISON flag rather than FORCED_DEBUG option, although FORCED_DEBUG option will enable SLAB_POISON. Fix it. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 4807cf4..8bca9be 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2169,7 +2169,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else size += BYTES_PER_WORD; } -#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) /* * To activate debug pagealloc, off-slab management is necessary * requirement. In early phase of initialization, small sized slab @@ -2177,7 +2176,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * to check size >= 256. It guarantees that all necessary small * sized slab is initialized in current slab initialization sequence. */ - if (debug_pagealloc_enabled() && + if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && !slab_early_init && size >= kmalloc_size(INDEX_NODE) && size >= 256 && cachep->object_size > cache_line_size() && ALIGN(size, cachep->align) < PAGE_SIZE) { @@ -2185,7 +2184,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size = PAGE_SIZE; } #endif -#endif /* * Determine if the slab management is 'on' or 'off' slab. 
-- cgit v1.1 From 40b44137971c2e5865a78f9f7de274449983ccb5 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:21 -0700 Subject: mm/slab: clean up DEBUG_PAGEALLOC processing code Currently, open code for checking DEBUG_PAGEALLOC cache is spread to some sites. It makes code unreadable and hard to change. This patch cleans up this code. The following patch will change the criteria for DEBUG_PAGEALLOC cache so this clean-up will help it, too. [akpm@linux-foundation.org: fix build with CONFIG_DEBUG_PAGEALLOC=n] Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 97 ++++++++++++++++++++++++++++++++------------------------------- 1 file changed, 49 insertions(+), 48 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 8bca9be..3142ec3 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1661,6 +1661,14 @@ static void kmem_rcu_free(struct rcu_head *head) } #if DEBUG +static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) +{ + if (debug_pagealloc_enabled() && OFF_SLAB(cachep) && + (cachep->size % PAGE_SIZE) == 0) + return true; + + return false; +} #ifdef CONFIG_DEBUG_PAGEALLOC static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, @@ -1694,6 +1702,23 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, } *addr++ = 0x87654321; } + +static void slab_kernel_map(struct kmem_cache *cachep, void *objp, + int map, unsigned long caller) +{ + if (!is_debug_pagealloc_cache(cachep)) + return; + + if (caller) + store_stackinfo(cachep, objp, caller); + + kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); +} + +#else +static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, + int map, unsigned long caller) {} + #endif static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) @@ -1772,6 +1797,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) int size, i; int lines = 0; + if (is_debug_pagealloc_cache(cachep)) + return; + realobj = (char *)objp + obj_offset(cachep); size = cachep->object_size; @@ -1837,17 +1865,8 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, void *objp = index_to_obj(cachep, page, i); if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled() && - cachep->size % PAGE_SIZE == 0 && - OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 1); - else - check_poison_obj(cachep, objp); -#else check_poison_obj(cachep, objp); -#endif + slab_kernel_map(cachep, objp, 1, 0); } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) @@ -2226,16 +2245,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ freelist_size = calculate_freelist_size(cachep->num, 0); - -#ifdef CONFIG_PAGE_POISONING - /* If we're going to use the generic kernel_map_pages() - * poisoning, then it's going to smash the contents of - * the redzone and userword anyhow, so switch them off. 
- */ - if (debug_pagealloc_enabled() && - size % PAGE_SIZE == 0 && flags & SLAB_POISON) - flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); -#endif } cachep->colour_off = cache_line_size(); @@ -2251,7 +2260,19 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); - if (flags & CFLGS_OFF_SLAB) { +#if DEBUG + /* + * If we're going to use the generic kernel_map_pages() + * poisoning, then it's going to smash the contents of + * the redzone and userword anyhow, so switch them off. + */ + if (IS_ENABLED(CONFIG_PAGE_POISONING) && + (cachep->flags & SLAB_POISON) && + is_debug_pagealloc_cache(cachep)) + cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); +#endif + + if (OFF_SLAB(cachep)) { cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); /* * This is a possibility for one of the kmalloc_{dma,}_caches. @@ -2475,9 +2496,6 @@ static void cache_init_objs(struct kmem_cache *cachep, for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, page, i); #if DEBUG - /* need to poison the objs? */ - if (cachep->flags & SLAB_POISON) - poison_obj(cachep, objp, POISON_FREE); if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = NULL; @@ -2501,10 +2519,11 @@ static void cache_init_objs(struct kmem_cache *cachep, slab_error(cachep, "constructor overwrote the" " start of an object"); } - if ((cachep->size % PAGE_SIZE) == 0 && - OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 0); + /* need to poison the objs? */ + if (cachep->flags & SLAB_POISON) { + poison_obj(cachep, objp, POISON_FREE); + slab_kernel_map(cachep, objp, 0, 0); + } #else if (cachep->ctor) cachep->ctor(objp); @@ -2716,18 +2735,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, set_obj_status(page, objnr, OBJECT_FREE); if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled() && - (cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { - store_stackinfo(cachep, objp, caller); - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 0); - } else { - poison_obj(cachep, objp, POISON_FREE); - } -#else poison_obj(cachep, objp, POISON_FREE); -#endif + slab_kernel_map(cachep, objp, 0, caller); } return objp; } @@ -2862,16 +2871,8 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, if (!objp) return objp; if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if (debug_pagealloc_enabled() && - (cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 1); - else - check_poison_obj(cachep, objp); -#else check_poison_obj(cachep, objp); -#endif + slab_kernel_map(cachep, objp, 1, 0); poison_obj(cachep, objp, POISON_INUSE); } if (cachep->flags & SLAB_STORE_USER) -- cgit v1.1 From d31676dfde257cb2b3e52d4e657d8ad2251e4d49 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:24 -0700 Subject: mm/slab: alternative implementation for DEBUG_SLAB_LEAK DEBUG_SLAB_LEAK is a debug option. It's current implementation requires status buffer so we need more memory to use it. And, it cause kmem_cache initialization step more complex. To remove this extra memory usage and to simplify initialization step, this patch implement this feature with another way. When user requests to get slab object owner information, it marks that getting information is started. 
And then, all free objects in caches are flushed to corresponding slab page. Now, we can distinguish all freed object so we can know all allocated objects, too. After collecting slab object owner information on allocated objects, mark is checked that there is no free during the processing. If true, we can be sure that our information is correct so information is returned to user. Although this way is rather complex, it has two important benefits mentioned above. So, I think it is worth changing. There is one drawback that it takes more time to get slab object owner information but it is just a debug option so it doesn't matter at all. To help review, this patch implements new way only. Following patch will remove useless code. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 63 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 3142ec3..907abe9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -396,20 +396,25 @@ static void set_obj_status(struct page *page, int idx, int val) status[idx] = val; } -static inline unsigned int get_obj_status(struct page *page, int idx) +static inline bool is_store_user_clean(struct kmem_cache *cachep) { - int freelist_size; - char *status; - struct kmem_cache *cachep = page->slab_cache; + return atomic_read(&cachep->store_user_clean) == 1; +} - freelist_size = cachep->num * sizeof(freelist_idx_t); - status = (char *)page->freelist + freelist_size; +static inline void set_store_user_clean(struct kmem_cache *cachep) +{ + atomic_set(&cachep->store_user_clean, 1); +} - return status[idx]; +static inline void set_store_user_dirty(struct kmem_cache *cachep) +{ + if (is_store_user_clean(cachep)) + atomic_set(&cachep->store_user_clean, 0); } #else static inline void set_obj_status(struct page *page, int idx, int val) {} +static inline void set_store_user_dirty(struct kmem_cache *cachep) {} #endif @@ -2550,6 +2555,11 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct page *page) objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); page->active++; +#if DEBUG + if (cachep->flags & SLAB_STORE_USER) + set_store_user_dirty(cachep); +#endif + return objp; } @@ -2725,8 +2735,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, *dbg_redzone1(cachep, objp) = RED_INACTIVE; *dbg_redzone2(cachep, objp) = RED_INACTIVE; } - if (cachep->flags & SLAB_STORE_USER) + if (cachep->flags & SLAB_STORE_USER) { + set_store_user_dirty(cachep); *dbg_userword(cachep, objp) = (void *)caller; + } objnr = obj_to_index(cachep, page, objp); @@ -4119,15 +4131,34 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct page *page) { void *p; - int i; + int i, j; + unsigned long v; if (n[0] == n[1]) return; for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { - if (get_obj_status(page, i) != OBJECT_ACTIVE) + bool active = true; + + for (j = page->active; j < c->num; j++) { + if (get_free_obj(page, j) == i) { + active = false; + break; + } + } + + if (!active) continue; - if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) + /* + * probe_kernel_read() is used for DEBUG_PAGEALLOC. page table + * mapping is established when actual object allocation and + * we could mistakenly access the unmapped object in the cpu + * cache. 
+ */ + if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v))) + continue; + + if (!add_caller(n, v)) return; } } @@ -4163,21 +4194,31 @@ static int leaks_show(struct seq_file *m, void *p) if (!(cachep->flags & SLAB_RED_ZONE)) return 0; - /* OK, we can do it */ + /* + * Set store_user_clean and start to grab stored user information + * for all objects on this cache. If some alloc/free requests comes + * during the processing, information would be wrong so restart + * whole processing. + */ + do { + set_store_user_clean(cachep); + drain_cpu_caches(cachep); + + x[1] = 0; - x[1] = 0; + for_each_kmem_cache_node(cachep, node, n) { - for_each_kmem_cache_node(cachep, node, n) { + check_irq_on(); + spin_lock_irq(&n->list_lock); - check_irq_on(); - spin_lock_irq(&n->list_lock); + list_for_each_entry(page, &n->slabs_full, lru) + handle_slab(x, cachep, page); + list_for_each_entry(page, &n->slabs_partial, lru) + handle_slab(x, cachep, page); + spin_unlock_irq(&n->list_lock); + } + } while (!is_store_user_clean(cachep)); - list_for_each_entry(page, &n->slabs_full, lru) - handle_slab(x, cachep, page); - list_for_each_entry(page, &n->slabs_partial, lru) - handle_slab(x, cachep, page); - spin_unlock_irq(&n->list_lock); - } name = cachep->name; if (x[0] == x[1]) { /* Increase the buffer size */ -- cgit v1.1 From 249247b6f8ee362189a2f2bf598a14ff6c95fb4c Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:27 -0700 Subject: mm/slab: remove object status buffer for DEBUG_SLAB_LEAK Now, we don't use object status buffer in any setup. Remove it. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 34 ++-------------------------------- 1 file changed, 2 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 907abe9..02be9d9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -380,22 +380,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif -#define OBJECT_FREE (0) -#define OBJECT_ACTIVE (1) - #ifdef CONFIG_DEBUG_SLAB_LEAK -static void set_obj_status(struct page *page, int idx, int val) -{ - int freelist_size; - char *status; - struct kmem_cache *cachep = page->slab_cache; - - freelist_size = cachep->num * sizeof(freelist_idx_t); - status = (char *)page->freelist + freelist_size; - status[idx] = val; -} - static inline bool is_store_user_clean(struct kmem_cache *cachep) { return atomic_read(&cachep->store_user_clean) == 1; @@ -413,7 +399,6 @@ static inline void set_store_user_dirty(struct kmem_cache *cachep) } #else -static inline void set_obj_status(struct page *page, int idx, int val) {} static inline void set_store_user_dirty(struct kmem_cache *cachep) {} #endif @@ -476,9 +461,6 @@ static size_t calculate_freelist_size(int nr_objs, size_t align) size_t freelist_size; freelist_size = nr_objs * sizeof(freelist_idx_t); - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - freelist_size += nr_objs * sizeof(char); - if (align) freelist_size = ALIGN(freelist_size, align); @@ -491,10 +473,7 @@ static int calculate_nr_objs(size_t slab_size, size_t buffer_size, int nr_objs; size_t remained_size; size_t freelist_size; - int extra_space = 0; - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - extra_space = sizeof(char); /* * Ignore padding for the initial guess. 
The padding * is at most @align-1 bytes, and @buffer_size is at @@ -503,7 +482,7 @@ static int calculate_nr_objs(size_t slab_size, size_t buffer_size, * into the memory allocation when taking the padding * into account. */ - nr_objs = slab_size / (buffer_size + idx_size + extra_space); + nr_objs = slab_size / (buffer_size + idx_size); /* * This calculated number will be either the right @@ -1961,16 +1940,13 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, break; if (flags & CFLGS_OFF_SLAB) { - size_t freelist_size_per_obj = sizeof(freelist_idx_t); /* * Max number of objs-per-slab for caches which * use off-slab slabs. Needed to avoid a possible * looping condition in cache_grow(). */ - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - freelist_size_per_obj += sizeof(char); offslab_limit = size; - offslab_limit /= freelist_size_per_obj; + offslab_limit /= sizeof(freelist_idx_t); if (num > offslab_limit) break; @@ -2533,7 +2509,6 @@ static void cache_init_objs(struct kmem_cache *cachep, if (cachep->ctor) cachep->ctor(objp); #endif - set_obj_status(page, i, OBJECT_FREE); set_free_obj(page, i, i); } } @@ -2745,7 +2720,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, page, objnr)); - set_obj_status(page, objnr, OBJECT_FREE); if (cachep->flags & SLAB_POISON) { poison_obj(cachep, objp, POISON_FREE); slab_kernel_map(cachep, objp, 0, caller); @@ -2878,8 +2852,6 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { - struct page *page; - if (!objp) return objp; if (cachep->flags & SLAB_POISON) { @@ -2904,8 +2876,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, *dbg_redzone2(cachep, objp) = RED_ACTIVE; } - page = virt_to_head_page(objp); - set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); -- cgit v1.1 From 2e6b3602168797fd4d80d86d208c4ba8fcfa3b8b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:30 -0700 Subject: mm/slab: put the freelist at the end of slab page Currently, the freelist is at the front of slab page. This requires extra space to meet object alignment requirement. If we put the freelist at the end of a slab page, objects could start at page boundary and will be at correct alignment. This is possible because freelist has no alignment constraint itself. This gives us two benefits: It removes extra memory space for the freelist alignment and remove complex calculation at cache initialization step. I can't think notable drawback here. I mentioned that this would reduce extra memory space, but, this benefit is rather theoretical because it can be applied to very few cases. Following is the example cache type that can get benefit from this change. size align num before after 32 8 124 4100 4092 64 8 63 4103 4095 88 8 46 4102 4094 272 8 15 4103 4095 408 8 10 4098 4090 32 16 124 4108 4092 64 16 63 4111 4095 32 32 124 4124 4092 64 32 63 4127 4095 96 32 42 4106 4074 before means whole size for objects and aligned freelist before applying patch and after shows the result of this patch. Since before is more than 4096, number of object should decrease and memory waste happens. Anyway, this patch removes complex calculation so looks beneficial to me. 
[akpm@linux-foundation.org: fix kerneldoc] Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 90 ++++++++++++++++----------------------------------------------- 1 file changed, 22 insertions(+), 68 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 02be9d9..b3d91b0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -456,55 +456,12 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return this_cpu_ptr(cachep->cpu_cache); } -static size_t calculate_freelist_size(int nr_objs, size_t align) -{ - size_t freelist_size; - - freelist_size = nr_objs * sizeof(freelist_idx_t); - if (align) - freelist_size = ALIGN(freelist_size, align); - - return freelist_size; -} - -static int calculate_nr_objs(size_t slab_size, size_t buffer_size, - size_t idx_size, size_t align) -{ - int nr_objs; - size_t remained_size; - size_t freelist_size; - - /* - * Ignore padding for the initial guess. The padding - * is at most @align-1 bytes, and @buffer_size is at - * least @align. In the worst case, this result will - * be one greater than the number of objects that fit - * into the memory allocation when taking the padding - * into account. - */ - nr_objs = slab_size / (buffer_size + idx_size); - - /* - * This calculated number will be either the right - * amount, or one greater than what we want. - */ - remained_size = slab_size - nr_objs * buffer_size; - freelist_size = calculate_freelist_size(nr_objs, align); - if (remained_size < freelist_size) - nr_objs--; - - return nr_objs; -} - /* * Calculate the number of objects and left-over bytes for a given buffer size. */ static void cache_estimate(unsigned long gfporder, size_t buffer_size, - size_t align, int flags, size_t *left_over, - unsigned int *num) + unsigned long flags, size_t *left_over, unsigned int *num) { - int nr_objs; - size_t mgmt_size; size_t slab_size = PAGE_SIZE << gfporder; /* @@ -512,9 +469,12 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * on it. For the latter case, the memory allocated for a * slab is used for: * - * - One freelist_idx_t for each object - * - Padding to respect alignment of @align * - @buffer_size bytes for each object + * - One freelist_idx_t for each object + * + * We don't need to consider alignment of freelist because + * freelist will be at the end of slab page. The objects will be + * at the correct alignment. * * If the slab management structure is off the slab, then the * alignment will already be calculated into the size. Because @@ -522,16 +482,13 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * correct alignment when allocated. 
*/ if (flags & CFLGS_OFF_SLAB) { - mgmt_size = 0; - nr_objs = slab_size / buffer_size; - + *num = slab_size / buffer_size; + *left_over = slab_size % buffer_size; } else { - nr_objs = calculate_nr_objs(slab_size, buffer_size, - sizeof(freelist_idx_t), align); - mgmt_size = calculate_freelist_size(nr_objs, align); + *num = slab_size / (buffer_size + sizeof(freelist_idx_t)); + *left_over = slab_size % + (buffer_size + sizeof(freelist_idx_t)); } - *num = nr_objs; - *left_over = slab_size - nr_objs*buffer_size - mgmt_size; } #if DEBUG @@ -1911,7 +1868,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created * @size: size of objects to be created in this cache. - * @align: required alignment for the objects. * @flags: slab allocation flags * * Also calculates the number of objects per slab. @@ -1921,7 +1877,7 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * towards high-order requests, this should be changed. */ static size_t calculate_slab_order(struct kmem_cache *cachep, - size_t size, size_t align, unsigned long flags) + size_t size, unsigned long flags) { unsigned long offslab_limit; size_t left_over = 0; @@ -1931,7 +1887,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, unsigned int num; size_t remainder; - cache_estimate(gfporder, size, align, flags, &remainder, &num); + cache_estimate(gfporder, size, flags, &remainder, &num); if (!num) continue; @@ -2207,12 +2163,12 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); - left_over = calculate_slab_order(cachep, size, cachep->align, flags); + left_over = calculate_slab_order(cachep, size, flags); if (!cachep->num) return -E2BIG; - freelist_size = calculate_freelist_size(cachep->num, cachep->align); + freelist_size = cachep->num * sizeof(freelist_idx_t); /* * If the slab has been placed off-slab, and we have enough space then @@ -2223,11 +2179,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) left_over -= freelist_size; } - if (flags & CFLGS_OFF_SLAB) { - /* really off slab. No need for manual alignment */ - freelist_size = calculate_freelist_size(cachep->num, 0); - } - cachep->colour_off = cache_line_size(); /* Offset must be a multiple of the alignment. */ if (cachep->colour_off < cachep->align) @@ -2443,6 +2394,9 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, void *freelist; void *addr = page_address(page); + page->s_mem = addr + colour_off; + page->active = 0; + if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, @@ -2450,11 +2404,11 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, if (!freelist) return NULL; } else { - freelist = addr + colour_off; - colour_off += cachep->freelist_size; + /* We will use last bytes at the slab for freelist */ + freelist = addr + (PAGE_SIZE << cachep->gfporder) - + cachep->freelist_size; } - page->active = 0; - page->s_mem = addr + colour_off; + return freelist; } -- cgit v1.1 From 832a15d209cd260180407bde1af18965b21623f3 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:33 -0700 Subject: mm/slab: align cache size first before determination of OFF_SLAB candidate Finding suitable OFF_SLAB candidate is more related to aligned cache size rather than original size. 
Same reasoning can be applied to the debug pagealloc candidate. So, this patch moves up alignment fixup to proper position. From that point, size is aligned so we can remove some alignment fixups. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index b3d91b0..d5dffc8 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2125,6 +2125,17 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else size += BYTES_PER_WORD; } +#endif + + size = ALIGN(size, cachep->align); + /* + * We should restrict the number of objects in a slab to implement + * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. + */ + if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) + size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); + +#if DEBUG /* * To activate debug pagealloc, off-slab management is necessary * requirement. In early phase of initialization, small sized slab @@ -2135,8 +2146,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && !slab_early_init && size >= kmalloc_size(INDEX_NODE) && size >= 256 && cachep->object_size > cache_line_size() && - ALIGN(size, cachep->align) < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); + size < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - size; size = PAGE_SIZE; } #endif @@ -2148,20 +2159,13 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) */ if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && - !(flags & SLAB_NOLEAKTRACE)) + !(flags & SLAB_NOLEAKTRACE)) { /* * Size is large, assume best to place the slab management obj * off-slab (should allow better packing of objs). */ flags |= CFLGS_OFF_SLAB; - - size = ALIGN(size, cachep->align); - /* - * We should restrict the number of objects in a slab to implement - * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. - */ - if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) - size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); + } left_over = calculate_slab_order(cachep, size, flags); -- cgit v1.1 From 158e319bba59e890c3920ce6d827c188287bae84 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:35 -0700 Subject: mm/slab: clean up cache type determination Current cache type determination code is open-code and looks not understandable. Following patch will introduce one more cache type and it would make code more complex. So, before it happens, this patch abstracts these codes. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 105 ++++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 71 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index d5dffc8..9b56685 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2023,6 +2023,64 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, return cachep; } +static bool set_off_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + /* + * Determine if the slab management is 'on' or 'off' slab. 
+ * (bootstrapping cannot cope with offslab caches so don't do + * it too early on. Always use on-slab management when + * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) + */ + if (size < OFF_SLAB_MIN_SIZE) + return false; + + if (slab_early_init) + return false; + + if (flags & SLAB_NOLEAKTRACE) + return false; + + /* + * Size is large, assume best to place the slab management obj + * off-slab (should allow better packing of objs). + */ + left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB); + if (!cachep->num) + return false; + + /* + * If the slab has been placed off-slab, and we have enough space then + * move it on-slab. This is at the expense of any extra colouring. + */ + if (left >= cachep->num * sizeof(freelist_idx_t)) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + +static bool set_on_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + left = calculate_slab_order(cachep, size, flags); + if (!cachep->num) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + /** * __kmem_cache_create - Create a cache. * @cachep: cache management descriptor @@ -2047,7 +2105,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, int __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) { - size_t left_over, freelist_size; size_t ralign = BYTES_PER_WORD; gfp_t gfp; int err; @@ -2098,6 +2155,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * 4) Store it. */ cachep->align = ralign; + cachep->colour_off = cache_line_size(); + /* Offset must be a multiple of the alignment. */ + if (cachep->colour_off < cachep->align) + cachep->colour_off = cachep->align; if (slab_is_available()) gfp = GFP_KERNEL; @@ -2152,43 +2213,18 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) } #endif - /* - * Determine if the slab management is 'on' or 'off' slab. - * (bootstrapping cannot cope with offslab caches so don't do - * it too early on. Always use on-slab management when - * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) - */ - if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && - !(flags & SLAB_NOLEAKTRACE)) { - /* - * Size is large, assume best to place the slab management obj - * off-slab (should allow better packing of objs). - */ + if (set_off_slab_cache(cachep, size, flags)) { flags |= CFLGS_OFF_SLAB; + goto done; } - left_over = calculate_slab_order(cachep, size, flags); - - if (!cachep->num) - return -E2BIG; - - freelist_size = cachep->num * sizeof(freelist_idx_t); + if (set_on_slab_cache(cachep, size, flags)) + goto done; - /* - * If the slab has been placed off-slab, and we have enough space then - * move it on-slab. This is at the expense of any extra colouring. - */ - if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { - flags &= ~CFLGS_OFF_SLAB; - left_over -= freelist_size; - } + return -E2BIG; - cachep->colour_off = cache_line_size(); - /* Offset must be a multiple of the alignment. 
*/ - if (cachep->colour_off < cachep->align) - cachep->colour_off = cachep->align; - cachep->colour = left_over / cachep->colour_off; - cachep->freelist_size = freelist_size; +done: + cachep->freelist_size = cachep->num * sizeof(freelist_idx_t); cachep->flags = flags; cachep->allocflags = __GFP_COMP; if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) @@ -2209,7 +2245,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) #endif if (OFF_SLAB(cachep)) { - cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); + cachep->freelist_cache = + kmalloc_slab(cachep->freelist_size, 0u); /* * This is a possibility for one of the kmalloc_{dma,}_caches. * But since we go off slab only for object size greater than -- cgit v1.1 From f3a3c320d54eea39a419fd539f5b0e9c74517b0a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:38 -0700 Subject: mm/slab: do not change cache size if debug pagealloc isn't possible We can fail to setup off slab in some conditions. Even in this case, debug pagealloc increases cache size to PAGE_SIZE in advance and it is waste because debug pagealloc cannot work for it when it isn't the off slab. To improve this situation, this patch checks first that this cache with increased size is suitable for off slab. It actually increases cache size when it is suitable for off-slab, so possible waste is removed. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 9b56685..21aad9d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2206,10 +2206,17 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) */ if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && !slab_early_init && size >= kmalloc_size(INDEX_NODE) && - size >= 256 && cachep->object_size > cache_line_size() && - size < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - size; - size = PAGE_SIZE; + size >= 256 && cachep->object_size > cache_line_size()) { + if (size < PAGE_SIZE || size % PAGE_SIZE == 0) { + size_t tmp_size = ALIGN(size, PAGE_SIZE); + + if (set_off_slab_cache(cachep, tmp_size, flags)) { + flags |= CFLGS_OFF_SLAB; + cachep->obj_offset += tmp_size - size; + size = tmp_size; + goto done; + } + } } #endif -- cgit v1.1 From 3217fd9bdf0017bd0847939f67d52a9c71d8fc56 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:41 -0700 Subject: mm/slab: make criteria for off slab determination robust and simple To become an off slab, there are some constraints to avoid bootstrapping problem and recursive call. This can be avoided differently by simply checking that corresponding kmalloc cache is ready and it's not a off slab. It would be more robust because static size checking can be affected by cache size change or architecture type but dynamic checking isn't. One check 'freelist_cache->size > cachep->size / 2' is added to check benefit of choosing off slab, because, now, there is no size constraint which ensures enough advantage when selecting off slab. 
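Condensed into a standalone sketch (the struct and helper below are illustrative stand-ins; in the patch the checks sit inside calculate_slab_order() and use kmalloc_slab() to look up the freelist cache), the new criterion is purely dynamic:

    #include <stdbool.h>
    #include <stddef.h>

    /* Toy stand-ins for the freelist (kmalloc) cache and the cache being set up. */
    struct toy_cache {
        size_t size;        /* object size managed by this cache */
        bool   off_slab;    /* does it keep its own freelist off-slab? */
        bool   ready;       /* has it been created yet? */
    };

    static bool off_slab_is_worthwhile(const struct toy_cache *freelist_cache,
                                       const struct toy_cache *cachep)
    {
        if (!freelist_cache || !freelist_cache->ready)
            return false;   /* the kmalloc cache is not available yet */
        if (freelist_cache->off_slab)
            return false;   /* would loop back into off-slab allocation */
        if (freelist_cache->size > cachep->size / 2)
            return false;   /* the external freelist costs too much to pay off */
        return true;
    }

    int main(void)
    {
        struct toy_cache idx_cache = { .size = 32,  .off_slab = false, .ready = true };
        struct toy_cache big_cache = { .size = 512, .off_slab = false, .ready = true };

        return off_slab_is_worthwhile(&idx_cache, &big_cache) ? 0 : 1;
    }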
Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 45 +++++++++++++++++---------------------------- 1 file changed, 17 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 21aad9d..ab43d9f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -272,7 +272,6 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define CFLGS_OFF_SLAB (0x80000000UL) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) -#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1)) #define BATCHREFILL_LIMIT 16 /* @@ -1879,7 +1878,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, unsigned long flags) { - unsigned long offslab_limit; size_t left_over = 0; int gfporder; @@ -1896,16 +1894,24 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, break; if (flags & CFLGS_OFF_SLAB) { + struct kmem_cache *freelist_cache; + size_t freelist_size; + + freelist_size = num * sizeof(freelist_idx_t); + freelist_cache = kmalloc_slab(freelist_size, 0u); + if (!freelist_cache) + continue; + /* - * Max number of objs-per-slab for caches which - * use off-slab slabs. Needed to avoid a possible - * looping condition in cache_grow(). + * Needed to avoid possible looping condition + * in cache_grow() */ - offslab_limit = size; - offslab_limit /= sizeof(freelist_idx_t); + if (OFF_SLAB(freelist_cache)) + continue; - if (num > offslab_limit) - break; + /* check if off slab has enough benefit */ + if (freelist_cache->size > cachep->size / 2) + continue; } /* Found something acceptable - save it away */ @@ -2031,17 +2037,9 @@ static bool set_off_slab_cache(struct kmem_cache *cachep, cachep->num = 0; /* - * Determine if the slab management is 'on' or 'off' slab. - * (bootstrapping cannot cope with offslab caches so don't do - * it too early on. Always use on-slab management when - * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) + * Always use on-slab management when SLAB_NOLEAKTRACE + * to avoid recursive calls into kmemleak. */ - if (size < OFF_SLAB_MIN_SIZE) - return false; - - if (slab_early_init) - return false; - if (flags & SLAB_NOLEAKTRACE) return false; @@ -2205,7 +2203,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * sized slab is initialized in current slab initialization sequence. */ if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && - !slab_early_init && size >= kmalloc_size(INDEX_NODE) && size >= 256 && cachep->object_size > cache_line_size()) { if (size < PAGE_SIZE || size % PAGE_SIZE == 0) { size_t tmp_size = ALIGN(size, PAGE_SIZE); @@ -2254,14 +2251,6 @@ done: if (OFF_SLAB(cachep)) { cachep->freelist_cache = kmalloc_slab(cachep->freelist_size, 0u); - /* - * This is a possibility for one of the kmalloc_{dma,}_caches. - * But since we go off slab only for object size greater than - * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created - * in ascending order,this should not happen at all. - * But leave a BUG_ON for some lucky dude. 
- */ - BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); } err = setup_cpu_cache(cachep, gfp); -- cgit v1.1 From d8410234db6a1bb05408ba2f0e3fd6b04ef626a3 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:44 -0700 Subject: mm/slab: factor out slab list fixup code Slab list should be fixed up after object is detached from the slab and this happens at two places. They do exactly same thing. They will be changed in the following patch, so, to reduce code duplication, this patch factor out them and make it common function. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index ab43d9f..95e5d63 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2723,6 +2723,17 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, #define cache_free_debugcheck(x,objp,z) (objp) #endif +static inline void fixup_slab_list(struct kmem_cache *cachep, + struct kmem_cache_node *n, struct page *page) +{ + /* move slabp to correct slabp list: */ + list_del(&page->lru); + if (page->active == cachep->num) + list_add(&page->lru, &n->slabs_full); + else + list_add(&page->lru, &n->slabs_partial); +} + static struct page *get_first_slab(struct kmem_cache_node *n) { struct page *page; @@ -2796,12 +2807,7 @@ retry: ac_put_obj(cachep, ac, slab_get_obj(cachep, page)); } - /* move slabp to correct slabp list: */ - list_del(&page->lru); - if (page->active == cachep->num) - list_add(&page->lru, &n->slabs_full); - else - list_add(&page->lru, &n->slabs_partial); + fixup_slab_list(cachep, n, page); } must_grow: @@ -3067,13 +3073,8 @@ retry: obj = slab_get_obj(cachep, page); n->free_objects--; - /* move slabp to correct slabp list: */ - list_del(&page->lru); - if (page->active == cachep->num) - list_add(&page->lru, &n->slabs_full); - else - list_add(&page->lru, &n->slabs_partial); + fixup_slab_list(cachep, n, page); spin_unlock(&n->list_lock); goto done; -- cgit v1.1 From 10b2e9e8e808bd30e1f4018a36366d07b0abd12f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:47 -0700 Subject: mm/slab: factor out debugging initialization in cache_init_objs() cache_init_objs() will be changed in following patch and current form doesn't fit well for that change. So, before doing it, this patch separates debugging initialization. This would cause two loop iteration when debugging is enabled, but, this overhead seems too light than debug feature itself so effect may not be visible. This patch will greatly simplify changes in cache_init_objs() in following patch. 
Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 95e5d63..d3608d1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2460,14 +2460,14 @@ static inline void set_free_obj(struct page *page, ((freelist_idx_t *)(page->freelist))[idx] = val; } -static void cache_init_objs(struct kmem_cache *cachep, - struct page *page) +static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) { +#if DEBUG int i; for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, page, i); -#if DEBUG + if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = NULL; @@ -2496,10 +2496,22 @@ static void cache_init_objs(struct kmem_cache *cachep, poison_obj(cachep, objp, POISON_FREE); slab_kernel_map(cachep, objp, 0, 0); } -#else - if (cachep->ctor) - cachep->ctor(objp); + } #endif +} + +static void cache_init_objs(struct kmem_cache *cachep, + struct page *page) +{ + int i; + + cache_init_objs_debug(cachep, page); + + for (i = 0; i < cachep->num; i++) { + /* constructor could break poison info */ + if (DEBUG == 0 && cachep->ctor) + cachep->ctor(index_to_obj(cachep, page, i)); + set_free_obj(page, i, i); } } -- cgit v1.1 From b03a017bebc403d40aa53a092e79b3020786537d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:50 -0700 Subject: mm/slab: introduce new slab management type, OBJFREELIST_SLAB SLAB needs an array to manage freed objects in a slab. It is only used if some objects are freed so we can use free object itself as this array. This requires additional branch in somewhat critical lock path to check if it is first freed object or not but that's all we need. Benefits is that we can save extra memory usage and reduce some computational overhead by allocating a management array when new slab is created. Code change is rather complex than what we can expect from the idea, in order to handle debugging feature efficiently. If you want to see core idea only, please remove '#if DEBUG' block in the patch. Although this idea can apply to all caches whose size is larger than management array size, it isn't applied to caches which have a constructor. If such cache's object is used for management array, constructor should be called for it before that object is returned to user. I guess that overhead overwhelm benefit in that case so this idea doesn't applied to them at least now. For summary, from now on, slab management type is determined by following logic. 1) if management array size is smaller than object size and no ctor, it becomes OBJFREELIST_SLAB. 2) if management array size is smaller than leftover, it becomes NORMAL_SLAB which uses leftover as a array. 3) if OFF_SLAB help to save memory than way 4), it becomes OFF_SLAB. It allocate a management array from the other cache so memory waste happens. 4) others become NORMAL_SLAB. It uses dedicated internal memory in a slab as a management array so it causes memory waste. In my system, without enabling CONFIG_DEBUG_SLAB, Almost caches become OBJFREELIST_SLAB and NORMAL_SLAB (using leftover) which doesn't waste memory. Following is the result of number of caches with specific slab management type. 
TOTAL = OBJFREELIST + NORMAL(leftover) + NORMAL + OFF /Before/ 126 = 0 + 60 + 25 + 41 /After/ 126 = 97 + 12 + 15 + 2 Result shows that number of caches that doesn't waste memory increase from 60 to 109. I did some benchmarking and it looks that benefit are more than loss. Kmalloc: Repeatedly allocate then free test /Before/ [ 0.286809] 1. Kmalloc: Repeatedly allocate then free test [ 1.143674] 100000 times kmalloc(32) -> 116 cycles kfree -> 78 cycles [ 1.441726] 100000 times kmalloc(64) -> 121 cycles kfree -> 80 cycles [ 1.815734] 100000 times kmalloc(128) -> 168 cycles kfree -> 85 cycles [ 2.380709] 100000 times kmalloc(256) -> 287 cycles kfree -> 95 cycles [ 3.101153] 100000 times kmalloc(512) -> 370 cycles kfree -> 117 cycles [ 3.942432] 100000 times kmalloc(1024) -> 413 cycles kfree -> 156 cycles [ 5.227396] 100000 times kmalloc(2048) -> 622 cycles kfree -> 248 cycles [ 7.519793] 100000 times kmalloc(4096) -> 1102 cycles kfree -> 452 cycles /After/ [ 1.205313] 100000 times kmalloc(32) -> 117 cycles kfree -> 78 cycles [ 1.510526] 100000 times kmalloc(64) -> 124 cycles kfree -> 81 cycles [ 1.827382] 100000 times kmalloc(128) -> 130 cycles kfree -> 84 cycles [ 2.226073] 100000 times kmalloc(256) -> 177 cycles kfree -> 92 cycles [ 2.814747] 100000 times kmalloc(512) -> 286 cycles kfree -> 112 cycles [ 3.532952] 100000 times kmalloc(1024) -> 344 cycles kfree -> 141 cycles [ 4.608777] 100000 times kmalloc(2048) -> 519 cycles kfree -> 210 cycles [ 6.350105] 100000 times kmalloc(4096) -> 789 cycles kfree -> 391 cycles In fact, I tested another idea implementing OBJFREELIST_SLAB with extendable linked array through another freed object. It can remove memory waste completely but it causes more computational overhead in critical lock path and it seems that overhead outweigh benefit. So, this patch doesn't include it. Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 86 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index d3608d1..85e394f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -270,7 +270,9 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) +#define CFLGS_OBJFREELIST_SLAB (0x40000000UL) #define CFLGS_OFF_SLAB (0x80000000UL) +#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) #define BATCHREFILL_LIMIT 16 @@ -480,7 +482,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * the slabs are all pages aligned, the objects will be at the * correct alignment when allocated. 
*/ - if (flags & CFLGS_OFF_SLAB) { + if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) { *num = slab_size / buffer_size; *left_over = slab_size % buffer_size; } else { @@ -1801,6 +1803,12 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct page *page) { int i; + + if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) { + poison_obj(cachep, page->freelist - obj_offset(cachep), + POISON_FREE); + } + for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, page, i); @@ -2029,6 +2037,29 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, return cachep; } +static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) + return false; + + left = calculate_slab_order(cachep, size, + flags | CFLGS_OBJFREELIST_SLAB); + if (!cachep->num) + return false; + + if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + static bool set_off_slab_cache(struct kmem_cache *cachep, size_t size, unsigned long flags) { @@ -2217,6 +2248,11 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) } #endif + if (set_objfreelist_slab_cache(cachep, size, flags)) { + flags |= CFLGS_OBJFREELIST_SLAB; + goto done; + } + if (set_off_slab_cache(cachep, size, flags)) { flags |= CFLGS_OFF_SLAB; goto done; @@ -2434,7 +2470,9 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, page->s_mem = addr + colour_off; page->active = 0; - if (OFF_SLAB(cachep)) { + if (OBJFREELIST_SLAB(cachep)) + freelist = NULL; + else if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); @@ -2507,6 +2545,11 @@ static void cache_init_objs(struct kmem_cache *cachep, cache_init_objs_debug(cachep, page); + if (OBJFREELIST_SLAB(cachep)) { + page->freelist = index_to_obj(cachep, page, cachep->num - 1) + + obj_offset(cachep); + } + for (i = 0; i < cachep->num; i++) { /* constructor could break poison info */ if (DEBUG == 0 && cachep->ctor) @@ -2558,6 +2601,9 @@ static void slab_put_obj(struct kmem_cache *cachep, } #endif page->active--; + if (!page->freelist) + page->freelist = objp + obj_offset(cachep); + set_free_obj(page, page->active, objnr); } @@ -2632,7 +2678,7 @@ static int cache_grow(struct kmem_cache *cachep, /* Get slab management. 
*/ freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, nodeid); - if (!freelist) + if (OFF_SLAB(cachep) && !freelist) goto opps1; slab_map_pages(cachep, page, freelist); @@ -2735,14 +2781,42 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, #define cache_free_debugcheck(x,objp,z) (objp) #endif +static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, + void **list) +{ +#if DEBUG + void *next = *list; + void *objp; + + while (next) { + objp = next - obj_offset(cachep); + next = *(void **)next; + poison_obj(cachep, objp, POISON_FREE); + } +#endif +} + static inline void fixup_slab_list(struct kmem_cache *cachep, - struct kmem_cache_node *n, struct page *page) + struct kmem_cache_node *n, struct page *page, + void **list) { /* move slabp to correct slabp list: */ list_del(&page->lru); - if (page->active == cachep->num) + if (page->active == cachep->num) { list_add(&page->lru, &n->slabs_full); - else + if (OBJFREELIST_SLAB(cachep)) { +#if DEBUG + /* Poisoning will be done without holding the lock */ + if (cachep->flags & SLAB_POISON) { + void **objp = page->freelist; + + *objp = *list; + *list = objp; + } +#endif + page->freelist = NULL; + } + } else list_add(&page->lru, &n->slabs_partial); } @@ -2768,6 +2842,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, struct kmem_cache_node *n; struct array_cache *ac; int node; + void *list = NULL; check_irq_off(); node = numa_mem_id(); @@ -2819,13 +2894,14 @@ retry: ac_put_obj(cachep, ac, slab_get_obj(cachep, page)); } - fixup_slab_list(cachep, n, page); + fixup_slab_list(cachep, n, page, &list); } must_grow: n->free_objects -= ac->avail; alloc_done: spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); if (unlikely(!ac->avail)) { int x; @@ -3062,6 +3138,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, struct page *page; struct kmem_cache_node *n; void *obj; + void *list = NULL; int x; VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); @@ -3086,9 +3163,10 @@ retry: obj = slab_get_obj(cachep, page); n->free_objects--; - fixup_slab_list(cachep, n, page); + fixup_slab_list(cachep, n, page, &list); spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); goto done; must_grow: -- cgit v1.1 From 70f75067b15659bb03404e75eded41011c67dc57 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:53 -0700 Subject: mm/slab: avoid returning values by reference Returing values by reference is bad practice. Instead, just use function return value. Signed-off-by: Joonsoo Kim Suggested-by: Christoph Lameter Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 85e394f..4f4e647 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -460,9 +460,10 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) /* * Calculate the number of objects and left-over bytes for a given buffer size. 
*/ -static void cache_estimate(unsigned long gfporder, size_t buffer_size, - unsigned long flags, size_t *left_over, unsigned int *num) +static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, + unsigned long flags, size_t *left_over) { + unsigned int num; size_t slab_size = PAGE_SIZE << gfporder; /* @@ -483,13 +484,15 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * correct alignment when allocated. */ if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) { - *num = slab_size / buffer_size; + num = slab_size / buffer_size; *left_over = slab_size % buffer_size; } else { - *num = slab_size / (buffer_size + sizeof(freelist_idx_t)); + num = slab_size / (buffer_size + sizeof(freelist_idx_t)); *left_over = slab_size % (buffer_size + sizeof(freelist_idx_t)); } + + return num; } #if DEBUG @@ -1893,7 +1896,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, unsigned int num; size_t remainder; - cache_estimate(gfporder, size, flags, &remainder, &num); + num = cache_estimate(gfporder, size, flags, &remainder); if (!num) continue; -- cgit v1.1 From f68f8dddb5e9101b5bddb4d61ba0d81e4f9e0040 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:54:56 -0700 Subject: mm/slab: re-implement pfmemalloc support Current implementation of pfmemalloc handling in SLAB has some problems. 1) pfmemalloc_active is set to true when there is just one or more pfmemalloc slabs in the system, but it is cleared when there is no pfmemalloc slab in one arbitrary kmem_cache. So, pfmemalloc_active could be wrongly cleared. 2) Search to partial and free list doesn't happen when non-pfmemalloc object are not found in cpu cache. Instead, allocating new slab happens and it is not optimal. 3) Even after sk_memalloc_socks() is disabled, cpu cache would keep pfmemalloc objects tagged with SLAB_OBJ_PFMEMALLOC. It isn't cleared if sk_memalloc_socks() is disabled so it could cause problem. 4) If cpu cache is filled with pfmemalloc objects, it would cause slow down non-pfmemalloc allocation. To me, current pointer tagging approach looks complex and fragile so this patch re-implement whole thing instead of fixing problems one by one. Design principle for new implementation is that 1) Don't disrupt non-pfmemalloc allocation in fast path even if sk_memalloc_socks() is enabled. It's more likely case than pfmemalloc allocation. 2) Ensure that pfmemalloc slab is used only for pfmemalloc allocation. 3) Don't consider performance of pfmemalloc allocation in memory deficiency state. As a result, all pfmemalloc alloc/free in memory tight state will be handled in slow-path. If there is non-pfmemalloc free object, it will be returned first even for pfmemalloc user in fast-path so that performance of pfmemalloc user isn't affected in normal case and pfmemalloc objects will be kept as long as possible. 
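A toy model of the resulting free-side policy (names below are illustrative, not the kernel's; in the patch the equivalent logic is in __cache_free() and the new cache_free_pfmemalloc()): an object freed from a pfmemalloc slab bypasses the per-cpu array and goes straight back to the node lists, so reserve memory never circulates through the fast path.

    #include <stdbool.h>
    #include <stdio.h>

    enum free_path { FAST_PER_CPU_ARRAY, SLOW_NODE_LISTS };

    /* Decide where a freed object goes under the new scheme. */
    static enum free_path pick_free_path(bool memalloc_socks_active,
                                         bool slab_is_pfmemalloc)
    {
        if (memalloc_socks_active && slab_is_pfmemalloc)
            return SLOW_NODE_LISTS;    /* keep reserve objects off the fast path */
        return FAST_PER_CPU_ARRAY;     /* the common case is untouched */
    }

    int main(void)
    {
        printf("normal slab  -> %d\n", pick_free_path(true, false));  /* 0 */
        printf("reserve slab -> %d\n", pick_free_path(true, true));   /* 1 */
        return 0;
    }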
Signed-off-by: Joonsoo Kim Tested-by: Mel Gorman Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 284 +++++++++++++++++++++++++------------------------------------- 1 file changed, 116 insertions(+), 168 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 4f4e647..863c005 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -169,12 +169,6 @@ typedef unsigned short freelist_idx_t; #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) /* - * true if a page was allocated from pfmemalloc reserves for network-based - * swap - */ -static bool pfmemalloc_active __read_mostly; - -/* * struct array_cache * * Purpose: @@ -195,10 +189,6 @@ struct array_cache { * Must have this definition in here for the proper * alignment of array_cache. Also simplifies accessing * the entries. - * - * Entries should not be directly dereferenced as - * entries belonging to slabs marked pfmemalloc will - * have the lower bits set SLAB_OBJ_PFMEMALLOC */ }; @@ -207,23 +197,6 @@ struct alien_cache { struct array_cache ac; }; -#define SLAB_OBJ_PFMEMALLOC 1 -static inline bool is_obj_pfmemalloc(void *objp) -{ - return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; -} - -static inline void set_obj_pfmemalloc(void **objp) -{ - *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); - return; -} - -static inline void clear_obj_pfmemalloc(void **objp) -{ - *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); -} - /* * Need this for bootstrapping a per node allocator. */ @@ -623,120 +596,21 @@ static struct array_cache *alloc_arraycache(int node, int entries, return ac; } -static inline bool is_slab_pfmemalloc(struct page *page) -{ - return PageSlabPfmemalloc(page); -} - -/* Clears pfmemalloc_active if no slabs have pfmalloc set */ -static void recheck_pfmemalloc_active(struct kmem_cache *cachep, - struct array_cache *ac) -{ - struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); - struct page *page; - unsigned long flags; - - if (!pfmemalloc_active) - return; - - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->slabs_full, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - list_for_each_entry(page, &n->slabs_partial, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - list_for_each_entry(page, &n->slabs_free, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - pfmemalloc_active = false; -out: - spin_unlock_irqrestore(&n->list_lock, flags); -} - -static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, - gfp_t flags, bool force_refill) +static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, + struct page *page, void *objp) { - int i; - void *objp = ac->entry[--ac->avail]; - - /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ - if (unlikely(is_obj_pfmemalloc(objp))) { - struct kmem_cache_node *n; - - if (gfp_pfmemalloc_allowed(flags)) { - clear_obj_pfmemalloc(&objp); - return objp; - } - - /* The caller cannot use PFMEMALLOC objects, find another one */ - for (i = 0; i < ac->avail; i++) { - /* If a !PFMEMALLOC object is found, swap them */ - if (!is_obj_pfmemalloc(ac->entry[i])) { - objp = ac->entry[i]; - ac->entry[i] = ac->entry[ac->avail]; - ac->entry[ac->avail] = objp; - return objp; - } - } - - /* - * If there are empty slabs on the slabs_free list and we are - * being forced to refill the cache, mark this one !pfmemalloc. 
- */ - n = get_node(cachep, numa_mem_id()); - if (!list_empty(&n->slabs_free) && force_refill) { - struct page *page = virt_to_head_page(objp); - ClearPageSlabPfmemalloc(page); - clear_obj_pfmemalloc(&objp); - recheck_pfmemalloc_active(cachep, ac); - return objp; - } - - /* No !PFMEMALLOC objects available */ - ac->avail++; - objp = NULL; - } - - return objp; -} - -static inline void *ac_get_obj(struct kmem_cache *cachep, - struct array_cache *ac, gfp_t flags, bool force_refill) -{ - void *objp; - - if (unlikely(sk_memalloc_socks())) - objp = __ac_get_obj(cachep, ac, flags, force_refill); - else - objp = ac->entry[--ac->avail]; - - return objp; -} - -static noinline void *__ac_put_obj(struct kmem_cache *cachep, - struct array_cache *ac, void *objp) -{ - if (unlikely(pfmemalloc_active)) { - /* Some pfmemalloc slabs exist, check if this is one */ - struct page *page = virt_to_head_page(objp); - if (PageSlabPfmemalloc(page)) - set_obj_pfmemalloc(&objp); - } + struct kmem_cache_node *n; + int page_node; + LIST_HEAD(list); - return objp; -} + page_node = page_to_nid(page); + n = get_node(cachep, page_node); -static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, - void *objp) -{ - if (unlikely(sk_memalloc_socks())) - objp = __ac_put_obj(cachep, ac, objp); + spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, page_node, &list); + spin_unlock(&n->list_lock); - ac->entry[ac->avail++] = objp; + slabs_destroy(cachep, &list); } /* @@ -939,7 +813,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, STATS_INC_ACOVERFLOW(cachep); __drain_alien_cache(cachep, ac, page_node, &list); } - ac_put_obj(cachep, ac, objp); + ac->entry[ac->avail++] = objp; spin_unlock(&alien->lock); slabs_destroy(cachep, &list); } else { @@ -1540,10 +1414,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, return NULL; } - /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (page_is_pfmemalloc(page)) - pfmemalloc_active = true; - nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) add_zone_page_state(page_zone(page), @@ -1551,8 +1421,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, else add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); + __SetPageSlab(page); - if (page_is_pfmemalloc(page)) + /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ + if (sk_memalloc_socks() && page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { @@ -2823,7 +2695,46 @@ static inline void fixup_slab_list(struct kmem_cache *cachep, list_add(&page->lru, &n->slabs_partial); } -static struct page *get_first_slab(struct kmem_cache_node *n) +/* Try to find non-pfmemalloc slab if needed */ +static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, + struct page *page, bool pfmemalloc) +{ + if (!page) + return NULL; + + if (pfmemalloc) + return page; + + if (!PageSlabPfmemalloc(page)) + return page; + + /* No need to keep pfmemalloc slab if we have enough free objects */ + if (n->free_objects > n->free_limit) { + ClearPageSlabPfmemalloc(page); + return page; + } + + /* Move pfmemalloc slab to the end of list to speed up next search */ + list_del(&page->lru); + if (!page->active) + list_add_tail(&page->lru, &n->slabs_free); + else + list_add_tail(&page->lru, &n->slabs_partial); + + list_for_each_entry(page, &n->slabs_partial, lru) { + if (!PageSlabPfmemalloc(page)) + return page; + } + + 
list_for_each_entry(page, &n->slabs_free, lru) { + if (!PageSlabPfmemalloc(page)) + return page; + } + + return NULL; +} + +static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct page *page; @@ -2835,11 +2746,41 @@ static struct page *get_first_slab(struct kmem_cache_node *n) struct page, lru); } + if (sk_memalloc_socks()) + return get_valid_first_slab(n, page, pfmemalloc); + return page; } -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, - bool force_refill) +static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, + struct kmem_cache_node *n, gfp_t flags) +{ + struct page *page; + void *obj; + void *list = NULL; + + if (!gfp_pfmemalloc_allowed(flags)) + return NULL; + + spin_lock(&n->list_lock); + page = get_first_slab(n, true); + if (!page) { + spin_unlock(&n->list_lock); + return NULL; + } + + obj = slab_get_obj(cachep, page); + n->free_objects--; + + fixup_slab_list(cachep, n, page, &list); + + spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); + + return obj; +} + +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) { int batchcount; struct kmem_cache_node *n; @@ -2849,8 +2790,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, check_irq_off(); node = numa_mem_id(); - if (unlikely(force_refill)) - goto force_grow; + retry: ac = cpu_cache_get(cachep); batchcount = ac->batchcount; @@ -2876,7 +2816,7 @@ retry: while (batchcount > 0) { struct page *page; /* Get slab alloc is to come from. */ - page = get_first_slab(n); + page = get_first_slab(n, false); if (!page) goto must_grow; @@ -2894,7 +2834,7 @@ retry: STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac_put_obj(cachep, ac, slab_get_obj(cachep, page)); + ac->entry[ac->avail++] = slab_get_obj(cachep, page); } fixup_slab_list(cachep, n, page, &list); @@ -2908,7 +2848,15 @@ alloc_done: if (unlikely(!ac->avail)) { int x; -force_grow: + + /* Check if we can use obj in pfmemalloc slab */ + if (sk_memalloc_socks()) { + void *obj = cache_alloc_pfmemalloc(cachep, n, flags); + + if (obj) + return obj; + } + x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); /* cache_grow can reenable interrupts, then ac could change. */ @@ -2916,7 +2864,7 @@ force_grow: node = numa_mem_id(); /* no objects in sight? abort */ - if (!x && (ac->avail == 0 || force_refill)) + if (!x && ac->avail == 0) return NULL; if (!ac->avail) /* objects refilled by interrupt? */ @@ -2924,7 +2872,7 @@ force_grow: } ac->touched = 1; - return ac_get_obj(cachep, ac, flags, force_refill); + return ac->entry[--ac->avail]; } static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, @@ -2982,28 +2930,20 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *objp; struct array_cache *ac; - bool force_refill = false; check_irq_off(); ac = cpu_cache_get(cachep); if (likely(ac->avail)) { ac->touched = 1; - objp = ac_get_obj(cachep, ac, flags, false); + objp = ac->entry[--ac->avail]; - /* - * Allow for the possibility all avail objects are not allowed - * by the current flags - */ - if (objp) { - STATS_INC_ALLOCHIT(cachep); - goto out; - } - force_refill = true; + STATS_INC_ALLOCHIT(cachep); + goto out; } STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags, force_refill); + objp = cache_alloc_refill(cachep, flags); /* * the 'ac' may be updated by cache_alloc_refill(), * and kmemleak_erase() requires its correct value. 
@@ -3151,7 +3091,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, retry: check_irq_off(); spin_lock(&n->list_lock); - page = get_first_slab(n); + page = get_first_slab(n, false); if (!page) goto must_grow; @@ -3304,7 +3244,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp, void *objp; struct page *page; - clear_obj_pfmemalloc(&objpp[i]); objp = objpp[i]; page = virt_to_head_page(objp); @@ -3410,7 +3349,16 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, cache_flusharray(cachep, ac); } - ac_put_obj(cachep, ac, objp); + if (sk_memalloc_socks()) { + struct page *page = virt_to_head_page(objp); + + if (unlikely(PageSlabPfmemalloc(page))) { + cache_free_pfmemalloc(cachep, page, objp); + return; + } + } + + ac->entry[ac->avail++] = objp; } /** -- cgit v1.1 From 282acb436176db665b6d61688ca5885a09cbdace Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 15 Mar 2016 14:54:59 -0700 Subject: slub: drop lock at the end of free_debug_processing This series takes the suggestion of Christoph Lameter and only focuses on optimizing the slow path where the debug processing runs. The two main optimizations in this series are letting the consistency checks be skipped and relaxing the cmpxchg restrictions when we are not doing consistency checks. With hackbench -g 20 -l 1000 averaged over 100 runs: Before slub_debug=P mean 15.607 variance .086 stdev .294 After slub_debug=P mean 10.836 variance .155 stdev .394 This still isn't as fast as what is in grsecurity unfortunately so there's still work to be done. Profiling ___slab_alloc shows that 25-50% of time is spent in deactivate_slab. I haven't looked too closely to see if this is something that can be optimized. My plan for now is to focus on getting all of this merged (if appropriate) before digging in to another task. This patch (of 4): Currently, free_debug_processing has a comment "Keep node_lock to preserve integrity until the object is actually freed". In actuallity, the lock is dropped immediately in __slab_free. Rather than wait until __slab_free and potentially throw off the unlikely marking, just drop the lock in __slab_free. This also lets free_debug_processing take its own copy of the spinlock flags rather than trying to share the ones from __slab_free. Since there is no use for the node afterwards, change the return type of free_debug_processing to return an int like alloc_debug_processing. 
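The locking change is easier to see in a minimal userspace analogue (a pthread mutex stands in for n->list_lock and the function name is illustrative): the helper now owns the whole lock/unlock pair and reports success as a plain int, instead of returning with the lock still held for the caller to release.

    #include <pthread.h>

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    /* After the patch: lock and unlock are balanced inside the helper,
     * and the caller only sees a success/failure flag. */
    static int free_debug_check(int object_is_sane)
    {
        int ret = 0;

        pthread_mutex_lock(&list_lock);
        if (object_is_sane)
            ret = 1;        /* the real consistency checks would run here */
        pthread_mutex_unlock(&list_lock);

        return ret;         /* the caller never touches the lock */
    }

    int main(void)
    {
        return free_debug_check(1) ? 0 : 1;
    }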
Credit to Mathias Krause for the original work which inspired this series [akpm@linux-foundation.org: fix build] Signed-off-by: Laura Abbott Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Cc: Mathias Krause Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 9620815..a03e0ae 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1044,16 +1044,17 @@ bad: } /* Supports checking bulk free of a constructed freelist */ -static noinline struct kmem_cache_node *free_debug_processing( +static noinline int free_debug_processing( struct kmem_cache *s, struct page *page, void *head, void *tail, int bulk_cnt, - unsigned long addr, unsigned long *flags) + unsigned long addr) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); void *object = head; int cnt = 0; + unsigned long uninitialized_var(flags); - spin_lock_irqsave(&n->list_lock, *flags); + spin_lock_irqsave(&n->list_lock, flags); slab_lock(page); if (!check_slab(s, page)) @@ -1106,17 +1107,14 @@ out: bulk_cnt, cnt); slab_unlock(page); - /* - * Keep node_lock to preserve integrity - * until the object is actually freed - */ - return n; + spin_unlock_irqrestore(&n->list_lock, flags); + return 1; fail: slab_unlock(page); - spin_unlock_irqrestore(&n->list_lock, *flags); + spin_unlock_irqrestore(&n->list_lock, flags); slab_fix(s, "Object at 0x%p not freed", object); - return NULL; + return 0; } static int __init setup_slub_debug(char *str) @@ -1207,10 +1205,10 @@ static inline void setup_object_debug(struct kmem_cache *s, static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } -static inline struct kmem_cache_node *free_debug_processing( +static inline int free_debug_processing( struct kmem_cache *s, struct page *page, void *head, void *tail, int bulk_cnt, - unsigned long addr, unsigned long *flags) { return NULL; } + unsigned long addr) { return 0; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } @@ -2588,8 +2586,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s) && - !(n = free_debug_processing(s, page, head, tail, cnt, - addr, &flags))) + !free_debug_processing(s, page, head, tail, cnt, addr)) return; do { -- cgit v1.1 From 804aa132d341cc5838b710e62de02d62e6ad0185 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 15 Mar 2016 14:55:02 -0700 Subject: slub: fix/clean free_debug_processing return paths Since commit 19c7ff9ecd89 ("slub: Take node lock during object free checks") check_object has been incorrectly returning success as it follows the out label which just returns the node. Thanks to refactoring, the out and fail paths are now basically the same. Combine the two into one and just use a single label. 
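Reduced to its skeleton (the checks here are illustrative; the real function also verifies the bulk count and drops the locks at the same point), the combined exit path looks like this:

    /* One label, one 'ret' flag: every failed check jumps to 'out', and the
     * shared unlock/reporting path runs exactly once. */
    static int checks_skeleton(int pointer_valid, int not_already_free)
    {
        int ret = 0;

        if (!pointer_valid)
            goto out;
        if (!not_already_free)
            goto out;

        ret = 1;            /* all checks passed */
    out:
        /* unlock and, on failure, print a diagnostic -- done in one place */
        return ret;
    }

    int main(void)
    {
        return checks_skeleton(1, 1) ? 0 : 1;
    }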
Credit to Mathias Krause for the original work which inspired this series Signed-off-by: Laura Abbott Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Cc: Mathias Krause Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index a03e0ae..744d29b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1053,24 +1053,25 @@ static noinline int free_debug_processing( void *object = head; int cnt = 0; unsigned long uninitialized_var(flags); + int ret = 0; spin_lock_irqsave(&n->list_lock, flags); slab_lock(page); if (!check_slab(s, page)) - goto fail; + goto out; next_object: cnt++; if (!check_valid_pointer(s, page, object)) { slab_err(s, page, "Invalid object pointer 0x%p", object); - goto fail; + goto out; } if (on_freelist(s, page, object)) { object_err(s, page, object, "Object already free"); - goto fail; + goto out; } if (!check_object(s, page, object, SLUB_RED_ACTIVE)) @@ -1087,7 +1088,7 @@ next_object: } else object_err(s, page, object, "page slab pointer corrupt."); - goto fail; + goto out; } if (s->flags & SLAB_STORE_USER) @@ -1101,6 +1102,8 @@ next_object: object = get_freepointer(s, object); goto next_object; } + ret = 1; + out: if (cnt != bulk_cnt) slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", @@ -1108,13 +1111,9 @@ out: slab_unlock(page); spin_unlock_irqrestore(&n->list_lock, flags); - return 1; - -fail: - slab_unlock(page); - spin_unlock_irqrestore(&n->list_lock, flags); - slab_fix(s, "Object at 0x%p not freed", object); - return 0; + if (!ret) + slab_fix(s, "Object at 0x%p not freed", object); + return ret; } static int __init setup_slub_debug(char *str) -- cgit v1.1 From becfda68abca673d61d5cc953e8e099816db99d9 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 15 Mar 2016 14:55:06 -0700 Subject: slub: convert SLAB_DEBUG_FREE to SLAB_CONSISTENCY_CHECKS SLAB_DEBUG_FREE allows expensive consistency checks at free to be turned on or off. Expand its use to be able to turn off all consistency checks. This gives a nice speed up if you only want features such as poisoning or tracing. Credit to Mathias Krause for the original work which inspired this series Signed-off-by: Laura Abbott Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Cc: Mathias Krause Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 5 ++-- mm/slub.c | 94 +++++++++++++++++++++++++++++++++++++++------------------------ 2 files changed, 62 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index e880bbe..b793436 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -125,7 +125,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) #elif defined(CONFIG_SLUB_DEBUG) #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DEBUG_FREE) + SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) #else #define SLAB_DEBUG_FLAGS (0) #endif @@ -311,7 +311,8 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) * to not do even the assignment. In that case, slab_equal_or_root * will also be a constant. 
*/ - if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) + if (!memcg_kmem_enabled() && + !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) return s; page = virt_to_head_page(x); diff --git a/mm/slub.c b/mm/slub.c index 744d29b..9cde663 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -160,7 +160,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) */ #define MAX_PARTIAL 10 -#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ +#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) /* @@ -1007,20 +1007,32 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, init_tracking(s, object); } -static noinline int alloc_debug_processing(struct kmem_cache *s, +static inline int alloc_consistency_checks(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) - goto bad; + return 0; if (!check_valid_pointer(s, page, object)) { object_err(s, page, object, "Freelist Pointer check fails"); - goto bad; + return 0; } if (!check_object(s, page, object, SLUB_RED_INACTIVE)) - goto bad; + return 0; + + return 1; +} + +static noinline int alloc_debug_processing(struct kmem_cache *s, + struct page *page, + void *object, unsigned long addr) +{ + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!alloc_consistency_checks(s, page, object, addr)) + goto bad; + } /* Success perform special debug activities for allocs */ if (s->flags & SLAB_STORE_USER) @@ -1043,39 +1055,21 @@ bad: return 0; } -/* Supports checking bulk free of a constructed freelist */ -static noinline int free_debug_processing( - struct kmem_cache *s, struct page *page, - void *head, void *tail, int bulk_cnt, - unsigned long addr) +static inline int free_consistency_checks(struct kmem_cache *s, + struct page *page, void *object, unsigned long addr) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - void *object = head; - int cnt = 0; - unsigned long uninitialized_var(flags); - int ret = 0; - - spin_lock_irqsave(&n->list_lock, flags); - slab_lock(page); - - if (!check_slab(s, page)) - goto out; - -next_object: - cnt++; - if (!check_valid_pointer(s, page, object)) { slab_err(s, page, "Invalid object pointer 0x%p", object); - goto out; + return 0; } if (on_freelist(s, page, object)) { object_err(s, page, object, "Object already free"); - goto out; + return 0; } if (!check_object(s, page, object, SLUB_RED_ACTIVE)) - goto out; + return 0; if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { @@ -1088,7 +1082,37 @@ next_object: } else object_err(s, page, object, "page slab pointer corrupt."); - goto out; + return 0; + } + return 1; +} + +/* Supports checking bulk free of a constructed freelist */ +static noinline int free_debug_processing( + struct kmem_cache *s, struct page *page, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + void *object = head; + int cnt = 0; + unsigned long uninitialized_var(flags); + int ret = 0; + + spin_lock_irqsave(&n->list_lock, flags); + slab_lock(page); + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!check_slab(s, page)) + goto out; + } + +next_object: + cnt++; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!free_consistency_checks(s, page, object, addr)) + goto out; } if (s->flags & SLAB_STORE_USER) @@ -1145,7 +1169,7 @@ static int __init setup_slub_debug(char *str) for (; *str && *str != ','; str++) { switch (tolower(*str)) { case 'f': - slub_debug |= 
SLAB_DEBUG_FREE; + slub_debug |= SLAB_CONSISTENCY_CHECKS; break; case 'z': slub_debug |= SLAB_RED_ZONE; @@ -1449,7 +1473,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) int order = compound_order(page); int pages = 1 << order; - if (kmem_cache_debug(s)) { + if (s->flags & SLAB_CONSISTENCY_CHECKS) { void *p; slab_pad_check(s, page); @@ -4769,16 +4793,16 @@ SLAB_ATTR_RO(total_objects); static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); } static ssize_t sanity_checks_store(struct kmem_cache *s, const char *buf, size_t length) { - s->flags &= ~SLAB_DEBUG_FREE; + s->flags &= ~SLAB_CONSISTENCY_CHECKS; if (buf[0] == '1') { s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_DEBUG_FREE; + s->flags |= SLAB_CONSISTENCY_CHECKS; } return length; } @@ -5313,7 +5337,7 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'd'; if (s->flags & SLAB_RECLAIM_ACCOUNT) *p++ = 'a'; - if (s->flags & SLAB_DEBUG_FREE) + if (s->flags & SLAB_CONSISTENCY_CHECKS) *p++ = 'F'; if (!(s->flags & SLAB_NOTRACK)) *p++ = 't'; -- cgit v1.1 From 149daaf3a02c020ddae64668c3a1bd0c4f314d6a Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 15 Mar 2016 14:55:09 -0700 Subject: slub: relax CMPXCHG consistency restrictions When debug options are enabled, cmpxchg on the page is disabled. This is because the page must be locked to ensure there are no false positives when performing consistency checks. Some debug options such as poisoning and red zoning only act on the object itself. There is no need to protect other CPUs from modifications that touch only the object. Allow cmpxchg to happen when poisoning and red zoning are set on a slab. Credit to Mathias Krause for the original work which inspired this series Signed-off-by: Laura Abbott Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Cc: Mathias Krause Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 9cde663..c2a227d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -164,6 +164,14 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) SLAB_POISON | SLAB_STORE_USER) /* + * These debug flags cannot use CMPXCHG because there might be consistency + * issues when checking or reading debug information + */ +#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \ + SLAB_TRACE) + + +/* * Debugging flags that require metadata to be stored in the slab. These get * disabled when slub_debug=O is used and a cache's min order increases with * metadata.
@@ -3338,7 +3346,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) + if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0) /* Enable fast mode */ s->flags |= __CMPXCHG_DOUBLE; #endif @@ -4846,7 +4854,6 @@ static ssize_t red_zone_store(struct kmem_cache *s, s->flags &= ~SLAB_RED_ZONE; if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_RED_ZONE; } calculate_sizes(s, -1); @@ -4867,7 +4874,6 @@ static ssize_t poison_store(struct kmem_cache *s, s->flags &= ~SLAB_POISON; if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_POISON; } calculate_sizes(s, -1); -- cgit v1.1 From d86bd1bece6fc41d59253002db5441fe960a37f6 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:55:12 -0700 Subject: mm/slub: support left redzone SLUB already has a redzone debugging feature. But it is only positioned at the end of the object (aka the right redzone), so it cannot catch a left out-of-bounds (OOB) access. Although the current object's right redzone acts as the left redzone of the next object, the first object in a slab cannot take advantage of this effect. This patch explicitly adds a left red zone to each object to detect left OOB accesses more precisely. Background: Someone complained to me that a left OOB is not caught even if KASAN is enabled with page allocation debugging. The page to the left is out of our control, so it may already be allocated when the left OOB happens and, in that case, we can't detect the OOB. Moreover, the SLUB debugging feature can be enabled without page allocator debugging and, in that case, we would miss the OOB as well. Before trying to implement it, I expected the changes would be too complex, but they don't look that complex to me now. Almost all changes are applied to debug-specific functions, so I feel okay about it.
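To make the left-redzone idea concrete before the patch itself, below is a minimal userspace sketch (not kernel code) of why a dedicated left pad is needed: each slot carries a poisoned pad on both sides of the object, and the check routine catches an underflow that a right-only redzone would attribute to the previous object, or miss entirely for the first object in a slab. The names and constants here (struct slot, RED_PAD, POISON) are invented for illustration; only the concept maps onto the red_left_pad field added by the patch below.

/*
 * Toy model of per-object left/right redzones. Deliberately simplified:
 * real SLUB stores the pads inside the slab layout and uses its own
 * poison values; this only demonstrates the detection idea.
 */
#include <stdio.h>
#include <string.h>

#define RED_PAD  8      /* plays the role of s->red_left_pad */
#define OBJ_SIZE 32
#define POISON   0xbb   /* stand-in for the redzone poison byte */

struct slot {
	unsigned char left_red[RED_PAD];   /* left redzone */
	unsigned char object[OBJ_SIZE];    /* user-visible object */
	unsigned char right_red[RED_PAD];  /* right redzone */
};

static void init_slot(struct slot *s)
{
	memset(s->left_red, POISON, RED_PAD);
	memset(s->object, 0, OBJ_SIZE);
	memset(s->right_red, POISON, RED_PAD);
}

static int slot_clean(const struct slot *s)
{
	for (int i = 0; i < RED_PAD; i++)
		if (s->left_red[i] != POISON || s->right_red[i] != POISON)
			return 0;   /* redzone overwritten: corruption detected */
	return 1;
}

int main(void)
{
	struct slot s;
	unsigned char *obj;

	init_slot(&s);
	obj = s.object;
	obj[-1] = 0x42;   /* deliberate left out-of-bounds write (the simulated bug) */

	printf("slot is %s\n", slot_clean(&s) ? "clean" : "corrupted");
	return 0;
}

Without the left pad, the write above would land in whatever happens to sit before the object and a right-only check would never notice it.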
Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 100 ++++++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index c2a227d..2d4d817 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } +static inline void *fixup_red_left(struct kmem_cache *s, void *p) +{ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) + p += s->red_left_pad; + + return p; +} + static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -232,24 +240,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -/* Verify that a pointer has an address that is valid within a slab page */ -static inline int check_valid_pointer(struct kmem_cache *s, - struct page *page, const void *object) -{ - void *base; - - if (!object) - return 1; - - base = page_address(page); - if (object < base || object >= base + page->objects * s->size || - (object - base) % s->size) { - return 0; - } - - return 1; -} - static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -279,12 +269,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) /* Loop over all objects in a slab */ #define for_each_object(__p, __s, __addr, __objects) \ - for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ - __p += (__s)->size) + for (__p = fixup_red_left(__s, __addr); \ + __p < (__addr) + (__objects) * (__s)->size; \ + __p += (__s)->size) #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = (__addr), __idx = 1; __idx <= __objects;\ - __p += (__s)->size, __idx++) + for (__p = fixup_red_left(__s, __addr), __idx = 1; \ + __idx <= __objects; \ + __p += (__s)->size, __idx++) /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) @@ -442,6 +434,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) set_bit(slab_index(p, s, addr), map); } +static inline int size_from_object(struct kmem_cache *s) +{ + if (s->flags & SLAB_RED_ZONE) + return s->size - s->red_left_pad; + + return s->size; +} + +static inline void *restore_red_left(struct kmem_cache *s, void *p) +{ + if (s->flags & SLAB_RED_ZONE) + p -= s->red_left_pad; + + return p; +} + /* * Debug settings: */ @@ -475,6 +483,26 @@ static inline void metadata_access_disable(void) /* * Object debugging */ + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + object = restore_red_left(s, object); + if (object < base || object >= base + page->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + static void print_section(char *text, u8 *addr, unsigned int length) { metadata_access_enable(); @@ -614,7 +642,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); - if (p > addr + 16) + if (s->flags & SLAB_RED_ZONE) + 
print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); + else if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); print_section("Object ", p, min_t(unsigned long, s->object_size, @@ -631,9 +661,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - if (off != s->size) + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section("Padding ", p + off, s->size - off); + print_section("Padding ", p + off, size_from_object(s) - off); dump_stack(); } @@ -663,6 +693,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; + if (s->flags & SLAB_RED_ZONE) + memset(p - s->red_left_pad, val, s->red_left_pad); + if (s->flags & __OBJECT_POISON) { memset(p, POISON_FREE, s->object_size - 1); p[s->object_size - 1] = POISON_END; @@ -755,11 +788,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) /* We also have user information there */ off += 2 * sizeof(struct track); - if (s->size == off) + if (size_from_object(s) == off) return 1; return check_bytes_and_report(s, page, p, "Object padding", - p + off, POISON_INUSE, s->size - off); + p + off, POISON_INUSE, size_from_object(s) - off); } /* Check the pad bytes at the end of a slab page */ @@ -804,6 +837,10 @@ static int check_object(struct kmem_cache *s, struct page *page, if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, page, object, "Redzone", + object - s->red_left_pad, val, s->red_left_pad)) + return 0; + + if (!check_bytes_and_report(s, page, object, "Redzone", endobject, val, s->inuse - s->object_size)) return 0; } else { @@ -1445,7 +1482,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - page->freelist = start; + page->freelist = fixup_red_left(s, start); page->inuse = page->objects; page->frozen = 1; @@ -3274,7 +3311,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ size += 2 * sizeof(struct track); - if (flags & SLAB_RED_ZONE) + if (flags & SLAB_RED_ZONE) { /* * Add some empty padding so that we can catch * overwrites from earlier objects rather than let @@ -3283,6 +3320,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * of the object. */ size += sizeof(void *); + + s->red_left_pad = sizeof(void *); + s->red_left_pad = ALIGN(s->red_left_pad, s->align); + size += s->red_left_pad; + } #endif /* -- cgit v1.1 From d91749c1dda71a7030c054a5ab8dc5419bc6730b Mon Sep 17 00:00:00 2001 From: Taku Izumi Date: Tue, 15 Mar 2016 14:55:18 -0700 Subject: mm/page_alloc.c: calculate zone_start_pfn at zone_spanned_pages_in_node() Xeon E7 v3 based systems supports Address Range Mirroring and UEFI BIOS complied with UEFI spec 2.5 can notify which ranges are mirrored (reliable) via EFI memory map. Now Linux kernel utilize its information and allocates boot time memory from reliable region. My requirement is: - allocate kernel memory from mirrored region - allocate user memory from non-mirrored region In order to meet my requirement, ZONE_MOVABLE is useful. By arranging non-mirrored range into ZONE_MOVABLE, mirrored memory is used for kernel allocations. My idea is to extend existing "kernelcore" option and introduces kernelcore=mirror option. By specifying "mirror" instead of specifying the amount of memory, non-mirrored region will be arranged into ZONE_MOVABLE. 
Earlier discussions are at: https://lkml.org/lkml/2015/10/9/24 https://lkml.org/lkml/2015/10/15/9 https://lkml.org/lkml/2015/11/27/18 https://lkml.org/lkml/2015/12/8/836 For example, suppose 2-nodes system with the following memory range: node 0 [mem 0x0000000000001000-0x000000109fffffff] node 1 [mem 0x00000010a0000000-0x000000209fffffff] and the following ranges are marked as reliable (mirrored): [0x0000000000000000-0x0000000100000000] [0x0000000100000000-0x0000000180000000] [0x0000000800000000-0x0000000880000000] [0x00000010a0000000-0x0000001120000000] [0x00000017a0000000-0x0000001820000000] If you specify kernelcore=mirror, ZONE_NORMAL and ZONE_MOVABLE are arranged like bellow: - node 0: ZONE_NORMAL : [0x0000000100000000-0x00000010a0000000] ZONE_MOVABLE: [0x0000000180000000-0x00000010a0000000] - node 1: ZONE_NORMAL : [0x00000010a0000000-0x00000020a0000000] ZONE_MOVABLE: [0x0000001120000000-0x00000020a0000000] In overlapped range, pages to be ZONE_MOVABLE in ZONE_NORMAL are treated as absent pages, and vice versa. This patch (of 2): Currently each zone's zone_start_pfn is calculated at free_area_init_core(). However zone's range is fixed at the time when invoking zone_spanned_pages_in_node(). This patch changes how each zone->zone_start_pfn is calculated in zone_spanned_pages_in_node(). Signed-off-by: Taku Izumi Cc: Tony Luck Cc: Xishi Qiu Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Dave Hansen Cc: Matt Fleming Cc: Arnd Bergmann Cc: Steve Capper Cc: Sudeep Holla Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 838ca8bb..0d20a19 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4953,31 +4953,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, + unsigned long *zone_start_pfn, + unsigned long *zone_end_pfn, unsigned long *ignored) { - unsigned long zone_start_pfn, zone_end_pfn; - /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) return 0; /* Get the start and end of the zone */ - zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; - zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; + *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, - &zone_start_pfn, &zone_end_pfn); + zone_start_pfn, zone_end_pfn); /* Check that this node has pages within the zone's required range */ - if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) + if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) return 0; /* Move the zone boundaries inside the node if necessary */ - zone_end_pfn = min(zone_end_pfn, node_end_pfn); - zone_start_pfn = max(zone_start_pfn, node_start_pfn); + *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); + *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); /* Return the spanned pages */ - return zone_end_pfn - zone_start_pfn; + return *zone_end_pfn - *zone_start_pfn; } /* @@ -5042,8 +5042,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, + unsigned long *zone_start_pfn, + unsigned long *zone_end_pfn, unsigned long *zones_size) { + 
unsigned int zone; + + *zone_start_pfn = node_start_pfn; + for (zone = 0; zone < zone_type; zone++) + *zone_start_pfn += zones_size[zone]; + + *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; + return zones_size[zone_type]; } @@ -5072,15 +5082,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; + unsigned long zone_start_pfn, zone_end_pfn; unsigned long size, real_size; size = zone_spanned_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, + &zone_start_pfn, + &zone_end_pfn, zones_size); real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, zholes_size); + if (size) + zone->zone_start_pfn = zone_start_pfn; + else + zone->zone_start_pfn = 0; zone->spanned_pages = size; zone->present_pages = real_size; @@ -5201,7 +5218,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; - unsigned long zone_start_pfn = pgdat->node_start_pfn; int ret; pgdat_resize_init(pgdat); @@ -5222,6 +5238,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; + unsigned long zone_start_pfn = zone->zone_start_pfn; size = zone->spanned_pages; realsize = freesize = zone->present_pages; @@ -5290,7 +5307,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) ret = init_currently_empty_zone(zone, zone_start_pfn, size); BUG_ON(ret); memmap_init(size, nid, j, zone_start_pfn); - zone_start_pfn += size; } } @@ -5358,6 +5374,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); +#else + start_pfn = node_start_pfn; #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); -- cgit v1.1 From 342332e6a925e9ed015e5465062c38d2b86ec8f9 Mon Sep 17 00:00:00 2001 From: Taku Izumi Date: Tue, 15 Mar 2016 14:55:22 -0700 Subject: mm/page_alloc.c: introduce kernelcore=mirror option This patch extends existing "kernelcore" option and introduces kernelcore=mirror option. By specifying "mirror" instead of specifying the amount of memory, non-mirrored (non-reliable) region will be arranged into ZONE_MOVABLE. 
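A minimal sketch of the command-line side of this feature follows, assuming nothing beyond what the description above states: "kernelcore=mirror" selects the mirroring behaviour, while any other value is still treated as a size. It is ordinary userspace C for illustration only; the real kernel parser uses early_param(), parse_option_str() and memparse(), and the variable names below merely echo those in the patch.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static bool mirrored_kernelcore;               /* set by "kernelcore=mirror" */
static unsigned long long required_kernelcore; /* set by "kernelcore=<size>" */

static int parse_kernelcore(const char *arg)
{
	if (strcmp(arg, "mirror") == 0) {
		mirrored_kernelcore = true;  /* arrange non-mirrored memory as movable */
		return 0;
	}
	/* fall back to the historic numeric form (no K/M/G suffix handling here) */
	required_kernelcore = strtoull(arg, NULL, 0);
	return 0;
}

int main(int argc, char **argv)
{
	parse_kernelcore(argc > 1 ? argv[1] : "mirror");
	printf("mirror=%d required=%llu\n", mirrored_kernelcore, required_kernelcore);
	return 0;
}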
[akpm@linux-foundation.org: fix build with CONFIG_HAVE_MEMBLOCK_NODE_MAP=n] Signed-off-by: Taku Izumi Tested-by: Sudeep Holla Cc: Tony Luck Cc: Xishi Qiu Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Dave Hansen Cc: Matt Fleming Cc: Arnd Bergmann Cc: Steve Capper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 108 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0d20a19..b8160b9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -247,6 +247,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; +static bool mirrored_kernelcore; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -4491,6 +4492,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; unsigned long nr_initialised = 0; +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + struct memblock_region *r = NULL, *tmp; +#endif if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; @@ -4516,6 +4520,40 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) break; + +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + /* + * if not mirrored_kernelcore and ZONE_MOVABLE exists, + * range from zone_movable_pfn[nid] to end of each node + * should be ZONE_MOVABLE not ZONE_NORMAL. skip it. + */ + if (!mirrored_kernelcore && zone_movable_pfn[nid]) + if (zone == ZONE_NORMAL && + pfn >= zone_movable_pfn[nid]) + continue; + + /* + * check given memblock attribute by firmware which + * can affect kernel memory layout. + * if zone==ZONE_MOVABLE but memory is mirrored, + * it's an overlapped memmap init. skip it. 
+ */ + if (mirrored_kernelcore && zone == ZONE_MOVABLE) { + if (!r || + pfn >= memblock_region_memory_end_pfn(r)) { + for_each_memblock(memory, tmp) + if (pfn < memblock_region_memory_end_pfn(tmp)) + break; + r = tmp; + } + if (pfn >= memblock_region_memory_base_pfn(r) && + memblock_is_mirror(r)) { + /* already initialized as NORMAL */ + pfn = memblock_region_memory_end_pfn(r); + continue; + } + } +#endif } /* @@ -4934,11 +4972,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, *zone_end_pfn = min(node_end_pfn, arch_zone_highest_possible_pfn[movable_zone]); - /* Adjust for ZONE_MOVABLE starting within this range */ - } else if (*zone_start_pfn < zone_movable_pfn[nid] && - *zone_end_pfn > zone_movable_pfn[nid]) { - *zone_end_pfn = zone_movable_pfn[nid]; - /* Check if this whole range is within ZONE_MOVABLE */ } else if (*zone_start_pfn >= zone_movable_pfn[nid]) *zone_start_pfn = *zone_end_pfn; @@ -5023,6 +5056,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long zone_start_pfn, zone_end_pfn; + unsigned long nr_absent; /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) @@ -5034,7 +5068,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, &zone_start_pfn, &zone_end_pfn); - return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); + nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); + + /* + * ZONE_MOVABLE handling. + * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages + * and vice versa. + */ + if (zone_movable_pfn[nid]) { + if (mirrored_kernelcore) { + unsigned long start_pfn, end_pfn; + struct memblock_region *r; + + for_each_memblock(memory, r) { + start_pfn = clamp(memblock_region_memory_base_pfn(r), + zone_start_pfn, zone_end_pfn); + end_pfn = clamp(memblock_region_memory_end_pfn(r), + zone_start_pfn, zone_end_pfn); + + if (zone_type == ZONE_MOVABLE && + memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + + if (zone_type == ZONE_NORMAL && + !memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + } + } else { + if (zone_type == ZONE_NORMAL) + nr_absent += node_end_pfn - zone_movable_pfn[nid]; + } + } + + return nr_absent; } #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ @@ -5547,6 +5613,36 @@ static void __init find_zone_movable_pfns_for_nodes(void) } /* + * If kernelcore=mirror is specified, ignore movablecore option + */ + if (mirrored_kernelcore) { + bool mem_below_4gb_not_mirrored = false; + + for_each_memblock(memory, r) { + if (memblock_is_mirror(r)) + continue; + + nid = r->nid; + + usable_startpfn = memblock_region_memory_base_pfn(r); + + if (usable_startpfn < 0x100000) { + mem_below_4gb_not_mirrored = true; + continue; + } + + zone_movable_pfn[nid] = zone_movable_pfn[nid] ? + min(usable_startpfn, zone_movable_pfn[nid]) : + usable_startpfn; + } + + if (mem_below_4gb_not_mirrored) + pr_warn("This configuration results in unmirrored kernel memory."); + + goto out2; + } + + /* * If movablecore=nn[KMG] was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. 
If both kernelcore @@ -5806,6 +5902,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core) */ static int __init cmdline_parse_kernelcore(char *p) { + /* parse kernelcore=mirror */ + if (parse_option_str(p, "mirror")) { + mirrored_kernelcore = true; + return 0; + } + return cmdline_parse_core(p, &required_kernelcore); } -- cgit v1.1 From b72d0ffb5dbc4070089b36230b98687ca4577cbc Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 15 Mar 2016 14:55:25 -0700 Subject: mm/page_alloc.c: rework code layout in memmap_init_zone() This function is getting full of weird tricks to avoid word-wrapping. Use a goto to eliminate a tab stop then use the new space Cc: Taku Izumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 79 +++++++++++++++++++++++++++------------------------------ 1 file changed, 38 insertions(+), 41 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b8160b9..fe4378f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4508,54 +4508,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* - * There can be holes in boot-time mem_map[]s - * handed to this function. They do not - * exist on hotplugged memory. + * There can be holes in boot-time mem_map[]s handed to this + * function. They do not exist on hotplugged memory. */ - if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) - continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; - if (!update_defer_init(pgdat, pfn, end_pfn, - &nr_initialised)) - break; + if (context != MEMMAP_EARLY) + goto not_early; + + if (!early_pfn_valid(pfn)) + continue; + if (!early_pfn_in_nid(pfn, nid)) + continue; + if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) + break; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - /* - * if not mirrored_kernelcore and ZONE_MOVABLE exists, - * range from zone_movable_pfn[nid] to end of each node - * should be ZONE_MOVABLE not ZONE_NORMAL. skip it. - */ - if (!mirrored_kernelcore && zone_movable_pfn[nid]) - if (zone == ZONE_NORMAL && - pfn >= zone_movable_pfn[nid]) - continue; + /* + * If not mirrored_kernelcore and ZONE_MOVABLE exists, range + * from zone_movable_pfn[nid] to end of each node should be + * ZONE_MOVABLE not ZONE_NORMAL. skip it. + */ + if (!mirrored_kernelcore && zone_movable_pfn[nid]) + if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid]) + continue; - /* - * check given memblock attribute by firmware which - * can affect kernel memory layout. - * if zone==ZONE_MOVABLE but memory is mirrored, - * it's an overlapped memmap init. skip it. - */ - if (mirrored_kernelcore && zone == ZONE_MOVABLE) { - if (!r || - pfn >= memblock_region_memory_end_pfn(r)) { - for_each_memblock(memory, tmp) - if (pfn < memblock_region_memory_end_pfn(tmp)) - break; - r = tmp; - } - if (pfn >= memblock_region_memory_base_pfn(r) && - memblock_is_mirror(r)) { - /* already initialized as NORMAL */ - pfn = memblock_region_memory_end_pfn(r); - continue; - } + /* + * Check given memblock attribute by firmware which can affect + * kernel memory layout. If zone==ZONE_MOVABLE but memory is + * mirrored, it's an overlapped memmap init. skip it. 
+ */ + if (mirrored_kernelcore && zone == ZONE_MOVABLE) { + if (!r || pfn >= memblock_region_memory_end_pfn(r)) { + for_each_memblock(memory, tmp) + if (pfn < memblock_region_memory_end_pfn(tmp)) + break; + r = tmp; + } + if (pfn >= memblock_region_memory_base_pfn(r) && + memblock_is_mirror(r)) { + /* already initialized as NORMAL */ + pfn = memblock_region_memory_end_pfn(r); + continue; } -#endif } +#endif +not_early: /* * Mark the block movable so that blocks are reserved for * movable at startup. This will force kernel allocations -- cgit v1.1 From d59b1087a98e402ed9a7cc577f4da435f9a555f5 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 15 Mar 2016 14:55:27 -0700 Subject: mm/page-writeback: fix dirty_ratelimit calculation Calculation of dirty_ratelimit sometimes is not correct. E.g. initial values of dirty_ratelimit == INIT_BW and step == 0, lead to the following result: UBSAN: Undefined behaviour in ../mm/page-writeback.c:1286:7 shift exponent 25600 is too large for 64-bit type 'long unsigned int' The fix is straightforward - make step 0 if the shift exponent is too big. Signed-off-by: Andrey Ryabinin Cc: Wu Fengguang Cc: Tejun Heo Cc: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6fe7d15..d782cba 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1169,6 +1169,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, unsigned long balanced_dirty_ratelimit; unsigned long step; unsigned long x; + unsigned long shift; /* * The dirty rate will match the writeout rate in long term, except @@ -1293,11 +1294,11 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, * rate itself is constantly fluctuating. So decrease the track speed * when it gets close to the target. Helps eliminate pointless tremors. */ - step >>= dirty_ratelimit / (2 * step + 1); - /* - * Limit the tracking speed to avoid overshooting. - */ - step = (step + 7) / 8; + shift = dirty_ratelimit / (2 * step + 1); + if (shift < BITS_PER_LONG) + step = DIV_ROUND_UP(step >> shift, 8); + else + step = 0; if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step; -- cgit v1.1 From ea6eabb05b26bd3d6f60b29b77a03bc61479fc0f Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 15 Mar 2016 14:55:30 -0700 Subject: mm/debug_pagealloc: ask users for default setting of debug_pagealloc Since commit 031bc5743f158 ("mm/debug-pagealloc: make debug-pagealloc boottime configurable") CONFIG_DEBUG_PAGEALLOC is by default not adding any page debugging. This resulted in several unnoticed bugs, e.g. https://lkml.kernel.org/g/<569F5E29.3090107@de.ibm.com> or https://lkml.kernel.org/g/<56A20F30.4050705@de.ibm.com> as this behaviour change was not even documented in Kconfig. Let's provide a new Kconfig symbol that allows to change the default back to enabled, e.g. for debug kernels. This also makes the change obvious to kernel packagers. Let's also change the Kconfig description for CONFIG_DEBUG_PAGEALLOC, to indicate that there are two stages of overhead. 
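The pattern the change relies on, a compile-time default that the debug_pagealloc= boot parameter can override in either direction, can be sketched in plain C as below. The macro name stands in for the new Kconfig symbol and is an assumption of this sketch; the kernel itself uses IS_ENABLED() and an early_param() handler.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#ifndef PAGEALLOC_DEFAULT_ON
#define PAGEALLOC_DEFAULT_ON 0   /* models CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT=n */
#endif

static bool debug_pagealloc_enabled = PAGEALLOC_DEFAULT_ON;

/* models the boot parameter: "on" and "off" both override the built-in default */
static void parse_debug_pagealloc(const char *arg)
{
	if (!arg)
		return;
	if (strcmp(arg, "on") == 0)
		debug_pagealloc_enabled = true;
	else if (strcmp(arg, "off") == 0)
		debug_pagealloc_enabled = false;
}

int main(int argc, char **argv)
{
	parse_debug_pagealloc(argc > 1 ? argv[1] : NULL);
	printf("debug_pagealloc: %s\n", debug_pagealloc_enabled ? "on" : "off");
	return 0;
}

The point of the design is that packagers pick the default at build time while a single boot parameter still wins at runtime.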
Signed-off-by: Christian Borntraeger Cc: Joonsoo Kim Cc: Peter Zijlstra Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig.debug | 18 ++++++++++++++++-- mm/page_alloc.c | 6 +++++- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 957d3da..a0c136a 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -16,8 +16,8 @@ config DEBUG_PAGEALLOC select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). - This results in a large slowdown, but helps to find certain types - of memory corruption. + Depending on runtime enablement, this results in a small or large + slowdown, but helps to find certain types of memory corruption. For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, fill the pages with poison patterns after free_pages() and verify @@ -26,5 +26,19 @@ config DEBUG_PAGEALLOC that would result in incorrect warnings of memory corruption after a resume because free pages are not saved to the suspend image. + By default this option will have a small overhead, e.g. by not + allowing the kernel mapping to be backed by large pages on some + architectures. Even bigger overhead comes when the debugging is + enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc + command line parameter. + +config DEBUG_PAGEALLOC_ENABLE_DEFAULT + bool "Enable debug page memory allocations by default?" + default n + depends on DEBUG_PAGEALLOC + ---help--- + Enable debug page memory allocations by default? This value + can be overridden by debug_pagealloc=off|on. + config PAGE_POISONING bool diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe4378f..36a0a79 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -478,7 +478,8 @@ void prep_compound_page(struct page *page, unsigned int order) #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; -bool _debug_pagealloc_enabled __read_mostly; +bool _debug_pagealloc_enabled __read_mostly + = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); bool _debug_guardpage_enabled __read_mostly; static int __init early_debug_pagealloc(char *buf) @@ -489,6 +490,9 @@ static int __init early_debug_pagealloc(char *buf) if (strcmp(buf, "on") == 0) _debug_pagealloc_enabled = true; + if (strcmp(buf, "off") == 0) + _debug_pagealloc_enabled = false; + return 0; } early_param("debug_pagealloc", early_debug_pagealloc); -- cgit v1.1 From 32b635298ff4e991d8d8f64dc23782b02eec29c3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 15 Mar 2016 14:55:36 -0700 Subject: mm: filemap: remove redundant code in do_read_cache_page do_read_cache_page and __read_cache_page duplicate page filler code when filling the page for the first time. This patch simply removes the duplicate logic. 
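The shape of the cleanup, where both the "freshly allocated" path and the "found but not up to date" path funnel into one shared fill step instead of duplicating it, can be sketched with the toy userspace cache below. Everything here (the one-slot cache, the trivial filler) is invented for illustration; only the control-flow idea corresponds to the patch that follows.

/* One-entry "page cache" with a single shared fill path. */
#include <stdio.h>

struct page { int present; int uptodate; };

static struct page cache_slot;       /* pretend page cache with one slot */

static int filler(struct page *p)
{
	p->uptodate = 1;             /* pretend the read completed successfully */
	return 0;
}

static struct page *do_read_cache_page(void)
{
	struct page *p = &cache_slot;

	if (!p->present) {           /* miss: instantiate the page ...         */
		p->present = 1;
		p->uptodate = 0;
		goto fill;           /* ... and fall into the shared fill path */
	}
	if (p->uptodate)
		return p;            /* hit and valid: fast path */
fill:
	if (filler(p) < 0)
		return NULL;         /* one copy of the error handling */
	return p;
}

int main(void)
{
	printf("first read:  %s\n", do_read_cache_page() ? "ok" : "error");
	printf("second read: %s\n", do_read_cache_page() ? "ok" : "error");
	return 0;
}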
Signed-off-by: Mel Gorman Reviewed-by: Jan Kara Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index da7a35d..deae0b9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2303,7 +2303,7 @@ static struct page *wait_on_page_read(struct page *page) return page; } -static struct page *__read_cache_page(struct address_space *mapping, +static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data, @@ -2325,31 +2325,19 @@ repeat: /* Presumably ENOMEM for radix tree node */ return ERR_PTR(err); } + +filler: err = filler(data, page); if (err < 0) { page_cache_release(page); - page = ERR_PTR(err); - } else { - page = wait_on_page_read(page); + return ERR_PTR(err); } - } - return page; -} - -static struct page *do_read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data, - gfp_t gfp) - -{ - struct page *page; - int err; -retry: - page = __read_cache_page(mapping, index, filler, data, gfp); - if (IS_ERR(page)) - return page; + page = wait_on_page_read(page); + if (IS_ERR(page)) + return page; + goto out; + } if (PageUptodate(page)) goto out; @@ -2357,21 +2345,14 @@ retry: if (!page->mapping) { unlock_page(page); page_cache_release(page); - goto retry; + goto repeat; } if (PageUptodate(page)) { unlock_page(page); goto out; } - err = filler(data, page); - if (err < 0) { - page_cache_release(page); - return ERR_PTR(err); - } else { - page = wait_on_page_read(page); - if (IS_ERR(page)) - return page; - } + goto filler; + out: mark_page_accessed(page); return page; -- cgit v1.1 From ebded02788b5d7c7600f8cff26ae07896d568649 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 15 Mar 2016 14:55:39 -0700 Subject: mm: filemap: avoid unnecessary calls to lock_page when waiting for IO to complete during a read In the generic read paths the kernel looks up a page in the page cache and if it's up to date, it is used. If not, the page lock is acquired to wait for IO to complete and then check the page. If multiple processes are waiting on IO, they all serialise against the lock and duplicate the checks. This is unnecessary. The page lock in itself does not give any guarantees to the callers about the page state as it can be immediately truncated or reclaimed after the page is unlocked. It's sufficient to wait_on_page_locked and then continue if the page is up to date on wakeup. It is possible that a truncated but up-to-date page is returned but the reference taken during read prevents it disappearing underneath the caller and the data is still valid if PageUptodate. The overall impact is small as even if processes serialise on the lock, the lock section is tiny once the IO is complete. Profiles indicated that unlock_page and friends are generally a tiny portion of a read-intensive workload. An artificial test was created that had instances of dd access a cache-cold file on an ext4 filesystem and measure how long the read took. 
paralleldd 4.4.0 4.4.0 vanilla avoidlock Amean Elapsd-1 5.28 ( 0.00%) 5.15 ( 2.50%) Amean Elapsd-4 5.29 ( 0.00%) 5.17 ( 2.12%) Amean Elapsd-7 5.28 ( 0.00%) 5.18 ( 1.78%) Amean Elapsd-12 5.20 ( 0.00%) 5.33 ( -2.50%) Amean Elapsd-21 5.14 ( 0.00%) 5.21 ( -1.41%) Amean Elapsd-30 5.30 ( 0.00%) 5.12 ( 3.38%) Amean Elapsd-48 5.78 ( 0.00%) 5.42 ( 6.21%) Amean Elapsd-79 6.78 ( 0.00%) 6.62 ( 2.46%) Amean Elapsd-110 9.09 ( 0.00%) 8.99 ( 1.15%) Amean Elapsd-128 10.60 ( 0.00%) 10.43 ( 1.66%) The impact is small but intuitively, it makes sense to avoid unnecessary calls to lock_page. Signed-off-by: Mel Gorman Reviewed-by: Jan Kara Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index deae0b9..4a0f5fa 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1668,6 +1668,15 @@ find_page: index, last_index - index); } if (!PageUptodate(page)) { + /* + * See comment in do_read_cache_page on why + * wait_on_page_locked is used to avoid unnecessarily + * serialisations and why it's safe. + */ + wait_on_page_locked_killable(page); + if (PageUptodate(page)) + goto page_ok; + if (inode->i_blkbits == PAGE_CACHE_SHIFT || !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; @@ -2341,12 +2350,52 @@ filler: if (PageUptodate(page)) goto out; + /* + * Page is not up to date and may be locked due one of the following + * case a: Page is being filled and the page lock is held + * case b: Read/write error clearing the page uptodate status + * case c: Truncation in progress (page locked) + * case d: Reclaim in progress + * + * Case a, the page will be up to date when the page is unlocked. + * There is no need to serialise on the page lock here as the page + * is pinned so the lock gives no additional protection. Even if the + * the page is truncated, the data is still valid if PageUptodate as + * it's a race vs truncate race. + * Case b, the page will not be up to date + * Case c, the page may be truncated but in itself, the data may still + * be valid after IO completes as it's a read vs truncate race. The + * operation must restart if the page is not uptodate on unlock but + * otherwise serialising on page lock to stabilise the mapping gives + * no additional guarantees to the caller as the page lock is + * released before return. + * Case d, similar to truncation. If reclaim holds the page lock, it + * will be a race with remove_mapping that determines if the mapping + * is valid on unlock but otherwise the data is valid and there is + * no need to serialise with page lock. + * + * As the page lock gives no additional guarantee, we optimistically + * wait on the page to be unlocked and check if it's up to date and + * use the page if it is. Otherwise, the page lock is required to + * distinguish between the different cases. The motivation is that we + * avoid spurious serialisations and wakeups when multiple processes + * wait on the same page for IO to complete. 
+ */ + wait_on_page_locked(page); + if (PageUptodate(page)) + goto out; + + /* Distinguish between all the cases under the safety of the lock */ lock_page(page); + + /* Case c or d, restart the operation */ if (!page->mapping) { unlock_page(page); page_cache_release(page); goto repeat; } + + /* Someone else locked and filled the page in a very small window */ if (PageUptodate(page)) { unlock_page(page); goto out; -- cgit v1.1 From 420adbe9fc1a45187cfa74df9dbfd72272c4e2fa Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:55:52 -0700 Subject: mm, tracing: unify mm flags handling in tracepoints and printk In tracepoints, it's possible to print gfp flags in a human-friendly format through a macro show_gfp_flags(), which defines a translation array and passes is to __print_flags(). Since the following patch will introduce support for gfp flags printing in printk(), it would be nice to reuse the array. This is not straightforward, since __print_flags() can't simply reference an array defined in a .c file such as mm/debug.c - it has to be a macro to allow the macro magic to communicate the format to userspace tools such as trace-cmd. The solution is to create a macro __def_gfpflag_names which is used both in show_gfp_flags(), and to define the gfpflag_names[] array in mm/debug.c. On the other hand, mm/debug.c also defines translation tables for page flags and vma flags, and desire was expressed (but not implemented in this series) to use these also from tracepoints. Thus, this patch also renames the events/gfpflags.h file to events/mmflags.h and moves the table definitions there, using the same macro approach as for gfpflags. This allows translating all three kinds of mm-specific flags both in tracepoints and printk. Signed-off-by: Vlastimil Babka Reviewed-by: Michal Hocko Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Rasmus Villemoes Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. 
Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 88 ++++++++------------------------------------------------------ 1 file changed, 11 insertions(+), 77 deletions(-) (limited to 'mm') diff --git a/mm/debug.c b/mm/debug.c index f05b2d5..410af90 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -9,41 +9,14 @@ #include #include #include +#include static const struct trace_print_flags pageflag_names[] = { - {1UL << PG_locked, "locked" }, - {1UL << PG_error, "error" }, - {1UL << PG_referenced, "referenced" }, - {1UL << PG_uptodate, "uptodate" }, - {1UL << PG_dirty, "dirty" }, - {1UL << PG_lru, "lru" }, - {1UL << PG_active, "active" }, - {1UL << PG_slab, "slab" }, - {1UL << PG_owner_priv_1, "owner_priv_1" }, - {1UL << PG_arch_1, "arch_1" }, - {1UL << PG_reserved, "reserved" }, - {1UL << PG_private, "private" }, - {1UL << PG_private_2, "private_2" }, - {1UL << PG_writeback, "writeback" }, - {1UL << PG_head, "head" }, - {1UL << PG_swapcache, "swapcache" }, - {1UL << PG_mappedtodisk, "mappedtodisk" }, - {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_swapbacked, "swapbacked" }, - {1UL << PG_unevictable, "unevictable" }, -#ifdef CONFIG_MMU - {1UL << PG_mlocked, "mlocked" }, -#endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - {1UL << PG_uncached, "uncached" }, -#endif -#ifdef CONFIG_MEMORY_FAILURE - {1UL << PG_hwpoison, "hwpoison" }, -#endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) - {1UL << PG_young, "young" }, - {1UL << PG_idle, "idle" }, -#endif + __def_pageflag_names +}; + +static const struct trace_print_flags gfpflag_names[] = { + __def_gfpflag_names }; static void dump_flags(unsigned long flags, @@ -108,47 +81,8 @@ EXPORT_SYMBOL(dump_page); #ifdef CONFIG_DEBUG_VM -static const struct trace_print_flags vmaflags_names[] = { - {VM_READ, "read" }, - {VM_WRITE, "write" }, - {VM_EXEC, "exec" }, - {VM_SHARED, "shared" }, - {VM_MAYREAD, "mayread" }, - {VM_MAYWRITE, "maywrite" }, - {VM_MAYEXEC, "mayexec" }, - {VM_MAYSHARE, "mayshare" }, - {VM_GROWSDOWN, "growsdown" }, - {VM_PFNMAP, "pfnmap" }, - {VM_DENYWRITE, "denywrite" }, - {VM_LOCKONFAULT, "lockonfault" }, - {VM_LOCKED, "locked" }, - {VM_IO, "io" }, - {VM_SEQ_READ, "seqread" }, - {VM_RAND_READ, "randread" }, - {VM_DONTCOPY, "dontcopy" }, - {VM_DONTEXPAND, "dontexpand" }, - {VM_ACCOUNT, "account" }, - {VM_NORESERVE, "noreserve" }, - {VM_HUGETLB, "hugetlb" }, -#if defined(CONFIG_X86) - {VM_PAT, "pat" }, -#elif defined(CONFIG_PPC) - {VM_SAO, "sao" }, -#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) - {VM_GROWSUP, "growsup" }, -#elif !defined(CONFIG_MMU) - {VM_MAPPED_COPY, "mappedcopy" }, -#else - {VM_ARCH_1, "arch_1" }, -#endif - {VM_DONTDUMP, "dontdump" }, -#ifdef CONFIG_MEM_SOFT_DIRTY - {VM_SOFTDIRTY, "softdirty" }, -#endif - {VM_MIXEDMAP, "mixedmap" }, - {VM_HUGEPAGE, "hugepage" }, - {VM_NOHUGEPAGE, "nohugepage" }, - {VM_MERGEABLE, "mergeable" }, +static const struct trace_print_flags vmaflag_names[] = { + __def_vmaflag_names }; void dump_vma(const struct vm_area_struct *vma) @@ -162,7 +96,7 @@ void dump_vma(const struct vm_area_struct *vma) (unsigned long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, vma->vm_file, vma->vm_private_data); - dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names)); + dump_flags(vma->vm_flags, vmaflag_names, ARRAY_SIZE(vmaflag_names)); } EXPORT_SYMBOL(dump_vma); @@ -233,8 +167,8 @@ void dump_mm(const struct mm_struct *mm) "" /* This is here to not have a comma! 
*/ ); - dump_flags(mm->def_flags, vmaflags_names, - ARRAY_SIZE(vmaflags_names)); + dump_flags(mm->def_flags, vmaflag_names, + ARRAY_SIZE(vmaflag_names)); } #endif /* CONFIG_DEBUG_VM */ -- cgit v1.1 From edf14cdbf9a0e5ab52698ca66d07a76ade0d5c46 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:55:56 -0700 Subject: mm, printk: introduce new format string for flags In mm we use several kinds of flags bitfields that are sometimes printed for debugging purposes, or exported to userspace via sysfs. To make them easier to interpret independently on kernel version and config, we want to dump also the symbolic flag names. So far this has been done with repeated calls to pr_cont(), which is unreliable on SMP, and not usable for e.g. sysfs export. To get a more reliable and universal solution, this patch extends printk() format string for pointers to handle the page flags (%pGp), gfp_flags (%pGg) and vma flags (%pGv). Existing users of dump_flag_names() are converted and simplified. It would be possible to pass flags by value instead of pointer, but the %p format string for pointers already has extensions for various kernel structures, so it's a good fit, and the extra indirection in a non-critical path is negligible. [linux@rasmusvillemoes.dk: lots of good implementation suggestions] Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Rasmus Villemoes Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 34 ++++++++++++++++++++-------------- mm/internal.h | 6 ++++++ 2 files changed, 26 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/debug.c b/mm/debug.c index 410af90..0328fd3 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -11,12 +11,21 @@ #include #include -static const struct trace_print_flags pageflag_names[] = { - __def_pageflag_names +#include "internal.h" + +const struct trace_print_flags pageflag_names[] = { + __def_pageflag_names, + {0, NULL} +}; + +const struct trace_print_flags gfpflag_names[] = { + __def_gfpflag_names, + {0, NULL} }; -static const struct trace_print_flags gfpflag_names[] = { - __def_gfpflag_names +const struct trace_print_flags vmaflag_names[] = { + __def_vmaflag_names, + {0, NULL} }; static void dump_flags(unsigned long flags, @@ -58,14 +67,15 @@ void dump_page_badflags(struct page *page, const char *reason, if (PageCompound(page)) pr_cont(" compound_mapcount: %d", compound_mapcount(page)); pr_cont("\n"); - BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); - dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); + dump_flags(page->flags, pageflag_names, + ARRAY_SIZE(pageflag_names) - 1); if (reason) pr_alert("page dumped because: %s\n", reason); if (page->flags & badflags) { pr_alert("bad because of flags:\n"); - dump_flags(page->flags & badflags, - pageflag_names, ARRAY_SIZE(pageflag_names)); + dump_flags(page->flags & badflags, pageflag_names, + ARRAY_SIZE(pageflag_names) - 1); } #ifdef CONFIG_MEMCG if (page->mem_cgroup) @@ -81,10 +91,6 @@ EXPORT_SYMBOL(dump_page); #ifdef CONFIG_DEBUG_VM -static const struct trace_print_flags vmaflag_names[] = { - __def_vmaflag_names -}; - void dump_vma(const struct vm_area_struct *vma) { pr_emerg("vma %p start %p end %p\n" @@ -96,7 +102,7 @@ void dump_vma(const struct vm_area_struct *vma) (unsigned 
long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, vma->vm_file, vma->vm_private_data); - dump_flags(vma->vm_flags, vmaflag_names, ARRAY_SIZE(vmaflag_names)); + dump_flags(vma->vm_flags, vmaflag_names, ARRAY_SIZE(vmaflag_names) - 1); } EXPORT_SYMBOL(dump_vma); @@ -168,7 +174,7 @@ void dump_mm(const struct mm_struct *mm) ); dump_flags(mm->def_flags, vmaflag_names, - ARRAY_SIZE(vmaflag_names)); + ARRAY_SIZE(vmaflag_names) - 1); } #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/internal.h b/mm/internal.h index a38a21e..6636e1d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -14,6 +14,7 @@ #include #include #include +#include /* * The set of flags that only affect watermark checking and reclaim @@ -466,4 +467,9 @@ static inline void try_to_unmap_flush_dirty(void) } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + +extern const struct trace_print_flags pageflag_names[]; +extern const struct trace_print_flags vmaflag_names[]; +extern const struct trace_print_flags gfpflag_names[]; + #endif /* __MM_INTERNAL_H */ -- cgit v1.1 From b8eceeb99014cf5ae90d7f2c606d36074baa73ae Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:55:59 -0700 Subject: mm, debug: replace dump_flags() with the new printk formats With the new printk format strings for flags, we can get rid of dump_flags() in mm/debug.c. This also fixes dump_vma() which used dump_flags() for printing vma flags. However dump_flags() did a page-flags specific filtering of bits higher than NR_PAGEFLAGS in order to remove the zone id part. For dump_vma() this resulted in removing several VM_* flags from the symbolic translation. Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Rasmus Villemoes Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. 
Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 60 ++++++++++++++---------------------------------------------- 1 file changed, 14 insertions(+), 46 deletions(-) (limited to 'mm') diff --git a/mm/debug.c b/mm/debug.c index 0328fd3..231e145 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -28,36 +28,6 @@ const struct trace_print_flags vmaflag_names[] = { {0, NULL} }; -static void dump_flags(unsigned long flags, - const struct trace_print_flags *names, int count) -{ - const char *delim = ""; - unsigned long mask; - int i; - - pr_emerg("flags: %#lx(", flags); - - /* remove zone id */ - flags &= (1UL << NR_PAGEFLAGS) - 1; - - for (i = 0; i < count && flags; i++) { - - mask = names[i].mask; - if ((flags & mask) != mask) - continue; - - flags &= ~mask; - pr_cont("%s%s", delim, names[i].name); - delim = "|"; - } - - /* check for left over flags */ - if (flags) - pr_cont("%s%#lx", delim, flags); - - pr_cont(")\n"); -} - void dump_page_badflags(struct page *page, const char *reason, unsigned long badflags) { @@ -68,15 +38,15 @@ void dump_page_badflags(struct page *page, const char *reason, pr_cont(" compound_mapcount: %d", compound_mapcount(page)); pr_cont("\n"); BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); - dump_flags(page->flags, pageflag_names, - ARRAY_SIZE(pageflag_names) - 1); + pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); + if (reason) pr_alert("page dumped because: %s\n", reason); - if (page->flags & badflags) { - pr_alert("bad because of flags:\n"); - dump_flags(page->flags & badflags, pageflag_names, - ARRAY_SIZE(pageflag_names) - 1); - } + + badflags &= page->flags; + if (badflags) + pr_alert("bad because of flags: %#lx(%pGp)\n", badflags, + &badflags); #ifdef CONFIG_MEMCG if (page->mem_cgroup) pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); @@ -96,13 +66,14 @@ void dump_vma(const struct vm_area_struct *vma) pr_emerg("vma %p start %p end %p\n" "next %p prev %p mm %p\n" "prot %lx anon_vma %p vm_ops %p\n" - "pgoff %lx file %p private_data %p\n", + "pgoff %lx file %p private_data %p\n" + "flags: %#lx(%pGv)\n", vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, vma->vm_prev, vma->vm_mm, (unsigned long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, - vma->vm_file, vma->vm_private_data); - dump_flags(vma->vm_flags, vmaflag_names, ARRAY_SIZE(vmaflag_names) - 1); + vma->vm_file, vma->vm_private_data, + vma->vm_flags, &vma->vm_flags); } EXPORT_SYMBOL(dump_vma); @@ -136,7 +107,7 @@ void dump_mm(const struct mm_struct *mm) #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) "tlb_flush_pending %d\n" #endif - "%s", /* This is here to hold the comma */ + "def_flags: %#lx(%pGv)\n", mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, #ifdef CONFIG_MMU @@ -170,11 +141,8 @@ void dump_mm(const struct mm_struct *mm) #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) mm->tlb_flush_pending, #endif - "" /* This is here to not have a comma! 
*/ - ); - - dump_flags(mm->def_flags, vmaflag_names, - ARRAY_SIZE(vmaflag_names) - 1); + mm->def_flags, &mm->def_flags + ); } #endif /* CONFIG_DEBUG_VM */ -- cgit v1.1 From c5c990e8a1fe313a5ab13f069f1410acf932927b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:02 -0700 Subject: mm, page_alloc: print symbolic gfp_flags on allocation failure It would be useful to translate gfp_flags into string representation when printing in case of an allocation failure, especially as the flags have been undergoing some changes recently and the script ./scripts/gfp-translate needs a matching source version to be accurate. Example output: stapio: page allocation failure: order:9, mode:0x2080020(GFP_ATOMIC) Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 36a0a79..4e8029a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2695,9 +2695,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) va_end(args); } - pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n", - current->comm, order, gfp_mask); - + pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n", + current->comm, order, gfp_mask, &gfp_mask); dump_stack(); if (!should_suppress_show_mem()) show_mem(filter); -- cgit v1.1 From a0795cd416d1142117695f932a9690611ae0edbb Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:05 -0700 Subject: mm, oom: print symbolic gfp_flags in oom warning It would be useful to translate gfp_flags into string representation when printing in case of an OOM, especially as the flags have been undergoing some changes recently and the script ./scripts/gfp-translate needs a matching source version to be accurate. Example output: a.out invoked oom-killer: gfp_mask=0x24280ca(GFP_HIGHUSER_MOVABLE|GFP_ZERO), order=0, om_score_adj=0 Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dc490c0..e97a05d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -386,10 +386,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p, struct mem_cgroup *memcg) { - pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_score_adj=%hd\n", - current->comm, oc->gfp_mask, oc->order, + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, " + "oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, current->signal->oom_score_adj); + cpuset_print_current_mems_allowed(); dump_stack(); if (memcg) -- cgit v1.1 From 60f30350fd69a3e4d5f0f45937d3274c22565134 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:08 -0700 Subject: mm, page_owner: print migratetype of page and pageblock, symbolic flags The information in /sys/kernel/debug/page_owner includes the migratetype of the pageblock the page belongs to. 
This is also checked against the page's migratetype (as declared by gfp_flags during its allocation), and the page is reported as Fallback if its migratetype differs from the pageblock's one. This is somewhat misleading because in fact fallback allocation is not the only reason why these two can differ. It also doesn't directly provide the page's migratetype, although it's possible to derive that from the gfp_flags. It's arguably better to print both the page's and the pageblock's migratetype and leave the interpretation to the consumer than to suggest fallback allocation as the only possible reason. While at it, we can print the migratetypes as strings the same way as /proc/pagetypeinfo does, as some of the numeric values depend on kernel configuration. For that, this patch moves the migratetype_names array from the #ifdef CONFIG_PROC_FS part of mm/vmstat.c to mm/page_alloc.c and exports it. With the new format strings for flags, we can now also provide symbolic page and gfp flags in the /sys/kernel/debug/page_owner file. This replaces the positional printing of page flags as single letters, which might have looked nicer, but was limited to a subset of flags, and required the user to remember the letters. Example page_owner entry after the patch: Page allocated via order 0, mask 0x24213ca(GFP_HIGHUSER_MOVABLE|__GFP_COLD|__GFP_NOWARN|__GFP_NORETRY) PFN 520 type Movable Block 1 type Movable Flags 0xfffff8001006c(referenced|uptodate|lru|active|mappedtodisk) [] __alloc_pages_nodemask+0x134/0x230 [] alloc_pages_current+0x88/0x120 [] __page_cache_alloc+0xe6/0x120 [] __do_page_cache_readahead+0xdc/0x240 [] ondemand_readahead+0x135/0x260 [] page_cache_sync_readahead+0x31/0x50 [] generic_file_read_iter+0x453/0x760 [] __vfs_read+0xa7/0xd0 Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A.
Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 +++++++++++++ mm/page_owner.c | 24 +++++++----------------- mm/vmstat.c | 13 ------------- 3 files changed, 20 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e8029a..030fafc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -223,6 +223,19 @@ static char * const zone_names[MAX_NR_ZONES] = { #endif }; +char * const migratetype_names[MIGRATE_TYPES] = { + "Unmovable", + "Movable", + "Reclaimable", + "HighAtomic", +#ifdef CONFIG_CMA + "CMA", +#endif +#ifdef CONFIG_MEMORY_ISOLATION + "Isolate", +#endif +}; + compound_page_dtor * const compound_page_dtors[] = { NULL, free_compound_page, diff --git a/mm/page_owner.c b/mm/page_owner.c index 983c3a1..7a37a30 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -100,8 +100,9 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask 0x%x\n", - page_ext->order, page_ext->gfp_mask); + "Page allocated via order %u, mask %#x(%pGg)\n", + page_ext->order, page_ext->gfp_mask, + &page_ext->gfp_mask); if (ret >= count) goto err; @@ -110,23 +111,12 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, pageblock_mt = get_pfnblock_migratetype(page, pfn); page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); ret += snprintf(kbuf + ret, count - ret, - "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", + "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, + migratetype_names[page_mt], pfn >> pageblock_order, - pageblock_mt, - pageblock_mt != page_mt ? "Fallback" : " ", - PageLocked(page) ? "K" : " ", - PageError(page) ? "E" : " ", - PageReferenced(page) ? "R" : " ", - PageUptodate(page) ? "U" : " ", - PageDirty(page) ? "D" : " ", - PageLRU(page) ? "L" : " ", - PageActive(page) ? "A" : " ", - PageSlab(page) ? "S" : " ", - PageWriteback(page) ? "W" : " ", - PageCompound(page) ? "C" : " ", - PageSwapCache(page) ? "B" : " ", - PageMappedToDisk(page) ? "M" : " "); + migratetype_names[pageblock_mt], + page->flags, &page->flags); if (ret >= count) goto err; diff --git a/mm/vmstat.c b/mm/vmstat.c index 084c672..72c1798 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -924,19 +924,6 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, #endif #ifdef CONFIG_PROC_FS -static char * const migratetype_names[MIGRATE_TYPES] = { - "Unmovable", - "Movable", - "Reclaimable", - "HighAtomic", -#ifdef CONFIG_CMA - "CMA", -#endif -#ifdef CONFIG_MEMORY_ISOLATION - "Isolate", -#endif -}; - static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { -- cgit v1.1 From 7dd80b8af0bcd705a9ef2fa272c082882616a499 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:12 -0700 Subject: mm, page_owner: convert page_owner_inited to static key CONFIG_PAGE_OWNER attempts to impose negligible runtime overhead when enabled during compilation, but not actually enabled during runtime by boot param page_owner=on. This overhead can be further reduced using the static key mechanism, which this patch does. Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. 
Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 9 +++++---- mm/vmstat.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_owner.c b/mm/page_owner.c index 7a37a30..feaa28b 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -5,10 +5,11 @@ #include #include #include +#include #include "internal.h" static bool page_owner_disabled = true; -bool page_owner_inited __read_mostly; +DEFINE_STATIC_KEY_FALSE(page_owner_inited); static void init_early_allocated_pages(void); @@ -37,7 +38,7 @@ static void init_page_owner(void) if (page_owner_disabled) return; - page_owner_inited = true; + static_branch_enable(&page_owner_inited); init_early_allocated_pages(); } @@ -147,7 +148,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) struct page *page; struct page_ext *page_ext; - if (!page_owner_inited) + if (!static_branch_unlikely(&page_owner_inited)) return -EINVAL; page = NULL; @@ -295,7 +296,7 @@ static int __init pageowner_init(void) { struct dentry *dentry; - if (!page_owner_inited) { + if (!static_branch_unlikely(&page_owner_inited)) { pr_info("page_owner is disabled\n"); return 0; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 72c1798..69ce64f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1120,7 +1120,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) #ifdef CONFIG_PAGE_OWNER int mtype; - if (!page_owner_inited) + if (!static_branch_unlikely(&page_owner_inited)) return; drain_all_pages(NULL); -- cgit v1.1 From d435edca928805074dae005ab9a42d9fa60fc702 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:15 -0700 Subject: mm, page_owner: copy page owner info during migration The page_owner mechanism stores gfp_flags of an allocation and the stack trace that led to it. During page migration, the original information is practically replaced by the allocation of a free page as the migration target. Arguably this is less useful and might lead to the page_owner info for all migratable pages gradually converging towards compaction or numa balancing migrations. It has also led to inaccuracies such as the one fixed by commit e2cfc91120fa ("mm/page_owner: set correct gfp_mask on page_owner"). This patch thus introduces copying the page_owner info during migration. However, since the fact that the page has been migrated from its original place might be useful for debugging, the next patch will introduce a way to track that information as well. Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A.
Shutemov" Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 3 +++ mm/page_owner.c | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 3ad0fea..8133805 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -38,6 +38,7 @@ #include #include #include +#include #include @@ -578,6 +579,8 @@ void migrate_page_copy(struct page *newpage, struct page *page) */ if (PageWriteback(newpage)) end_page_writeback(newpage); + + copy_page_owner(page, newpage); } /************************************************************ diff --git a/mm/page_owner.c b/mm/page_owner.c index feaa28b..774b556 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -84,6 +84,31 @@ gfp_t __get_page_owner_gfp(struct page *page) return page_ext->gfp_mask; } +void __copy_page_owner(struct page *oldpage, struct page *newpage) +{ + struct page_ext *old_ext = lookup_page_ext(oldpage); + struct page_ext *new_ext = lookup_page_ext(newpage); + int i; + + new_ext->order = old_ext->order; + new_ext->gfp_mask = old_ext->gfp_mask; + new_ext->nr_entries = old_ext->nr_entries; + + for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++) + new_ext->trace_entries[i] = old_ext->trace_entries[i]; + + /* + * We don't clear the bit on the oldpage as it's going to be freed + * after migration. Until then, the info can be useful in case of + * a bug, and the overal stats will be off a bit only temporarily. + * Also, migrate_misplaced_transhuge_page() can still fail the + * migration and then we want the oldpage to retain the info. But + * in that case we also don't need to explicitly clear the info from + * the new page, which will be freed. + */ + __set_bit(PAGE_EXT_OWNER, &new_ext->flags); +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_ext *page_ext) -- cgit v1.1 From 7cd12b4abfd2f8f42414c520bbd051a5b7dc7a8c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:18 -0700 Subject: mm, page_owner: track and print last migrate reason During migration, page_owner info is now copied with the rest of the page, so the stacktrace leading to free page allocation during migration is overwritten. For debugging purposes, it might be however useful to know that the page has been migrated since its initial allocation. This might happen many times during the lifetime for different reasons and fully tracking this, especially with stacktraces would incur extra memory costs. As a compromise, store and print the migrate_reason of the last migration that occurred to the page. This is enough to distinguish compaction, numa balancing etc. Example page_owner entry after the patch: Page allocated via order 0, mask 0x24200ca(GFP_HIGHUSER_MOVABLE) PFN 628753 type Movable Block 1228 type Movable Flags 0x1fffff80040030(dirty|lru|swapbacked) [] __alloc_pages_nodemask+0x134/0x230 [] alloc_pages_vma+0xb5/0x250 [] shmem_alloc_page+0x61/0x90 [] shmem_getpage_gfp+0x678/0x960 [] shmem_fallocate+0x329/0x440 [] vfs_fallocate+0x140/0x230 [] SyS_fallocate+0x44/0x70 [] entry_SYSCALL_64_fastpath+0x12/0x71 Page has been migrated, last migrate reason: compaction Signed-off-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. 
Shutemov" Cc: Mel Gorman Cc: Michal Hocko Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 11 +++++++++++ mm/migrate.c | 10 +++++++--- mm/page_owner.c | 17 +++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/debug.c b/mm/debug.c index 231e145..78dc548 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -10,9 +10,20 @@ #include #include #include +#include #include "internal.h" +char *migrate_reason_names[MR_TYPES] = { + "compaction", + "memory_failure", + "memory_hotplug", + "syscall_or_cpuset", + "mempolicy_mbind", + "numa_misplaced", + "cma", +}; + const struct trace_print_flags pageflag_names[] = { __def_pageflag_names, {0, NULL} diff --git a/mm/migrate.c b/mm/migrate.c index 8133805..432ecd0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -955,8 +955,10 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, } rc = __unmap_and_move(page, newpage, force, mode); - if (rc == MIGRATEPAGE_SUCCESS) + if (rc == MIGRATEPAGE_SUCCESS) { put_new_page = NULL; + set_page_owner_migrate_reason(newpage, reason); + } out: if (rc != -EAGAIN) { @@ -1021,7 +1023,7 @@ out: static int unmap_and_move_huge_page(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *hpage, int force, - enum migrate_mode mode) + enum migrate_mode mode, int reason) { int rc = -EAGAIN; int *result = NULL; @@ -1079,6 +1081,7 @@ put_anon: if (rc == MIGRATEPAGE_SUCCESS) { hugetlb_cgroup_migrate(hpage, new_hpage); put_new_page = NULL; + set_page_owner_migrate_reason(new_hpage, reason); } unlock_page(hpage); @@ -1151,7 +1154,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, put_new_page, private, page, - pass > 2, mode); + pass > 2, mode, reason); else rc = unmap_and_move(get_new_page, put_new_page, private, page, pass > 2, mode, @@ -1842,6 +1845,7 @@ fail_putback: set_page_memcg(new_page, page_memcg(page)); set_page_memcg(page, NULL); page_remove_rmap(page, true); + set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); diff --git a/mm/page_owner.c b/mm/page_owner.c index 774b556..a57068c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "internal.h" static bool page_owner_disabled = true; @@ -73,10 +74,18 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) page_ext->order = order; page_ext->gfp_mask = gfp_mask; page_ext->nr_entries = trace.nr_entries; + page_ext->last_migrate_reason = -1; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } +void __set_page_owner_migrate_reason(struct page *page, int reason) +{ + struct page_ext *page_ext = lookup_page_ext(page); + + page_ext->last_migrate_reason = reason; +} + gfp_t __get_page_owner_gfp(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); @@ -151,6 +160,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (ret >= count) goto err; + if (page_ext->last_migrate_reason != -1) { + ret += snprintf(kbuf + ret, count - ret, + "Page has been migrated, last migrate reason: %s\n", + migrate_reason_names[page_ext->last_migrate_reason]); + if (ret >= count) + goto err; + } + ret += snprintf(kbuf + ret, count - ret, "\n"); if (ret >= count) goto err; -- cgit v1.1 From 4e462112e98f9ad6dd62e160f8b14c7df5fed2fc Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 
14:56:21 -0700 Subject: mm, page_owner: dump page owner info from dump_page() The page_owner mechanism is useful for dealing with memory leaks. By reading /sys/kernel/debug/page_owner one can determine the stack traces leading to allocations of all pages, and find e.g. a buggy driver. This information might be also potentially useful for debugging, such as the VM_BUG_ON_PAGE() calls to dump_page(). So let's print the stored info from dump_page(). Example output: page:ffffea000292f1c0 count:1 mapcount:0 mapping:ffff8800b2f6cc18 index:0x91d flags: 0x1fffff8001002c(referenced|uptodate|lru|mappedtodisk) page dumped because: VM_BUG_ON_PAGE(1) page->mem_cgroup:ffff8801392c5000 page allocated via order 0, migratetype Movable, gfp_mask 0x24213ca(GFP_HIGHUSER_MOVABLE|__GFP_COLD|__GFP_NOWARN|__GFP_NORETRY) [] __alloc_pages_nodemask+0x134/0x230 [] alloc_pages_current+0x88/0x120 [] __page_cache_alloc+0xe6/0x120 [] __do_page_cache_readahead+0xdc/0x240 [] ondemand_readahead+0x135/0x260 [] page_cache_async_readahead+0x6c/0x70 [] generic_file_read_iter+0x3f2/0x760 [] __vfs_read+0xa7/0xd0 page has been migrated, last migrate reason: compaction Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 2 ++ mm/page_alloc.c | 1 + mm/page_owner.c | 25 +++++++++++++++++++++++++ 3 files changed, 28 insertions(+) (limited to 'mm') diff --git a/mm/debug.c b/mm/debug.c index 78dc548..61b1f1b 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" @@ -67,6 +68,7 @@ void dump_page_badflags(struct page *page, const char *reason, void dump_page(struct page *page, const char *reason) { dump_page_badflags(page, reason, 0); + dump_page_owner(page); } EXPORT_SYMBOL(dump_page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 030fafc..d98672d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -443,6 +443,7 @@ static void bad_page(struct page *page, const char *reason, printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); dump_page_badflags(page, reason, bad_flags); + dump_page_owner(page); print_modules(); dump_stack(); diff --git a/mm/page_owner.c b/mm/page_owner.c index a57068c..44ad1f0 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -183,6 +183,31 @@ err: return -ENOMEM; } +void __dump_page_owner(struct page *page) +{ + struct page_ext *page_ext = lookup_page_ext(page); + struct stack_trace trace = { + .nr_entries = page_ext->nr_entries, + .entries = &page_ext->trace_entries[0], + }; + gfp_t gfp_mask = page_ext->gfp_mask; + int mt = gfpflags_to_migratetype(gfp_mask); + + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { + pr_alert("page_owner info is not active (free page?)\n"); + return; + } + + pr_alert("page allocated via order %u, migratetype %s, " + "gfp_mask %#x(%pGg)\n", page_ext->order, + migratetype_names[mt], gfp_mask, &gfp_mask); + print_stack_trace(&trace, 0); + + if (page_ext->last_migrate_reason != -1) + pr_alert("page has been migrated, last migrate reason: %s\n", + migrate_reason_names[page_ext->last_migrate_reason]); +} + static ssize_t read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) { -- cgit v1.1 From ff8e81163889ac4c7f59e7f7db6377d0c5d8d69c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:24 -0700 Subject: mm, debug: move bad flags printing to bad_page() Since 
bad_page() is the only user of the badflags parameter of dump_page_badflags(), we can move the code to bad_page() and simplify a bit. The dump_page_badflags() function is renamed to __dump_page() and can still be called separately from dump_page() for temporary debug prints where page_owner info is not desired. The only user-visible change is that page->mem_cgroup is printed before the bad flags. Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Joonsoo Kim Cc: Minchan Kim Cc: Sasha Levin Cc: "Kirill A. Shutemov" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 10 +++------- mm/page_alloc.c | 10 +++++++--- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/debug.c b/mm/debug.c index 61b1f1b..df7247b 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -40,8 +40,7 @@ const struct trace_print_flags vmaflag_names[] = { {0, NULL} }; -void dump_page_badflags(struct page *page, const char *reason, - unsigned long badflags) +void __dump_page(struct page *page, const char *reason) { pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", page, atomic_read(&page->_count), page_mapcount(page), @@ -50,15 +49,12 @@ void dump_page_badflags(struct page *page, const char *reason, pr_cont(" compound_mapcount: %d", compound_mapcount(page)); pr_cont("\n"); BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); + pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); if (reason) pr_alert("page dumped because: %s\n", reason); - badflags &= page->flags; - if (badflags) - pr_alert("bad because of flags: %#lx(%pGp)\n", badflags, - &badflags); #ifdef CONFIG_MEMCG if (page->mem_cgroup) pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); @@ -67,7 +63,7 @@ void dump_page_badflags(struct page *page, const char *reason, void dump_page(struct page *page, const char *reason) { - dump_page_badflags(page, reason, 0); + __dump_page(page, reason); dump_page_owner(page); } EXPORT_SYMBOL(dump_page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d98672d..0691403 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -430,7 +430,7 @@ static void bad_page(struct page *page, const char *reason, goto out; } if (nr_unshown) { - printk(KERN_ALERT + pr_alert( "BUG: Bad page state: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; @@ -440,9 +440,13 @@ static void bad_page(struct page *page, const char *reason, if (nr_shown++ == 0) resume = jiffies + 60 * HZ; - printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", + pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); - dump_page_badflags(page, reason, bad_flags); + __dump_page(page, reason); + bad_flags &= page->flags; + if (bad_flags) + pr_alert("bad because of flags: %#lx(%pGp)\n", + bad_flags, &bad_flags); dump_page_owner(page); print_modules(); -- cgit v1.1 From 8823b1dbc05fab1a8bec275eeae4709257c2661d Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 15 Mar 2016 14:56:27 -0700 Subject: mm/page_poison.c: enable PAGE_POISONING as a separate option Page poisoning is currently set up as a feature if architectures don't have architecture debug page_alloc to allow unmapping of pages. It has uses apart from that though. Clearing of the pages on free provides an increase in security as it helps to limit the risk of information leaks. Allow page poisoning to be enabled as a separate option independent of kernel_map pages since the two features do separate work. 
Because of how hibernation is implemented, the checks on alloc cannot occur if hibernation is enabled. The runtime alloc checks can also be enabled with an option when !HIBERNATION. Credit to the Grsecurity/PaX team for inspiring this work. Signed-off-by: Laura Abbott Cc: Rafael J. Wysocki Cc: "Kirill A. Shutemov" Cc: Vlastimil Babka Cc: Michal Hocko Cc: Kees Cook Cc: Mathias Krause Cc: Dave Hansen Cc: Jianyu Zhan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig.debug | 25 +++++++- mm/Makefile | 2 +- mm/debug-pagealloc.c | 137 ---------------------------------------- mm/page_alloc.c | 2 + mm/page_poison.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 200 insertions(+), 139 deletions(-) delete mode 100644 mm/debug-pagealloc.c create mode 100644 mm/page_poison.c (limited to 'mm') diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index a0c136a..1f99f9a 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -41,4 +41,27 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT can be overridden by debug_pagealloc=off|on. config PAGE_POISONING - bool + bool "Poison pages after freeing" + select PAGE_EXTENSION + select PAGE_POISONING_NO_SANITY if HIBERNATION + ---help--- + Fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages. The filling of the memory helps + reduce the risk of information leaks from freed data. This does + have a potential performance impact. + + Note that "poison" here is not the same thing as the "HWPoison" + for CONFIG_MEMORY_FAILURE. This is software poisoning only. + + If unsure, say N + +config PAGE_POISONING_NO_SANITY + depends on PAGE_POISONING + bool "Only poison, don't sanity check" + ---help--- + Skip the sanity checking on alloc, only fill the pages with + poison on free. This reduces some of the overhead of the + poisoning feature. + + If you are only interested in sanitization, say Y. Otherwise + say N.
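For orientation, a minimal configuration sketch for the new options (the page_poison= boot parameter it assumes is added below in mm/page_poison.c):

	CONFIG_PAGE_POISONING=y
	CONFIG_PAGE_POISONING_NO_SANITY=y	# optional: poison on free, skip the alloc-time check
	# kernel command line, to enable poisoning at runtime:
	page_poison=on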
diff --git a/mm/Makefile b/mm/Makefile index 2ed4319..cfdd481d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -48,7 +48,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o -obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o +obj-$(CONFIG_PAGE_POISONING) += page_poison.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KMEMCHECK) += kmemcheck.o diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c deleted file mode 100644 index 5bf5906..0000000 --- a/mm/debug-pagealloc.c +++ /dev/null @@ -1,137 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -static bool page_poisoning_enabled __read_mostly; - -static bool need_page_poisoning(void) -{ - if (!debug_pagealloc_enabled()) - return false; - - return true; -} - -static void init_page_poisoning(void) -{ - if (!debug_pagealloc_enabled()) - return; - - page_poisoning_enabled = true; -} - -struct page_ext_operations page_poisoning_ops = { - .need = need_page_poisoning, - .init = init_page_poisoning, -}; - -static inline void set_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static inline void clear_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static inline bool page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static void poison_page(struct page *page) -{ - void *addr = kmap_atomic(page); - - set_page_poison(page); - memset(addr, PAGE_POISON, PAGE_SIZE); - kunmap_atomic(addr); -} - -static void poison_pages(struct page *page, int n) -{ - int i; - - for (i = 0; i < n; i++) - poison_page(page + i); -} - -static bool single_bit_flip(unsigned char a, unsigned char b) -{ - unsigned char error = a ^ b; - - return error && !(error & (error - 1)); -} - -static void check_poison_mem(unsigned char *mem, size_t bytes) -{ - static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10); - unsigned char *start; - unsigned char *end; - - start = memchr_inv(mem, PAGE_POISON, bytes); - if (!start) - return; - - for (end = mem + bytes - 1; end > start; end--) { - if (*end != PAGE_POISON) - break; - } - - if (!__ratelimit(&ratelimit)) - return; - else if (start == end && single_bit_flip(*start, PAGE_POISON)) - printk(KERN_ERR "pagealloc: single bit error\n"); - else - printk(KERN_ERR "pagealloc: memory corruption\n"); - - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, - end - start + 1, 1); - dump_stack(); -} - -static void unpoison_page(struct page *page) -{ - void *addr; - - if (!page_poison(page)) - return; - - addr = kmap_atomic(page); - check_poison_mem(addr, PAGE_SIZE); - clear_page_poison(page); - kunmap_atomic(addr); -} - -static void unpoison_pages(struct page *page, int n) -{ - int i; - - for (i = 0; i < n; i++) - unpoison_page(page + i); -} - -void __kernel_map_pages(struct page *page, int numpages, int enable) -{ - if (!page_poisoning_enabled) - return; - - if (enable) - unpoison_pages(page, numpages); - else - poison_pages(page, numpages); -} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0691403..2a08349 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1025,6 +1025,7 @@ static bool free_pages_prepare(struct page *page, unsigned int 
order) PAGE_SIZE << order); } arch_free_page(page, order); + kernel_poison_pages(page, 1 << order, 0); kernel_map_pages(page, 1 << order, 0); return true; @@ -1420,6 +1421,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, arch_alloc_page(page, order); kernel_map_pages(page, 1 << order, 1); + kernel_poison_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); if (gfp_flags & __GFP_ZERO) diff --git a/mm/page_poison.c b/mm/page_poison.c new file mode 100644 index 0000000..89d3bc7 --- /dev/null +++ b/mm/page_poison.c @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include +#include +#include + +static bool __page_poisoning_enabled __read_mostly; +static bool want_page_poisoning __read_mostly; + +static int early_page_poison_param(char *buf) +{ + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + want_page_poisoning = true; + else if (strcmp(buf, "off") == 0) + want_page_poisoning = false; + + return 0; +} +early_param("page_poison", early_page_poison_param); + +bool page_poisoning_enabled(void) +{ + return __page_poisoning_enabled; +} + +static bool need_page_poisoning(void) +{ + return want_page_poisoning; +} + +static void init_page_poisoning(void) +{ + /* + * page poisoning is debug page alloc for some arches. If either + * of those options are enabled, enable poisoning + */ + if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) { + if (!want_page_poisoning && !debug_pagealloc_enabled()) + return; + } else { + if (!want_page_poisoning) + return; + } + + __page_poisoning_enabled = true; +} + +struct page_ext_operations page_poisoning_ops = { + .need = need_page_poisoning, + .init = init_page_poisoning, +}; + +static inline void set_page_poison(struct page *page) +{ + struct page_ext *page_ext; + + page_ext = lookup_page_ext(page); + __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); +} + +static inline void clear_page_poison(struct page *page) +{ + struct page_ext *page_ext; + + page_ext = lookup_page_ext(page); + __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); +} + +static inline bool page_poison(struct page *page) +{ + struct page_ext *page_ext; + + page_ext = lookup_page_ext(page); + return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); +} + +static void poison_page(struct page *page) +{ + void *addr = kmap_atomic(page); + + set_page_poison(page); + memset(addr, PAGE_POISON, PAGE_SIZE); + kunmap_atomic(addr); +} + +static void poison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + poison_page(page + i); +} + +static bool single_bit_flip(unsigned char a, unsigned char b) +{ + unsigned char error = a ^ b; + + return error && !(error & (error - 1)); +} + +static void check_poison_mem(unsigned char *mem, size_t bytes) +{ + static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10); + unsigned char *start; + unsigned char *end; + + if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY)) + return; + + start = memchr_inv(mem, PAGE_POISON, bytes); + if (!start) + return; + + for (end = mem + bytes - 1; end > start; end--) { + if (*end != PAGE_POISON) + break; + } + + if (!__ratelimit(&ratelimit)) + return; + else if (start == end && single_bit_flip(*start, PAGE_POISON)) + pr_err("pagealloc: single bit error\n"); + else + pr_err("pagealloc: memory corruption\n"); + + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, + end - start + 1, 1); + dump_stack(); +} + +static void unpoison_page(struct page *page) +{ + void *addr; + + if (!page_poison(page)) + return; + + addr = kmap_atomic(page); + 
check_poison_mem(addr, PAGE_SIZE); + clear_page_poison(page); + kunmap_atomic(addr); +} + +static void unpoison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + unpoison_page(page + i); +} + +void kernel_poison_pages(struct page *page, int numpages, int enable) +{ + if (!page_poisoning_enabled()) + return; + + if (enable) + unpoison_pages(page, numpages); + else + poison_pages(page, numpages); +} + +#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + /* This function does nothing, all work is done via poison pages */ +} +#endif -- cgit v1.1 From 1414c7f4f7d72d138fff35f00151d15749b5beda Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 15 Mar 2016 14:56:30 -0700 Subject: mm/page_poisoning.c: allow for zero poisoning By default, page poisoning uses a poison value (0xaa) on free. If this is changed to 0, the page is not only sanitized but zeroing on alloc with __GFP_ZERO can be skipped as well. The tradeoff is that corruption from the poisoning is harder to detect. This feature also cannot be used with hibernation since pages are not guaranteed to be zeroed after hibernation. Credit to the Grsecurity/PaX team for inspiring this work. Signed-off-by: Laura Abbott Acked-by: Rafael J. Wysocki Cc: "Kirill A. Shutemov" Cc: Vlastimil Babka Cc: Michal Hocko Cc: Kees Cook Cc: Mathias Krause Cc: Dave Hansen Cc: Jianyu Zhan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig.debug | 14 ++++++++++++++ mm/page_alloc.c | 11 ++++++++++- mm/page_ext.c | 10 ++++++++-- mm/page_poison.c | 7 +++++-- 4 files changed, 37 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 1f99f9a..5c50b23 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -65,3 +65,17 @@ config PAGE_POISONING_NO_SANITY If you are only interested in sanitization, say Y. Otherwise say N. + +config PAGE_POISONING_ZERO + bool "Use zero for poisoning instead of random data" + depends on PAGE_POISONING + ---help--- + Instead of using the existing poison value, fill the pages with + zeros. This makes it harder to detect when errors are occurring + due to sanitization but the zeroing at free means that it is + no longer necessary to write zeros when GFP_ZERO is used on + allocation.
+ + Enabling page poisoning with this option will disable hibernation + + If unsure, say N diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a08349..50897dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1405,15 +1405,24 @@ static inline int check_new_page(struct page *page) return 0; } +static inline bool free_pages_prezeroed(bool poisoned) +{ + return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && + page_poisoning_enabled() && poisoned; +} + static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, int alloc_flags) { int i; + bool poisoned = true; for (i = 0; i < (1 << order); i++) { struct page *p = page + i; if (unlikely(check_new_page(p))) return 1; + if (poisoned) + poisoned &= page_is_poisoned(p); } set_page_private(page, 0); @@ -1424,7 +1433,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, kernel_poison_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); - if (gfp_flags & __GFP_ZERO) + if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) clear_highpage(page + i); diff --git a/mm/page_ext.c b/mm/page_ext.c index 292ca7b..2d864e6 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -106,12 +106,15 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#ifdef CONFIG_DEBUG_VM +#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. + * + * This check is also necessary for ensuring page poisoning + * works as expected when enabled */ if (unlikely(!base)) return NULL; @@ -180,12 +183,15 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#ifdef CONFIG_DEBUG_VM +#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. + * + * This check is also necessary for ensuring page poisoning + * works as expected when enabled */ if (!section->page_ext) return NULL; diff --git a/mm/page_poison.c b/mm/page_poison.c index 89d3bc7..479e7ea 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -71,11 +71,14 @@ static inline void clear_page_poison(struct page *page) __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } -static inline bool page_poison(struct page *page) +bool page_is_poisoned(struct page *page) { struct page_ext *page_ext; page_ext = lookup_page_ext(page); + if (!page_ext) + return false; + return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } @@ -137,7 +140,7 @@ static void unpoison_page(struct page *page) { void *addr; - if (!page_poison(page)) + if (!page_is_poisoned(page)) return; addr = kmap_atomic(page); -- cgit v1.1 From 5b3810e5c6e1b9a1858464a53b5d72ee050bb918 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 15 Mar 2016 14:56:33 -0700 Subject: mm, sl[au]b: print gfp_flags as strings in slab_out_of_memory() We can now print gfp_flags in a more human-readable way. Make use of this in slab_out_of_memory() for SLUB and SLAB. Also convert the SLAB variant to pr_warn() along the way.
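In sketch form, the printing pattern these conversions rely on is the following (a hypothetical caller with a made-up gfp value; the %pGg printk extension takes a pointer to the mask, while %#x takes it by value):

	gfp_t gfpflags = GFP_KERNEL | __GFP_NOWARN;	/* example value only */

	pr_warn("allocation failed, gfp=%#x(%pGg)\n", gfpflags, &gfpflags);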
Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 10 ++++------ mm/slub.c | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 863c005..852fc5c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1350,10 +1350,9 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) return; - printk(KERN_WARNING - "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", - nodeid, gfpflags); - printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", + pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", + nodeid, gfpflags, &gfpflags); + pr_warn(" cache: %s, object size: %d, order: %d\n", cachep->name, cachep->size, cachep->gfporder); for_each_kmem_cache_node(cachep, node, n) { @@ -1377,8 +1376,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) num_slabs += active_slabs; num_objs = num_slabs * cachep->num; - printk(KERN_WARNING - " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", node, active_slabs, num_slabs, active_objs, num_objs, free_objects); } diff --git a/mm/slub.c b/mm/slub.c index 2d4d817..6c91324 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2236,8 +2236,8 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; - pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", - nid, gfpflags); + pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", + nid, gfpflags, &gfpflags); pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); -- cgit v1.1 From 23a003bfd23ea9ea0b7756b920e51f64b284b468 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 15 Mar 2016 14:56:36 -0700 Subject: mm/madvise: pass return code of memory_failure() to userspace Currently the return value of memory_failure() is not passed to userspace when madvise(MADV_HWPOISON) is used. This is inconvenient for test programs that want to know the result of error handling. So let's return it to the caller as we already do in the MADV_SOFT_OFFLINE case. Signed-off-by: Naoya Horiguchi Cc: Chen Gong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index f56825b..6a77114 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -555,8 +555,9 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) } pr_info("Injecting memory failure for page %#lx at %#lx\n", page_to_pfn(p), start); - /* Ignore return value for now */ - memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + if (ret) + return ret; } return 0; } -- cgit v1.1 From 0b94f17507dc3a92f091812e5bff890c8845d1a3 Mon Sep 17 00:00:00 2001 From: Wang Xiaoqiang Date: Tue, 15 Mar 2016 14:56:39 -0700 Subject: mm/memory-failure.c: remove useless "undef"s Remove the useless #undef, since the corresponding #define has already been removed. 
Signed-off-by: Wang Xiaoqiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ac595e7..67c30eb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -826,8 +826,6 @@ static struct page_state { #undef lru #undef swapbacked #undef head -#undef tail -#undef compound #undef slab #undef reserved -- cgit v1.1 From 4355c018c2ba8017592520573e76ad376ad656db Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Tue, 15 Mar 2016 14:56:42 -0700 Subject: mm/mempolicy.c: skip VM_HUGETLB and VM_MIXEDMAP VMA for lazy mbind VM_HUGETLB and VM_MIXEDMAP VMAs need to be excluded to avoid compound pages being marked for migration and unexpected COWs when handling hugetlb faults. Thanks to Naoya Horiguchi for reminding me of these checks. Signed-off-by: Liang Chen Signed-off-by: Gavin Guo Suggested-by: Naoya Horiguchi Acked-by: David Rientjes Cc: SeongJae Park Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9a3f6b9..8cbc743 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -643,7 +643,9 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ - if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) + if (!is_vm_hugetlb_page(vma) && + (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) && + !(vma->vm_flags & VM_MIXEDMAP)) change_prot_numa(vma, start, endvma); return 1; } -- cgit v1.1 From 9cb65bc3b1114004e2ccee5939031325c7bf16e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Penttil=C3=A4?= Date: Tue, 15 Mar 2016 14:56:45 -0700 Subject: mm/memory.c: make apply_to_page_range() more robust MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Arm and arm64 used to trigger this BUG_ON() - this has now been fixed. But a WARN_ON() here is sufficient to catch future buggy callers. Signed-off-by: Mika Penttilä Reviewed-by: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 8132787..8adb5b7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1876,7 +1876,9 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long end = addr + size; int err; - BUG_ON(addr >= end); + if (WARN_ON(addr >= end)) + return -EINVAL; + pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); -- cgit v1.1 From 31bc3858ea3ebcc3157b3f5f0e624c5962f5a7a6 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 15 Mar 2016 14:56:48 -0700 Subject: memory-hotplug: add automatic onlining policy for the newly added memory Currently, all newly added memory blocks remain in 'offline' state unless someone onlines them; some Linux distributions carry special udev rules like: SUBSYSTEM=="memory", ACTION=="add", ATTR{state}=="offline", ATTR{state}="online" to make this happen automatically. This is not a great solution for virtual machines where memory hotplug is being used to address high memory pressure situations, as such onlining is slow and a userspace process doing this (udev) has a chance of being killed by the OOM killer as it will probably need to allocate some memory.
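To illustrate the difference (a sketch; memoryXX stands for whichever block sysfs registers):

	# what the udev rule above effectively does, once per hotplugged block
	echo online > /sys/devices/system/memory/memoryXX/state

	# with the policy introduced below, this is set once and new blocks
	# are onlined by the kernel as soon as they are added
	echo online > /sys/devices/system/memory/auto_online_blocks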
Introduce default policy for the newly added memory blocks in /sys/devices/system/memory/auto_online_blocks file with two possible values: "offline" which preserves the current behavior and "online" which causes all newly added memory blocks to go online as soon as they're added. The default is "offline". Signed-off-by: Vitaly Kuznetsov Reviewed-by: Daniel Kiper Cc: Jonathan Corbet Cc: Greg Kroah-Hartman Cc: Daniel Kiper Cc: Dan Williams Cc: Tang Chen Cc: David Vrabel Acked-by: David Rientjes Cc: Naoya Horiguchi Cc: Xishi Qiu Cc: Mel Gorman Cc: "K. Y. Srinivasan" Cc: Igor Mammedov Cc: Kay Sievers Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 979b18c..484e867 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -77,6 +77,9 @@ static struct { #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) +bool memhp_auto_online; +EXPORT_SYMBOL_GPL(memhp_auto_online); + void get_online_mems(void) { might_sleep(); @@ -1261,8 +1264,13 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default, return zone_default; } +static int online_memory_block(struct memory_block *mem, void *arg) +{ + return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory_resource(int nid, struct resource *res) +int __ref add_memory_resource(int nid, struct resource *res, bool online) { u64 start, size; pg_data_t *pgdat = NULL; @@ -1322,6 +1330,11 @@ int __ref add_memory_resource(int nid, struct resource *res) /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); + /* online pages if requested */ + if (online) + walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), + NULL, online_memory_block); + goto out; error: @@ -1345,7 +1358,7 @@ int __ref add_memory(int nid, u64 start, u64 size) if (IS_ERR(res)) return PTR_ERR(res); - ret = add_memory_resource(nid, res); + ret = add_memory_resource(nid, res, memhp_auto_online); if (ret < 0) release_memory_resource(res); return ret; -- cgit v1.1 From cecf257b62e39eab0e802e91c62ae5679673db02 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 15 Mar 2016 14:56:55 -0700 Subject: mm: vmscan: do not clear SHRINKER_NUMA_AWARE if nr_node_ids == 1 Currently, on shrinker registration we clear SHRINKER_NUMA_AWARE if there's the only NUMA node present. The comment states that this will allow us to save some small loop time later. It used to be true when this code was added (see commit 1d3d4437eae1b ("vmscan: per-node deferred work")), but since commit 6b4f7799c6a57 ("mm: vmscan: invoke slab shrinkers from shrink_zone()") it doesn't make any difference. Anyway, running on non-NUMA machine shouldn't make a shrinker NUMA unaware, so zap this hunk. 
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 71b1c29..db5722a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -228,14 +228,6 @@ int register_shrinker(struct shrinker *shrinker) { size_t size = sizeof(*shrinker->nr_deferred); - /* - * If we only have one possible node in the system anyway, save - * ourselves the trouble and disable NUMA aware behavior. This way we - * will save memory and some small loop time later. - */ - if (nr_node_ids == 1) - shrinker->flags &= ~SHRINKER_NUMA_AWARE; - if (shrinker->flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; -- cgit v1.1 From d7206a70af5c094446927b5dea8704f0f96303e3 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 15 Mar 2016 14:56:58 -0700 Subject: mm/madvise: update comment on sys_madvise() Some new MADV_* advices are not documented in sys_madvise() comment. So let's update it. [akpm@linux-foundation.org: modifications suggested by Michal] Signed-off-by: Naoya Horiguchi Acked-by: Michal Hocko Cc: Minchan Kim Cc: "Kirill A. Shutemov" Cc: Jason Baron Cc: Chen Gong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index 6a77114..a011473 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -639,14 +639,28 @@ madvise_behavior_valid(int behavior) * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. + * MADV_FREE - the application marks pages in the given range as lazy free, + * where actual purges are postponed until memory pressure happens. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. * MADV_DONTFORK - omit this area from child's address space when forking: * typically, to avoid COWing pages pinned by get_user_pages(). * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. + * MADV_HWPOISON - trigger memory error handler as if the given memory range + * were corrupted by unrecoverable hardware memory failure. + * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. * MADV_MERGEABLE - the application recommends that KSM try to merge pages in * this area with pages of identical content from other such areas. * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. + * MADV_HUGEPAGE - the application wants to back the given range by transparent + * huge pages in the future. Existing pages might be coalesced and + * new pages might be allocated as THP. + * MADV_NOHUGEPAGE - mark the given range as not worth being backed by + * transparent huge pages so the existing pages will not be + * coalesced into THP and new pages will not be allocated as THP. + * MADV_DONTDUMP - the application wants to prevent pages in the given range + * from being included in its core dump. + * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. * * return values: * zero - success -- cgit v1.1 From 0db2cb8da89d991762ec2aece45e55ceaee34664 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 15 Mar 2016 14:57:01 -0700 Subject: mm, vmscan: make zone_reclaimable_pages more precise zone_reclaimable_pages() is used in should_reclaim_retry() which uses it to calculate the target for the watermark check. 
This means that precise numbers are important for the correct decision. zone_reclaimable_pages uses zone_page_state which can contain stale data with per-cpu diffs not synced yet (the last vmstat_update might have run 1s in the past). Use zone_page_state_snapshot() in zone_reclaimable_pages() instead. None of the current callers is in a hot path where getting the precise value (which involves per-cpu iteration) would cause an unreasonable overhead. Signed-off-by: Michal Hocko Signed-off-by: Tetsuo Handa Suggested-by: David Rientjes Acked-by: David Rientjes Acked-by: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index db5722a..039f08d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -195,21 +195,21 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; - nr = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE) + - zone_page_state(zone, NR_ISOLATED_FILE); + nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_INACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_ISOLATED_FILE); if (get_nr_swap_pages() > 0) - nr += zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON) + - zone_page_state(zone, NR_ISOLATED_ANON); + nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) + + zone_page_state_snapshot(zone, NR_INACTIVE_ANON) + + zone_page_state_snapshot(zone, NR_ISOLATED_ANON); return nr; } bool zone_reclaimable(struct zone *zone) { - return zone_page_state(zone, NR_PAGES_SCANNED) < + return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) < zone_reclaimable_pages(zone) * 6; } -- cgit v1.1 From 81f8c3a461d16f0355ced3d56d6d1bb5923207a1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:04 -0700 Subject: mm: memcontrol: generalize locking for the page->mem_cgroup binding These patches tag the page cache radix tree eviction entries with the memcg an evicted page belonged to, thus making per-cgroup LRU reclaim work properly and be as adaptive to new cache workingsets as global reclaim already is. This should have been part of the original thrash detection patch series, but was deferred due to the complexity of those patches. This patch (of 5): So far the only sites that needed to exclude charge migration to stabilize page->mem_cgroup have been per-cgroup page statistics, hence the name mem_cgroup_begin_page_stat(). But per-cgroup thrash detection will add another site that needs to ensure page->mem_cgroup lifetime. Rename these locking functions to the more generic lock_page_memcg() and unlock_page_memcg(). Since charge migration is a cgroup1 feature only, we might be able to delete it at some point, and these now easy to identify locking sites along with it. 
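Under the new names, a page-state update site has this shape (a sketch mirroring the call sites converted below, e.g. clear_page_dirty_for_io()):

	struct mem_cgroup *memcg;

	memcg = lock_page_memcg(page);		/* stabilize page->mem_cgroup */
	if (TestClearPageDirty(page))
		mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
	unlock_page_memcg(memcg);		/* drop the binding lock */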
Signed-off-by: Johannes Weiner Suggested-by: Vladimir Davydov Acked-by: Vladimir Davydov Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 12 ++++++------ mm/memcontrol.c | 34 ++++++++++++++-------------------- mm/page-writeback.c | 28 ++++++++++++++-------------- mm/rmap.c | 8 ++++---- mm/truncate.c | 6 +++--- mm/vmscan.c | 8 ++++---- 6 files changed, 45 insertions(+), 51 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 4a0f5fa..ee8140c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -101,7 +101,7 @@ * ->tree_lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat) + * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) @@ -177,7 +177,7 @@ static void page_cache_tree_delete(struct address_space *mapping, * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the mapping's tree_lock and - * mem_cgroup_begin_page_stat(). + * lock_page_memcg(). */ void __delete_from_page_cache(struct page *page, void *shadow, struct mem_cgroup *memcg) @@ -263,11 +263,11 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(page, NULL, memcg); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); if (freepage) freepage(page); @@ -561,7 +561,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - memcg = mem_cgroup_begin_page_stat(old); + memcg = lock_page_memcg(old); spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(old, NULL, memcg); error = radix_tree_insert(&mapping->page_tree, offset, new); @@ -576,7 +576,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); mem_cgroup_replace_page(old, new); radix_tree_preload_end(); if (freepage) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d06cae2..953f0f9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1709,19 +1709,13 @@ cleanup: } /** - * mem_cgroup_begin_page_stat - begin a page state statistics transaction - * @page: page that is going to change accounted state - * - * This function must mark the beginning of an accounted page state - * change to prevent double accounting when the page is concurrently - * being moved to another memcg: + * lock_page_memcg - lock a page->mem_cgroup binding + * @page: the page * - * memcg = mem_cgroup_begin_page_stat(page); - * if (TestClearPageState(page)) - * mem_cgroup_update_page_stat(memcg, state, -1); - * mem_cgroup_end_page_stat(memcg); + * This function protects unlocked LRU pages from being moved to + * another cgroup and stabilizes their page->mem_cgroup binding. 
*/ -struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) +struct mem_cgroup *lock_page_memcg(struct page *page) { struct mem_cgroup *memcg; unsigned long flags; @@ -1759,20 +1753,20 @@ again: /* * When charge migration first begins, we can have locked and * unlocked page stat updates happening concurrently. Track - * the task who has the lock for mem_cgroup_end_page_stat(). + * the task who has the lock for unlock_page_memcg(). */ memcg->move_lock_task = current; memcg->move_lock_flags = flags; return memcg; } -EXPORT_SYMBOL(mem_cgroup_begin_page_stat); +EXPORT_SYMBOL(lock_page_memcg); /** - * mem_cgroup_end_page_stat - finish a page state statistics transaction - * @memcg: the memcg that was accounted against + * unlock_page_memcg - unlock a page->mem_cgroup binding + * @memcg: the memcg returned by lock_page_memcg() */ -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) +void unlock_page_memcg(struct mem_cgroup *memcg) { if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -1785,7 +1779,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) rcu_read_unlock(); } -EXPORT_SYMBOL(mem_cgroup_end_page_stat); +EXPORT_SYMBOL(unlock_page_memcg); /* * size of first charge trial. "32" comes from vmscan.c's magic value. @@ -4923,9 +4917,9 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) lru_add_drain_all(); /* - * Signal mem_cgroup_begin_page_stat() to take the memcg's - * move_lock while we're moving its pages to another memcg. - * Then wait for already started RCU-only updates to finish. + * Signal lock_page_memcg() to take the memcg's move_lock + * while we're moving its pages to another memcg. Then wait + * for already started RCU-only updates to finish. */ atomic_inc(&mc.from->moving_account); synchronize_rcu(); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d782cba..2b5ea12 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2410,7 +2410,7 @@ int __set_page_dirty_no_writeback(struct page *page) /* * Helper function for set_page_dirty family. * - * Caller must hold mem_cgroup_begin_page_stat(). + * Caller must hold lock_page_memcg(). * * NOTE: This relies on being atomic wrt interrupts. */ @@ -2442,7 +2442,7 @@ EXPORT_SYMBOL(account_page_dirtied); /* * Helper function for deaccounting dirty page without writeback. * - * Caller must hold mem_cgroup_begin_page_stat(). + * Caller must hold lock_page_memcg(). 
*/ void account_page_cleaned(struct page *page, struct address_space *mapping, struct mem_cgroup *memcg, struct bdi_writeback *wb) @@ -2471,13 +2471,13 @@ int __set_page_dirty_nobuffers(struct page *page) { struct mem_cgroup *memcg; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); unsigned long flags; if (!mapping) { - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); return 1; } @@ -2488,7 +2488,7 @@ int __set_page_dirty_nobuffers(struct page *page) radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); if (mapping->host) { /* !PageAnon && !swapper_space */ @@ -2496,7 +2496,7 @@ int __set_page_dirty_nobuffers(struct page *page) } return 1; } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -2629,14 +2629,14 @@ void cancel_dirty_page(struct page *page) struct mem_cgroup *memcg; bool locked; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) account_page_cleaned(page, mapping, memcg, wb); unlocked_inode_to_wb_end(inode, locked); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); } else { ClearPageDirty(page); } @@ -2705,7 +2705,7 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); @@ -2714,7 +2714,7 @@ int clear_page_dirty_for_io(struct page *page) ret = 1; } unlocked_inode_to_wb_end(inode, locked); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); return ret; } return TestClearPageDirty(page); @@ -2727,7 +2727,7 @@ int test_clear_page_writeback(struct page *page) struct mem_cgroup *memcg; int ret; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); if (mapping) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2755,7 +2755,7 @@ int test_clear_page_writeback(struct page *page) dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); return ret; } @@ -2765,7 +2765,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) struct mem_cgroup *memcg; int ret; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); if (mapping) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2796,7 +2796,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 79f3bf0..2871e7d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1289,19 +1289,19 @@ void page_add_file_rmap(struct page *page) { struct mem_cgroup *memcg; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_inc_page_stat(memcg, 
MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); } static void page_remove_file_rmap(struct page *page) { struct mem_cgroup *memcg; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ if (unlikely(PageHuge(page))) { @@ -1325,7 +1325,7 @@ static void page_remove_file_rmap(struct page *page) if (unlikely(PageMlocked(page))) clear_page_mlock(page); out: - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); } static void page_remove_anon_compound_rmap(struct page *page) diff --git a/mm/truncate.c b/mm/truncate.c index e3ee0e2..51a24f6 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -528,7 +528,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); if (PageDirty(page)) goto failed; @@ -536,7 +536,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL, memcg); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -545,7 +545,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) return 1; failed: spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 039f08d..08547a7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -608,7 +608,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - memcg = mem_cgroup_begin_page_stat(page); + memcg = lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); /* * The non racy check for a busy page. @@ -648,7 +648,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); swapcache_free(swap); } else { void (*freepage)(struct page *); @@ -676,7 +676,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow, memcg); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); if (freepage != NULL) freepage(page); @@ -686,7 +686,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, cannot_free: spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(memcg); return 0; } -- cgit v1.1 From 689c94f03ae251181725fe853b4ffc870f05c7fe Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:07 -0700 Subject: mm: workingset: #define radix entry eviction mask This is a compile-time constant, no need to calculate it on refault. 
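As an aside on how this mask is used: the eviction timestamp stored in a shadow entry only keeps the low bits of inactive_age, and the refault distance is recovered by masking the unsigned subtraction with EVICTION_MASK. The truncation is harmless as long as the real distance fits in the mask, which a small userspace sketch can confirm (the 10-bit timestamp width here is made up for the demo and is not the kernel's layout):

#include <assert.h>

#define DEMO_EVICTION_SHIFT (8 * sizeof(unsigned long) - 10)  /* keep 10 timestamp bits */
#define DEMO_EVICTION_MASK  (~0UL >> DEMO_EVICTION_SHIFT)

int main(void)
{
        /* inactive_age at eviction time; only its low bits fit in the shadow entry */
        unsigned long counter_at_evict   = 0xFF0;
        unsigned long stored_eviction    = counter_at_evict & DEMO_EVICTION_MASK;
        /* 30 more pages were reclaimed before this one refaulted */
        unsigned long counter_at_refault = counter_at_evict + 30;
        unsigned long distance;

        distance = (counter_at_refault - stored_eviction) & DEMO_EVICTION_MASK;
        assert(distance == 30);  /* correct despite the truncated timestamp */
        return 0;
}

Distances larger than the mask alias to smaller values, which is exactly the rare false-activation case the comment next to this calculation describes.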
Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/workingset.c b/mm/workingset.c index 61ead9e..3ef92f6 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -152,6 +152,10 @@ * refault distance will immediately activate the refaulting page. */ +#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ + ZONES_SHIFT + NODES_SHIFT) +#define EVICTION_MASK (~0UL >> EVICTION_SHIFT) + static void *pack_shadow(unsigned long eviction, struct zone *zone) { eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); @@ -168,7 +172,6 @@ static void unpack_shadow(void *shadow, unsigned long entry = (unsigned long)shadow; unsigned long eviction; unsigned long refault; - unsigned long mask; int zid, nid; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; @@ -181,8 +184,7 @@ static void unpack_shadow(void *shadow, *zone = NODE_DATA(nid)->node_zones + zid; refault = atomic_long_read(&(*zone)->inactive_age); - mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT + - RADIX_TREE_EXCEPTIONAL_SHIFT); + /* * The unsigned subtraction here gives an accurate distance * across inactive_age overflows in most cases. @@ -199,7 +201,7 @@ static void unpack_shadow(void *shadow, * inappropriate activation leading to pressure on the active * list is not a problem. */ - *distance = (refault - eviction) & mask; + *distance = (refault - eviction) & EVICTION_MASK; } /** -- cgit v1.1 From 162453bfbdf4c0f58cb3058aad9ad8cda1044cda Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:10 -0700 Subject: mm: workingset: separate shadow unpacking and refault calculation Per-cgroup thrash detection will need to derive a live memcg from the eviction cookie, and doing that inside unpack_shadow() will get nasty with the reference handling spread over two functions. In preparation, make unpack_shadow() clearly about extracting static data, and let workingset_refault() do all the higher-level handling. Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 56 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/workingset.c b/mm/workingset.c index 3ef92f6..f874b2c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -165,13 +165,10 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone) return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); } -static void unpack_shadow(void *shadow, - struct zone **zone, - unsigned long *distance) +static void unpack_shadow(void *shadow, struct zone **zonep, + unsigned long *evictionp) { unsigned long entry = (unsigned long)shadow; - unsigned long eviction; - unsigned long refault; int zid, nid; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; @@ -179,29 +176,9 @@ static void unpack_shadow(void *shadow, entry >>= ZONES_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; - eviction = entry; - - *zone = NODE_DATA(nid)->node_zones + zid; - refault = atomic_long_read(&(*zone)->inactive_age); - - /* - * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. 
- * - * There is a special case: usually, shadow entries have a - * short lifetime and are either refaulted or reclaimed along - * with the inode before they get too old. But it is not - * impossible for the inactive_age to lap a shadow entry in - * the field, which can then can result in a false small - * refault distance, leading to a false activation should this - * old entry actually refault again. However, earlier kernels - * used to deactivate unconditionally with *every* reclaim - * invocation for the longest time, so the occasional - * inappropriate activation leading to pressure on the active - * list is not a problem. - */ - *distance = (refault - eviction) & EVICTION_MASK; + *zonep = NODE_DATA(nid)->node_zones + zid; + *evictionp = entry; } /** @@ -233,9 +210,32 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) bool workingset_refault(void *shadow) { unsigned long refault_distance; + unsigned long eviction; + unsigned long refault; struct zone *zone; - unpack_shadow(shadow, &zone, &refault_distance); + unpack_shadow(shadow, &zone, &eviction); + + refault = atomic_long_read(&zone->inactive_age); + + /* + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. + * + * There is a special case: usually, shadow entries have a + * short lifetime and are either refaulted or reclaimed along + * with the inode before they get too old. But it is not + * impossible for the inactive_age to lap a shadow entry in + * the field, which can then can result in a false small + * refault distance, leading to a false activation should this + * old entry actually refault again. However, earlier kernels + * used to deactivate unconditionally with *every* reclaim + * invocation for the longest time, so the occasional + * inappropriate activation leading to pressure on the active + * list is not a problem. + */ + refault_distance = (refault - eviction) & EVICTION_MASK; + inc_zone_state(zone, WORKINGSET_REFAULT); if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { -- cgit v1.1 From 612e44939c3c77245ac80843c0c7876c8cf97282 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:13 -0700 Subject: mm: workingset: eviction buckets for bigmem/lowbit machines For per-cgroup thrash detection, we need to store the memcg ID inside the radix tree cookie as well. However, on 32 bit that doesn't leave enough bits for the eviction timestamp to cover the necessary range of recently evicted pages. The radix tree entry would look like this: [ RADIX_TREE_EXCEPTIONAL(2) | ZONEID(2) | MEMCGID(16) | EVICTION(12) ] 12 bits means 4096 pages, means 16M worth of recently evicted pages. But refaults are actionable up to distances covering half of memory. To not miss refaults, we have to stretch out the range at the cost of how precisely we can tell when a page was evicted. This way we can shave off lower bits from the eviction timestamp until the necessary range is covered. E.g. grouping evictions into 1M buckets (256 pages) will stretch the longest representable refault distance to 4G. This patch implements eviction buckets that are automatically sized according to the available bits and the necessary refault range, in preparation for per-cgroup thrash detection. The maximum actionable distance is currently half of memory, but to support memory hotplug of up to 200% of boot-time memory, we size the buckets to cover double the distance. Beyond that, thrashing won't be detectable anymore. 
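The automatic sizing is simple enough to check outside the kernel. Below is a userspace sketch that redoes the arithmetic for the 32-bit layout quoted above (2 exceptional + 2 zone + 16 memcg ID bits, node bits left out as in that layout) on a hypothetical 1G machine with 4K pages; fls_long() is open-coded as a loop, and all numbers are illustrative rather than taken from a real config:

#include <stdio.h>

int main(void)
{
        unsigned int bits_per_long   = 32;
        unsigned int eviction_shift  = 2 + 2 + 16;      /* exceptional + zone + memcg ID bits */
        unsigned int timestamp_bits  = bits_per_long - eviction_shift;  /* 12 */
        unsigned long totalram_pages = 262144;          /* 1G of 4K pages */
        unsigned int max_order = 0, bucket_order = 0;

        while ((1UL << max_order) < totalram_pages)     /* stand-in for fls_long(totalram_pages - 1) */
                max_order++;

        if (max_order > timestamp_bits)
                bucket_order = max_order - timestamp_bits;

        printf("workingset: timestamp_bits=%u max_order=%u bucket_order=%u\n",
               timestamp_bits, max_order, bucket_order);
        return 0;
}

With these inputs it prints timestamp_bits=12 max_order=18 bucket_order=6, i.e. evictions are grouped into 2^6-page (256K) buckets, matching the boot message quoted next.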
During boot, the kernel will print out the exact parameters, like so: [ 0.113929] workingset: timestamp_bits=12 max_order=18 bucket_order=6 In this example, there are 12 radix entry bits available for the eviction timestamp, to cover a maximum distance of 2^18 pages (this is a 1G machine). Consequently, evictions must be grouped into buckets of 2^6 pages, or 256K. Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/workingset.c b/mm/workingset.c index f874b2c..9a26a60 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -156,8 +156,19 @@ ZONES_SHIFT + NODES_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) +/* + * Eviction timestamps need to be able to cover the full range of + * actionable refaults. However, bits are tight in the radix tree + * entry, and after storing the identifier for the lruvec there might + * not be enough left to represent every single actionable refault. In + * that case, we have to sacrifice granularity for distance, and group + * evictions into coarser buckets by shaving off lower timestamp bits. + */ +static unsigned int bucket_order __read_mostly; + static void *pack_shadow(unsigned long eviction, struct zone *zone) { + eviction >>= bucket_order; eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); @@ -178,7 +189,7 @@ static void unpack_shadow(void *shadow, struct zone **zonep, entry >>= NODES_SHIFT; *zonep = NODE_DATA(nid)->node_zones + zid; - *evictionp = entry; + *evictionp = entry << bucket_order; } /** @@ -400,8 +411,25 @@ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { + unsigned int timestamp_bits; + unsigned int max_order; int ret; + BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); + /* + * Calculate the eviction bucket size to cover the longest + * actionable refault distance, which is currently half of + * memory (totalram_pages/2). However, memory hotplug may add + * some more pages at runtime, so keep working with up to + * double the initial memory by using totalram_pages as-is. + */ + timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; + max_order = fls_long(totalram_pages - 1); + if (max_order > timestamp_bits) + bucket_order = max_order - timestamp_bits; + printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", + timestamp_bits, max_order, bucket_order); + ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); if (ret) goto err; -- cgit v1.1 From 23047a96d7cfcfca1a6d026ecaec526ea4803e9e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:16 -0700 Subject: mm: workingset: per-cgroup cache thrash detection Cache thrash detection (see a528910e12ec "mm: thrash detection-based file cache sizing" for details) currently only works on the system level, not inside cgroups. Worse, as the refaults are compared to the global number of active cache, cgroups might wrongfully get all their refaults activated when their pages are hotter than those of others. Move the refault machinery from the zone to the lruvec, and then tag eviction entries with the memcg ID. This makes the thrash detection work correctly inside cgroups. 
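For reference, here is a compact userspace model of the cookie layout after this change: eviction timestamp on top, then the memcg ID, node and zone, with the radix-tree exceptional bit at the bottom. The field widths are placeholders rather than the kernel's config-dependent values, bucket_order is left out so the round trip stays exact, and a 64-bit unsigned long is assumed:

#include <assert.h>

#define EXCEPTIONAL_BITS        2
#define ZONE_BITS               2
#define NODE_BITS               6
#define MEMCG_ID_BITS           16

static unsigned long pack_shadow(int memcgid, int nid, int zid,
                                 unsigned long eviction)
{
        eviction = (eviction << MEMCG_ID_BITS) | memcgid;
        eviction = (eviction << NODE_BITS) | nid;
        eviction = (eviction << ZONE_BITS) | zid;
        return (eviction << EXCEPTIONAL_BITS) | 1;      /* exceptional entry marker */
}

static void unpack_shadow(unsigned long entry, int *memcgid, int *nid,
                          int *zid, unsigned long *eviction)
{
        entry >>= EXCEPTIONAL_BITS;
        *zid = entry & ((1UL << ZONE_BITS) - 1);
        entry >>= ZONE_BITS;
        *nid = entry & ((1UL << NODE_BITS) - 1);
        entry >>= NODE_BITS;
        *memcgid = entry & ((1UL << MEMCG_ID_BITS) - 1);
        entry >>= MEMCG_ID_BITS;
        *eviction = entry;
}

int main(void)
{
        unsigned long eviction;
        int memcgid, nid, zid;

        unpack_shadow(pack_shadow(42, 1, 2, 12345), &memcgid, &nid, &zid, &eviction);
        assert(memcgid == 42 && nid == 1 && zid == 2 && eviction == 12345);
        return 0;
}

The memcg ID sits below the timestamp so that, on refault, the cgroup can be looked back up with mem_cgroup_from_id() and the distance compared against that lruvec's active file size instead of the zone-wide count.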
[sergey.senozhatsky@gmail.com: do not return from workingset_activation() with locked rcu and page] Signed-off-by: Johannes Weiner Signed-off-by: Sergey Senozhatsky Reviewed-by: Vladimir Davydov Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 25 ------------------ mm/vmscan.c | 18 ++++++------- mm/workingset.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 78 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 953f0f9..864e237 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } -/* - * We restrict the id in the range of [1, 65535], so it can fit into - * an unsigned short. - */ -#define MEM_CGROUP_ID_MAX USHRT_MAX - -static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) -{ - return memcg->css.id; -} - -/* - * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling - * css_tryget_online() if the mem_cgroup is used for charging. (dropping - * refcnt from swap can be called against removed memcg.) - */ -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - struct cgroup_subsys_state *css; - - css = css_from_id(id, &memory_cgrp_subsys); - return mem_cgroup_from_css(css); -} - #ifndef CONFIG_SLOB /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. diff --git a/mm/vmscan.c b/mm/vmscan.c index 08547a7..fd434cc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -213,7 +213,7 @@ bool zone_reclaimable(struct zone *zone) zone_reclaimable_pages(zone) * 6; } -static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) { if (!mem_cgroup_disabled()) return mem_cgroup_get_lru_size(lruvec, lru); @@ -1923,8 +1923,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec) unsigned long inactive; unsigned long active; - inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); - active = get_lru_size(lruvec, LRU_ACTIVE_FILE); + inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); + active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); return active > inactive; } @@ -2063,7 +2063,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * system is under heavy pressure. 
*/ if (!inactive_file_is_low(lruvec) && - get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { scan_balance = SCAN_FILE; goto out; } @@ -2089,10 +2089,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anon in [0], file in [1] */ - anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + - get_lru_size(lruvec, LRU_INACTIVE_ANON); - file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + - get_lru_size(lruvec, LRU_INACTIVE_FILE); + anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON); + file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { @@ -2130,7 +2130,7 @@ out: unsigned long size; unsigned long scan; - size = get_lru_size(lruvec, lru); + size = lruvec_lru_size(lruvec, lru); scan = size >> sc->priority; if (!scan && pass && force_scan) diff --git a/mm/workingset.c b/mm/workingset.c index 9a26a60..14bc23a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -153,7 +153,8 @@ */ #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ - ZONES_SHIFT + NODES_SHIFT) + ZONES_SHIFT + NODES_SHIFT + \ + MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* @@ -166,9 +167,10 @@ */ static unsigned int bucket_order __read_mostly; -static void *pack_shadow(unsigned long eviction, struct zone *zone) +static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction) { eviction >>= bucket_order; + eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); @@ -176,18 +178,21 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone) return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); } -static void unpack_shadow(void *shadow, struct zone **zonep, +static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep, unsigned long *evictionp) { unsigned long entry = (unsigned long)shadow; - int zid, nid; + int memcgid, nid, zid; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; zid = entry & ((1UL << ZONES_SHIFT) - 1); entry >>= ZONES_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; + memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); + entry >>= MEM_CGROUP_ID_SHIFT; + *memcgidp = memcgid; *zonep = NODE_DATA(nid)->node_zones + zid; *evictionp = entry << bucket_order; } @@ -202,11 +207,20 @@ static void unpack_shadow(void *shadow, struct zone **zonep, */ void *workingset_eviction(struct address_space *mapping, struct page *page) { + struct mem_cgroup *memcg = page_memcg(page); struct zone *zone = page_zone(page); + int memcgid = mem_cgroup_id(memcg); unsigned long eviction; + struct lruvec *lruvec; - eviction = atomic_long_inc_return(&zone->inactive_age); - return pack_shadow(eviction, zone); + /* Page is fully exclusive and pins page->mem_cgroup */ + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + eviction = atomic_long_inc_return(&lruvec->inactive_age); + return pack_shadow(memcgid, zone, eviction); } /** @@ -221,13 +235,42 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) bool workingset_refault(void *shadow) { unsigned long refault_distance; + unsigned long 
active_file; + struct mem_cgroup *memcg; unsigned long eviction; + struct lruvec *lruvec; unsigned long refault; struct zone *zone; + int memcgid; - unpack_shadow(shadow, &zone, &eviction); + unpack_shadow(shadow, &memcgid, &zone, &eviction); - refault = atomic_long_read(&zone->inactive_age); + rcu_read_lock(); + /* + * Look up the memcg associated with the stored ID. It might + * have been deleted since the page's eviction. + * + * Note that in rare events the ID could have been recycled + * for a new cgroup that refaults a shared page. This is + * impossible to tell from the available data. However, this + * should be a rare and limited disturbance, and activations + * are always speculative anyway. Ultimately, it's the aging + * algorithm's job to shake out the minimum access frequency + * for the active cache. + * + * XXX: On !CONFIG_MEMCG, this will always return NULL; it + * would be better if the root_mem_cgroup existed in all + * configurations instead. + */ + memcg = mem_cgroup_from_id(memcgid); + if (!mem_cgroup_disabled() && !memcg) { + rcu_read_unlock(); + return false; + } + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + refault = atomic_long_read(&lruvec->inactive_age); + active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); + rcu_read_unlock(); /* * The unsigned subtraction here gives an accurate distance @@ -249,7 +292,7 @@ bool workingset_refault(void *shadow) inc_zone_state(zone, WORKINGSET_REFAULT); - if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { + if (refault_distance <= active_file) { inc_zone_state(zone, WORKINGSET_ACTIVATE); return true; } @@ -262,7 +305,23 @@ bool workingset_refault(void *shadow) */ void workingset_activation(struct page *page) { - atomic_long_inc(&page_zone(page)->inactive_age); + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + memcg = lock_page_memcg(page); + /* + * Filter non-memcg pages here, e.g. unmap can call + * mark_page_accessed() on VDSO pages. + * + * XXX: See workingset_refault() - this should return + * root_mem_cgroup even for !CONFIG_MEMCG. + */ + if (!mem_cgroup_disabled() && !memcg) + goto out; + lruvec = mem_cgroup_zone_lruvec(page_zone(page), memcg); + atomic_long_inc(&lruvec->inactive_age); +out: + unlock_page_memcg(memcg); } /* -- cgit v1.1 From 6a93ca8fde3cfce0f00f02281139a377c83e8d8c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:19 -0700 Subject: mm: migrate: do not touch page->mem_cgroup of live pages Changing a page's memcg association complicates dealing with the page, so we want to limit this as much as possible. Page migration e.g. does not have to do that. Just like page cache replacement, it can forcibly charge a replacement page, and then uncharge the old page when it gets freed. Temporarily overcharging the cgroup by a single page is not an issue in practice, and charging is so cheap nowadays that this is much preferrable to the headache of messing with live pages. The only place that still changes the page->mem_cgroup binding of live pages is when pages move along with a task to another cgroup. But that path isolates the page from the LRU, takes the page lock, and the move lock (lock_page_memcg()). That means page->mem_cgroup is always stable in callers that have the page isolated from the LRU or locked. Lighter unlocked paths, like writeback accounting, can use lock_page_memcg(). 
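To make that last point concrete, a writeback-accounting path keeps a pattern along the lines of the sketch below. It is a condensed rendering of __set_page_dirty_nobuffers() using the page-based lock_page_memcg() interface that a later patch in this series switches to, not a complete excerpt; tree_lock handling and radix-tree tagging are omitted:

/* Sketch only: locking of mapping->tree_lock and dirty tagging omitted. */
static int sketch_set_page_dirty(struct page *page)
{
        int ret = 0;

        lock_page_memcg(page);          /* stabilize page->mem_cgroup for the stat update */
        if (!TestSetPageDirty(page)) {
                struct address_space *mapping = page_mapping(page);

                if (mapping)
                        account_page_dirtied(page, mapping);    /* per-memcg and per-zone counters */
                ret = 1;
        }
        unlock_page_memcg(page);
        return ret;
}

Paths that already hold the page lock, or have the page isolated from the LRU, do not need this bracketing at all, which a later cleanup in this series exploits.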
[akpm@linux-foundation.org: fix build] [vdavydov@virtuozzo.com: fix lockdep splat] Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Greg Thelen Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- mm/memcontrol.c | 13 +++++++------ mm/migrate.c | 15 +++++++++------ mm/shmem.c | 2 +- 4 files changed, 18 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index ee8140c..d8317ca 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -577,7 +577,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); unlock_page_memcg(memcg); - mem_cgroup_replace_page(old, new); + mem_cgroup_migrate(old, new); radix_tree_preload_end(); if (freepage) freepage(old); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 864e237..64506b2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4457,7 +4457,7 @@ static int mem_cgroup_move_account(struct page *page, VM_BUG_ON(compound && !PageTransHuge(page)); /* - * Prevent mem_cgroup_replace_page() from looking at + * Prevent mem_cgroup_migrate() from looking at * page->mem_cgroup of its source page while we change it. */ ret = -EBUSY; @@ -5486,16 +5486,17 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) } /** - * mem_cgroup_replace_page - migrate a charge to another page - * @oldpage: currently charged page - * @newpage: page to transfer the charge to + * mem_cgroup_migrate - charge a page's replacement + * @oldpage: currently circulating page + * @newpage: replacement page * - * Migrate the charge from @oldpage to @newpage. + * Charge @newpage as a replacement page for @oldpage. @oldpage will + * be uncharged upon free. * * Both pages must be locked, @newpage->mapping must be set up. * Either or both pages might be on the LRU already. */ -void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) +void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; unsigned int nr_pages; diff --git a/mm/migrate.c b/mm/migrate.c index 432ecd0..848327d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -326,12 +326,13 @@ int migrate_page_move_mapping(struct address_space *mapping, return -EAGAIN; /* No turning back from here */ - set_page_memcg(newpage, page_memcg(page)); newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) SetPageSwapBacked(newpage); + mem_cgroup_migrate(page, newpage); + return MIGRATEPAGE_SUCCESS; } @@ -373,7 +374,6 @@ int migrate_page_move_mapping(struct address_space *mapping, * Now we know that no one else is looking at the page: * no turning back from here. 
*/ - set_page_memcg(newpage, page_memcg(page)); newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) @@ -428,6 +428,8 @@ int migrate_page_move_mapping(struct address_space *mapping, } local_irq_enable(); + mem_cgroup_migrate(page, newpage); + return MIGRATEPAGE_SUCCESS; } @@ -458,9 +460,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return -EAGAIN; } - set_page_memcg(newpage, page_memcg(page)); newpage->index = page->index; newpage->mapping = page->mapping; + get_page(newpage); radix_tree_replace_slot(pslot, newpage); @@ -468,6 +470,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, page_unfreeze_refs(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); + + mem_cgroup_migrate(page, newpage); + return MIGRATEPAGE_SUCCESS; } @@ -775,7 +780,6 @@ static int move_to_new_page(struct page *newpage, struct page *page, * page is freed; but stats require that PageAnon be left as PageAnon. */ if (rc == MIGRATEPAGE_SUCCESS) { - set_page_memcg(page, NULL); if (!PageAnon(page)) page->mapping = NULL; } @@ -1842,8 +1846,7 @@ fail_putback: } mlock_migrate_page(new_page, page); - set_page_memcg(new_page, page_memcg(page)); - set_page_memcg(page, NULL); + mem_cgroup_migrate(page, new_page); page_remove_rmap(page, true); set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); diff --git a/mm/shmem.c b/mm/shmem.c index 440e2a7..1acfdbc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1116,7 +1116,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_replace_page(oldpage, newpage); + mem_cgroup_migrate(oldpage, newpage); lru_cache_add_anon(newpage); *pagep = newpage; } -- cgit v1.1 From 62cccb8c8e7a3ca233f49d5e7dcb1557d25465cd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:22 -0700 Subject: mm: simplify lock_page_memcg() Now that migration doesn't clear page->mem_cgroup of live pages anymore, it's safe to make lock_page_memcg() and the memcg stat functions take pages, and spare the callers from memcg objects. [akpm@linux-foundation.org: fix warnings] Signed-off-by: Johannes Weiner Suggested-by: Vladimir Davydov Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 20 ++++++++------------ mm/memcontrol.c | 23 +++++++++-------------- mm/page-writeback.c | 49 +++++++++++++++++++++---------------------------- mm/rmap.c | 16 ++++++---------- mm/truncate.c | 9 ++++----- mm/vmscan.c | 11 +++++------ mm/workingset.c | 9 ++++----- 7 files changed, 57 insertions(+), 80 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index d8317ca..8e629c4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -179,8 +179,7 @@ static void page_cache_tree_delete(struct address_space *mapping, * is safe. The caller must hold the mapping's tree_lock and * lock_page_memcg(). */ -void __delete_from_page_cache(struct page *page, void *shadow, - struct mem_cgroup *memcg) +void __delete_from_page_cache(struct page *page, void *shadow) { struct address_space *mapping = page->mapping; @@ -239,8 +238,7 @@ void __delete_from_page_cache(struct page *page, void *shadow, * anyway will be cleared before returning page into buddy allocator. 
*/ if (WARN_ON_ONCE(PageDirty(page))) - account_page_cleaned(page, mapping, memcg, - inode_to_wb(mapping->host)); + account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); } /** @@ -254,7 +252,6 @@ void __delete_from_page_cache(struct page *page, void *shadow, void delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; - struct mem_cgroup *memcg; unsigned long flags; void (*freepage)(struct page *); @@ -263,11 +260,11 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; - memcg = lock_page_memcg(page); + lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); - __delete_from_page_cache(page, NULL, memcg); + __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(page); if (freepage) freepage(page); @@ -551,7 +548,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (!error) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *); - struct mem_cgroup *memcg; unsigned long flags; pgoff_t offset = old->index; @@ -561,9 +557,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - memcg = lock_page_memcg(old); + lock_page_memcg(old); spin_lock_irqsave(&mapping->tree_lock, flags); - __delete_from_page_cache(old, NULL, memcg); + __delete_from_page_cache(old, NULL); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; @@ -576,7 +572,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(old); mem_cgroup_migrate(old, new); radix_tree_preload_end(); if (freepage) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 64506b2..3e41998 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1690,7 +1690,7 @@ cleanup: * This function protects unlocked LRU pages from being moved to * another cgroup and stabilizes their page->mem_cgroup binding. */ -struct mem_cgroup *lock_page_memcg(struct page *page) +void lock_page_memcg(struct page *page) { struct mem_cgroup *memcg; unsigned long flags; @@ -1699,25 +1699,18 @@ struct mem_cgroup *lock_page_memcg(struct page *page) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. - * - * The RCU lock also protects the memcg from being freed when - * the page state that is going to change is the only thing - * preventing the page from being uncharged. - * E.g. end-writeback clearing PageWriteback(), which allows - * migration to go ahead and uncharge the page before the - * account transaction might be complete. 
*/ rcu_read_lock(); if (mem_cgroup_disabled()) - return NULL; + return; again: memcg = page->mem_cgroup; if (unlikely(!memcg)) - return NULL; + return; if (atomic_read(&memcg->moving_account) <= 0) - return memcg; + return; spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { @@ -1733,16 +1726,18 @@ again: memcg->move_lock_task = current; memcg->move_lock_flags = flags; - return memcg; + return; } EXPORT_SYMBOL(lock_page_memcg); /** * unlock_page_memcg - unlock a page->mem_cgroup binding - * @memcg: the memcg returned by lock_page_memcg() + * @page: the page */ -void unlock_page_memcg(struct mem_cgroup *memcg) +void unlock_page_memcg(struct page *page) { + struct mem_cgroup *memcg = page->mem_cgroup; + if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2b5ea12..d7cf2c5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2414,8 +2414,7 @@ int __set_page_dirty_no_writeback(struct page *page) * * NOTE: This relies on being atomic wrt interrupts. */ -void account_page_dirtied(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg) +void account_page_dirtied(struct page *page, struct address_space *mapping) { struct inode *inode = mapping->host; @@ -2427,7 +2426,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping, inode_attach_wb(inode, page); wb = inode_to_wb(inode); - mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); __inc_wb_stat(wb, WB_RECLAIMABLE); @@ -2445,10 +2444,10 @@ EXPORT_SYMBOL(account_page_dirtied); * Caller must hold lock_page_memcg(). 
*/ void account_page_cleaned(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg, struct bdi_writeback *wb) + struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_CACHE_SIZE); @@ -2469,26 +2468,24 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, */ int __set_page_dirty_nobuffers(struct page *page) { - struct mem_cgroup *memcg; - - memcg = lock_page_memcg(page); + lock_page_memcg(page); if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); unsigned long flags; if (!mapping) { - unlock_page_memcg(memcg); + unlock_page_memcg(page); return 1; } spin_lock_irqsave(&mapping->tree_lock, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping, memcg); + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(page); if (mapping->host) { /* !PageAnon && !swapper_space */ @@ -2496,7 +2493,7 @@ int __set_page_dirty_nobuffers(struct page *page) } return 1; } - unlock_page_memcg(memcg); + unlock_page_memcg(page); return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -2626,17 +2623,16 @@ void cancel_dirty_page(struct page *page) if (mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - struct mem_cgroup *memcg; bool locked; - memcg = lock_page_memcg(page); + lock_page_memcg(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) - account_page_cleaned(page, mapping, memcg, wb); + account_page_cleaned(page, mapping, wb); unlocked_inode_to_wb_end(inode, locked); - unlock_page_memcg(memcg); + unlock_page_memcg(page); } else { ClearPageDirty(page); } @@ -2667,7 +2663,6 @@ int clear_page_dirty_for_io(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - struct mem_cgroup *memcg; bool locked; /* @@ -2705,16 +2700,16 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. 
*/ - memcg = lock_page_memcg(page); + lock_page_memcg(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } unlocked_inode_to_wb_end(inode, locked); - unlock_page_memcg(memcg); + unlock_page_memcg(page); return ret; } return TestClearPageDirty(page); @@ -2724,10 +2719,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - struct mem_cgroup *memcg; int ret; - memcg = lock_page_memcg(page); + lock_page_memcg(page); if (mapping) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2751,21 +2745,20 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - unlock_page_memcg(memcg); + unlock_page_memcg(page); return ret; } int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - struct mem_cgroup *memcg; int ret; - memcg = lock_page_memcg(page); + lock_page_memcg(page); if (mapping) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2793,10 +2786,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - unlock_page_memcg(memcg); + unlock_page_memcg(page); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 2871e7d..02f0bfc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1287,21 +1287,17 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { - struct mem_cgroup *memcg; - - memcg = lock_page_memcg(page); + lock_page_memcg(page); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); } - unlock_page_memcg(memcg); + unlock_page_memcg(page); } static void page_remove_file_rmap(struct page *page) { - struct mem_cgroup *memcg; - - memcg = lock_page_memcg(page); + lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ if (unlikely(PageHuge(page))) { @@ -1320,12 +1316,12 @@ static void page_remove_file_rmap(struct page *page) * pte lock(a spinlock) is held, which implies preemption disabled. 
*/ __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); if (unlikely(PageMlocked(page))) clear_page_mlock(page); out: - unlock_page_memcg(memcg); + unlock_page_memcg(page); } static void page_remove_anon_compound_rmap(struct page *page) diff --git a/mm/truncate.c b/mm/truncate.c index 51a24f6..87311af 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -519,7 +519,6 @@ EXPORT_SYMBOL(invalidate_mapping_pages); static int invalidate_complete_page2(struct address_space *mapping, struct page *page) { - struct mem_cgroup *memcg; unsigned long flags; if (page->mapping != mapping) @@ -528,15 +527,15 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - memcg = lock_page_memcg(page); + lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); - __delete_from_page_cache(page, NULL, memcg); + __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(page); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -545,7 +544,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) return 1; failed: spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(page); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index fd434cc..34f7e2d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -603,12 +603,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed) { unsigned long flags; - struct mem_cgroup *memcg; BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - memcg = lock_page_memcg(page); + lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); /* * The non racy check for a busy page. @@ -648,7 +647,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(page); swapcache_free(swap); } else { void (*freepage)(struct page *); @@ -674,9 +673,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(mapping, page); - __delete_from_page_cache(page, shadow, memcg); + __delete_from_page_cache(page, shadow); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(page); if (freepage != NULL) freepage(page); @@ -686,7 +685,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, cannot_free: spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(memcg); + unlock_page_memcg(page); return 0; } diff --git a/mm/workingset.c b/mm/workingset.c index 14bc23a..6130ba0b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -305,10 +305,9 @@ bool workingset_refault(void *shadow) */ void workingset_activation(struct page *page) { - struct mem_cgroup *memcg; struct lruvec *lruvec; - memcg = lock_page_memcg(page); + lock_page_memcg(page); /* * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. 
@@ -316,12 +315,12 @@ void workingset_activation(struct page *page) * XXX: See workingset_refault() - this should return * root_mem_cgroup even for !CONFIG_MEMCG. */ - if (!mem_cgroup_disabled() && !memcg) + if (!mem_cgroup_disabled() && !page_memcg(page)) goto out; - lruvec = mem_cgroup_zone_lruvec(page_zone(page), memcg); + lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page)); atomic_long_inc(&lruvec->inactive_age); out: - unlock_page_memcg(memcg); + unlock_page_memcg(page); } /* -- cgit v1.1 From fdf1cdb91b6ab7a8a91df68c384f36b8a0909cab Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:25 -0700 Subject: mm: remove unnecessary uses of lock_page_memcg() There are several users that nest lock_page_memcg() inside lock_page() to prevent page->mem_cgroup from changing. But the page lock prevents pages from moving between cgroups, so that is unnecessary overhead. Remove lock_page_memcg() in contexts with locked contexts and fix the debug code in the page stat functions to be okay with the page lock. Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 7 +------ mm/page-writeback.c | 2 -- mm/truncate.c | 3 --- mm/vmscan.c | 4 ---- 4 files changed, 1 insertion(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 8e629c4..61b441b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -176,8 +176,7 @@ static void page_cache_tree_delete(struct address_space *mapping, /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock and - * lock_page_memcg(). + * is safe. The caller must hold the mapping's tree_lock. */ void __delete_from_page_cache(struct page *page, void *shadow) { @@ -260,11 +259,9 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; - lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(page); if (freepage) freepage(page); @@ -557,7 +554,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - lock_page_memcg(old); spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(old, NULL); error = radix_tree_insert(&mapping->page_tree, offset, new); @@ -572,7 +568,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(old); mem_cgroup_migrate(old, new); radix_tree_preload_end(); if (freepage) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d7cf2c5..11ff8f7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2700,7 +2700,6 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. 
*/ - lock_page_memcg(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); @@ -2709,7 +2708,6 @@ int clear_page_dirty_for_io(struct page *page) ret = 1; } unlocked_inode_to_wb_end(inode, locked); - unlock_page_memcg(page); return ret; } return TestClearPageDirty(page); diff --git a/mm/truncate.c b/mm/truncate.c index 87311af..7598b55 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -527,7 +527,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); if (PageDirty(page)) goto failed; @@ -535,7 +534,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(page); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -544,7 +542,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) return 1; failed: spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(page); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 34f7e2d..dd98447 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -607,7 +607,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - lock_page_memcg(page); spin_lock_irqsave(&mapping->tree_lock, flags); /* * The non racy check for a busy page. @@ -647,7 +646,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(page); swapcache_free(swap); } else { void (*freepage)(struct page *); @@ -675,7 +673,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow); spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(page); if (freepage != NULL) freepage(page); @@ -685,7 +682,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, cannot_free: spin_unlock_irqrestore(&mapping->tree_lock, flags); - unlock_page_memcg(page); return 0; } -- cgit v1.1 From 88193f7ce6657ec4197b1f26b73b37197373b8e6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 15 Mar 2016 14:57:28 -0700 Subject: mm: use linear_page_index() in do_fault() do_fault() assumes that PAGE_SIZE is the same as PAGE_CACHE_SIZE. Use linear_page_index() to calculate pgoff in the correct units. Signed-off-by: Matthew Wilcox Acked-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 8adb5b7..032f05c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3124,8 +3124,7 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) { - pgoff_t pgoff = (((address & PAGE_MASK) - - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + pgoff_t pgoff = linear_page_index(vma, address); pte_unmap(page_table); /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ -- cgit v1.1 From 8df651c7059e7980f08430d4ebbd134b046657ee Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 15 Mar 2016 14:57:30 -0700 Subject: thp: cleanup split_huge_page() After one of bugfixes to freeze_page(), we don't have freezed pages in rmap, therefore mapcount of all subpages of freezed THP is zero. And we have assert for that. Let's drop code which deal with non-zero mapcount of subpages. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e10a4fe..1ea21e2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3220,28 +3220,26 @@ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) } } -static int __split_huge_page_tail(struct page *head, int tail, +static void __split_huge_page_tail(struct page *head, int tail, struct lruvec *lruvec, struct list_head *list) { - int mapcount; struct page *page_tail = head + tail; - mapcount = atomic_read(&page_tail->_mapcount) + 1; + VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); /* * tail_page->_count is zero and not changing from under us. But * get_page_unless_zero() may be running from under us on the - * tail_page. If we used atomic_set() below instead of atomic_add(), we + * tail_page. If we used atomic_set() below instead of atomic_inc(), we * would then run atomic_set() concurrently with * get_page_unless_zero(), and atomic_set() is implemented in C not * using locked ops. spin_unlock on x86 sometime uses locked ops * because of PPro errata 66, 92, so unless somebody can guarantee * atomic_set() here would be safe on all archs (and not only on x86), - * it's safer to use atomic_add(). + * it's safer to use atomic_inc(). 
*/ - atomic_add(mapcount + 1, &page_tail->_count); - + atomic_inc(&page_tail->_count); page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & @@ -3275,8 +3273,6 @@ static int __split_huge_page_tail(struct page *head, int tail, page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); lru_add_page_tail(head, page_tail, lruvec, list); - - return mapcount; } static void __split_huge_page(struct page *page, struct list_head *list) @@ -3284,7 +3280,7 @@ static void __split_huge_page(struct page *page, struct list_head *list) struct page *head = compound_head(page); struct zone *zone = page_zone(head); struct lruvec *lruvec; - int i, tail_mapcount; + int i; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -3293,10 +3289,8 @@ static void __split_huge_page(struct page *page, struct list_head *list) /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); - tail_mapcount = 0; for (i = HPAGE_PMD_NR - 1; i >= 1; i--) - tail_mapcount += __split_huge_page_tail(head, i, lruvec, list); - atomic_sub(tail_mapcount, &head->_count); + __split_huge_page_tail(head, i, lruvec, list); ClearPageCompound(head); spin_unlock_irq(&zone->lru_lock); -- cgit v1.1 From 5aa174801fc85b4b48a2520b2a50ee78990b7925 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Tue, 15 Mar 2016 14:57:42 -0700 Subject: mm/memblock.c: remove unnecessary memblock_type variable We define struct memblock_type *type in the memblock_add_region() and memblock_reserve_region() functions only for passing it to the memlock_add_range() and memblock_reserve_range() functions. Let's remove these variables and will pass a type directly. Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index dd79899..fc7824f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -612,14 +612,12 @@ static int __init_memblock memblock_add_region(phys_addr_t base, int nid, unsigned long flags) { - struct memblock_type *type = &memblock.memory; - memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_range(type, base, size, nid, flags); + return memblock_add_range(&memblock.memory, base, size, nid, flags); } int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) @@ -740,14 +738,12 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, int nid, unsigned long flags) { - struct memblock_type *type = &memblock.reserved; - memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_range(type, base, size, nid, flags); + return memblock_add_range(&memblock.reserved, base, size, nid, flags); } int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) -- cgit v1.1 From 623446e4dc45b37740268165107cc63abb3022f0 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:57:45 -0700 Subject: mm/compaction: fix invalid free_pfn and compact_cached_free_pfn free_pfn and compact_cached_free_pfn are the pointer that remember restart position of freepage scanner. 
When they are reset or invalid, we set them to zone_end_pfn because the freepage scanner works in reverse direction. But, because the zone range is defined as [zone_start_pfn, zone_end_pfn), zone_end_pfn is not valid to access. Therefore, we should not store it in free_pfn and compact_cached_free_pfn. Instead, we need to store zone_end_pfn - 1 in them. There is one more thing we should consider. The freepage scanner scans backwards, one pageblock at a time. If free_pfn and compact_cached_free_pfn are set to the middle of a pageblock, that situation is treated as if the front part of the pageblock had already been scanned, so we lose the opportunity to scan there. To fix this up, this patch does a round_down() to guarantee that the reset position is pageblock aligned. Note that thanks to the current pageblock_pfn_to_page() implementation, an actual access to zone_end_pfn doesn't happen until now. But the following patch will change pageblock_pfn_to_page(), so this patch is needed from now on. Signed-off-by: Joonsoo Kim Acked-by: David Rientjes Acked-by: Vlastimil Babka Cc: Aaron Lu Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 585de54..56fa321 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -200,7 +200,8 @@ static void reset_cached_positions(struct zone *zone) { zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; - zone->compact_cached_free_pfn = zone_end_pfn(zone); + zone->compact_cached_free_pfn = + round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages); } /* @@ -1371,11 +1372,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) */ cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; - if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { - cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); + if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { + cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages); zone->compact_cached_free_pfn = cc->free_pfn; } - if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { cc->migrate_pfn = start_pfn; zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; -- cgit v1.1 From e1409c325fdc1fef7b3d8025c51892355f065d15 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:57:48 -0700 Subject: mm/compaction: pass only pageblock aligned range to pageblock_pfn_to_page pageblock_pfn_to_page() is used to check that there is a valid pfn and that all pages in the pageblock are in a single zone. If there is a hole in the pageblock, passing an arbitrary position to pageblock_pfn_to_page() could cause the whole pageblock scan to be skipped, instead of just skipping the hole page. For deterministic behaviour, it's better to always pass a pageblock aligned range to pageblock_pfn_to_page(). It will also help further optimization on pageblock_pfn_to_page() in the following patch.
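The pfn arithmetic that these two compaction fixes rely on can be illustrated with a small stand-alone C sketch. The zone bounds and pageblock_nr_pages value below are made-up example numbers rather than values from any real configuration, and round_down()/ALIGN() are reimplemented locally purely for the demonstration:

#include <stdio.h>

/* local stand-ins for the kernel's power-of-two rounding helpers */
#define round_down(x, y)  ((x) & ~((y) - 1))
#define ALIGN(x, y)       (((x) + (y) - 1) & ~((y) - 1))

int main(void)
{
	unsigned long pageblock_nr_pages = 512;  /* e.g. order-9 blocks with 4K pages */
	unsigned long zone_start_pfn = 0x100;    /* hypothetical zone [0x100, 0x8050) */
	unsigned long zone_end_pfn = 0x8050;     /* one past the last valid pfn */
	unsigned long pfn = 0x123;               /* arbitrary scan position in the zone */
	unsigned long free_pfn, block_start_pfn, block_end_pfn;

	/* reset position: last valid pfn, rounded down to a pageblock boundary */
	free_pfn = round_down(zone_end_pfn - 1, pageblock_nr_pages);

	/* pageblock-aligned range around pfn, clamped to the zone start */
	block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
	if (block_start_pfn < zone_start_pfn)
		block_start_pfn = zone_start_pfn;
	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);

	printf("free_pfn reset to %#lx\n", free_pfn);
	printf("pageblock range [%#lx, %#lx)\n", block_start_pfn, block_end_pfn);
	return 0;
}

Built as an ordinary userspace program this prints "free_pfn reset to 0x8000" and "pageblock range [0x100, 0x200)": the reset position never touches zone_end_pfn, and the range handed to pageblock_pfn_to_page() is pageblock aligned but clamped to the zone.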
Signed-off-by: Joonsoo Kim Cc: Aaron Lu Cc: David Rientjes Cc: Mel Gorman Cc: Rik van Riel Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 56fa321..8ce36eb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -555,13 +555,17 @@ unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long isolated, pfn, block_end_pfn; + unsigned long isolated, pfn, block_start_pfn, block_end_pfn; LIST_HEAD(freelist); pfn = start_pfn; + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); for (; pfn < end_pfn; pfn += isolated, + block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { /* Protect pfn from changing by isolate_freepages_block */ unsigned long isolate_start_pfn = pfn; @@ -574,11 +578,13 @@ isolate_freepages_range(struct compact_control *cc, * scanning range to right one. */ if (pfn >= block_end_pfn) { + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); } - if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, @@ -864,18 +870,23 @@ unsigned long isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn, block_end_pfn; + unsigned long pfn, block_start_pfn, block_end_pfn; /* Scan block by block. First and last block may be incomplete */ pfn = start_pfn; + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); for (; pfn < end_pfn; pfn = block_end_pfn, + block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { block_end_pfn = min(block_end_pfn, end_pfn); - if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) continue; pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, @@ -1104,7 +1115,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1; static isolate_migrate_t isolate_migratepages(struct zone *zone, struct compact_control *cc) { - unsigned long low_pfn, end_pfn; + unsigned long block_start_pfn; + unsigned long block_end_pfn; + unsigned long low_pfn; unsigned long isolate_start_pfn; struct page *page; const isolate_mode_t isolate_mode = @@ -1116,16 +1129,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * initialized by compact_zone() */ low_pfn = cc->migrate_pfn; + block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < zone->zone_start_pfn) + block_start_pfn = zone->zone_start_pfn; /* Only scan within a pageblock boundary */ - end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); + block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); /* * Iterate over whole pageblocks until we find the first suitable. * Do not cross the free scanner. 
*/ - for (; end_pfn <= cc->free_pfn; - low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { + for (; block_end_pfn <= cc->free_pfn; + low_pfn = block_end_pfn, + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { /* * This can potentially iterate a massively long zone with @@ -1136,7 +1154,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, && compact_should_abort(cc)) break; - page = pageblock_pfn_to_page(low_pfn, end_pfn, zone); + page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, + zone); if (!page) continue; @@ -1155,8 +1174,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* Perform the isolation */ isolate_start_pfn = low_pfn; - low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, - isolate_mode); + low_pfn = isolate_migratepages_block(cc, low_pfn, + block_end_pfn, isolate_mode); if (!low_pfn || cc->contended) { acct_isolated(zone, cc); -- cgit v1.1 From 7cf91a98e607c2f935dbcc177d70011e95b8faff Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:57:51 -0700 Subject: mm/compaction: speed up pageblock_pfn_to_page() when zone is contiguous There is a performance drop report due to hugepage allocation and in there half of cpu time are spent on pageblock_pfn_to_page() in compaction [1]. In that workload, compaction is triggered to make hugepage but most of pageblocks are un-available for compaction due to pageblock type and skip bit so compaction usually fails. Most costly operations in this case is to find valid pageblock while scanning whole zone range. To check if pageblock is valid to compact, valid pfn within pageblock is required and we can obtain it by calling pageblock_pfn_to_page(). This function checks whether pageblock is in a single zone and return valid pfn if possible. Problem is that we need to check it every time before scanning pageblock even if we re-visit it and this turns out to be very expensive in this workload. Although we have no way to skip this pageblock check in the system where hole exists at arbitrary position, we can use cached value for zone continuity and just do pfn_to_page() in the system where hole doesn't exist. This optimization considerably speeds up in above workload. Before vs After Max: 1096 MB/s vs 1325 MB/s Min: 635 MB/s 1015 MB/s Avg: 899 MB/s 1194 MB/s Avg is improved by roughly 30% [2]. [1]: http://www.spinics.net/lists/linux-mm/msg97378.html [2]: https://lkml.org/lkml/2015/12/9/23 [akpm@linux-foundation.org: don't forget to restore zone->contiguous on error path, per Vlastimil] Signed-off-by: Joonsoo Kim Reported-by: Aaron Lu Acked-by: Vlastimil Babka Tested-by: Aaron Lu Cc: Mel Gorman Cc: Rik van Riel Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 43 ----------------------------- mm/internal.h | 12 +++++++++ mm/memory_hotplug.c | 13 +++++++-- mm/page_alloc.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 100 insertions(+), 46 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 8ce36eb..93f71d9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -71,49 +71,6 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } -/* - * Check that the whole (or subset of) a pageblock given by the interval of - * [start_pfn, end_pfn) is valid and within the same zone, before scanning it - * with the migration of free compaction scanner. 
The scanners then need to - * use only pfn_valid_within() check for arches that allow holes within - * pageblocks. - * - * Return struct page pointer of start_pfn, or NULL if checks were not passed. - * - * It's possible on some configurations to have a setup like node0 node1 node0 - * i.e. it's possible that all pages within a zones range of pages do not - * belong to a single zone. We assume that a border between node0 and node1 - * can occur within a single pageblock, but not a node0 node1 node0 - * interleaving within a single pageblock. It is therefore sufficient to check - * the first and last page of a pageblock and avoid checking each individual - * page in a pageblock. - */ -static struct page *pageblock_pfn_to_page(unsigned long start_pfn, - unsigned long end_pfn, struct zone *zone) -{ - struct page *start_page; - struct page *end_page; - - /* end_pfn is one past the range we are checking */ - end_pfn--; - - if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) - return NULL; - - start_page = pfn_to_page(start_pfn); - - if (page_zone(start_page) != zone) - return NULL; - - end_page = pfn_to_page(end_pfn); - - /* This gives a shorter code than deriving page_zone(end_page) */ - if (page_zone_id(start_page) != page_zone_id(end_page)) - return NULL; - - return start_page; -} - #ifdef CONFIG_COMPACTION /* Do not skip compaction more than 64 times */ diff --git a/mm/internal.h b/mm/internal.h index 6636e1d..ad9400d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -132,6 +132,18 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) return page_idx ^ (1 << order); } +extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone); + +static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone) +{ + if (zone->contiguous) + return pfn_to_page(start_pfn); + + return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); +} + extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 484e867..24ea063 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -512,6 +512,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, int start_sec, end_sec; struct vmem_altmap *altmap; + clear_zone_contiguous(zone); + /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); @@ -524,7 +526,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, if (altmap->base_pfn != phys_start_pfn || vmem_altmap_offset(altmap) > nr_pages) { pr_warn_once("memory add fail, invalid altmap\n"); - return -EINVAL; + err = -EINVAL; + goto out; } altmap->alloc = 0; } @@ -542,7 +545,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, err = 0; } vmemmap_populate_print_last(); - +out: + set_zone_contiguous(zone); return err; } EXPORT_SYMBOL_GPL(__add_pages); @@ -814,6 +818,8 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, } } + clear_zone_contiguous(zone); + /* * We can only remove entire sections */ @@ -829,6 +835,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, if (ret) break; } + + set_zone_contiguous(zone); + return ret; } EXPORT_SYMBOL_GPL(__remove_pages); diff --git a/mm/page_alloc.c 
b/mm/page_alloc.c index 50897dc..c46b75d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1128,6 +1128,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, return __free_pages_boot_core(page, pfn, order); } +/* + * Check that the whole (or subset of) a pageblock given by the interval of + * [start_pfn, end_pfn) is valid and within the same zone, before scanning it + * with the migration of free compaction scanner. The scanners then need to + * use only pfn_valid_within() check for arches that allow holes within + * pageblocks. + * + * Return struct page pointer of start_pfn, or NULL if checks were not passed. + * + * It's possible on some configurations to have a setup like node0 node1 node0 + * i.e. it's possible that all pages within a zones range of pages do not + * belong to a single zone. We assume that a border between node0 and node1 + * can occur within a single pageblock, but not a node0 node1 node0 + * interleaving within a single pageblock. It is therefore sufficient to check + * the first and last page of a pageblock and avoid checking each individual + * page in a pageblock. + */ +struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone) +{ + struct page *start_page; + struct page *end_page; + + /* end_pfn is one past the range we are checking */ + end_pfn--; + + if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) + return NULL; + + start_page = pfn_to_page(start_pfn); + + if (page_zone(start_page) != zone) + return NULL; + + end_page = pfn_to_page(end_pfn); + + /* This gives a shorter code than deriving page_zone(end_page) */ + if (page_zone_id(start_page) != page_zone_id(end_page)) + return NULL; + + return start_page; +} + +void set_zone_contiguous(struct zone *zone) +{ + unsigned long block_start_pfn = zone->zone_start_pfn; + unsigned long block_end_pfn; + + block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); + for (; block_start_pfn < zone_end_pfn(zone); + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { + + block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); + + if (!__pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, zone)) + return; + } + + /* We confirm that there is no hole */ + zone->contiguous = true; +} + +void clear_zone_contiguous(struct zone *zone) +{ + zone->contiguous = false; +} + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static void __init deferred_free_range(struct page *page, unsigned long pfn, int nr_pages) @@ -1278,9 +1347,13 @@ free_range: pgdat_init_report_one_done(); return 0; } +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ void __init page_alloc_init_late(void) { + struct zone *zone; + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT int nid; /* There will be num_node_state(N_MEMORY) threads */ @@ -1294,8 +1367,11 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); +#endif + + for_each_populated_zone(zone) + set_zone_contiguous(zone); } -#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_CMA /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ -- cgit v1.1 From 74485cf2bc85d2a10c3653fff4fe956db67ce2a9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:54 -0700 Subject: mm: migrate: consolidate mem_cgroup_migrate() calls Rather than scattering mem_cgroup_migrate() calls all over the place, have a single call from a safe place where every migration operation eventually ends up in - migrate_page_copy(). 
Signed-off-by: Johannes Weiner Suggested-by: Hugh Dickins Acked-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Mateusz Guzik Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 848327d..568284e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -331,8 +331,6 @@ int migrate_page_move_mapping(struct address_space *mapping, if (PageSwapBacked(page)) SetPageSwapBacked(newpage); - mem_cgroup_migrate(page, newpage); - return MIGRATEPAGE_SUCCESS; } @@ -428,8 +426,6 @@ int migrate_page_move_mapping(struct address_space *mapping, } local_irq_enable(); - mem_cgroup_migrate(page, newpage); - return MIGRATEPAGE_SUCCESS; } @@ -471,8 +467,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_migrate(page, newpage); - return MIGRATEPAGE_SUCCESS; } @@ -586,6 +580,8 @@ void migrate_page_copy(struct page *newpage, struct page *page) end_page_writeback(newpage); copy_page_owner(page, newpage); + + mem_cgroup_migrate(page, newpage); } /************************************************************ @@ -1846,7 +1842,6 @@ fail_putback: } mlock_migrate_page(new_page, page); - mem_cgroup_migrate(page, new_page); page_remove_rmap(page, true); set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); -- cgit v1.1 From 9cf7666ace654cc50f815e95f0415e3fcdd6fcfe Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 15 Mar 2016 14:57:58 -0700 Subject: mm: memcontrol: drop unnecessary lru locking from mem_cgroup_migrate() Migration accounting in the memory controller used to have to handle both oldpage and newpage being on the LRU already; fuse's page cache replacement used to pass a recycled newpage that had been uncharged but not freed and removed from the LRU, and the memcg migration code used to uncharge oldpage to "pass on" the existing charge to newpage. Nowadays, pages are no longer uncharged when truncated from the page cache, but rather only at free time, so if a LRU page is recycled in page cache replacement it'll also still be charged. And we bail out of the charge transfer altogether in that case. Tell commit_charge() that we know newpage is not on the LRU, to avoid taking the zone->lru_lock unnecessarily from the migration path. But also, oldpage is no longer uncharged inside migration. We only use oldpage for its page->mem_cgroup and page size, so we don't care about its LRU state anymore either. Remove any mention from the kernel doc. Signed-off-by: Johannes Weiner Suggested-by: Hugh Dickins Acked-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Mateusz Guzik Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3e41998..42882c1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5489,7 +5489,6 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) * be uncharged upon free. * * Both pages must be locked, @newpage->mapping must be set up. - * Either or both pages might be on the LRU already. 
*/ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) { @@ -5524,7 +5523,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) page_counter_charge(&memcg->memsw, nr_pages); css_get_many(&memcg->css, nr_pages); - commit_charge(newpage, memcg, true); + commit_charge(newpage, memcg, false); local_irq_disable(); mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); -- cgit v1.1 From 6a618957ad17d8f4f4c7eeede752685374b1b176 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:17:26 -0700 Subject: mm: oom_kill: don't ignore oom score on exiting tasks When the OOM killer scans tasks and encounters a PF_EXITING one, it force-selects that task regardless of the score. The problem is that if that task got stuck waiting for some state the allocation site is holding, the OOM reaper can not move on to the next best victim. Frankly, I don't even know why we check for exiting tasks in the OOM killer. We've tried direct reclaim at least 15 times by the time we decide the system is OOM, there was plenty of time to exit and free memory; and a task might exit voluntarily right after we issue a kill. This is testing pure noise. Remove it. Signed-off-by: Tetsuo Handa Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: David Rientjes Cc: Oleg Nesterov Cc: Hugh Dickins Cc: Andrea Argangeli Cc: Rik van Riel Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e97a05d..63ced70 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -287,9 +287,6 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, if (oom_task_origin(task)) return OOM_SCAN_SELECT; - if (task_will_free_mem(task) && !is_sysrq_oom(oc)) - return OOM_SCAN_ABORT; - return OOM_SCAN_OK; } -- cgit v1.1 From fcff7d7eebe6d31e2ce20d994555c86a90197034 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:29 -0700 Subject: mm: memcontrol: do not bypass slab charge if memcg is offline Slab pages are charged in two steps. First, an appropriate per memcg cache is selected (see memcg_kmem_get_cache) basing on the current context, then the new slab page is charged to the memory cgroup which the selected cache was created for (see memcg_charge_slab -> __memcg_kmem_charge_memcg). It is OK to bypass kmemcg charge at step 1, but if step 1 succeeded and we successfully allocated a new slab page, step 2 must be performed, otherwise we would get a per memcg kmem cache which contains a slab that does not hold a reference to the memory cgroup owning the cache. Since per memcg kmem caches are destroyed on memcg css free, this could result in freeing a cache while there are still active objects in it. However, currently we will bypass slab page charge if the memory cgroup owning the cache is offline (see __memcg_kmem_charge_memcg). This is very unlikely to occur in practice, because for this to happen a process must be migrated to a different cgroup and the old cgroup must be removed while the process is in kmalloc somewhere between steps 1 and 2 (e.g. trying to allocate a new page). Nevertheless, it's still better to eliminate such a possibility. 
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 42882c1..5c9d45e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2325,9 +2325,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct page_counter *counter; int ret; - if (!memcg_kmem_online(memcg)) - return 0; - ret = try_charge(memcg, gfp, nr_pages); if (ret) return ret; @@ -2346,10 +2343,11 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; - int ret; + int ret = 0; memcg = get_mem_cgroup_from_mm(current->mm); - ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); + if (memcg_kmem_online(memcg)) + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); return ret; } -- cgit v1.1 From 72b54e7314a2e7a68567c92bbb32fe2598a3c783 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:32 -0700 Subject: mm: memcontrol: make tree_{stat,events} fetch all stats Currently, tree_{stat,events} helpers can only get one stat index at a time, so when there are a lot of stats to be reported one has to call it over and over again (see memory_stat_show). This is neither effective, nor does it look good. Instead, let's make these helpers take a snapshot of all available counters. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 67 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5c9d45e..4302660 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2717,39 +2717,48 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return retval; } -static unsigned long tree_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) +static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) { struct mem_cgroup *iter; - unsigned long val = 0; + int i; - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_stat(iter, idx); + memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT); - return val; + for_each_mem_cgroup_tree(iter, memcg) { + for (i = 0; i < MEMCG_NR_STAT; i++) + stat[i] += mem_cgroup_read_stat(iter, i); + } } -static unsigned long tree_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx) +static void tree_events(struct mem_cgroup *memcg, unsigned long *events) { struct mem_cgroup *iter; - unsigned long val = 0; + int i; - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_events(iter, idx); + memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS); - return val; + for_each_mem_cgroup_tree(iter, memcg) { + for (i = 0; i < MEMCG_NR_EVENTS; i++) + events[i] += mem_cgroup_read_events(iter, i); + } } static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { - unsigned long val; + unsigned long val = 0; if (mem_cgroup_is_root(memcg)) { - val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); - if (swap) - val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, memcg) { + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_CACHE); + val += 
mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_RSS); + if (swap) + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_SWAP); + } } else { if (!swap) val = page_counter_read(&memcg->memory); @@ -5075,6 +5084,8 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long stat[MEMCG_NR_STAT]; + unsigned long events[MEMCG_NR_EVENTS]; int i; /* @@ -5088,22 +5099,22 @@ static int memory_stat_show(struct seq_file *m, void *v) * Current memory state: */ + tree_stat(memcg, stat); + tree_events(memcg, events); + seq_printf(m, "anon %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); seq_printf(m, "sock %llu\n", - (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE); + (u64)stat[MEMCG_SOCK] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE); for (i = 0; i < NR_LRU_LISTS; i++) { struct mem_cgroup *mi; @@ -5118,9 +5129,9 @@ static int memory_stat_show(struct seq_file *m, void *v) /* Accumulated memory events */ seq_printf(m, "pgfault %lu\n", - tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT)); + events[MEM_CGROUP_EVENTS_PGFAULT]); seq_printf(m, "pgmajfault %lu\n", - tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT)); + events[MEM_CGROUP_EVENTS_PGMAJFAULT]); return 0; } -- cgit v1.1 From 27ee57c93ff00b8a2d6c6dd6b0b3dddda7b43b77 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:35 -0700 Subject: mm: memcontrol: report slab usage in cgroup2 memory.stat Show how much memory is used for storing reclaimable and unreclaimable in-kernel data structures allocated from slab caches. 
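For consumers of this interface, the new counters show up as extra lines in a cgroup2 group's memory.stat file. A minimal userspace sketch for pulling out just those lines (the memory.stat path is whatever group you point it at under your cgroup2 mount, which is an assumption about the local setup; the field names come from the seq_printf() calls in the patch below):

#include <stdio.h>
#include <string.h>

/* Usage: ./slabstat /sys/fs/cgroup/<group>/memory.stat */
int main(int argc, char **argv)
{
	char line[256];
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <path to memory.stat>\n", argv[0]);
		return 1;
	}
	f = fopen(argv[1], "r");
	if (!f) {
		perror("fopen");
		return 1;
	}
	/* print the slab, slab_reclaimable and slab_unreclaimable lines */
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "slab", 4))
			fputs(line, stdout);
	fclose(f);
	return 0;
}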
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++++++ mm/slab.c | 8 +++++--- mm/slab.h | 30 ++++++++++++++++++++++++++++-- mm/slub.c | 3 ++- 4 files changed, 43 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4302660..3ad64bf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5106,6 +5106,9 @@ static int memory_stat_show(struct seq_file *m, void *v) (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + seq_printf(m, "slab %llu\n", + (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", (u64)stat[MEMCG_SOCK] * PAGE_SIZE); @@ -5126,6 +5129,11 @@ static int memory_stat_show(struct seq_file *m, void *v) mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); } + seq_printf(m, "slab_reclaimable %llu\n", + (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE); + seq_printf(m, "slab_unreclaimable %llu\n", + (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + /* Accumulated memory events */ seq_printf(m, "pgfault %lu\n", diff --git a/mm/slab.c b/mm/slab.c index 852fc5c..56dd0df 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1442,9 +1442,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, */ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) { - const unsigned long nr_freed = (1 << cachep->gfporder); + int order = cachep->gfporder; + unsigned long nr_freed = (1 << order); - kmemcheck_free_shadow(page, cachep->gfporder); + kmemcheck_free_shadow(page, order); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) sub_zone_page_state(page_zone(page), @@ -1461,7 +1462,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_kmem_pages(page, cachep->gfporder); + memcg_uncharge_slab(page, order, cachep); + __free_pages(page, order); } static void kmem_rcu_free(struct rcu_head *head) diff --git a/mm/slab.h b/mm/slab.h index b793436..ff39a8f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -246,12 +246,33 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { + int ret; + if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - return __memcg_kmem_charge_memcg(page, gfp, order, - s->memcg_params.memcg); + + ret = __memcg_kmem_charge_memcg(page, gfp, order, + s->memcg_params.memcg); + if (ret) + return ret; + + memcg_kmem_update_page_stat(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + 1 << order); + return 0; +} + +static __always_inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ + memcg_kmem_update_page_stat(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
+ MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + -(1 << order)); + memcg_kmem_uncharge(page, order); } extern void slab_init_memcg_params(struct kmem_cache *); @@ -294,6 +315,11 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, return 0; } +static inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ +} + static inline void slab_init_memcg_params(struct kmem_cache *s) { } diff --git a/mm/slub.c b/mm/slub.c index 6c91324..712d534 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1540,7 +1540,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_kmem_pages(page, order); + memcg_uncharge_slab(page, order, s); + __free_pages(page, order); } #define need_reserve_slab_rcu \ -- cgit v1.1 From 12580e4b54ba8a1b22ec977c200be0174ca42348 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:38 -0700 Subject: mm: memcontrol: report kernel stack usage in cgroup2 memory.stat Show how much memory is allocated to kernel stacks. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3ad64bf..4b7dda7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5106,6 +5106,8 @@ static int memory_stat_show(struct seq_file *m, void *v) (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + seq_printf(m, "kernel_stack %llu\n", + (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); seq_printf(m, "slab %llu\n", (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); -- cgit v1.1 From 832fc1de01aea28255cb11d270679b7f1273f0d7 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 17 Mar 2016 14:17:41 -0700 Subject: /proc/kpageflags: return KPF_BUDDY for "tail" buddy pages Currently /proc/kpageflags returns nothing for "tail" buddy pages, which is inconvenient when grasping how free pages are distributed. This patch sets KPF_BUDDY for such pages. With this patch: $ grep MemFree /proc/meminfo ; tools/vm/page-types -b buddy MemFree: 3134992 kB flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000400 779272 3044 __________B_______________________________ buddy 0x0000000000000c00 4385 17 __________BM______________________________ buddy,mmap total 783657 3061 783657 pages is 3134628 kB (roughly consistent with the global counter,) so it's OK. 
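The same information can also be cross-checked from userspace by reading /proc/kpageflags directly rather than going through page-types. A rough sketch (needs root; the KPF_BUDDY bit number 10 is inferred from the 0x400 flag value in the page-types output above):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t KPF_BUDDY = 1ULL << 10;   /* matches the 0x400 flag above */
	uint64_t flags;
	unsigned long buddy = 0, total = 0;
	FILE *f = fopen("/proc/kpageflags", "rb");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* one 64-bit flags word per pfn, in pfn order */
	while (fread(&flags, sizeof(flags), 1, f) == 1) {
		total++;
		if (flags & KPF_BUDDY)
			buddy++;
	}
	fclose(f);
	printf("%lu of %lu pfns have the buddy flag set\n", buddy, total);
	return 0;
}

With this patch applied, such a count includes the tail pages of high-order free buddy blocks as well, not only their head pages.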
[akpm@linux-foundation.org: update comment, per Naoya] Signed-off-by: Naoya Horiguchi Reviewed-by: Vladimir Davydov > Cc: Konstantin Khlebnikov Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 --- mm/page_alloc.c | 2 -- 2 files changed, 5 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index ad9400d..b95952c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -148,9 +148,6 @@ extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); -#ifdef CONFIG_MEMORY_FAILURE -extern bool is_free_buddy_page(struct page *page); -#endif extern int user_min_free_kbytes; #if defined CONFIG_COMPACTION || defined CONFIG_CMA diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c46b75d..c7332a4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7152,7 +7152,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) } #endif -#ifdef CONFIG_MEMORY_FAILURE bool is_free_buddy_page(struct page *page) { struct zone *zone = page_zone(page); @@ -7171,4 +7170,3 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } -#endif -- cgit v1.1 From f48d97f340cbb0c323fa7a7b36bd76a108a9f49f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:17:49 -0700 Subject: mm/vmalloc: query dynamic DEBUG_PAGEALLOC setting As CONFIG_DEBUG_PAGEALLOC can be enabled/disabled via kernel parameters we can optimize some cases by checking the enablement state. This is follow-up work for Christian's Optimize CONFIG_DEBUG_PAGEALLOC: https://lkml.org/lkml/2016/1/27/194 Remaining work is to make sparc to be aware of this but it looks not easy for me so I skip that in this series. This patch (of 5): We can disable debug_pagealloc processing even if the code is complied with CONFIG_DEBUG_PAGEALLOC. This patch changes the code to query whether it is enabled or not in runtime. [akpm@linux-foundation.org: update comment, per David. Adjust comment to use 80 cols] Signed-off-by: Joonsoo Kim Reviewed-by: Christian Borntraeger Acked-by: David Rientjes Cc: Benjamin Herrenschmidt Cc: Takashi Iwai Cc: Chris Metcalf Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fb42a5b..d4b2e34 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -531,22 +531,21 @@ static void unmap_vmap_area(struct vmap_area *va) static void vmap_debug_free_range(unsigned long start, unsigned long end) { /* - * Unmap page tables and force a TLB flush immediately if - * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free - * bugs similarly to those in linear kernel virtual address - * space after a page has been freed. + * Unmap page tables and force a TLB flush immediately if pagealloc + * debugging is enabled. This catches use after free bugs similarly to + * those in linear kernel virtual address space after a page has been + * freed. * - * All the lazy freeing logic is still retained, in order to - * minimise intrusiveness of this debugging feature. + * All the lazy freeing logic is still retained, in order to minimise + * intrusiveness of this debugging feature. 
* - * This is going to be *slow* (linear kernel virtual address - * debugging doesn't do a broadcast TLB flush so it is a lot - * faster). + * This is going to be *slow* (linear kernel virtual address debugging + * doesn't do a broadcast TLB flush so it is a lot faster). */ -#ifdef CONFIG_DEBUG_PAGEALLOC - vunmap_page_range(start, end); - flush_tlb_kernel_range(start, end); -#endif + if (debug_pagealloc_enabled()) { + vunmap_page_range(start, end); + flush_tlb_kernel_range(start, end); + } } /* -- cgit v1.1 From 922d566cdcb7166c729ff67bb15ff5f93a3774b6 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:17:53 -0700 Subject: mm/slub: query dynamic DEBUG_PAGEALLOC setting We can disable debug_pagealloc processing even if the code is compiled with CONFIG_DEBUG_PAGEALLOC. This patch changes the code to query whether it is enabled or not in runtime. [akpm@linux-foundation.org: clean up code, per Christian] Signed-off-by: Joonsoo Kim Reviewed-by: Christian Borntraeger Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Christoph Lameter Cc: David Rientjes Cc: Pekka Enberg Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 712d534..2f2f04d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -254,11 +254,10 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { void *p; -#ifdef CONFIG_DEBUG_PAGEALLOC + if (!debug_pagealloc_enabled()) + return get_freepointer(s, object); + probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); -#else - p = get_freepointer(s, object); -#endif return p; } -- cgit v1.1 From 505f6d22dbc63f333d1178dc80264e40b5c35268 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:17:56 -0700 Subject: sound: query dynamic DEBUG_PAGEALLOC setting We can disable debug_pagealloc processing even if the code is compiled with CONFIG_DEBUG_PAGEALLOC. This patch changes the code to query whether it is enabled or not in runtime. [akpm@linux-foundation.org: export _debug_pagealloc_enabled to modules] Signed-off-by: Joonsoo Kim Acked-by: David Rientjes Acked-by: Takashi Iwai Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Christian Borntraeger Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c7332a4..b1fc19e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -498,6 +498,7 @@ void prep_compound_page(struct page *page, unsigned int order) unsigned int _debug_guardpage_minorder; bool _debug_pagealloc_enabled __read_mostly = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); +EXPORT_SYMBOL(_debug_pagealloc_enabled); bool _debug_guardpage_enabled __read_mostly; static int __init early_debug_pagealloc(char *buf) -- cgit v1.1 From 81c5857b279e6b18f6ff0d1975e80a07af542cd1 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:05 -0700 Subject: mm, kswapd: remove bogus check of balance_classzone_idx During work on kcompactd integration I have spotted a confusing check of balance_classzone_idx, which I believe is bogus. The balanced_classzone_idx is filled by balance_pgdat() as the highest zone it attempted to balance. This was introduced by commit dc83edd941f4 ("mm: kswapd: use the classzone idx that kswapd was using for sleeping_prematurely()"). 
The intention is that (as expressed in today's function names), the value used for kswapd_shrink_zone() calls in balance_pgdat() is the same as for the decisions in kswapd_try_to_sleep(). An unwanted side-effect of that commit was breaking the checks in kswapd() for whether there was another kswapd wakeup with a tighter (=lower) classzone_idx. Commits 215ddd6664ce ("mm: vmscan: only read new_classzone_idx from pgdat when reclaiming successfully") and d2ebd0f6b895 ("kswapd: avoid unnecessary rebalance after an unsuccessful balancing") tried to fix it, but apparently introduced a bogus check that this patch removes. Consider zone indexes X < Y < Z, where: - Z is the value used for the first kswapd wakeup. - Y is returned as balanced_classzone_idx, which means zones with index higher than Y (including Z) were found to be unreclaimable. - X is the value used for the second kswapd wakeup. The new wakeup with value X means that kswapd is now supposed to balance harder all zones with index <= X. But instead, due to Y < Z, it will go to sleep and won't read the new value X. This is subtly wrong. The effect of this patch is that kswapd will react better in some situations, where e.g. the first wakeup is for ZONE_DMA32, the second is for ZONE_DMA, and due to unreclaimable ZONE_NORMAL. Before this patch, kswapd would go to sleep instead of reclaiming ZONE_DMA harder. I expect these situations are very rare, and more value is in better maintainability due to the removal of a confusing and bogus check. Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index dd98447..5dcc711 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3468,8 +3468,7 @@ static int kswapd(void *p) * new request of a similar or harder type will succeed soon * so consider going to sleep on the basis we reclaimed at */ - if (balanced_classzone_idx >= new_classzone_idx && - balanced_order == new_order) { + if (balanced_order == new_order) { new_order = pgdat->kswapd_max_order; new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; -- cgit v1.1 From 698b1b30642f1ff0ea10ef1de9745ab633031377 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:08 -0700 Subject: mm, compaction: introduce kcompactd Memory compaction can currently be performed in several contexts: - kswapd balancing a zone after a high-order allocation failure - direct compaction to satisfy a high-order allocation, including THP page fault attempts - khugepaged trying to collapse a hugepage - manually from /proc The purpose of compaction is two-fold. The obvious purpose is to satisfy a (pending or future) high-order allocation, and is easy to evaluate. The other purpose is to keep overall memory fragmentation low and help the anti-fragmentation mechanism. The success wrt the latter purpose is more difficult to evaluate. The current situation wrt the purposes has a few drawbacks: - compaction is invoked only when a high-order page or hugepage is not available (or manually). This might be too late for the purposes of keeping memory fragmentation low. - direct compaction increases latency of allocations. Again, it would be better if compaction was performed asynchronously to keep fragmentation low, before the allocation itself comes.
- (a special case of the previous) the cost of compaction during THP page faults can easily offset the benefits of THP. - kswapd compaction appears to be complex, fragile and not working in some scenarios. It could also end up compacting for a high-order allocation request when it should be reclaiming memory for a later order-0 request. To improve the situation, we should be able to benefit from an equivalent of kswapd, but for compaction - i.e. a background thread which responds to fragmentation and the need for high-order allocations (including hugepages) somewhat proactively. One possibility is to extend the responsibilities of kswapd, which could however complicate its design too much. It should be better to let kswapd handle reclaim, as order-0 allocations are often more critical than high-order ones. Another possibility is to extend khugepaged, but this kthread is a single instance and tied to THP configs. This patch goes with the option of a new set of per-node kthreads called kcompactd, and lays the foundations, without introducing any new tunables. The lifecycle mimics kswapd kthreads, including the memory hotplug hooks. For compaction, kcompactd uses the standard compaction_suitable() and compact_finished() criteria and the deferred compaction functionality. Unlike direct compaction, it uses only sync compaction, as there's no allocation latency to minimize. This patch doesn't yet add a call to wakeup_kcompactd. The kswapd compact/reclaim loop for high-order pages will be replaced by waking up kcompactd in the next patch, along with a description of what's wrong with the old approach. Waking up the kcompactd threads is also tied to kswapd activity and follows these rules: - we don't want to affect any fastpaths, so wake up kcompactd only from the slowpath, as it's done for kswapd - if kswapd is doing reclaim, it's more important than compaction, so don't invoke kcompactd until kswapd goes to sleep - the target order used for kswapd is passed to kcompactd Possible future uses for kcompactd include the ability to wake up kcompactd on demand in special situations, such as when hugepages are not available (currently not done due to __GFP_NO_KSWAPD) or when a fragmentation event (i.e. __rmqueue_fallback()) occurs. It's also possible to perform periodic compaction with kcompactd. [arnd@arndb.de: fix build errors with kcompactd] [paul.gortmaker@windriver.com: don't use modular references for non modular code] Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Arnd Bergmann Signed-off-by: Paul Gortmaker Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/memory_hotplug.c | 9 ++- mm/page_alloc.c | 3 + mm/vmstat.c | 1 + 4 files changed, 233 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 93f71d9..5b2bfba 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -7,6 +7,7 @@ * * Copyright IBM Corp.
2007-2010 Mel Gorman */ +#include #include #include #include @@ -17,6 +18,8 @@ #include #include #include +#include +#include #include "internal.h" #ifdef CONFIG_COMPACTION @@ -1736,4 +1739,223 @@ void compaction_unregister_node(struct node *node) } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ +static inline bool kcompactd_work_requested(pg_data_t *pgdat) +{ + return pgdat->kcompactd_max_order > 0; +} + +static bool kcompactd_node_suitable(pg_data_t *pgdat) +{ + int zoneid; + struct zone *zone; + enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; + + for (zoneid = 0; zoneid < classzone_idx; zoneid++) { + zone = &pgdat->node_zones[zoneid]; + + if (!populated_zone(zone)) + continue; + + if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, + classzone_idx) == COMPACT_CONTINUE) + return true; + } + + return false; +} + +static void kcompactd_do_work(pg_data_t *pgdat) +{ + /* + * With no special task, compact all zones so that a page of requested + * order is allocatable. + */ + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = pgdat->kcompactd_max_order, + .classzone_idx = pgdat->kcompactd_classzone_idx, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = true, + + }; + bool success = false; + + trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, + cc.classzone_idx); + count_vm_event(KCOMPACTD_WAKE); + + for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) { + int status; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + if (compaction_deferred(zone, cc.order)) + continue; + + if (compaction_suitable(zone, cc.order, 0, zoneid) != + COMPACT_CONTINUE) + continue; + + cc.nr_freepages = 0; + cc.nr_migratepages = 0; + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + status = compact_zone(zone, &cc); + + if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone), + cc.classzone_idx, 0)) { + success = true; + compaction_defer_reset(zone, cc.order, false); + } else if (status == COMPACT_COMPLETE) { + /* + * We use sync migration mode here, so we defer like + * sync direct compaction does. + */ + defer_compaction(zone, cc.order); + } + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + /* + * Regardless of success, we are done until woken up next. But remember + * the requested order/classzone_idx in case it was higher/tighter than + * our current ones + */ + if (pgdat->kcompactd_max_order <= cc.order) + pgdat->kcompactd_max_order = 0; + if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx) + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; +} + +void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +{ + if (!order) + return; + + if (pgdat->kcompactd_max_order < order) + pgdat->kcompactd_max_order = order; + + if (pgdat->kcompactd_classzone_idx > classzone_idx) + pgdat->kcompactd_classzone_idx = classzone_idx; + + if (!waitqueue_active(&pgdat->kcompactd_wait)) + return; + + if (!kcompactd_node_suitable(pgdat)) + return; + + trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, + classzone_idx); + wake_up_interruptible(&pgdat->kcompactd_wait); +} + +/* + * The background compaction daemon, started as a kernel thread + * from the init process. 
+ */ +static int kcompactd(void *p) +{ + pg_data_t *pgdat = (pg_data_t*)p; + struct task_struct *tsk = current; + + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + + set_freezable(); + + pgdat->kcompactd_max_order = 0; + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + + while (!kthread_should_stop()) { + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); + wait_event_freezable(pgdat->kcompactd_wait, + kcompactd_work_requested(pgdat)); + + kcompactd_do_work(pgdat); + } + + return 0; +} + +/* + * This kcompactd start function will be called by init and node-hot-add. + * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added. + */ +int kcompactd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int ret = 0; + + if (pgdat->kcompactd) + return 0; + + pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid); + if (IS_ERR(pgdat->kcompactd)) { + pr_err("Failed to start kcompactd on node %d\n", nid); + ret = PTR_ERR(pgdat->kcompactd); + pgdat->kcompactd = NULL; + } + return ret; +} + +/* + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold mem_hotplug_begin/end(). + */ +void kcompactd_stop(int nid) +{ + struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd; + + if (kcompactd) { + kthread_stop(kcompactd); + NODE_DATA(nid)->kcompactd = NULL; + } +} + +/* + * It's optimal to keep kcompactd on the same CPUs as their memory, but + * not required for correctness. So if the last cpu in a node goes + * away, we get changed to run anywhere: as the first one comes back, + * restore their cpu bindings. + */ +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int nid; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->kcompactd, mask); + } + } + return NOTIFY_OK; +} + +static int __init kcompactd_init(void) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) + kcompactd_run(nid); + hotcpu_notifier(cpu_callback, 0); + return 0; +} +subsys_initcall(kcompactd_init) + #endif /* CONFIG_COMPACTION */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 24ea063..d9bcb26 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -1105,8 +1106,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ init_per_zone_wmark_min(); - if (onlined_pages) + if (onlined_pages) { kswapd_run(zone_to_nid(zone)); + kcompactd_run(nid); + } vm_total_pages = nr_free_pagecache_pages(); @@ -1880,8 +1883,10 @@ repeat: zone_pcp_update(zone); node_states_clear_node(node, &arg); - if (arg.status_change_nid >= 0) + if (arg.status_change_nid >= 0) { kswapd_stop(node); + kcompactd_stop(node); + } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b1fc19e..25a75da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5405,6 +5405,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); +#ifdef CONFIG_COMPACTION + 
init_waitqueue_head(&pgdat->kcompactd_wait); +#endif pgdat_page_ext_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { diff --git a/mm/vmstat.c b/mm/vmstat.c index 69ce64f..f800662 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -826,6 +826,7 @@ const char * const vmstat_text[] = { "compact_stall", "compact_fail", "compact_success", + "compact_daemon_wake", #endif #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.1 From e888ca3545dc6823603b976e40b62af2c68b6fcc Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:12 -0700 Subject: mm, memory hotplug: small cleanup in online_pages() We can reuse the nid we've determined instead of repeated pfn_to_nid() usages. Also zone_to_nid() should be a bit cheaper in general than pfn_to_nid(). Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d9bcb26..c832ef3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1055,7 +1055,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = pfn_to_nid(pfn); + nid = zone_to_nid(zone); ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); @@ -1095,7 +1095,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ pgdat_resize_unlock(zone->zone_pgdat, &flags); if (onlined_pages) { - node_states_set_node(zone_to_nid(zone), &arg); + node_states_set_node(nid, &arg); if (need_zonelists_rebuild) build_all_zonelists(NULL, NULL); else @@ -1107,7 +1107,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ init_per_zone_wmark_min(); if (onlined_pages) { - kswapd_run(zone_to_nid(zone)); + kswapd_run(nid); kcompactd_run(nid); } -- cgit v1.1 From accf62422b3a67fce8ce086aa81c8300ddbf42be Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:15 -0700 Subject: mm, kswapd: replace kswapd compaction with waking up kcompactd Similarly to direct reclaim/compaction, kswapd attempts to combine reclaim and compaction to make a memory allocation of a given order available. The details differ from direct reclaim e.g. in having high watermark as a goal. The code involved in kswapd's reclaim/compaction decisions has evolved to be quite complex. Testing reveals that it doesn't actually work in at least one scenario, and closer inspection suggests that it could be greatly simplified without compromising on the goal (make high-order page available) or efficiency (don't reclaim too much). The simplification relies on doing all compaction in kcompactd, which is simply woken up when high watermarks are reached by kswapd's reclaim. The scenario where kswapd compaction doesn't work was found with mmtests test stress-highalloc configured to attempt order-9 allocations without direct reclaim, just waking up kswapd. There was no compaction attempt from kswapd during the whole test.
Some added instrumentation shows what happens: - balance_pgdat() sets end_zone to Normal, as it's not balanced - reclaim is attempted on DMA zone, which sets nr_attempted to 99, but it cannot reclaim anything, so sc.nr_reclaimed is 0 - for zones DMA32 and Normal, kswapd_shrink_zone uses testorder=0, so it merely checks if high watermarks were reached for base pages. This is true, so no reclaim is attempted. For DMA, testorder=0 wasn't used, as compaction_suitable() returned COMPACT_SKIPPED - even though the pgdat_needs_compaction flag wasn't set to false, no compaction happens due to the condition sc.nr_reclaimed > nr_attempted being false (as 0 < 99) - priority-- due to nr_reclaimed being 0, repeat until priority reaches 0 pgdat_balanced() is false as only the small zone DMA appears balanced (curiously in that check, watermark appears OK and compaction_suitable() returns COMPACT_PARTIAL, because a lower classzone_idx is used there) Now, even if it was decided that reclaim shouldn't be attempted on the DMA zone, the scenario would be the same, as (sc.nr_reclaimed=0 > nr_attempted=0) is also false. The condition really should use >= as the comment suggests. Then there is a mismatch in the check for setting pgdat_needs_compaction to false using low watermark, while the rest uses high watermark, and who knows what other subtlety. Hopefully this demonstrates that this is unsustainable. Luckily we can simplify this a lot. The reclaim/compaction decisions make sense for direct reclaim scenario, but in kswapd, our primary goal is to reach high watermark in order-0 pages. Afterwards we can attempt compaction just once. Unlike direct reclaim, we don't reclaim extra pages (over the high watermark), the current code already disallows it for good reasons. After this patch, we simply wake up kcompactd to process the pgdat, after we have either succeeded or failed to reach the high watermarks in kswapd, which goes to sleep. We pass kswapd's order and classzone_idx, so kcompactd can apply the same criteria to determine which zones are worth compacting. Note that we use the classzone_idx from wakeup_kswapd(), not balanced_classzone_idx which can include higher zones that kswapd tried to balance too, but didn't consider them in pgdat_balanced(). Since kswapd now cannot create high-order pages itself, we need to adjust how it determines the zones to be balanced. The key element here is adding a "highorder" parameter to zone_balanced, which, when set to false, makes it consider only order-0 watermark instead of the desired higher order (this was done previously by kswapd_shrink_zone(), but not elsewhere). This false is passed for example in pgdat_balanced(). Importantly, wakeup_kswapd() uses true to make sure kswapd and thus kcompactd are woken up for a high-order allocation failure. The last thing is to decide what to do with pageblock_skip bitmap handling. Compaction maintains a pageblock_skip bitmap to record pageblocks where isolation recently failed. This bitmap can be reset by three ways: 1) direct compaction is restarting after going through the full deferred cycle 2) kswapd goes to sleep, and some other direct compaction has previously finished scanning the whole zone and set zone->compact_blockskip_flush. Note that a successful direct compaction clears this flag. 3) compaction was invoked manually via trigger in /proc The case 2) is somewhat fuzzy to begin with, but after introducing kcompactd we should update it. 
The check for direct compaction in 1), and to set the flush flag in 2) use current_is_kswapd(), which doesn't work for kcompactd. Thus, this patch adds bool direct_compaction to compact_control to use in 2). For the case 1) we remove the check completely - unlike the former kswapd compaction, kcompactd does use the deferred compaction functionality, so flushing tied to restarting from deferred compaction makes sense here. Note that when kswapd goes to sleep, kcompactd is woken up, so it will see the flushed pageblock_skip bits. This is different from when the former kswapd compaction observed the bits and I believe it makes more sense. Kcompactd can afford to be more thorough than a direct compaction trying to limit allocation latency, or kswapd whose primary goal is to reclaim. For testing, I used stress-highalloc configured to do order-9 allocations with GFP_NOWAIT|__GFP_HIGH|__GFP_COMP, so they relied just on kswapd/kcompactd reclaim/compaction (the interfering kernel builds in phases 1 and 2 work as usual): stress-highalloc 4.5-rc1+before 4.5-rc1+after -nodirect -nodirect Success 1 Min 1.00 ( 0.00%) 5.00 (-66.67%) Success 1 Mean 1.40 ( 0.00%) 6.20 (-55.00%) Success 1 Max 2.00 ( 0.00%) 7.00 (-16.67%) Success 2 Min 1.00 ( 0.00%) 5.00 (-66.67%) Success 2 Mean 1.80 ( 0.00%) 6.40 (-52.38%) Success 2 Max 3.00 ( 0.00%) 7.00 (-16.67%) Success 3 Min 34.00 ( 0.00%) 62.00 ( 1.59%) Success 3 Mean 41.80 ( 0.00%) 63.80 ( 1.24%) Success 3 Max 53.00 ( 0.00%) 65.00 ( 2.99%) User 3166.67 3181.09 System 1153.37 1158.25 Elapsed 1768.53 1799.37 4.5-rc1+before 4.5-rc1+after -nodirect -nodirect Direct pages scanned 32938 32797 Kswapd pages scanned 2183166 2202613 Kswapd pages reclaimed 2152359 2143524 Direct pages reclaimed 32735 32545 Percentage direct scans 1% 1% THP fault alloc 579 612 THP collapse alloc 304 316 THP splits 0 0 THP fault fallback 793 778 THP collapse fail 11 16 Compaction stalls 1013 1007 Compaction success 92 67 Compaction failures 920 939 Page migrate success 238457 721374 Page migrate failure 23021 23469 Compaction pages isolated 504695 1479924 Compaction migrate scanned 661390 8812554 Compaction free scanned 13476658 84327916 Compaction cost 262 838 After this patch we see improvements in allocation success rate (especially for phase 3) along with increased compaction activity. The compaction stalls (direct compaction) in the interfering kernel builds (probably THP's) also decreased somewhat thanks to kcompactd activity, yet THP alloc successes improved a bit. Note that elapsed and user time isn't so useful for this benchmark, because of the background interference being unpredictable. It's just to quickly spot some major unexpected differences. System time is somewhat more useful and that didn't increase. Also (after adjusting mmtests' ftrace monitor): Time kswapd awake 2547781 2269241 Time kcompactd awake 0 119253 Time direct compacting 939937 557649 Time kswapd compacting 0 0 Time kcompactd compacting 0 119099 The decrease of overal time spent compacting appears to not match the increased compaction stats. I suspect the tasks get rescheduled and since the ftrace monitor doesn't see that, the reported time is wall time, not CPU time. But arguably direct compactors care about overall latency anyway, whether busy compacting or waiting for CPU doesn't matter. And that latency seems to almost halved. It's also interesting how much time kswapd spent awake just going through all the priorities and failing to even try compacting, over and over. 
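[Editor's note: the division of labour described above can be condensed into the following C sketch. It is not part of the patch; the function names match those discussed in this log (balance_pgdat(), wakeup_kcompactd(), kswapd_try_to_sleep()), but the body is simplified for illustration only.]

/*
 * Simplified sketch of one kswapd cycle after this change: kswapd reclaims
 * only towards the order-0 high watermark, then hands the high-order request
 * to kcompactd instead of compacting itself.
 */
static void kswapd_cycle_sketch(pg_data_t *pgdat, int order, int classzone_idx)
{
	int balanced_classzone_idx;

	/* 1) reclaim only towards the order-0 high watermark */
	balanced_classzone_idx = balance_pgdat(pgdat, order, classzone_idx);

	/*
	 * 2) before going back to sleep (done from kswapd_try_to_sleep() in
	 *    the actual patch), wake kcompactd with the original order and
	 *    classzone_idx so it can apply the same criteria when compacting.
	 */
	wakeup_kcompactd(pgdat, order, classzone_idx);
}

[End of editor's note.]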
We can also configure stress-highalloc to perform both direct reclaim/compaction and wakeup kswapd/kcompactd, by using GFP_KERNEL|__GFP_HIGH|__GFP_COMP: stress-highalloc 4.5-rc1+before 4.5-rc1+after -direct -direct Success 1 Min 4.00 ( 0.00%) 9.00 (-50.00%) Success 1 Mean 8.00 ( 0.00%) 10.00 (-19.05%) Success 1 Max 12.00 ( 0.00%) 11.00 ( 15.38%) Success 2 Min 4.00 ( 0.00%) 9.00 (-50.00%) Success 2 Mean 8.20 ( 0.00%) 10.00 (-16.28%) Success 2 Max 13.00 ( 0.00%) 11.00 ( 8.33%) Success 3 Min 75.00 ( 0.00%) 74.00 ( 1.33%) Success 3 Mean 75.60 ( 0.00%) 75.20 ( 0.53%) Success 3 Max 77.00 ( 0.00%) 76.00 ( 0.00%) User 3344.73 3246.04 System 1194.24 1172.29 Elapsed 1838.04 1836.76 4.5-rc1+before 4.5-rc1+after -direct -direct Direct pages scanned 125146 120966 Kswapd pages scanned 2119757 2135012 Kswapd pages reclaimed 2073183 2108388 Direct pages reclaimed 124909 120577 Percentage direct scans 5% 5% THP fault alloc 599 652 THP collapse alloc 323 354 THP splits 0 0 THP fault fallback 806 793 THP collapse fail 17 16 Compaction stalls 2457 2025 Compaction success 906 518 Compaction failures 1551 1507 Page migrate success 2031423 2360608 Page migrate failure 32845 40852 Compaction pages isolated 4129761 4802025 Compaction migrate scanned 11996712 21750613 Compaction free scanned 214970969 344372001 Compaction cost 2271 2694 In this scenario, this patch doesn't change the overall success rate as direct compaction already tries all it can. There's however significant reduction in direct compaction stalls (that is, the number of allocations that went into direct compaction). The number of successes (i.e. direct compaction stalls that ended up with successful allocation) is reduced by the same number. This means the offload to kcompactd is working as expected, and direct compaction is reduced either due to detecting contention, or compaction deferred by kcompactd. In the previous version of this patchset there was some apparent reduction of success rate, but the changes in this version (such as using sync compaction only), new baseline kernel, and/or averaging results from 5 executions (my bet), made this go away. Ftrace-based stats seem to roughly agree: Time kswapd awake 2532984 2326824 Time kcompactd awake 0 257916 Time direct compacting 864839 735130 Time kswapd compacting 0 0 Time kcompactd compacting 0 257585 Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 10 ++-- mm/internal.h | 1 + mm/vmscan.c | 147 ++++++++++++++++++-------------------------------------- 3 files changed, 54 insertions(+), 104 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 5b2bfba..ccf97b0 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1191,11 +1191,11 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, /* * Mark that the PG_migrate_skip information should be cleared - * by kswapd when it goes to sleep. kswapd does not set the + * by kswapd when it goes to sleep. kcompactd does not set the * flag itself as the decision to be clear should be directly * based on an allocation request. 
*/ - if (!current_is_kswapd()) + if (cc->direct_compaction) zone->compact_blockskip_flush = true; return COMPACT_COMPLETE; @@ -1338,10 +1338,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) /* * Clear pageblock skip if there were failures recently and compaction - * is about to be retried after being deferred. kswapd does not do - * this reset as it'll reset the cached information when going to sleep. + * is about to be retried after being deferred. */ - if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + if (compaction_restarting(zone, cc->order)) __reset_isolation_suitable(zone); /* @@ -1477,6 +1476,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order, .mode = mode, .alloc_flags = alloc_flags, .classzone_idx = classzone_idx, + .direct_compaction = true, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/internal.h b/mm/internal.h index b95952c..4042a8a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -172,6 +172,7 @@ struct compact_control { unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool direct_compaction; /* False from kcompactd or /proc/... */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ const int alloc_flags; /* alloc flags of a direct compactor */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 5dcc711..f87cfaa 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2968,18 +2968,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, - unsigned long balance_gap, int classzone_idx) +static bool zone_balanced(struct zone *zone, int order, bool highorder, + unsigned long balance_gap, int classzone_idx) { - if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + - balance_gap, classzone_idx)) - return false; + unsigned long mark = high_wmark_pages(zone) + balance_gap; - if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, - order, 0, classzone_idx) == COMPACT_SKIPPED) - return false; + /* + * When checking from pgdat_balanced(), kswapd should stop and sleep + * when it reaches the high order-0 watermark and let kcompactd take + * over. Other callers such as wakeup_kswapd() want to determine the + * true high-order watermark. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) { + mark += (1UL << order); + order = 0; + } - return true; + return zone_watermark_ok_safe(zone, order, mark, classzone_idx); } /* @@ -3029,7 +3034,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) continue; } - if (zone_balanced(zone, order, 0, i)) + if (zone_balanced(zone, order, false, 0, i)) balanced_pages += zone->managed_pages; else if (!order) return false; @@ -3083,10 +3088,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, */ static bool kswapd_shrink_zone(struct zone *zone, int classzone_idx, - struct scan_control *sc, - unsigned long *nr_attempted) + struct scan_control *sc) { - int testorder = sc->order; unsigned long balance_gap; bool lowmem_pressure; @@ -3094,17 +3097,6 @@ static bool kswapd_shrink_zone(struct zone *zone, sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); /* - * Kswapd reclaims only single pages with compaction enabled. 
Trying - * too hard to reclaim until contiguous free pages have become - * available can hurt performance by evicting too much useful data - * from memory. Do not reclaim more than needed for compaction. - */ - if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && - compaction_suitable(zone, sc->order, 0, classzone_idx) - != COMPACT_SKIPPED) - testorder = 0; - - /* * We put equal pressure on every zone, unless one zone has way too * many pages free already. The "too many pages" is defined as the * high wmark plus a "gap" where the gap is either the low @@ -3118,15 +3110,12 @@ static bool kswapd_shrink_zone(struct zone *zone, * reclaim is necessary */ lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); - if (!lowmem_pressure && zone_balanced(zone, testorder, + if (!lowmem_pressure && zone_balanced(zone, sc->order, false, balance_gap, classzone_idx)) return true; shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); - /* Account for the number of pages attempted to reclaim */ - *nr_attempted += sc->nr_to_reclaim; - clear_bit(ZONE_WRITEBACK, &zone->flags); /* @@ -3136,7 +3125,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * waits. */ if (zone_reclaimable(zone) && - zone_balanced(zone, testorder, 0, classzone_idx)) { + zone_balanced(zone, sc->order, false, 0, classzone_idx)) { clear_bit(ZONE_CONGESTED, &zone->flags); clear_bit(ZONE_DIRTY, &zone->flags); } @@ -3148,7 +3137,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * - * Returns the final order kswapd was reclaiming at + * Returns the highest zone idx kswapd was reclaiming at * * There is special handling here for zones which are full of pinned pages. * This can happen if the pages are all mlocked, or if they are all used by @@ -3165,8 +3154,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * interoperates with the page allocator fallback scheme to ensure that aging * of pages is balanced across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order, - int *classzone_idx) +static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) { int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ @@ -3183,9 +3171,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, count_vm_event(PAGEOUTRUN); do { - unsigned long nr_attempted = 0; bool raise_priority = true; - bool pgdat_needs_compaction = (order > 0); sc.nr_reclaimed = 0; @@ -3220,7 +3206,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, break; } - if (!zone_balanced(zone, order, 0, 0)) { + if (!zone_balanced(zone, order, false, 0, 0)) { end_zone = i; break; } else { @@ -3236,24 +3222,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (i < 0) goto out; - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - /* - * If any zone is currently balanced then kswapd will - * not call compaction as it is expected that the - * necessary pages are already available. - */ - if (pgdat_needs_compaction && - zone_watermark_ok(zone, order, - low_wmark_pages(zone), - *classzone_idx, 0)) - pgdat_needs_compaction = false; - } - /* * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. @@ -3297,8 +3265,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * that that high watermark would be met at 100% * efficiency. 
*/ - if (kswapd_shrink_zone(zone, end_zone, - &sc, &nr_attempted)) + if (kswapd_shrink_zone(zone, end_zone, &sc)) raise_priority = false; } @@ -3311,49 +3278,29 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, pfmemalloc_watermark_ok(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); - /* - * Fragmentation may mean that the system cannot be rebalanced - * for high-order allocations in all zones. If twice the - * allocation size has been reclaimed and the zones are still - * not balanced then recheck the watermarks at order-0 to - * prevent kswapd reclaiming excessively. Assume that a - * process requested a high-order can direct reclaim/compact. - */ - if (order && sc.nr_reclaimed >= 2UL << order) - order = sc.order = 0; - /* Check if kswapd should be suspending */ if (try_to_freeze() || kthread_should_stop()) break; /* - * Compact if necessary and kswapd is reclaiming at least the - * high watermark number of pages as requsted - */ - if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) - compact_pgdat(pgdat, order); - - /* * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ if (raise_priority || !sc.nr_reclaimed) sc.priority--; } while (sc.priority >= 1 && - !pgdat_balanced(pgdat, order, *classzone_idx)); + !pgdat_balanced(pgdat, order, classzone_idx)); out: /* - * Return the order we were reclaiming at so prepare_kswapd_sleep() - * makes a decision on the order we were last reclaiming at. However, - * if another caller entered the allocator slow path while kswapd - * was awake, order will remain at the higher level + * Return the highest zone idx we were reclaiming at so + * prepare_kswapd_sleep() makes the same decisions as here. */ - *classzone_idx = end_zone; - return order; + return end_zone; } -static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, + int classzone_idx, int balanced_classzone_idx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3364,7 +3311,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* Try to sleep for a short interval */ - if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, + balanced_classzone_idx)) { remaining = schedule_timeout(HZ/10); finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); @@ -3374,7 +3322,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ - if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, + balanced_classzone_idx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -3395,6 +3344,12 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) */ reset_isolation_suitable(pgdat); + /* + * We have freed the memory, now we should compact it to make + * allocation of the requested order possible. 
+ */ + wakeup_kcompactd(pgdat, order, classzone_idx); + if (!kthread_should_stop()) schedule(); @@ -3424,7 +3379,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) static int kswapd(void *p) { unsigned long order, new_order; - unsigned balanced_order; int classzone_idx, new_classzone_idx; int balanced_classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; @@ -3457,23 +3411,19 @@ static int kswapd(void *p) set_freezable(); order = new_order = 0; - balanced_order = 0; classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; balanced_classzone_idx = classzone_idx; for ( ; ; ) { bool ret; /* - * If the last balance_pgdat was unsuccessful it's unlikely a - * new request of a similar or harder type will succeed soon - * so consider going to sleep on the basis we reclaimed at + * While we were reclaiming, there might have been another + * wakeup, so check the values. */ - if (balanced_order == new_order) { - new_order = pgdat->kswapd_max_order; - new_classzone_idx = pgdat->classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = pgdat->nr_zones - 1; - } + new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = pgdat->nr_zones - 1; if (order < new_order || classzone_idx > new_classzone_idx) { /* @@ -3483,7 +3433,7 @@ static int kswapd(void *p) order = new_order; classzone_idx = new_classzone_idx; } else { - kswapd_try_to_sleep(pgdat, balanced_order, + kswapd_try_to_sleep(pgdat, order, classzone_idx, balanced_classzone_idx); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; @@ -3503,9 +3453,8 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balanced_classzone_idx = classzone_idx; - balanced_order = balance_pgdat(pgdat, order, - &balanced_classzone_idx); + balanced_classzone_idx = balance_pgdat(pgdat, order, + classzone_idx); } } @@ -3535,7 +3484,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) } if (!waitqueue_active(&pgdat->kswapd_wait)) return; - if (zone_balanced(zone, order, 0, 0)) + if (zone_balanced(zone, order, true, 0, 0)) return; trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); -- cgit v1.1 From b313aeee25098bf361c808fe45ba2d3af398ec46 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:18:27 -0700 Subject: mm: memcontrol: enable kmem accounting for all cgroups in the legacy hierarchy Workingset code was recently made memcg aware, but shadow node shrinker is still global. As a result, one small cgroup can consume all memory available for shadow nodes, possibly hurting other cgroups by reclaiming their shadow nodes, even though reclaim distances stored in its shadow nodes have no effect. To avoid this, we need to make shadow node shrinker memcg aware. The actual work is done in patch 6 of the series. Patches 1 and 2 prepare memcg/shrinker infrastructure for the change. Patch 3 is just a collateral cleanup. Patch 4 makes radix_tree_node accounted, which is necessary for making shadow node shrinker memcg aware. Patch 5 reduces shadow nodes overhead in case workload mostly uses anonymous pages. This patch: Currently, in the legacy hierarchy kmem accounting is off for all cgroups by default and must be enabled explicitly by writing something to memory.kmem.limit_in_bytes. 
Since we don't support reclaim on hitting kmem limit, nor do we have any plans to implement it, this is likely to be -1, just to enable kmem accounting and limit kernel memory consumption by the memory.limit_in_bytes along with user memory. This user API was introduced when the implementation of kmem accounting lacked slab shrinker support and hence was useless in practice. Things have changed since then - slab shrinkers were made memcg aware, the accounting overhead seems to be negligible, and a failure to charge a kmem allocation should not have critical consequences, because we only account those kernel objects that should be safe to fail. That's why kmem accounting is enabled by default for all cgroups in the default hierarchy, which will eventually replace the legacy one. The ability to enable kmem accounting for some cgroups while keeping it disabled for others is getting difficult to maintain. E.g. to make shadow node shrinker memcg aware (see mm/workingset.c), we need to know the relationship between the number of shadow nodes allocated for a cgroup and the size of its lru list. If kmem accounting is enabled for all cgroups there is no problem, but what should we do if kmem accounting is enabled only for half of cgroups? We've no other choice but use global lru stats while scanning root cgroup's shadow nodes, but that would be wrong if kmem accounting was enabled for all cgroups (which is the case if the unified hierarchy is used), in which case we should use lru stats of the root cgroup's lruvec. That being said, let's enable kmem accounting for all memory cgroups by default. If one finds it unstable or too costly, it can always be disabled system-wide by passing cgroup.memory=nokmem to the kernel at boot time. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 41 +++++------------------------------------ 1 file changed, 5 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4b7dda7..28d1b1e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2824,6 +2824,9 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) { int memcg_id; + if (cgroup_memory_nokmem) + return 0; + BUG_ON(memcg->kmemcg_id >= 0); BUG_ON(memcg->kmem_state); @@ -2844,24 +2847,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) return 0; } -static int memcg_propagate_kmem(struct mem_cgroup *parent, - struct mem_cgroup *memcg) -{ - int ret = 0; - - mutex_lock(&memcg_limit_mutex); - /* - * If the parent cgroup is not kmem-online now, it cannot be - * onlined after this point, because it has at least one child - * already. 
- */ - if (memcg_kmem_online(parent) || - (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem)) - ret = memcg_online_kmem(memcg); - mutex_unlock(&memcg_limit_mutex); - return ret; -} - static void memcg_offline_kmem(struct mem_cgroup *memcg) { struct cgroup_subsys_state *css; @@ -2920,10 +2905,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } } #else -static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg) -{ - return 0; -} static int memcg_online_kmem(struct mem_cgroup *memcg) { return 0; @@ -2939,22 +2920,10 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { - int ret = 0; + int ret; mutex_lock(&memcg_limit_mutex); - /* Top-level cgroup doesn't propagate from root */ - if (!memcg_kmem_online(memcg)) { - if (cgroup_is_populated(memcg->css.cgroup) || - (memcg->use_hierarchy && memcg_has_children(memcg))) - ret = -EBUSY; - if (ret) - goto out; - ret = memcg_online_kmem(memcg); - if (ret) - goto out; - } ret = page_counter_limit(&memcg->kmem, limit); -out: mutex_unlock(&memcg_limit_mutex); return ret; } @@ -4205,7 +4174,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; } - error = memcg_propagate_kmem(parent, memcg); + error = memcg_online_kmem(memcg); if (error) goto fail; -- cgit v1.1 From 0fc9f58a90a5012942abf0ae98cfc852afebc0a6 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:18:30 -0700 Subject: mm: vmscan: pass root_mem_cgroup instead of NULL to memcg aware shrinker It's just convenient to implement a memcg aware shrinker when you know that shrink_control->memcg != NULL unless memcg_kmem_enabled() returns false. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index f87cfaa..b41b82d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -382,9 +382,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * * @memcg specifies the memory cgroup to target. If it is not NULL, * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan - * objects from the memory cgroup specified. Otherwise all shrinkers - * are called, and memcg aware shrinkers are supposed to scan the - * global list then. + * objects from the memory cgroup specified. Otherwise, only unaware + * shrinkers are called. * * @nr_scanned and @nr_eligible form a ratio that indicate how much of * the available objects should be scanned. Page reclaim for example @@ -404,7 +403,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct shrinker *shrinker; unsigned long freed = 0; - if (memcg && !memcg_kmem_online(memcg)) + if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) return 0; if (nr_scanned == 0) @@ -428,7 +427,13 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, .memcg = memcg, }; - if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) + /* + * If kernel memory accounting is disabled, we ignore + * SHRINKER_MEMCG_AWARE flag and call all shrinkers + * passing NULL for memcg. 
+ */ + if (memcg_kmem_enabled() && + !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE)) continue; if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) -- cgit v1.1 From b6ecd2dea4435a771a99c497a6ac5df6d3618c5a Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:18:33 -0700 Subject: mm: memcontrol: zap memcg_kmem_online helper As kmem accounting is now either enabled for all cgroups or disabled system-wide, there's no point in having memcg_kmem_online() helper - instead one can use memcg_kmem_enabled() and mem_cgroup_online(), as shrink_slab() now does. There are only two places left where this helper is used - __memcg_kmem_charge() and memcg_create_kmem_cache(). The former can only be called if memcg_kmem_enabled() returned true. Since the cgroup it operates on is online, mem_cgroup_is_root() check will be enough. memcg_create_kmem_cache() can't use mem_cgroup_online() helper instead of memcg_kmem_online(), because it relies on the fact that in memcg_offline_kmem() memcg->kmem_state is changed before memcg_deactivate_kmem_caches() is called, but there we can just open-code the check. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- mm/slab_common.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 28d1b1e..341bf86 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2346,7 +2346,7 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) int ret = 0; memcg = get_mem_cgroup_from_mm(current->mm); - if (memcg_kmem_online(memcg)) + if (!mem_cgroup_is_root(memcg)) ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); return ret; diff --git a/mm/slab_common.c b/mm/slab_common.c index 6afb226..8addc3c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -510,7 +510,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, * The memory cgroup could have been offlined while the cache * creation work was pending. */ - if (!memcg_kmem_online(memcg)) + if (memcg->kmem_state != KMEM_ONLINE) goto out_unlock; idx = memcg_cache_id(memcg); -- cgit v1.1 From cdcbb72ebfec52373e57092eccaadd5a6e261c3e Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:18:39 -0700 Subject: mm: workingset: size shadow nodes lru basing on file cache size A page is activated on refault if the refault distance stored in the corresponding shadow entry is less than the number of active file pages. Since active file pages can't occupy more than half memory, we assume that the maximal effective refault distance can't be greater than half the number of present pages and size the shadow nodes lru list appropriately. Generally speaking, this assumption is correct, but it can result in wasting a considerable chunk of memory on stale shadow nodes in case the portion of file pages is small, e.g. if a workload mostly uses anonymous memory. To sort this out, we need to compute the size of shadow nodes lru basing not on the maximal possible, but the current size of file cache. We could take the size of active file lru for the maximal refault distance, but active lru is pretty unstable - it can shrink dramatically at runtime possibly disrupting workingset detection logic. Instead we assume that the maximal refault distance equals half the total number of file cache pages. 
This will protect us against active file lru size fluctuations while still being correct, because size of active lru is normally maintained lower than size of inactive lru. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/workingset.c b/mm/workingset.c index 6130ba0b..68e8cd9 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -349,7 +349,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); local_irq_enable(); - pages = node_present_pages(sc->nid); + pages = node_page_state(sc->nid, NR_ACTIVE_FILE) + + node_page_state(sc->nid, NR_INACTIVE_FILE); + /* * Active cache pages are limited to 50% of memory, and shadow * entries that represent a refault distance bigger than that -- cgit v1.1 From 0a6b76dd23fa08c5fd7b68acdb55018a37afd4aa Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:18:42 -0700 Subject: mm: workingset: make shadow node shrinker memcg aware Workingset code was recently made memcg aware, but shadow node shrinker is still global. As a result, one small cgroup can consume all memory available for shadow nodes, possibly hurting other cgroups by reclaiming their shadow nodes, even though reclaim distances stored in its shadow nodes have no effect. To avoid this, we need to make shadow node shrinker memcg aware. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 ++--- mm/workingset.c | 10 +++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 341bf86..ae8b81c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -638,9 +638,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->nr_page_events, nr_pages); } -static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, - unsigned int lru_mask) +unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask) { unsigned long nr = 0; int zid; diff --git a/mm/workingset.c b/mm/workingset.c index 68e8cd9..8a75f8d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -349,8 +349,12 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); local_irq_enable(); - pages = node_page_state(sc->nid, NR_ACTIVE_FILE) + - node_page_state(sc->nid, NR_INACTIVE_FILE); + if (memcg_kmem_enabled()) + pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL_FILE); + else + pages = node_page_state(sc->nid, NR_ACTIVE_FILE) + + node_page_state(sc->nid, NR_INACTIVE_FILE); /* * Active cache pages are limited to 50% of memory, and shadow @@ -460,7 +464,7 @@ static struct shrinker workingset_shadow_shrinker = { .count_objects = count_shadow_nodes, .scan_objects = scan_shadow_nodes, .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE, + .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, }; /* -- cgit v1.1 From f9719a03de51e13526d614e79d002f838770b2d6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:18:45 -0700 Subject: thp, vmstats: count deferred split events Count how many times we put a THP in split queue. 
Currently, it happens on partial unmap of a THP. Rapidly growing value can indicate that an application behaves unfriendly wrt THP: often fault in huge page and then unmap part of it. This leads to unnecessary memory fragmentation and the application may require tuning. The event also can help with debugging kernel [mis-]behaviour. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 1 + mm/vmstat.c | 1 + 2 files changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1ea21e2..1dddfb2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3455,6 +3455,7 @@ void deferred_split_huge_page(struct page *page) spin_lock_irqsave(&pgdata->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { + count_vm_event(THP_DEFERRED_SPLIT_PAGE); list_add_tail(page_deferred_list(page), &pgdata->split_queue); pgdata->split_queue_len++; } diff --git a/mm/vmstat.c b/mm/vmstat.c index f800662..5e43004 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -848,6 +848,7 @@ const char * const vmstat_text[] = { "thp_collapse_alloc_failed", "thp_split_page", "thp_split_page_failed", + "thp_deferred_split_page", "thp_split_pmd", "thp_zero_page_alloc", "thp_zero_page_alloc_failed", -- cgit v1.1 From ea606cf5d8df370e7932460dfd960b21f20e7c6d Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 17 Mar 2016 14:18:48 -0700 Subject: mm: move max_map_count bits into mm.h max_map_count sysctl unrelated to scheduler. Move its bits from include/linux/sched/sysctl.h to include/linux/mm.h. Signed-off-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 1 - mm/mremap.c | 1 - mm/nommu.c | 1 - 3 files changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 90e3b86..676f422 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/mremap.c b/mm/mremap.c index 8eeba02..e30c8a6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/mm/nommu.c b/mm/nommu.c index fbf6f0f1..9bdf8b1 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include -- cgit v1.1 From 39a1aa8e194ab67983de3b9d0b204ccee12e689a Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 17 Mar 2016 14:18:50 -0700 Subject: mm: deduplicate memory overcommitment code Currently we have two copies of the same code which implements memory overcommitment logic. Let's move it into mm/util.c and hence avoid duplication. No functional changes here. 
Signed-off-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 124 ------------------------------------------------------------- mm/nommu.c | 116 --------------------------------------------------------- mm/util.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+), 240 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 676f422..1464192 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -122,130 +122,6 @@ void vma_set_page_prot(struct vm_area_struct *vma) } } - -int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ -int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ -unsigned long sysctl_overcommit_kbytes __read_mostly; -int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; -unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ -unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -/* - * Make sure vm_committed_as in one cacheline and not cacheline shared with - * other variables. It can be updated by several CPUs frequently. - */ -struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; - -/* - * The global memory commitment made in the system can be a metric - * that can be used to drive ballooning decisions when Linux is hosted - * as a guest. On Hyper-V, the host implements a policy engine for dynamically - * balancing memory across competing virtual machines that are hosted. - * Several metrics drive this policy engine including the guest reported - * memory commitment. - */ -unsigned long vm_memory_committed(void) -{ - return percpu_counter_read_positive(&vm_committed_as); -} -EXPORT_SYMBOL_GPL(vm_memory_committed); - -/* - * Check that a process has enough memory to allocate a new virtual - * mapping. 0 means there is enough memory for the allocation to - * succeed and -ENOMEM implies there is not. - * - * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting - * - * Strict overcommit modes added 2002 Feb 26 by Alan Cox. - * Additional code 2002 Jul 20 by Robert Love. - * - * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. - * - * Note this is a helper function intended to be used by LSMs which - * wish to use this logic. - */ -int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) -{ - long free, allowed, reserve; - - VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < - -(s64)vm_committed_as_batch * num_online_cpus(), - "memory commitment underflow"); - - vm_acct_memory(pages); - - /* - * Sometimes we want to use more memory than we have - */ - if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) - return 0; - - if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_page_state(NR_FREE_PAGES); - free += global_page_state(NR_FILE_PAGES); - - /* - * shmem pages shouldn't be counted as free in this - * case, they can't be purged, only swapped out, and - * that won't affect the overall amount of available - * memory in the system. - */ - free -= global_page_state(NR_SHMEM); - - free += get_nr_swap_pages(); - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += global_page_state(NR_SLAB_RECLAIMABLE); - - /* - * Leave reserved pages. 
The pages are not for anonymous pages. - */ - if (free <= totalreserve_pages) - goto error; - else - free -= totalreserve_pages; - - /* - * Reserve some for root - */ - if (!cap_sys_admin) - free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - if (free > pages) - return 0; - - goto error; - } - - allowed = vm_commit_limit(); - /* - * Reserve some for root - */ - if (!cap_sys_admin) - allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - /* - * Don't let a single process grow so big a user can't recover - */ - if (mm) { - reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); - allowed -= min_t(long, mm->total_vm / 32, reserve); - } - - if (percpu_counter_read_positive(&vm_committed_as) < allowed) - return 0; -error: - vm_unacct_memory(pages); - - return -ENOMEM; -} - /* * Requires inode->i_mapping->i_mmap_rwsem */ diff --git a/mm/nommu.c b/mm/nommu.c index 9bdf8b1..6402f27 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -47,33 +47,11 @@ struct page *mem_map; unsigned long max_mapnr; EXPORT_SYMBOL(max_mapnr); unsigned long highest_memmap_pfn; -struct percpu_counter vm_committed_as; -int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ -int sysctl_overcommit_ratio = 50; /* default is 50% */ -unsigned long sysctl_overcommit_kbytes __read_mostly; -int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; -unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ -unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; -/* - * The global memory commitment made in the system can be a metric - * that can be used to drive ballooning decisions when Linux is hosted - * as a guest. On Hyper-V, the host implements a policy engine for dynamically - * balancing memory across competing virtual machines that are hosted. - * Several metrics drive this policy engine including the guest reported - * memory commitment. - */ -unsigned long vm_memory_committed(void) -{ - return percpu_counter_read_positive(&vm_committed_as); -} - -EXPORT_SYMBOL_GPL(vm_memory_committed); - EXPORT_SYMBOL(mem_map); /* list of mapped, potentially shareable regions */ @@ -1828,100 +1806,6 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/* - * Check that a process has enough memory to allocate a new virtual - * mapping. 0 means there is enough memory for the allocation to - * succeed and -ENOMEM implies there is not. - * - * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting - * - * Strict overcommit modes added 2002 Feb 26 by Alan Cox. - * Additional code 2002 Jul 20 by Robert Love. - * - * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. - * - * Note this is a helper function intended to be used by LSMs which - * wish to use this logic. 
- */ -int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) -{ - long free, allowed, reserve; - - vm_acct_memory(pages); - - /* - * Sometimes we want to use more memory than we have - */ - if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) - return 0; - - if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_page_state(NR_FREE_PAGES); - free += global_page_state(NR_FILE_PAGES); - - /* - * shmem pages shouldn't be counted as free in this - * case, they can't be purged, only swapped out, and - * that won't affect the overall amount of available - * memory in the system. - */ - free -= global_page_state(NR_SHMEM); - - free += get_nr_swap_pages(); - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += global_page_state(NR_SLAB_RECLAIMABLE); - - /* - * Leave reserved pages. The pages are not for anonymous pages. - */ - if (free <= totalreserve_pages) - goto error; - else - free -= totalreserve_pages; - - /* - * Reserve some for root - */ - if (!cap_sys_admin) - free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - if (free > pages) - return 0; - - goto error; - } - - allowed = vm_commit_limit(); - /* - * Reserve some 3% for root - */ - if (!cap_sys_admin) - allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - /* - * Don't let a single process grow so big a user can't recover - */ - if (mm) { - reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); - allowed -= min_t(long, mm->total_vm / 32, reserve); - } - - if (percpu_counter_read_positive(&vm_committed_as) < allowed) - return 0; - -error: - vm_unacct_memory(pages); - - return -ENOMEM; -} - int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { BUG(); diff --git a/mm/util.c b/mm/util.c index 4fb14ca..47a57e5 100644 --- a/mm/util.c +++ b/mm/util.c @@ -396,6 +396,13 @@ int __page_mapcount(struct page *page) } EXPORT_SYMBOL_GPL(__page_mapcount); +int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; +int sysctl_overcommit_ratio __read_mostly = 50; +unsigned long sysctl_overcommit_kbytes __read_mostly; +int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ + int overcommit_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -437,6 +444,123 @@ unsigned long vm_commit_limit(void) return allowed; } +/* + * Make sure vm_committed_as in one cacheline and not cacheline shared with + * other variables. It can be updated by several CPUs frequently. + */ +struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; + +/* + * The global memory commitment made in the system can be a metric + * that can be used to drive ballooning decisions when Linux is hosted + * as a guest. On Hyper-V, the host implements a policy engine for dynamically + * balancing memory across competing virtual machines that are hosted. + * Several metrics drive this policy engine including the guest reported + * memory commitment. + */ +unsigned long vm_memory_committed(void) +{ + return percpu_counter_read_positive(&vm_committed_as); +} +EXPORT_SYMBOL_GPL(vm_memory_committed); + +/* + * Check that a process has enough memory to allocate a new virtual + * mapping. 
0 means there is enough memory for the allocation to + * succeed and -ENOMEM implies there is not. + * + * We currently support three overcommit policies, which are set via the + * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting + * + * Strict overcommit modes added 2002 Feb 26 by Alan Cox. + * Additional code 2002 Jul 20 by Robert Love. + * + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. + * + * Note this is a helper function intended to be used by LSMs which + * wish to use this logic. + */ +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) +{ + long free, allowed, reserve; + + VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < + -(s64)vm_committed_as_batch * num_online_cpus(), + "memory commitment underflow"); + + vm_acct_memory(pages); + + /* + * Sometimes we want to use more memory than we have + */ + if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) + return 0; + + if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { + free = global_page_state(NR_FREE_PAGES); + free += global_page_state(NR_FILE_PAGES); + + /* + * shmem pages shouldn't be counted as free in this + * case, they can't be purged, only swapped out, and + * that won't affect the overall amount of available + * memory in the system. + */ + free -= global_page_state(NR_SHMEM); + + free += get_nr_swap_pages(); + + /* + * Any slabs which are created with the + * SLAB_RECLAIM_ACCOUNT flag claim to have contents + * which are reclaimable, under pressure. The dentry + * cache and most inode caches should fall into this + */ + free += global_page_state(NR_SLAB_RECLAIMABLE); + + /* + * Leave reserved pages. The pages are not for anonymous pages. + */ + if (free <= totalreserve_pages) + goto error; + else + free -= totalreserve_pages; + + /* + * Reserve some for root + */ + if (!cap_sys_admin) + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + + if (free > pages) + return 0; + + goto error; + } + + allowed = vm_commit_limit(); + /* + * Reserve some for root + */ + if (!cap_sys_admin) + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min_t(long, mm->total_vm / 32, reserve); + } + + if (percpu_counter_read_positive(&vm_committed_as) < allowed) + return 0; +error: + vm_unacct_memory(pages); + + return -ENOMEM; +} + /** * get_cmdline() - copy the cmdline value to a buffer. * @task: the task whose cmdline value to copy. -- cgit v1.1 From 458aa76d132dc1c3c60be0f0db99bcc0ce1767fc Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 17 Mar 2016 14:18:56 -0700 Subject: mm/thp/migration: switch from flush_tlb_range to flush_pmd_tlb_range We remove one instace of flush_tlb_range here. That was added by commit f714f4f20e59 ("mm: numa: call MMU notifiers on THP migration"). But the pmdp_huge_clear_flush_notify should have done the require flush for us. Hence remove the extra flush. Signed-off-by: Aneesh Kumar K.V Cc: Mel Gorman Cc: "Kirill A. 
Shutemov" Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 8 +++++--- mm/pgtable-generic.c | 14 -------------- 2 files changed, 5 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 568284e..fdaf081 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1773,7 +1773,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, put_page(new_page); goto out_fail; } - + /* + * We are not sure a pending tlb flush here is for a huge page + * mapping or not. Hence use the tlb range variant + */ if (mm_tlb_flush_pending(mm)) flush_tlb_range(vma, mmun_start, mmun_end); @@ -1829,12 +1832,11 @@ fail_putback: page_add_anon_rmap(new_page, vma, mmun_start, true); pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); - flush_tlb_range(vma, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); if (page_count(page) != 2) { set_pmd_at(mm, mmun_start, pmd, orig_entry); - flush_tlb_range(vma, mmun_start, mmun_end); + flush_pmd_tlb_range(vma, mmun_start, mmun_end); mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); page_remove_rmap(new_page, true); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 06a005b..71c5f91 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -84,20 +84,6 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE - -/* - * ARCHes with special requirements for evicting THP backing TLB entries can - * implement this. Otherwise also, it can help optimize normal TLB flush in - * THP regime. stock flush_tlb_range() typically has optimization to nuke the - * entire TLB if flush span is greater than a threshold, which will - * likely be true for a single huge page. Thus a single thp flush will - * invalidate the entire TLB which is not desirable. - * e.g. see arch/arc: flush_pmd_tlb_range - */ -#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) -#endif - #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, -- cgit v1.1 From 7eb50292d7f74ccb89155960d62b2697f2536b28 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 17 Mar 2016 14:19:02 -0700 Subject: mm/Kconfig: remove redundant arch depend for memory hotplug MEMORY_HOTPLUG already depends on ARCH_ENABLE_MEMORY_HOTPLUG which is selected by the supported architectures, so the following arch depend is unnecessary. Signed-off-by: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 03cbfa0..c077765 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -187,7 +187,6 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG - depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) config MEMORY_HOTPLUG_SPARSE def_bool y -- cgit v1.1 From d02bd27bd33dd7e8d22594cd568b81be0cb584cd Mon Sep 17 00:00:00 2001 From: Igor Redko Date: Thu, 17 Mar 2016 14:19:05 -0700 Subject: mm/page_alloc.c: calculate 'available' memory in a separate function Add a new field, VIRTIO_BALLOON_S_AVAIL, to virtio_balloon memory statistics protocol, corresponding to 'Available' in /proc/meminfo. 
It indicates to the hypervisor how big the balloon can be inflated without pushing the guest system to swap. This metric would be very useful in VM orchestration software to improve memory management of different VMs under overcommit. This patch (of 2): Factor out calculation of the available memory counter into a separate exportable function, in order to be able to use it in other parts of the kernel. In particular, it appears a relevant metric to report to the hypervisor via virtio-balloon statistics interface (in a followup patch). Signed-off-by: Igor Redko Signed-off-by: Denis V. Lunev Reviewed-by: Roman Kagan Cc: Michael S. Tsirkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 25a75da..941b802 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3713,6 +3713,49 @@ static inline void show_node(struct zone *zone) printk("Node %d ", zone_to_nid(zone)); } +long si_mem_available(void) +{ + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; + unsigned long pages[NR_LRU_LISTS]; + struct zone *zone; + int lru; + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_page_state(NR_LRU_BASE + lru); + + for_each_zone(zone) + wmark_low += zone->watermark[WMARK_LOW]; + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping. + */ + available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping. Assume at least half of the page cache, or the + * low watermark worth of cache, needs to stay. + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable slab consists of items that are in use, + * and cannot be freed. Cap this estimate at the low watermark. + */ + available += global_page_state(NR_SLAB_RECLAIMABLE) - + min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + + if (available < 0) + available = 0; + return available; +} +EXPORT_SYMBOL_GPL(si_mem_available); + void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; -- cgit v1.1 From 3ed3a4f0ddffece942bb2661924d87be4ce63cb7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:19:11 -0700 Subject: mm: cleanup *pte_alloc* interfaces There are few things about *pte_alloc*() helpers worth cleaning up: - 'vma' argument is unused, let's drop it; - most __pte_alloc() callers do speculative check for pmd_none(), before taking ptl: let's introduce pte_alloc() macro which does the check. The only direct user of __pte_alloc left is userfaultfd, which has different expectation about atomicity wrt pmd. - pte_alloc_map() and pte_alloc_map_lock() are redefined using pte_alloc(). [sudeep.holla@arm.com: fix build for arm64 hugetlbpage] [sfr@canb.auug.org.au: fix arch/arm/mm/mmu.c some more] Signed-off-by: Kirill A. Shutemov Cc: Dave Hansen Signed-off-by: Sudeep Holla Acked-by: Kirill A. 
Shutemov Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 8 +++----- mm/mremap.c | 3 +-- mm/userfaultfd.c | 3 +-- 3 files changed, 5 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 0e24764..1974fc0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -562,8 +562,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, - pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); @@ -3419,12 +3418,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * Use __pte_alloc instead of pte_alloc_map, because we can't + * Use pte_alloc() instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(pmd_none(*pmd)) && - unlikely(__pte_alloc(mm, vma, pmd, address))) + if (unlikely(pte_alloc(mm, pmd, address))) return VM_FAULT_OOM; /* * If a huge pmd materialized under us just retry later. Use diff --git a/mm/mremap.c b/mm/mremap.c index e30c8a6..3fa0a467 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -213,8 +213,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, continue; VM_BUG_ON(pmd_trans_huge(*old_pmd)); } - if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, - new_pmd, new_addr)) + if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr)) break; next = (new_addr + PMD_SIZE) & PMD_MASK; if (extent > next - new_addr) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 806b0c7..9f3a029 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -230,8 +230,7 @@ retry: break; } if (unlikely(pmd_none(dst_pmdval)) && - unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd, - dst_addr))) { + unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) { err = -ENOMEM; break; } -- cgit v1.1 From 795ae7a0de6b834a0cc202aa55c190ef81496665 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:19:14 -0700 Subject: mm: scale kswapd watermarks in proportion to memory In machines with 140G of memory and enterprise flash storage, we have seen read and write bursts routinely exceed the kswapd watermarks and cause thundering herds in direct reclaim. Unfortunately, the only way to tune kswapd aggressiveness is through adjusting min_free_kbytes - the system's emergency reserves - which is entirely unrelated to the system's latency requirements. In order to get kswapd to maintain a 250M buffer of free memory, the emergency reserves need to be set to 1G. That is a lot of memory wasted for no good reason. On the other hand, it's reasonable to assume that allocation bursts and overall allocation concurrency scale with memory capacity, so it makes sense to make kswapd aggressiveness a function of that as well. Change the kswapd watermark scale factor from the currently fixed 25% of the tunable emergency reserve to a tunable 0.1% of memory. Beyond 1G of memory, this will produce bigger watermark steps than the current formula in default settings. Ensure that the new formula never chooses steps smaller than that, i.e. 25% of the emergency reserve. On a 140G machine, this raises the default watermark steps - the distance between min and low, and low and high - from 16M to 143M. 
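For a concrete feel of the new step size, the arithmetic can be sketched standalone (illustration only, not the kernel code: the in-tree version applies mult_frac() to zone->managed_pages per zone, whereas this treats the whole 140G machine as a single zone and assumes a 16M min watermark purely for the example):

  #include <stdio.h>

  int main(void)
  {
          unsigned long managed_pages = 140UL << 18;  /* 140G worth of 4K pages */
          unsigned long min_wmark = 4096;             /* assumed 16M min watermark, in pages */
          unsigned long scale_factor = 10;            /* default watermark_scale_factor: 0.1% */
          unsigned long step, scaled;

          scaled = managed_pages * scale_factor / 10000;
          step = min_wmark / 4;          /* old fixed step: 25% of the min watermark */
          if (scaled > step)             /* the new formula never picks a smaller step */
                  step = scaled;

          /* prints ~36700 pages, i.e. roughly 143M between min/low and low/high */
          printf("watermark step: %lu pages (%lu KB)\n", step, step * 4);
          return 0;
  }

The 0.1% default is the new watermark_scale_factor tunable added below; raising it makes kswapd keep proportionally more memory free without inflating min_free_kbytes.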
Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Acked-by: Rik van Riel Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 941b802..d156310 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +int watermark_scale_factor = 10; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; @@ -6347,8 +6348,17 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_MIN] = tmp; } - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + /* + * Set the kswapd watermarks distance according to the + * scale factor in proportion to available memory, but + * ensure a minimum size on small systems. + */ + tmp = max_t(u64, tmp >> 2, + mult_frac(zone->managed_pages, + watermark_scale_factor, 10000)); + + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; __mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - @@ -6489,6 +6499,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } +int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write) + setup_per_zone_wmarks(); + + return 0; +} + #ifdef CONFIG_NUMA int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) -- cgit v1.1 From f9054c70d28bc214b2857cf8db8269f4f45a5e23 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 17 Mar 2016 14:19:19 -0700 Subject: mm, mempool: only set __GFP_NOMEMALLOC if there are free elements If an oom killed thread calls mempool_alloc(), it is possible that it'll loop forever if there are no elements on the freelist since __GFP_NOMEMALLOC prevents it from accessing needed memory reserves in oom conditions. Only set __GFP_NOMEMALLOC if there are elements on the freelist. If there are no free elements, allow allocations without the bit set so that memory reserves can be accessed if needed. Additionally, using mempool_alloc() with __GFP_NOMEMALLOC is not supported since the implementation can loop forever without accessing memory reserves when needed. Signed-off-by: David Rientjes Cc: Greg Thelen Cc: Michal Hocko Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mempool.c b/mm/mempool.c index 7924f4f..07c383d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -310,25 +310,36 @@ EXPORT_SYMBOL(mempool_resize); * returns NULL. Note that due to preallocation, this function * *never* fails when called from process contexts. (it might * fail if called from an IRQ context.) - * Note: using __GFP_ZERO is not supported. + * Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported. 
*/ -void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) +void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; wait_queue_t wait; gfp_t gfp_temp; + /* If oom killed, memory reserves are essential to prevent livelock */ + VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC); + /* No element size to zero on allocation */ VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); - gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: + if (likely(pool->curr_nr)) { + /* + * Don't allocate from emergency reserves if there are + * elements available. This check is racy, but it will + * be rechecked each loop. + */ + gfp_temp |= __GFP_NOMEMALLOC; + } element = pool->alloc(gfp_temp, pool->pool_data); if (likely(element != NULL)) @@ -352,11 +363,12 @@ repeat_alloc: * We use gfp mask w/o direct reclaim or IO for the first round. If * alloc failed with that and @pool was empty, retry immediately. */ - if (gfp_temp != gfp_mask) { + if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) { spin_unlock_irqrestore(&pool->lock, flags); gfp_temp = gfp_mask; goto repeat_alloc; } + gfp_temp = gfp_mask; /* We must not sleep if !__GFP_DIRECT_RECLAIM */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { -- cgit v1.1 From 444eb2a449ef36fe115431ed7b71467c4563c7f1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 17 Mar 2016 14:19:23 -0700 Subject: mm: thp: set THP defrag by default to madvise and add a stall-free defrag option THP defrag is enabled by default to direct reclaim/compact but not wake kswapd in the event of a THP allocation failure. The problem is that THP allocation requests potentially enter reclaim/compaction. This potentially incurs a severe stall that is not guaranteed to be offset by reduced TLB misses. While there has been considerable effort to reduce the impact of reclaim/compaction, it is still a high cost and workloads that should fit in memory fail to do so. Specifically, a simple anon/file streaming workload will enter direct reclaim on NUMA at least even though the working set size is 80% of RAM. It's been years and it's time to throw in the towel. First, this patch defines THP defrag as follows; madvise: A failed allocation will direct reclaim/compact if the application requests it never: Neither reclaim/compact nor wake kswapd defer: A failed allocation will wake kswapd/kcompactd always: A failed allocation will direct reclaim/compact (historical behaviour) khugepaged defrag will enter direct/reclaim but not wake kswapd. Next it sets the default defrag option to be "madvise" to only enter direct reclaim/compaction for applications that specifically requested it. Lastly, it removes a check from the page allocator slowpath that is related to __GFP_THISNODE to allow "defer" to work. The callers that really cares are slub/slab and they are updated accordingly. The slab one may be surprising because it also corrects a comment as kswapd was never woken up by that path. This means that a THP fault will no longer stall for most applications by default and the ideal for most users that get THP if they are immediately available. 
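For illustration only (not part of the patch): under the new "madvise" default, an application that knows a mapping benefits from huge pages opts back into direct reclaim/compaction at fault time by flagging that mapping, along these lines:

  #include <stdlib.h>
  #include <sys/mman.h>

  #define REGION (1UL << 30)    /* 1G region expected to benefit from THP */

  int main(void)
  {
          unsigned long i;
          void *buf;

          if (posix_memalign(&buf, 2UL << 20, REGION))    /* 2M-align for huge pages */
                  return 1;

          /*
           * MADV_HUGEPAGE sets VM_HUGEPAGE on the VMA: with defrag=madvise, a
           * THP allocation failure for this range may still direct
           * reclaim/compact instead of immediately falling back to base pages.
           */
          if (madvise(buf, REGION, MADV_HUGEPAGE))
                  return 1;

          for (i = 0; i < REGION; i += 4096)              /* fault the region in */
                  ((volatile char *)buf)[i] = 1;

          return 0;
  }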
There are still options for users that prefer a stall at startup of a new application by either restoring historical behaviour with "always" or pick a half-way point with "defer" where kswapd does some of the work in the background and wakes kcompactd if necessary. THP defrag for khugepaged remains enabled and will enter direct/reclaim but no wakeup kswapd or kcompactd. After this patch a THP allocation failure will quickly fallback and rely on khugepaged to recover the situation at some time in the future. In some cases, this will reduce THP usage but the benefit of THP is hard to measure and not a universal win where as a stall to reclaim/compaction is definitely measurable and can be painful. The first test for this is using "usemem" to read a large file and write a large anonymous mapping (to avoid the zero page) multiple times. The total size of the mappings is 80% of RAM and the benchmark simply measures how long it takes to complete. It uses multiple threads to see if that is a factor. On UMA, the performance is almost identical so is not reported but on NUMA, we see this usemem 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Amean System-1 102.86 ( 0.00%) 46.81 ( 54.50%) Amean System-4 37.85 ( 0.00%) 34.02 ( 10.12%) Amean System-7 48.12 ( 0.00%) 46.89 ( 2.56%) Amean System-12 51.98 ( 0.00%) 56.96 ( -9.57%) Amean System-21 80.16 ( 0.00%) 79.05 ( 1.39%) Amean System-30 110.71 ( 0.00%) 107.17 ( 3.20%) Amean System-48 127.98 ( 0.00%) 124.83 ( 2.46%) Amean Elapsd-1 185.84 ( 0.00%) 105.51 ( 43.23%) Amean Elapsd-4 26.19 ( 0.00%) 25.58 ( 2.33%) Amean Elapsd-7 21.65 ( 0.00%) 21.62 ( 0.16%) Amean Elapsd-12 18.58 ( 0.00%) 17.94 ( 3.43%) Amean Elapsd-21 17.53 ( 0.00%) 16.60 ( 5.33%) Amean Elapsd-30 17.45 ( 0.00%) 17.13 ( 1.84%) Amean Elapsd-48 15.40 ( 0.00%) 15.27 ( 0.82%) For a single thread, the benchmark completes 43.23% faster with this patch applied with smaller benefits as the thread increases. Similar, notice the large reduction in most cases in system CPU usage. The overall CPU time is 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 User 10357.65 10438.33 System 3988.88 3543.94 Elapsed 2203.01 1634.41 Which is substantial. Now, the reclaim figures 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 Minor Faults 128458477 278352931 Major Faults 2174976 225 Swap Ins 16904701 0 Swap Outs 17359627 0 Allocation stalls 43611 0 DMA allocs 0 0 DMA32 allocs 19832646 19448017 Normal allocs 614488453 580941839 Movable allocs 0 0 Direct pages scanned 24163800 0 Kswapd pages scanned 0 0 Kswapd pages reclaimed 0 0 Direct pages reclaimed 20691346 0 Compaction stalls 42263 0 Compaction success 938 0 Compaction failures 41325 0 This patch eliminates almost all swapping and direct reclaim activity. There is still overhead but it's from NUMA balancing which does not identify that it's pointless trying to do anything with this workload. 
I also tried the thpscale benchmark which forces a corner case where compaction can be used heavily and measures the latency of whether base or huge pages were used thpscale Fault Latencies 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Amean fault-base-1 5288.84 ( 0.00%) 2817.12 ( 46.73%) Amean fault-base-3 6365.53 ( 0.00%) 3499.11 ( 45.03%) Amean fault-base-5 6526.19 ( 0.00%) 4363.06 ( 33.15%) Amean fault-base-7 7142.25 ( 0.00%) 4858.08 ( 31.98%) Amean fault-base-12 13827.64 ( 0.00%) 10292.11 ( 25.57%) Amean fault-base-18 18235.07 ( 0.00%) 13788.84 ( 24.38%) Amean fault-base-24 21597.80 ( 0.00%) 24388.03 (-12.92%) Amean fault-base-30 26754.15 ( 0.00%) 19700.55 ( 26.36%) Amean fault-base-32 26784.94 ( 0.00%) 19513.57 ( 27.15%) Amean fault-huge-1 4223.96 ( 0.00%) 2178.57 ( 48.42%) Amean fault-huge-3 2194.77 ( 0.00%) 2149.74 ( 2.05%) Amean fault-huge-5 2569.60 ( 0.00%) 2346.95 ( 8.66%) Amean fault-huge-7 3612.69 ( 0.00%) 2997.70 ( 17.02%) Amean fault-huge-12 3301.75 ( 0.00%) 6727.02 (-103.74%) Amean fault-huge-18 6696.47 ( 0.00%) 6685.72 ( 0.16%) Amean fault-huge-24 8000.72 ( 0.00%) 9311.43 (-16.38%) Amean fault-huge-30 13305.55 ( 0.00%) 9750.45 ( 26.72%) Amean fault-huge-32 9981.71 ( 0.00%) 10316.06 ( -3.35%) The average time to fault pages is substantially reduced in the majority of caseds but with the obvious caveat that fewer THPs are actually used in this adverse workload 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Percentage huge-1 0.71 ( 0.00%) 14.04 (1865.22%) Percentage huge-3 10.77 ( 0.00%) 33.05 (206.85%) Percentage huge-5 60.39 ( 0.00%) 38.51 (-36.23%) Percentage huge-7 45.97 ( 0.00%) 34.57 (-24.79%) Percentage huge-12 68.12 ( 0.00%) 40.07 (-41.17%) Percentage huge-18 64.93 ( 0.00%) 47.82 (-26.35%) Percentage huge-24 62.69 ( 0.00%) 44.23 (-29.44%) Percentage huge-30 43.49 ( 0.00%) 55.38 ( 27.34%) Percentage huge-32 50.72 ( 0.00%) 51.90 ( 2.35%) 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 Minor Faults 37429143 47564000 Major Faults 1916 1558 Swap Ins 1466 1079 Swap Outs 2936863 149626 Allocation stalls 62510 3 DMA allocs 0 0 DMA32 allocs 6566458 6401314 Normal allocs 216361697 216538171 Movable allocs 0 0 Direct pages scanned 25977580 17998 Kswapd pages scanned 0 3638931 Kswapd pages reclaimed 0 207236 Direct pages reclaimed 8833714 88 Compaction stalls 103349 5 Compaction success 270 4 Compaction failures 103079 1 Note again that while this does swap as it's an aggressive workload, the direct relcim activity and allocation stalls is substantially reduced. There is some kswapd activity but ftrace showed that the kswapd activity was due to normal wakeups from 4K pages being allocated. Compaction-related stalls and activity are almost eliminated. I also tried the stutter benchmark. For this, I do not have figures for NUMA but it's something that does impact UMA so I'll report what is available stutter 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Min mmap 7.3571 ( 0.00%) 7.3438 ( 0.18%) 1st-qrtle mmap 7.5278 ( 0.00%) 17.9200 (-138.05%) 2nd-qrtle mmap 7.6818 ( 0.00%) 21.6055 (-181.25%) 3rd-qrtle mmap 11.0889 ( 0.00%) 21.8881 (-97.39%) Max-90% mmap 27.8978 ( 0.00%) 22.1632 ( 20.56%) Max-93% mmap 28.3202 ( 0.00%) 22.3044 ( 21.24%) Max-95% mmap 28.5600 ( 0.00%) 22.4580 ( 21.37%) Max-99% mmap 29.6032 ( 0.00%) 25.5216 ( 13.79%) Max mmap 4109.7289 ( 0.00%) 4813.9832 (-17.14%) Mean mmap 12.4474 ( 0.00%) 19.3027 (-55.07%) This benchmark is trying to fault an anonymous mapping while there is a heavy IO load -- a scenario that desktop users used to complain about frequently. 
This shows a mix because the ideal case of mapping with THP is not hit as often. However, note that 99% of the mappings complete 13.79% faster. The CPU usage here is particularly interesting 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 User 67.50 0.99 System 1327.88 91.30 Elapsed 2079.00 2128.98 And once again we look at the reclaim figures 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 Minor Faults 335241922 1314582827 Major Faults 715 819 Swap Ins 0 0 Swap Outs 0 0 Allocation stalls 532723 0 DMA allocs 0 0 DMA32 allocs 1822364341 1177950222 Normal allocs 1815640808 1517844854 Movable allocs 0 0 Direct pages scanned 21892772 0 Kswapd pages scanned 20015890 41879484 Kswapd pages reclaimed 19961986 41822072 Direct pages reclaimed 21892741 0 Compaction stalls 1065755 0 Compaction success 514 0 Compaction failures 1065241 0 Allocation stalls and all direct reclaim activity is eliminated as well as compaction-related stalls. THP gives impressive gains in some cases but only if they are quickly available. We're not going to reach the point where they are completely free so lets take the costs out of the fast paths finally and defer the cost to kswapd, kcompactd and khugepaged where it belongs. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 101 ++++++++++++++++++++++++++++++++++++------------------- mm/page_alloc.c | 8 ----- mm/slab.c | 8 ++--- mm/slub.c | 2 +- 4 files changed, 71 insertions(+), 48 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1dddfb2..e08b165 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -78,7 +78,7 @@ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE (1<vm_flags & VM_HUGEPAGE)) + reclaim_flags = __GFP_DIRECT_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + reclaim_flags = __GFP_KSWAPD_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + reclaim_flags = __GFP_DIRECT_RECLAIM; + + return GFP_TRANSHUGE | reclaim_flags; +} + +/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ +static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) { - return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp; + return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0); } /* Caller must hold page table lock. 
*/ @@ -919,7 +950,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } return ret; } - gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + gfp = alloc_hugepage_direct_gfpmask(vma); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); @@ -1279,7 +1310,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + huge_gfp = alloc_hugepage_direct_gfpmask(vma); new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); } else new_page = NULL; @@ -2249,11 +2280,12 @@ static int khugepaged_find_target_node(void) return 0; } -static inline struct page *alloc_hugepage(int defrag) +static inline struct page *alloc_khugepaged_hugepage(void) { struct page *page; - page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER); + page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), + HPAGE_PMD_ORDER); if (page) prep_transhuge_page(page); return page; @@ -2264,7 +2296,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait) struct page *hpage; do { - hpage = alloc_hugepage(khugepaged_defrag()); + hpage = alloc_khugepaged_hugepage(); if (!hpage) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); if (!*wait) @@ -2335,8 +2367,7 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* Only allocate from the target node */ - gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | - __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; /* release the mmap_sem read lock. */ new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d156310..096a00d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3119,14 +3119,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; - /* - * If this allocation cannot block and it is for a specific node, then - * fail early. There's no need to wakeup kswapd or retry for a - * speculative node-specific allocation. - */ - if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) - goto nopage; - retry: if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); diff --git a/mm/slab.c b/mm/slab.c index 56dd0df..e1f6c27 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -670,7 +670,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, static inline gfp_t gfp_exact_node(gfp_t flags) { - return flags; + return flags & ~__GFP_NOFAIL; } #else /* CONFIG_NUMA */ @@ -841,12 +841,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } /* - * Construct gfp mask to allocate from a specific node but do not direct reclaim - * or warn about failures. kswapd may still wake to reclaim in the background. + * Construct gfp mask to allocate from a specific node but do not reclaim or + * warn about failures. 
*/ static inline gfp_t gfp_exact_node(gfp_t flags) { - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM; + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL); } #endif diff --git a/mm/slub.c b/mm/slub.c index 2f2f04d..64ed5f3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1426,7 +1426,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) - alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM; + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL); page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { -- cgit v1.1 From fe896d1878949ea92ba547587bc3075cc688fb8f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:19:26 -0700 Subject: mm: introduce page reference manipulation functions The success of CMA allocation largely depends on the success of migration and key factor of it is page reference count. Until now, page reference is manipulated by direct calling atomic functions so we cannot follow up who and where manipulate it. Then, it is hard to find actual reason of CMA allocation failure. CMA allocation should be guaranteed to succeed so finding offending place is really important. In this patch, call sites where page reference is manipulated are converted to introduced wrapper function. This is preparation step to add tracepoint to each page reference manipulation function. With this facility, we can easily find reason of CMA allocation failure. There is no functional change in this patch. In addition, this patch also converts reference read sites. It will help a second step that renames page._count to something else and prevents later attempt to direct access to it (Suggested by Andrew). Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: "Kirill A. 
Shutemov" Cc: Sergey Senozhatsky Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 2 +- mm/huge_memory.c | 6 +++--- mm/internal.h | 7 +------ mm/memory_hotplug.c | 4 ++-- mm/migrate.c | 10 +++++----- mm/page_alloc.c | 12 ++++++------ mm/vmscan.c | 6 +++--- 7 files changed, 21 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/debug.c b/mm/debug.c index df7247b..8865bfb 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -43,7 +43,7 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", - page, atomic_read(&page->_count), page_mapcount(page), + page, page_ref_count(page), page_mapcount(page), page->mapping, page->index); if (PageCompound(page)) pr_cont(" compound_mapcount: %d", compound_mapcount(page)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e08b165..bb944c7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2888,7 +2888,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, page = pmd_page(*pmd); VM_BUG_ON_PAGE(!page_count(page), page); - atomic_add(HPAGE_PMD_NR - 1, &page->_count); + page_ref_add(page, HPAGE_PMD_NR - 1); write = pmd_write(*pmd); young = pmd_young(*pmd); dirty = pmd_dirty(*pmd); @@ -3257,7 +3257,7 @@ static void __split_huge_page_tail(struct page *head, int tail, struct page *page_tail = head + tail; VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); - VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); + VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); /* * tail_page->_count is zero and not changing from under us. But @@ -3270,7 +3270,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * atomic_set() here would be safe on all archs (and not only on x86), * it's safer to use atomic_inc(). 
*/ - atomic_inc(&page_tail->_count); + page_ref_inc(page_tail); page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & diff --git a/mm/internal.h b/mm/internal.h index 4042a8a..57d7b0e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -38,11 +38,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -static inline void set_page_count(struct page *page, int v) -{ - atomic_set(&page->_count, v); -} - extern int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size); @@ -64,7 +59,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra, static inline void set_page_refcounted(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(atomic_read(&page->_count), page); + VM_BUG_ON_PAGE(page_ref_count(page), page); set_page_count(page, 1); } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c832ef3..e62aa07 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -167,7 +167,7 @@ void get_page_bootmem(unsigned long info, struct page *page, page->lru.next = (struct list_head *) type; SetPagePrivate(page); set_page_private(page, info); - atomic_inc(&page->_count); + page_ref_inc(page); } void put_page_bootmem(struct page *page) @@ -178,7 +178,7 @@ void put_page_bootmem(struct page *page) BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); - if (atomic_dec_return(&page->_count) == 1) { + if (page_ref_dec_return(page) == 1) { ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); diff --git a/mm/migrate.c b/mm/migrate.c index fdaf081..577c94b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -349,7 +349,7 @@ int migrate_page_move_mapping(struct address_space *mapping, return -EAGAIN; } - if (!page_freeze_refs(page, expected_count)) { + if (!page_ref_freeze(page, expected_count)) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -363,7 +363,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ if (mode == MIGRATE_ASYNC && head && !buffer_migrate_lock_buffers(head, mode)) { - page_unfreeze_refs(page, expected_count); + page_ref_unfreeze(page, expected_count); spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -397,7 +397,7 @@ int migrate_page_move_mapping(struct address_space *mapping, * to one less reference. * We know this isn't the last reference. 
*/ - page_unfreeze_refs(page, expected_count - 1); + page_ref_unfreeze(page, expected_count - 1); spin_unlock(&mapping->tree_lock); /* Leave irq disabled to prevent preemption while updating stats */ @@ -451,7 +451,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return -EAGAIN; } - if (!page_freeze_refs(page, expected_count)) { + if (!page_ref_freeze(page, expected_count)) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -463,7 +463,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, radix_tree_replace_slot(pslot, newpage); - page_unfreeze_refs(page, expected_count - 1); + page_ref_unfreeze(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 096a00d..30134a8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -766,7 +766,7 @@ static inline int free_pages_check(struct page *page) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; - if (unlikely(atomic_read(&page->_count) != 0)) + if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _count"; if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; @@ -1462,7 +1462,7 @@ static inline int check_new_page(struct page *page) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; - if (unlikely(atomic_read(&page->_count) != 0)) + if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _count"; if (unlikely(page->flags & __PG_HWPOISON)) { bad_reason = "HWPoisoned (hardware-corrupted)"; @@ -3475,7 +3475,7 @@ refill: /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ - atomic_add(size - 1, &page->_count); + page_ref_add(page, size - 1); /* reset page count bias and offset to start of new frag */ nc->pfmemalloc = page_is_pfmemalloc(page); @@ -3487,7 +3487,7 @@ refill: if (unlikely(offset < 0)) { page = virt_to_page(nc->va); - if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) + if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) goto refill; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) @@ -3495,7 +3495,7 @@ refill: size = nc->size; #endif /* OK, page count is 0, we can safely set it */ - atomic_set(&page->_count, size); + set_page_count(page, size); /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = size; @@ -6852,7 +6852,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * This check already skips compound tails of THP * because their page->_count is zero at all time. */ - if (!atomic_read(&page->_count)) { + if (!page_ref_count(page)) { if (PageBuddy(page)) iter += (1 << page_order(page)) - 1; continue; diff --git a/mm/vmscan.c b/mm/vmscan.c index b41b82d..b934223e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -638,11 +638,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. 
*/ - if (!page_freeze_refs(page, 2)) + if (!page_ref_freeze(page, 2)) goto cannot_free; /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ if (unlikely(PageDirty(page))) { - page_unfreeze_refs(page, 2); + page_ref_unfreeze(page, 2); goto cannot_free; } @@ -704,7 +704,7 @@ int remove_mapping(struct address_space *mapping, struct page *page) * drops the pagecache ref for us without requiring another * atomic operation. */ - page_unfreeze_refs(page, 1); + page_ref_unfreeze(page, 1); return 1; } return 0; -- cgit v1.1 From 95813b8faa0cd315f61a8b9d9c523792370b693e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:19:29 -0700 Subject: mm/page_ref: add tracepoint to track down page reference manipulation CMA allocation should be guaranteed to succeed by definition, but, unfortunately, it would be failed sometimes. It is hard to track down the problem, because it is related to page reference manipulation and we don't have any facility to analyze it. This patch adds tracepoints to track down page reference manipulation. With it, we can find exact reason of failure and can fix the problem. Following is an example of tracepoint output. (note: this example is stale version that printing flags as the number. Recent version will print it as human readable string.) <...>-9018 [004] 92.678375: page_ref_set: pfn=0x17ac9 flags=0x0 count=1 mapcount=0 mapping=(nil) mt=4 val=1 <...>-9018 [004] 92.678378: kernel_stack: => get_page_from_freelist (ffffffff81176659) => __alloc_pages_nodemask (ffffffff81176d22) => alloc_pages_vma (ffffffff811bf675) => handle_mm_fault (ffffffff8119e693) => __do_page_fault (ffffffff810631ea) => trace_do_page_fault (ffffffff81063543) => do_async_page_fault (ffffffff8105c40a) => async_page_fault (ffffffff817581d8) [snip] <...>-9018 [004] 92.678379: page_ref_mod: pfn=0x17ac9 flags=0x40048 count=2 mapcount=1 mapping=0xffff880015a78dc1 mt=4 val=1 [snip] ... ... <...>-9131 [001] 93.174468: test_pages_isolated: start_pfn=0x17800 end_pfn=0x17c00 fin_pfn=0x17ac9 ret=fail [snip] <...>-9018 [004] 93.174843: page_ref_mod_and_test: pfn=0x17ac9 flags=0x40068 count=0 mapcount=0 mapping=0xffff880015a78dc1 mt=4 val=-1 ret=1 => release_pages (ffffffff8117c9e4) => free_pages_and_swap_cache (ffffffff811b0697) => tlb_flush_mmu_free (ffffffff81199616) => tlb_finish_mmu (ffffffff8119a62c) => exit_mmap (ffffffff811a53f7) => mmput (ffffffff81073f47) => do_exit (ffffffff810794e9) => do_group_exit (ffffffff81079def) => SyS_exit_group (ffffffff81079e74) => entry_SYSCALL_64_fastpath (ffffffff817560b6) This output shows that problem comes from exit path. In exit path, to improve performance, pages are not freed immediately. They are gathered and processed by batch. During this process, migration cannot be possible and CMA allocation is failed. This problem is hard to find without this page reference tracepoint facility. Enabling this feature bloat kernel text 30 KB in my configuration. text data bss dec hex filename 12127327 2243616 1507328 15878271 f2487f vmlinux_disabled 12157208 2258880 1507328 15923416 f2f8d8 vmlinux_enabled Note that, due to header file dependency problem between mm.h and tracepoint.h, this feature has to open code the static key functions for tracepoints. Proposed by Steven Rostedt in following link. 
https://lkml.org/lkml/2015/12/9/699 [arnd@arndb.de: crypto/async_pq: use __free_page() instead of put_page()] [iamjoonsoo.kim@lge.com: fix build failure for xtensa] [akpm@linux-foundation.org: tweak Kconfig text, per Vlastimil] Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: "Kirill A. Shutemov" Cc: Sergey Senozhatsky Acked-by: Steven Rostedt Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig.debug | 13 +++++++++++++ mm/Makefile | 1 + mm/debug_page_ref.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 mm/debug_page_ref.c (limited to 'mm') diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 5c50b23..22f4cd9 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -79,3 +79,16 @@ config PAGE_POISONING_ZERO Enabling page poisoning with this option will disable hibernation If unsure, say N + bool + +config DEBUG_PAGE_REF + bool "Enable tracepoint to track down page reference manipulation" + depends on DEBUG_KERNEL + depends on TRACEPOINTS + ---help--- + This is a feature to add tracepoint for tracking down page reference + manipulation. This tracking is useful to diagnose functional failure + due to migration failures caused by page reference mismatches. Be + careful when enabling this feature because it adds about 30 KB to the + kernel code. However the runtime performance overhead is virtually + nil until the tracepoints are actually enabled. diff --git a/mm/Makefile b/mm/Makefile index cfdd481d..6da300a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -81,3 +81,4 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o +obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o diff --git a/mm/debug_page_ref.c b/mm/debug_page_ref.c new file mode 100644 index 0000000..1aef3d5 --- /dev/null +++ b/mm/debug_page_ref.c @@ -0,0 +1,54 @@ +#include +#include + +#define CREATE_TRACE_POINTS +#include + +void __page_ref_set(struct page *page, int v) +{ + trace_page_ref_set(page, v); +} +EXPORT_SYMBOL(__page_ref_set); +EXPORT_TRACEPOINT_SYMBOL(page_ref_set); + +void __page_ref_mod(struct page *page, int v) +{ + trace_page_ref_mod(page, v); +} +EXPORT_SYMBOL(__page_ref_mod); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod); + +void __page_ref_mod_and_test(struct page *page, int v, int ret) +{ + trace_page_ref_mod_and_test(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_mod_and_test); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_test); + +void __page_ref_mod_and_return(struct page *page, int v, int ret) +{ + trace_page_ref_mod_and_return(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_mod_and_return); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_return); + +void __page_ref_mod_unless(struct page *page, int v, int u) +{ + trace_page_ref_mod_unless(page, v, u); +} +EXPORT_SYMBOL(__page_ref_mod_unless); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_unless); + +void __page_ref_freeze(struct page *page, int v, int ret) +{ + trace_page_ref_freeze(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_freeze); +EXPORT_TRACEPOINT_SYMBOL(page_ref_freeze); + +void __page_ref_unfreeze(struct page *page, int v) +{ + trace_page_ref_unfreeze(page, v); +} +EXPORT_SYMBOL(__page_ref_unfreeze); +EXPORT_TRACEPOINT_SYMBOL(page_ref_unfreeze); -- cgit v1.1 From 0f352e5392c86d054998dd6526f2fdc33ec4bed5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 17 Mar 2016 
14:19:32 -0700 Subject: mm: remove __GFP_NOFAIL is deprecated comment Commit 647757197cd3 ("mm: clarify __GFP_NOFAIL deprecation status") was incomplete and didn't remove the comment about __GFP_NOFAIL being deprecated in buffered_rmqueue. Let's get rid of this leftover but keep the WARN_ON_ONCE for order > 1 because we should really discourage from using __GFP_NOFAIL with higher order allocations because those are just too subtle. Signed-off-by: Michal Hocko Reviewed-by: Nikolay Borisov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 30134a8..30f01c6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2350,19 +2350,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, list_del(&page->lru); pcp->count--; } else { - if (unlikely(gfp_flags & __GFP_NOFAIL)) { - /* - * __GFP_NOFAIL is not to be used in new code. - * - * All __GFP_NOFAIL callers should be fixed so that they - * properly detect and handle allocation failures. - * - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with - * __GFP_NOFAIL. - */ - WARN_ON_ONCE(order > 1); - } + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); spin_lock_irqsave(&zone->lock, flags); page = NULL; -- cgit v1.1 From e33e33b4d1c699d06fb8ccd6da80b309b84ec975 Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Thu, 17 Mar 2016 14:19:35 -0700 Subject: mm, memory hotplug: print debug message in the proper way for online_pages online_pages() simply returns an error value if memory_notify(MEM_GOING_ONLINE, &arg) return a value that is not what we want for successfully onlining target pages. This patch arms to print more failure information like offline_pages() in online_pages. This patch also converts printk(KERN_) to pr_(), and moves __offline_pages() to not print failure information with KERN_INFO according to David Rientjes's suggestion[1]. [1] https://lkml.org/lkml/2016/2/24/1094 Signed-off-by: Chen Yucong Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e62aa07..f5758b6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1059,10 +1059,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); - if (ret) { - memory_notify(MEM_CANCEL_ONLINE, &arg); - return ret; - } + if (ret) + goto failed_addition; + /* * If this zone is not populated, then it is not in zonelist. * This means the page allocator ignores this zone. 
@@ -1080,12 +1079,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (need_zonelists_rebuild) zone_pcp_reset(zone); mutex_unlock(&zonelists_mutex); - printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", - (unsigned long long) pfn << PAGE_SHIFT, - (((unsigned long long) pfn + nr_pages) - << PAGE_SHIFT) - 1); - memory_notify(MEM_CANCEL_ONLINE, &arg); - return ret; + goto failed_addition; } zone->present_pages += onlined_pages; @@ -1118,6 +1112,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) memory_notify(MEM_ONLINE, &arg); return 0; + +failed_addition: + pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", + (unsigned long long) pfn << PAGE_SHIFT, + (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); + memory_notify(MEM_CANCEL_ONLINE, &arg); + return ret; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ @@ -1529,8 +1530,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } else { #ifdef CONFIG_DEBUG_VM - printk(KERN_ALERT "removing pfn %lx from LRU failed\n", - pfn); + pr_alert("removing pfn %lx from LRU failed\n", pfn); dump_page(page, "failed to remove from LRU"); #endif put_page(page); @@ -1858,7 +1858,7 @@ repeat: ret = -EBUSY; goto failed_removal; } - printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); + pr_info("Offlined Pages %ld\n", offlined_pages); /* Ok, all of our target is isolated. We cannot do rollback at this point. */ offline_isolated_pages(start_pfn, end_pfn); @@ -1895,9 +1895,9 @@ repeat: return 0; failed_removal: - printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", - (unsigned long long) start_pfn << PAGE_SHIFT, - ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); + pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n", + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); -- cgit v1.1 From d334c9bcb4aebd0c3e89ee415cba86fa5f0253ee Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:19:38 -0700 Subject: mm: memcontrol: cleanup css_reset callback - Do not take memcg_limit_mutex for resetting limits - the cgroup cannot be altered from userspace anymore, so no need to protect them. - Use plain page_counter_limit() for resetting ->memory and ->memsw limits instead of mem_cgrouop_resize_* helpers - we enlarge the limits, so no need in special handling. - Reset ->swap and ->tcpmem limits as well. 
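Read another way (an illustrative sketch only; the patch itself spells the five calls out, as the hunk below shows), the reset path now simply raises every counter's limit, and since the limits only grow to PAGE_COUNTER_MAX no locking or retry logic is needed:

  /* hypothetical helper, kernel context assumed */
  static void mem_cgroup_reset_all_limits(struct mem_cgroup *memcg)
  {
          struct page_counter *counters[] = {
                  &memcg->memory, &memcg->swap, &memcg->memsw,
                  &memcg->kmem, &memcg->tcpmem,
          };
          int i;

          for (i = 0; i < ARRAY_SIZE(counters); i++)
                  page_counter_limit(counters[i], PAGE_COUNTER_MAX);
  }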
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ae8b81c..8615b06 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4257,9 +4257,11 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); - mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); - memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX); memcg->low = 0; memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; -- cgit v1.1 From b11a7b94100cba5ec926a181894c2897a22651b9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 17 Mar 2016 14:19:41 -0700 Subject: mm: exclude ZONE_DEVICE from GFP_ZONE_TABLE ZONE_DEVICE (merged in 4.3) and ZONE_CMA (proposed) are examples of new mm zones that are bumping up against the current maximum limit of 4 zones, i.e. 2 bits in page->flags for the GFP_ZONE_TABLE. The GFP_ZONE_TABLE poses an interesting constraint since include/linux/gfp.h gets included by the 32-bit portion of a 64-bit build. We need to be careful to only build the table for zones that have a corresponding gfp_t flag. GFP_ZONES_SHIFT is introduced for this purpose. This patch does not attempt to solve the problem of adding a new zone that also has a corresponding GFP_ flag. Vlastimil points out that ZONE_DEVICE, by depending on x86_64 and SPARSEMEM_VMEMMAP implies that SECTIONS_WIDTH is zero. In other words even though ZONE_DEVICE does not fit in GFP_ZONE_TABLE it is free to consume another bit in page->flags (expand ZONES_WIDTH) with room to spare. Link: https://bugzilla.kernel.org/show_bug.cgi?id=110931 Fixes: 033fbae988fc ("mm: ZONE_DEVICE for "device memory"") Signed-off-by: Dan Williams Reported-by: Mark Reported-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Cc: Joonsoo Kim Cc: Dave Hansen Cc: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index c077765..520dae6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -651,8 +651,6 @@ config IDLE_PAGE_TRACKING config ZONE_DEVICE bool "Device memory (pmem, etc...) hotplug support" if EXPERT - default !ZONE_DMA - depends on !ZONE_DMA depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on X86_64 #arch_add_memory() comprehends device memory -- cgit v1.1 From 598d80914e84fa79580850530f5d4a50a99bf4f5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 17 Mar 2016 14:19:44 -0700 Subject: mm: convert pr_warning to pr_warn There are a mixture of pr_warning and pr_warn uses in mm. Use pr_warn consistently. 
Miscellanea: - Coalesce formats - Realign arguments Signed-off-by: Joe Perches Acked-by: Tejun Heo [percpu] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 ++--- mm/kmemleak.c | 14 +++++++------- mm/percpu.c | 15 +++++++-------- 3 files changed, 16 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index aefba5a..06058ea 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2665,7 +2665,7 @@ void __init hugetlb_add_hstate(unsigned int order) unsigned long i; if (size_to_hstate(PAGE_SIZE << order)) { - pr_warning("hugepagesz= specified twice, ignoring\n"); + pr_warn("hugepagesz= specified twice, ignoring\n"); return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); @@ -2701,8 +2701,7 @@ static int __init hugetlb_nrpages_setup(char *s) mhp = &parsed_hstate->max_huge_pages; if (mhp == last_mhp) { - pr_warning("hugepages= specified twice without " - "interleaving hugepagesz=, ignoring\n"); + pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n"); return 1; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 25c0ad3..a81cd76 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -276,7 +276,7 @@ static void kmemleak_disable(void); * Print a warning and dump the stack trace. */ #define kmemleak_warn(x...) do { \ - pr_warning(x); \ + pr_warn(x); \ dump_stack(); \ kmemleak_warning = 1; \ } while (0) @@ -543,7 +543,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { - pr_warning("Cannot allocate a kmemleak_object structure\n"); + pr_warn("Cannot allocate a kmemleak_object structure\n"); kmemleak_disable(); return NULL; } @@ -764,7 +764,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); if (!area) { - pr_warning("Cannot allocate a scan area\n"); + pr_warn("Cannot allocate a scan area\n"); goto out; } @@ -1515,7 +1515,7 @@ static void start_scan_thread(void) return; scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak"); if (IS_ERR(scan_thread)) { - pr_warning("Failed to create the scan thread\n"); + pr_warn("Failed to create the scan thread\n"); scan_thread = NULL; } } @@ -1874,8 +1874,8 @@ void __init kmemleak_init(void) scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); if (crt_early_log > ARRAY_SIZE(early_log)) - pr_warning("Early log buffer exceeded (%d), please increase " - "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log); + pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", + crt_early_log); /* the kernel is still in UP mode, so disabling the IRQs is enough */ local_irq_save(flags); @@ -1960,7 +1960,7 @@ static int __init kmemleak_late_init(void) dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, &kmemleak_fops); if (!dentry) - pr_warning("Failed to create the debugfs kmemleak file\n"); + pr_warn("Failed to create the debugfs kmemleak file\n"); mutex_lock(&scan_mutex); start_scan_thread(); mutex_unlock(&scan_mutex); diff --git a/mm/percpu.c b/mm/percpu.c index 998607a..847814b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1033,8 +1033,8 @@ fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: if (!is_atomic && warn_limit) { - pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", - size, align, is_atomic, err); + pr_warn("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, 
is_atomic, err); dump_stack(); if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); @@ -1723,7 +1723,7 @@ static int __init percpu_alloc_setup(char *str) pcpu_chosen_fc = PCPU_FC_PAGE; #endif else - pr_warning("PERCPU: unknown allocator %s specified\n", str); + pr_warn("PERCPU: unknown allocator %s specified\n", str); return 0; } @@ -2016,9 +2016,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, /* warn if maximum distance is further than 75% of vmalloc space */ if (max_distance > VMALLOC_TOTAL * 3 / 4) { - pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " - "space 0x%lx\n", max_distance, - VMALLOC_TOTAL); + pr_warn("PERCPU: max_distance=0x%zx too large for vmalloc space 0x%lx\n", + max_distance, VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ rc = -EINVAL; @@ -2100,8 +2099,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { - pr_warning("PERCPU: failed to allocate %s page " - "for cpu%u\n", psize_str, cpu); + pr_warn("PERCPU: failed to allocate %s page for cpu%u\n", + psize_str, cpu); goto enomem; } /* kmemleak tracks the percpu allocations separately */ -- cgit v1.1 From 756a025f00091918d9d09ca3229defb160b409c0 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 17 Mar 2016 14:19:47 -0700 Subject: mm: coalesce split strings Kernel style prefers a single string over split strings when the string is 'user-visible'. Miscellanea: - Add a missing newline - Realign arguments Signed-off-by: Joe Perches Acked-by: Tejun Heo [percpu] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 10 ++++------ mm/huge_memory.c | 3 +-- mm/kasan/report.c | 6 ++---- mm/kmemcheck.c | 3 +-- mm/kmemleak.c | 18 ++++++++---------- mm/memblock.c | 3 +-- mm/memory_hotplug.c | 3 +-- mm/mempolicy.c | 4 +--- mm/mmap.c | 8 +++----- mm/oom_kill.c | 3 +-- mm/page_alloc.c | 37 +++++++++++++++++-------------------- mm/page_owner.c | 5 ++--- mm/percpu.c | 4 ++-- mm/slab.c | 28 ++++++++++------------------ mm/slab_common.c | 10 ++++------ mm/slub.c | 19 +++++++++---------- mm/sparse-vmemmap.c | 8 ++++---- mm/sparse.c | 8 ++++---- mm/swapfile.c | 3 +-- mm/vmalloc.c | 4 ++-- 20 files changed, 78 insertions(+), 109 deletions(-) (limited to 'mm') diff --git a/mm/dmapool.c b/mm/dmapool.c index 57312b5..2821500 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -452,13 +452,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) } spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) - dev_err(pool->dev, "dma_pool_free %s, dma %Lx " - "already free\n", pool->name, - (unsigned long long)dma); + dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n", + pool->name, (unsigned long long)dma); else - printk(KERN_ERR "dma_pool_free %s, dma %Lx " - "already free\n", pool->name, - (unsigned long long)dma); + printk(KERN_ERR "dma_pool_free %s, dma %Lx already free\n", + pool->name, (unsigned long long)dma); return; } } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bb944c7..e1a177c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -168,8 +168,7 @@ static void set_recommended_min_free_kbytes(void) if (recommended_min > min_free_kbytes) { if (user_min_free_kbytes >= 0) - pr_info("raising min_free_kbytes from %d to %lu " - "to help transparent hugepage allocations\n", + pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", min_free_kbytes, recommended_min); 
min_free_kbytes = recommended_min; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 12f222d..745aa8f 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -214,8 +214,7 @@ static void kasan_report_error(struct kasan_access_info *info) */ kasan_disable_current(); spin_lock_irqsave(&report_lock, flags); - pr_err("=================================" - "=================================\n"); + pr_err("==================================================================\n"); if (info->access_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { if ((unsigned long)info->access_addr < PAGE_SIZE) @@ -236,8 +235,7 @@ static void kasan_report_error(struct kasan_access_info *info) print_address_description(info); print_shadow_for_address(info->first_bad_addr); } - pr_err("=================================" - "=================================\n"); + pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, flags); kasan_enable_current(); diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index 6f4f424..e5f8333 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -20,8 +20,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); if (!shadow) { if (printk_ratelimit()) - printk(KERN_ERR "kmemcheck: failed to allocate " - "shadow bitmap\n"); + printk(KERN_ERR "kmemcheck: failed to allocate shadow bitmap\n"); return; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a81cd76..e642992 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -596,8 +596,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, else if (parent->pointer + parent->size <= ptr) link = &parent->rb_node.rb_right; else { - kmemleak_stop("Cannot insert 0x%lx into the object " - "search tree (overlaps existing)\n", + kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n", ptr); /* * No need for parent->lock here since "parent" cannot @@ -670,8 +669,8 @@ static void delete_object_part(unsigned long ptr, size_t size) object = find_and_remove_object(ptr, 1); if (!object) { #ifdef DEBUG - kmemleak_warn("Partially freeing unknown object at 0x%08lx " - "(size %zu)\n", ptr, size); + kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", + ptr, size); #endif return; } @@ -717,8 +716,8 @@ static void paint_ptr(unsigned long ptr, int color) object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("Trying to color unknown object " - "at 0x%08lx as %s\n", ptr, + kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n", + ptr, (color == KMEMLEAK_GREY) ? "Grey" : (color == KMEMLEAK_BLACK) ? "Black" : "Unknown"); return; @@ -1463,8 +1462,8 @@ static void kmemleak_scan(void) if (new_leaks) { kmemleak_found_leaks = true; - pr_info("%d new suspected memory leaks (see " - "/sys/kernel/debug/kmemleak)\n", new_leaks); + pr_info("%d new suspected memory leaks (see /sys/kernel/debug/kmemleak)\n", + new_leaks); } } @@ -1795,8 +1794,7 @@ static void kmemleak_do_cleanup(struct work_struct *work) if (!kmemleak_found_leaks) __kmemleak_do_cleanup(); else - pr_info("Kmemleak disabled without freeing internal data. " - "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); + pr_info("Kmemleak disabled without freeing internal data. 
Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\".\n"); } static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); diff --git a/mm/memblock.c b/mm/memblock.c index fc7824f..b570ddd 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -238,8 +238,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, * so we use WARN_ONCE() here to see the stack trace if * fail happens. */ - WARN_ONCE(1, "memblock: bottom-up allocation failed, " - "memory hotunplug may be affected\n"); + WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n"); } return __memblock_find_range_top_down(start, end, size, align, nid, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f5758b6..aa34431 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1970,8 +1970,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; - pr_warn("removing memory fails, because memory " - "[%pa-%pa] is onlined\n", + pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", &beginpa, &endpa); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8cbc743..b25de27 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2559,9 +2559,7 @@ static void __init check_numabalancing_enable(void) set_numabalancing_state(numabalancing_override == 1); if (num_online_nodes() > 1 && !numabalancing_override) { - pr_info("%s automatic NUMA balancing. " - "Configure with numa_balancing= or the " - "kernel.numa_balancing sysctl", + pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } diff --git a/mm/mmap.c b/mm/mmap.c index 1464192..e06345a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2517,9 +2517,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long ret = -EINVAL; struct file *file; - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " - "See Documentation/vm/remap_file_pages.txt.\n", - current->comm, current->pid); + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); if (prot) return ret; @@ -2885,8 +2884,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) if (is_data_mapping(flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { if (ignore_rlimit_data) - pr_warn_once("%s (%d): VmData %lu exceed data ulimit " - "%lu. Will be forbidden soon.\n", + pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. 
Will be forbidden soon.\n", current->comm, current->pid, (mm->data_vm + npages) << PAGE_SHIFT, rlimit(RLIMIT_DATA)); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 63ced70..fde3d37 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -383,8 +383,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p, struct mem_cgroup *memcg) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, " - "oom_score_adj=%hd\n", + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, current->signal->oom_score_adj); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 30f01c6..42cf199 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4074,8 +4074,7 @@ static int __parse_numa_zonelist_order(char *s) user_zonelist_order = ZONELIST_ORDER_ZONE; } else { printk(KERN_WARNING - "Ignoring invalid numa_zonelist_order value: " - "%s\n", s); + "Ignoring invalid numa_zonelist_order value: %s\n", s); return -EINVAL; } return 0; @@ -4539,12 +4538,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists in %s order, mobility grouping %s. " - "Total pages: %ld\n", - nr_online_nodes, - zonelist_order_name[current_zonelist_order], - page_group_by_mobility_disabled ? "off" : "on", - vm_total_pages); + pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", + nr_online_nodes, + zonelist_order_name[current_zonelist_order], + page_group_by_mobility_disabled ? "off" : "on", + vm_total_pages); #ifdef CONFIG_NUMA pr_info("Policy zone: %s\n", zone_names[policy_zone]); #endif @@ -6142,22 +6140,21 @@ void __init mem_init_print_info(const char *str) #undef adj_init_size - pr_info("Memory: %luK/%luK available " - "(%luK kernel code, %luK rwdata, %luK rodata, " - "%luK init, %luK bss, %luK reserved, %luK cma-reserved" + pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" #ifdef CONFIG_HIGHMEM - ", %luK highmem" + ", %luK highmem" #endif - "%s%s)\n", - nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), - codesize >> 10, datasize >> 10, rosize >> 10, - (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10), - totalcma_pages << (PAGE_SHIFT-10), + "%s%s)\n", + nr_free_pages() << (PAGE_SHIFT - 10), + physpages << (PAGE_SHIFT - 10), + codesize >> 10, datasize >> 10, rosize >> 10, + (init_data_size + init_code_size) >> 10, bss_size >> 10, + (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), + totalcma_pages << (PAGE_SHIFT - 10), #ifdef CONFIG_HIGHMEM - totalhigh_pages << (PAGE_SHIFT-10), + totalhigh_pages << (PAGE_SHIFT - 10), #endif - str ? ", " : "", str ? str : ""); + str ? ", " : "", str ? 
str : ""); } /** diff --git a/mm/page_owner.c b/mm/page_owner.c index 44ad1f0..ac3d8d1 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -198,9 +198,8 @@ void __dump_page_owner(struct page *page) return; } - pr_alert("page allocated via order %u, migratetype %s, " - "gfp_mask %#x(%pGg)\n", page_ext->order, - migratetype_names[mt], gfp_mask, &gfp_mask); + pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", + page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask); print_stack_trace(&trace, 0); if (page_ext->last_migrate_reason != -1) diff --git a/mm/percpu.c b/mm/percpu.c index 847814b..1571547 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -888,8 +888,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, size = ALIGN(size, 2); if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { - WARN(true, "illegal size (%zu) or align (%zu) for " - "percpu allocation\n", size, align); + WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n", + size, align); return NULL; } diff --git a/mm/slab.c b/mm/slab.c index e1f6c27..e558f85 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1566,11 +1566,9 @@ static void dump_line(char *data, int offset, int limit) if (bad_count == 1) { error ^= POISON_FREE; if (!(error & (error - 1))) { - printk(KERN_ERR "Single bit error detected. Probably " - "bad RAM.\n"); + printk(KERN_ERR "Single bit error detected. Probably bad RAM.\n"); #ifdef CONFIG_X86 - printk(KERN_ERR "Run memtest86+ or a similar memory " - "test tool.\n"); + printk(KERN_ERR "Run memtest86+ or a similar memory test tool.\n"); #else printk(KERN_ERR "Run a memory test tool.\n"); #endif @@ -1693,11 +1691,9 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "start of a freed object " - "was overwritten"); + slab_error(cachep, "start of a freed object was overwritten"); if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "end of a freed object " - "was overwritten"); + slab_error(cachep, "end of a freed object was overwritten"); } } } @@ -2398,11 +2394,9 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the" - " end of an object"); + slab_error(cachep, "constructor overwrote the end of an object"); if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the" - " start of an object"); + slab_error(cachep, "constructor overwrote the start of an object"); } /* need to poison the objs? 
*/ if (cachep->flags & SLAB_POISON) { @@ -2469,8 +2463,8 @@ static void slab_put_obj(struct kmem_cache *cachep, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { - printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); + printk(KERN_ERR "slab: double free detected in cache '%s', objp %p\n", + cachep->name, objp); BUG(); } } @@ -2901,8 +2895,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { - slab_error(cachep, "double free, or memory outside" - " object was overwritten"); + slab_error(cachep, "double free, or memory outside object was overwritten"); printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", objp, *dbg_redzone1(cachep, objp), @@ -4028,8 +4021,7 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) unsigned long node_frees = cachep->node_frees; unsigned long overflows = cachep->node_overflow; - seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " - "%4lu %4lu %4lu %4lu %4lu", + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees, overflows); diff --git a/mm/slab_common.c b/mm/slab_common.c index 8addc3c..e885e11 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -726,8 +726,8 @@ void kmem_cache_destroy(struct kmem_cache *s) err = shutdown_cache(s, &release, &need_rcu_barrier); if (err) { - pr_err("kmem_cache_destroy %s: " - "Slab cache still has objects\n", s->name); + pr_err("kmem_cache_destroy %s: Slab cache still has objects\n", + s->name); dump_stack(); } out_unlock: @@ -1047,13 +1047,11 @@ static void print_slabinfo_header(struct seq_file *m) #else seq_puts(m, "slabinfo - version: 2.1\n"); #endif - seq_puts(m, "# name " - " "); + seq_puts(m, "# name "); seq_puts(m, " : tunables "); seq_puts(m, " : slabdata "); #ifdef CONFIG_DEBUG_SLAB - seq_puts(m, " : globalstat " - " "); + seq_puts(m, " : globalstat "); seq_puts(m, " : cpustat "); #endif seq_putc(m, '\n'); diff --git a/mm/slub.c b/mm/slub.c index 64ed5f3..7277413 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -950,14 +950,14 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) max_objects = MAX_OBJS_PER_PAGE; if (page->objects != max_objects) { - slab_err(s, page, "Wrong number of objects. Found %d but " - "should be %d", page->objects, max_objects); + slab_err(s, page, "Wrong number of objects. Found %d but should be %d", + page->objects, max_objects); page->objects = max_objects; slab_fix(s, "Number of objects adjusted."); } if (page->inuse != page->objects - nr) { - slab_err(s, page, "Wrong object count. Counter is %d but " - "counted were %d", page->inuse, page->objects - nr); + slab_err(s, page, "Wrong object count. 
Counter is %d but counted were %d", + page->inuse, page->objects - nr); page->inuse = page->objects - nr; slab_fix(s, "Object count adjusted."); } @@ -1117,8 +1117,8 @@ static inline int free_consistency_checks(struct kmem_cache *s, if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { - slab_err(s, page, "Attempt to free object(0x%p) " - "outside of slab", object); + slab_err(s, page, "Attempt to free object(0x%p) outside of slab", + object); } else if (!page->slab_cache) { pr_err("SLUB : no slab for object 0x%p.\n", object); @@ -3439,10 +3439,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) - panic("Cannot create slab %s size=%lu realsize=%u " - "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)s->size, s->size, - oo_order(s->oo), s->offset, flags); + panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", + s->name, (unsigned long)s->size, s->size, + oo_order(s->oo), s->offset, flags); return -EINVAL; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index b60802b..d3511f9 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -166,8 +166,8 @@ void __meminit vmemmap_verify(pte_t *pte, int node, int actual_node = early_pfn_to_nid(pfn); if (node_distance(actual_node, node) > LOCAL_DISTANCE) - printk(KERN_WARNING "[%lx-%lx] potential offnode " - "page_structs\n", start, end - 1); + printk(KERN_WARNING "[%lx-%lx] potential offnode page_structs\n", + start, end - 1); } pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) @@ -292,8 +292,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (map_map[pnum]) continue; ms = __nr_to_section(pnum); - printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __func__); + printk(KERN_ERR "%s: sparsemem memory map backing failed some memory will not be available.\n", + __func__); ms->section_mem_map = 0; } diff --git a/mm/sparse.c b/mm/sparse.c index 3717cee..7cdb27d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -428,8 +428,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (map_map[pnum]) continue; ms = __nr_to_section(pnum); - printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __func__); + printk(KERN_ERR "%s: sparsemem memory map backing failed some memory will not be available.\n", + __func__); ms->section_mem_map = 0; } } @@ -456,8 +456,8 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) if (map) return map; - printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __func__); + printk(KERN_ERR "%s: sparsemem memory map backing failed some memory will not be available.\n", + __func__); ms->section_mem_map = 0; return NULL; } diff --git a/mm/swapfile.c b/mm/swapfile.c index d2c3736..b86cf26 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2526,8 +2526,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); - pr_info("Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", + pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? 
"SS" : "", diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d4b2e34..e86c24e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -469,8 +469,8 @@ overflow: goto retry; } if (printk_ratelimit()) - pr_warn("vmap allocation for size %lu failed: " - "use vmalloc= to increase size.\n", size); + pr_warn("vmap allocation for size %lu failed: use vmalloc= to increase size\n", + size); kfree(va); return ERR_PTR(-EBUSY); } -- cgit v1.1 From 1170532bb49f9468aedabdc1d5a560e2521a2bcc Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 17 Mar 2016 14:19:50 -0700 Subject: mm: convert printk(KERN_ to pr_ Most of the mm subsystem uses pr_ so make it consistent. Miscellanea: - Realign arguments - Add missing newline to format - kmemleak-test.c has a "kmemleak: " prefix added to the "Kmemleak testing" logging message via pr_fmt Signed-off-by: Joe Perches Acked-by: Tejun Heo [percpu] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 4 ++-- mm/bootmem.c | 7 +++---- mm/dmapool.c | 12 +++++------- mm/internal.h | 2 +- mm/kmemcheck.c | 2 +- mm/kmemleak-test.c | 2 +- mm/memory-failure.c | 52 +++++++++++++++++++++------------------------------- mm/memory.c | 17 +++++++---------- mm/mm_init.c | 7 +++---- mm/nobootmem.c | 4 ++-- mm/page_alloc.c | 24 +++++++++--------------- mm/page_io.c | 22 +++++++++++----------- mm/percpu-km.c | 6 +++--- mm/percpu.c | 12 ++++++------ mm/shmem.c | 14 ++++++-------- mm/slab.c | 51 ++++++++++++++++++++++++--------------------------- mm/slab_common.c | 2 +- mm/sparse-vmemmap.c | 6 +++--- mm/sparse.c | 17 +++++++---------- mm/swap_cgroup.c | 5 ++--- 20 files changed, 118 insertions(+), 150 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c554d17..bfbd709 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1026,8 +1026,8 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write, if (copy_to_user(buffer, kbuf, sizeof(kbuf))) return -EFAULT; - printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n", - table->procname); + pr_warn_once("%s exported in /proc is scheduled for removal\n", + table->procname); *lenp = 2; *ppos += *lenp; diff --git a/mm/bootmem.c b/mm/bootmem.c index 91e32bc..0aa7dda 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -50,8 +50,7 @@ early_param("bootmem_debug", bootmem_debug_setup); #define bdebug(fmt, args...) ({ \ if (unlikely(bootmem_debug)) \ - printk(KERN_INFO \ - "bootmem::%s " fmt, \ + pr_info("bootmem::%s " fmt, \ __func__, ## args); \ }) @@ -680,7 +679,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, /* * Whoops, we cannot satisfy the allocation request. 
*/ - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } @@ -755,7 +754,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (ptr) return ptr; - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } diff --git a/mm/dmapool.c b/mm/dmapool.c index 2821500..abcbfe8 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -294,8 +294,7 @@ void dma_pool_destroy(struct dma_pool *pool) "dma_pool_destroy %s, %p busy\n", pool->name, page->vaddr); else - printk(KERN_ERR - "dma_pool_destroy %s, %p busy\n", + pr_err("dma_pool_destroy %s, %p busy\n", pool->name, page->vaddr); /* leak the still-in-use consistent memory */ list_del(&page->page_list); @@ -424,7 +423,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) "dma_pool_free %s, %p/%lx (bad dma)\n", pool->name, vaddr, (unsigned long)dma); else - printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n", + pr_err("dma_pool_free %s, %p/%lx (bad dma)\n", pool->name, vaddr, (unsigned long)dma); return; } @@ -438,8 +437,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) "dma_pool_free %s, %p (bad vaddr)/%Lx\n", pool->name, vaddr, (unsigned long long)dma); else - printk(KERN_ERR - "dma_pool_free %s, %p (bad vaddr)/%Lx\n", + pr_err("dma_pool_free %s, %p (bad vaddr)/%Lx\n", pool->name, vaddr, (unsigned long long)dma); return; } @@ -455,8 +453,8 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n", pool->name, (unsigned long long)dma); else - printk(KERN_ERR "dma_pool_free %s, dma %Lx already free\n", - pool->name, (unsigned long long)dma); + pr_err("dma_pool_free %s, dma %Lx already free\n", + pool->name, (unsigned long long)dma); return; } } diff --git a/mm/internal.h b/mm/internal.h index 57d7b0e..7449392 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -386,7 +386,7 @@ extern int mminit_loglevel; do { \ if (level < mminit_loglevel) { \ if (level <= MMINIT_WARNING) \ - printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \ + pr_warn("mminit::" prefix " " fmt, ##arg); \ else \ printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ } \ diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index e5f8333..5bf1917 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -20,7 +20,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); if (!shadow) { if (printk_ratelimit()) - printk(KERN_ERR "kmemcheck: failed to allocate shadow bitmap\n"); + pr_err("kmemcheck: failed to allocate shadow bitmap\n"); return; } diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index dcdcadb..dd3c23a 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -49,7 +49,7 @@ static int __init kmemleak_test_init(void) struct test_node *elem; int i; - printk(KERN_INFO "Kmemleak testing\n"); + pr_info("Kmemleak testing\n"); /* make some orphan objects */ pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 67c30eb..5a544c6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -184,9 +184,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, struct siginfo si; int ret; - printk(KERN_ERR - "MCE %#lx: Killing %s:%d due to hardware memory 
corruption\n", - pfn, t->comm, t->pid); + pr_err("MCE %#lx: Killing %s:%d due to hardware memory corruption\n", + pfn, t->comm, t->pid); si.si_signo = SIGBUS; si.si_errno = 0; si.si_addr = (void *)addr; @@ -209,8 +208,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ } if (ret < 0) - printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", - t->comm, t->pid, ret); + pr_info("MCE: Error sending signal to %s:%d: %d\n", + t->comm, t->pid, ret); return ret; } @@ -290,8 +289,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, } else { tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); if (!tk) { - printk(KERN_ERR - "MCE: Out of memory while machine check handling\n"); + pr_err("MCE: Out of memory while machine check handling\n"); return; } } @@ -336,9 +334,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, * signal and then access the memory. Just kill it. */ if (fail || tk->addr_valid == 0) { - printk(KERN_ERR - "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", - pfn, tk->tsk->comm, tk->tsk->pid); + pr_err("MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", + pfn, tk->tsk->comm, tk->tsk->pid); force_sig(SIGKILL, tk->tsk); } @@ -350,9 +347,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, */ else if (kill_proc(tk->tsk, tk->addr, trapno, pfn, page, flags) < 0) - printk(KERN_ERR - "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", - pfn, tk->tsk->comm, tk->tsk->pid); + pr_err("MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", + pfn, tk->tsk->comm, tk->tsk->pid); } put_task_struct(tk->tsk); kfree(tk); @@ -563,7 +559,7 @@ static int me_kernel(struct page *p, unsigned long pfn) */ static int me_unknown(struct page *p, unsigned long pfn) { - printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); + pr_err("MCE %#lx: Unknown page state\n", pfn); return MF_FAILED; } @@ -608,8 +604,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) if (mapping->a_ops->error_remove_page) { err = mapping->a_ops->error_remove_page(mapping, p); if (err != 0) { - printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", - pfn, err); + pr_info("MCE %#lx: Failed to punch page: %d\n", + pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { pr_info("MCE %#lx: failed to release buffers\n", pfn); @@ -624,8 +620,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) if (invalidate_inode_page(p)) ret = MF_RECOVERED; else - printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", - pfn); + pr_info("MCE %#lx: Failed to invalidate\n", pfn); } return ret; } @@ -854,8 +849,7 @@ static int page_action(struct page_state *ps, struct page *p, if (ps->action == me_swapcache_dirty && result == MF_DELAYED) count--; if (count != 0) { - printk(KERN_ERR - "MCE %#lx: %s still referenced by %d users\n", + pr_err("MCE %#lx: %s still referenced by %d users\n", pfn, action_page_types[ps->type], count); result = MF_FAILED; } @@ -934,8 +928,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } if (PageSwapCache(p)) { - printk(KERN_ERR - "MCE %#lx: keeping poisoned page in swap cache\n", pfn); + pr_err("MCE %#lx: keeping poisoned page in swap cache\n", pfn); ttu |= TTU_IGNORE_HWPOISON; } @@ -953,8 +946,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } else { kill = 0; ttu |= TTU_IGNORE_HWPOISON; 
- printk(KERN_INFO - "MCE %#lx: corrupted page was clean: dropped without side effects\n", + pr_info("MCE %#lx: corrupted page was clean: dropped without side effects\n", pfn); } } @@ -972,8 +964,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) - printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(hpage)); + pr_err("MCE %#lx: failed to unmap page (mapcount=%d)\n", + pfn, page_mapcount(hpage)); /* * Now that the dirty bit has been propagated to the @@ -1040,16 +1032,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags) panic("Memory failure from trap %d on page %lx", trapno, pfn); if (!pfn_valid(pfn)) { - printk(KERN_ERR - "MCE %#lx: memory outside kernel control\n", - pfn); + pr_err("MCE %#lx: memory outside kernel control\n", pfn); return -ENXIO; } p = pfn_to_page(pfn); orig_head = hpage = compound_head(p); if (TestSetPageHWPoison(p)) { - printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); + pr_err("MCE %#lx: already hardware poisoned\n", pfn); return 0; } @@ -1180,7 +1170,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * unpoison always clear PG_hwpoison inside page lock */ if (!PageHWPoison(p)) { - printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); + pr_err("MCE %#lx: just unpoisoned\n", pfn); num_poisoned_pages_sub(nr_pages); unlock_page(hpage); put_hwpoison_page(hpage); diff --git a/mm/memory.c b/mm/memory.c index 1974fc0..ac6bc15 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -660,9 +660,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, return; } if (nr_unshown) { - printk(KERN_ALERT - "BUG: Bad page map: %lu messages suppressed\n", - nr_unshown); + pr_alert("BUG: Bad page map: %lu messages suppressed\n", + nr_unshown); nr_unshown = 0; } nr_shown = 0; @@ -673,15 +672,13 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); - printk(KERN_ALERT - "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", - current->comm, - (long long)pte_val(pte), (long long)pmd_val(*pmd)); + pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", + current->comm, + (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) dump_page(page, "bad pte"); - printk(KERN_ALERT - "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", - (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); + pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", + (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); /* * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y */ diff --git a/mm/mm_init.c b/mm/mm_init.c index fdadf91..5b72266 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -55,13 +55,12 @@ void __init mminit_verify_zonelist(void) /* Iterate the zonelist */ for_each_zone_zonelist(zone, z, zonelist, zoneid) { #ifdef CONFIG_NUMA - printk(KERN_CONT "%d:%s ", - zone->node, zone->name); + pr_cont("%d:%s ", zone->node, zone->name); #else - printk(KERN_CONT "0:%s ", zone->name); + pr_cont("0:%s ", zone->name); #endif /* CONFIG_NUMA */ } - printk(KERN_CONT "\n"); + pr_cont("\n"); } } } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 99feb2b..bd05a70 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -288,7 +288,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, /* * Whoops, we cannot satisfy the allocation request. 
*/ - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } @@ -360,7 +360,7 @@ static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (ptr) return ptr; - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 42cf199..2a9eaec 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -544,11 +544,11 @@ static int __init debug_guardpage_minorder_setup(char *buf) unsigned long res; if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { - printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); + pr_err("Bad debug_guardpage_minorder value\n"); return 0; } _debug_guardpage_minorder = res; - printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); + pr_info("Setting debug_guardpage_minorder to %lu\n", res); return 0; } __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); @@ -4073,8 +4073,7 @@ static int __parse_numa_zonelist_order(char *s) } else if (*s == 'z' || *s == 'Z') { user_zonelist_order = ZONELIST_ORDER_ZONE; } else { - printk(KERN_WARNING - "Ignoring invalid numa_zonelist_order value: %s\n", s); + pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); return -EINVAL; } return 0; @@ -5458,8 +5457,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) " %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else - printk(KERN_WARNING - " %s zone: %lu pages exceeds freesize %lu\n", + pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", zone_names[j], memmap_pages, freesize); } @@ -5667,8 +5665,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) min_pfn = min(min_pfn, start_pfn); if (min_pfn == ULONG_MAX) { - printk(KERN_WARNING - "Could not find start_pfn for node %d\n", nid); + pr_warn("Could not find start_pfn for node %d\n", nid); return 0; } @@ -6686,11 +6683,8 @@ void *__init alloc_large_system_hash(const char *tablename, if (!table) panic("Failed to allocate %s hash table\n", tablename); - printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", - tablename, - (1UL << log2qty), - ilog2(size) - PAGE_SHIFT, - size); + pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n", + tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size); if (_hash_shift) *_hash_shift = log2qty; @@ -7191,8 +7185,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(!PageBuddy(page)); order = page_order(page); #ifdef CONFIG_DEBUG_VM - printk(KERN_INFO "remove from free list %lx %d %lx\n", - pfn, 1 << order, end_pfn); + pr_info("remove from free list %lx %d %lx\n", + pfn, 1 << order, end_pfn); #endif list_del(&page->lru); rmv_page_order(page); diff --git a/mm/page_io.c b/mm/page_io.c index b995a5b..ff74e51 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -56,10 +56,10 @@ void end_swap_bio_write(struct bio *bio) * Also clear PG_reclaim to avoid rotate_reclaimable_page() */ set_page_dirty(page); - printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_iter.bi_sector); + pr_alert("Write-error on swap-device (%u:%u:%llu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_iter.bi_sector); ClearPageReclaim(page); } 
end_page_writeback(page); @@ -73,10 +73,10 @@ static void end_swap_bio_read(struct bio *bio) if (bio->bi_error) { SetPageError(page); ClearPageUptodate(page); - printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_iter.bi_sector); + pr_alert("Read-error on swap-device (%u:%u:%llu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_iter.bi_sector); goto out; } @@ -216,7 +216,7 @@ reprobe: out: return ret; bad_bmap: - printk(KERN_ERR "swapon: swapfile has holes\n"); + pr_err("swapon: swapfile has holes\n"); ret = -EINVAL; goto out; } @@ -290,8 +290,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, */ set_page_dirty(page); ClearPageReclaim(page); - pr_err_ratelimited("Write error on dio swapfile (%Lu)\n", - page_file_offset(page)); + pr_err_ratelimited("Write error on dio swapfile (%llu)\n", + page_file_offset(page)); } end_page_writeback(page); return ret; diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 10e3d0b..0db94b7 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -95,7 +95,7 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) /* all units must be in a single group */ if (ai->nr_groups != 1) { - printk(KERN_CRIT "percpu: can't handle more than one groups\n"); + pr_crit("percpu: can't handle more than one groups\n"); return -EINVAL; } @@ -103,8 +103,8 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) alloc_pages = roundup_pow_of_two(nr_pages); if (alloc_pages > nr_pages) - printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n", - alloc_pages - nr_pages); + pr_warn("percpu: wasting %zu pages per chunk\n", + alloc_pages - nr_pages); return 0; } diff --git a/mm/percpu.c b/mm/percpu.c index 1571547..c987fd4 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1449,20 +1449,20 @@ static void pcpu_dump_alloc_info(const char *lvl, for (alloc_end += gi->nr_units / upa; alloc < alloc_end; alloc++) { if (!(alloc % apl)) { - printk(KERN_CONT "\n"); + pr_cont("\n"); printk("%spcpu-alloc: ", lvl); } - printk(KERN_CONT "[%0*d] ", group_width, group); + pr_cont("[%0*d] ", group_width, group); for (unit_end += upa; unit < unit_end; unit++) if (gi->cpu_map[unit] != NR_CPUS) - printk(KERN_CONT "%0*d ", cpu_width, - gi->cpu_map[unit]); + pr_cont("%0*d ", + cpu_width, gi->cpu_map[unit]); else - printk(KERN_CONT "%s ", empty_str); + pr_cont("%s ", empty_str); } } - printk(KERN_CONT "\n"); + pr_cont("\n"); } /** diff --git a/mm/shmem.c b/mm/shmem.c index 1acfdbc..c484f68 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2823,9 +2823,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if ((value = strchr(this_char,'=')) != NULL) { *value++ = 0; } else { - printk(KERN_ERR - "tmpfs: No value for mount option '%s'\n", - this_char); + pr_err("tmpfs: No value for mount option '%s'\n", + this_char); goto error; } @@ -2880,8 +2879,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if (mpol_parse_str(value, &mpol)) goto bad_val; } else { - printk(KERN_ERR "tmpfs: Bad mount option %s\n", - this_char); + pr_err("tmpfs: Bad mount option %s\n", this_char); goto error; } } @@ -2889,7 +2887,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, return 0; bad_val: - printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", + pr_err("tmpfs: Bad value '%s' for mount option '%s'\n", value, this_char); 
error: mpol_put(mpol); @@ -3286,14 +3284,14 @@ int __init shmem_init(void) error = register_filesystem(&shmem_fs_type); if (error) { - printk(KERN_ERR "Could not register tmpfs\n"); + pr_err("Could not register tmpfs\n"); goto out2; } shm_mnt = kern_mount(&shmem_fs_type); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); - printk(KERN_ERR "Could not kern_mount tmpfs\n"); + pr_err("Could not kern_mount tmpfs\n"); goto out1; } return 0; diff --git a/mm/slab.c b/mm/slab.c index e558f85..e719a5c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -474,7 +474,7 @@ static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) { - printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", + pr_err("slab error in %s(): cache `%s': %s\n", function, cachep->name, msg); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); @@ -1553,7 +1553,7 @@ static void dump_line(char *data, int offset, int limit) unsigned char error = 0; int bad_count = 0; - printk(KERN_ERR "%03x: ", offset); + pr_err("%03x: ", offset); for (i = 0; i < limit; i++) { if (data[offset + i] != POISON_FREE) { error = data[offset + i]; @@ -1566,11 +1566,11 @@ static void dump_line(char *data, int offset, int limit) if (bad_count == 1) { error ^= POISON_FREE; if (!(error & (error - 1))) { - printk(KERN_ERR "Single bit error detected. Probably bad RAM.\n"); + pr_err("Single bit error detected. Probably bad RAM.\n"); #ifdef CONFIG_X86 - printk(KERN_ERR "Run memtest86+ or a similar memory test tool.\n"); + pr_err("Run memtest86+ or a similar memory test tool.\n"); #else - printk(KERN_ERR "Run a memory test tool.\n"); + pr_err("Run a memory test tool.\n"); #endif } } @@ -1585,13 +1585,13 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) char *realobj; if (cachep->flags & SLAB_RED_ZONE) { - printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", - *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + pr_err("Redzone: 0x%llx/0x%llx\n", + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } if (cachep->flags & SLAB_STORE_USER) { - printk(KERN_ERR "Last user: [<%p>](%pSR)\n", + pr_err("Last user: [<%p>](%pSR)\n", *dbg_userword(cachep, objp), *dbg_userword(cachep, objp)); } @@ -1627,9 +1627,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Mismatch ! 
*/ /* Print header */ if (lines == 0) { - printk(KERN_ERR - "Slab corruption (%s): %s start=%p, len=%d\n", - print_tainted(), cachep->name, realobj, size); + pr_err("Slab corruption (%s): %s start=%p, len=%d\n", + print_tainted(), cachep->name, + realobj, size); print_objinfo(cachep, objp, 0); } /* Hexdump the affected line */ @@ -1656,15 +1656,13 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) if (objnr) { objp = index_to_obj(cachep, page, objnr - 1); realobj = (char *)objp + obj_offset(cachep); - printk(KERN_ERR "Prev obj: start=%p, len=%d\n", - realobj, size); + pr_err("Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { objp = index_to_obj(cachep, page, objnr + 1); realobj = (char *)objp + obj_offset(cachep); - printk(KERN_ERR "Next obj: start=%p, len=%d\n", - realobj, size); + pr_err("Next obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } } @@ -2463,7 +2461,7 @@ static void slab_put_obj(struct kmem_cache *cachep, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { - printk(KERN_ERR "slab: double free detected in cache '%s', objp %p\n", + pr_err("slab: double free detected in cache '%s', objp %p\n", cachep->name, objp); BUG(); } @@ -2583,7 +2581,7 @@ failed: static void kfree_debugcheck(const void *objp) { if (!virt_addr_valid(objp)) { - printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", + pr_err("kfree_debugcheck: out of range ptr %lxh\n", (unsigned long)objp); BUG(); } @@ -2607,8 +2605,8 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) else slab_error(cache, "memory outside object was overwritten"); - printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", - obj, redzone1, redzone2); + pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + obj, redzone1, redzone2); } static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, @@ -2896,10 +2894,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { slab_error(cachep, "double free, or memory outside object was overwritten"); - printk(KERN_ERR - "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", - objp, *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; @@ -2910,7 +2907,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, cachep->ctor(objp); if (ARCH_SLAB_MINALIGN && ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { - printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", + pr_err("0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", objp, (int)ARCH_SLAB_MINALIGN); } return objp; @@ -3837,7 +3834,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) skip_setup: err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); if (err) - printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", + pr_err("enable_cpucache failed for %s, error %d\n", cachep->name, -err); return err; } @@ -3993,7 +3990,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) name = cachep->name; if (error) - printk(KERN_ERR "slab: cache %s error: %s\n", name, error); + pr_err("slab: cache %s error: %s\n", name, error); sinfo->active_objs = active_objs; 
sinfo->num_objs = num_objs; diff --git a/mm/slab_common.c b/mm/slab_common.c index e885e11..b2e3796 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -442,7 +442,7 @@ out_unlock: panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", name, err); else { - printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d", + pr_warn("kmem_cache_create(%s) failed with error %d\n", name, err); dump_stack(); } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index d3511f9..68885dc 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -166,8 +166,8 @@ void __meminit vmemmap_verify(pte_t *pte, int node, int actual_node = early_pfn_to_nid(pfn); if (node_distance(actual_node, node) > LOCAL_DISTANCE) - printk(KERN_WARNING "[%lx-%lx] potential offnode page_structs\n", - start, end - 1); + pr_warn("[%lx-%lx] potential offnode page_structs\n", + start, end - 1); } pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) @@ -292,7 +292,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (map_map[pnum]) continue; ms = __nr_to_section(pnum); - printk(KERN_ERR "%s: sparsemem memory map backing failed some memory will not be available.\n", + pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); ms->section_mem_map = 0; } diff --git a/mm/sparse.c b/mm/sparse.c index 7cdb27d..5d0cf45 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -313,9 +313,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr)); if (usemap_nid != nid) { - printk(KERN_INFO - "node %d must be removed before remove section %ld\n", - nid, usemap_snr); + pr_info("node %d must be removed before remove section %ld\n", + nid, usemap_snr); return; } /* @@ -324,10 +323,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) * gather other removable sections for dynamic partitioning. * Just notify un-removable section's number here. 
*/ - printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr, - pgdat_snr, nid); - printk(KERN_CONT - " have a circular dependency on usemap and pgdat allocations\n"); + pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n", + usemap_snr, pgdat_snr, nid); } #else static unsigned long * __init @@ -355,7 +352,7 @@ static void __init sparse_early_usemaps_alloc_node(void *data, usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), size * usemap_count); if (!usemap) { - printk(KERN_WARNING "%s: allocation failed\n", __func__); + pr_warn("%s: allocation failed\n", __func__); return; } @@ -428,7 +425,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (map_map[pnum]) continue; ms = __nr_to_section(pnum); - printk(KERN_ERR "%s: sparsemem memory map backing failed some memory will not be available.\n", + pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); ms->section_mem_map = 0; } @@ -456,7 +453,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) if (map) return map; - printk(KERN_ERR "%s: sparsemem memory map backing failed some memory will not be available.\n", + pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); ms->section_mem_map = 0; return NULL; diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index b5f7f24..310ac0b 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -174,9 +174,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) return 0; nomem: - printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); - printk(KERN_INFO - "swap_cgroup can be disabled by swapaccount=0 boot option\n"); + pr_info("couldn't allocate enough memory for swap_cgroup\n"); + pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n"); return -ENOMEM; } -- cgit v1.1 From 870d4b12ad15d21c5db67b373bdc2f62cfe2ec64 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 17 Mar 2016 14:19:53 -0700 Subject: mm: percpu: use pr_fmt to prefix output Use the normal mechanism to make the logging output consistently "percpu:" instead of a mix of "PERCPU:" and "percpu:" Signed-off-by: Joe Perches Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/percpu-km.c | 4 ++-- mm/percpu.c | 20 +++++++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 0db94b7..d66911f 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -95,7 +95,7 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) /* all units must be in a single group */ if (ai->nr_groups != 1) { - pr_crit("percpu: can't handle more than one groups\n"); + pr_crit("can't handle more than one group\n"); return -EINVAL; } @@ -103,7 +103,7 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) alloc_pages = roundup_pow_of_two(nr_pages); if (alloc_pages > nr_pages) - pr_warn("percpu: wasting %zu pages per chunk\n", + pr_warn("wasting %zu pages per chunk\n", alloc_pages - nr_pages); return 0; diff --git a/mm/percpu.c b/mm/percpu.c index c987fd4..0c59684 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -53,6 +53,8 @@ * setup the first chunk containing the kernel static percpu area */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -1033,11 +1035,11 @@ fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: if (!is_atomic && warn_limit) { - pr_warn("PERCPU: 
allocation failed, size=%zu align=%zu atomic=%d, %s\n", + pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", size, align, is_atomic, err); dump_stack(); if (!--warn_limit) - pr_info("PERCPU: limit reached, disable warning\n"); + pr_info("limit reached, disable warning\n"); } if (is_atomic) { /* see the flag handling in pcpu_blance_workfn() */ @@ -1538,8 +1540,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ - pr_emerg("PERCPU: failed to initialize, %s", #cond); \ - pr_emerg("PERCPU: cpu_possible_mask=%*pb\n", \ + pr_emerg("failed to initialize, %s\n", #cond); \ + pr_emerg("cpu_possible_mask=%*pb\n", \ cpumask_pr_args(cpu_possible_mask)); \ pcpu_dump_alloc_info(KERN_EMERG, ai); \ BUG(); \ @@ -1723,7 +1725,7 @@ static int __init percpu_alloc_setup(char *str) pcpu_chosen_fc = PCPU_FC_PAGE; #endif else - pr_warn("PERCPU: unknown allocator %s specified\n", str); + pr_warn("unknown allocator %s specified\n", str); return 0; } @@ -2016,7 +2018,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, /* warn if maximum distance is further than 75% of vmalloc space */ if (max_distance > VMALLOC_TOTAL * 3 / 4) { - pr_warn("PERCPU: max_distance=0x%zx too large for vmalloc space 0x%lx\n", + pr_warn("max_distance=0x%zx too large for vmalloc space 0x%lx\n", max_distance, VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ @@ -2025,7 +2027,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, #endif } - pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", + pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); @@ -2099,7 +2101,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { - pr_warn("PERCPU: failed to allocate %s page for cpu%u\n", + pr_warn("failed to allocate %s page for cpu%u\n", psize_str, cpu); goto enomem; } @@ -2139,7 +2141,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, } /* we're ready, commit */ - pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", + pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n", unit_pages, psize_str, vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size); -- cgit v1.1 From 99490f16f8b287a028306e4092cc85393907075f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 17 Mar 2016 14:19:58 -0700 Subject: mm: ZONE_DEVICE depends on SPARSEMEM_VMEMMAP The primary use case for devm_memremap_pages() is to allocate a memmap array from persistent memory. That capability requires vmem_altmap which requires SPARSEMEM_VMEMMAP. Also, without SPARSEMEM_VMEMMAP the addition of ZONE_DEVICE expands ZONES_WIDTH and triggers the: "Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid." ...warning in mm/memory.c. SPARSEMEM_VMEMMAP=n && ZONE_DEVICE=y is not a configuration we should worry about supporting. Signed-off-by: Dan Williams Reported-by: Vlastimil Babka Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 520dae6..05efa6a5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -653,6 +653,7 @@ config ZONE_DEVICE bool "Device memory (pmem, etc...) 
hotplug support" if EXPERT depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE + depends on SPARSEMEM_VMEMMAP depends on X86_64 #arch_add_memory() comprehends device memory help -- cgit v1.1 From b97731992d00f09456726bfc5ab6641c07773038 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:01 -0700 Subject: rmap: introduce rmap_walk_locked() This patchset rewrites freeze_page() and unfreeze_page() using try_to_unmap() and remove_migration_ptes(). Result is much simpler, but somewhat slower. Migration 8GiB worth of PMD-mapped THP: Baseline 20.21 +/- 0.393 Patched 20.73 +/- 0.082 Slowdown 1.03x It's 3% slower, comparing to 14% in v1. I don't it should be a stopper. Splitting of PTE-mapped pages slowed more. But this is not a common case. Migration 8GiB worth of PMD-mapped THP: Baseline 20.39 +/- 0.225 Patched 22.43 +/- 0.496 Slowdown 1.10x rmap_walk_locked() is the same as rmap_walk(), but the caller takes care of the relevant rmap lock. This is preparation for switching THP splitting from custom rmap walk in freeze_page()/unfreeze_page() to the generic one. There is no support for KSM pages for now: not clear which lock is implied. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 02f0bfc..30b739c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1715,14 +1715,21 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) +static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, + bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - anon_vma = rmap_walk_anon_lock(page, rwc); + if (locked) { + anon_vma = page_anon_vma(page); + /* anon_vma disappear under us? */ + VM_BUG_ON_PAGE(!anon_vma, page); + } else { + anon_vma = rmap_walk_anon_lock(page, rwc); + } if (!anon_vma) return ret; @@ -1742,7 +1749,9 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) if (rwc->done && rwc->done(page)) break; } - anon_vma_unlock_read(anon_vma); + + if (!locked) + anon_vma_unlock_read(anon_vma); return ret; } @@ -1759,9 +1768,10 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. 
*/ -static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) +static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, + bool locked) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); pgoff_t pgoff; struct vm_area_struct *vma; int ret = SWAP_AGAIN; @@ -1778,7 +1788,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) return ret; pgoff = page_to_pgoff(page); - i_mmap_lock_read(mapping); + if (!locked) + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); @@ -1795,7 +1806,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) } done: - i_mmap_unlock_read(mapping); + if (!locked) + i_mmap_unlock_read(mapping); return ret; } @@ -1804,9 +1816,20 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc) if (unlikely(PageKsm(page))) return rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rwc); + return rmap_walk_anon(page, rwc, false); + else + return rmap_walk_file(page, rwc, false); +} + +/* Like rmap_walk, but caller holds relevant rmap lock */ +int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) +{ + /* no ksm support for now */ + VM_BUG_ON_PAGE(PageKsm(page), page); + if (PageAnon(page)) + return rmap_walk_anon(page, rwc, true); else - return rmap_walk_file(page, rwc); + return rmap_walk_file(page, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.1 From 2a52bcbcc688eecead2953143f7ef695b8e44575 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:04 -0700 Subject: rmap: extend try_to_unmap() to be usable by split_huge_page() Add support for two ttu_flags: - TTU_SPLIT_HUGE_PMD would split PMD if it's there, before trying to unmap page; - TTU_RMAP_LOCKED indicates that caller holds relevant rmap lock; Also, change rwc->done to !page_mapcount() instead of !page_mapped(). try_to_unmap() works on pte level, so we are really interested in the mappedness of this small page rather than of the compound page it's a part of. Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 +---- mm/rmap.c | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e1a177c..11d1567 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3006,15 +3006,12 @@ out: } } -static void split_huge_pmd_address(struct vm_area_struct *vma, - unsigned long address) +void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pgd = pgd_offset(vma->vm_mm, address); if (!pgd_present(*pgd)) return; diff --git a/mm/rmap.c b/mm/rmap.c index 30b739c..945933a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1431,6 +1431,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) goto out; + if (flags & TTU_SPLIT_HUGE_PMD) + split_huge_pmd_address(vma, address); pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -1576,10 +1578,10 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return is_vma_temporary_stack(vma); } -static int page_not_mapped(struct page *page) +static int page_mapcount_is_zero(struct page *page) { - return !page_mapped(page); -}; + return !page_mapcount(page); +} /** * try_to_unmap - try to remove all page table mappings to a page @@ -1606,12 +1608,10 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = &rp, - .done = page_not_mapped, + .done = page_mapcount_is_zero, .anon_lock = page_lock_anon_vma_read, }; - VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page); - /* * During exec, a temporary VMA is setup and later moved. * The VMA is moved under the anon_vma lock but not the @@ -1623,9 +1623,12 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) rwc.invalid_vma = invalid_migration_vma; - ret = rmap_walk(page, &rwc); + if (flags & TTU_RMAP_LOCKED) + ret = rmap_walk_locked(page, &rwc); + else + ret = rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapped(page)) { + if (ret != SWAP_MLOCK && !page_mapcount(page)) { ret = SWAP_SUCCESS; if (rp.lazyfreed && !PageDirty(page)) ret = SWAP_LZFREE; @@ -1633,6 +1636,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) return ret; } +static int page_not_mapped(struct page *page) +{ + return !page_mapped(page); +}; + /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked -- cgit v1.1 From e388466de4a2a1a50c43bfaeacc0c8254d9e7cb2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:07 -0700 Subject: mm: make remove_migration_ptes() beyond mm/migration.c Make remove_migration_ptes() available to be used in split_huge_page(). New parameter 'locked' added: as with try_to_unmap() we need a way to indicate that the caller holds the rmap lock. We also shouldn't try to mlock() pte-mapped huge pages: pte-mapped THP pages are never mlocked. Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 577c94b..6c822a7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -172,7 +172,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, else page_add_file_rmap(new); - if (vma->vm_flags & VM_LOCKED) + if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); /* No need to invalidate - it was non-present before */ @@ -187,14 +187,17 @@ out: * Get rid of all migration entries and replace them by * references to the indicated page. */ -static void remove_migration_ptes(struct page *old, struct page *new) +void remove_migration_ptes(struct page *old, struct page *new, bool locked) { struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, }; - rmap_walk(new, &rwc); + if (locked) + rmap_walk_locked(new, &rwc); + else + rmap_walk(new, &rwc); } /* @@ -702,7 +705,7 @@ static int writeout(struct address_space *mapping, struct page *page) * At this point we know that the migration attempt cannot * be successful. */ - remove_migration_ptes(page, page); + remove_migration_ptes(page, page, false); rc = mapping->a_ops->writepage(page, &wbc); @@ -900,7 +903,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (page_was_mapped) remove_migration_ptes(page, - rc == MIGRATEPAGE_SUCCESS ? newpage : page); + rc == MIGRATEPAGE_SUCCESS ? newpage : page, false); out_unlock_both: unlock_page(newpage); @@ -1070,7 +1073,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (page_was_mapped) remove_migration_ptes(hpage, - rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage); + rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); unlock_page(new_hpage); -- cgit v1.1 From fec89c109f3a7737fe3a7bf0f40d1fb7709d353b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:10 -0700 Subject: thp: rewrite freeze_page()/unfreeze_page() with generic rmap walkers freeze_page() and unfreeze_page() helpers evolved in rather complex beasts. It would be nice to cut complexity of this code. This patch rewrites freeze_page() using standard try_to_unmap(). unfreeze_page() is rewritten with remove_migration_ptes(). The result is much simpler. But the new variant is somewhat slower for PTE-mapped THPs. Current helpers iterates over VMAs the compound page is mapped to, and then over ptes within this VMA. New helpers iterates over small page, then over VMA the small page mapped to, and only then find relevant pte. We have short cut for PMD-mapped THP: we directly install migration entries on PMD split. I don't think the slowdown is critical, considering how much simpler result is and that split_huge_page() is quite rare nowadays. It only happens due memory pressure or migration. Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 210 ++++++++++--------------------------------------------- mm/rmap.c | 10 ++- 2 files changed, 44 insertions(+), 176 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 11d1567..4a58fa1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2977,7 +2977,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address) + unsigned long address, bool freeze) { spinlock_t *ptl; struct mm_struct *mm = vma->vm_mm; @@ -2994,7 +2994,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, page = NULL; } else if (!pmd_devmap(*pmd)) goto out; - __split_huge_pmd_locked(vma, pmd, haddr, false); + __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); @@ -3006,7 +3006,8 @@ out: } } -void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address) +void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, + bool freeze, struct page *page) { pgd_t *pgd; pud_t *pud; @@ -3023,11 +3024,20 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address) pmd = pmd_offset(pud, address); if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd))) return; + + /* + * If caller asks to setup a migration entries, we need a page to check + * pmd against. Otherwise we can end up replacing wrong page. + */ + VM_BUG_ON(freeze && !page); + if (page && page != pmd_page(*pmd)) + return; + /* * Caller holds the mmap_sem write mode, so a huge pmd cannot * materialize from under us. 
*/ - split_huge_pmd(vma, pmd, address); + __split_huge_pmd(vma, pmd, address, freeze); } void vma_adjust_trans_huge(struct vm_area_struct *vma, @@ -3043,7 +3053,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (start & ~HPAGE_PMD_MASK && (start & HPAGE_PMD_MASK) >= vma->vm_start && (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, start); + split_huge_pmd_address(vma, start, false, NULL); /* * If the new end address isn't hpage aligned and it could @@ -3053,7 +3063,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (end & ~HPAGE_PMD_MASK && (end & HPAGE_PMD_MASK) >= vma->vm_start && (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, end); + split_huge_pmd_address(vma, end, false, NULL); /* * If we're also updating the vma->vm_next->vm_start, if the new @@ -3067,184 +3077,36 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (nstart & ~HPAGE_PMD_MASK && (nstart & HPAGE_PMD_MASK) >= next->vm_start && (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) - split_huge_pmd_address(next, nstart); + split_huge_pmd_address(next, nstart, false, NULL); } } -static void freeze_page_vma(struct vm_area_struct *vma, struct page *page, - unsigned long address) +static void freeze_page(struct page *page) { - unsigned long haddr = address & HPAGE_PMD_MASK; - spinlock_t *ptl; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int i, nr = HPAGE_PMD_NR; - - /* Skip pages which doesn't belong to the VMA */ - if (address < vma->vm_start) { - int off = (vma->vm_start - address) >> PAGE_SHIFT; - page += off; - nr -= off; - address = vma->vm_start; - } - - pgd = pgd_offset(vma->vm_mm, address); - if (!pgd_present(*pgd)) - return; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return; - pmd = pmd_offset(pud, address); - ptl = pmd_lock(vma->vm_mm, pmd); - if (!pmd_present(*pmd)) { - spin_unlock(ptl); - return; - } - if (pmd_trans_huge(*pmd)) { - if (page == pmd_page(*pmd)) - __split_huge_pmd_locked(vma, pmd, haddr, true); - spin_unlock(ptl); - return; - } - spin_unlock(ptl); - - pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); - for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { - pte_t entry, swp_pte; - swp_entry_t swp_entry; - - /* - * We've just crossed page table boundary: need to map next one. - * It can happen if THP was mremaped to non PMD-aligned address. 
- */ - if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { - pte_unmap_unlock(pte - 1, ptl); - pmd = mm_find_pmd(vma->vm_mm, address); - if (!pmd) - return; - pte = pte_offset_map_lock(vma->vm_mm, pmd, - address, &ptl); - } - - if (!pte_present(*pte)) - continue; - if (page_to_pfn(page) != pte_pfn(*pte)) - continue; - flush_cache_page(vma, address, page_to_pfn(page)); - entry = ptep_clear_flush(vma, address, pte); - if (pte_dirty(entry)) - SetPageDirty(page); - swp_entry = make_migration_entry(page, pte_write(entry)); - swp_pte = swp_entry_to_pte(swp_entry); - if (pte_soft_dirty(entry)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - set_pte_at(vma->vm_mm, address, pte, swp_pte); - page_remove_rmap(page, false); - put_page(page); - } - pte_unmap_unlock(pte - 1, ptl); -} - -static void freeze_page(struct anon_vma *anon_vma, struct page *page) -{ - struct anon_vma_chain *avc; - pgoff_t pgoff = page_to_pgoff(page); + enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | + TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED; + int i, ret; VM_BUG_ON_PAGE(!PageHead(page), page); - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, - pgoff + HPAGE_PMD_NR - 1) { - unsigned long address = __vma_address(page, avc->vma); - - mmu_notifier_invalidate_range_start(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - freeze_page_vma(avc->vma, page, address); - mmu_notifier_invalidate_range_end(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - } -} - -static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page, - unsigned long address) -{ - spinlock_t *ptl; - pmd_t *pmd; - pte_t *pte, entry; - swp_entry_t swp_entry; - unsigned long haddr = address & HPAGE_PMD_MASK; - int i, nr = HPAGE_PMD_NR; - - /* Skip pages which doesn't belong to the VMA */ - if (address < vma->vm_start) { - int off = (vma->vm_start - address) >> PAGE_SHIFT; - page += off; - nr -= off; - address = vma->vm_start; - } - - pmd = mm_find_pmd(vma->vm_mm, address); - if (!pmd) - return; - - pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); - for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { - /* - * We've just crossed page table boundary: need to map next one. - * It can happen if THP was mremaped to non-PMD aligned address. 
- */ - if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { - pte_unmap_unlock(pte - 1, ptl); - pmd = mm_find_pmd(vma->vm_mm, address); - if (!pmd) - return; - pte = pte_offset_map_lock(vma->vm_mm, pmd, - address, &ptl); - } - - if (!is_swap_pte(*pte)) - continue; - - swp_entry = pte_to_swp_entry(*pte); - if (!is_migration_entry(swp_entry)) - continue; - if (migration_entry_to_page(swp_entry) != page) - continue; - - get_page(page); - page_add_anon_rmap(page, vma, address, false); - - entry = pte_mkold(mk_pte(page, vma->vm_page_prot)); - if (PageDirty(page)) - entry = pte_mkdirty(entry); - if (is_write_migration_entry(swp_entry)) - entry = maybe_mkwrite(entry, vma); - - flush_dcache_page(page); - set_pte_at(vma->vm_mm, address, pte, entry); + /* We only need TTU_SPLIT_HUGE_PMD once */ + ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD); + for (i = 1; !ret && i < HPAGE_PMD_NR; i++) { + /* Cut short if the page is unmapped */ + if (page_count(page) == 1) + return; - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, pte); + ret = try_to_unmap(page + i, ttu_flags); } - pte_unmap_unlock(pte - 1, ptl); + VM_BUG_ON(ret); } -static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) +static void unfreeze_page(struct page *page) { - struct anon_vma_chain *avc; - pgoff_t pgoff = page_to_pgoff(page); - - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, - pgoff, pgoff + HPAGE_PMD_NR - 1) { - unsigned long address = __vma_address(page, avc->vma); + int i; - mmu_notifier_invalidate_range_start(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - unfreeze_page_vma(avc->vma, page, address); - mmu_notifier_invalidate_range_end(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - } + for (i = 0; i < HPAGE_PMD_NR; i++) + remove_migration_ptes(page + i, page + i, true); } static void __split_huge_page_tail(struct page *head, int tail, @@ -3322,7 +3184,7 @@ static void __split_huge_page(struct page *page, struct list_head *list) ClearPageCompound(head); spin_unlock_irq(&zone->lru_lock); - unfreeze_page(page_anon_vma(head), head); + unfreeze_page(head); for (i = 0; i < HPAGE_PMD_NR; i++) { struct page *subpage = head + i; @@ -3418,7 +3280,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } mlocked = PageMlocked(page); - freeze_page(anon_vma, head); + freeze_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head); /* Make sure the page is not on per-CPU pagevec as it takes pin */ @@ -3447,7 +3309,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) BUG(); } else { spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); - unfreeze_page(anon_vma, head); + unfreeze_page(head); ret = -EBUSY; } diff --git a/mm/rmap.c b/mm/rmap.c index 945933a..c399a0d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1431,8 +1431,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) goto out; - if (flags & TTU_SPLIT_HUGE_PMD) - split_huge_pmd_address(vma, address); + if (flags & TTU_SPLIT_HUGE_PMD) { + split_huge_pmd_address(vma, address, + flags & TTU_MIGRATION, page); + /* check if we have anything to do after split */ + if (page_mapcount(page) == 0) + goto out; + } + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; -- cgit v1.1 From 5f7377147ce0b5d521c0ee2aab2ec3ead51297db Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Thu, 17 Mar 2016 14:20:13 -0700 Subject: thp: fix deadlock in split_huge_pmd() split_huge_pmd() tries to munlock page with munlock_vma_page(). That requires the page to locked. If the is locked by caller, we would get a deadlock: Unable to find swap-space signature INFO: task trinity-c85:1907 blocked for more than 120 seconds. Not tainted 4.4.0-00032-gf19d0bdced41-dirty #1606 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. trinity-c85 D ffff88084d997608 0 1907 309 0x00000000 Call Trace: schedule+0x9f/0x1c0 schedule_timeout+0x48e/0x600 io_schedule_timeout+0x1c3/0x390 bit_wait_io+0x29/0xd0 __wait_on_bit_lock+0x94/0x140 __lock_page+0x1d4/0x280 __split_huge_pmd+0x5a8/0x10f0 split_huge_pmd_address+0x1d9/0x230 try_to_unmap_one+0x540/0xc70 rmap_walk_anon+0x284/0x810 rmap_walk_locked+0x11e/0x190 try_to_unmap+0x1b1/0x4b0 split_huge_page_to_list+0x49d/0x18a0 follow_page_mask+0xa36/0xea0 SyS_move_pages+0xaf3/0x1570 entry_SYSCALL_64_fastpath+0x12/0x6b 2 locks held by trinity-c85/1907: #0: (&mm->mmap_sem){++++++}, at: SyS_move_pages+0x933/0x1570 #1: (&anon_vma->rwsem){++++..}, at: split_huge_page_to_list+0x402/0x18a0 I don't think the deadlock is triggerable without split_huge_page() simplifilcation patchset. But munlock_vma_page() here is wrong: we want to munlock the page unconditionally, no need in rmap lookup, that munlock_vma_page() does. Let's use clear_page_mlock() instead. It can be called under ptl. Fixes: e90309c9f772 ("thp: allow mlocked THP again") Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4a58fa1..021db17 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2981,29 +2981,20 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { spinlock_t *ptl; struct mm_struct *mm = vma->vm_mm; - struct page *page = NULL; unsigned long haddr = address & HPAGE_PMD_MASK; mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); ptl = pmd_lock(mm, pmd); if (pmd_trans_huge(*pmd)) { - page = pmd_page(*pmd); + struct page *page = pmd_page(*pmd); if (PageMlocked(page)) - get_page(page); - else - page = NULL; + clear_page_mlock(page); } else if (!pmd_devmap(*pmd)) goto out; __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); - if (page) { - lock_page(page); - munlock_vma_page(page); - unlock_page(page); - put_page(page); - } } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, -- cgit v1.1 From 987b3095c2a76ea1b90438f5ace6f9cee354a47d Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 17 Mar 2016 14:20:16 -0700 Subject: mm: meminit: initialise more memory for inode/dentry hash tables in early boot Upstream has supported page parallel initialisation for X86 and the boot time is improved greately. Some tests have been done for Power. Here is the result I have done with different memory size. * 4GB memory: boot time is as the following: with patch vs without patch: 10.4s vs 24.5s boot time is improved 57% * 200GB memory: boot time looks the same with and without patches. boot time is about 38s * 32TB memory: boot time looks the same with and without patches boot time is about 160s. The boot time is much shorter than X86 with 24TB memory. From community discussion, it costs about 694s for X86 24T system. 
Parallel initialisation improves the performance by deferring memory initialisation to kswap with N kthreads; theoretically it should improve performance. In testing on X86, performance is improved greatly with huge memory. But on the Power platform, it is improved greatly with less than 100GB memory. For huge memory, it is not improved greatly. But it saves time with several threads at least, as the following information shows (32TB system log): [ 22.648169] node 9 initialised, 16607461 pages in 280ms [ 22.783772] node 3 initialised, 23937243 pages in 410ms [ 22.858877] node 6 initialised, 29179347 pages in 490ms [ 22.863252] node 2 initialised, 29179347 pages in 490ms [ 22.907545] node 0 initialised, 32049614 pages in 540ms [ 22.920891] node 15 initialised, 32212280 pages in 550ms [ 22.923236] node 4 initialised, 32306127 pages in 550ms [ 22.923384] node 12 initialised, 32314319 pages in 550ms [ 22.924754] node 8 initialised, 32314319 pages in 550ms [ 22.940780] node 13 initialised, 33353677 pages in 570ms [ 22.940796] node 11 initialised, 33353677 pages in 570ms [ 22.941700] node 5 initialised, 33353677 pages in 570ms [ 22.941721] node 10 initialised, 33353677 pages in 570ms [ 22.941876] node 7 initialised, 33353677 pages in 570ms [ 22.944946] node 14 initialised, 33353677 pages in 570ms [ 22.946063] node 1 initialised, 33345485 pages in 580ms It saves about 550*16 ms at least, although that can be ignored compared with the boot time of about 160 seconds. What's more, the boot time is much shorter on Power than on x86 for a huge-memory machine even without the patches. So this patchset still needs to be enabled for Power. This patch (of 2): This patch is based on Mel Gorman's old patch from the mailing list, https://lkml.org/lkml/2015/5/5/280 which was discussed but then fixed with a completion to wait for all memory to be initialised in page_alloc_init_late(). It fixes the OOM problem on X86 with 24TB memory, which allocates memory in late initialisation. But for the Power platform with 32TB memory, it causes a call trace in vfs_caches_init->inode_init() and the inode hash table needs more memory. So this patch allocates 1GB per 0.25TB/node for large systems, as mentioned in https://lkml.org/lkml/2015/5/1/627 This call trace is found on Power with 32TB memory, 1024 CPUs, 16 nodes. Currently, it only allocates 2GB*16=32GB for early initialisation. But the Dentry cache hash table needs 16GB and the Inode cache hash table needs 16GB. So the system does not have enough memory for them.
The log from dmesg as the following: Dentry cache hash table entries: 2147483648 (order: 18,17179869184 bytes) vmalloc: allocation failure, allocated 16021913600 of 17179934720 bytes swapper/0: page allocation failure: order:0,mode:0x2080020 CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.4.0-0-ppc64 Call Trace: .dump_stack+0xb4/0xb664 (unreliable) .warn_alloc_failed+0x114/0x160 .__vmalloc_area_node+0x1a4/0x2b0 .__vmalloc_node_range+0xe4/0x110 .__vmalloc_node+0x40/0x50 .alloc_large_system_hash+0x134/0x2a4 .inode_init+0xa4/0xf0 .vfs_caches_init+0x80/0x144 .start_kernel+0x40c/0x4e0 start_here_common+0x20/0x4a4 Signed-off-by: Li Zhang Acked-by: Mel Gorman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a9eaec..3583f71 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -308,13 +308,20 @@ static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) { + unsigned long max_initialise; + /* Always populate low zones for address-contrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; + /* + * Initialise at least 2G of a node but also take into account that + * two large system hashes that can take up 1GB for 0.25TB/node. + */ + max_initialise = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); - /* Initialise at least 2G of the highest zone */ (*nr_initialised)++; - if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && + if ((*nr_initialised > max_initialise) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; -- cgit v1.1 From 588083bb37a3cea8533c392370a554417c8f29cb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:20:25 -0700 Subject: mm: memcontrol: reclaim when shrinking memory.high below usage When setting memory.high below usage, nothing happens until the next charge comes along, and then it will only reclaim its own charge and not the now potentially huge excess of the new memory.high. This can cause groups to stay in excess of their memory.high indefinitely. To fix that, when shrinking memory.high, kick off a reclaim cycle that goes after the delta. 
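As an illustration of the new behaviour (not part of the patch), a minimal user-space sketch; the cgroup2 mount point and the group name "test" are assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /* assumed layout: cgroup2 mounted at /sys/fs/cgroup, child group "test" */
    int fd = open("/sys/fs/cgroup/test/memory.high", O_WRONLY);
    const char *val = "104857600\n";    /* 100M, below current usage */

    if (fd < 0) {
        perror("open memory.high");
        return 1;
    }
    /* the write itself now kicks off reclaim of the excess above the new high */
    if (write(fd, val, strlen(val)) < 0)
        perror("write memory.high");
    close(fd);
    return 0;
}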
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8615b06..f7c9b4c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4992,6 +4992,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long nr_pages; unsigned long high; int err; @@ -5002,6 +5003,11 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, memcg->high = high; + nr_pages = page_counter_read(&memcg->memory); + if (nr_pages > high) + try_to_free_mem_cgroup_pages(memcg, nr_pages - high, + GFP_KERNEL, true); + memcg_wb_domain_size_changed(memcg); return nbytes; } -- cgit v1.1 From b6e6edcfa40561e9c8abe5eecf1c96f8e5fd9c6f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:20:28 -0700 Subject: mm: memcontrol: reclaim and OOM kill when shrinking memory.max below usage Setting the original memory.limit_in_bytes hardlimit is subject to a race condition when the desired value is below the current usage. The code tries a few times to first reclaim and then see if the usage has dropped to where we would like it to be, but there is no locking, and the workload is free to continue making new charges up to the old limit. Thus, attempting to shrink a workload relies on pure luck and hope that the workload happens to cooperate. To fix this in the cgroup2 memory.max knob, do it the other way round: set the limit first, then try enforcement. And if reclaim is not able to succeed, trigger OOM kills in the group. Keep going until the new limit is met, we run out of OOM victims and there's only unreclaimable memory left, or the task writing to memory.max is killed. This allows users to shrink groups reliably, and the behavior is consistent with what happens when new charges are attempted in excess of memory.max. 
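For illustration only, a user-space sketch of the resulting memory.max semantics; the paths and group name are assumptions, not part of the patch:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /* assumed cgroup2 layout: /sys/fs/cgroup/test */
    int fd = open("/sys/fs/cgroup/test/memory.max", O_WRONLY);
    const char *val = "104857600\n";    /* 100M, below current usage */

    if (fd < 0) {
        perror("open memory.max");
        return 1;
    }
    /*
     * The limit is installed first; the write then waits while the kernel
     * reclaims and, as a last resort, OOM-kills the group down to it, so
     * the call can take a while on a busy group.
     */
    if (write(fd, val, strlen(val)) < 0)
        perror("write memory.max");
    close(fd);
    return 0;
}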
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f7c9b4c..8614e0d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1236,7 +1236,7 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) return limit; } -static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, +static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { struct oom_control oc = { @@ -1314,6 +1314,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, } unlock: mutex_unlock(&oom_lock); + return chosen; } #if MAX_NUMNODES > 1 @@ -5029,6 +5030,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES; + bool drained = false; unsigned long max; int err; @@ -5037,9 +5040,36 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; - err = mem_cgroup_resize_limit(memcg, max); - if (err) - return err; + xchg(&memcg->memory.limit, max); + + for (;;) { + unsigned long nr_pages = page_counter_read(&memcg->memory); + + if (nr_pages <= max) + break; + + if (signal_pending(current)) { + err = -EINTR; + break; + } + + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + + if (nr_reclaims) { + if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, + GFP_KERNEL, true)) + nr_reclaims--; + continue; + } + + mem_cgroup_events(memcg, MEMCG_OOM, 1); + if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) + break; + } memcg_wb_domain_size_changed(memcg); return nbytes; -- cgit v1.1 From 8b5926560fafe80fe764af2aade152d490a96546 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:20:31 -0700 Subject: mm: memcontrol: clarify the uncharge_list() loop uncharge_list() does an unusual list walk because the function can take regular lists with dedicated list_heads as well as singleton lists where a single page is passed via the page->lru list node. This can sometimes lead to confusion as well as suggestions to replace the loop with a list_for_each_entry(), which wouldn't work. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8614e0d..fa7bf35 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5420,6 +5420,10 @@ static void uncharge_list(struct list_head *page_list) struct list_head *next; struct page *page; + /* + * Note that the list can be a single page->lru; hence the + * do-while loop instead of a simple list_for_each_entry(). + */ next = page_list->next; do { unsigned int nr_pages = 1; -- cgit v1.1 From e0775d10f1e65548fcbadf614a6bf3d216165021 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:20:34 -0700 Subject: mm: memcontrol: zap oom_info_lock mem_cgroup_print_oom_info is always called under oom_lock, so oom_info_lock is redundant. 
Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fa7bf35..36db05f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1150,12 +1150,9 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) */ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - /* oom_info_lock ensures that parallel ooms do not interleave */ - static DEFINE_MUTEX(oom_info_lock); struct mem_cgroup *iter; unsigned int i; - mutex_lock(&oom_info_lock); rcu_read_lock(); if (p) { @@ -1199,7 +1196,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_cont("\n"); } - mutex_unlock(&oom_info_lock); } /* -- cgit v1.1 From a1c0b1a074c0095492a956b4fe0067b74a82cfe3 Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Thu, 17 Mar 2016 14:20:37 -0700 Subject: mm/vmalloc: use PAGE_ALIGNED() to check PAGE_SIZE alignment We have PAGE_ALIGNED() in mm.h, so let's use it instead of IS_ALIGNED() for checking PAGE_SIZE aligned case. Signed-off-by: Shawn Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e86c24e..ae7d20b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1085,7 +1085,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); - BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE)); + BUG_ON(!PAGE_ALIGNED(addr)); debug_check_no_locks_freed(mem, size); vmap_debug_free_range(addr, addr+size); -- cgit v1.1 From a82cbf07131b56e7e51fbb008b82ef769af08790 Mon Sep 17 00:00:00 2001 From: YiPing Xu Date: Thu, 17 Mar 2016 14:20:39 -0700 Subject: zsmalloc: drop unused member 'mapping_area->huge' When unmapping a huge class page in zs_unmap_object, the page will be unmapped by kmap_atomic. the "!area->huge" branch in __zs_unmap_object is alway true, and no code set "area->huge" now, so we can drop it. Signed-off-by: YiPing Xu Reviewed-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2d7c4c1..43e4cbc 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -281,7 +281,6 @@ struct mapping_area { #endif char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ - bool huge; }; static int create_handle_cache(struct zs_pool *pool) @@ -1127,11 +1126,9 @@ static void __zs_unmap_object(struct mapping_area *area, goto out; buf = area->vm_buf; - if (!area->huge) { - buf = buf + ZS_HANDLE_SIZE; - size -= ZS_HANDLE_SIZE; - off += ZS_HANDLE_SIZE; - } + buf = buf + ZS_HANDLE_SIZE; + size -= ZS_HANDLE_SIZE; + off += ZS_HANDLE_SIZE; sizes[0] = PAGE_SIZE - off; sizes[1] = size - sizes[0]; -- cgit v1.1 From 1120ed5483941d9cd2cf52cb9644a4311dbd1011 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 17 Mar 2016 14:20:42 -0700 Subject: mm/zsmalloc: add `freeable' column to pool stat Add a new column to pool stats, which will tell how many pages ideally can be freed by class compaction, so it will be easier to analyze zsmalloc fragmentation. 
At the moment, we have only numbers of FULL and ALMOST_EMPTY classes, but they don't tell us how badly the class is fragmented internally. The new /sys/kernel/debug/zsmalloc/zramX/classes output look as follows: class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable [..] 12 224 0 2 146 5 8 4 4 13 240 0 0 0 0 0 1 0 14 256 1 13 1840 1672 115 1 10 15 272 0 0 0 0 0 1 0 [..] 49 816 0 3 745 735 149 1 2 51 848 3 4 361 306 76 4 8 52 864 12 14 378 268 81 3 21 54 896 1 12 117 57 26 2 12 57 944 0 0 0 0 0 3 0 [..] Total 26 131 12709 10994 1071 134 For example, from this particular output we can easily conclude that class-896 is heavily fragmented -- it occupies 26 pages, 12 can be freed by compaction. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 43e4cbc..e72efb10 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -494,6 +494,8 @@ static void __exit zs_stat_exit(void) debugfs_remove_recursive(zs_stat_root); } +static unsigned long zs_can_compact(struct size_class *class); + static int zs_stats_size_show(struct seq_file *s, void *v) { int i; @@ -501,14 +503,15 @@ static int zs_stats_size_show(struct seq_file *s, void *v) struct size_class *class; int objs_per_zspage; unsigned long class_almost_full, class_almost_empty; - unsigned long obj_allocated, obj_used, pages_used; + unsigned long obj_allocated, obj_used, pages_used, freeable; unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; + unsigned long total_freeable = 0; - seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n", + seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n", "class", "size", "almost_full", "almost_empty", "obj_allocated", "obj_used", "pages_used", - "pages_per_zspage"); + "pages_per_zspage", "freeable"); for (i = 0; i < zs_size_classes; i++) { class = pool->size_class[i]; @@ -521,6 +524,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v) class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); obj_used = zs_stat_get(class, OBJ_USED); + freeable = zs_can_compact(class); spin_unlock(&class->lock); objs_per_zspage = get_maxobj_per_zspage(class->size, @@ -528,23 +532,25 @@ static int zs_stats_size_show(struct seq_file *s, void *v) pages_used = obj_allocated / objs_per_zspage * class->pages_per_zspage; - seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n", + seq_printf(s, " %5u %5u %11lu %12lu %13lu" + " %10lu %10lu %16d %8lu\n", i, class->size, class_almost_full, class_almost_empty, obj_allocated, obj_used, pages_used, - class->pages_per_zspage); + class->pages_per_zspage, freeable); total_class_almost_full += class_almost_full; total_class_almost_empty += class_almost_empty; total_objs += obj_allocated; total_used_objs += obj_used; total_pages += pages_used; + total_freeable += freeable; } seq_puts(s, "\n"); - seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n", + seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n", "Total", "", total_class_almost_full, total_class_almost_empty, total_objs, - total_used_objs, total_pages); + total_used_objs, total_pages, "", total_freeable); return 0; } -- cgit v1.1 From 6afcf2895e6f229476bd0dc95fe915e639ae0a49 Mon Sep 17 00:00:00 2001 From: 
Tetsuo Handa Date: Thu, 17 Mar 2016 14:20:45 -0700 Subject: mm,oom: make oom_killer_disable() killable While oom_killer_disable() is called by freeze_processes() after all user threads except the current thread are frozen, it is possible that kernel threads invoke the OOM killer and sends SIGKILL to the current thread due to sharing the thawed victim's memory. Therefore, checking for SIGKILL is preferable than TIF_MEMDIE. Signed-off-by: Tetsuo Handa Cc: Tetsuo Handa Cc: David Rientjes Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index fde3d37..06f7e17 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -455,15 +455,11 @@ void exit_oom_victim(void) bool oom_killer_disable(void) { /* - * Make sure to not race with an ongoing OOM killer - * and that the current is not the victim. + * Make sure to not race with an ongoing OOM killer. Check that the + * current is not killed (possibly due to sharing the victim's memory). */ - mutex_lock(&oom_lock); - if (test_thread_flag(TIF_MEMDIE)) { - mutex_unlock(&oom_lock); + if (mutex_lock_killable(&oom_lock)) return false; - } - oom_killer_disabled = true; mutex_unlock(&oom_lock); -- cgit v1.1 From 0a687aace3b8e215e08eed8a67014f5b8f133ab0 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 17 Mar 2016 14:20:48 -0700 Subject: mm,oom: do not loop !__GFP_FS allocation if the OOM killer is disabled After the OOM killer is disabled during suspend operation, any !__GFP_NOFAIL && __GFP_FS allocations are forced to fail. Thus, any !__GFP_NOFAIL && !__GFP_FS allocations should be forced to fail as well. Signed-off-by: Tetsuo Handa Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3583f71..a762be5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2858,8 +2858,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, * XXX: Page reclaim didn't yield anything, * and the OOM killer can't be invoked, but * keep looping as per tradition. + * + * But do not keep looping if oom_killer_disable() + * was already called, for the system is trying to + * enter a quiescent state during suspend. */ - *did_some_progress = 1; + *did_some_progress = !oom_killer_disabled; goto out; } if (pm_suspended_storage()) -- cgit v1.1 From 93e205a728e6cb8d7d11f6836e289798a1de25e2 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 17 Mar 2016 14:21:15 -0700 Subject: fix Christoph's email addresses There are various email addresses for me throughout the kernel. Use the one that will always be valid. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 2 +- mm/quicklist.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 5fbdd36..f4259e4 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Qumranet, Inc. * Copyright (C) 2008 SGI - * Christoph Lameter + * Christoph Lameter * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. 
diff --git a/mm/quicklist.c b/mm/quicklist.c index 9422129..daf6ff6 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -8,7 +8,7 @@ * improved on it. * * Copyright (C) 2007 SGI, - * Christoph Lameter + * Christoph Lameter * Generalized, added support for multiple lists and * constructors / destructors. */ -- cgit v1.1 From e61452365372570253b2b1de84bab0cdb2e62c64 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 Mar 2016 14:21:54 -0700 Subject: radix_tree: add support for multi-order entries With huge pages, it is convenient to have the radix tree be able to return an entry that covers multiple indices. Previous attempts to deal with the problem have involved inserting N duplicate entries, which is a waste of memory and leads to problems trying to handle aliased tags, or probing the tree multiple times to find alternative entries which might cover the requested index. This approach inserts one canonical entry into the tree for a given range of indices, and may also insert other entries in order to ensure that lookups find the canonical entry. This solution only tolerates inserting powers of two that are greater than the fanout of the tree. If we wish to expand the radix tree's abilities to support large-ish pages that is less than the fanout at the penultimate level of the tree, then we would need to add one more step in lookup to ensure that any sibling nodes in the final level of the tree are dereferenced and we return the canonical entry that they reference. Signed-off-by: Matthew Wilcox Cc: Johannes Weiner Cc: Matthew Wilcox Cc: "Kirill A. Shutemov" Cc: Ross Zwisler Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 61b441b..084ad0f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -586,7 +586,7 @@ static int page_cache_tree_insert(struct address_space *mapping, void **slot; int error; - error = __radix_tree_create(&mapping->page_tree, page->index, + error = __radix_tree_create(&mapping->page_tree, page->index, 0, &node, &slot); if (error) return error; -- cgit v1.1 From 2cf938aae17203426a89b5955bd1c9668657bfa8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 Mar 2016 14:22:03 -0700 Subject: mm: use radix_tree_iter_retry() Instead of a 'goto restart', we can now use radix_tree_iter_retry() to restart from our current position. This will make a difference when there are more ways to happen across an indirect pointer. And it eliminates some confusing gotos. 
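Condensed from the hunks below, the lookup loops now all follow the same shape (sketch only):

rcu_read_lock();
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
	struct page *page = radix_tree_deref_slot(slot);

	if (radix_tree_exception(page)) {
		if (radix_tree_deref_retry(page)) {
			/* re-read the slot at the current index, no goto */
			slot = radix_tree_iter_retry(&iter);
			continue;
		}
		/* shadow/swap/DAX entries are handled as before */
	}
	/* ... normal page handling ... */
}
rcu_read_unlock();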
[vbabka@suse.cz: remove now-obsolete-and-misleading comment] Signed-off-by: Matthew Wilcox Cc: Hugh Dickins Cc: Konstantin Khlebnikov Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 53 +++++++++++++++++------------------------------------ mm/shmem.c | 23 ++++++++++++----------- 2 files changed, 29 insertions(+), 47 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 084ad0f..7c00f10 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1255,7 +1255,6 @@ unsigned find_get_entries(struct address_space *mapping, return 0; rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: @@ -1263,8 +1262,10 @@ repeat: if (unlikely(!page)) continue; if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } /* * A shadow entry of a recently evicted page, a swap * entry from shmem/tmpfs or a DAX entry. Return it @@ -1317,7 +1318,6 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: @@ -1327,13 +1327,8 @@ repeat: if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - WARN_ON(iter.index); - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* * A shadow entry of a recently evicted page, @@ -1384,7 +1379,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, return 0; rcu_read_lock(); -restart: radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { struct page *page; repeat: @@ -1395,12 +1389,8 @@ repeat: if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* * A shadow entry of a recently evicted page, @@ -1460,7 +1450,6 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, return 0; rcu_read_lock(); -restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { struct page *page; @@ -1471,12 +1460,8 @@ repeat: if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* * A shadow entry of a recently evicted page. @@ -1539,7 +1524,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); -restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, tag) { struct page *page; @@ -1549,12 +1533,8 @@ repeat: continue; if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. 
- */ - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* @@ -2171,10 +2151,11 @@ repeat: if (unlikely(!page)) goto next; if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - break; - else - goto next; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + goto next; } if (!page_cache_get_speculative(page)) diff --git a/mm/shmem.c b/mm/shmem.c index c484f68..91c0dad 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -383,13 +383,10 @@ restart: page = radix_tree_deref_slot(slot); - /* - * This should only be possible to happen at index 0, so we - * don't need to reset the counter, nor do we risk infinite - * restarts. - */ - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } if (radix_tree_exceptional_entry(page)) swapped++; @@ -1951,8 +1948,10 @@ restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { page = radix_tree_deref_slot(slot); if (!page || radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } } else if (page_count(page) - page_mapcount(page) > 1) { spin_lock_irq(&mapping->tree_lock); radix_tree_tag_set(&mapping->page_tree, iter.index, @@ -2006,8 +2005,10 @@ restart: page = radix_tree_deref_slot(slot); if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } page = NULL; } -- cgit v1.1 From 7165092fe5ca70bae722ac6cd78421cfd0eec18d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 Mar 2016 14:22:06 -0700 Subject: radix-tree,shmem: introduce radix_tree_iter_next() shmem likes to occasionally drop the lock, schedule, then reacqire the lock and continue with the iteration from the last place it left off. This is currently done with a pretty ugly goto. Introduce radix_tree_iter_next() and use it throughout shmem.c. 
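The resulting shmem pattern, condensed from the hunks below (sketch only):

rcu_read_lock();
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
	/* ... process the slot ... */

	if (need_resched()) {
		cond_resched_rcu();
		/* resume at the following index instead of a goto restart */
		slot = radix_tree_iter_next(&iter);
	}
}
rcu_read_unlock();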
[koct9i@gmail.com: fix bug in radix_tree_iter_next() for tagged iteration] Signed-off-by: Matthew Wilcox Cc: Hugh Dickins Signed-off-by: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 91c0dad..9428c51 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -376,7 +376,6 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { if (iter.index >= end) break; @@ -393,8 +392,7 @@ restart: if (need_resched()) { cond_resched_rcu(); - start = iter.index + 1; - goto restart; + slot = radix_tree_iter_next(&iter); } } @@ -1944,7 +1942,6 @@ static void shmem_tag_pins(struct address_space *mapping) start = 0; rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { page = radix_tree_deref_slot(slot); if (!page || radix_tree_exception(page)) { @@ -1961,8 +1958,7 @@ restart: if (need_resched()) { cond_resched_rcu(); - start = iter.index + 1; - goto restart; + slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -1999,7 +1995,6 @@ static int shmem_wait_for_pins(struct address_space *mapping) start = 0; rcu_read_lock(); -restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, SHMEM_TAG_PINNED) { @@ -2033,8 +2028,7 @@ restart: continue_resched: if (need_resched()) { cond_resched_rcu(); - start = iter.index + 1; - goto restart; + slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); -- cgit v1.1 From 3f2b1a04f44933f2d6fe0a9bf9a9c1c452df23f7 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2016 14:24:36 -0700 Subject: zram: revive swap_slot_free_notify Commit b430e9d1c6d4 ("remove compressed copy from zram in-memory") applied swap_slot_free_notify call in *end_swap_bio_read* to remove duplicated memory between zram and memory. However, with the introduction of rw_page in zram: 8c7f01025f7b ("zram: implement rw_page operation of zram"), it became void because rw_page doesn't need bio. Memory footprint is really important in embedded platforms which have small memory, for example, 512M) recently because it could start to kill processes if memory footprint exceeds some threshold by LMK or some similar memory management modules. This patch restores the function for rw_page, thereby eliminating this duplication. Signed-off-by: Minchan Kim Cc: Sergey Senozhatsky Cc: karam.lee Cc: Cc: Chan Jeong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 93 ++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 50 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index ff74e51..18aac78 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -66,6 +66,54 @@ void end_swap_bio_write(struct bio *bio) bio_put(bio); } +static void swap_slot_free_notify(struct page *page) +{ + struct swap_info_struct *sis; + struct gendisk *disk; + + /* + * There is no guarantee that the page is in swap cache - the software + * suspend code (at least) uses end_swap_bio_read() against a non- + * swapcache page. So we must check PG_swapcache before proceeding with + * this optimization. 
+ */ + if (unlikely(!PageSwapCache(page))) + return; + + sis = page_swap_info(page); + if (!(sis->flags & SWP_BLKDEV)) + return; + + /* + * The swap subsystem performs lazy swap slot freeing, + * expecting that the page will be swapped out again. + * So we can avoid an unnecessary write if the page + * isn't redirtied. + * This is good for real swap storage because we can + * reduce unnecessary I/O and enhance wear-leveling + * if an SSD is used as the as swap device. + * But if in-memory swap device (eg zram) is used, + * this causes a duplicated copy between uncompressed + * data in VM-owned memory and compressed data in + * zram-owned memory. So let's free zram-owned memory + * and make the VM-owned decompressed page *dirty*, + * so the page should be swapped out somewhere again if + * we again wish to reclaim it. + */ + disk = sis->bdev->bd_disk; + if (disk->fops->swap_slot_free_notify) { + swp_entry_t entry; + unsigned long offset; + + entry.val = page_private(page); + offset = swp_offset(entry); + + SetPageDirty(page); + disk->fops->swap_slot_free_notify(sis->bdev, + offset); + } +} + static void end_swap_bio_read(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; @@ -81,49 +129,7 @@ static void end_swap_bio_read(struct bio *bio) } SetPageUptodate(page); - - /* - * There is no guarantee that the page is in swap cache - the software - * suspend code (at least) uses end_swap_bio_read() against a non- - * swapcache page. So we must check PG_swapcache before proceeding with - * this optimization. - */ - if (likely(PageSwapCache(page))) { - struct swap_info_struct *sis; - - sis = page_swap_info(page); - if (sis->flags & SWP_BLKDEV) { - /* - * The swap subsystem performs lazy swap slot freeing, - * expecting that the page will be swapped out again. - * So we can avoid an unnecessary write if the page - * isn't redirtied. - * This is good for real swap storage because we can - * reduce unnecessary I/O and enhance wear-leveling - * if an SSD is used as the as swap device. - * But if in-memory swap device (eg zram) is used, - * this causes a duplicated copy between uncompressed - * data in VM-owned memory and compressed data in - * zram-owned memory. So let's free zram-owned memory - * and make the VM-owned decompressed page *dirty*, - * so the page should be swapped out somewhere again if - * we again wish to reclaim it. - */ - struct gendisk *disk = sis->bdev->bd_disk; - if (disk->fops->swap_slot_free_notify) { - swp_entry_t entry; - unsigned long offset; - - entry.val = page_private(page); - offset = swp_offset(entry); - - SetPageDirty(page); - disk->fops->swap_slot_free_notify(sis->bdev, - offset); - } - } - } - + swap_slot_free_notify(page); out: unlock_page(page); bio_put(bio); @@ -347,6 +353,7 @@ int swap_readpage(struct page *page) ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); if (!ret) { + swap_slot_free_notify(page); count_vm_event(PSWPIN); return 0; } -- cgit v1.1 From 5c9a8750a6409c63a0f01d51a9024861022f6593 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 22 Mar 2016 14:27:30 -0700 Subject: kernel: add kcov code coverage kcov provides code coverage collection for coverage-guided fuzzing (randomized testing). Coverage-guided fuzzing is a testing technique that uses coverage feedback to determine new interesting inputs to a system. A notable user-space example is AFL (http://lcamtuf.coredump.cx/afl/). However, this technique is not widely used for kernel testing due to missing compiler and kernel support. 
kcov does not aim to collect as much coverage as possible. It aims to collect more or less stable coverage that is function of syscall inputs. To achieve this goal it does not collect coverage in soft/hard interrupts and instrumentation of some inherently non-deterministic or non-interesting parts of kernel is disbled (e.g. scheduler, locking). Currently there is a single coverage collection mode (tracing), but the API anticipates additional collection modes. Initially I also implemented a second mode which exposes coverage in a fixed-size hash table of counters (what Quentin used in his original patch). I've dropped the second mode for simplicity. This patch adds the necessary support on kernel side. The complimentary compiler support was added in gcc revision 231296. We've used this support to build syzkaller system call fuzzer, which has found 90 kernel bugs in just 2 months: https://github.com/google/syzkaller/wiki/Found-Bugs We've also found 30+ bugs in our internal systems with syzkaller. Another (yet unexplored) direction where kcov coverage would greatly help is more traditional "blob mutation". For example, mounting a random blob as a filesystem, or receiving a random blob over wire. Why not gcov. Typical fuzzing loop looks as follows: (1) reset coverage, (2) execute a bit of code, (3) collect coverage, repeat. A typical coverage can be just a dozen of basic blocks (e.g. an invalid input). In such context gcov becomes prohibitively expensive as reset/collect coverage steps depend on total number of basic blocks/edges in program (in case of kernel it is about 2M). Cost of kcov depends only on number of executed basic blocks/edges. On top of that, kernel requires per-thread coverage because there are always background threads and unrelated processes that also produce coverage. With inlined gcov instrumentation per-thread coverage is not possible. kcov exposes kernel PCs and control flow to user-space which is insecure. But debugfs should not be mapped as user accessible. Based on a patch by Quentin Casasnovas. [akpm@linux-foundation.org: make task_struct.kcov_mode have type `enum kcov_mode'] [akpm@linux-foundation.org: unbreak allmodconfig] [akpm@linux-foundation.org: follow x86 Makefile layout standards] Signed-off-by: Dmitry Vyukov Reviewed-by: Kees Cook Cc: syzkaller Cc: Vegard Nossum Cc: Catalin Marinas Cc: Tavis Ormandy Cc: Will Deacon Cc: Quentin Casasnovas Cc: Kostya Serebryany Cc: Eric Dumazet Cc: Alexander Potapenko Cc: Kees Cook Cc: Bjorn Helgaas Cc: Sasha Levin Cc: David Drysdale Cc: Ard Biesheuvel Cc: Andrey Ryabinin Cc: Kirill A. Shutemov Cc: Jiri Slaby Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 15 +++++++++++++++ mm/kasan/Makefile | 1 + 2 files changed, 16 insertions(+) (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 6da300a..f5e797cb 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,6 +5,21 @@ KASAN_SANITIZE_slab_common.o := n KASAN_SANITIZE_slub.o := n +# These files are disabled because they produce non-interesting and/or +# flaky coverage that is not a function of syscall inputs. E.g. slab is out of +# free pages, or a task is migrated between nodes. 
+KCOV_INSTRUMENT_slab_common.o := n +KCOV_INSTRUMENT_slob.o := n +KCOV_INSTRUMENT_slab.o := n +KCOV_INSTRUMENT_slub.o := n +KCOV_INSTRUMENT_page_alloc.o := n +KCOV_INSTRUMENT_debug-pagealloc.o := n +KCOV_INSTRUMENT_kmemleak.o := n +KCOV_INSTRUMENT_kmemcheck.o := n +KCOV_INSTRUMENT_memcontrol.o := n +KCOV_INSTRUMENT_mmzone.o := n +KCOV_INSTRUMENT_vmstat.o := n + mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index a61460d..131daad 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -1,5 +1,6 @@ KASAN_SANITIZE := n UBSAN_SANITIZE_kasan.o := n +KCOV_INSTRUMENT := n CFLAGS_REMOVE_kasan.o = -pg # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 -- cgit v1.1 From f138556daf62665f19178732ec4daf86c4ca13f4 Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Tue, 22 Mar 2016 14:27:51 -0700 Subject: mm/mprotect.c: don't imply PROT_EXEC on non-exec fs The mprotect(PROT_READ) fails when called by the READ_IMPLIES_EXEC binary on a memory mapped file located on non-exec fs. The mprotect does not check whether fs is _executable_ or not. The PROT_EXEC flag is set automatically even if a memory mapped file is located on non-exec fs. Fix it by checking whether a memory mapped file is located on a non-exec fs. If so the PROT_EXEC is not implied by the PROT_READ. The implementation uses the VM_MAYEXEC flag set properly in mmap. Now it is consistent with mmap. I did the isolated tests (PT_GNU_STACK X/NX, multiple VMAs, X/NX fs). I also patched the official 3.19.0-47-generic Ubuntu 14.04 kernel and it seems to work. Signed-off-by: Piotr Kwapulinski Cc: Mel Gorman Cc: Kirill A. Shutemov Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Konstantin Khlebnikov Cc: Dan Williams Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index fa37c4c..b650c54 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -359,6 +359,9 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, struct vm_area_struct *vma, *prev; int error = -EINVAL; const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); + const bool rier = (current->personality & READ_IMPLIES_EXEC) && + (prot & PROT_READ); + prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ return -EINVAL; @@ -375,11 +378,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, return -EINVAL; reqprot = prot; - /* - * Does the application expect PROT_READ to imply PROT_EXEC: - */ - if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) - prot |= PROT_EXEC; down_write(&current->mm->mmap_sem); @@ -414,6 +412,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, /* Here we know that vma->vm_start <= nstart < vma->vm_end. 
*/ + /* Does the application expect PROT_READ to imply PROT_EXEC */ + if (rier && (vma->vm_flags & VM_MAYEXEC)) + prot |= PROT_EXEC; + newflags = calc_vm_prot_bits(prot, pkey); newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); @@ -445,6 +447,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = -ENOMEM; goto out; } + prot = reqprot; } out: up_write(&current->mm->mmap_sem); -- cgit v1.1 From aac453635549699c13a84ea1456d5b0e574ef855 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 25 Mar 2016 14:20:24 -0700 Subject: mm, oom: introduce oom reaper This patch (of 5): This is based on the idea from Mel Gorman discussed during LSFMM 2015 and independently brought up by Oleg Nesterov. The OOM killer currently allows killing only a single task, in the hope that the task will terminate in a reasonable time and free up its memory. Such a task (the OOM victim) gets access to memory reserves via mark_oom_victim to allow forward progress should there be a need for additional memory during the exit path. It has been shown (e.g. by Tetsuo Handa) that it is not that hard to construct workloads which break the core assumption mentioned above, and the OOM victim might take an unbounded amount of time to exit because it might be blocked in the uninterruptible state waiting for an event (e.g. a lock) which is held by another task looping in the page allocator. This patch reduces the probability of such a lockup by introducing a specialized kernel thread (oom_reaper) which tries to reclaim additional memory by preemptively reaping the anonymous or swapped out memory owned by the OOM victim, under the assumption that such memory won't be needed when its owner is killed and kicked from userspace anyway. There is one notable exception to this, though: if the OOM victim was in the process of coredumping, the result would be incomplete. This is considered a reasonable constraint because the overall system health is more important than the debuggability of a particular application. A kernel thread has been chosen because we need a reliable way of invocation; workqueue context is not appropriate because all the workers might be busy (e.g. allocating memory), and kswapd, which sounds like another good fit, is not appropriate either because it might get blocked on locks during reclaim as well. oom_reaper has to take mmap_sem on the target task for reading, so the solution is not 100% reliable because the semaphore might be held or blocked for write, but the probability is reduced considerably compared to basically any lock blocking forward progress as described above. To avoid blocking on the lock without any forward progress we use only a trylock and retry 10 times with a short sleep in between. Users of mmap_sem which need it for write should be carefully reviewed to use _killable waiting as much as possible and to reduce allocation requests done with the lock held to an absolute minimum, to reduce the risk even further. The API between the OOM killer and the OOM reaper is quite trivial: wake_oom_reaper updates mm_to_reap with cmpxchg to guarantee only a NULL->mm transition, and oom_reaper clears it atomically once it is done with the work. This means that only a single mm_struct can be reaped at a time. As the operation is potentially disruptive we try to limit it to the necessary minimum, and the reaper blocks any updates while it operates on an mm. mm_struct is pinned by mm_count to allow parallel exit_mmap, and a race is detected by atomic_inc_not_zero(mm_users).
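To make the single-slot handoff above concrete, here is a small user-space sketch of the same NULL->mm protocol (illustrative only: C11 atomics and invented names, not the kernel code from the patch below). The publisher succeeds only on a NULL -> item transition and the consumer clears the slot when it is finished, so at most one item is ever queued.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic(void *) slot;		/* plays the role of mm_to_reap */

static int publish(void *mm)		/* plays the role of wake_oom_reaper() */
{
	void *expected = NULL;

	/* Only a NULL -> mm transition is allowed; a second publisher loses. */
	return atomic_compare_exchange_strong(&slot, &expected, mm);
}

static void consume(void)		/* plays the role of one reaper iteration */
{
	void *mm = atomic_load(&slot);

	if (!mm)
		return;
	/* ... reap mm here ... */
	atomic_store(&slot, NULL);	/* free the slot for the next victim */
}

int main(void)
{
	int dummy;

	printf("queued: %d\n", publish(&dummy));	/* 1: slot was empty */
	printf("queued: %d\n", publish(&dummy));	/* 0: already in use */
	consume();
	printf("queued: %d\n", publish(&dummy));	/* 1: slot free again */
	return 0;
}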
Signed-off-by: Michal Hocko Suggested-by: Oleg Nesterov Suggested-by: Mel Gorman Acked-by: Mel Gorman Acked-by: David Rientjes Cc: Mel Gorman Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: Hugh Dickins Cc: Andrea Argangeli Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 5 ++ mm/memory.c | 17 ++++--- mm/oom_kill.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 160 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 7449392..b79abb6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -38,6 +38,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); +void unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details); + extern int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size); diff --git a/mm/memory.c b/mm/memory.c index 81dca00..098f00d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1102,6 +1102,12 @@ again: if (!PageAnon(page)) { if (pte_dirty(ptent)) { + /* + * oom_reaper cannot tear down dirty + * pages + */ + if (unlikely(details && details->ignore_dirty)) + continue; force_flush = 1; set_page_dirty(page); } @@ -1120,8 +1126,8 @@ again: } continue; } - /* If details->check_mapping, we leave swap entries. */ - if (unlikely(details)) + /* only check swap_entries if explicitly asked for in details */ + if (unlikely(details && !details->check_swap_entries)) continue; entry = pte_to_swp_entry(ptent); @@ -1226,7 +1232,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, return addr; } -static void unmap_page_range(struct mmu_gather *tlb, +void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details) @@ -1234,9 +1240,6 @@ static void unmap_page_range(struct mmu_gather *tlb, pgd_t *pgd; unsigned long next; - if (details && !details->check_mapping) - details = NULL; - BUG_ON(addr >= end); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); @@ -2432,7 +2435,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { - struct zap_details details; + struct zap_details details = { }; pgoff_t hba = holebegin >> PAGE_SHIFT; pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 06f7e17..f7ed6ec 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -35,6 +35,11 @@ #include #include #include +#include +#include + +#include +#include "internal.h" #define CREATE_TRACE_POINTS #include @@ -405,6 +410,133 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); bool oom_killer_disabled __read_mostly; +#ifdef CONFIG_MMU +/* + * OOM Reaper kernel thread which tries to reap the memory used by the OOM + * victim (if that is possible) to help the OOM killer to move on. 
+ */ +static struct task_struct *oom_reaper_th; +static struct mm_struct *mm_to_reap; +static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); + +static bool __oom_reap_vmas(struct mm_struct *mm) +{ + struct mmu_gather tlb; + struct vm_area_struct *vma; + struct zap_details details = {.check_swap_entries = true, + .ignore_dirty = true}; + bool ret = true; + + /* We might have raced with exit path */ + if (!atomic_inc_not_zero(&mm->mm_users)) + return true; + + if (!down_read_trylock(&mm->mmap_sem)) { + ret = false; + goto out; + } + + tlb_gather_mmu(&tlb, mm, 0, -1); + for (vma = mm->mmap ; vma; vma = vma->vm_next) { + if (is_vm_hugetlb_page(vma)) + continue; + + /* + * mlocked VMAs require explicit munlocking before unmap. + * Let's keep it simple here and skip such VMAs. + */ + if (vma->vm_flags & VM_LOCKED) + continue; + + /* + * Only anonymous pages have a good chance to be dropped + * without additional steps which we cannot afford as we + * are OOM already. + * + * We do not even care about fs backed pages because all + * which are reclaimable have already been reclaimed and + * we do not want to block exit_mmap by keeping mm ref + * count elevated without a good reason. + */ + if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) + unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, + &details); + } + tlb_finish_mmu(&tlb, 0, -1); + up_read(&mm->mmap_sem); +out: + mmput(mm); + return ret; +} + +static void oom_reap_vmas(struct mm_struct *mm) +{ + int attempts = 0; + + /* Retry the down_read_trylock(mmap_sem) a few times */ + while (attempts++ < 10 && !__oom_reap_vmas(mm)) + schedule_timeout_idle(HZ/10); + + /* Drop a reference taken by wake_oom_reaper */ + mmdrop(mm); +} + +static int oom_reaper(void *unused) +{ + while (true) { + struct mm_struct *mm; + + wait_event_freezable(oom_reaper_wait, + (mm = READ_ONCE(mm_to_reap))); + oom_reap_vmas(mm); + WRITE_ONCE(mm_to_reap, NULL); + } + + return 0; +} + +static void wake_oom_reaper(struct mm_struct *mm) +{ + struct mm_struct *old_mm; + + if (!oom_reaper_th) + return; + + /* + * Pin the given mm. Use mm_count instead of mm_users because + * we do not want to delay the address space tear down. + */ + atomic_inc(&mm->mm_count); + + /* + * Make sure that only a single mm is ever queued for the reaper + * because multiple are not necessary and the operation might be + * disruptive so better reduce it to the bare minimum. + */ + old_mm = cmpxchg(&mm_to_reap, NULL, mm); + if (!old_mm) + wake_up(&oom_reaper_wait); + else + mmdrop(mm); +} + +static int __init oom_init(void) +{ + oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); + if (IS_ERR(oom_reaper_th)) { + pr_err("Unable to start OOM reaper %ld. 
Continuing regardless\n", + PTR_ERR(oom_reaper_th)); + oom_reaper_th = NULL; + } + return 0; +} +subsys_initcall(oom_init) +#else +static void wake_oom_reaper(struct mm_struct *mm) +{ +} +#endif + /** * mark_oom_victim - mark the given task as OOM victim * @tsk: task to mark @@ -510,6 +642,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + bool can_oom_reap = true; /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -600,17 +733,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, continue; if (same_thread_group(p, victim)) continue; - if (unlikely(p->flags & PF_KTHREAD)) - continue; - if (is_global_init(p)) - continue; - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) || + p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + /* + * We cannot use oom_reaper for the mm shared by this + * process because it wouldn't get killed and so the + * memory might be still used. + */ + can_oom_reap = false; continue; - + } do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } rcu_read_unlock(); + if (can_oom_reap) + wake_oom_reaper(mm); + mmdrop(mm); put_task_struct(victim); } -- cgit v1.1 From 36324a990cf578b57828c04cd85ac62cd25cf5a4 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 25 Mar 2016 14:20:27 -0700 Subject: oom: clear TIF_MEMDIE after oom_reaper managed to unmap the address space When oom_reaper manages to unmap all the eligible vmas there shouldn't be much of the freable memory held by the oom victim left anymore so it makes sense to clear the TIF_MEMDIE flag for the victim and allow the OOM killer to select another task. The lack of TIF_MEMDIE also means that the victim cannot access memory reserves anymore but that shouldn't be a problem because it would get the access again if it needs to allocate and hits the OOM killer again due to the fatal_signal_pending resp. PF_EXITING check. We can safely hide the task from the OOM killer because it is clearly not a good candidate anymore as everyhing reclaimable has been torn down already. This patch will allow to cap the time an OOM victim can keep TIF_MEMDIE and thus hold off further global OOM killer actions granted the oom reaper is able to take mmap_sem for the associated mm struct. This is not guaranteed now but further steps should make sure that mmap_sem for write should be blocked killable which will help to reduce such a lock contention. This is not done by this patch. Note that exit_oom_victim might be called on a remote task from __oom_reap_task now so we have to check and clear the flag atomically otherwise we might race and underflow oom_victims or wake up waiters too early. Signed-off-by: Michal Hocko Suggested-by: Johannes Weiner Suggested-by: Tetsuo Handa Cc: Andrea Argangeli Cc: David Rientjes Cc: Hugh Dickins Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 73 +++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f7ed6ec..2830b1c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -416,20 +416,36 @@ bool oom_killer_disabled __read_mostly; * victim (if that is possible) to help the OOM killer to move on. 
*/ static struct task_struct *oom_reaper_th; -static struct mm_struct *mm_to_reap; +static struct task_struct *task_to_reap; static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); -static bool __oom_reap_vmas(struct mm_struct *mm) +static bool __oom_reap_task(struct task_struct *tsk) { struct mmu_gather tlb; struct vm_area_struct *vma; + struct mm_struct *mm; + struct task_struct *p; struct zap_details details = {.check_swap_entries = true, .ignore_dirty = true}; bool ret = true; - /* We might have raced with exit path */ - if (!atomic_inc_not_zero(&mm->mm_users)) + /* + * Make sure we find the associated mm_struct even when the particular + * thread has already terminated and cleared its mm. + * We might have race with exit path so consider our work done if there + * is no mm. + */ + p = find_lock_task_mm(tsk); + if (!p) + return true; + + mm = p->mm; + if (!atomic_inc_not_zero(&mm->mm_users)) { + task_unlock(p); return true; + } + + task_unlock(p); if (!down_read_trylock(&mm->mmap_sem)) { ret = false; @@ -464,60 +480,66 @@ static bool __oom_reap_vmas(struct mm_struct *mm) } tlb_finish_mmu(&tlb, 0, -1); up_read(&mm->mmap_sem); + + /* + * Clear TIF_MEMDIE because the task shouldn't be sitting on a + * reasonably reclaimable memory anymore. OOM killer can continue + * by selecting other victim if unmapping hasn't led to any + * improvements. This also means that selecting this task doesn't + * make any sense. + */ + tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; + exit_oom_victim(tsk); out: mmput(mm); return ret; } -static void oom_reap_vmas(struct mm_struct *mm) +static void oom_reap_task(struct task_struct *tsk) { int attempts = 0; /* Retry the down_read_trylock(mmap_sem) a few times */ - while (attempts++ < 10 && !__oom_reap_vmas(mm)) + while (attempts++ < 10 && !__oom_reap_task(tsk)) schedule_timeout_idle(HZ/10); /* Drop a reference taken by wake_oom_reaper */ - mmdrop(mm); + put_task_struct(tsk); } static int oom_reaper(void *unused) { while (true) { - struct mm_struct *mm; + struct task_struct *tsk; wait_event_freezable(oom_reaper_wait, - (mm = READ_ONCE(mm_to_reap))); - oom_reap_vmas(mm); - WRITE_ONCE(mm_to_reap, NULL); + (tsk = READ_ONCE(task_to_reap))); + oom_reap_task(tsk); + WRITE_ONCE(task_to_reap, NULL); } return 0; } -static void wake_oom_reaper(struct mm_struct *mm) +static void wake_oom_reaper(struct task_struct *tsk) { - struct mm_struct *old_mm; + struct task_struct *old_tsk; if (!oom_reaper_th) return; - /* - * Pin the given mm. Use mm_count instead of mm_users because - * we do not want to delay the address space tear down. - */ - atomic_inc(&mm->mm_count); + get_task_struct(tsk); /* * Make sure that only a single mm is ever queued for the reaper * because multiple are not necessary and the operation might be * disruptive so better reduce it to the bare minimum. 
*/ - old_mm = cmpxchg(&mm_to_reap, NULL, mm); - if (!old_mm) + old_tsk = cmpxchg(&task_to_reap, NULL, tsk); + if (!old_tsk) wake_up(&oom_reaper_wait); else - mmdrop(mm); + put_task_struct(tsk); } static int __init oom_init(void) @@ -532,7 +554,7 @@ static int __init oom_init(void) } subsys_initcall(oom_init) #else -static void wake_oom_reaper(struct mm_struct *mm) +static void wake_oom_reaper(struct task_struct *tsk) { } #endif @@ -563,9 +585,10 @@ void mark_oom_victim(struct task_struct *tsk) /** * exit_oom_victim - note the exit of an OOM victim */ -void exit_oom_victim(void) +void exit_oom_victim(struct task_struct *tsk) { - clear_thread_flag(TIF_MEMDIE); + if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) + return; if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); @@ -748,7 +771,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, rcu_read_unlock(); if (can_oom_reap) - wake_oom_reaper(mm); + wake_oom_reaper(victim); mmdrop(mm); put_task_struct(victim); -- cgit v1.1 From bc448e897b6d24aae32701763b8a1fe15d29fa26 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 25 Mar 2016 14:20:30 -0700 Subject: mm, oom_reaper: report success/failure Inform about the successful/failed oom_reaper attempts and dump all the held locks to tell us more who is blocking the progress. [akpm@linux-foundation.org: fix CONFIG_MMU=n build] Signed-off-by: Michal Hocko Cc: Andrea Argangeli Cc: David Rientjes Cc: Hugh Dickins Cc: Johannes Weiner Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2830b1c..30a6099 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -410,6 +410,8 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); bool oom_killer_disabled __read_mostly; +#define K(x) ((x) << (PAGE_SHIFT-10)) + #ifdef CONFIG_MMU /* * OOM Reaper kernel thread which tries to reap the memory used by the OOM @@ -479,6 +481,11 @@ static bool __oom_reap_task(struct task_struct *tsk) &details); } tlb_finish_mmu(&tlb, 0, -1); + pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", + task_pid_nr(tsk), tsk->comm, + K(get_mm_counter(mm, MM_ANONPAGES)), + K(get_mm_counter(mm, MM_FILEPAGES)), + K(get_mm_counter(mm, MM_SHMEMPAGES))); up_read(&mm->mmap_sem); /* @@ -495,14 +502,21 @@ out: return ret; } +#define MAX_OOM_REAP_RETRIES 10 static void oom_reap_task(struct task_struct *tsk) { int attempts = 0; /* Retry the down_read_trylock(mmap_sem) a few times */ - while (attempts++ < 10 && !__oom_reap_task(tsk)) + while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk)) schedule_timeout_idle(HZ/10); + if (attempts > MAX_OOM_REAP_RETRIES) { + pr_info("oom_reaper: unable to reap pid:%d (%s)\n", + task_pid_nr(tsk), tsk->comm); + debug_show_all_locks(); + } + /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); } @@ -649,7 +663,6 @@ static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) return false; } -#define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon * returning. 
-- cgit v1.1 From 03049269de433cb5fe2859be9ae4469ceb1163ed Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 25 Mar 2016 14:20:33 -0700 Subject: mm, oom_reaper: implement OOM victims queuing wake_oom_reaper has allowed only 1 oom victim to be queued. The main reason for that was simplicity, as other solutions would require some way of queuing. The current approach is racy, and that was deemed sufficient as the oom_reaper is considered a best effort approach to help with oom handling when the OOM victim cannot terminate in a reasonable time. The race could lead to missing an oom victim which can get stuck:

out_of_memory
  wake_oom_reaper
    cmpxchg // OK
                        oom_reaper
                          oom_reap_task
                            __oom_reap_task
oom_victim terminates
                            atomic_inc_not_zero // fail
out_of_memory
  wake_oom_reaper
    cmpxchg // fails
                        task_to_reap = NULL

This race requires 2 OOM invocations in a short time period, which is not very likely but certainly not impossible. E.g. the original victim might not have released a lot of memory for some reason. The situation would improve considerably if wake_oom_reaper used a more robust queuing. This is what this patch implements. This means adding an oom_reaper_list list_head into task_struct (eating a hole before the embedded thread_struct for that purpose) and an oom_reaper_lock spinlock for queuing synchronization. wake_oom_reaper will then add the task to the queue and oom_reaper will dequeue it. Signed-off-by: Michal Hocko Cc: Vladimir Davydov Cc: Andrea Argangeli Cc: David Rientjes Cc: Hugh Dickins Cc: Johannes Weiner Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 30a6099..f6d4ae9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -418,8 +418,10 @@ bool oom_killer_disabled __read_mostly; * victim (if that is possible) to help the OOM killer to move on. */ static struct task_struct *oom_reaper_th; -static struct task_struct *task_to_reap; static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); +static LIST_HEAD(oom_reaper_list); +static DEFINE_SPINLOCK(oom_reaper_lock); + static bool __oom_reap_task(struct task_struct *tsk) { @@ -524,12 +526,20 @@ static void oom_reap_task(struct task_struct *tsk) static int oom_reaper(void *unused) { while (true) { - struct task_struct *tsk; + struct task_struct *tsk = NULL; wait_event_freezable(oom_reaper_wait, - (tsk = READ_ONCE(task_to_reap))); - oom_reap_task(tsk); - WRITE_ONCE(task_to_reap, NULL); + (!list_empty(&oom_reaper_list))); + spin_lock(&oom_reaper_lock); + if (!list_empty(&oom_reaper_list)) { + tsk = list_first_entry(&oom_reaper_list, + struct task_struct, oom_reaper_list); + list_del(&tsk->oom_reaper_list); + } + spin_unlock(&oom_reaper_lock); + + if (tsk) + oom_reap_task(tsk); } return 0; @@ -537,23 +547,15 @@ static int oom_reaper(void *unused) static void wake_oom_reaper(struct task_struct *tsk) { - struct task_struct *old_tsk; - if (!oom_reaper_th) return; get_task_struct(tsk); - /* - * Make sure that only a single mm is ever queued for the reaper - * because multiple are not necessary and the operation might be - * disruptive so better reduce it to the bare minimum.
- */ - old_tsk = cmpxchg(&task_to_reap, NULL, tsk); - if (!old_tsk) - wake_up(&oom_reaper_wait); - else - put_task_struct(tsk); + spin_lock(&oom_reaper_lock); + list_add(&tsk->oom_reaper_list, &oom_reaper_list); + spin_unlock(&oom_reaper_lock); + wake_up(&oom_reaper_wait); } static int __init oom_init(void) -- cgit v1.1 From 855b018325737f7691f9b7d86339df40aa4e47c3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 25 Mar 2016 14:20:36 -0700 Subject: oom, oom_reaper: disable oom_reaper for oom_kill_allocating_task Tetsuo has reported that oom_kill_allocating_task=1 will cause oom_reaper_list corruption because oom_kill_process doesn't follow standard OOM exclusion (aka ignores TIF_MEMDIE) and allows to enqueue the same task multiple times - e.g. by sacrificing the same child multiple times. This patch fixes the issue by introducing a new MMF_OOM_KILLED mm flag which is set in oom_kill_process atomically and oom reaper is disabled if the flag was already set. Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Cc: David Rientjes Cc: Mel Gorman Cc: Oleg Nesterov Cc: Hugh Dickins Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f6d4ae9..1a21819 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -680,7 +680,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - bool can_oom_reap = true; + bool can_oom_reap; /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -742,6 +742,10 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; atomic_inc(&mm->mm_count); + + /* Make sure we do not try to oom reap the mm multiple times */ + can_oom_reap = !test_and_set_bit(MMF_OOM_KILLED, &mm->flags); + /* * We should send SIGKILL before setting TIF_MEMDIE in order to prevent * the OOM victim from depleting the memory reserves from the user -- cgit v1.1 From 29c696e1c6eceb5db6b21f0c89495fcfcd40c0eb Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 25 Mar 2016 14:20:39 -0700 Subject: oom: make oom_reaper_list single linked Entries are only added/removed from oom_reaper_list at head so we can use a single linked list and hence save a word in task_struct. 
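To illustrate why a head-only queue needs just one pointer per task, here is a minimal user-space sketch of the idea (names and types are invented for the example; the real change is in the diff below). Push and pop both touch only the list head, so an embedded doubly linked list_head (two pointers) can be replaced by a single next pointer.

#include <stdio.h>

struct task {
	const char *comm;
	struct task *oom_reaper_next;	/* one word instead of a two-word list_head */
};

static struct task *reaper_list;	/* head of the queue (the kernel guards it with a spinlock) */

static void queue_task(struct task *tsk)
{
	tsk->oom_reaper_next = reaper_list;	/* push at head */
	reaper_list = tsk;
}

static struct task *dequeue_task(void)
{
	struct task *tsk = reaper_list;		/* pop at head */

	if (tsk)
		reaper_list = tsk->oom_reaper_next;
	return tsk;
}

int main(void)
{
	struct task a = { "a", NULL }, b = { "b", NULL };
	struct task *tsk;

	queue_task(&a);
	queue_task(&b);
	while ((tsk = dequeue_task()))
		printf("reaping %s\n", tsk->comm);	/* prints b then a (LIFO order) */
	return 0;
}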
Signed-off-by: Vladimir Davydov Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1a21819..a49638f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -419,7 +419,7 @@ bool oom_killer_disabled __read_mostly; */ static struct task_struct *oom_reaper_th; static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); -static LIST_HEAD(oom_reaper_list); +static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); @@ -528,13 +528,11 @@ static int oom_reaper(void *unused) while (true) { struct task_struct *tsk = NULL; - wait_event_freezable(oom_reaper_wait, - (!list_empty(&oom_reaper_list))); + wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); spin_lock(&oom_reaper_lock); - if (!list_empty(&oom_reaper_list)) { - tsk = list_first_entry(&oom_reaper_list, - struct task_struct, oom_reaper_list); - list_del(&tsk->oom_reaper_list); + if (oom_reaper_list != NULL) { + tsk = oom_reaper_list; + oom_reaper_list = tsk->oom_reaper_list; } spin_unlock(&oom_reaper_lock); @@ -553,7 +551,8 @@ static int oom_reaper(void *unused) static void wake_oom_reaper(struct task_struct *tsk) { get_task_struct(tsk); spin_lock(&oom_reaper_lock); - list_add(&tsk->oom_reaper_list, &oom_reaper_list); + tsk->oom_reaper_list = oom_reaper_list; + oom_reaper_list = tsk; spin_unlock(&oom_reaper_lock); wake_up(&oom_reaper_wait); } -- cgit v1.1 From e26796066fdf929cbba22dabb801808f986acdb9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 25 Mar 2016 14:20:41 -0700 Subject: oom: make oom_reaper freezable After "oom: clear TIF_MEMDIE after oom_reaper managed to unmap the address space" oom_reaper will call exit_oom_victim on the target task after it is done. This might however race with the PM freezer:

CPU0                            CPU1                            CPU2
freeze_processes
  try_to_freeze_tasks
                                                                # Allocation request
                                                                out_of_memory
  oom_killer_disable
                                                                  wake_oom_reaper(P1)
                                __oom_reap_task
                                  exit_oom_victim(P1)
  wait_event(oom_victims==0)
[...]
                                                                do_exit(P1)
                                                                  perform IO/interfere with the freezer

which breaks the oom_killer_disable semantic. We no longer have a guarantee that the oom victim won't interfere with the freezer because it might be anywhere on the way to do_exit while the freezer thinks the task has already terminated. It might trigger IO or touch devices which are frozen already. In order to close this race, make the oom_reaper thread freezable.
This will work because a) already running oom_reaper will block freezer to enter the quiescent state b) wake_oom_reaper will not wake up the reaper after it has been frozen c) the only way to call exit_oom_victim after try_to_freeze_tasks is from the oom victim's context when we know the further interference shouldn't be possible Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: David Rientjes Cc: Mel Gorman Cc: Oleg Nesterov Cc: Hugh Dickins Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a49638f..8cc55f0 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -525,6 +525,8 @@ static void oom_reap_task(struct task_struct *tsk) static int oom_reaper(void *unused) { + set_freezable(); + while (true) { struct task_struct *tsk = NULL; -- cgit v1.1 From bb29902a7515208846114b3b36a4281a9bbf766a Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 25 Mar 2016 14:20:44 -0700 Subject: oom, oom_reaper: protect oom_reaper_list using simpler way "oom, oom_reaper: disable oom_reaper for oom_kill_allocating_task" tried to protect oom_reaper_list using MMF_OOM_KILLED flag. But we can do it by simply checking tsk->oom_reaper_list != NULL. Signed-off-by: Tetsuo Handa Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8cc55f0..b34d279 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -547,7 +547,7 @@ static int oom_reaper(void *unused) static void wake_oom_reaper(struct task_struct *tsk) { - if (!oom_reaper_th) + if (!oom_reaper_th || tsk->oom_reaper_list) return; get_task_struct(tsk); @@ -681,7 +681,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - bool can_oom_reap; + bool can_oom_reap = true; /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -743,10 +743,6 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; atomic_inc(&mm->mm_count); - - /* Make sure we do not try to oom reap the mm multiple times */ - can_oom_reap = !test_and_set_bit(MMF_OOM_KILLED, &mm->flags); - /* * We should send SIGKILL before setting TIF_MEMDIE in order to prevent * the OOM victim from depleting the memory reserves from the user -- cgit v1.1 From d9dddbf556674bf125ecd925b24e43a5cf2a568a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 25 Mar 2016 14:21:50 -0700 Subject: mm/page_alloc: prevent merging between isolated and other pageblocks Hanjun Guo has reported that a CMA stress test causes broken accounting of CMA and free pages: > Before the test, I got: > -bash-4.3# cat /proc/meminfo | grep Cma > CmaTotal: 204800 kB > CmaFree: 195044 kB > > > After running the test: > -bash-4.3# cat /proc/meminfo | grep Cma > CmaTotal: 204800 kB > CmaFree: 6602584 kB > > So the freed CMA memory is more than total.. > > Also the the MemFree is more than mem total: > > -bash-4.3# cat /proc/meminfo > MemTotal: 16342016 kB > MemFree: 22367268 kB > MemAvailable: 22370528 kB Laura Abbott has confirmed the issue and suspected the freepage accounting rewrite around 3.18/4.0 by Joonsoo Kim. 
Joonsoo had a theory that this is caused by unexpected merging between MIGRATE_ISOLATE and MIGRATE_CMA pageblocks: > CMA isolates MAX_ORDER aligned blocks, but, during the process, > partialy isolated block exists. If MAX_ORDER is 11 and > pageblock_order is 9, two pageblocks make up MAX_ORDER > aligned block and I can think following scenario because pageblock > (un)isolation would be done one by one. > > (each character means one pageblock. 'C', 'I' means MIGRATE_CMA, > MIGRATE_ISOLATE, respectively. > > CC -> IC -> II (Isolation) > II -> CI -> CC (Un-isolation) > > If some pages are freed at this intermediate state such as IC or CI, > that page could be merged to the other page that is resident on > different type of pageblock and it will cause wrong freepage count. This was supposed to be prevented by CMA operating on MAX_ORDER blocks, but since it doesn't hold the zone->lock between pageblocks, a race window does exist. It's also likely that unexpected merging can occur between MIGRATE_ISOLATE and non-CMA pageblocks. This should be prevented in __free_one_page() since commit 3c605096d315 ("mm/page_alloc: restrict max order of merging on isolated pageblock"). However, we only check the migratetype of the pageblock where buddy merging has been initiated, not the migratetype of the buddy pageblock (or group of pageblocks) which can be MIGRATE_ISOLATE. Joonsoo has suggested checking for buddy migratetype as part of page_is_buddy(), but that would add extra checks in allocator hotpath and bloat-o-meter has shown significant code bloat (the function is inline). This patch reduces the bloat at some expense of more complicated code. The buddy-merging while-loop in __free_one_page() is initially bounded to pageblock_border and without any migratetype checks. The checks are placed outside, bumping the max_order if merging is allowed, and returning to the while-loop with a statement which can't be possibly considered harmful. This fixes the accounting bug and also removes the arguably weird state in the original commit 3c605096d315 where buddies could be left unmerged. Fixes: 3c605096d315 ("mm/page_alloc: restrict max order of merging on isolated pageblock") Link: https://lkml.org/lkml/2016/3/2/280 Signed-off-by: Vlastimil Babka Reported-by: Hanjun Guo Tested-by: Hanjun Guo Acked-by: Joonsoo Kim Debugged-by: Laura Abbott Debugged-by: Joonsoo Kim Cc: Mel Gorman Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Minchan Kim Cc: Yasuaki Ishimatsu Cc: Zhang Yanfei Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: [3.18+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 46 +++++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a762be5..59de90d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -692,34 +692,28 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; - unsigned int max_order = MAX_ORDER; + unsigned int max_order; + + max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); - if (is_migrate_isolate(migratetype)) { - /* - * We restrict max order of merging to prevent merge - * between freepages on isolate pageblock and normal - * pageblock. 
Without this, pageblock isolation - * could cause incorrect freepage accounting. - */ - max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); - } else { + if (likely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); - } - page_idx = pfn & ((1 << max_order) - 1); + page_idx = pfn & ((1 << MAX_ORDER) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); +continue_merging: while (order < max_order - 1) { buddy_idx = __find_buddy_index(page_idx, order); buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) - break; + goto done_merging; /* * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, * merge with it and move up one order. @@ -736,6 +730,32 @@ static inline void __free_one_page(struct page *page, page_idx = combined_idx; order++; } + if (max_order < MAX_ORDER) { + /* If we are here, it means order is >= pageblock_order. + * We want to prevent merge between freepages on isolate + * pageblock and normal pageblock. Without this, pageblock + * isolation could cause incorrect freepage or CMA accounting. + * + * We don't want to hit this code for the more frequent + * low-order merging. + */ + if (unlikely(has_isolate_pageblock(zone))) { + int buddy_mt; + + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); + buddy_mt = get_pageblock_migratetype(buddy); + + if (migratetype != buddy_mt + && (is_migrate_isolate(migratetype) || + is_migrate_isolate(buddy_mt))) + goto done_merging; + } + max_order++; + goto continue_merging; + } + +done_merging: set_page_order(page, order); /* -- cgit v1.1 From 7ed2f9e663854db313f177a511145630e398b402 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 25 Mar 2016 14:21:59 -0700 Subject: mm, kasan: SLAB support Add KASAN hooks to SLAB allocator. This patch is based on the "mm: kasan: unified support for SLUB and SLAB allocators" patch originally prepared by Dmitry Chernenkov. Signed-off-by: Alexander Potapenko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Steven Rostedt Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 1 + mm/kasan/kasan.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/kasan/kasan.h | 34 ++++++++++++++++++ mm/kasan/report.c | 54 +++++++++++++++++++++++------ mm/slab.c | 43 +++++++++++++++++++---- mm/slab_common.c | 2 +- 6 files changed, 218 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index f5e797cb..deb467e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,6 +3,7 @@ # KASAN_SANITIZE_slab_common.o := n +KASAN_SANITIZE_slab.o := n KASAN_SANITIZE_slub.o := n # These files are disabled because they produce non-interesting and/or diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 1ad20ad..7c82509 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -334,6 +334,59 @@ void kasan_free_pages(struct page *page, unsigned int order) KASAN_FREE_PAGE); } +#ifdef CONFIG_SLAB +/* + * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. + * For larger allocations larger redzones are used. + */ +static size_t optimal_redzone(size_t object_size) +{ + int rz = + object_size <= 64 - 16 ? 16 : + object_size <= 128 - 32 ? 32 : + object_size <= 512 - 64 ? 64 : + object_size <= 4096 - 128 ? 128 : + object_size <= (1 << 14) - 256 ? 
256 : + object_size <= (1 << 15) - 512 ? 512 : + object_size <= (1 << 16) - 1024 ? 1024 : 2048; + return rz; +} + +void kasan_cache_create(struct kmem_cache *cache, size_t *size, + unsigned long *flags) +{ + int redzone_adjust; + /* Make sure the adjusted size is still less than + * KMALLOC_MAX_CACHE_SIZE. + * TODO: this check is only useful for SLAB, but not SLUB. We'll need + * to skip it for SLUB when it starts using kasan_cache_create(). + */ + if (*size > KMALLOC_MAX_CACHE_SIZE - + sizeof(struct kasan_alloc_meta) - + sizeof(struct kasan_free_meta)) + return; + *flags |= SLAB_KASAN; + /* Add alloc meta. */ + cache->kasan_info.alloc_meta_offset = *size; + *size += sizeof(struct kasan_alloc_meta); + + /* Add free meta. */ + if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || + cache->object_size < sizeof(struct kasan_free_meta)) { + cache->kasan_info.free_meta_offset = *size; + *size += sizeof(struct kasan_free_meta); + } + redzone_adjust = optimal_redzone(cache->object_size) - + (*size - cache->object_size); + if (redzone_adjust > 0) + *size += redzone_adjust; + *size = min(KMALLOC_MAX_CACHE_SIZE, + max(*size, + cache->object_size + + optimal_redzone(cache->object_size))); +} +#endif + void kasan_poison_slab(struct page *page) { kasan_poison_shadow(page_address(page), @@ -351,8 +404,36 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) kasan_poison_shadow(object, round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), KASAN_KMALLOC_REDZONE); +#ifdef CONFIG_SLAB + if (cache->flags & SLAB_KASAN) { + struct kasan_alloc_meta *alloc_info = + get_alloc_info(cache, object); + alloc_info->state = KASAN_STATE_INIT; + } +#endif +} + +static inline void set_track(struct kasan_track *track) +{ + track->cpu = raw_smp_processor_id(); + track->pid = current->pid; + track->when = jiffies; } +#ifdef CONFIG_SLAB +struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, + const void *object) +{ + return (void *)object + cache->kasan_info.alloc_meta_offset; +} + +struct kasan_free_meta *get_free_info(struct kmem_cache *cache, + const void *object) +{ + return (void *)object + cache->kasan_info.free_meta_offset; +} +#endif + void kasan_slab_alloc(struct kmem_cache *cache, void *object) { kasan_kmalloc(cache, object, cache->object_size); @@ -367,6 +448,17 @@ void kasan_slab_free(struct kmem_cache *cache, void *object) if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) return; +#ifdef CONFIG_SLAB + if (cache->flags & SLAB_KASAN) { + struct kasan_free_meta *free_info = + get_free_info(cache, object); + struct kasan_alloc_meta *alloc_info = + get_alloc_info(cache, object); + alloc_info->state = KASAN_STATE_FREE; + set_track(&free_info->track); + } +#endif + kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); } @@ -386,6 +478,16 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) kasan_unpoison_shadow(object, size); kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); +#ifdef CONFIG_SLAB + if (cache->flags & SLAB_KASAN) { + struct kasan_alloc_meta *alloc_info = + get_alloc_info(cache, object); + + alloc_info->state = KASAN_STATE_ALLOC; + alloc_info->alloc_size = size; + set_track(&alloc_info->track); + } +#endif } EXPORT_SYMBOL(kasan_kmalloc); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4f6c62e..7b9e4ab9 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -54,6 +54,40 @@ struct kasan_global { #endif }; +/** + * Structures to keep alloc and free tracks * + */ + +enum kasan_state { 
+ KASAN_STATE_INIT, + KASAN_STATE_ALLOC, + KASAN_STATE_FREE +}; + +struct kasan_track { + u64 cpu : 6; /* for NR_CPUS = 64 */ + u64 pid : 16; /* 65536 processes */ + u64 when : 42; /* ~140 years */ +}; + +struct kasan_alloc_meta { + u32 state : 2; /* enum kasan_state */ + u32 alloc_size : 30; + struct kasan_track track; +}; + +struct kasan_free_meta { + /* Allocator freelist pointer, unused by KASAN. */ + void **freelist; + struct kasan_track track; +}; + +struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, + const void *object); +struct kasan_free_meta *get_free_info(struct kmem_cache *cache, + const void *object); + + static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 745aa8f..3e3385c 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -115,6 +115,46 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } +#ifdef CONFIG_SLAB +static void print_track(struct kasan_track *track) +{ + pr_err("PID = %u, CPU = %u, timestamp = %lu\n", track->pid, + track->cpu, (unsigned long)track->when); +} + +static void object_err(struct kmem_cache *cache, struct page *page, + void *object, char *unused_reason) +{ + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + struct kasan_free_meta *free_info; + + dump_stack(); + pr_err("Object at %p, in cache %s\n", object, cache->name); + if (!(cache->flags & SLAB_KASAN)) + return; + switch (alloc_info->state) { + case KASAN_STATE_INIT: + pr_err("Object not allocated yet\n"); + break; + case KASAN_STATE_ALLOC: + pr_err("Object allocated with size %u bytes.\n", + alloc_info->alloc_size); + pr_err("Allocation:\n"); + print_track(&alloc_info->track); + break; + case KASAN_STATE_FREE: + pr_err("Object freed, allocated with size %u bytes\n", + alloc_info->alloc_size); + free_info = get_free_info(cache, object); + pr_err("Allocation:\n"); + print_track(&alloc_info->track); + pr_err("Deallocation:\n"); + print_track(&free_info->track); + break; + } +} +#endif + static void print_address_description(struct kasan_access_info *info) { const void *addr = info->access_addr; @@ -126,17 +166,10 @@ static void print_address_description(struct kasan_access_info *info) if (PageSlab(page)) { void *object; struct kmem_cache *cache = page->slab_cache; - void *last_object; - - object = virt_to_obj(cache, page_address(page), addr); - last_object = page_address(page) + - page->objects * cache->size; - - if (unlikely(object > last_object)) - object = last_object; /* we hit into padding */ - + object = nearest_obj(cache, page, + (void *)info->access_addr); object_err(cache, page, object, - "kasan: bad access detected"); + "kasan: bad access detected"); return; } dump_page(page, "kasan: bad access detected"); @@ -146,7 +179,6 @@ static void print_address_description(struct kasan_access_info *info) if (!init_task_stack_addr(addr)) pr_err("Address belongs to variable %pS\n", addr); } - dump_stack(); } diff --git a/mm/slab.c b/mm/slab.c index e719a5c..7515578 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2086,6 +2086,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) } #endif + kasan_cache_create(cachep, &size, &flags); + size = ALIGN(size, cachep->align); /* * We should restrict the number of objects in a slab to implement @@ -2387,8 +2389,13 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) * cache which they are a 
constructor for. Otherwise, deadlock. * They must also be threaded. */ - if (cachep->ctor && !(cachep->flags & SLAB_POISON)) + if (cachep->ctor && !(cachep->flags & SLAB_POISON)) { + kasan_unpoison_object_data(cachep, + objp + obj_offset(cachep)); cachep->ctor(objp + obj_offset(cachep)); + kasan_poison_object_data( + cachep, objp + obj_offset(cachep)); + } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) @@ -2409,6 +2416,7 @@ static void cache_init_objs(struct kmem_cache *cachep, struct page *page) { int i; + void *objp; cache_init_objs_debug(cachep, page); @@ -2419,8 +2427,12 @@ static void cache_init_objs(struct kmem_cache *cachep, for (i = 0; i < cachep->num; i++) { /* constructor could break poison info */ - if (DEBUG == 0 && cachep->ctor) - cachep->ctor(index_to_obj(cachep, page, i)); + if (DEBUG == 0 && cachep->ctor) { + objp = index_to_obj(cachep, page, i); + kasan_unpoison_object_data(cachep, objp); + cachep->ctor(objp); + kasan_poison_object_data(cachep, objp); + } set_free_obj(page, i, i); } @@ -2550,6 +2562,7 @@ static int cache_grow(struct kmem_cache *cachep, slab_map_pages(cachep, page, freelist); + kasan_poison_slab(page); cache_init_objs(cachep, page); if (gfpflags_allow_blocking(local_flags)) @@ -3316,6 +3329,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, { struct array_cache *ac = cpu_cache_get(cachep); + kasan_slab_free(cachep, objp); + check_irq_off(); kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); @@ -3363,6 +3378,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *ret = slab_alloc(cachep, flags, _RET_IP_); + kasan_slab_alloc(cachep, ret); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3428,6 +3444,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) ret = slab_alloc(cachep, flags, _RET_IP_); + kasan_kmalloc(cachep, ret, size); trace_kmalloc(_RET_IP_, ret, size, cachep->size, flags); return ret; @@ -3451,6 +3468,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + kasan_slab_alloc(cachep, ret); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, flags, nodeid); @@ -3468,7 +3486,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, void *ret; ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); - + kasan_kmalloc(cachep, ret, size); trace_kmalloc_node(_RET_IP_, ret, size, cachep->size, flags, nodeid); @@ -3481,11 +3499,15 @@ static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) { struct kmem_cache *cachep; + void *ret; cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node_trace(cachep, flags, node, size); + ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); + kasan_kmalloc(cachep, ret, size); + + return ret; } void *__kmalloc_node(size_t size, gfp_t flags, int node) @@ -3519,6 +3541,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, return cachep; ret = slab_alloc(cachep, flags, caller); + kasan_kmalloc(cachep, ret, size); trace_kmalloc(caller, ret, size, cachep->size, flags); @@ -4290,10 +4313,18 @@ module_init(slab_proc_init); */ size_t ksize(const void *objp) { + size_t size; + BUG_ON(!objp); if (unlikely(objp == ZERO_SIZE_PTR)) return 0; - return 
virt_to_cache(objp)->object_size; + size = virt_to_cache(objp)->object_size; + /* We assume that ksize callers could use the whole allocated area, + * so we need to unpoison this area. + */ + kasan_krealloc(objp, size); + + return size; } EXPORT_SYMBOL(ksize); diff --git a/mm/slab_common.c b/mm/slab_common.c index b2e3796..4de72e2 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -35,7 +35,7 @@ struct kmem_cache *kmem_cache; */ #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB) + SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ SLAB_NOTRACK | SLAB_ACCOUNT) -- cgit v1.1 From 505f5dcb1c419e55a9621a01f83eb5745d8d7398 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 25 Mar 2016 14:22:02 -0700 Subject: mm, kasan: add GFP flags to KASAN API Add GFP flags to KASAN hooks for future patches to use. This patch is based on the "mm: kasan: unified support for SLUB and SLAB allocators" patch originally prepared by Dmitry Chernenkov. Signed-off-by: Alexander Potapenko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Steven Rostedt Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 15 ++++++++------- mm/mempool.c | 16 ++++++++-------- mm/slab.c | 15 ++++++++------- mm/slab.h | 2 +- mm/slab_common.c | 4 ++-- mm/slub.c | 15 ++++++++------- 6 files changed, 35 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 7c82509..cb998e0 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -434,9 +434,9 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache, } #endif -void kasan_slab_alloc(struct kmem_cache *cache, void *object) +void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) { - kasan_kmalloc(cache, object, cache->object_size); + kasan_kmalloc(cache, object, cache->object_size, flags); } void kasan_slab_free(struct kmem_cache *cache, void *object) @@ -462,7 +462,8 @@ void kasan_slab_free(struct kmem_cache *cache, void *object) kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); } -void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) +void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, + gfp_t flags) { unsigned long redzone_start; unsigned long redzone_end; @@ -491,7 +492,7 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) } EXPORT_SYMBOL(kasan_kmalloc); -void kasan_kmalloc_large(const void *ptr, size_t size) +void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { struct page *page; unsigned long redzone_start; @@ -510,7 +511,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size) KASAN_PAGE_REDZONE); } -void kasan_krealloc(const void *object, size_t size) +void kasan_krealloc(const void *object, size_t size, gfp_t flags) { struct page *page; @@ -520,9 +521,9 @@ void kasan_krealloc(const void *object, size_t size) page = virt_to_head_page(object); if (unlikely(!PageSlab(page))) - kasan_kmalloc_large(object, size); + kasan_kmalloc_large(object, size, flags); else - kasan_kmalloc(page->slab_cache, object, size); + kasan_kmalloc(page->slab_cache, object, size, flags); } void kasan_kfree(void *ptr) diff --git a/mm/mempool.c b/mm/mempool.c index 07c383d..9b7a14a 100644 --- a/mm/mempool.c +++ 
b/mm/mempool.c @@ -112,12 +112,12 @@ static void kasan_poison_element(mempool_t *pool, void *element) kasan_free_pages(element, (unsigned long)pool->pool_data); } -static void kasan_unpoison_element(mempool_t *pool, void *element) +static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) { if (pool->alloc == mempool_alloc_slab) - kasan_slab_alloc(pool->pool_data, element); + kasan_slab_alloc(pool->pool_data, element, flags); if (pool->alloc == mempool_kmalloc) - kasan_krealloc(element, (size_t)pool->pool_data); + kasan_krealloc(element, (size_t)pool->pool_data, flags); if (pool->alloc == mempool_alloc_pages) kasan_alloc_pages(element, (unsigned long)pool->pool_data); } @@ -130,12 +130,12 @@ static void add_element(mempool_t *pool, void *element) pool->elements[pool->curr_nr++] = element; } -static void *remove_element(mempool_t *pool) +static void *remove_element(mempool_t *pool, gfp_t flags) { void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); - kasan_unpoison_element(pool, element); + kasan_unpoison_element(pool, element, flags); check_element(pool, element); return element; } @@ -154,7 +154,7 @@ void mempool_destroy(mempool_t *pool) return; while (pool->curr_nr) { - void *element = remove_element(pool); + void *element = remove_element(pool, GFP_KERNEL); pool->free(element, pool->pool_data); } kfree(pool->elements); @@ -250,7 +250,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr) spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { while (new_min_nr < pool->curr_nr) { - element = remove_element(pool); + element = remove_element(pool, GFP_KERNEL); spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); spin_lock_irqsave(&pool->lock, flags); @@ -347,7 +347,7 @@ repeat_alloc: spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr)) { - element = remove_element(pool); + element = remove_element(pool, gfp_temp); spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); diff --git a/mm/slab.c b/mm/slab.c index 7515578..17e2848 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3378,7 +3378,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *ret = slab_alloc(cachep, flags, _RET_IP_); - kasan_slab_alloc(cachep, ret); + kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3444,7 +3444,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) ret = slab_alloc(cachep, flags, _RET_IP_); - kasan_kmalloc(cachep, ret, size); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(_RET_IP_, ret, size, cachep->size, flags); return ret; @@ -3468,7 +3468,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); - kasan_slab_alloc(cachep, ret); + kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, flags, nodeid); @@ -3486,7 +3486,8 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, void *ret; ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); - kasan_kmalloc(cachep, ret, size); + + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc_node(_RET_IP_, ret, size, cachep->size, flags, nodeid); @@ -3505,7 +3506,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; ret = 
kmem_cache_alloc_node_trace(cachep, flags, node, size); - kasan_kmalloc(cachep, ret, size); + kasan_kmalloc(cachep, ret, size, flags); return ret; } @@ -3541,7 +3542,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, return cachep; ret = slab_alloc(cachep, flags, caller); - kasan_kmalloc(cachep, ret, size); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(caller, ret, size, cachep->size, flags); @@ -4323,7 +4324,7 @@ size_t ksize(const void *objp) /* We assume that ksize callers could use the whole allocated area, * so we need to unpoison this area. */ - kasan_krealloc(objp, size); + kasan_krealloc(objp, size, GFP_NOWAIT); return size; } diff --git a/mm/slab.h b/mm/slab.h index ff39a8f..5969769 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -405,7 +405,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); - kasan_slab_alloc(s, object); + kasan_slab_alloc(s, object, flags); } memcg_kmem_put_cache(s); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 4de72e2..3239bfd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1013,7 +1013,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) page = alloc_kmem_pages(flags, order); ret = page ? page_address(page) : NULL; kmemleak_alloc(ret, size, 1, flags); - kasan_kmalloc_large(ret, size); + kasan_kmalloc_large(ret, size, flags); return ret; } EXPORT_SYMBOL(kmalloc_order); @@ -1192,7 +1192,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, ks = ksize(p); if (ks >= new_size) { - kasan_krealloc((void *)p, new_size); + kasan_krealloc((void *)p, new_size, flags); return (void *)p; } diff --git a/mm/slub.c b/mm/slub.c index 7277413..4dbb109e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1313,7 +1313,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); - kasan_kmalloc_large(ptr, size); + kasan_kmalloc_large(ptr, size, flags); } static inline void kfree_hook(const void *x) @@ -2596,7 +2596,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc(s, gfpflags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_trace); @@ -2624,7 +2624,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); @@ -3182,7 +3182,8 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); + kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), + GFP_KERNEL); init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); @@ -3561,7 +3562,7 @@ void *__kmalloc(size_t size, gfp_t flags) trace_kmalloc(_RET_IP_, ret, size, s->size, flags); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, flags); return ret; } @@ -3606,7 +3607,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); - 
kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, flags); return ret; } @@ -3635,7 +3636,7 @@ size_t ksize(const void *object) size_t size = __ksize(object); /* We assume that ksize callers could use whole allocated area, so we need unpoison this area. */ - kasan_krealloc(object, size); + kasan_krealloc(object, size, GFP_NOWAIT); return size; } EXPORT_SYMBOL(ksize); -- cgit v1.1 From cd11016e5f5212c13c0cec7384a525edc93b4921 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 25 Mar 2016 14:22:08 -0700 Subject: mm, kasan: stackdepot implementation. Enable stackdepot for SLAB Implement the stack depot and provide CONFIG_STACKDEPOT. Stack depot will allow KASAN to store allocation/deallocation stack traces for memory chunks. The stack traces are stored in a hash table and referenced by handles which reside in the kasan_alloc_meta and kasan_free_meta structures in the allocated memory chunks. IRQ stack traces are cut below the IRQ entry point to avoid unnecessary duplication. Right now stackdepot support is only enabled in the SLAB allocator. Once KASAN features in SLAB are on par with those in SLUB, we can switch SLUB to stackdepot as well, thus removing the dependency on SLUB stack bookkeeping, which wastes a lot of memory. This patch is based on the "mm: kasan: stack depots" patch originally prepared by Dmitry Chernenkov. Joonsoo has said that he plans to reuse the stackdepot code for the mm/page_owner.c debugging facility. [akpm@linux-foundation.org: s/depot_stack_handle/depot_stack_handle_t] [aryabinin@virtuozzo.com: comment style fixes] Signed-off-by: Alexander Potapenko Signed-off-by: Andrey Ryabinin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Steven Rostedt Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++------ mm/kasan/kasan.h | 11 +++++++---- mm/kasan/report.c | 12 ++++++++++-- 3 files changed, 66 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index cb998e0..acb3b6c 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -17,7 +17,9 @@ #define DISABLE_BRANCH_PROFILING #include +#include #include +#include #include #include #include @@ -32,7 +34,6 @@ #include #include #include -#include #include "kasan.h" #include "../slab.h" @@ -413,23 +414,65 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) #endif } -static inline void set_track(struct kasan_track *track) +#ifdef CONFIG_SLAB +static inline int in_irqentry_text(unsigned long ptr) +{ + return (ptr >= (unsigned long)&__irqentry_text_start && + ptr < (unsigned long)&__irqentry_text_end) || + (ptr >= (unsigned long)&__softirqentry_text_start && + ptr < (unsigned long)&__softirqentry_text_end); +} + +static inline void filter_irq_stacks(struct stack_trace *trace) +{ + int i; + + if (!trace->nr_entries) + return; + for (i = 0; i < trace->nr_entries; i++) + if (in_irqentry_text(trace->entries[i])) { + /* Include the irqentry function into the stack.
*/ + trace->nr_entries = i + 1; + break; + } +} + +static inline depot_stack_handle_t save_stack(gfp_t flags) +{ + unsigned long entries[KASAN_STACK_DEPTH]; + struct stack_trace trace = { + .nr_entries = 0, + .entries = entries, + .max_entries = KASAN_STACK_DEPTH, + .skip = 0 + }; + + save_stack_trace(&trace); + filter_irq_stacks(&trace); + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries-1] == ULONG_MAX) + trace.nr_entries--; + + return depot_save_stack(&trace, flags); +} + +static inline void set_track(struct kasan_track *track, gfp_t flags) { - track->cpu = raw_smp_processor_id(); track->pid = current->pid; - track->when = jiffies; + track->stack = save_stack(flags); } -#ifdef CONFIG_SLAB struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, const void *object) { + BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); return (void *)object + cache->kasan_info.alloc_meta_offset; } struct kasan_free_meta *get_free_info(struct kmem_cache *cache, const void *object) { + BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); return (void *)object + cache->kasan_info.free_meta_offset; } #endif @@ -486,7 +529,7 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, alloc_info->state = KASAN_STATE_ALLOC; alloc_info->alloc_size = size; - set_track(&alloc_info->track); + set_track(&alloc_info->track, flags); } #endif } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 7b9e4ab9..30a2f0b 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -2,6 +2,7 @@ #define __MM_KASAN_KASAN_H #include +#include #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) @@ -64,16 +65,18 @@ enum kasan_state { KASAN_STATE_FREE }; +#define KASAN_STACK_DEPTH 64 + struct kasan_track { - u64 cpu : 6; /* for NR_CPUS = 64 */ - u64 pid : 16; /* 65536 processes */ - u64 when : 42; /* ~140 years */ + u32 pid; + depot_stack_handle_t stack; }; struct kasan_alloc_meta { + struct kasan_track track; u32 state : 2; /* enum kasan_state */ u32 alloc_size : 30; - struct kasan_track track; + u32 reserved; }; struct kasan_free_meta { diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 3e3385c..60869a5a 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -118,8 +119,15 @@ static inline bool init_task_stack_addr(const void *addr) #ifdef CONFIG_SLAB static void print_track(struct kasan_track *track) { - pr_err("PID = %u, CPU = %u, timestamp = %lu\n", track->pid, - track->cpu, (unsigned long)track->when); + pr_err("PID = %u\n", track->pid); + if (track->stack) { + struct stack_trace trace; + + depot_fetch_stack(track->stack, &trace); + print_stack_trace(&trace, 0); + } else { + pr_err("(stack is not available)\n"); + } } static void object_err(struct kmem_cache *cache, struct page *page, -- cgit v1.1 From e7080a439a6f507abbc860847c33efc39b5c1c6d Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Fri, 25 Mar 2016 14:22:14 -0700 Subject: mm/filemap: generic_file_read_iter(): check for zero reads unconditionally If - generic_file_read_iter() gets called with a zero read length, - the read offset is at a page boundary, - IOCB_DIRECT is not set - and the page in question hasn't made it into the page cache yet, then do_generic_file_read() will trigger a readahead with a req_size hint of zero. 
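To make the arithmetic concrete, here is a minimal stand-alone sketch (illustrative only, not part of the patch), assuming the v4.5-era __roundup_pow_of_two() from include/linux/log2.h, which computes 1UL << fls_long(n - 1) and is reached from get_init_ra_size() via roundup_pow_of_two():

/* Illustrative reproduction of the undefined shift; the my_* helpers mimic the kernel ones. */
#include <stdio.h>

static unsigned long my_fls_long(unsigned long x)
{
	/* 1-based index of the most significant set bit, 0 for x == 0 */
	return x ? 8 * sizeof(x) - __builtin_clzl(x) : 0;
}

static unsigned long my_roundup_pow_of_two(unsigned long n)
{
	/* n == 0: n - 1 wraps to ULONG_MAX, so the shift count is 64 on 64-bit -- undefined */
	return 1UL << my_fls_long(n - 1);
}

int main(void)
{
	/* A zero readahead request size ends up here: 1 on x86, 0 on ARMv7 (see below). */
	printf("%lu\n", my_roundup_pow_of_two(0));
	return 0;
}
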
Since roundup_pow_of_two(0) is undefined, UBSAN reports UBSAN: Undefined behaviour in include/linux/log2.h:63:13 shift exponent 64 is too large for 64-bit type 'long unsigned int' CPU: 3 PID: 1017 Comm: sa1 Tainted: G L 4.5.0-next-20160318+ #14 [...] Call Trace: [...] [] ondemand_readahead+0x3aa/0x3d0 [] ? ondemand_readahead+0x3aa/0x3d0 [] ? find_get_entry+0x2d/0x210 [] page_cache_sync_readahead+0x63/0xa0 [] do_generic_file_read+0x80d/0xf90 [] generic_file_read_iter+0x185/0x420 [...] [] __vfs_read+0x256/0x3d0 [...] when get_init_ra_size() gets called from ondemand_readahead(). The net effect is that the initial readahead size is arch dependent for requested read lengths of zero: for example, since 1UL << (sizeof(unsigned long) * 8) evaluates to 1 on x86 while its result is 0 on ARMv7, the initial readahead size becomes 4 on the former and 0 on the latter. What's more, whether or not the file access timestamp is updated for zero length reads is decided differently for the two cases of IOCB_DIRECT being set or cleared: in the first case, generic_file_read_iter() explicitly skips updating that timestamp while in the latter case, it is always updated through the call to do_generic_file_read(). According to POSIX, zero length reads "do not modify the last data access timestamp" and thus, the IOCB_DIRECT behaviour is POSIXly correct. Let generic_file_read_iter() unconditionally check the requested read length at its entry and return immediately with success if it is zero. Signed-off-by: Nicolai Stange Cc: Al Viro Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 7c00f10..a8c69c8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1840,15 +1840,16 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t retval = 0; loff_t *ppos = &iocb->ki_pos; loff_t pos = *ppos; + size_t count = iov_iter_count(iter); + + if (!count) + goto out; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - size_t count = iov_iter_count(iter); loff_t size; - if (!count) - goto out; /* skip atime */ size = i_size_read(inode); retval = filemap_write_and_wait_range(mapping, pos, pos + count - 1); -- cgit v1.1 From 0fda2788b03c1868e2f20b3b7995b8cc2adf4715 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 25 Mar 2016 14:22:20 -0700 Subject: thp: fix typo in khugepaged_scan_pmd() !PageLRU should lead to SCAN_PAGE_LRU, not SCAN_SCAN_ABORT result. Signed-off-by: Kirill A. Shutemov Cc: Ebru Akagunduz Cc: Rik van Riel Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fbfb1b8..86f9f8b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2578,7 +2578,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } khugepaged_node_load[node]++; if (!PageLRU(page)) { - result = SCAN_SCAN_ABORT; + result = SCAN_PAGE_LRU; goto out_unmap; } if (PageLocked(page)) { -- cgit v1.1 From 0b355eaaaae9bb8bb08b563ef55ecb23a4d743da Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 1 Apr 2016 14:31:15 -0700 Subject: mm, kasan: fix compilation for CONFIG_SLAB Add the missing argument to set_track(). Fixes: cd11016e5f52 ("mm, kasan: stackdepot implementation. 
Enable stackdepot for SLAB") Signed-off-by: Alexander Potapenko Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Steven Rostedt Cc: Joonsoo Kim Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index acb3b6c..38f1dd7 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -498,7 +498,7 @@ void kasan_slab_free(struct kmem_cache *cache, void *object) struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); alloc_info->state = KASAN_STATE_FREE; - set_track(&free_info->track); + set_track(&free_info->track, GFP_NOWAIT); } #endif -- cgit v1.1 From 6f25a14a7053b69917e2ebea0d31dd444cd31fd5 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Fri, 1 Apr 2016 14:31:20 -0700 Subject: mm: fix invalid node in alloc_migrate_target() It is incorrect to use next_node to find a target node, it will return MAX_NUMNODES or invalid node. This will lead to crash in buddy system allocation. Fixes: c8721bbbdd36 ("mm: memory-hotplug: enable memory hotplug to handle hugepage") Signed-off-by: Xishi Qiu Acked-by: Vlastimil Babka Acked-by: Naoya Horiguchi Cc: Joonsoo Kim Cc: David Rientjes Cc: "Laura Abbott" Cc: Hui Zhu Cc: Wang Xiaoqiang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 92c4c36..31555b6 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -289,11 +289,11 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private, * now as a simple work-around, we use the next node for destination. */ if (PageHuge(page)) { - nodemask_t src = nodemask_of_node(page_to_nid(page)); - nodemask_t dst; - nodes_complement(dst, src); + int node = next_online_node(page_to_nid(page)); + if (node == MAX_NUMNODES) + node = first_online_node; return alloc_huge_page_node(page_hstate(compound_head(page)), - next_node(page_to_nid(page), dst)); + node); } if (PageHighMem(page)) -- cgit v1.1 From 858eaaa711700ce4595e039441e239e56d7b9514 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 1 Apr 2016 14:31:26 -0700 Subject: mm/rmap: batched invalidations should use existing api The recently introduced batched invalidations mechanism uses its own mechanism for shootdown. However, it does wrong accounting of interrupts (e.g., inc_irq_stat is called for local invalidations), trace-points (e.g., TLB_REMOTE_SHOOTDOWN for local invalidations) and may break some platforms as it bypasses the invalidation mechanisms of Xen and SGI UV. This patch reuses the existing TLB flushing mechnaisms instead. We use NULL as mm to indicate a global invalidation is required. 
Fixes 72b252aed506b8 ("mm: send one IPI per CPU to TLB flush all entries after unmapping pages") Signed-off-by: Nadav Amit Cc: Mel Gorman Cc: Rik van Riel Cc: Dave Hansen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index c399a0d..395e314 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -569,19 +569,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) } #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -static void percpu_flush_tlb_batch_pages(void *data) -{ - /* - * All TLB entries are flushed on the assumption that it is - * cheaper to flush all TLBs and let them be refilled than - * flushing individual PFNs. Note that we do not track mm's - * to flush as that might simply be multiple full TLB flushes - * for no gain. - */ - count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); - flush_tlb_local(); -} - /* * Flush TLB entries for recently unmapped pages from remote CPUs. It is * important if a PTE was dirty when it was unmapped that it's flushed @@ -598,15 +585,14 @@ void try_to_unmap_flush(void) cpu = get_cpu(); - trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL); - - if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) - percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask); - - if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) { - smp_call_function_many(&tlb_ubc->cpumask, - percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true); + if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) { + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + local_flush_tlb(); + trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); } + + if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) + flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL); cpumask_clear(&tlb_ubc->cpumask); tlb_ubc->flush_required = false; tlb_ubc->writable = false; -- cgit v1.1 From af8e15cc85a253155fdcea707588bf6ddfc0be2e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 1 Apr 2016 14:31:34 -0700 Subject: oom, oom_reaper: do not enqueue task if it is on the oom_reaper_list head Commit bb29902a7515 ("oom, oom_reaper: protect oom_reaper_list using simpler way") has simplified the check for tasks already enqueued for the oom reaper by checking tsk->oom_reaper_list != NULL. This check is not sufficient because the tsk might be the head of the queue without any other tasks queued and then we would simply lockup looping on the same task. Fix the condition by checking for the head as well. Fixes: bb29902a7515 ("oom, oom_reaper: protect oom_reaper_list using simpler way") Signed-off-by: Michal Hocko Acked-by: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b34d279..8634958 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -547,7 +547,11 @@ static int oom_reaper(void *unused) static void wake_oom_reaper(struct task_struct *tsk) { - if (!oom_reaper_th || tsk->oom_reaper_list) + if (!oom_reaper_th) + return; + + /* tsk is already queued? 
*/ + if (tsk == oom_reaper_list || tsk->oom_reaper_list) return; get_task_struct(tsk); -- cgit v1.1 From ec3b6882509188602dfaadf2814a9cca062f2786 Mon Sep 17 00:00:00 2001 From: Neil Zhang Date: Fri, 1 Apr 2016 14:31:37 -0700 Subject: mm/page_isolation.c: fix the function comments Commit fea85cff11de ("mm/page_isolation.c: return last tested pfn rather than failure indicator") changed the meaning of the return value. Let's change the function comments as well. Signed-off-by: Neil Zhang Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 31555b6..c4f5682 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -215,7 +215,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * all pages in [start_pfn...end_pfn) must be in the same zone. * zone->lock must be held before call this. * - * Returns 1 if all pages in the range are isolated. + * Returns the last tested pfn. */ static unsigned long __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, -- cgit v1.1 From 09cbfeaf1a5a67bfb3201e0c83c810cecb2efa5a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 1 Apr 2016 15:29:47 +0300 Subject: mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros The PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time ago with the promise that one day it would be possible to implement the page cache with bigger chunks than PAGE_SIZE. This promise never materialized, and it is unlikely it ever will. We have many places where PAGE_CACHE_SIZE is assumed to be equal to PAGE_SIZE, and it's a constant source of confusion whether the PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Globally switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much breakage to be doable. Let's stop pretending that pages in the page cache are special. They are not. The changes are pretty straightforward: - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> E; - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> E; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using the script below. For some reason, coccinelle doesn't patch header files; I've called spatch for them manually. The only adjustment after coccinelle is a revert of the changes to the PAGE_CACHE_ALIGN definition: we are going to drop it later. There are a few places in the code that coccinelle didn't reach; I'll fix them manually in a separate patch. Comments and documentation will also be addressed in a separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A.
Shutemov Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- mm/fadvise.c | 8 ++-- mm/filemap.c | 126 +++++++++++++++++++++++++------------------------- mm/hugetlb.c | 8 ++-- mm/madvise.c | 6 +-- mm/memory-failure.c | 2 +- mm/memory.c | 54 +++++++++++----------- mm/mincore.c | 4 +- mm/nommu.c | 2 +- mm/page-writeback.c | 12 ++--- mm/page_io.c | 2 +- mm/readahead.c | 20 ++++---- mm/rmap.c | 2 +- mm/shmem.c | 130 ++++++++++++++++++++++++++-------------------------- mm/swap.c | 12 ++--- mm/swap_state.c | 12 ++--- mm/swapfile.c | 12 ++--- mm/truncate.c | 40 ++++++++-------- mm/userfaultfd.c | 4 +- mm/zswap.c | 4 +- 19 files changed, 230 insertions(+), 230 deletions(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index b8a5bc6..b8024fa 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -97,8 +97,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) break; case POSIX_FADV_WILLNEED: /* First and last PARTIAL page! */ - start_index = offset >> PAGE_CACHE_SHIFT; - end_index = endbyte >> PAGE_CACHE_SHIFT; + start_index = offset >> PAGE_SHIFT; + end_index = endbyte >> PAGE_SHIFT; /* Careful about overflow on the "+1" */ nrpages = end_index - start_index + 1; @@ -124,8 +124,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) * preserved on the expectation that it is better to preserve * needed memory than to discard unneeded memory. */ - start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; - end_index = (endbyte >> PAGE_CACHE_SHIFT); + start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT; + end_index = (endbyte >> PAGE_SHIFT); if (end_index >= start_index) { unsigned long count = invalidate_mapping_pages(mapping, diff --git a/mm/filemap.c b/mm/filemap.c index a8c69c8..f2479af 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -265,7 +265,7 @@ void delete_from_page_cache(struct page *page) if (freepage) freepage(page); - page_cache_release(page); + put_page(page); } EXPORT_SYMBOL(delete_from_page_cache); @@ -352,8 +352,8 @@ EXPORT_SYMBOL(filemap_flush); static int __filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; - pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; + pgoff_t index = start_byte >> PAGE_SHIFT; + pgoff_t end = end_byte >> PAGE_SHIFT; struct pagevec pvec; int nr_pages; int ret = 0; @@ -550,7 +550,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) pgoff_t offset = old->index; freepage = mapping->a_ops->freepage; - page_cache_get(new); + get_page(new); new->mapping = mapping; new->index = offset; @@ -572,7 +572,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) radix_tree_preload_end(); if (freepage) freepage(old); - page_cache_release(old); + put_page(old); } return error; @@ -651,7 +651,7 @@ static int __add_to_page_cache_locked(struct page *page, return error; } - page_cache_get(page); + get_page(page); page->mapping = mapping; page->index = offset; @@ -675,7 +675,7 @@ err_insert: spin_unlock_irq(&mapping->tree_lock); if (!huge) mem_cgroup_cancel_charge(page, memcg, false); - page_cache_release(page); + put_page(page); return error; } @@ -1083,7 +1083,7 @@ repeat: * include/linux/pagemap.h for details. */ if (unlikely(page != *pagep)) { - page_cache_release(page); + put_page(page); goto repeat; } } @@ -1121,7 +1121,7 @@ repeat: /* Has the page been truncated? 
*/ if (unlikely(page->mapping != mapping)) { unlock_page(page); - page_cache_release(page); + put_page(page); goto repeat; } VM_BUG_ON_PAGE(page->index != offset, page); @@ -1168,7 +1168,7 @@ repeat: if (fgp_flags & FGP_LOCK) { if (fgp_flags & FGP_NOWAIT) { if (!trylock_page(page)) { - page_cache_release(page); + put_page(page); return NULL; } } else { @@ -1178,7 +1178,7 @@ repeat: /* Has the page been truncated? */ if (unlikely(page->mapping != mapping)) { unlock_page(page); - page_cache_release(page); + put_page(page); goto repeat; } VM_BUG_ON_PAGE(page->index != offset, page); @@ -1209,7 +1209,7 @@ no_page: err = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_RECLAIM_MASK); if (unlikely(err)) { - page_cache_release(page); + put_page(page); page = NULL; if (err == -EEXIST) goto repeat; @@ -1278,7 +1278,7 @@ repeat: /* Has the page moved? */ if (unlikely(page != *slot)) { - page_cache_release(page); + put_page(page); goto repeat; } export: @@ -1343,7 +1343,7 @@ repeat: /* Has the page moved? */ if (unlikely(page != *slot)) { - page_cache_release(page); + put_page(page); goto repeat; } @@ -1405,7 +1405,7 @@ repeat: /* Has the page moved? */ if (unlikely(page != *slot)) { - page_cache_release(page); + put_page(page); goto repeat; } @@ -1415,7 +1415,7 @@ repeat: * negatives, which is just confusing to the caller. */ if (page->mapping == NULL || page->index != iter.index) { - page_cache_release(page); + put_page(page); break; } @@ -1482,7 +1482,7 @@ repeat: /* Has the page moved? */ if (unlikely(page != *slot)) { - page_cache_release(page); + put_page(page); goto repeat; } @@ -1549,7 +1549,7 @@ repeat: /* Has the page moved? */ if (unlikely(page != *slot)) { - page_cache_release(page); + put_page(page); goto repeat; } export: @@ -1610,11 +1610,11 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, unsigned int prev_offset; int error = 0; - index = *ppos >> PAGE_CACHE_SHIFT; - prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; - prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); - last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + index = *ppos >> PAGE_SHIFT; + prev_index = ra->prev_pos >> PAGE_SHIFT; + prev_offset = ra->prev_pos & (PAGE_SIZE-1); + last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; + offset = *ppos & ~PAGE_MASK; for (;;) { struct page *page; @@ -1648,7 +1648,7 @@ find_page: if (PageUptodate(page)) goto page_ok; - if (inode->i_blkbits == PAGE_CACHE_SHIFT || + if (inode->i_blkbits == PAGE_SHIFT || !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; if (!trylock_page(page)) @@ -1672,18 +1672,18 @@ page_ok: */ isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + end_index = (isize - 1) >> PAGE_SHIFT; if (unlikely(!isize || index > end_index)) { - page_cache_release(page); + put_page(page); goto out; } /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_CACHE_SIZE; + nr = PAGE_SIZE; if (index == end_index) { - nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + nr = ((isize - 1) & ~PAGE_MASK) + 1; if (nr <= offset) { - page_cache_release(page); + put_page(page); goto out; } } @@ -1711,11 +1711,11 @@ page_ok: ret = copy_page_to_iter(page, offset, nr, iter); offset += ret; - index += offset >> PAGE_CACHE_SHIFT; - offset &= ~PAGE_CACHE_MASK; + index += offset >> PAGE_SHIFT; + offset &= ~PAGE_MASK; prev_offset = offset; - page_cache_release(page); + put_page(page); written += ret; if (!iov_iter_count(iter)) goto out; @@ -1735,7 
+1735,7 @@ page_not_up_to_date_locked: /* Did it get truncated before we got the lock? */ if (!page->mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); continue; } @@ -1757,7 +1757,7 @@ readpage: if (unlikely(error)) { if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); + put_page(page); error = 0; goto find_page; } @@ -1774,7 +1774,7 @@ readpage: * invalidate_mapping_pages got it */ unlock_page(page); - page_cache_release(page); + put_page(page); goto find_page; } unlock_page(page); @@ -1789,7 +1789,7 @@ readpage: readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ - page_cache_release(page); + put_page(page); goto out; no_cached_page: @@ -1805,7 +1805,7 @@ no_cached_page: error = add_to_page_cache_lru(page, mapping, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error) { - page_cache_release(page); + put_page(page); if (error == -EEXIST) { error = 0; goto find_page; @@ -1817,10 +1817,10 @@ no_cached_page: out: ra->prev_pos = prev_index; - ra->prev_pos <<= PAGE_CACHE_SHIFT; + ra->prev_pos <<= PAGE_SHIFT; ra->prev_pos |= prev_offset; - *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; + *ppos = ((loff_t)index << PAGE_SHIFT) + offset; file_accessed(filp); return written ? written : error; } @@ -1912,7 +1912,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) else if (ret == -EEXIST) ret = 0; /* losing race to add is OK */ - page_cache_release(page); + put_page(page); } while (ret == AOP_TRUNCATED_PAGE); @@ -2022,8 +2022,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size; int ret = 0; - size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); - if (offset >= size >> PAGE_CACHE_SHIFT) + size = round_up(i_size_read(inode), PAGE_SIZE); + if (offset >= size >> PAGE_SHIFT) return VM_FAULT_SIGBUS; /* @@ -2049,7 +2049,7 @@ retry_find: } if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { - page_cache_release(page); + put_page(page); return ret | VM_FAULT_RETRY; } @@ -2072,10 +2072,10 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. */ - size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); - if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) { + size = round_up(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= size >> PAGE_SHIFT)) { unlock_page(page); - page_cache_release(page); + put_page(page); return VM_FAULT_SIGBUS; } @@ -2120,7 +2120,7 @@ page_not_uptodate: if (!PageUptodate(page)) error = -EIO; } - page_cache_release(page); + put_page(page); if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; @@ -2164,7 +2164,7 @@ repeat: /* Has the page moved? 
*/ if (unlikely(page != *slot)) { - page_cache_release(page); + put_page(page); goto repeat; } @@ -2178,8 +2178,8 @@ repeat: if (page->mapping != mapping || !PageUptodate(page)) goto unlock; - size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE); - if (page->index >= size >> PAGE_CACHE_SHIFT) + size = round_up(i_size_read(mapping->host), PAGE_SIZE); + if (page->index >= size >> PAGE_SHIFT) goto unlock; pte = vmf->pte + page->index - vmf->pgoff; @@ -2195,7 +2195,7 @@ repeat: unlock: unlock_page(page); skip: - page_cache_release(page); + put_page(page); next: if (iter.index == vmf->max_pgoff) break; @@ -2278,7 +2278,7 @@ static struct page *wait_on_page_read(struct page *page) if (!IS_ERR(page)) { wait_on_page_locked(page); if (!PageUptodate(page)) { - page_cache_release(page); + put_page(page); page = ERR_PTR(-EIO); } } @@ -2301,7 +2301,7 @@ repeat: return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, gfp); if (unlikely(err)) { - page_cache_release(page); + put_page(page); if (err == -EEXIST) goto repeat; /* Presumably ENOMEM for radix tree node */ @@ -2311,7 +2311,7 @@ repeat: filler: err = filler(data, page); if (err < 0) { - page_cache_release(page); + put_page(page); return ERR_PTR(err); } @@ -2364,7 +2364,7 @@ filler: /* Case c or d, restart the operation */ if (!page->mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); goto repeat; } @@ -2511,7 +2511,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) struct iov_iter data; write_len = iov_iter_count(from); - end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; + end = (pos + write_len - 1) >> PAGE_SHIFT; written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); if (written) @@ -2525,7 +2525,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) */ if (mapping->nrpages) { written = invalidate_inode_pages2_range(mapping, - pos >> PAGE_CACHE_SHIFT, end); + pos >> PAGE_SHIFT, end); /* * If a page can not be invalidated, return 0 to fall back * to buffered write. @@ -2550,7 +2550,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) */ if (mapping->nrpages) { invalidate_inode_pages2_range(mapping, - pos >> PAGE_CACHE_SHIFT, end); + pos >> PAGE_SHIFT, end); } if (written > 0) { @@ -2611,8 +2611,8 @@ ssize_t generic_perform_write(struct file *file, size_t copied; /* Bytes copied from user */ void *fsdata; - offset = (pos & (PAGE_CACHE_SIZE - 1)); - bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + offset = (pos & (PAGE_SIZE - 1)); + bytes = min_t(unsigned long, PAGE_SIZE - offset, iov_iter_count(i)); again: @@ -2665,7 +2665,7 @@ again: * because not all segments in the iov can be copied at * once without a pagefault. 
*/ - bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + bytes = min_t(unsigned long, PAGE_SIZE - offset, iov_iter_single_seg_count(i)); goto again; } @@ -2752,8 +2752,8 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->ki_pos = endbyte + 1; written += status; invalidate_mapping_pages(mapping, - pos >> PAGE_CACHE_SHIFT, - endbyte >> PAGE_CACHE_SHIFT); + pos >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); } else { /* * We don't know how much we wrote, so just return diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 06058ea..19d0d08 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3346,7 +3346,7 @@ retry_avoidcopy: old_page != pagecache_page) outside_reserve = 1; - page_cache_get(old_page); + get_page(old_page); /* * Drop page table lock as buddy allocator may be called. It will @@ -3364,7 +3364,7 @@ retry_avoidcopy: * may get SIGKILLed if it later faults. */ if (outside_reserve) { - page_cache_release(old_page); + put_page(old_page); BUG_ON(huge_pte_none(pte)); unmap_ref_private(mm, vma, old_page, address); BUG_ON(huge_pte_none(pte)); @@ -3425,9 +3425,9 @@ retry_avoidcopy: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out_release_all: - page_cache_release(new_page); + put_page(new_page); out_release_old: - page_cache_release(old_page); + put_page(old_page); spin_lock(ptl); /* Caller expects lock to be held */ return ret; diff --git a/mm/madvise.c b/mm/madvise.c index a011473..07427d3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -170,7 +170,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma, index); if (page) - page_cache_release(page); + put_page(page); } return 0; @@ -204,14 +204,14 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, page = find_get_entry(mapping, index); if (!radix_tree_exceptional_entry(page)) { if (page) - page_cache_release(page); + put_page(page); continue; } swap = radix_to_swp_entry(page); page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, NULL, 0); if (page) - page_cache_release(page); + put_page(page); } lru_add_drain(); /* Push any new pages onto the LRU now */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5a544c6..78f5f26 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -538,7 +538,7 @@ static int delete_from_lru_cache(struct page *p) /* * drop the page count elevated by isolate_lru_page() */ - page_cache_release(p); + put_page(p); return 0; } return -EIO; diff --git a/mm/memory.c b/mm/memory.c index 098f00d..07a4204 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2054,7 +2054,7 @@ static inline int wp_page_reuse(struct mm_struct *mm, VM_BUG_ON_PAGE(PageAnon(page), page); mapping = page->mapping; unlock_page(page); - page_cache_release(page); + put_page(page); if ((dirtied || page_mkwrite) && mapping) { /* @@ -2188,7 +2188,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, } if (new_page) - page_cache_release(new_page); + put_page(new_page); pte_unmap_unlock(page_table, ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -2203,14 +2203,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, munlock_vma_page(old_page); unlock_page(old_page); } - page_cache_release(old_page); + put_page(old_page); } return page_copied ? 
VM_FAULT_WRITE : 0; oom_free_new: - page_cache_release(new_page); + put_page(new_page); oom: if (old_page) - page_cache_release(old_page); + put_page(old_page); return VM_FAULT_OOM; } @@ -2258,7 +2258,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, { int page_mkwrite = 0; - page_cache_get(old_page); + get_page(old_page); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; @@ -2267,7 +2267,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, tmp = do_page_mkwrite(vma, old_page, address); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - page_cache_release(old_page); + put_page(old_page); return tmp; } /* @@ -2281,7 +2281,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); pte_unmap_unlock(page_table, ptl); - page_cache_release(old_page); + put_page(old_page); return 0; } page_mkwrite = 1; @@ -2341,7 +2341,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ if (PageAnon(old_page) && !PageKsm(old_page)) { if (!trylock_page(old_page)) { - page_cache_get(old_page); + get_page(old_page); pte_unmap_unlock(page_table, ptl); lock_page(old_page); page_table = pte_offset_map_lock(mm, pmd, address, @@ -2349,10 +2349,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); pte_unmap_unlock(page_table, ptl); - page_cache_release(old_page); + put_page(old_page); return 0; } - page_cache_release(old_page); + put_page(old_page); } if (reuse_swap_page(old_page)) { /* @@ -2375,7 +2375,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Ok, we need to copy. Oh, well.. */ - page_cache_get(old_page); + get_page(old_page); pte_unmap_unlock(page_table, ptl); return wp_page_copy(mm, vma, address, page_table, pmd, @@ -2619,7 +2619,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * parallel locked swapcache. 
*/ unlock_page(swapcache); - page_cache_release(swapcache); + put_page(swapcache); } if (flags & FAULT_FLAG_WRITE) { @@ -2641,10 +2641,10 @@ out_nomap: out_page: unlock_page(page); out_release: - page_cache_release(page); + put_page(page); if (page != swapcache) { unlock_page(swapcache); - page_cache_release(swapcache); + put_page(swapcache); } return ret; } @@ -2752,7 +2752,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (userfaultfd_missing(vma)) { pte_unmap_unlock(page_table, ptl); mem_cgroup_cancel_charge(page, memcg, false); - page_cache_release(page); + put_page(page); return handle_userfault(vma, address, flags, VM_UFFD_MISSING); } @@ -2771,10 +2771,10 @@ unlock: return 0; release: mem_cgroup_cancel_charge(page, memcg, false); - page_cache_release(page); + put_page(page); goto unlock; oom_free_page: - page_cache_release(page); + put_page(page); oom: return VM_FAULT_OOM; } @@ -2807,7 +2807,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); - page_cache_release(vmf.page); + put_page(vmf.page); return VM_FAULT_HWPOISON; } @@ -2996,7 +2996,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!pte_same(*pte, orig_pte))) { pte_unmap_unlock(pte, ptl); unlock_page(fault_page); - page_cache_release(fault_page); + put_page(fault_page); return ret; } do_set_pte(vma, address, fault_page, pte, false, false); @@ -3024,7 +3024,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) { - page_cache_release(new_page); + put_page(new_page); return VM_FAULT_OOM; } @@ -3041,7 +3041,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); if (fault_page) { unlock_page(fault_page); - page_cache_release(fault_page); + put_page(fault_page); } else { /* * The fault handler has no page to lock, so it holds @@ -3057,7 +3057,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); if (fault_page) { unlock_page(fault_page); - page_cache_release(fault_page); + put_page(fault_page); } else { /* * The fault handler has no page to lock, so it holds @@ -3068,7 +3068,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; uncharge_out: mem_cgroup_cancel_charge(new_page, memcg, false); - page_cache_release(new_page); + put_page(new_page); return ret; } @@ -3096,7 +3096,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, tmp = do_page_mkwrite(vma, fault_page, address); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - page_cache_release(fault_page); + put_page(fault_page); return tmp; } } @@ -3105,7 +3105,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!pte_same(*pte, orig_pte))) { pte_unmap_unlock(pte, ptl); unlock_page(fault_page); - page_cache_release(fault_page); + put_page(fault_page); return ret; } do_set_pte(vma, address, fault_page, pte, true, false); @@ -3736,7 +3736,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, buf, maddr + offset, bytes); } kunmap(page); - page_cache_release(page); + put_page(page); } len -= bytes; buf += bytes; diff --git a/mm/mincore.c b/mm/mincore.c index 563f320..0eed23d 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -75,7 
+75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) #endif if (page) { present = PageUptodate(page); - page_cache_release(page); + put_page(page); } return present; @@ -226,7 +226,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, unsigned char *tmp; /* Check the start address: needs to be page-aligned.. */ - if (start & ~PAGE_CACHE_MASK) + if (start & ~PAGE_MASK) return -EINVAL; /* ..and we need to be passed a valid user-space range */ diff --git a/mm/nommu.c b/mm/nommu.c index de8b6b6..102e257 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -141,7 +141,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (pages) { pages[i] = virt_to_page(start); if (pages[i]) - page_cache_get(pages[i]); + get_page(pages[i]); } if (vmas) vmas[i] = vma; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 11ff8f7..999792d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2176,8 +2176,8 @@ int write_cache_pages(struct address_space *mapping, cycled = 0; end = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ @@ -2382,14 +2382,14 @@ int write_one_page(struct page *page, int wait) wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { - page_cache_get(page); + get_page(page); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { wait_on_page_writeback(page); if (PageError(page)) ret = -EIO; } - page_cache_release(page); + put_page(page); } else { unlock_page(page); } @@ -2431,7 +2431,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) __inc_zone_page_state(page, NR_DIRTIED); __inc_wb_stat(wb, WB_RECLAIMABLE); __inc_wb_stat(wb, WB_DIRTIED); - task_io_account_write(PAGE_CACHE_SIZE); + task_io_account_write(PAGE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); } @@ -2450,7 +2450,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); - task_io_account_cancelled_write(PAGE_CACHE_SIZE); + task_io_account_cancelled_write(PAGE_SIZE); } } diff --git a/mm/page_io.c b/mm/page_io.c index 18aac78..cd92e3d 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -252,7 +252,7 @@ out: static sector_t swap_page_sector(struct page *page) { - return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9); + return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9); } int __swap_writepage(struct page *page, struct writeback_control *wbc, diff --git a/mm/readahead.c b/mm/readahead.c index 20e58e8..40be3ae 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -47,11 +47,11 @@ static void read_cache_pages_invalidate_page(struct address_space *mapping, if (!trylock_page(page)) BUG(); page->mapping = mapping; - do_invalidatepage(page, 0, PAGE_CACHE_SIZE); + do_invalidatepage(page, 0, PAGE_SIZE); page->mapping = NULL; unlock_page(page); } - page_cache_release(page); + put_page(page); } /* @@ -93,14 +93,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, read_cache_pages_invalidate_page(mapping, page); continue; } - page_cache_release(page); + put_page(page); ret = filler(data, page); if (unlikely(ret)) { read_cache_pages_invalidate_pages(mapping, 
pages); break; } - task_io_account_read(PAGE_CACHE_SIZE); + task_io_account_read(PAGE_SIZE); } return ret; } @@ -130,7 +130,7 @@ static int read_pages(struct address_space *mapping, struct file *filp, mapping_gfp_constraint(mapping, GFP_KERNEL))) { mapping->a_ops->readpage(filp, page); } - page_cache_release(page); + put_page(page); } ret = 0; @@ -163,7 +163,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, if (isize == 0) goto out; - end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + end_index = ((isize - 1) >> PAGE_SHIFT); /* * Preallocate as many pages as we will need. @@ -216,7 +216,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, while (nr_to_read) { int err; - unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE; + unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE; if (this_chunk > nr_to_read) this_chunk = nr_to_read; @@ -425,7 +425,7 @@ ondemand_readahead(struct address_space *mapping, * trivial case: (offset - prev_offset) == 1 * unaligned reads: (offset - prev_offset) == 0 */ - prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT; + prev_offset = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; if (offset - prev_offset <= 1UL) goto initial_readahead; @@ -558,8 +558,8 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) if (f.file) { if (f.file->f_mode & FMODE_READ) { struct address_space *mapping = f.file->f_mapping; - pgoff_t start = offset >> PAGE_CACHE_SHIFT; - pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + count - 1) >> PAGE_SHIFT; unsigned long len = end - start + 1; ret = do_readahead(mapping, f.file, start, len); } diff --git a/mm/rmap.c b/mm/rmap.c index c399a0d..525b92f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1555,7 +1555,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, discard: page_remove_rmap(page, PageHuge(page)); - page_cache_release(page); + put_page(page); out_unmap: pte_unmap_unlock(pte, ptl); diff --git a/mm/shmem.c b/mm/shmem.c index 9428c51..719bd6b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -75,8 +75,8 @@ static struct vfsmount *shm_mnt; #include "internal.h" -#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) -#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) +#define BLOCKS_PER_PAGE (PAGE_SIZE/512) +#define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 @@ -176,13 +176,13 @@ static inline int shmem_reacct_size(unsigned long flags, static inline int shmem_acct_block(unsigned long flags) { return (flags & VM_NORESERVE) ? 
- security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0; + security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_SIZE)) : 0; } static inline void shmem_unacct_blocks(unsigned long flags, long pages) { if (flags & VM_NORESERVE) - vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE)); + vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } static const struct super_operations shmem_ops; @@ -300,7 +300,7 @@ static int shmem_add_to_page_cache(struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - page_cache_get(page); + get_page(page); page->mapping = mapping; page->index = index; @@ -318,7 +318,7 @@ static int shmem_add_to_page_cache(struct page *page, } else { page->mapping = NULL; spin_unlock_irq(&mapping->tree_lock); - page_cache_release(page); + put_page(page); } return error; } @@ -338,7 +338,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) __dec_zone_page_state(page, NR_FILE_PAGES); __dec_zone_page_state(page, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); - page_cache_release(page); + put_page(page); BUG_ON(error); } @@ -474,10 +474,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT; - unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1); - unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); + pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgoff_t end = (lend + 1) >> PAGE_SHIFT; + unsigned int partial_start = lstart & (PAGE_SIZE - 1); + unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1); struct pagevec pvec; pgoff_t indices[PAGEVEC_SIZE]; long nr_swaps_freed = 0; @@ -530,7 +530,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, struct page *page = NULL; shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); if (page) { - unsigned int top = PAGE_CACHE_SIZE; + unsigned int top = PAGE_SIZE; if (start > end) { top = partial_end; partial_end = 0; @@ -538,7 +538,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, zero_user_segment(page, partial_start, top); set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); } } if (partial_end) { @@ -548,7 +548,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, zero_user_segment(page, 0, partial_end); set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); } } if (start >= end) @@ -833,7 +833,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) mem_cgroup_commit_charge(page, memcg, true, false); out: unlock_page(page); - page_cache_release(page); + put_page(page); return error; } @@ -1080,7 +1080,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, if (!newpage) return -ENOMEM; - page_cache_get(newpage); + get_page(newpage); copy_highpage(newpage, oldpage); flush_dcache_page(newpage); @@ -1120,8 +1120,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, set_page_private(oldpage, 0); unlock_page(oldpage); - page_cache_release(oldpage); - page_cache_release(oldpage); + put_page(oldpage); + put_page(oldpage); return error; } @@ -1145,7 +1145,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, int once = 0; int alloced = 0; - if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) + 
if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: swap.val = 0; @@ -1156,7 +1156,7 @@ repeat: } if (sgp != SGP_WRITE && sgp != SGP_FALLOC && - ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto unlock; } @@ -1169,7 +1169,7 @@ repeat: if (sgp != SGP_READ) goto clear; unlock_page(page); - page_cache_release(page); + put_page(page); page = NULL; } if (page || (sgp == SGP_READ && !swap.val)) { @@ -1327,7 +1327,7 @@ clear: /* Perhaps the file has been truncated since we checked */ if (sgp != SGP_WRITE && sgp != SGP_FALLOC && - ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { if (alloced) { ClearPageDirty(page); delete_from_page_cache(page); @@ -1355,7 +1355,7 @@ failed: unlock: if (page) { unlock_page(page); - page_cache_release(page); + put_page(page); } if (error == -ENOSPC && !once++) { info = SHMEM_I(inode); @@ -1577,7 +1577,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); - pgoff_t index = pos >> PAGE_CACHE_SHIFT; + pgoff_t index = pos >> PAGE_SHIFT; /* i_mutex is held by caller */ if (unlikely(info->seals)) { @@ -1601,16 +1601,16 @@ shmem_write_end(struct file *file, struct address_space *mapping, i_size_write(inode, pos + copied); if (!PageUptodate(page)) { - if (copied < PAGE_CACHE_SIZE) { - unsigned from = pos & (PAGE_CACHE_SIZE - 1); + if (copied < PAGE_SIZE) { + unsigned from = pos & (PAGE_SIZE - 1); zero_user_segments(page, 0, from, - from + copied, PAGE_CACHE_SIZE); + from + copied, PAGE_SIZE); } SetPageUptodate(page); } set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); return copied; } @@ -1635,8 +1635,8 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (!iter_is_iovec(to)) sgp = SGP_DIRTY; - index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + index = *ppos >> PAGE_SHIFT; + offset = *ppos & ~PAGE_MASK; for (;;) { struct page *page = NULL; @@ -1644,11 +1644,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) unsigned long nr, ret; loff_t i_size = i_size_read(inode); - end_index = i_size >> PAGE_CACHE_SHIFT; + end_index = i_size >> PAGE_SHIFT; if (index > end_index) break; if (index == end_index) { - nr = i_size & ~PAGE_CACHE_MASK; + nr = i_size & ~PAGE_MASK; if (nr <= offset) break; } @@ -1666,14 +1666,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * We must evaluate after, since reads (unlike writes) * are called without i_mutex protection against truncate */ - nr = PAGE_CACHE_SIZE; + nr = PAGE_SIZE; i_size = i_size_read(inode); - end_index = i_size >> PAGE_CACHE_SHIFT; + end_index = i_size >> PAGE_SHIFT; if (index == end_index) { - nr = i_size & ~PAGE_CACHE_MASK; + nr = i_size & ~PAGE_MASK; if (nr <= offset) { if (page) - page_cache_release(page); + put_page(page); break; } } @@ -1694,7 +1694,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) mark_page_accessed(page); } else { page = ZERO_PAGE(0); - page_cache_get(page); + get_page(page); } /* @@ -1704,10 +1704,10 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) ret = copy_page_to_iter(page, offset, nr, to); retval += ret; offset += ret; - index += offset >> PAGE_CACHE_SHIFT; - offset &= ~PAGE_CACHE_MASK; + index += offset >> 
PAGE_SHIFT; + offset &= ~PAGE_MASK; - page_cache_release(page); + put_page(page); if (!iov_iter_count(to)) break; if (ret < nr) { @@ -1717,7 +1717,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) cond_resched(); } - *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + *ppos = ((loff_t) index << PAGE_SHIFT) + offset; file_accessed(file); return retval ? retval : error; } @@ -1755,9 +1755,9 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (splice_grow_spd(pipe, &spd)) return -ENOMEM; - index = *ppos >> PAGE_CACHE_SHIFT; - loff = *ppos & ~PAGE_CACHE_MASK; - req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + index = *ppos >> PAGE_SHIFT; + loff = *ppos & ~PAGE_MASK; + req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT; nr_pages = min(req_pages, spd.nr_pages_max); spd.nr_pages = find_get_pages_contig(mapping, index, @@ -1774,7 +1774,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, index++; } - index = *ppos >> PAGE_CACHE_SHIFT; + index = *ppos >> PAGE_SHIFT; nr_pages = spd.nr_pages; spd.nr_pages = 0; @@ -1784,7 +1784,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (!len) break; - this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + this_len = min_t(unsigned long, len, PAGE_SIZE - loff); page = spd.pages[page_nr]; if (!PageUptodate(page) || page->mapping != mapping) { @@ -1793,19 +1793,19 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (error) break; unlock_page(page); - page_cache_release(spd.pages[page_nr]); + put_page(spd.pages[page_nr]); spd.pages[page_nr] = page; } isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + end_index = (isize - 1) >> PAGE_SHIFT; if (unlikely(!isize || index > end_index)) break; if (end_index == index) { unsigned int plen; - plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + plen = ((isize - 1) & ~PAGE_MASK) + 1; if (plen <= loff) break; @@ -1822,7 +1822,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, } while (page_nr < nr_pages) - page_cache_release(spd.pages[page_nr++]); + put_page(spd.pages[page_nr++]); if (spd.nr_pages) error = splice_to_pipe(pipe, &spd); @@ -1904,10 +1904,10 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) else if (offset >= inode->i_size) offset = -ENXIO; else { - start = offset >> PAGE_CACHE_SHIFT; - end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + start = offset >> PAGE_SHIFT; + end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; new_offset = shmem_seek_hole_data(mapping, start, end, whence); - new_offset <<= PAGE_CACHE_SHIFT; + new_offset <<= PAGE_SHIFT; if (new_offset > offset) { if (new_offset < inode->i_size) offset = new_offset; @@ -2203,8 +2203,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto out; } - start = offset >> PAGE_CACHE_SHIFT; - end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + start = offset >> PAGE_SHIFT; + end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Try to avoid a swapstorm if len is impossible to satisfy */ if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { error = -ENOSPC; @@ -2237,8 +2237,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (error) { /* Remove the !PageUptodate pages we added */ shmem_undo_range(inode, - (loff_t)start << PAGE_CACHE_SHIFT, - (loff_t)index << PAGE_CACHE_SHIFT, true); + (loff_t)start << PAGE_SHIFT, + (loff_t)index << 
PAGE_SHIFT, true); goto undone; } @@ -2259,7 +2259,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, */ set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); cond_resched(); } @@ -2280,7 +2280,7 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); buf->f_type = TMPFS_MAGIC; - buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; @@ -2523,7 +2523,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s struct shmem_inode_info *info; len = strlen(symname) + 1; - if (len > PAGE_CACHE_SIZE) + if (len > PAGE_SIZE) return -ENAMETOOLONG; inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); @@ -2562,7 +2562,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s SetPageUptodate(page); set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); } dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = CURRENT_TIME; @@ -2835,7 +2835,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if (*rest) goto bad_val; sbinfo->max_blocks = - DIV_ROUND_UP(size, PAGE_CACHE_SIZE); + DIV_ROUND_UP(size, PAGE_SIZE); } else if (!strcmp(this_char,"nr_blocks")) { sbinfo->max_blocks = memparse(value, &rest); if (*rest) @@ -2940,7 +2940,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) if (sbinfo->max_blocks != shmem_default_max_blocks()) seq_printf(seq, ",size=%luk", - sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10)); + sbinfo->max_blocks << (PAGE_SHIFT - 10)); if (sbinfo->max_inodes != shmem_default_max_inodes()) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) @@ -3082,8 +3082,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) sbinfo->free_inodes = sbinfo->max_inodes; sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; sb->s_time_gran = 1; diff --git a/mm/swap.c b/mm/swap.c index 09fe5e9..ea641e2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -114,7 +114,7 @@ void put_pages_list(struct list_head *pages) victim = list_entry(pages->prev, struct page, lru); list_del(&victim->lru); - page_cache_release(victim); + put_page(victim); } } EXPORT_SYMBOL(put_pages_list); @@ -142,7 +142,7 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, return seg; pages[seg] = kmap_to_page(kiov[seg].iov_base); - page_cache_get(pages[seg]); + get_page(pages[seg]); } return seg; @@ -236,7 +236,7 @@ void rotate_reclaimable_page(struct page *page) struct pagevec *pvec; unsigned long flags; - page_cache_get(page); + get_page(page); local_irq_save(flags); pvec = this_cpu_ptr(&lru_rotate_pvecs); if (!pagevec_add(pvec, page)) @@ -294,7 +294,7 @@ void activate_page(struct page *page) if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); - page_cache_get(page); + get_page(page); if (!pagevec_add(pvec, page)) pagevec_lru_move_fn(pvec, __activate_page, NULL); put_cpu_var(activate_page_pvecs); @@ -389,7 +389,7 @@ static void __lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvec); - 
page_cache_get(page); + get_page(page); if (!pagevec_space(pvec)) __pagevec_lru_add(pvec); pagevec_add(pvec, page); @@ -646,7 +646,7 @@ void deactivate_page(struct page *page) if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); - page_cache_get(page); + get_page(page); if (!pagevec_add(pvec, page)) pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); put_cpu_var(lru_deactivate_pvecs); diff --git a/mm/swap_state.c b/mm/swap_state.c index 69cb246..366ce35 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -85,7 +85,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - page_cache_get(page); + get_page(page); SetPageSwapCache(page); set_page_private(page, entry.val); @@ -109,7 +109,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) VM_BUG_ON(error == -EEXIST); set_page_private(page, 0UL); ClearPageSwapCache(page); - page_cache_release(page); + put_page(page); } return error; @@ -226,7 +226,7 @@ void delete_from_swap_cache(struct page *page) spin_unlock_irq(&address_space->tree_lock); swapcache_free(entry); - page_cache_release(page); + put_page(page); } /* @@ -252,7 +252,7 @@ static inline void free_swap_cache(struct page *page) void free_page_and_swap_cache(struct page *page) { free_swap_cache(page); - page_cache_release(page); + put_page(page); } /* @@ -380,7 +380,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } while (err != -ENOMEM); if (new_page) - page_cache_release(new_page); + put_page(new_page); return found_page; } @@ -495,7 +495,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, continue; if (offset != entry_offset) SetPageReadahead(page); - page_cache_release(page); + put_page(page); } blk_finish_plug(&plug); diff --git a/mm/swapfile.c b/mm/swapfile.c index 560ad38..83874ec 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -119,7 +119,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) ret = try_to_free_swap(page); unlock_page(page); } - page_cache_release(page); + put_page(page); return ret; } @@ -1000,7 +1000,7 @@ int free_swap_and_cache(swp_entry_t entry) page = find_get_page(swap_address_space(entry), entry.val); if (page && !trylock_page(page)) { - page_cache_release(page); + put_page(page); page = NULL; } } @@ -1017,7 +1017,7 @@ int free_swap_and_cache(swp_entry_t entry) SetPageDirty(page); } unlock_page(page); - page_cache_release(page); + put_page(page); } return p != NULL; } @@ -1518,7 +1518,7 @@ int try_to_unuse(unsigned int type, bool frontswap, } if (retval) { unlock_page(page); - page_cache_release(page); + put_page(page); break; } @@ -1570,7 +1570,7 @@ int try_to_unuse(unsigned int type, bool frontswap, */ SetPageDirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); /* * Make sure that we aren't completely killing @@ -2574,7 +2574,7 @@ bad_swap: out: if (page && !IS_ERR(page)) { kunmap(page); - page_cache_release(page); + put_page(page); } if (name) putname(name); diff --git a/mm/truncate.c b/mm/truncate.c index 7598b55..b002728 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -118,7 +118,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) return -EIO; if (page_has_private(page)) - do_invalidatepage(page, 0, PAGE_CACHE_SIZE); + do_invalidatepage(page, 0, PAGE_SIZE); /* * Some filesystems seem to re-dirty the page even after @@ -159,8 +159,8 @@ int 
truncate_inode_page(struct address_space *mapping, struct page *page) { if (page_mapped(page)) { unmap_mapping_range(mapping, - (loff_t)page->index << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); + (loff_t)page->index << PAGE_SHIFT, + PAGE_SIZE, 0); } return truncate_complete_page(mapping, page); } @@ -241,8 +241,8 @@ void truncate_inode_pages_range(struct address_space *mapping, return; /* Offsets within partial pages */ - partial_start = lstart & (PAGE_CACHE_SIZE - 1); - partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); + partial_start = lstart & (PAGE_SIZE - 1); + partial_end = (lend + 1) & (PAGE_SIZE - 1); /* * 'start' and 'end' always covers the range of pages to be fully @@ -250,7 +250,7 @@ void truncate_inode_pages_range(struct address_space *mapping, * start of the range and 'partial_end' at the end of the range. * Note that 'end' is exclusive while 'lend' is inclusive. */ - start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; if (lend == -1) /* * lend == -1 indicates end-of-file so we have to set 'end' @@ -259,7 +259,7 @@ void truncate_inode_pages_range(struct address_space *mapping, */ end = -1; else - end = (lend + 1) >> PAGE_CACHE_SHIFT; + end = (lend + 1) >> PAGE_SHIFT; pagevec_init(&pvec, 0); index = start; @@ -298,7 +298,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (partial_start) { struct page *page = find_lock_page(mapping, start - 1); if (page) { - unsigned int top = PAGE_CACHE_SIZE; + unsigned int top = PAGE_SIZE; if (start > end) { /* Truncation within a single page */ top = partial_end; @@ -311,7 +311,7 @@ void truncate_inode_pages_range(struct address_space *mapping, do_invalidatepage(page, partial_start, top - partial_start); unlock_page(page); - page_cache_release(page); + put_page(page); } } if (partial_end) { @@ -324,7 +324,7 @@ void truncate_inode_pages_range(struct address_space *mapping, do_invalidatepage(page, 0, partial_end); unlock_page(page); - page_cache_release(page); + put_page(page); } } /* @@ -538,7 +538,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); - page_cache_release(page); /* pagecache ref */ + put_page(page); /* pagecache ref */ return 1; failed: spin_unlock_irqrestore(&mapping->tree_lock, flags); @@ -608,18 +608,18 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * Zap the rest of the file in one hit. */ unmap_mapping_range(mapping, - (loff_t)index << PAGE_CACHE_SHIFT, + (loff_t)index << PAGE_SHIFT, (loff_t)(1 + end - index) - << PAGE_CACHE_SHIFT, - 0); + << PAGE_SHIFT, + 0); did_range_unmap = 1; } else { /* * Just zap this page */ unmap_mapping_range(mapping, - (loff_t)index << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); + (loff_t)index << PAGE_SHIFT, + PAGE_SIZE, 0); } } BUG_ON(page_mapped(page)); @@ -744,14 +744,14 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) WARN_ON(to > inode->i_size); - if (from >= to || bsize == PAGE_CACHE_SIZE) + if (from >= to || bsize == PAGE_SIZE) return; /* Page straddling @from will not have any hole block created? */ rounded_from = round_up(from, bsize); - if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1))) + if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1))) return; - index = from >> PAGE_CACHE_SHIFT; + index = from >> PAGE_SHIFT; page = find_lock_page(inode->i_mapping, index); /* Page not cached? 
Nothing to do */ if (!page) @@ -763,7 +763,7 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) if (page_mkclean(page)) set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); } EXPORT_SYMBOL(pagecache_isize_extended); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9f3a029..af817e5 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -93,7 +93,7 @@ out_release_uncharge_unlock: pte_unmap_unlock(dst_pte, ptl); mem_cgroup_cancel_charge(page, memcg, false); out_release: - page_cache_release(page); + put_page(page); goto out; } @@ -287,7 +287,7 @@ out_unlock: up_read(&dst_mm->mmap_sem); out: if (page) - page_cache_release(page); + put_page(page); BUG_ON(copied < 0); BUG_ON(err > 0); BUG_ON(!copied && !err); diff --git a/mm/zswap.c b/mm/zswap.c index bf14508..91dad80 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -869,7 +869,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_EXIST: /* page is already in the swap cache, ignore for now */ - page_cache_release(page); + put_page(page); ret = -EEXIST; goto fail; @@ -897,7 +897,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) /* start writeback */ __swap_writepage(page, &wbc, end_swap_bio_write); - page_cache_release(page); + put_page(page); zswap_written_back_pages++; spin_lock(&tree->lock); -- cgit v1.1 From ea1754a084760e68886f5b725c8eaada9cc57155 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 1 Apr 2016 15:29:48 +0300 Subject: mm, fs: remove remaining PAGE_CACHE_* and page_cache_{get,release} usage Mostly direct substitution with occasional adjustment or removing outdated comments. Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- mm/gup.c | 2 +- mm/memory.c | 1 - mm/mincore.c | 4 ++-- mm/swap.c | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 7f1c4fb..fb87aea 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1107,7 +1107,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) * @addr: user address * * Returns struct page pointer of user page pinned for dump, - * to be freed afterwards by page_cache_release() or put_page(). + * to be freed afterwards by put_page(). 
* * Returns NULL on any kind of failure - a hole must then be inserted into * the corefile, to preserve alignment with its headers; and also returns diff --git a/mm/memory.c b/mm/memory.c index 07a4204..93897f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2400,7 +2400,6 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, vba = vma->vm_pgoff; vea = vba + vma_pages(vma) - 1; - /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ zba = details->first_index; if (zba < vba) zba = vba; diff --git a/mm/mincore.c b/mm/mincore.c index 0eed23d..c0b5ba9 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -211,7 +211,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v * return values: * zero - success * -EFAULT - vec points to an illegal address - * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE + * -EINVAL - addr is not a multiple of PAGE_SIZE * -ENOMEM - Addresses in the range [addr, addr + len] are * invalid for the address space of this process, or * specify one or more pages which are not currently @@ -233,7 +233,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, if (!access_ok(VERIFY_READ, (void __user *) start, len)) return -ENOMEM; - /* This also avoids any overflows on PAGE_CACHE_ALIGN */ + /* This also avoids any overflows on PAGE_ALIGN */ pages = len >> PAGE_SHIFT; pages += (offset_in_page(len)) != 0; diff --git a/mm/swap.c b/mm/swap.c index ea641e2..a0bc206 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -698,7 +698,7 @@ void lru_add_drain_all(void) } /** - * release_pages - batched page_cache_release() + * release_pages - batched put_page() * @pages: array of pages to release * @nr: number of pages * @cold: whether the pages are cache cold -- cgit v1.1
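Why "mostly direct substitution" is safe in the hunks above: before this series the page-cache helpers were plain aliases for the generic page primitives, so every PAGE_CACHE_* constant and page_cache_{get,release} call could be renamed one-for-one with no behavioural change. Below is a minimal sketch, reconstructed from memory, of roughly what include/linux/pagemap.h provided prior to the removal; it is shown only to illustrate the equivalence, not as an exact copy of the removed header.

    /* Historical page-cache aliases (sketch, since removed).
     * Each wrapper simply forwarded to the generic PAGE_* constant
     * or the get_page()/put_page() refcount helpers, which is why the
     * shmem, truncate and mincore hunks above are pure renames. */
    #define PAGE_CACHE_SHIFT        PAGE_SHIFT
    #define PAGE_CACHE_SIZE         PAGE_SIZE
    #define PAGE_CACHE_MASK         PAGE_MASK
    #define PAGE_CACHE_ALIGN(addr)  (((addr) + PAGE_CACHE_SIZE - 1) & PAGE_CACHE_MASK)

    #define page_cache_get(page)        get_page(page)
    #define page_cache_release(page)    put_page(page)

With the aliases gone, callers pin a page with get_page() and drop the reference with put_page() directly, and all index/offset arithmetic uses PAGE_SHIFT, PAGE_SIZE and PAGE_MASK, exactly as the converted call sites above now read.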