From 6a618957ad17d8f4f4c7eeede752685374b1b176 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:17:26 -0700 Subject: mm: oom_kill: don't ignore oom score on exiting tasks When the OOM killer scans tasks and encounters a PF_EXITING one, it force-selects that task regardless of the score. The problem is that if that task got stuck waiting for some state the allocation site is holding, the OOM reaper can not move on to the next best victim. Frankly, I don't even know why we check for exiting tasks in the OOM killer. We've tried direct reclaim at least 15 times by the time we decide the system is OOM; there was plenty of time to exit and free memory, and a task might exit voluntarily right after we issue a kill. This is testing pure noise. Remove it. Signed-off-by: Tetsuo Handa Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: David Rientjes Cc: Oleg Nesterov Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e97a05d..63ced70 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -287,9 +287,6 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, if (oom_task_origin(task)) return OOM_SCAN_SELECT; - if (task_will_free_mem(task) && !is_sysrq_oom(oc)) - return OOM_SCAN_ABORT; - return OOM_SCAN_OK; } -- cgit v1.1 From fcff7d7eebe6d31e2ce20d994555c86a90197034 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:29 -0700 Subject: mm: memcontrol: do not bypass slab charge if memcg is offline Slab pages are charged in two steps. First, an appropriate per memcg cache is selected (see memcg_kmem_get_cache) based on the current context; then the new slab page is charged to the memory cgroup which the selected cache was created for (see memcg_charge_slab -> __memcg_kmem_charge_memcg). It is OK to bypass the kmemcg charge at step 1, but if step 1 succeeded and we successfully allocated a new slab page, step 2 must be performed; otherwise we would get a per memcg kmem cache which contains a slab that does not hold a reference to the memory cgroup owning the cache. Since per memcg kmem caches are destroyed on memcg css free, this could result in freeing a cache while there are still active objects in it. However, currently we will bypass the slab page charge if the memory cgroup owning the cache is offline (see __memcg_kmem_charge_memcg). This is very unlikely to occur in practice, because for this to happen a process must be migrated to a different cgroup and the old cgroup must be removed while the process is in kmalloc somewhere between steps 1 and 2 (e.g. trying to allocate a new page). Nevertheless, it's still better to eliminate such a possibility.
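For illustration, the two-step flow described above looks roughly like this (a simplified sketch with approximate signatures, not the actual kernel code; the real path goes through memcg_kmem_get_cache(), memcg_charge_slab() and __memcg_kmem_charge_memcg()):

/*
 * Step 1 picks the per-memcg cache for the current task; it may fall
 * back to the root cache, which legitimately bypasses accounting.
 * Step 2 charges the freshly allocated slab page to the memcg the
 * selected cache belongs to. Once a per-memcg cache was chosen,
 * skipping step 2 (e.g. because the memcg went offline meanwhile)
 * would leave a slab that holds no reference on that memcg.
 */
static struct page *alloc_slab_page_sketch(struct kmem_cache *cachep,
					   gfp_t gfp, int order)
{
	struct kmem_cache *s = memcg_kmem_get_cache(cachep, gfp);	/* step 1 */
	struct page *page = alloc_pages(gfp, order);

	if (!page)
		return NULL;
	if (memcg_charge_slab(page, gfp, order, s)) {			/* step 2 */
		__free_pages(page, order);
		return NULL;
	}
	return page;
}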
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 42882c1..5c9d45e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2325,9 +2325,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct page_counter *counter; int ret; - if (!memcg_kmem_online(memcg)) - return 0; - ret = try_charge(memcg, gfp, nr_pages); if (ret) return ret; @@ -2346,10 +2343,11 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; - int ret; + int ret = 0; memcg = get_mem_cgroup_from_mm(current->mm); - ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); + if (memcg_kmem_online(memcg)) + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); return ret; } -- cgit v1.1 From 72b54e7314a2e7a68567c92bbb32fe2598a3c783 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:32 -0700 Subject: mm: memcontrol: make tree_{stat,events} fetch all stats Currently, tree_{stat,events} helpers can only get one stat index at a time, so when there are a lot of stats to be reported one has to call it over and over again (see memory_stat_show). This is neither effective, nor does it look good. Instead, let's make these helpers take a snapshot of all available counters. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 67 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5c9d45e..4302660 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2717,39 +2717,48 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return retval; } -static unsigned long tree_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) +static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) { struct mem_cgroup *iter; - unsigned long val = 0; + int i; - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_stat(iter, idx); + memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT); - return val; + for_each_mem_cgroup_tree(iter, memcg) { + for (i = 0; i < MEMCG_NR_STAT; i++) + stat[i] += mem_cgroup_read_stat(iter, i); + } } -static unsigned long tree_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx) +static void tree_events(struct mem_cgroup *memcg, unsigned long *events) { struct mem_cgroup *iter; - unsigned long val = 0; + int i; - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_events(iter, idx); + memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS); - return val; + for_each_mem_cgroup_tree(iter, memcg) { + for (i = 0; i < MEMCG_NR_EVENTS; i++) + events[i] += mem_cgroup_read_events(iter, i); + } } static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { - unsigned long val; + unsigned long val = 0; if (mem_cgroup_is_root(memcg)) { - val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); - if (swap) - val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, memcg) { + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_CACHE); + val += 
mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_RSS); + if (swap) + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_SWAP); + } } else { if (!swap) val = page_counter_read(&memcg->memory); @@ -5075,6 +5084,8 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long stat[MEMCG_NR_STAT]; + unsigned long events[MEMCG_NR_EVENTS]; int i; /* @@ -5088,22 +5099,22 @@ static int memory_stat_show(struct seq_file *m, void *v) * Current memory state: */ + tree_stat(memcg, stat); + tree_events(memcg, events); + seq_printf(m, "anon %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); seq_printf(m, "sock %llu\n", - (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE); + (u64)stat[MEMCG_SOCK] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE); for (i = 0; i < NR_LRU_LISTS; i++) { struct mem_cgroup *mi; @@ -5118,9 +5129,9 @@ static int memory_stat_show(struct seq_file *m, void *v) /* Accumulated memory events */ seq_printf(m, "pgfault %lu\n", - tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT)); + events[MEM_CGROUP_EVENTS_PGFAULT]); seq_printf(m, "pgmajfault %lu\n", - tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT)); + events[MEM_CGROUP_EVENTS_PGMAJFAULT]); return 0; } -- cgit v1.1 From 27ee57c93ff00b8a2d6c6dd6b0b3dddda7b43b77 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:35 -0700 Subject: mm: memcontrol: report slab usage in cgroup2 memory.stat Show how much memory is used for storing reclaimable and unreclaimable in-kernel data structures allocated from slab caches. 
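For example, the new counters appear as regular fields in each cgroup's memory.stat and can be read like any other entry. A minimal userspace sketch follows; the cgroup2 mount point /sys/fs/cgroup and the group name "mygroup" are only placeholders, and the field names (slab, slab_reclaimable, slab_unreclaimable) are the ones added by this patch:

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/cgroup/mygroup/memory.stat", "r");
	char key[64];
	unsigned long long val;

	if (!f)
		return 1;
	/* memory.stat is a flat "name value" list; values are in bytes. */
	while (fscanf(f, "%63s %llu", key, &val) == 2) {
		if (strncmp(key, "slab", 4) == 0)
			printf("%s = %llu bytes\n", key, val);
	}
	fclose(f);
	return 0;
}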
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++++++ mm/slab.c | 8 +++++--- mm/slab.h | 30 ++++++++++++++++++++++++++++-- mm/slub.c | 3 ++- 4 files changed, 43 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4302660..3ad64bf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5106,6 +5106,9 @@ static int memory_stat_show(struct seq_file *m, void *v) (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + seq_printf(m, "slab %llu\n", + (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", (u64)stat[MEMCG_SOCK] * PAGE_SIZE); @@ -5126,6 +5129,11 @@ static int memory_stat_show(struct seq_file *m, void *v) mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); } + seq_printf(m, "slab_reclaimable %llu\n", + (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE); + seq_printf(m, "slab_unreclaimable %llu\n", + (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + /* Accumulated memory events */ seq_printf(m, "pgfault %lu\n", diff --git a/mm/slab.c b/mm/slab.c index 852fc5c..56dd0df 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1442,9 +1442,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, */ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) { - const unsigned long nr_freed = (1 << cachep->gfporder); + int order = cachep->gfporder; + unsigned long nr_freed = (1 << order); - kmemcheck_free_shadow(page, cachep->gfporder); + kmemcheck_free_shadow(page, order); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) sub_zone_page_state(page_zone(page), @@ -1461,7 +1462,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_kmem_pages(page, cachep->gfporder); + memcg_uncharge_slab(page, order, cachep); + __free_pages(page, order); } static void kmem_rcu_free(struct rcu_head *head) diff --git a/mm/slab.h b/mm/slab.h index b793436..ff39a8f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -246,12 +246,33 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { + int ret; + if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - return __memcg_kmem_charge_memcg(page, gfp, order, - s->memcg_params.memcg); + + ret = __memcg_kmem_charge_memcg(page, gfp, order, + s->memcg_params.memcg); + if (ret) + return ret; + + memcg_kmem_update_page_stat(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + 1 << order); + return 0; +} + +static __always_inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ + memcg_kmem_update_page_stat(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
+ MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + -(1 << order)); + memcg_kmem_uncharge(page, order); } extern void slab_init_memcg_params(struct kmem_cache *); @@ -294,6 +315,11 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, return 0; } +static inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ +} + static inline void slab_init_memcg_params(struct kmem_cache *s) { } diff --git a/mm/slub.c b/mm/slub.c index 6c91324..712d534 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1540,7 +1540,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_kmem_pages(page, order); + memcg_uncharge_slab(page, order, s); + __free_pages(page, order); } #define need_reserve_slab_rcu \ -- cgit v1.1 From 12580e4b54ba8a1b22ec977c200be0174ca42348 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:38 -0700 Subject: mm: memcontrol: report kernel stack usage in cgroup2 memory.stat Show how much memory is allocated to kernel stacks. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3ad64bf..4b7dda7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5106,6 +5106,8 @@ static int memory_stat_show(struct seq_file *m, void *v) (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + seq_printf(m, "kernel_stack %llu\n", + (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); seq_printf(m, "slab %llu\n", (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); -- cgit v1.1 From 832fc1de01aea28255cb11d270679b7f1273f0d7 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 17 Mar 2016 14:17:41 -0700 Subject: /proc/kpageflags: return KPF_BUDDY for "tail" buddy pages Currently /proc/kpageflags returns nothing for "tail" buddy pages, which is inconvenient when grasping how free pages are distributed. This patch sets KPF_BUDDY for such pages. With this patch: $ grep MemFree /proc/meminfo ; tools/vm/page-types -b buddy MemFree: 3134992 kB flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000400 779272 3044 __________B_______________________________ buddy 0x0000000000000c00 4385 17 __________BM______________________________ buddy,mmap total 783657 3061 783657 pages is 3134628 kB (roughly consistent with the global counter,) so it's OK. 
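For reference, the same kind of count can also be derived directly from /proc/kpageflags, which exposes one u64 of flag bits per page frame (KPF_BUDDY is bit 10 in include/uapi/linux/kernel-page-flags.h). A minimal sketch, assuming root privileges; the scanned PFN range is just an example and error handling is omitted:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

#define KPF_BUDDY 10

int main(void)
{
	int fd = open("/proc/kpageflags", O_RDONLY);
	uint64_t flags, nr_buddy = 0;
	off_t pfn;

	if (fd < 0)
		return 1;
	/* Walk the first 1M page frames; each PFN has an 8-byte flags word. */
	for (pfn = 0; pfn < (1 << 20); pfn++) {
		if (pread(fd, &flags, sizeof(flags), pfn * sizeof(flags)) != sizeof(flags))
			break;
		if (flags & (1ULL << KPF_BUDDY))
			nr_buddy++;
	}
	printf("buddy pages: %llu\n", (unsigned long long)nr_buddy);
	close(fd);
	return 0;
}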
[akpm@linux-foundation.org: update comment, per Naoya] Signed-off-by: Naoya Horiguchi Reviewed-by: Vladimir Davydov > Cc: Konstantin Khlebnikov Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 --- mm/page_alloc.c | 2 -- 2 files changed, 5 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index ad9400d..b95952c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -148,9 +148,6 @@ extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); -#ifdef CONFIG_MEMORY_FAILURE -extern bool is_free_buddy_page(struct page *page); -#endif extern int user_min_free_kbytes; #if defined CONFIG_COMPACTION || defined CONFIG_CMA diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c46b75d..c7332a4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7152,7 +7152,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) } #endif -#ifdef CONFIG_MEMORY_FAILURE bool is_free_buddy_page(struct page *page) { struct zone *zone = page_zone(page); @@ -7171,4 +7170,3 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } -#endif -- cgit v1.1 From f48d97f340cbb0c323fa7a7b36bd76a108a9f49f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:17:49 -0700 Subject: mm/vmalloc: query dynamic DEBUG_PAGEALLOC setting As CONFIG_DEBUG_PAGEALLOC can be enabled/disabled via kernel parameters we can optimize some cases by checking the enablement state. This is follow-up work for Christian's Optimize CONFIG_DEBUG_PAGEALLOC: https://lkml.org/lkml/2016/1/27/194 Remaining work is to make sparc to be aware of this but it looks not easy for me so I skip that in this series. This patch (of 5): We can disable debug_pagealloc processing even if the code is complied with CONFIG_DEBUG_PAGEALLOC. This patch changes the code to query whether it is enabled or not in runtime. [akpm@linux-foundation.org: update comment, per David. Adjust comment to use 80 cols] Signed-off-by: Joonsoo Kim Reviewed-by: Christian Borntraeger Acked-by: David Rientjes Cc: Benjamin Herrenschmidt Cc: Takashi Iwai Cc: Chris Metcalf Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fb42a5b..d4b2e34 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -531,22 +531,21 @@ static void unmap_vmap_area(struct vmap_area *va) static void vmap_debug_free_range(unsigned long start, unsigned long end) { /* - * Unmap page tables and force a TLB flush immediately if - * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free - * bugs similarly to those in linear kernel virtual address - * space after a page has been freed. + * Unmap page tables and force a TLB flush immediately if pagealloc + * debugging is enabled. This catches use after free bugs similarly to + * those in linear kernel virtual address space after a page has been + * freed. * - * All the lazy freeing logic is still retained, in order to - * minimise intrusiveness of this debugging feature. + * All the lazy freeing logic is still retained, in order to minimise + * intrusiveness of this debugging feature. 
* - * This is going to be *slow* (linear kernel virtual address - * debugging doesn't do a broadcast TLB flush so it is a lot - * faster). + * This is going to be *slow* (linear kernel virtual address debugging + * doesn't do a broadcast TLB flush so it is a lot faster). */ -#ifdef CONFIG_DEBUG_PAGEALLOC - vunmap_page_range(start, end); - flush_tlb_kernel_range(start, end); -#endif + if (debug_pagealloc_enabled()) { + vunmap_page_range(start, end); + flush_tlb_kernel_range(start, end); + } } /* -- cgit v1.1 From 922d566cdcb7166c729ff67bb15ff5f93a3774b6 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:17:53 -0700 Subject: mm/slub: query dynamic DEBUG_PAGEALLOC setting We can disable debug_pagealloc processing even if the code is compiled with CONFIG_DEBUG_PAGEALLOC. This patch changes the code to query whether it is enabled or not in runtime. [akpm@linux-foundation.org: clean up code, per Christian] Signed-off-by: Joonsoo Kim Reviewed-by: Christian Borntraeger Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Christoph Lameter Cc: David Rientjes Cc: Pekka Enberg Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 712d534..2f2f04d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -254,11 +254,10 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { void *p; -#ifdef CONFIG_DEBUG_PAGEALLOC + if (!debug_pagealloc_enabled()) + return get_freepointer(s, object); + probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); -#else - p = get_freepointer(s, object); -#endif return p; } -- cgit v1.1 From 505f6d22dbc63f333d1178dc80264e40b5c35268 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:17:56 -0700 Subject: sound: query dynamic DEBUG_PAGEALLOC setting We can disable debug_pagealloc processing even if the code is compiled with CONFIG_DEBUG_PAGEALLOC. This patch changes the code to query whether it is enabled or not in runtime. [akpm@linux-foundation.org: export _debug_pagealloc_enabled to modules] Signed-off-by: Joonsoo Kim Acked-by: David Rientjes Acked-by: Takashi Iwai Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Christian Borntraeger Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c7332a4..b1fc19e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -498,6 +498,7 @@ void prep_compound_page(struct page *page, unsigned int order) unsigned int _debug_guardpage_minorder; bool _debug_pagealloc_enabled __read_mostly = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); +EXPORT_SYMBOL(_debug_pagealloc_enabled); bool _debug_guardpage_enabled __read_mostly; static int __init early_debug_pagealloc(char *buf) -- cgit v1.1 From 81c5857b279e6b18f6ff0d1975e80a07af542cd1 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:05 -0700 Subject: mm, kswapd: remove bogus check of balance_classzone_idx During work on kcompactd integration I have spotted a confusing check of balance_classzone_idx, which I believe is bogus. The balanced_classzone_idx is filled by balance_pgdat() as the highest zone it attempted to balance. This was introduced by commit dc83edd941f4 ("mm: kswapd: use the classzone idx that kswapd was using for sleeping_prematurely()"). 
The intention is that (as expressed in today's function names), the value used for kswapd_shrink_zone() calls in balance_pgdat() is the same as for the decisions in kswapd_try_to_sleep(). An unwanted side-effect of that commit was breaking the checks in kswapd() whether there was another kswapd_wakeup with a tighter (=lower) classzone_idx. Commits 215ddd6664ce ("mm: vmscan: only read new_classzone_idx from pgdat when reclaiming successfully") and d2ebd0f6b895 ("kswapd: avoid unnecessary rebalance after an unsuccessful balancing") tried to fix it, but apparently introduced a bogus check that this patch removes. Consider zone indexes X < Y < Z, where: - Z is the value used for the first kswapd wakeup. - Y is returned as balanced_classzone_idx, which means zones with index higher than Y (including Z) were found to be unreclaimable. - X is the value used for the second kswapd wakeup The new wakeup with value X means that kswapd is now supposed to balance harder all zones with index <= X. But instead, due to Y < Z, it will go to sleep and won't read the new value X. This is subtly wrong. The effect of this patch is that kswapd will react better in some situations, where e.g. the first wakeup is for ZONE_DMA32, the second is for ZONE_DMA, and ZONE_NORMAL is unreclaimable. Before this patch, kswapd would go to sleep instead of reclaiming ZONE_DMA harder. I expect these situations are very rare, and more value is in better maintainability due to the removal of the confusing and bogus check. Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index dd98447..5dcc711 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3468,8 +3468,7 @@ static int kswapd(void *p) * new request of a similar or harder type will succeed soon * so consider going to sleep on the basis we reclaimed at */ - if (balanced_classzone_idx >= new_classzone_idx && - balanced_order == new_order) { + if (balanced_order == new_order) { new_order = pgdat->kswapd_max_order; new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; -- cgit v1.1 From 698b1b30642f1ff0ea10ef1de9745ab633031377 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:08 -0700 Subject: mm, compaction: introduce kcompactd Memory compaction can be currently performed in several contexts: - kswapd balancing a zone after a high-order allocation failure - direct compaction to satisfy a high-order allocation, including THP page fault attempts - khugepaged trying to collapse a hugepage - manually from /proc The purpose of compaction is two-fold. The obvious purpose is to satisfy a (pending or future) high-order allocation, and is easy to evaluate. The other purpose is to keep overall memory fragmentation low and help the anti-fragmentation mechanism. The success wrt the latter purpose is more difficult to evaluate. The current situation wrt the purposes has a few drawbacks: - compaction is invoked only when a high-order page or hugepage is not available (or manually). This might be too late for the purposes of keeping memory fragmentation low. - direct compaction increases latency of allocations. Again, it would be better if compaction was performed asynchronously to keep fragmentation low, before the allocation itself comes.
- (a special case of the previous) the cost of compaction during THP page faults can easily offset the benefits of THP. - kswapd compaction appears to be complex, fragile and not working in some scenarios. It could also end up compacting for a high-order allocation request when it should be reclaiming memory for a later order-0 request. To improve the situation, we should be able to benefit from an equivalent of kswapd, but for compaction - i.e. a background thread which responds to fragmentation and the need for high-order allocations (including hugepages) somewhat proactively. One possibility is to extend the responsibilities of kswapd, which could however complicate its design too much. It should be better to let kswapd handle reclaim, as order-0 allocations are often more critical than high-order ones. Another possibility is to extend khugepaged, but this kthread is a single instance and tied to THP configs. This patch goes with the option of a new set of per-node kthreads called kcompactd, and lays the foundations, without introducing any new tunables. The lifecycle mimics that of kswapd kthreads, including the memory hotplug hooks. For compaction, kcompactd uses the standard compaction_suitable() and compact_finished() criteria and the deferred compaction functionality. Unlike direct compaction, it uses only sync compaction, as there's no allocation latency to minimize. This patch doesn't yet add a call to wakeup_kcompactd. The kswapd compact/reclaim loop for high-order pages will be replaced by waking up kcompactd in the next patch with the description of what's wrong with the old approach. Waking up of the kcompactd threads is also tied to kswapd activity and follows these rules: - we don't want to affect any fastpaths, so wake up kcompactd only from the slowpath, as it's done for kswapd - if kswapd is doing reclaim, it's more important than compaction, so don't invoke kcompactd until kswapd goes to sleep - the target order used for kswapd is passed to kcompactd Possible future uses for kcompactd include the ability to wake up kcompactd on demand in special situations, such as when hugepages are not available (currently not done due to __GFP_NO_KSWAPD) or when a fragmentation event (i.e. __rmqueue_fallback()) occurs. It's also possible to perform periodic compaction with kcompactd. [arnd@arndb.de: fix build errors with kcompactd] [paul.gortmaker@windriver.com: don't use modular references for non modular code] Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Arnd Bergmann Signed-off-by: Paul Gortmaker Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/memory_hotplug.c | 9 ++- mm/page_alloc.c | 3 + mm/vmstat.c | 1 + 4 files changed, 233 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 93f71d9..5b2bfba 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -7,6 +7,7 @@ * * Copyright IBM Corp.
2007-2010 Mel Gorman */ +#include #include #include #include @@ -17,6 +18,8 @@ #include #include #include +#include +#include #include "internal.h" #ifdef CONFIG_COMPACTION @@ -1736,4 +1739,223 @@ void compaction_unregister_node(struct node *node) } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ +static inline bool kcompactd_work_requested(pg_data_t *pgdat) +{ + return pgdat->kcompactd_max_order > 0; +} + +static bool kcompactd_node_suitable(pg_data_t *pgdat) +{ + int zoneid; + struct zone *zone; + enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; + + for (zoneid = 0; zoneid < classzone_idx; zoneid++) { + zone = &pgdat->node_zones[zoneid]; + + if (!populated_zone(zone)) + continue; + + if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, + classzone_idx) == COMPACT_CONTINUE) + return true; + } + + return false; +} + +static void kcompactd_do_work(pg_data_t *pgdat) +{ + /* + * With no special task, compact all zones so that a page of requested + * order is allocatable. + */ + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = pgdat->kcompactd_max_order, + .classzone_idx = pgdat->kcompactd_classzone_idx, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = true, + + }; + bool success = false; + + trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, + cc.classzone_idx); + count_vm_event(KCOMPACTD_WAKE); + + for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) { + int status; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + if (compaction_deferred(zone, cc.order)) + continue; + + if (compaction_suitable(zone, cc.order, 0, zoneid) != + COMPACT_CONTINUE) + continue; + + cc.nr_freepages = 0; + cc.nr_migratepages = 0; + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + status = compact_zone(zone, &cc); + + if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone), + cc.classzone_idx, 0)) { + success = true; + compaction_defer_reset(zone, cc.order, false); + } else if (status == COMPACT_COMPLETE) { + /* + * We use sync migration mode here, so we defer like + * sync direct compaction does. + */ + defer_compaction(zone, cc.order); + } + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + /* + * Regardless of success, we are done until woken up next. But remember + * the requested order/classzone_idx in case it was higher/tighter than + * our current ones + */ + if (pgdat->kcompactd_max_order <= cc.order) + pgdat->kcompactd_max_order = 0; + if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx) + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; +} + +void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +{ + if (!order) + return; + + if (pgdat->kcompactd_max_order < order) + pgdat->kcompactd_max_order = order; + + if (pgdat->kcompactd_classzone_idx > classzone_idx) + pgdat->kcompactd_classzone_idx = classzone_idx; + + if (!waitqueue_active(&pgdat->kcompactd_wait)) + return; + + if (!kcompactd_node_suitable(pgdat)) + return; + + trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, + classzone_idx); + wake_up_interruptible(&pgdat->kcompactd_wait); +} + +/* + * The background compaction daemon, started as a kernel thread + * from the init process. 
+ */ +static int kcompactd(void *p) +{ + pg_data_t *pgdat = (pg_data_t*)p; + struct task_struct *tsk = current; + + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + + set_freezable(); + + pgdat->kcompactd_max_order = 0; + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + + while (!kthread_should_stop()) { + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); + wait_event_freezable(pgdat->kcompactd_wait, + kcompactd_work_requested(pgdat)); + + kcompactd_do_work(pgdat); + } + + return 0; +} + +/* + * This kcompactd start function will be called by init and node-hot-add. + * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added. + */ +int kcompactd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int ret = 0; + + if (pgdat->kcompactd) + return 0; + + pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid); + if (IS_ERR(pgdat->kcompactd)) { + pr_err("Failed to start kcompactd on node %d\n", nid); + ret = PTR_ERR(pgdat->kcompactd); + pgdat->kcompactd = NULL; + } + return ret; +} + +/* + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold mem_hotplug_begin/end(). + */ +void kcompactd_stop(int nid) +{ + struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd; + + if (kcompactd) { + kthread_stop(kcompactd); + NODE_DATA(nid)->kcompactd = NULL; + } +} + +/* + * It's optimal to keep kcompactd on the same CPUs as their memory, but + * not required for correctness. So if the last cpu in a node goes + * away, we get changed to run anywhere: as the first one comes back, + * restore their cpu bindings. + */ +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int nid; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->kcompactd, mask); + } + } + return NOTIFY_OK; +} + +static int __init kcompactd_init(void) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) + kcompactd_run(nid); + hotcpu_notifier(cpu_callback, 0); + return 0; +} +subsys_initcall(kcompactd_init) + #endif /* CONFIG_COMPACTION */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 24ea063..d9bcb26 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -1105,8 +1106,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ init_per_zone_wmark_min(); - if (onlined_pages) + if (onlined_pages) { kswapd_run(zone_to_nid(zone)); + kcompactd_run(nid); + } vm_total_pages = nr_free_pagecache_pages(); @@ -1880,8 +1883,10 @@ repeat: zone_pcp_update(zone); node_states_clear_node(node, &arg); - if (arg.status_change_nid >= 0) + if (arg.status_change_nid >= 0) { kswapd_stop(node); + kcompactd_stop(node); + } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b1fc19e..25a75da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5405,6 +5405,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); +#ifdef CONFIG_COMPACTION + 
init_waitqueue_head(&pgdat->kcompactd_wait); +#endif pgdat_page_ext_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { diff --git a/mm/vmstat.c b/mm/vmstat.c index 69ce64f..f800662 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -826,6 +826,7 @@ const char * const vmstat_text[] = { "compact_stall", "compact_fail", "compact_success", + "compact_daemon_wake", #endif #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.1 From e888ca3545dc6823603b976e40b62af2c68b6fcc Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:12 -0700 Subject: mm, memory hotplug: small cleanup in online_pages() We can reuse the nid we've determined instead of repeated pfn_to_nid() usages. Also zone_to_nid() should be a bit cheaper in general than pfn_to_nid(). Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d9bcb26..c832ef3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1055,7 +1055,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = pfn_to_nid(pfn); + nid = zone_to_nid(zone); ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); @@ -1095,7 +1095,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ pgdat_resize_unlock(zone->zone_pgdat, &flags); if (onlined_pages) { - node_states_set_node(zone_to_nid(zone), &arg); + node_states_set_node(nid, &arg); if (need_zonelists_rebuild) build_all_zonelists(NULL, NULL); else @@ -1107,7 +1107,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ init_per_zone_wmark_min(); if (onlined_pages) { - kswapd_run(zone_to_nid(zone)); + kswapd_run(nid); kcompactd_run(nid); } -- cgit v1.1 From accf62422b3a67fce8ce086aa81c8300ddbf42be Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:15 -0700 Subject: mm, kswapd: replace kswapd compaction with waking up kcompactd Similarly to direct reclaim/compaction, kswapd attempts to combine reclaim and compaction to make memory allocation of a given order available. The details differ from direct reclaim e.g. in having the high watermark as a goal. The code involved in kswapd's reclaim/compaction decisions has evolved to be quite complex. Testing reveals that it doesn't actually work in at least one scenario, and closer inspection suggests that it could be greatly simplified without compromising on the goal (make high-order page available) or efficiency (don't reclaim too much). The simplification relies on doing all compaction in kcompactd, which is simply woken up when high watermarks are reached by kswapd's reclaim. The scenario where kswapd compaction doesn't work was found with mmtests test stress-highalloc configured to attempt order-9 allocations without direct reclaim, just waking up kswapd. There was no compaction attempt from kswapd during the whole test.
Some added instrumentation shows what happens: - balance_pgdat() sets end_zone to Normal, as it's not balanced - reclaim is attempted on DMA zone, which sets nr_attempted to 99, but it cannot reclaim anything, so sc.nr_reclaimed is 0 - for zones DMA32 and Normal, kswapd_shrink_zone uses testorder=0, so it merely checks if high watermarks were reached for base pages. This is true, so no reclaim is attempted. For DMA, testorder=0 wasn't used, as compaction_suitable() returned COMPACT_SKIPPED - even though the pgdat_needs_compaction flag wasn't set to false, no compaction happens due to the condition sc.nr_reclaimed > nr_attempted being false (as 0 < 99) - priority-- due to nr_reclaimed being 0, repeat until priority reaches 0 pgdat_balanced() is false as only the small zone DMA appears balanced (curiously in that check, watermark appears OK and compaction_suitable() returns COMPACT_PARTIAL, because a lower classzone_idx is used there) Now, even if it was decided that reclaim shouldn't be attempted on the DMA zone, the scenario would be the same, as (sc.nr_reclaimed=0 > nr_attempted=0) is also false. The condition really should use >= as the comment suggests. Then there is a mismatch in the check for setting pgdat_needs_compaction to false using low watermark, while the rest uses high watermark, and who knows what other subtlety. Hopefully this demonstrates that this is unsustainable. Luckily we can simplify this a lot. The reclaim/compaction decisions make sense for direct reclaim scenario, but in kswapd, our primary goal is to reach high watermark in order-0 pages. Afterwards we can attempt compaction just once. Unlike direct reclaim, we don't reclaim extra pages (over the high watermark), the current code already disallows it for good reasons. After this patch, we simply wake up kcompactd to process the pgdat, after we have either succeeded or failed to reach the high watermarks in kswapd, which goes to sleep. We pass kswapd's order and classzone_idx, so kcompactd can apply the same criteria to determine which zones are worth compacting. Note that we use the classzone_idx from wakeup_kswapd(), not balanced_classzone_idx which can include higher zones that kswapd tried to balance too, but didn't consider them in pgdat_balanced(). Since kswapd now cannot create high-order pages itself, we need to adjust how it determines the zones to be balanced. The key element here is adding a "highorder" parameter to zone_balanced, which, when set to false, makes it consider only order-0 watermark instead of the desired higher order (this was done previously by kswapd_shrink_zone(), but not elsewhere). This false is passed for example in pgdat_balanced(). Importantly, wakeup_kswapd() uses true to make sure kswapd and thus kcompactd are woken up for a high-order allocation failure. The last thing is to decide what to do with pageblock_skip bitmap handling. Compaction maintains a pageblock_skip bitmap to record pageblocks where isolation recently failed. This bitmap can be reset by three ways: 1) direct compaction is restarting after going through the full deferred cycle 2) kswapd goes to sleep, and some other direct compaction has previously finished scanning the whole zone and set zone->compact_blockskip_flush. Note that a successful direct compaction clears this flag. 3) compaction was invoked manually via trigger in /proc The case 2) is somewhat fuzzy to begin with, but after introducing kcompactd we should update it. 
The check for direct compaction in 1), and to set the flush flag in 2) use current_is_kswapd(), which doesn't work for kcompactd. Thus, this patch adds bool direct_compaction to compact_control to use in 2). For the case 1) we remove the check completely - unlike the former kswapd compaction, kcompactd does use the deferred compaction functionality, so flushing tied to restarting from deferred compaction makes sense here. Note that when kswapd goes to sleep, kcompactd is woken up, so it will see the flushed pageblock_skip bits. This is different from when the former kswapd compaction observed the bits and I believe it makes more sense. Kcompactd can afford to be more thorough than a direct compaction trying to limit allocation latency, or kswapd whose primary goal is to reclaim. For testing, I used stress-highalloc configured to do order-9 allocations with GFP_NOWAIT|__GFP_HIGH|__GFP_COMP, so they relied just on kswapd/kcompactd reclaim/compaction (the interfering kernel builds in phases 1 and 2 work as usual): stress-highalloc 4.5-rc1+before 4.5-rc1+after -nodirect -nodirect Success 1 Min 1.00 ( 0.00%) 5.00 (-66.67%) Success 1 Mean 1.40 ( 0.00%) 6.20 (-55.00%) Success 1 Max 2.00 ( 0.00%) 7.00 (-16.67%) Success 2 Min 1.00 ( 0.00%) 5.00 (-66.67%) Success 2 Mean 1.80 ( 0.00%) 6.40 (-52.38%) Success 2 Max 3.00 ( 0.00%) 7.00 (-16.67%) Success 3 Min 34.00 ( 0.00%) 62.00 ( 1.59%) Success 3 Mean 41.80 ( 0.00%) 63.80 ( 1.24%) Success 3 Max 53.00 ( 0.00%) 65.00 ( 2.99%) User 3166.67 3181.09 System 1153.37 1158.25 Elapsed 1768.53 1799.37 4.5-rc1+before 4.5-rc1+after -nodirect -nodirect Direct pages scanned 32938 32797 Kswapd pages scanned 2183166 2202613 Kswapd pages reclaimed 2152359 2143524 Direct pages reclaimed 32735 32545 Percentage direct scans 1% 1% THP fault alloc 579 612 THP collapse alloc 304 316 THP splits 0 0 THP fault fallback 793 778 THP collapse fail 11 16 Compaction stalls 1013 1007 Compaction success 92 67 Compaction failures 920 939 Page migrate success 238457 721374 Page migrate failure 23021 23469 Compaction pages isolated 504695 1479924 Compaction migrate scanned 661390 8812554 Compaction free scanned 13476658 84327916 Compaction cost 262 838 After this patch we see improvements in allocation success rate (especially for phase 3) along with increased compaction activity. The compaction stalls (direct compaction) in the interfering kernel builds (probably THP's) also decreased somewhat thanks to kcompactd activity, yet THP alloc successes improved a bit. Note that elapsed and user time isn't so useful for this benchmark, because of the background interference being unpredictable. It's just to quickly spot some major unexpected differences. System time is somewhat more useful and that didn't increase. Also (after adjusting mmtests' ftrace monitor): Time kswapd awake 2547781 2269241 Time kcompactd awake 0 119253 Time direct compacting 939937 557649 Time kswapd compacting 0 0 Time kcompactd compacting 0 119099 The decrease of overal time spent compacting appears to not match the increased compaction stats. I suspect the tasks get rescheduled and since the ftrace monitor doesn't see that, the reported time is wall time, not CPU time. But arguably direct compactors care about overall latency anyway, whether busy compacting or waiting for CPU doesn't matter. And that latency seems to almost halved. It's also interesting how much time kswapd spent awake just going through all the priorities and failing to even try compacting, over and over. 
We can also configure stress-highalloc to perform both direct reclaim/compaction and wakeup kswapd/kcompactd, by using GFP_KERNEL|__GFP_HIGH|__GFP_COMP: stress-highalloc 4.5-rc1+before 4.5-rc1+after -direct -direct Success 1 Min 4.00 ( 0.00%) 9.00 (-50.00%) Success 1 Mean 8.00 ( 0.00%) 10.00 (-19.05%) Success 1 Max 12.00 ( 0.00%) 11.00 ( 15.38%) Success 2 Min 4.00 ( 0.00%) 9.00 (-50.00%) Success 2 Mean 8.20 ( 0.00%) 10.00 (-16.28%) Success 2 Max 13.00 ( 0.00%) 11.00 ( 8.33%) Success 3 Min 75.00 ( 0.00%) 74.00 ( 1.33%) Success 3 Mean 75.60 ( 0.00%) 75.20 ( 0.53%) Success 3 Max 77.00 ( 0.00%) 76.00 ( 0.00%) User 3344.73 3246.04 System 1194.24 1172.29 Elapsed 1838.04 1836.76 4.5-rc1+before 4.5-rc1+after -direct -direct Direct pages scanned 125146 120966 Kswapd pages scanned 2119757 2135012 Kswapd pages reclaimed 2073183 2108388 Direct pages reclaimed 124909 120577 Percentage direct scans 5% 5% THP fault alloc 599 652 THP collapse alloc 323 354 THP splits 0 0 THP fault fallback 806 793 THP collapse fail 17 16 Compaction stalls 2457 2025 Compaction success 906 518 Compaction failures 1551 1507 Page migrate success 2031423 2360608 Page migrate failure 32845 40852 Compaction pages isolated 4129761 4802025 Compaction migrate scanned 11996712 21750613 Compaction free scanned 214970969 344372001 Compaction cost 2271 2694 In this scenario, this patch doesn't change the overall success rate as direct compaction already tries all it can. There's however significant reduction in direct compaction stalls (that is, the number of allocations that went into direct compaction). The number of successes (i.e. direct compaction stalls that ended up with successful allocation) is reduced by the same number. This means the offload to kcompactd is working as expected, and direct compaction is reduced either due to detecting contention, or compaction deferred by kcompactd. In the previous version of this patchset there was some apparent reduction of success rate, but the changes in this version (such as using sync compaction only), new baseline kernel, and/or averaging results from 5 executions (my bet), made this go away. Ftrace-based stats seem to roughly agree: Time kswapd awake 2532984 2326824 Time kcompactd awake 0 257916 Time direct compacting 864839 735130 Time kswapd compacting 0 0 Time kcompactd compacting 0 257585 Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 10 ++-- mm/internal.h | 1 + mm/vmscan.c | 147 ++++++++++++++++++-------------------------------------- 3 files changed, 54 insertions(+), 104 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 5b2bfba..ccf97b0 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1191,11 +1191,11 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, /* * Mark that the PG_migrate_skip information should be cleared - * by kswapd when it goes to sleep. kswapd does not set the + * by kswapd when it goes to sleep. kcompactd does not set the * flag itself as the decision to be clear should be directly * based on an allocation request. 
*/ - if (!current_is_kswapd()) + if (cc->direct_compaction) zone->compact_blockskip_flush = true; return COMPACT_COMPLETE; @@ -1338,10 +1338,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) /* * Clear pageblock skip if there were failures recently and compaction - * is about to be retried after being deferred. kswapd does not do - * this reset as it'll reset the cached information when going to sleep. + * is about to be retried after being deferred. */ - if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + if (compaction_restarting(zone, cc->order)) __reset_isolation_suitable(zone); /* @@ -1477,6 +1476,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order, .mode = mode, .alloc_flags = alloc_flags, .classzone_idx = classzone_idx, + .direct_compaction = true, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/internal.h b/mm/internal.h index b95952c..4042a8a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -172,6 +172,7 @@ struct compact_control { unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool direct_compaction; /* False from kcompactd or /proc/... */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ const int alloc_flags; /* alloc flags of a direct compactor */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 5dcc711..f87cfaa 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2968,18 +2968,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, - unsigned long balance_gap, int classzone_idx) +static bool zone_balanced(struct zone *zone, int order, bool highorder, + unsigned long balance_gap, int classzone_idx) { - if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + - balance_gap, classzone_idx)) - return false; + unsigned long mark = high_wmark_pages(zone) + balance_gap; - if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, - order, 0, classzone_idx) == COMPACT_SKIPPED) - return false; + /* + * When checking from pgdat_balanced(), kswapd should stop and sleep + * when it reaches the high order-0 watermark and let kcompactd take + * over. Other callers such as wakeup_kswapd() want to determine the + * true high-order watermark. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) { + mark += (1UL << order); + order = 0; + } - return true; + return zone_watermark_ok_safe(zone, order, mark, classzone_idx); } /* @@ -3029,7 +3034,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) continue; } - if (zone_balanced(zone, order, 0, i)) + if (zone_balanced(zone, order, false, 0, i)) balanced_pages += zone->managed_pages; else if (!order) return false; @@ -3083,10 +3088,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, */ static bool kswapd_shrink_zone(struct zone *zone, int classzone_idx, - struct scan_control *sc, - unsigned long *nr_attempted) + struct scan_control *sc) { - int testorder = sc->order; unsigned long balance_gap; bool lowmem_pressure; @@ -3094,17 +3097,6 @@ static bool kswapd_shrink_zone(struct zone *zone, sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); /* - * Kswapd reclaims only single pages with compaction enabled. 
Trying - * too hard to reclaim until contiguous free pages have become - * available can hurt performance by evicting too much useful data - * from memory. Do not reclaim more than needed for compaction. - */ - if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && - compaction_suitable(zone, sc->order, 0, classzone_idx) - != COMPACT_SKIPPED) - testorder = 0; - - /* * We put equal pressure on every zone, unless one zone has way too * many pages free already. The "too many pages" is defined as the * high wmark plus a "gap" where the gap is either the low @@ -3118,15 +3110,12 @@ static bool kswapd_shrink_zone(struct zone *zone, * reclaim is necessary */ lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); - if (!lowmem_pressure && zone_balanced(zone, testorder, + if (!lowmem_pressure && zone_balanced(zone, sc->order, false, balance_gap, classzone_idx)) return true; shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); - /* Account for the number of pages attempted to reclaim */ - *nr_attempted += sc->nr_to_reclaim; - clear_bit(ZONE_WRITEBACK, &zone->flags); /* @@ -3136,7 +3125,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * waits. */ if (zone_reclaimable(zone) && - zone_balanced(zone, testorder, 0, classzone_idx)) { + zone_balanced(zone, sc->order, false, 0, classzone_idx)) { clear_bit(ZONE_CONGESTED, &zone->flags); clear_bit(ZONE_DIRTY, &zone->flags); } @@ -3148,7 +3137,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * - * Returns the final order kswapd was reclaiming at + * Returns the highest zone idx kswapd was reclaiming at * * There is special handling here for zones which are full of pinned pages. * This can happen if the pages are all mlocked, or if they are all used by @@ -3165,8 +3154,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * interoperates with the page allocator fallback scheme to ensure that aging * of pages is balanced across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order, - int *classzone_idx) +static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) { int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ @@ -3183,9 +3171,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, count_vm_event(PAGEOUTRUN); do { - unsigned long nr_attempted = 0; bool raise_priority = true; - bool pgdat_needs_compaction = (order > 0); sc.nr_reclaimed = 0; @@ -3220,7 +3206,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, break; } - if (!zone_balanced(zone, order, 0, 0)) { + if (!zone_balanced(zone, order, false, 0, 0)) { end_zone = i; break; } else { @@ -3236,24 +3222,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (i < 0) goto out; - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - /* - * If any zone is currently balanced then kswapd will - * not call compaction as it is expected that the - * necessary pages are already available. - */ - if (pgdat_needs_compaction && - zone_watermark_ok(zone, order, - low_wmark_pages(zone), - *classzone_idx, 0)) - pgdat_needs_compaction = false; - } - /* * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. @@ -3297,8 +3265,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * that that high watermark would be met at 100% * efficiency. 
*/ - if (kswapd_shrink_zone(zone, end_zone, - &sc, &nr_attempted)) + if (kswapd_shrink_zone(zone, end_zone, &sc)) raise_priority = false; } @@ -3311,49 +3278,29 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, pfmemalloc_watermark_ok(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); - /* - * Fragmentation may mean that the system cannot be rebalanced - * for high-order allocations in all zones. If twice the - * allocation size has been reclaimed and the zones are still - * not balanced then recheck the watermarks at order-0 to - * prevent kswapd reclaiming excessively. Assume that a - * process requested a high-order can direct reclaim/compact. - */ - if (order && sc.nr_reclaimed >= 2UL << order) - order = sc.order = 0; - /* Check if kswapd should be suspending */ if (try_to_freeze() || kthread_should_stop()) break; /* - * Compact if necessary and kswapd is reclaiming at least the - * high watermark number of pages as requsted - */ - if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) - compact_pgdat(pgdat, order); - - /* * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ if (raise_priority || !sc.nr_reclaimed) sc.priority--; } while (sc.priority >= 1 && - !pgdat_balanced(pgdat, order, *classzone_idx)); + !pgdat_balanced(pgdat, order, classzone_idx)); out: /* - * Return the order we were reclaiming at so prepare_kswapd_sleep() - * makes a decision on the order we were last reclaiming at. However, - * if another caller entered the allocator slow path while kswapd - * was awake, order will remain at the higher level + * Return the highest zone idx we were reclaiming at so + * prepare_kswapd_sleep() makes the same decisions as here. */ - *classzone_idx = end_zone; - return order; + return end_zone; } -static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, + int classzone_idx, int balanced_classzone_idx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3364,7 +3311,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* Try to sleep for a short interval */ - if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, + balanced_classzone_idx)) { remaining = schedule_timeout(HZ/10); finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); @@ -3374,7 +3322,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ - if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, + balanced_classzone_idx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -3395,6 +3344,12 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) */ reset_isolation_suitable(pgdat); + /* + * We have freed the memory, now we should compact it to make + * allocation of the requested order possible. 
+ */ + wakeup_kcompactd(pgdat, order, classzone_idx); + if (!kthread_should_stop()) schedule(); @@ -3424,7 +3379,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) static int kswapd(void *p) { unsigned long order, new_order; - unsigned balanced_order; int classzone_idx, new_classzone_idx; int balanced_classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; @@ -3457,23 +3411,19 @@ static int kswapd(void *p) set_freezable(); order = new_order = 0; - balanced_order = 0; classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; balanced_classzone_idx = classzone_idx; for ( ; ; ) { bool ret; /* - * If the last balance_pgdat was unsuccessful it's unlikely a - * new request of a similar or harder type will succeed soon - * so consider going to sleep on the basis we reclaimed at + * While we were reclaiming, there might have been another + * wakeup, so check the values. */ - if (balanced_order == new_order) { - new_order = pgdat->kswapd_max_order; - new_classzone_idx = pgdat->classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = pgdat->nr_zones - 1; - } + new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = pgdat->nr_zones - 1; if (order < new_order || classzone_idx > new_classzone_idx) { /* @@ -3483,7 +3433,7 @@ static int kswapd(void *p) order = new_order; classzone_idx = new_classzone_idx; } else { - kswapd_try_to_sleep(pgdat, balanced_order, + kswapd_try_to_sleep(pgdat, order, classzone_idx, balanced_classzone_idx); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; @@ -3503,9 +3453,8 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balanced_classzone_idx = classzone_idx; - balanced_order = balance_pgdat(pgdat, order, - &balanced_classzone_idx); + balanced_classzone_idx = balance_pgdat(pgdat, order, + classzone_idx); } } @@ -3535,7 +3484,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) } if (!waitqueue_active(&pgdat->kswapd_wait)) return; - if (zone_balanced(zone, order, 0, 0)) + if (zone_balanced(zone, order, true, 0, 0)) return; trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); -- cgit v1.1 From b313aeee25098bf361c808fe45ba2d3af398ec46 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:18:27 -0700 Subject: mm: memcontrol: enable kmem accounting for all cgroups in the legacy hierarchy Workingset code was recently made memcg aware, but shadow node shrinker is still global. As a result, one small cgroup can consume all memory available for shadow nodes, possibly hurting other cgroups by reclaiming their shadow nodes, even though reclaim distances stored in its shadow nodes have no effect. To avoid this, we need to make shadow node shrinker memcg aware. The actual work is done in patch 6 of the series. Patches 1 and 2 prepare memcg/shrinker infrastructure for the change. Patch 3 is just a collateral cleanup. Patch 4 makes radix_tree_node accounted, which is necessary for making shadow node shrinker memcg aware. Patch 5 reduces shadow nodes overhead in case workload mostly uses anonymous pages. This patch: Currently, in the legacy hierarchy kmem accounting is off for all cgroups by default and must be enabled explicitly by writing something to memory.kmem.limit_in_bytes. 
Since we don't support reclaim on hitting kmem limit, nor do we have any plans to implement it, this is likely to be -1, just to enable kmem accounting and limit kernel memory consumption by the memory.limit_in_bytes along with user memory. This user API was introduced when the implementation of kmem accounting lacked slab shrinker support and hence was useless in practice. Things have changed since then - slab shrinkers were made memcg aware, the accounting overhead seems to be negligible, and a failure to charge a kmem allocation should not have critical consequences, because we only account those kernel objects that should be safe to fail. That's why kmem accounting is enabled by default for all cgroups in the default hierarchy, which will eventually replace the legacy one. The ability to enable kmem accounting for some cgroups while keeping it disabled for others is getting difficult to maintain. E.g. to make shadow node shrinker memcg aware (see mm/workingset.c), we need to know the relationship between the number of shadow nodes allocated for a cgroup and the size of its lru list. If kmem accounting is enabled for all cgroups there is no problem, but what should we do if kmem accounting is enabled only for half of cgroups? We've no other choice but use global lru stats while scanning root cgroup's shadow nodes, but that would be wrong if kmem accounting was enabled for all cgroups (which is the case if the unified hierarchy is used), in which case we should use lru stats of the root cgroup's lruvec. That being said, let's enable kmem accounting for all memory cgroups by default. If one finds it unstable or too costly, it can always be disabled system-wide by passing cgroup.memory=nokmem to the kernel at boot time. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 41 +++++------------------------------------ 1 file changed, 5 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4b7dda7..28d1b1e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2824,6 +2824,9 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) { int memcg_id; + if (cgroup_memory_nokmem) + return 0; + BUG_ON(memcg->kmemcg_id >= 0); BUG_ON(memcg->kmem_state); @@ -2844,24 +2847,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) return 0; } -static int memcg_propagate_kmem(struct mem_cgroup *parent, - struct mem_cgroup *memcg) -{ - int ret = 0; - - mutex_lock(&memcg_limit_mutex); - /* - * If the parent cgroup is not kmem-online now, it cannot be - * onlined after this point, because it has at least one child - * already. 
- */ - if (memcg_kmem_online(parent) || - (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem)) - ret = memcg_online_kmem(memcg); - mutex_unlock(&memcg_limit_mutex); - return ret; -} - static void memcg_offline_kmem(struct mem_cgroup *memcg) { struct cgroup_subsys_state *css; @@ -2920,10 +2905,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } } #else -static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg) -{ - return 0; -} static int memcg_online_kmem(struct mem_cgroup *memcg) { return 0; @@ -2939,22 +2920,10 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { - int ret = 0; + int ret; mutex_lock(&memcg_limit_mutex); - /* Top-level cgroup doesn't propagate from root */ - if (!memcg_kmem_online(memcg)) { - if (cgroup_is_populated(memcg->css.cgroup) || - (memcg->use_hierarchy && memcg_has_children(memcg))) - ret = -EBUSY; - if (ret) - goto out; - ret = memcg_online_kmem(memcg); - if (ret) - goto out; - } ret = page_counter_limit(&memcg->kmem, limit); -out: mutex_unlock(&memcg_limit_mutex); return ret; } @@ -4205,7 +4174,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; } - error = memcg_propagate_kmem(parent, memcg); + error = memcg_online_kmem(memcg); if (error) goto fail; -- cgit v1.1 From 0fc9f58a90a5012942abf0ae98cfc852afebc0a6 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:18:30 -0700 Subject: mm: vmscan: pass root_mem_cgroup instead of NULL to memcg aware shrinker It's just convenient to implement a memcg aware shrinker when you know that shrink_control->memcg != NULL unless memcg_kmem_enabled() returns false. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index f87cfaa..b41b82d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -382,9 +382,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * * @memcg specifies the memory cgroup to target. If it is not NULL, * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan - * objects from the memory cgroup specified. Otherwise all shrinkers - * are called, and memcg aware shrinkers are supposed to scan the - * global list then. + * objects from the memory cgroup specified. Otherwise, only unaware + * shrinkers are called. * * @nr_scanned and @nr_eligible form a ratio that indicate how much of * the available objects should be scanned. Page reclaim for example @@ -404,7 +403,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct shrinker *shrinker; unsigned long freed = 0; - if (memcg && !memcg_kmem_online(memcg)) + if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) return 0; if (nr_scanned == 0) @@ -428,7 +427,13 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, .memcg = memcg, }; - if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) + /* + * If kernel memory accounting is disabled, we ignore + * SHRINKER_MEMCG_AWARE flag and call all shrinkers + * passing NULL for memcg. 
+ */ + if (memcg_kmem_enabled() && + !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE)) continue; if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) -- cgit v1.1 From b6ecd2dea4435a771a99c497a6ac5df6d3618c5a Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:18:33 -0700 Subject: mm: memcontrol: zap memcg_kmem_online helper As kmem accounting is now either enabled for all cgroups or disabled system-wide, there's no point in having memcg_kmem_online() helper - instead one can use memcg_kmem_enabled() and mem_cgroup_online(), as shrink_slab() now does. There are only two places left where this helper is used - __memcg_kmem_charge() and memcg_create_kmem_cache(). The former can only be called if memcg_kmem_enabled() returned true. Since the cgroup it operates on is online, mem_cgroup_is_root() check will be enough. memcg_create_kmem_cache() can't use mem_cgroup_online() helper instead of memcg_kmem_online(), because it relies on the fact that in memcg_offline_kmem() memcg->kmem_state is changed before memcg_deactivate_kmem_caches() is called, but there we can just open-code the check. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- mm/slab_common.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 28d1b1e..341bf86 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2346,7 +2346,7 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) int ret = 0; memcg = get_mem_cgroup_from_mm(current->mm); - if (memcg_kmem_online(memcg)) + if (!mem_cgroup_is_root(memcg)) ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); return ret; diff --git a/mm/slab_common.c b/mm/slab_common.c index 6afb226..8addc3c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -510,7 +510,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, * The memory cgroup could have been offlined while the cache * creation work was pending. */ - if (!memcg_kmem_online(memcg)) + if (memcg->kmem_state != KMEM_ONLINE) goto out_unlock; idx = memcg_cache_id(memcg); -- cgit v1.1 From cdcbb72ebfec52373e57092eccaadd5a6e261c3e Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:18:39 -0700 Subject: mm: workingset: size shadow nodes lru basing on file cache size A page is activated on refault if the refault distance stored in the corresponding shadow entry is less than the number of active file pages. Since active file pages can't occupy more than half memory, we assume that the maximal effective refault distance can't be greater than half the number of present pages and size the shadow nodes lru list appropriately. Generally speaking, this assumption is correct, but it can result in wasting a considerable chunk of memory on stale shadow nodes in case the portion of file pages is small, e.g. if a workload mostly uses anonymous memory. To sort this out, we need to compute the size of shadow nodes lru basing not on the maximal possible, but the current size of file cache. We could take the size of active file lru for the maximal refault distance, but active lru is pretty unstable - it can shrink dramatically at runtime possibly disrupting workingset detection logic. Instead we assume that the maximal refault distance equals half the total number of file cache pages. 
This will protect us against active file lru size fluctuations while still being correct, because size of active lru is normally maintained lower than size of inactive lru. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/workingset.c b/mm/workingset.c index 6130ba0b..68e8cd9 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -349,7 +349,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); local_irq_enable(); - pages = node_present_pages(sc->nid); + pages = node_page_state(sc->nid, NR_ACTIVE_FILE) + + node_page_state(sc->nid, NR_INACTIVE_FILE); + /* * Active cache pages are limited to 50% of memory, and shadow * entries that represent a refault distance bigger than that -- cgit v1.1 From 0a6b76dd23fa08c5fd7b68acdb55018a37afd4aa Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:18:42 -0700 Subject: mm: workingset: make shadow node shrinker memcg aware Workingset code was recently made memcg aware, but shadow node shrinker is still global. As a result, one small cgroup can consume all memory available for shadow nodes, possibly hurting other cgroups by reclaiming their shadow nodes, even though reclaim distances stored in its shadow nodes have no effect. To avoid this, we need to make shadow node shrinker memcg aware. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 ++--- mm/workingset.c | 10 +++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 341bf86..ae8b81c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -638,9 +638,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->nr_page_events, nr_pages); } -static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, - unsigned int lru_mask) +unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask) { unsigned long nr = 0; int zid; diff --git a/mm/workingset.c b/mm/workingset.c index 68e8cd9..8a75f8d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -349,8 +349,12 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); local_irq_enable(); - pages = node_page_state(sc->nid, NR_ACTIVE_FILE) + - node_page_state(sc->nid, NR_INACTIVE_FILE); + if (memcg_kmem_enabled()) + pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL_FILE); + else + pages = node_page_state(sc->nid, NR_ACTIVE_FILE) + + node_page_state(sc->nid, NR_INACTIVE_FILE); /* * Active cache pages are limited to 50% of memory, and shadow @@ -460,7 +464,7 @@ static struct shrinker workingset_shadow_shrinker = { .count_objects = count_shadow_nodes, .scan_objects = scan_shadow_nodes, .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE, + .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, }; /* -- cgit v1.1 From f9719a03de51e13526d614e79d002f838770b2d6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:18:45 -0700 Subject: thp, vmstats: count deferred split events Count how many times we put a THP in split queue. 
Currently, it happens on partial unmap of a THP. Rapidly growing value can indicate that an application behaves unfriendly wrt THP: often fault in huge page and then unmap part of it. This leads to unnecessary memory fragmentation and the application may require tuning. The event also can help with debugging kernel [mis-]behaviour. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 1 + mm/vmstat.c | 1 + 2 files changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1ea21e2..1dddfb2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3455,6 +3455,7 @@ void deferred_split_huge_page(struct page *page) spin_lock_irqsave(&pgdata->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { + count_vm_event(THP_DEFERRED_SPLIT_PAGE); list_add_tail(page_deferred_list(page), &pgdata->split_queue); pgdata->split_queue_len++; } diff --git a/mm/vmstat.c b/mm/vmstat.c index f800662..5e43004 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -848,6 +848,7 @@ const char * const vmstat_text[] = { "thp_collapse_alloc_failed", "thp_split_page", "thp_split_page_failed", + "thp_deferred_split_page", "thp_split_pmd", "thp_zero_page_alloc", "thp_zero_page_alloc_failed", -- cgit v1.1 From ea606cf5d8df370e7932460dfd960b21f20e7c6d Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 17 Mar 2016 14:18:48 -0700 Subject: mm: move max_map_count bits into mm.h max_map_count sysctl unrelated to scheduler. Move its bits from include/linux/sched/sysctl.h to include/linux/mm.h. Signed-off-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 1 - mm/mremap.c | 1 - mm/nommu.c | 1 - 3 files changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 90e3b86..676f422 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/mremap.c b/mm/mremap.c index 8eeba02..e30c8a6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/mm/nommu.c b/mm/nommu.c index fbf6f0f1..9bdf8b1 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include -- cgit v1.1 From 39a1aa8e194ab67983de3b9d0b204ccee12e689a Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 17 Mar 2016 14:18:50 -0700 Subject: mm: deduplicate memory overcommitment code Currently we have two copies of the same code which implements memory overcommitment logic. Let's move it into mm/util.c and hence avoid duplication. No functional changes here. 
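As a quick aid for following the three identical copies being collapsed into mm/util.c below, here is a minimal userspace sketch of the OVERCOMMIT_GUESS estimate that __vm_enough_memory() performs; every page count in it is invented for illustration, and the admin/user reserve adjustments are omitted.

    /*
     * Illustration only: mirrors the OVERCOMMIT_GUESS arithmetic of
     * __vm_enough_memory(), with made-up page counts and without the
     * admin/user reserve handling.
     */
    #include <stdio.h>

    int main(void)
    {
            long free_pages    = 100000;  /* NR_FREE_PAGES */
            long file_pages    = 250000;  /* NR_FILE_PAGES */
            long shmem_pages   =  20000;  /* NR_SHMEM: only swappable, not droppable */
            long swap_pages    =  50000;  /* free swap */
            long slab_reclaim  =  15000;  /* NR_SLAB_RECLAIMABLE */
            long reserve_pages =  10000;  /* totalreserve_pages */
            long request       = 300000;  /* pages needed by the new mapping */
            long estimate;

            estimate = free_pages + file_pages - shmem_pages
                     + swap_pages + slab_reclaim;
            if (estimate <= reserve_pages) {
                    puts("would fail with -ENOMEM");
                    return 0;
            }
            estimate -= reserve_pages;

            printf("estimate=%ld request=%ld -> %s\n", estimate, request,
                   estimate > request ? "allowed" : "would fail with -ENOMEM");
            return 0;
    }
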
Signed-off-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 124 ------------------------------------------------------------- mm/nommu.c | 116 --------------------------------------------------------- mm/util.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+), 240 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 676f422..1464192 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -122,130 +122,6 @@ void vma_set_page_prot(struct vm_area_struct *vma) } } - -int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ -int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ -unsigned long sysctl_overcommit_kbytes __read_mostly; -int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; -unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ -unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -/* - * Make sure vm_committed_as in one cacheline and not cacheline shared with - * other variables. It can be updated by several CPUs frequently. - */ -struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; - -/* - * The global memory commitment made in the system can be a metric - * that can be used to drive ballooning decisions when Linux is hosted - * as a guest. On Hyper-V, the host implements a policy engine for dynamically - * balancing memory across competing virtual machines that are hosted. - * Several metrics drive this policy engine including the guest reported - * memory commitment. - */ -unsigned long vm_memory_committed(void) -{ - return percpu_counter_read_positive(&vm_committed_as); -} -EXPORT_SYMBOL_GPL(vm_memory_committed); - -/* - * Check that a process has enough memory to allocate a new virtual - * mapping. 0 means there is enough memory for the allocation to - * succeed and -ENOMEM implies there is not. - * - * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting - * - * Strict overcommit modes added 2002 Feb 26 by Alan Cox. - * Additional code 2002 Jul 20 by Robert Love. - * - * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. - * - * Note this is a helper function intended to be used by LSMs which - * wish to use this logic. - */ -int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) -{ - long free, allowed, reserve; - - VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < - -(s64)vm_committed_as_batch * num_online_cpus(), - "memory commitment underflow"); - - vm_acct_memory(pages); - - /* - * Sometimes we want to use more memory than we have - */ - if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) - return 0; - - if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_page_state(NR_FREE_PAGES); - free += global_page_state(NR_FILE_PAGES); - - /* - * shmem pages shouldn't be counted as free in this - * case, they can't be purged, only swapped out, and - * that won't affect the overall amount of available - * memory in the system. - */ - free -= global_page_state(NR_SHMEM); - - free += get_nr_swap_pages(); - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += global_page_state(NR_SLAB_RECLAIMABLE); - - /* - * Leave reserved pages. 
The pages are not for anonymous pages. - */ - if (free <= totalreserve_pages) - goto error; - else - free -= totalreserve_pages; - - /* - * Reserve some for root - */ - if (!cap_sys_admin) - free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - if (free > pages) - return 0; - - goto error; - } - - allowed = vm_commit_limit(); - /* - * Reserve some for root - */ - if (!cap_sys_admin) - allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - /* - * Don't let a single process grow so big a user can't recover - */ - if (mm) { - reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); - allowed -= min_t(long, mm->total_vm / 32, reserve); - } - - if (percpu_counter_read_positive(&vm_committed_as) < allowed) - return 0; -error: - vm_unacct_memory(pages); - - return -ENOMEM; -} - /* * Requires inode->i_mapping->i_mmap_rwsem */ diff --git a/mm/nommu.c b/mm/nommu.c index 9bdf8b1..6402f27 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -47,33 +47,11 @@ struct page *mem_map; unsigned long max_mapnr; EXPORT_SYMBOL(max_mapnr); unsigned long highest_memmap_pfn; -struct percpu_counter vm_committed_as; -int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ -int sysctl_overcommit_ratio = 50; /* default is 50% */ -unsigned long sysctl_overcommit_kbytes __read_mostly; -int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; -unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ -unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; -/* - * The global memory commitment made in the system can be a metric - * that can be used to drive ballooning decisions when Linux is hosted - * as a guest. On Hyper-V, the host implements a policy engine for dynamically - * balancing memory across competing virtual machines that are hosted. - * Several metrics drive this policy engine including the guest reported - * memory commitment. - */ -unsigned long vm_memory_committed(void) -{ - return percpu_counter_read_positive(&vm_committed_as); -} - -EXPORT_SYMBOL_GPL(vm_memory_committed); - EXPORT_SYMBOL(mem_map); /* list of mapped, potentially shareable regions */ @@ -1828,100 +1806,6 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/* - * Check that a process has enough memory to allocate a new virtual - * mapping. 0 means there is enough memory for the allocation to - * succeed and -ENOMEM implies there is not. - * - * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting - * - * Strict overcommit modes added 2002 Feb 26 by Alan Cox. - * Additional code 2002 Jul 20 by Robert Love. - * - * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. - * - * Note this is a helper function intended to be used by LSMs which - * wish to use this logic. 
- */ -int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) -{ - long free, allowed, reserve; - - vm_acct_memory(pages); - - /* - * Sometimes we want to use more memory than we have - */ - if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) - return 0; - - if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_page_state(NR_FREE_PAGES); - free += global_page_state(NR_FILE_PAGES); - - /* - * shmem pages shouldn't be counted as free in this - * case, they can't be purged, only swapped out, and - * that won't affect the overall amount of available - * memory in the system. - */ - free -= global_page_state(NR_SHMEM); - - free += get_nr_swap_pages(); - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += global_page_state(NR_SLAB_RECLAIMABLE); - - /* - * Leave reserved pages. The pages are not for anonymous pages. - */ - if (free <= totalreserve_pages) - goto error; - else - free -= totalreserve_pages; - - /* - * Reserve some for root - */ - if (!cap_sys_admin) - free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - if (free > pages) - return 0; - - goto error; - } - - allowed = vm_commit_limit(); - /* - * Reserve some 3% for root - */ - if (!cap_sys_admin) - allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - /* - * Don't let a single process grow so big a user can't recover - */ - if (mm) { - reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); - allowed -= min_t(long, mm->total_vm / 32, reserve); - } - - if (percpu_counter_read_positive(&vm_committed_as) < allowed) - return 0; - -error: - vm_unacct_memory(pages); - - return -ENOMEM; -} - int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { BUG(); diff --git a/mm/util.c b/mm/util.c index 4fb14ca..47a57e5 100644 --- a/mm/util.c +++ b/mm/util.c @@ -396,6 +396,13 @@ int __page_mapcount(struct page *page) } EXPORT_SYMBOL_GPL(__page_mapcount); +int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; +int sysctl_overcommit_ratio __read_mostly = 50; +unsigned long sysctl_overcommit_kbytes __read_mostly; +int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ + int overcommit_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -437,6 +444,123 @@ unsigned long vm_commit_limit(void) return allowed; } +/* + * Make sure vm_committed_as in one cacheline and not cacheline shared with + * other variables. It can be updated by several CPUs frequently. + */ +struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; + +/* + * The global memory commitment made in the system can be a metric + * that can be used to drive ballooning decisions when Linux is hosted + * as a guest. On Hyper-V, the host implements a policy engine for dynamically + * balancing memory across competing virtual machines that are hosted. + * Several metrics drive this policy engine including the guest reported + * memory commitment. + */ +unsigned long vm_memory_committed(void) +{ + return percpu_counter_read_positive(&vm_committed_as); +} +EXPORT_SYMBOL_GPL(vm_memory_committed); + +/* + * Check that a process has enough memory to allocate a new virtual + * mapping. 
0 means there is enough memory for the allocation to + * succeed and -ENOMEM implies there is not. + * + * We currently support three overcommit policies, which are set via the + * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting + * + * Strict overcommit modes added 2002 Feb 26 by Alan Cox. + * Additional code 2002 Jul 20 by Robert Love. + * + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. + * + * Note this is a helper function intended to be used by LSMs which + * wish to use this logic. + */ +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) +{ + long free, allowed, reserve; + + VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < + -(s64)vm_committed_as_batch * num_online_cpus(), + "memory commitment underflow"); + + vm_acct_memory(pages); + + /* + * Sometimes we want to use more memory than we have + */ + if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) + return 0; + + if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { + free = global_page_state(NR_FREE_PAGES); + free += global_page_state(NR_FILE_PAGES); + + /* + * shmem pages shouldn't be counted as free in this + * case, they can't be purged, only swapped out, and + * that won't affect the overall amount of available + * memory in the system. + */ + free -= global_page_state(NR_SHMEM); + + free += get_nr_swap_pages(); + + /* + * Any slabs which are created with the + * SLAB_RECLAIM_ACCOUNT flag claim to have contents + * which are reclaimable, under pressure. The dentry + * cache and most inode caches should fall into this + */ + free += global_page_state(NR_SLAB_RECLAIMABLE); + + /* + * Leave reserved pages. The pages are not for anonymous pages. + */ + if (free <= totalreserve_pages) + goto error; + else + free -= totalreserve_pages; + + /* + * Reserve some for root + */ + if (!cap_sys_admin) + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + + if (free > pages) + return 0; + + goto error; + } + + allowed = vm_commit_limit(); + /* + * Reserve some for root + */ + if (!cap_sys_admin) + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min_t(long, mm->total_vm / 32, reserve); + } + + if (percpu_counter_read_positive(&vm_committed_as) < allowed) + return 0; +error: + vm_unacct_memory(pages); + + return -ENOMEM; +} + /** * get_cmdline() - copy the cmdline value to a buffer. * @task: the task whose cmdline value to copy. -- cgit v1.1 From 458aa76d132dc1c3c60be0f0db99bcc0ce1767fc Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 17 Mar 2016 14:18:56 -0700 Subject: mm/thp/migration: switch from flush_tlb_range to flush_pmd_tlb_range We remove one instace of flush_tlb_range here. That was added by commit f714f4f20e59 ("mm: numa: call MMU notifiers on THP migration"). But the pmdp_huge_clear_flush_notify should have done the require flush for us. Hence remove the extra flush. Signed-off-by: Aneesh Kumar K.V Cc: Mel Gorman Cc: "Kirill A. 
Shutemov" Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 8 +++++--- mm/pgtable-generic.c | 14 -------------- 2 files changed, 5 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 568284e..fdaf081 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1773,7 +1773,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, put_page(new_page); goto out_fail; } - + /* + * We are not sure a pending tlb flush here is for a huge page + * mapping or not. Hence use the tlb range variant + */ if (mm_tlb_flush_pending(mm)) flush_tlb_range(vma, mmun_start, mmun_end); @@ -1829,12 +1832,11 @@ fail_putback: page_add_anon_rmap(new_page, vma, mmun_start, true); pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); - flush_tlb_range(vma, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); if (page_count(page) != 2) { set_pmd_at(mm, mmun_start, pmd, orig_entry); - flush_tlb_range(vma, mmun_start, mmun_end); + flush_pmd_tlb_range(vma, mmun_start, mmun_end); mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); page_remove_rmap(new_page, true); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 06a005b..71c5f91 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -84,20 +84,6 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE - -/* - * ARCHes with special requirements for evicting THP backing TLB entries can - * implement this. Otherwise also, it can help optimize normal TLB flush in - * THP regime. stock flush_tlb_range() typically has optimization to nuke the - * entire TLB if flush span is greater than a threshold, which will - * likely be true for a single huge page. Thus a single thp flush will - * invalidate the entire TLB which is not desirable. - * e.g. see arch/arc: flush_pmd_tlb_range - */ -#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) -#endif - #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, -- cgit v1.1 From 7eb50292d7f74ccb89155960d62b2697f2536b28 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 17 Mar 2016 14:19:02 -0700 Subject: mm/Kconfig: remove redundant arch depend for memory hotplug MEMORY_HOTPLUG already depends on ARCH_ENABLE_MEMORY_HOTPLUG which is selected by the supported architectures, so the following arch depend is unnecessary. Signed-off-by: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 03cbfa0..c077765 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -187,7 +187,6 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG - depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) config MEMORY_HOTPLUG_SPARSE def_bool y -- cgit v1.1 From d02bd27bd33dd7e8d22594cd568b81be0cb584cd Mon Sep 17 00:00:00 2001 From: Igor Redko Date: Thu, 17 Mar 2016 14:19:05 -0700 Subject: mm/page_alloc.c: calculate 'available' memory in a separate function Add a new field, VIRTIO_BALLOON_S_AVAIL, to virtio_balloon memory statistics protocol, corresponding to 'Available' in /proc/meminfo. 
It indicates to the hypervisor how big the balloon can be inflated without pushing the guest system to swap. This metric would be very useful in VM orchestration software to improve memory management of different VMs under overcommit. This patch (of 2): Factor out calculation of the available memory counter into a separate exportable function, in order to be able to use it in other parts of the kernel. In particular, it appears a relevant metric to report to the hypervisor via virtio-balloon statistics interface (in a followup patch). Signed-off-by: Igor Redko Signed-off-by: Denis V. Lunev Reviewed-by: Roman Kagan Cc: Michael S. Tsirkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 25a75da..941b802 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3713,6 +3713,49 @@ static inline void show_node(struct zone *zone) printk("Node %d ", zone_to_nid(zone)); } +long si_mem_available(void) +{ + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; + unsigned long pages[NR_LRU_LISTS]; + struct zone *zone; + int lru; + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_page_state(NR_LRU_BASE + lru); + + for_each_zone(zone) + wmark_low += zone->watermark[WMARK_LOW]; + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping. + */ + available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping. Assume at least half of the page cache, or the + * low watermark worth of cache, needs to stay. + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable slab consists of items that are in use, + * and cannot be freed. Cap this estimate at the low watermark. + */ + available += global_page_state(NR_SLAB_RECLAIMABLE) - + min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + + if (available < 0) + available = 0; + return available; +} +EXPORT_SYMBOL_GPL(si_mem_available); + void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; -- cgit v1.1 From 3ed3a4f0ddffece942bb2661924d87be4ce63cb7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:19:11 -0700 Subject: mm: cleanup *pte_alloc* interfaces There are few things about *pte_alloc*() helpers worth cleaning up: - 'vma' argument is unused, let's drop it; - most __pte_alloc() callers do speculative check for pmd_none(), before taking ptl: let's introduce pte_alloc() macro which does the check. The only direct user of __pte_alloc left is userfaultfd, which has different expectation about atomicity wrt pmd. - pte_alloc_map() and pte_alloc_map_lock() are redefined using pte_alloc(). [sudeep.holla@arm.com: fix build for arm64 hugetlbpage] [sfr@canb.auug.org.au: fix arch/arm/mm/mmu.c some more] Signed-off-by: Kirill A. Shutemov Cc: Dave Hansen Signed-off-by: Sudeep Holla Acked-by: Kirill A. 
Shutemov Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 8 +++----- mm/mremap.c | 3 +-- mm/userfaultfd.c | 3 +-- 3 files changed, 5 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 0e24764..1974fc0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -562,8 +562,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, - pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); @@ -3419,12 +3418,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * Use __pte_alloc instead of pte_alloc_map, because we can't + * Use pte_alloc() instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(pmd_none(*pmd)) && - unlikely(__pte_alloc(mm, vma, pmd, address))) + if (unlikely(pte_alloc(mm, pmd, address))) return VM_FAULT_OOM; /* * If a huge pmd materialized under us just retry later. Use diff --git a/mm/mremap.c b/mm/mremap.c index e30c8a6..3fa0a467 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -213,8 +213,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, continue; VM_BUG_ON(pmd_trans_huge(*old_pmd)); } - if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, - new_pmd, new_addr)) + if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr)) break; next = (new_addr + PMD_SIZE) & PMD_MASK; if (extent > next - new_addr) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 806b0c7..9f3a029 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -230,8 +230,7 @@ retry: break; } if (unlikely(pmd_none(dst_pmdval)) && - unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd, - dst_addr))) { + unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) { err = -ENOMEM; break; } -- cgit v1.1 From 795ae7a0de6b834a0cc202aa55c190ef81496665 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:19:14 -0700 Subject: mm: scale kswapd watermarks in proportion to memory In machines with 140G of memory and enterprise flash storage, we have seen read and write bursts routinely exceed the kswapd watermarks and cause thundering herds in direct reclaim. Unfortunately, the only way to tune kswapd aggressiveness is through adjusting min_free_kbytes - the system's emergency reserves - which is entirely unrelated to the system's latency requirements. In order to get kswapd to maintain a 250M buffer of free memory, the emergency reserves need to be set to 1G. That is a lot of memory wasted for no good reason. On the other hand, it's reasonable to assume that allocation bursts and overall allocation concurrency scale with memory capacity, so it makes sense to make kswapd aggressiveness a function of that as well. Change the kswapd watermark scale factor from the currently fixed 25% of the tunable emergency reserve to a tunable 0.1% of memory. Beyond 1G of memory, this will produce bigger watermark steps than the current formula in default settings. Ensure that the new formula never chooses steps smaller than that, i.e. 25% of the emergency reserve. On a 140G machine, this raises the default watermark steps - the distance between min and low, and low and high - from 16M to 143M. 
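For readers who want to check the 16M and 143M figures, a standalone back-of-the-envelope version of the step calculation from the diff below is shown here; the 140G size comes from the text above, while the 4K page size and the 64M min watermark (the historical min_free_kbytes cap) are assumptions of this sketch.

    /*
     * Back-of-the-envelope check of the kswapd watermark step: the old formula
     * uses 25% of the min watermark, the new one uses watermark_scale_factor
     * (default 10, i.e. 0.1%) of managed memory, keeping the old step as floor.
     */
    #include <stdio.h>

    int main(void)
    {
            unsigned long managed   = 140UL << (30 - 12);   /* 140G worth of 4K pages */
            unsigned long min_wmark =  64UL << (20 - 12);   /* assumed 64M min watermark */
            unsigned long wsf = 10;                         /* default watermark_scale_factor */
            unsigned long old_step = min_wmark >> 2;        /* tmp >> 2 in the old code */
            unsigned long new_step = managed * wsf / 10000; /* mult_frac() in the patch */

            if (new_step < old_step)                        /* never below the old 25% step */
                    new_step = old_step;

            printf("old step: %lu pages (~%lu MB)\n", old_step, old_step >> 8);
            printf("new step: %lu pages (~%lu MB)\n", new_step, new_step >> 8);
            return 0;
    }

With these inputs the program prints roughly 16 MB for the old step and 143 MB for the new one, matching the numbers quoted above.
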
Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Acked-by: Rik van Riel Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 941b802..d156310 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +int watermark_scale_factor = 10; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; @@ -6347,8 +6348,17 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_MIN] = tmp; } - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + /* + * Set the kswapd watermarks distance according to the + * scale factor in proportion to available memory, but + * ensure a minimum size on small systems. + */ + tmp = max_t(u64, tmp >> 2, + mult_frac(zone->managed_pages, + watermark_scale_factor, 10000)); + + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; __mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - @@ -6489,6 +6499,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } +int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write) + setup_per_zone_wmarks(); + + return 0; +} + #ifdef CONFIG_NUMA int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) -- cgit v1.1 From f9054c70d28bc214b2857cf8db8269f4f45a5e23 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 17 Mar 2016 14:19:19 -0700 Subject: mm, mempool: only set __GFP_NOMEMALLOC if there are free elements If an oom killed thread calls mempool_alloc(), it is possible that it'll loop forever if there are no elements on the freelist since __GFP_NOMEMALLOC prevents it from accessing needed memory reserves in oom conditions. Only set __GFP_NOMEMALLOC if there are elements on the freelist. If there are no free elements, allow allocations without the bit set so that memory reserves can be accessed if needed. Additionally, using mempool_alloc() with __GFP_NOMEMALLOC is not supported since the implementation can loop forever without accessing memory reserves when needed. Signed-off-by: David Rientjes Cc: Greg Thelen Cc: Michal Hocko Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mempool.c b/mm/mempool.c index 7924f4f..07c383d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -310,25 +310,36 @@ EXPORT_SYMBOL(mempool_resize); * returns NULL. Note that due to preallocation, this function * *never* fails when called from process contexts. (it might * fail if called from an IRQ context.) - * Note: using __GFP_ZERO is not supported. + * Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported. 
*/ -void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) +void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; wait_queue_t wait; gfp_t gfp_temp; + /* If oom killed, memory reserves are essential to prevent livelock */ + VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC); + /* No element size to zero on allocation */ VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); - gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: + if (likely(pool->curr_nr)) { + /* + * Don't allocate from emergency reserves if there are + * elements available. This check is racy, but it will + * be rechecked each loop. + */ + gfp_temp |= __GFP_NOMEMALLOC; + } element = pool->alloc(gfp_temp, pool->pool_data); if (likely(element != NULL)) @@ -352,11 +363,12 @@ repeat_alloc: * We use gfp mask w/o direct reclaim or IO for the first round. If * alloc failed with that and @pool was empty, retry immediately. */ - if (gfp_temp != gfp_mask) { + if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) { spin_unlock_irqrestore(&pool->lock, flags); gfp_temp = gfp_mask; goto repeat_alloc; } + gfp_temp = gfp_mask; /* We must not sleep if !__GFP_DIRECT_RECLAIM */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { -- cgit v1.1 From 444eb2a449ef36fe115431ed7b71467c4563c7f1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 17 Mar 2016 14:19:23 -0700 Subject: mm: thp: set THP defrag by default to madvise and add a stall-free defrag option THP defrag is enabled by default to direct reclaim/compact but not wake kswapd in the event of a THP allocation failure. The problem is that THP allocation requests potentially enter reclaim/compaction. This potentially incurs a severe stall that is not guaranteed to be offset by reduced TLB misses. While there has been considerable effort to reduce the impact of reclaim/compaction, it is still a high cost and workloads that should fit in memory fail to do so. Specifically, a simple anon/file streaming workload will enter direct reclaim on NUMA at least even though the working set size is 80% of RAM. It's been years and it's time to throw in the towel. First, this patch defines THP defrag as follows; madvise: A failed allocation will direct reclaim/compact if the application requests it never: Neither reclaim/compact nor wake kswapd defer: A failed allocation will wake kswapd/kcompactd always: A failed allocation will direct reclaim/compact (historical behaviour) khugepaged defrag will enter direct/reclaim but not wake kswapd. Next it sets the default defrag option to be "madvise" to only enter direct reclaim/compaction for applications that specifically requested it. Lastly, it removes a check from the page allocator slowpath that is related to __GFP_THISNODE to allow "defer" to work. The callers that really cares are slub/slab and they are updated accordingly. The slab one may be surprising because it also corrects a comment as kswapd was never woken up by that path. This means that a THP fault will no longer stall for most applications by default and the ideal for most users that get THP if they are immediately available. 
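To make the mode table above easier to map onto allocator behaviour, here is a small self-contained sketch of the gfp selection this patch introduces for page-fault THP allocations; the enum, the helper name thp_fault_gfp() and the flag values are placeholders, and the real logic is alloc_hugepage_direct_gfpmask() in the huge_memory.c hunk further below, driven by transparent_hugepage_flags bits rather than an enum.

    /* Placeholder flag values for illustration; not the kernel's gfp bits. */
    #include <stdio.h>

    #define GFP_TRANSHUGE_BASE   0x100u
    #define GFP_DIRECT_RECLAIM   0x001u   /* may stall in reclaim/compaction */
    #define GFP_KSWAPD_RECLAIM   0x002u   /* only wakes kswapd/kcompactd */

    enum defrag_mode { DEFRAG_ALWAYS, DEFRAG_DEFER, DEFRAG_MADVISE, DEFRAG_NEVER };

    static unsigned int thp_fault_gfp(enum defrag_mode mode, int vma_madv_hugepage)
    {
            unsigned int reclaim = 0;

            if (mode == DEFRAG_ALWAYS)                          /* historical behaviour */
                    reclaim = GFP_DIRECT_RECLAIM;
            else if (mode == DEFRAG_DEFER)                      /* hand work to kswapd/kcompactd */
                    reclaim = GFP_KSWAPD_RECLAIM;
            else if (mode == DEFRAG_MADVISE && vma_madv_hugepage)
                    reclaim = GFP_DIRECT_RECLAIM;               /* only if the app asked for THP */
            /* DEFRAG_NEVER: no reclaim bits, fall back to base pages at once */

            return GFP_TRANSHUGE_BASE | reclaim;
    }

    int main(void)
    {
            printf("madvise, plain vma:     %#x\n", thp_fault_gfp(DEFRAG_MADVISE, 0));
            printf("madvise, MADV_HUGEPAGE: %#x\n", thp_fault_gfp(DEFRAG_MADVISE, 1));
            printf("defer:                  %#x\n", thp_fault_gfp(DEFRAG_DEFER, 0));
            return 0;
    }

In other words, with the new "madvise" default only vmas that explicitly requested huge pages can still stall a fault in direct reclaim/compaction; everyone else either gets an immediately available huge page or falls back to base pages.
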
There are still options for users that prefer a stall at startup of a new application by either restoring historical behaviour with "always" or pick a half-way point with "defer" where kswapd does some of the work in the background and wakes kcompactd if necessary. THP defrag for khugepaged remains enabled and will enter direct/reclaim but no wakeup kswapd or kcompactd. After this patch a THP allocation failure will quickly fallback and rely on khugepaged to recover the situation at some time in the future. In some cases, this will reduce THP usage but the benefit of THP is hard to measure and not a universal win where as a stall to reclaim/compaction is definitely measurable and can be painful. The first test for this is using "usemem" to read a large file and write a large anonymous mapping (to avoid the zero page) multiple times. The total size of the mappings is 80% of RAM and the benchmark simply measures how long it takes to complete. It uses multiple threads to see if that is a factor. On UMA, the performance is almost identical so is not reported but on NUMA, we see this usemem 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Amean System-1 102.86 ( 0.00%) 46.81 ( 54.50%) Amean System-4 37.85 ( 0.00%) 34.02 ( 10.12%) Amean System-7 48.12 ( 0.00%) 46.89 ( 2.56%) Amean System-12 51.98 ( 0.00%) 56.96 ( -9.57%) Amean System-21 80.16 ( 0.00%) 79.05 ( 1.39%) Amean System-30 110.71 ( 0.00%) 107.17 ( 3.20%) Amean System-48 127.98 ( 0.00%) 124.83 ( 2.46%) Amean Elapsd-1 185.84 ( 0.00%) 105.51 ( 43.23%) Amean Elapsd-4 26.19 ( 0.00%) 25.58 ( 2.33%) Amean Elapsd-7 21.65 ( 0.00%) 21.62 ( 0.16%) Amean Elapsd-12 18.58 ( 0.00%) 17.94 ( 3.43%) Amean Elapsd-21 17.53 ( 0.00%) 16.60 ( 5.33%) Amean Elapsd-30 17.45 ( 0.00%) 17.13 ( 1.84%) Amean Elapsd-48 15.40 ( 0.00%) 15.27 ( 0.82%) For a single thread, the benchmark completes 43.23% faster with this patch applied with smaller benefits as the thread increases. Similar, notice the large reduction in most cases in system CPU usage. The overall CPU time is 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 User 10357.65 10438.33 System 3988.88 3543.94 Elapsed 2203.01 1634.41 Which is substantial. Now, the reclaim figures 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 Minor Faults 128458477 278352931 Major Faults 2174976 225 Swap Ins 16904701 0 Swap Outs 17359627 0 Allocation stalls 43611 0 DMA allocs 0 0 DMA32 allocs 19832646 19448017 Normal allocs 614488453 580941839 Movable allocs 0 0 Direct pages scanned 24163800 0 Kswapd pages scanned 0 0 Kswapd pages reclaimed 0 0 Direct pages reclaimed 20691346 0 Compaction stalls 42263 0 Compaction success 938 0 Compaction failures 41325 0 This patch eliminates almost all swapping and direct reclaim activity. There is still overhead but it's from NUMA balancing which does not identify that it's pointless trying to do anything with this workload. 
I also tried the thpscale benchmark which forces a corner case where compaction can be used heavily and measures the latency of whether base or huge pages were used thpscale Fault Latencies 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Amean fault-base-1 5288.84 ( 0.00%) 2817.12 ( 46.73%) Amean fault-base-3 6365.53 ( 0.00%) 3499.11 ( 45.03%) Amean fault-base-5 6526.19 ( 0.00%) 4363.06 ( 33.15%) Amean fault-base-7 7142.25 ( 0.00%) 4858.08 ( 31.98%) Amean fault-base-12 13827.64 ( 0.00%) 10292.11 ( 25.57%) Amean fault-base-18 18235.07 ( 0.00%) 13788.84 ( 24.38%) Amean fault-base-24 21597.80 ( 0.00%) 24388.03 (-12.92%) Amean fault-base-30 26754.15 ( 0.00%) 19700.55 ( 26.36%) Amean fault-base-32 26784.94 ( 0.00%) 19513.57 ( 27.15%) Amean fault-huge-1 4223.96 ( 0.00%) 2178.57 ( 48.42%) Amean fault-huge-3 2194.77 ( 0.00%) 2149.74 ( 2.05%) Amean fault-huge-5 2569.60 ( 0.00%) 2346.95 ( 8.66%) Amean fault-huge-7 3612.69 ( 0.00%) 2997.70 ( 17.02%) Amean fault-huge-12 3301.75 ( 0.00%) 6727.02 (-103.74%) Amean fault-huge-18 6696.47 ( 0.00%) 6685.72 ( 0.16%) Amean fault-huge-24 8000.72 ( 0.00%) 9311.43 (-16.38%) Amean fault-huge-30 13305.55 ( 0.00%) 9750.45 ( 26.72%) Amean fault-huge-32 9981.71 ( 0.00%) 10316.06 ( -3.35%) The average time to fault pages is substantially reduced in the majority of caseds but with the obvious caveat that fewer THPs are actually used in this adverse workload 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Percentage huge-1 0.71 ( 0.00%) 14.04 (1865.22%) Percentage huge-3 10.77 ( 0.00%) 33.05 (206.85%) Percentage huge-5 60.39 ( 0.00%) 38.51 (-36.23%) Percentage huge-7 45.97 ( 0.00%) 34.57 (-24.79%) Percentage huge-12 68.12 ( 0.00%) 40.07 (-41.17%) Percentage huge-18 64.93 ( 0.00%) 47.82 (-26.35%) Percentage huge-24 62.69 ( 0.00%) 44.23 (-29.44%) Percentage huge-30 43.49 ( 0.00%) 55.38 ( 27.34%) Percentage huge-32 50.72 ( 0.00%) 51.90 ( 2.35%) 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 Minor Faults 37429143 47564000 Major Faults 1916 1558 Swap Ins 1466 1079 Swap Outs 2936863 149626 Allocation stalls 62510 3 DMA allocs 0 0 DMA32 allocs 6566458 6401314 Normal allocs 216361697 216538171 Movable allocs 0 0 Direct pages scanned 25977580 17998 Kswapd pages scanned 0 3638931 Kswapd pages reclaimed 0 207236 Direct pages reclaimed 8833714 88 Compaction stalls 103349 5 Compaction success 270 4 Compaction failures 103079 1 Note again that while this does swap as it's an aggressive workload, the direct relcim activity and allocation stalls is substantially reduced. There is some kswapd activity but ftrace showed that the kswapd activity was due to normal wakeups from 4K pages being allocated. Compaction-related stalls and activity are almost eliminated. I also tried the stutter benchmark. For this, I do not have figures for NUMA but it's something that does impact UMA so I'll report what is available stutter 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Min mmap 7.3571 ( 0.00%) 7.3438 ( 0.18%) 1st-qrtle mmap 7.5278 ( 0.00%) 17.9200 (-138.05%) 2nd-qrtle mmap 7.6818 ( 0.00%) 21.6055 (-181.25%) 3rd-qrtle mmap 11.0889 ( 0.00%) 21.8881 (-97.39%) Max-90% mmap 27.8978 ( 0.00%) 22.1632 ( 20.56%) Max-93% mmap 28.3202 ( 0.00%) 22.3044 ( 21.24%) Max-95% mmap 28.5600 ( 0.00%) 22.4580 ( 21.37%) Max-99% mmap 29.6032 ( 0.00%) 25.5216 ( 13.79%) Max mmap 4109.7289 ( 0.00%) 4813.9832 (-17.14%) Mean mmap 12.4474 ( 0.00%) 19.3027 (-55.07%) This benchmark is trying to fault an anonymous mapping while there is a heavy IO load -- a scenario that desktop users used to complain about frequently. 
This shows a mix because the ideal case of mapping with THP is not hit as often. However, note that 99% of the mappings complete 13.79% faster. The CPU usage here is particularly interesting 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 User 67.50 0.99 System 1327.88 91.30 Elapsed 2079.00 2128.98 And once again we look at the reclaim figures 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 Minor Faults 335241922 1314582827 Major Faults 715 819 Swap Ins 0 0 Swap Outs 0 0 Allocation stalls 532723 0 DMA allocs 0 0 DMA32 allocs 1822364341 1177950222 Normal allocs 1815640808 1517844854 Movable allocs 0 0 Direct pages scanned 21892772 0 Kswapd pages scanned 20015890 41879484 Kswapd pages reclaimed 19961986 41822072 Direct pages reclaimed 21892741 0 Compaction stalls 1065755 0 Compaction success 514 0 Compaction failures 1065241 0 Allocation stalls and all direct reclaim activity is eliminated as well as compaction-related stalls. THP gives impressive gains in some cases but only if they are quickly available. We're not going to reach the point where they are completely free so lets take the costs out of the fast paths finally and defer the cost to kswapd, kcompactd and khugepaged where it belongs. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 101 ++++++++++++++++++++++++++++++++++++------------------- mm/page_alloc.c | 8 ----- mm/slab.c | 8 ++--- mm/slub.c | 2 +- 4 files changed, 71 insertions(+), 48 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1dddfb2..e08b165 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -78,7 +78,7 @@ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE (1<vm_flags & VM_HUGEPAGE)) + reclaim_flags = __GFP_DIRECT_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + reclaim_flags = __GFP_KSWAPD_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + reclaim_flags = __GFP_DIRECT_RECLAIM; + + return GFP_TRANSHUGE | reclaim_flags; +} + +/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ +static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) { - return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp; + return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0); } /* Caller must hold page table lock. 
*/ @@ -919,7 +950,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } return ret; } - gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + gfp = alloc_hugepage_direct_gfpmask(vma); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); @@ -1279,7 +1310,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + huge_gfp = alloc_hugepage_direct_gfpmask(vma); new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); } else new_page = NULL; @@ -2249,11 +2280,12 @@ static int khugepaged_find_target_node(void) return 0; } -static inline struct page *alloc_hugepage(int defrag) +static inline struct page *alloc_khugepaged_hugepage(void) { struct page *page; - page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER); + page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), + HPAGE_PMD_ORDER); if (page) prep_transhuge_page(page); return page; @@ -2264,7 +2296,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait) struct page *hpage; do { - hpage = alloc_hugepage(khugepaged_defrag()); + hpage = alloc_khugepaged_hugepage(); if (!hpage) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); if (!*wait) @@ -2335,8 +2367,7 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* Only allocate from the target node */ - gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | - __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; /* release the mmap_sem read lock. */ new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d156310..096a00d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3119,14 +3119,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; - /* - * If this allocation cannot block and it is for a specific node, then - * fail early. There's no need to wakeup kswapd or retry for a - * speculative node-specific allocation. - */ - if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) - goto nopage; - retry: if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); diff --git a/mm/slab.c b/mm/slab.c index 56dd0df..e1f6c27 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -670,7 +670,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, static inline gfp_t gfp_exact_node(gfp_t flags) { - return flags; + return flags & ~__GFP_NOFAIL; } #else /* CONFIG_NUMA */ @@ -841,12 +841,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } /* - * Construct gfp mask to allocate from a specific node but do not direct reclaim - * or warn about failures. kswapd may still wake to reclaim in the background. + * Construct gfp mask to allocate from a specific node but do not reclaim or + * warn about failures. 
*/ static inline gfp_t gfp_exact_node(gfp_t flags) { - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM; + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL); } #endif diff --git a/mm/slub.c b/mm/slub.c index 2f2f04d..64ed5f3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1426,7 +1426,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) - alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM; + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL); page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { -- cgit v1.1

From fe896d1878949ea92ba547587bc3075cc688fb8f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:19:26 -0700 Subject: mm: introduce page reference manipulation functions

The success of CMA allocation largely depends on the success of migration, and a key factor in that is the page reference count. Until now, page references have been manipulated by calling the atomic functions directly, so we cannot track who manipulates them and where, which makes it hard to find the actual reason for a CMA allocation failure. CMA allocation should be guaranteed to succeed, so finding the offending place is really important.

In this patch, call sites that manipulate a page reference are converted to the newly introduced wrapper functions. This is a preparation step for adding a tracepoint to each page reference manipulation function. With that facility, we can easily find the reason for a CMA allocation failure. There is no functional change in this patch.

In addition, this patch also converts the reference read sites. This will help a second step that renames page._count to something else and prevents later attempts to access it directly (suggested by Andrew).

Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: "Kirill A.
Shutemov" Cc: Sergey Senozhatsky Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 2 +- mm/huge_memory.c | 6 +++--- mm/internal.h | 7 +------ mm/memory_hotplug.c | 4 ++-- mm/migrate.c | 10 +++++----- mm/page_alloc.c | 12 ++++++------ mm/vmscan.c | 6 +++--- 7 files changed, 21 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/debug.c b/mm/debug.c index df7247b..8865bfb 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -43,7 +43,7 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", - page, atomic_read(&page->_count), page_mapcount(page), + page, page_ref_count(page), page_mapcount(page), page->mapping, page->index); if (PageCompound(page)) pr_cont(" compound_mapcount: %d", compound_mapcount(page)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e08b165..bb944c7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2888,7 +2888,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, page = pmd_page(*pmd); VM_BUG_ON_PAGE(!page_count(page), page); - atomic_add(HPAGE_PMD_NR - 1, &page->_count); + page_ref_add(page, HPAGE_PMD_NR - 1); write = pmd_write(*pmd); young = pmd_young(*pmd); dirty = pmd_dirty(*pmd); @@ -3257,7 +3257,7 @@ static void __split_huge_page_tail(struct page *head, int tail, struct page *page_tail = head + tail; VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); - VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); + VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); /* * tail_page->_count is zero and not changing from under us. But @@ -3270,7 +3270,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * atomic_set() here would be safe on all archs (and not only on x86), * it's safer to use atomic_inc(). 
*/ - atomic_inc(&page_tail->_count); + page_ref_inc(page_tail); page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & diff --git a/mm/internal.h b/mm/internal.h index 4042a8a..57d7b0e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -38,11 +38,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -static inline void set_page_count(struct page *page, int v) -{ - atomic_set(&page->_count, v); -} - extern int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size); @@ -64,7 +59,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra, static inline void set_page_refcounted(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(atomic_read(&page->_count), page); + VM_BUG_ON_PAGE(page_ref_count(page), page); set_page_count(page, 1); } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c832ef3..e62aa07 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -167,7 +167,7 @@ void get_page_bootmem(unsigned long info, struct page *page, page->lru.next = (struct list_head *) type; SetPagePrivate(page); set_page_private(page, info); - atomic_inc(&page->_count); + page_ref_inc(page); } void put_page_bootmem(struct page *page) @@ -178,7 +178,7 @@ void put_page_bootmem(struct page *page) BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); - if (atomic_dec_return(&page->_count) == 1) { + if (page_ref_dec_return(page) == 1) { ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); diff --git a/mm/migrate.c b/mm/migrate.c index fdaf081..577c94b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -349,7 +349,7 @@ int migrate_page_move_mapping(struct address_space *mapping, return -EAGAIN; } - if (!page_freeze_refs(page, expected_count)) { + if (!page_ref_freeze(page, expected_count)) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -363,7 +363,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ if (mode == MIGRATE_ASYNC && head && !buffer_migrate_lock_buffers(head, mode)) { - page_unfreeze_refs(page, expected_count); + page_ref_unfreeze(page, expected_count); spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -397,7 +397,7 @@ int migrate_page_move_mapping(struct address_space *mapping, * to one less reference. * We know this isn't the last reference. 
*/ - page_unfreeze_refs(page, expected_count - 1); + page_ref_unfreeze(page, expected_count - 1); spin_unlock(&mapping->tree_lock); /* Leave irq disabled to prevent preemption while updating stats */ @@ -451,7 +451,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return -EAGAIN; } - if (!page_freeze_refs(page, expected_count)) { + if (!page_ref_freeze(page, expected_count)) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -463,7 +463,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, radix_tree_replace_slot(pslot, newpage); - page_unfreeze_refs(page, expected_count - 1); + page_ref_unfreeze(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 096a00d..30134a8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -766,7 +766,7 @@ static inline int free_pages_check(struct page *page) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; - if (unlikely(atomic_read(&page->_count) != 0)) + if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _count"; if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; @@ -1462,7 +1462,7 @@ static inline int check_new_page(struct page *page) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; - if (unlikely(atomic_read(&page->_count) != 0)) + if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _count"; if (unlikely(page->flags & __PG_HWPOISON)) { bad_reason = "HWPoisoned (hardware-corrupted)"; @@ -3475,7 +3475,7 @@ refill: /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ - atomic_add(size - 1, &page->_count); + page_ref_add(page, size - 1); /* reset page count bias and offset to start of new frag */ nc->pfmemalloc = page_is_pfmemalloc(page); @@ -3487,7 +3487,7 @@ refill: if (unlikely(offset < 0)) { page = virt_to_page(nc->va); - if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) + if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) goto refill; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) @@ -3495,7 +3495,7 @@ refill: size = nc->size; #endif /* OK, page count is 0, we can safely set it */ - atomic_set(&page->_count, size); + set_page_count(page, size); /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = size; @@ -6852,7 +6852,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * This check already skips compound tails of THP * because their page->_count is zero at all time. */ - if (!atomic_read(&page->_count)) { + if (!page_ref_count(page)) { if (PageBuddy(page)) iter += (1 << page_order(page)) - 1; continue; diff --git a/mm/vmscan.c b/mm/vmscan.c index b41b82d..b934223e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -638,11 +638,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. 
*/ - if (!page_freeze_refs(page, 2)) + if (!page_ref_freeze(page, 2)) goto cannot_free; /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ if (unlikely(PageDirty(page))) { - page_unfreeze_refs(page, 2); + page_ref_unfreeze(page, 2); goto cannot_free; } @@ -704,7 +704,7 @@ int remove_mapping(struct address_space *mapping, struct page *page) * drops the pagecache ref for us without requiring another * atomic operation. */ - page_unfreeze_refs(page, 1); + page_ref_unfreeze(page, 1); return 1; } return 0; -- cgit v1.1

From 95813b8faa0cd315f61a8b9d9c523792370b693e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:19:29 -0700 Subject: mm/page_ref: add tracepoint to track down page reference manipulation

CMA allocation should be guaranteed to succeed by definition, but unfortunately it sometimes fails. It is hard to track down the problem, because it is related to page reference manipulation and we don't have any facility to analyze it.

This patch adds tracepoints to track down page reference manipulation. With them, we can find the exact reason for a failure and fix the problem. The following is an example of the tracepoint output (note: this example is from a stale version that printed the flags as a number; recent versions print them as a human-readable string):

<...>-9018 [004] 92.678375: page_ref_set: pfn=0x17ac9 flags=0x0 count=1 mapcount=0 mapping=(nil) mt=4 val=1
<...>-9018 [004] 92.678378: kernel_stack:
=> get_page_from_freelist (ffffffff81176659)
=> __alloc_pages_nodemask (ffffffff81176d22)
=> alloc_pages_vma (ffffffff811bf675)
=> handle_mm_fault (ffffffff8119e693)
=> __do_page_fault (ffffffff810631ea)
=> trace_do_page_fault (ffffffff81063543)
=> do_async_page_fault (ffffffff8105c40a)
=> async_page_fault (ffffffff817581d8)
[snip]
<...>-9018 [004] 92.678379: page_ref_mod: pfn=0x17ac9 flags=0x40048 count=2 mapcount=1 mapping=0xffff880015a78dc1 mt=4 val=1
[snip]
...
...
<...>-9131 [001] 93.174468: test_pages_isolated: start_pfn=0x17800 end_pfn=0x17c00 fin_pfn=0x17ac9 ret=fail
[snip]
<...>-9018 [004] 93.174843: page_ref_mod_and_test: pfn=0x17ac9 flags=0x40068 count=0 mapcount=0 mapping=0xffff880015a78dc1 mt=4 val=-1 ret=1
=> release_pages (ffffffff8117c9e4)
=> free_pages_and_swap_cache (ffffffff811b0697)
=> tlb_flush_mmu_free (ffffffff81199616)
=> tlb_finish_mmu (ffffffff8119a62c)
=> exit_mmap (ffffffff811a53f7)
=> mmput (ffffffff81073f47)
=> do_exit (ffffffff810794e9)
=> do_group_exit (ffffffff81079def)
=> SyS_exit_group (ffffffff81079e74)
=> entry_SYSCALL_64_fastpath (ffffffff817560b6)

This output shows that the problem comes from the exit path. In the exit path, to improve performance, pages are not freed immediately; they are gathered and processed in batches. During this process, migration is not possible, so the CMA allocation fails. This problem is hard to find without this page reference tracepoint facility.

Enabling this feature bloats the kernel text by about 30 KB in my configuration.

text data bss dec hex filename
12127327 2243616 1507328 15878271 f2487f vmlinux_disabled
12157208 2258880 1507328 15923416 f2f8d8 vmlinux_enabled

Note that, due to a header file dependency problem between mm.h and tracepoint.h, this feature has to open-code the static key functions for tracepoints, as proposed by Steven Rostedt in the following link:
https://lkml.org/lkml/2015/12/9/699 [arnd@arndb.de: crypto/async_pq: use __free_page() instead of put_page()] [iamjoonsoo.kim@lge.com: fix build failure for xtensa] [akpm@linux-foundation.org: tweak Kconfig text, per Vlastimil] Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: "Kirill A. Shutemov" Cc: Sergey Senozhatsky Acked-by: Steven Rostedt Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig.debug | 13 +++++++++++++ mm/Makefile | 1 + mm/debug_page_ref.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 mm/debug_page_ref.c (limited to 'mm') diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 5c50b23..22f4cd9 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -79,3 +79,16 @@ config PAGE_POISONING_ZERO Enabling page poisoning with this option will disable hibernation If unsure, say N + bool + +config DEBUG_PAGE_REF + bool "Enable tracepoint to track down page reference manipulation" + depends on DEBUG_KERNEL + depends on TRACEPOINTS + ---help--- + This is a feature to add tracepoint for tracking down page reference + manipulation. This tracking is useful to diagnose functional failure + due to migration failures caused by page reference mismatches. Be + careful when enabling this feature because it adds about 30 KB to the + kernel code. However the runtime performance overhead is virtually + nil until the tracepoints are actually enabled. diff --git a/mm/Makefile b/mm/Makefile index cfdd481d..6da300a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -81,3 +81,4 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o +obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o diff --git a/mm/debug_page_ref.c b/mm/debug_page_ref.c new file mode 100644 index 0000000..1aef3d5 --- /dev/null +++ b/mm/debug_page_ref.c @@ -0,0 +1,54 @@ +#include +#include + +#define CREATE_TRACE_POINTS +#include + +void __page_ref_set(struct page *page, int v) +{ + trace_page_ref_set(page, v); +} +EXPORT_SYMBOL(__page_ref_set); +EXPORT_TRACEPOINT_SYMBOL(page_ref_set); + +void __page_ref_mod(struct page *page, int v) +{ + trace_page_ref_mod(page, v); +} +EXPORT_SYMBOL(__page_ref_mod); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod); + +void __page_ref_mod_and_test(struct page *page, int v, int ret) +{ + trace_page_ref_mod_and_test(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_mod_and_test); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_test); + +void __page_ref_mod_and_return(struct page *page, int v, int ret) +{ + trace_page_ref_mod_and_return(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_mod_and_return); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_return); + +void __page_ref_mod_unless(struct page *page, int v, int u) +{ + trace_page_ref_mod_unless(page, v, u); +} +EXPORT_SYMBOL(__page_ref_mod_unless); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_unless); + +void __page_ref_freeze(struct page *page, int v, int ret) +{ + trace_page_ref_freeze(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_freeze); +EXPORT_TRACEPOINT_SYMBOL(page_ref_freeze); + +void __page_ref_unfreeze(struct page *page, int v) +{ + trace_page_ref_unfreeze(page, v); +} +EXPORT_SYMBOL(__page_ref_unfreeze); +EXPORT_TRACEPOINT_SYMBOL(page_ref_unfreeze); -- cgit v1.1 From 0f352e5392c86d054998dd6526f2fdc33ec4bed5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 17 Mar 2016 
14:19:32 -0700 Subject: mm: remove __GFP_NOFAIL is deprecated comment

Commit 647757197cd3 ("mm: clarify __GFP_NOFAIL deprecation status") was incomplete and didn't remove the comment about __GFP_NOFAIL being deprecated in buffered_rmqueue. Let's get rid of this leftover, but keep the WARN_ON_ONCE for order > 1 because we should really discourage using __GFP_NOFAIL with higher-order allocations; those are just too subtle.

Signed-off-by: Michal Hocko Reviewed-by: Nikolay Borisov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 30134a8..30f01c6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2350,19 +2350,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, list_del(&page->lru); pcp->count--; } else { - if (unlikely(gfp_flags & __GFP_NOFAIL)) { - /* - * __GFP_NOFAIL is not to be used in new code. - * - * All __GFP_NOFAIL callers should be fixed so that they - * properly detect and handle allocation failures. - * - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with - * __GFP_NOFAIL. - */ - WARN_ON_ONCE(order > 1); - } + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); spin_lock_irqsave(&zone->lock, flags); page = NULL; -- cgit v1.1

From e33e33b4d1c699d06fb8ccd6da80b309b84ec975 Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Thu, 17 Mar 2016 14:19:35 -0700 Subject: mm, memory hotplug: print debug message in the proper way for online_pages

online_pages() simply returns an error value if memory_notify(MEM_GOING_ONLINE, &arg) returns a value that is not what we want for successfully onlining the target pages. This patch aims to print more failure information in online_pages(), like offline_pages() does. This patch also converts printk(KERN_) to pr_(), and changes __offline_pages() to not print failure information with KERN_INFO, per David Rientjes's suggestion [1].

[1] https://lkml.org/lkml/2016/2/24/1094

Signed-off-by: Chen Yucong Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e62aa07..f5758b6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1059,10 +1059,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); - if (ret) { - memory_notify(MEM_CANCEL_ONLINE, &arg); - return ret; - } + if (ret) + goto failed_addition; + /* * If this zone is not populated, then it is not in zonelist. * This means the page allocator ignores this zone. @@ -1080,12 +1079,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (need_zonelists_rebuild) zone_pcp_reset(zone); mutex_unlock(&zonelists_mutex); - printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", - (unsigned long long) pfn << PAGE_SHIFT, - (((unsigned long long) pfn + nr_pages) - << PAGE_SHIFT) - 1); - memory_notify(MEM_CANCEL_ONLINE, &arg); - return ret; + goto failed_addition; } zone->present_pages += onlined_pages; @@ -1118,6 +1112,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) memory_notify(MEM_ONLINE, &arg); return 0; + +failed_addition: + pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", + (unsigned long long) pfn << PAGE_SHIFT, + (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); + memory_notify(MEM_CANCEL_ONLINE, &arg); + return ret; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ @@ -1529,8 +1530,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } else { #ifdef CONFIG_DEBUG_VM - printk(KERN_ALERT "removing pfn %lx from LRU failed\n", - pfn); + pr_alert("removing pfn %lx from LRU failed\n", pfn); dump_page(page, "failed to remove from LRU"); #endif put_page(page); @@ -1858,7 +1858,7 @@ repeat: ret = -EBUSY; goto failed_removal; } - printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); + pr_info("Offlined Pages %ld\n", offlined_pages); /* Ok, all of our target is isolated. We cannot do rollback at this point. */ offline_isolated_pages(start_pfn, end_pfn); @@ -1895,9 +1895,9 @@ repeat: return 0; failed_removal: - printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", - (unsigned long long) start_pfn << PAGE_SHIFT, - ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); + pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n", - (unsigned long long) start_pfn << PAGE_SHIFT, + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); -- cgit v1.1
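The structural change in the online_pages() patch above is the usual consolidation of duplicated failure reporting and rollback behind a single goto label. A minimal, hedged sketch of that shape follows; the function and step names are illustrative stand-ins, not the kernel's code.

#include <stdio.h>

/* Illustrative stand-ins for the notifier/populate steps the real code performs. */
static int prepare_step(int arg)  { return arg < 0 ? -1 : 0; }
static int populate_step(int arg) { return arg == 0 ? -1 : 0; }
static void cancel_notify(void)   { printf("cancel notification sent\n"); }

static int online_range_sketch(int arg)
{
	int ret;

	ret = prepare_step(arg);
	if (ret)
		goto failed_addition;	/* was: duplicated report + cancel + return */

	ret = populate_step(arg);
	if (ret)
		goto failed_addition;	/* every failure now funnels through one exit */

	printf("range %d onlined\n", arg);
	return 0;

failed_addition:
	fprintf(stderr, "onlining range %d failed\n", arg);	/* single pr_debug-style report */
	cancel_notify();					/* single MEM_CANCEL_ONLINE-style rollback */
	return ret;
}

int main(void)
{
	online_range_sketch(3);
	online_range_sketch(-1);
	return 0;
}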
From d334c9bcb4aebd0c3e89ee415cba86fa5f0253ee Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:19:38 -0700 Subject: mm: memcontrol: cleanup css_reset callback

- Do not take memcg_limit_mutex for resetting limits - the cgroup cannot be altered from userspace anymore, so there is no need to protect them.
- Use plain page_counter_limit() for resetting ->memory and ->memsw limits instead of the mem_cgroup_resize_* helpers - we enlarge the limits, so no special handling is needed.
- Reset ->swap and ->tcpmem limits as well.
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ae8b81c..8615b06 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4257,9 +4257,11 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); - mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); - memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX); memcg->low = 0; memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; -- cgit v1.1 From b11a7b94100cba5ec926a181894c2897a22651b9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 17 Mar 2016 14:19:41 -0700 Subject: mm: exclude ZONE_DEVICE from GFP_ZONE_TABLE ZONE_DEVICE (merged in 4.3) and ZONE_CMA (proposed) are examples of new mm zones that are bumping up against the current maximum limit of 4 zones, i.e. 2 bits in page->flags for the GFP_ZONE_TABLE. The GFP_ZONE_TABLE poses an interesting constraint since include/linux/gfp.h gets included by the 32-bit portion of a 64-bit build. We need to be careful to only build the table for zones that have a corresponding gfp_t flag. GFP_ZONES_SHIFT is introduced for this purpose. This patch does not attempt to solve the problem of adding a new zone that also has a corresponding GFP_ flag. Vlastimil points out that ZONE_DEVICE, by depending on x86_64 and SPARSEMEM_VMEMMAP implies that SECTIONS_WIDTH is zero. In other words even though ZONE_DEVICE does not fit in GFP_ZONE_TABLE it is free to consume another bit in page->flags (expand ZONES_WIDTH) with room to spare. Link: https://bugzilla.kernel.org/show_bug.cgi?id=110931 Fixes: 033fbae988fc ("mm: ZONE_DEVICE for "device memory"") Signed-off-by: Dan Williams Reported-by: Mark Reported-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Cc: Joonsoo Kim Cc: Dave Hansen Cc: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index c077765..520dae6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -651,8 +651,6 @@ config IDLE_PAGE_TRACKING config ZONE_DEVICE bool "Device memory (pmem, etc...) hotplug support" if EXPERT - default !ZONE_DMA - depends on !ZONE_DMA depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on X86_64 #arch_add_memory() comprehends device memory -- cgit v1.1 From 598d80914e84fa79580850530f5d4a50a99bf4f5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 17 Mar 2016 14:19:44 -0700 Subject: mm: convert pr_warning to pr_warn There are a mixture of pr_warning and pr_warn uses in mm. Use pr_warn consistently. 
Miscellanea: - Coalesce formats - Realign arguments Signed-off-by: Joe Perches Acked-by: Tejun Heo [percpu] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 ++--- mm/kmemleak.c | 14 +++++++------- mm/percpu.c | 15 +++++++-------- 3 files changed, 16 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index aefba5a..06058ea 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2665,7 +2665,7 @@ void __init hugetlb_add_hstate(unsigned int order) unsigned long i; if (size_to_hstate(PAGE_SIZE << order)) { - pr_warning("hugepagesz= specified twice, ignoring\n"); + pr_warn("hugepagesz= specified twice, ignoring\n"); return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); @@ -2701,8 +2701,7 @@ static int __init hugetlb_nrpages_setup(char *s) mhp = &parsed_hstate->max_huge_pages; if (mhp == last_mhp) { - pr_warning("hugepages= specified twice without " - "interleaving hugepagesz=, ignoring\n"); + pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n"); return 1; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 25c0ad3..a81cd76 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -276,7 +276,7 @@ static void kmemleak_disable(void); * Print a warning and dump the stack trace. */ #define kmemleak_warn(x...) do { \ - pr_warning(x); \ + pr_warn(x); \ dump_stack(); \ kmemleak_warning = 1; \ } while (0) @@ -543,7 +543,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { - pr_warning("Cannot allocate a kmemleak_object structure\n"); + pr_warn("Cannot allocate a kmemleak_object structure\n"); kmemleak_disable(); return NULL; } @@ -764,7 +764,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); if (!area) { - pr_warning("Cannot allocate a scan area\n"); + pr_warn("Cannot allocate a scan area\n"); goto out; } @@ -1515,7 +1515,7 @@ static void start_scan_thread(void) return; scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak"); if (IS_ERR(scan_thread)) { - pr_warning("Failed to create the scan thread\n"); + pr_warn("Failed to create the scan thread\n"); scan_thread = NULL; } } @@ -1874,8 +1874,8 @@ void __init kmemleak_init(void) scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); if (crt_early_log > ARRAY_SIZE(early_log)) - pr_warning("Early log buffer exceeded (%d), please increase " - "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log); + pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", + crt_early_log); /* the kernel is still in UP mode, so disabling the IRQs is enough */ local_irq_save(flags); @@ -1960,7 +1960,7 @@ static int __init kmemleak_late_init(void) dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, &kmemleak_fops); if (!dentry) - pr_warning("Failed to create the debugfs kmemleak file\n"); + pr_warn("Failed to create the debugfs kmemleak file\n"); mutex_lock(&scan_mutex); start_scan_thread(); mutex_unlock(&scan_mutex); diff --git a/mm/percpu.c b/mm/percpu.c index 998607a..847814b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1033,8 +1033,8 @@ fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: if (!is_atomic && warn_limit) { - pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", - size, align, is_atomic, err); + pr_warn("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, 
is_atomic, err); dump_stack(); if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); @@ -1723,7 +1723,7 @@ static int __init percpu_alloc_setup(char *str) pcpu_chosen_fc = PCPU_FC_PAGE; #endif else - pr_warning("PERCPU: unknown allocator %s specified\n", str); + pr_warn("PERCPU: unknown allocator %s specified\n", str); return 0; } @@ -2016,9 +2016,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, /* warn if maximum distance is further than 75% of vmalloc space */ if (max_distance > VMALLOC_TOTAL * 3 / 4) { - pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " - "space 0x%lx\n", max_distance, - VMALLOC_TOTAL); + pr_warn("PERCPU: max_distance=0x%zx too large for vmalloc space 0x%lx\n", + max_distance, VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ rc = -EINVAL; @@ -2100,8 +2099,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { - pr_warning("PERCPU: failed to allocate %s page " - "for cpu%u\n", psize_str, cpu); + pr_warn("PERCPU: failed to allocate %s page for cpu%u\n", + psize_str, cpu); goto enomem; } /* kmemleak tracks the percpu allocations separately */ -- cgit v1.1 From 756a025f00091918d9d09ca3229defb160b409c0 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 17 Mar 2016 14:19:47 -0700 Subject: mm: coalesce split strings Kernel style prefers a single string over split strings when the string is 'user-visible'. Miscellanea: - Add a missing newline - Realign arguments Signed-off-by: Joe Perches Acked-by: Tejun Heo [percpu] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 10 ++++------ mm/huge_memory.c | 3 +-- mm/kasan/report.c | 6 ++---- mm/kmemcheck.c | 3 +-- mm/kmemleak.c | 18 ++++++++---------- mm/memblock.c | 3 +-- mm/memory_hotplug.c | 3 +-- mm/mempolicy.c | 4 +--- mm/mmap.c | 8 +++----- mm/oom_kill.c | 3 +-- mm/page_alloc.c | 37 +++++++++++++++++-------------------- mm/page_owner.c | 5 ++--- mm/percpu.c | 4 ++-- mm/slab.c | 28 ++++++++++------------------ mm/slab_common.c | 10 ++++------ mm/slub.c | 19 +++++++++---------- mm/sparse-vmemmap.c | 8 ++++---- mm/sparse.c | 8 ++++---- mm/swapfile.c | 3 +-- mm/vmalloc.c | 4 ++-- 20 files changed, 78 insertions(+), 109 deletions(-) (limited to 'mm') diff --git a/mm/dmapool.c b/mm/dmapool.c index 57312b5..2821500 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -452,13 +452,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) } spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) - dev_err(pool->dev, "dma_pool_free %s, dma %Lx " - "already free\n", pool->name, - (unsigned long long)dma); + dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n", + pool->name, (unsigned long long)dma); else - printk(KERN_ERR "dma_pool_free %s, dma %Lx " - "already free\n", pool->name, - (unsigned long long)dma); + printk(KERN_ERR "dma_pool_free %s, dma %Lx already free\n", + pool->name, (unsigned long long)dma); return; } } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bb944c7..e1a177c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -168,8 +168,7 @@ static void set_recommended_min_free_kbytes(void) if (recommended_min > min_free_kbytes) { if (user_min_free_kbytes >= 0) - pr_info("raising min_free_kbytes from %d to %lu " - "to help transparent hugepage allocations\n", + pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", min_free_kbytes, recommended_min); 
min_free_kbytes = recommended_min; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 12f222d..745aa8f 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -214,8 +214,7 @@ static void kasan_report_error(struct kasan_access_info *info) */ kasan_disable_current(); spin_lock_irqsave(&report_lock, flags); - pr_err("=================================" - "=================================\n"); + pr_err("==================================================================\n"); if (info->access_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { if ((unsigned long)info->access_addr < PAGE_SIZE) @@ -236,8 +235,7 @@ static void kasan_report_error(struct kasan_access_info *info) print_address_description(info); print_shadow_for_address(info->first_bad_addr); } - pr_err("=================================" - "=================================\n"); + pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, flags); kasan_enable_current(); diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index 6f4f424..e5f8333 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -20,8 +20,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); if (!shadow) { if (printk_ratelimit()) - printk(KERN_ERR "kmemcheck: failed to allocate " - "shadow bitmap\n"); + printk(KERN_ERR "kmemcheck: failed to allocate shadow bitmap\n"); return; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a81cd76..e642992 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -596,8 +596,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, else if (parent->pointer + parent->size <= ptr) link = &parent->rb_node.rb_right; else { - kmemleak_stop("Cannot insert 0x%lx into the object " - "search tree (overlaps existing)\n", + kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n", ptr); /* * No need for parent->lock here since "parent" cannot @@ -670,8 +669,8 @@ static void delete_object_part(unsigned long ptr, size_t size) object = find_and_remove_object(ptr, 1); if (!object) { #ifdef DEBUG - kmemleak_warn("Partially freeing unknown object at 0x%08lx " - "(size %zu)\n", ptr, size); + kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", + ptr, size); #endif return; } @@ -717,8 +716,8 @@ static void paint_ptr(unsigned long ptr, int color) object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("Trying to color unknown object " - "at 0x%08lx as %s\n", ptr, + kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n", + ptr, (color == KMEMLEAK_GREY) ? "Grey" : (color == KMEMLEAK_BLACK) ? "Black" : "Unknown"); return; @@ -1463,8 +1462,8 @@ static void kmemleak_scan(void) if (new_leaks) { kmemleak_found_leaks = true; - pr_info("%d new suspected memory leaks (see " - "/sys/kernel/debug/kmemleak)\n", new_leaks); + pr_info("%d new suspected memory leaks (see /sys/kernel/debug/kmemleak)\n", + new_leaks); } } @@ -1795,8 +1794,7 @@ static void kmemleak_do_cleanup(struct work_struct *work) if (!kmemleak_found_leaks) __kmemleak_do_cleanup(); else - pr_info("Kmemleak disabled without freeing internal data. " - "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); + pr_info("Kmemleak disabled without freeing internal data. 
Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\".\n"); } static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); diff --git a/mm/memblock.c b/mm/memblock.c index fc7824f..b570ddd 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -238,8 +238,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, * so we use WARN_ONCE() here to see the stack trace if * fail happens. */ - WARN_ONCE(1, "memblock: bottom-up allocation failed, " - "memory hotunplug may be affected\n"); + WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n"); } return __memblock_find_range_top_down(start, end, size, align, nid, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f5758b6..aa34431 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1970,8 +1970,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; - pr_warn("removing memory fails, because memory " - "[%pa-%pa] is onlined\n", + pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", &beginpa, &endpa); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8cbc743..b25de27 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2559,9 +2559,7 @@ static void __init check_numabalancing_enable(void) set_numabalancing_state(numabalancing_override == 1); if (num_online_nodes() > 1 && !numabalancing_override) { - pr_info("%s automatic NUMA balancing. " - "Configure with numa_balancing= or the " - "kernel.numa_balancing sysctl", + pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } diff --git a/mm/mmap.c b/mm/mmap.c index 1464192..e06345a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2517,9 +2517,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long ret = -EINVAL; struct file *file; - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " - "See Documentation/vm/remap_file_pages.txt.\n", - current->comm, current->pid); + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); if (prot) return ret; @@ -2885,8 +2884,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) if (is_data_mapping(flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { if (ignore_rlimit_data) - pr_warn_once("%s (%d): VmData %lu exceed data ulimit " - "%lu. Will be forbidden soon.\n", + pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. 
Will be forbidden soon.\n", current->comm, current->pid, (mm->data_vm + npages) << PAGE_SHIFT, rlimit(RLIMIT_DATA)); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 63ced70..fde3d37 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -383,8 +383,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p, struct mem_cgroup *memcg) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, " - "oom_score_adj=%hd\n", + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, current->signal->oom_score_adj); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 30f01c6..42cf199 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4074,8 +4074,7 @@ static int __parse_numa_zonelist_order(char *s) user_zonelist_order = ZONELIST_ORDER_ZONE; } else { printk(KERN_WARNING - "Ignoring invalid numa_zonelist_order value: " - "%s\n", s); + "Ignoring invalid numa_zonelist_order value: %s\n", s); return -EINVAL; } return 0; @@ -4539,12 +4538,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists in %s order, mobility grouping %s. " - "Total pages: %ld\n", - nr_online_nodes, - zonelist_order_name[current_zonelist_order], - page_group_by_mobility_disabled ? "off" : "on", - vm_total_pages); + pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", + nr_online_nodes, + zonelist_order_name[current_zonelist_order], + page_group_by_mobility_disabled ? "off" : "on", + vm_total_pages); #ifdef CONFIG_NUMA pr_info("Policy zone: %s\n", zone_names[policy_zone]); #endif @@ -6142,22 +6140,21 @@ void __init mem_init_print_info(const char *str) #undef adj_init_size - pr_info("Memory: %luK/%luK available " - "(%luK kernel code, %luK rwdata, %luK rodata, " - "%luK init, %luK bss, %luK reserved, %luK cma-reserved" + pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" #ifdef CONFIG_HIGHMEM - ", %luK highmem" + ", %luK highmem" #endif - "%s%s)\n", - nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), - codesize >> 10, datasize >> 10, rosize >> 10, - (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10), - totalcma_pages << (PAGE_SHIFT-10), + "%s%s)\n", + nr_free_pages() << (PAGE_SHIFT - 10), + physpages << (PAGE_SHIFT - 10), + codesize >> 10, datasize >> 10, rosize >> 10, + (init_data_size + init_code_size) >> 10, bss_size >> 10, + (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), + totalcma_pages << (PAGE_SHIFT - 10), #ifdef CONFIG_HIGHMEM - totalhigh_pages << (PAGE_SHIFT-10), + totalhigh_pages << (PAGE_SHIFT - 10), #endif - str ? ", " : "", str ? str : ""); + str ? ", " : "", str ? 
str : ""); } /** diff --git a/mm/page_owner.c b/mm/page_owner.c index 44ad1f0..ac3d8d1 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -198,9 +198,8 @@ void __dump_page_owner(struct page *page) return; } - pr_alert("page allocated via order %u, migratetype %s, " - "gfp_mask %#x(%pGg)\n", page_ext->order, - migratetype_names[mt], gfp_mask, &gfp_mask); + pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", + page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask); print_stack_trace(&trace, 0); if (page_ext->last_migrate_reason != -1) diff --git a/mm/percpu.c b/mm/percpu.c index 847814b..1571547 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -888,8 +888,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, size = ALIGN(size, 2); if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { - WARN(true, "illegal size (%zu) or align (%zu) for " - "percpu allocation\n", size, align); + WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n", + size, align); return NULL; } diff --git a/mm/slab.c b/mm/slab.c index e1f6c27..e558f85 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1566,11 +1566,9 @@ static void dump_line(char *data, int offset, int limit) if (bad_count == 1) { error ^= POISON_FREE; if (!(error & (error - 1))) { - printk(KERN_ERR "Single bit error detected. Probably " - "bad RAM.\n"); + printk(KERN_ERR "Single bit error detected. Probably bad RAM.\n"); #ifdef CONFIG_X86 - printk(KERN_ERR "Run memtest86+ or a similar memory " - "test tool.\n"); + printk(KERN_ERR "Run memtest86+ or a similar memory test tool.\n"); #else printk(KERN_ERR "Run a memory test tool.\n"); #endif @@ -1693,11 +1691,9 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "start of a freed object " - "was overwritten"); + slab_error(cachep, "start of a freed object was overwritten"); if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "end of a freed object " - "was overwritten"); + slab_error(cachep, "end of a freed object was overwritten"); } } } @@ -2398,11 +2394,9 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the" - " end of an object"); + slab_error(cachep, "constructor overwrote the end of an object"); if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the" - " start of an object"); + slab_error(cachep, "constructor overwrote the start of an object"); } /* need to poison the objs? 
*/ if (cachep->flags & SLAB_POISON) { @@ -2469,8 +2463,8 @@ static void slab_put_obj(struct kmem_cache *cachep, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { - printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); + printk(KERN_ERR "slab: double free detected in cache '%s', objp %p\n", + cachep->name, objp); BUG(); } } @@ -2901,8 +2895,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { - slab_error(cachep, "double free, or memory outside" - " object was overwritten"); + slab_error(cachep, "double free, or memory outside object was overwritten"); printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", objp, *dbg_redzone1(cachep, objp), @@ -4028,8 +4021,7 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) unsigned long node_frees = cachep->node_frees; unsigned long overflows = cachep->node_overflow; - seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " - "%4lu %4lu %4lu %4lu %4lu", + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees, overflows); diff --git a/mm/slab_common.c b/mm/slab_common.c index 8addc3c..e885e11 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -726,8 +726,8 @@ void kmem_cache_destroy(struct kmem_cache *s) err = shutdown_cache(s, &release, &need_rcu_barrier); if (err) { - pr_err("kmem_cache_destroy %s: " - "Slab cache still has objects\n", s->name); + pr_err("kmem_cache_destroy %s: Slab cache still has objects\n", + s->name); dump_stack(); } out_unlock: @@ -1047,13 +1047,11 @@ static void print_slabinfo_header(struct seq_file *m) #else seq_puts(m, "slabinfo - version: 2.1\n"); #endif - seq_puts(m, "# name " - " "); + seq_puts(m, "# name "); seq_puts(m, " : tunables "); seq_puts(m, " : slabdata "); #ifdef CONFIG_DEBUG_SLAB - seq_puts(m, " : globalstat " - " "); + seq_puts(m, " : globalstat "); seq_puts(m, " : cpustat "); #endif seq_putc(m, '\n'); diff --git a/mm/slub.c b/mm/slub.c index 64ed5f3..7277413 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -950,14 +950,14 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) max_objects = MAX_OBJS_PER_PAGE; if (page->objects != max_objects) { - slab_err(s, page, "Wrong number of objects. Found %d but " - "should be %d", page->objects, max_objects); + slab_err(s, page, "Wrong number of objects. Found %d but should be %d", + page->objects, max_objects); page->objects = max_objects; slab_fix(s, "Number of objects adjusted."); } if (page->inuse != page->objects - nr) { - slab_err(s, page, "Wrong object count. Counter is %d but " - "counted were %d", page->inuse, page->objects - nr); + slab_err(s, page, "Wrong object count. 
Counter is %d but counted were %d", + page->inuse, page->objects - nr); page->inuse = page->objects - nr; slab_fix(s, "Object count adjusted."); } @@ -1117,8 +1117,8 @@ static inline int free_consistency_checks(struct kmem_cache *s, if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { - slab_err(s, page, "Attempt to free object(0x%p) " - "outside of slab", object); + slab_err(s, page, "Attempt to free object(0x%p) outside of slab", + object); } else if (!page->slab_cache) { pr_err("SLUB : no slab for object 0x%p.\n", object); @@ -3439,10 +3439,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) - panic("Cannot create slab %s size=%lu realsize=%u " - "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)s->size, s->size, - oo_order(s->oo), s->offset, flags); + panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", + s->name, (unsigned long)s->size, s->size, + oo_order(s->oo), s->offset, flags); return -EINVAL; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index b60802b..d3511f9 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -166,8 +166,8 @@ void __meminit vmemmap_verify(pte_t *pte, int node, int actual_node = early_pfn_to_nid(pfn); if (node_distance(actual_node, node) > LOCAL_DISTANCE) - printk(KERN_WARNING "[%lx-%lx] potential offnode " - "page_structs\n", start, end - 1); + printk(KERN_WARNING "[%lx-%lx] potential offnode page_structs\n", + start, end - 1); } pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) @@ -292,8 +292,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (map_map[pnum]) continue; ms = __nr_to_section(pnum); - printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __func__); + printk(KERN_ERR "%s: sparsemem memory map backing failed some memory will not be available.\n", + __func__); ms->section_mem_map = 0; } diff --git a/mm/sparse.c b/mm/sparse.c index 3717cee..7cdb27d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -428,8 +428,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (map_map[pnum]) continue; ms = __nr_to_section(pnum); - printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __func__); + printk(KERN_ERR "%s: sparsemem memory map backing failed some memory will not be available.\n", + __func__); ms->section_mem_map = 0; } } @@ -456,8 +456,8 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) if (map) return map; - printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __func__); + printk(KERN_ERR "%s: sparsemem memory map backing failed some memory will not be available.\n", + __func__); ms->section_mem_map = 0; return NULL; } diff --git a/mm/swapfile.c b/mm/swapfile.c index d2c3736..b86cf26 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2526,8 +2526,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); - pr_info("Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", + pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? 
"SS" : "", diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d4b2e34..e86c24e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -469,8 +469,8 @@ overflow: goto retry; } if (printk_ratelimit()) - pr_warn("vmap allocation for size %lu failed: " - "use vmalloc= to increase size.\n", size); + pr_warn("vmap allocation for size %lu failed: use vmalloc= to increase size\n", + size); kfree(va); return ERR_PTR(-EBUSY); } -- cgit v1.1 From 1170532bb49f9468aedabdc1d5a560e2521a2bcc Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 17 Mar 2016 14:19:50 -0700 Subject: mm: convert printk(KERN_ to pr_ Most of the mm subsystem uses pr_ so make it consistent. Miscellanea: - Realign arguments - Add missing newline to format - kmemleak-test.c has a "kmemleak: " prefix added to the "Kmemleak testing" logging message via pr_fmt Signed-off-by: Joe Perches Acked-by: Tejun Heo [percpu] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 4 ++-- mm/bootmem.c | 7 +++---- mm/dmapool.c | 12 +++++------- mm/internal.h | 2 +- mm/kmemcheck.c | 2 +- mm/kmemleak-test.c | 2 +- mm/memory-failure.c | 52 +++++++++++++++++++++------------------------------- mm/memory.c | 17 +++++++---------- mm/mm_init.c | 7 +++---- mm/nobootmem.c | 4 ++-- mm/page_alloc.c | 24 +++++++++--------------- mm/page_io.c | 22 +++++++++++----------- mm/percpu-km.c | 6 +++--- mm/percpu.c | 12 ++++++------ mm/shmem.c | 14 ++++++-------- mm/slab.c | 51 ++++++++++++++++++++++++--------------------------- mm/slab_common.c | 2 +- mm/sparse-vmemmap.c | 6 +++--- mm/sparse.c | 17 +++++++---------- mm/swap_cgroup.c | 5 ++--- 20 files changed, 118 insertions(+), 150 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c554d17..bfbd709 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1026,8 +1026,8 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write, if (copy_to_user(buffer, kbuf, sizeof(kbuf))) return -EFAULT; - printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n", - table->procname); + pr_warn_once("%s exported in /proc is scheduled for removal\n", + table->procname); *lenp = 2; *ppos += *lenp; diff --git a/mm/bootmem.c b/mm/bootmem.c index 91e32bc..0aa7dda 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -50,8 +50,7 @@ early_param("bootmem_debug", bootmem_debug_setup); #define bdebug(fmt, args...) ({ \ if (unlikely(bootmem_debug)) \ - printk(KERN_INFO \ - "bootmem::%s " fmt, \ + pr_info("bootmem::%s " fmt, \ __func__, ## args); \ }) @@ -680,7 +679,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, /* * Whoops, we cannot satisfy the allocation request. 
*/ - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } @@ -755,7 +754,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (ptr) return ptr; - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } diff --git a/mm/dmapool.c b/mm/dmapool.c index 2821500..abcbfe8 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -294,8 +294,7 @@ void dma_pool_destroy(struct dma_pool *pool) "dma_pool_destroy %s, %p busy\n", pool->name, page->vaddr); else - printk(KERN_ERR - "dma_pool_destroy %s, %p busy\n", + pr_err("dma_pool_destroy %s, %p busy\n", pool->name, page->vaddr); /* leak the still-in-use consistent memory */ list_del(&page->page_list); @@ -424,7 +423,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) "dma_pool_free %s, %p/%lx (bad dma)\n", pool->name, vaddr, (unsigned long)dma); else - printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n", + pr_err("dma_pool_free %s, %p/%lx (bad dma)\n", pool->name, vaddr, (unsigned long)dma); return; } @@ -438,8 +437,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) "dma_pool_free %s, %p (bad vaddr)/%Lx\n", pool->name, vaddr, (unsigned long long)dma); else - printk(KERN_ERR - "dma_pool_free %s, %p (bad vaddr)/%Lx\n", + pr_err("dma_pool_free %s, %p (bad vaddr)/%Lx\n", pool->name, vaddr, (unsigned long long)dma); return; } @@ -455,8 +453,8 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n", pool->name, (unsigned long long)dma); else - printk(KERN_ERR "dma_pool_free %s, dma %Lx already free\n", - pool->name, (unsigned long long)dma); + pr_err("dma_pool_free %s, dma %Lx already free\n", + pool->name, (unsigned long long)dma); return; } } diff --git a/mm/internal.h b/mm/internal.h index 57d7b0e..7449392 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -386,7 +386,7 @@ extern int mminit_loglevel; do { \ if (level < mminit_loglevel) { \ if (level <= MMINIT_WARNING) \ - printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \ + pr_warn("mminit::" prefix " " fmt, ##arg); \ else \ printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ } \ diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index e5f8333..5bf1917 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -20,7 +20,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); if (!shadow) { if (printk_ratelimit()) - printk(KERN_ERR "kmemcheck: failed to allocate shadow bitmap\n"); + pr_err("kmemcheck: failed to allocate shadow bitmap\n"); return; } diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index dcdcadb..dd3c23a 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -49,7 +49,7 @@ static int __init kmemleak_test_init(void) struct test_node *elem; int i; - printk(KERN_INFO "Kmemleak testing\n"); + pr_info("Kmemleak testing\n"); /* make some orphan objects */ pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 67c30eb..5a544c6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -184,9 +184,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, struct siginfo si; int ret; - printk(KERN_ERR - "MCE %#lx: Killing %s:%d due to hardware memory 
corruption\n", - pfn, t->comm, t->pid); + pr_err("MCE %#lx: Killing %s:%d due to hardware memory corruption\n", + pfn, t->comm, t->pid); si.si_signo = SIGBUS; si.si_errno = 0; si.si_addr = (void *)addr; @@ -209,8 +208,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ } if (ret < 0) - printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", - t->comm, t->pid, ret); + pr_info("MCE: Error sending signal to %s:%d: %d\n", + t->comm, t->pid, ret); return ret; } @@ -290,8 +289,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, } else { tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); if (!tk) { - printk(KERN_ERR - "MCE: Out of memory while machine check handling\n"); + pr_err("MCE: Out of memory while machine check handling\n"); return; } } @@ -336,9 +334,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, * signal and then access the memory. Just kill it. */ if (fail || tk->addr_valid == 0) { - printk(KERN_ERR - "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", - pfn, tk->tsk->comm, tk->tsk->pid); + pr_err("MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", + pfn, tk->tsk->comm, tk->tsk->pid); force_sig(SIGKILL, tk->tsk); } @@ -350,9 +347,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, */ else if (kill_proc(tk->tsk, tk->addr, trapno, pfn, page, flags) < 0) - printk(KERN_ERR - "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", - pfn, tk->tsk->comm, tk->tsk->pid); + pr_err("MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", + pfn, tk->tsk->comm, tk->tsk->pid); } put_task_struct(tk->tsk); kfree(tk); @@ -563,7 +559,7 @@ static int me_kernel(struct page *p, unsigned long pfn) */ static int me_unknown(struct page *p, unsigned long pfn) { - printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); + pr_err("MCE %#lx: Unknown page state\n", pfn); return MF_FAILED; } @@ -608,8 +604,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) if (mapping->a_ops->error_remove_page) { err = mapping->a_ops->error_remove_page(mapping, p); if (err != 0) { - printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", - pfn, err); + pr_info("MCE %#lx: Failed to punch page: %d\n", + pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { pr_info("MCE %#lx: failed to release buffers\n", pfn); @@ -624,8 +620,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) if (invalidate_inode_page(p)) ret = MF_RECOVERED; else - printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", - pfn); + pr_info("MCE %#lx: Failed to invalidate\n", pfn); } return ret; } @@ -854,8 +849,7 @@ static int page_action(struct page_state *ps, struct page *p, if (ps->action == me_swapcache_dirty && result == MF_DELAYED) count--; if (count != 0) { - printk(KERN_ERR - "MCE %#lx: %s still referenced by %d users\n", + pr_err("MCE %#lx: %s still referenced by %d users\n", pfn, action_page_types[ps->type], count); result = MF_FAILED; } @@ -934,8 +928,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } if (PageSwapCache(p)) { - printk(KERN_ERR - "MCE %#lx: keeping poisoned page in swap cache\n", pfn); + pr_err("MCE %#lx: keeping poisoned page in swap cache\n", pfn); ttu |= TTU_IGNORE_HWPOISON; } @@ -953,8 +946,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } else { kill = 0; ttu |= TTU_IGNORE_HWPOISON; 
- printk(KERN_INFO - "MCE %#lx: corrupted page was clean: dropped without side effects\n", + pr_info("MCE %#lx: corrupted page was clean: dropped without side effects\n", pfn); } } @@ -972,8 +964,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) - printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(hpage)); + pr_err("MCE %#lx: failed to unmap page (mapcount=%d)\n", + pfn, page_mapcount(hpage)); /* * Now that the dirty bit has been propagated to the @@ -1040,16 +1032,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags) panic("Memory failure from trap %d on page %lx", trapno, pfn); if (!pfn_valid(pfn)) { - printk(KERN_ERR - "MCE %#lx: memory outside kernel control\n", - pfn); + pr_err("MCE %#lx: memory outside kernel control\n", pfn); return -ENXIO; } p = pfn_to_page(pfn); orig_head = hpage = compound_head(p); if (TestSetPageHWPoison(p)) { - printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); + pr_err("MCE %#lx: already hardware poisoned\n", pfn); return 0; } @@ -1180,7 +1170,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * unpoison always clear PG_hwpoison inside page lock */ if (!PageHWPoison(p)) { - printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); + pr_err("MCE %#lx: just unpoisoned\n", pfn); num_poisoned_pages_sub(nr_pages); unlock_page(hpage); put_hwpoison_page(hpage); diff --git a/mm/memory.c b/mm/memory.c index 1974fc0..ac6bc15 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -660,9 +660,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, return; } if (nr_unshown) { - printk(KERN_ALERT - "BUG: Bad page map: %lu messages suppressed\n", - nr_unshown); + pr_alert("BUG: Bad page map: %lu messages suppressed\n", + nr_unshown); nr_unshown = 0; } nr_shown = 0; @@ -673,15 +672,13 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); - printk(KERN_ALERT - "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", - current->comm, - (long long)pte_val(pte), (long long)pmd_val(*pmd)); + pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", + current->comm, + (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) dump_page(page, "bad pte"); - printk(KERN_ALERT - "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", - (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); + pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", + (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); /* * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y */ diff --git a/mm/mm_init.c b/mm/mm_init.c index fdadf91..5b72266 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -55,13 +55,12 @@ void __init mminit_verify_zonelist(void) /* Iterate the zonelist */ for_each_zone_zonelist(zone, z, zonelist, zoneid) { #ifdef CONFIG_NUMA - printk(KERN_CONT "%d:%s ", - zone->node, zone->name); + pr_cont("%d:%s ", zone->node, zone->name); #else - printk(KERN_CONT "0:%s ", zone->name); + pr_cont("0:%s ", zone->name); #endif /* CONFIG_NUMA */ } - printk(KERN_CONT "\n"); + pr_cont("\n"); } } } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 99feb2b..bd05a70 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -288,7 +288,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, /* * Whoops, we cannot satisfy the allocation request. 
*/ - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } @@ -360,7 +360,7 @@ static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (ptr) return ptr; - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 42cf199..2a9eaec 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -544,11 +544,11 @@ static int __init debug_guardpage_minorder_setup(char *buf) unsigned long res; if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { - printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); + pr_err("Bad debug_guardpage_minorder value\n"); return 0; } _debug_guardpage_minorder = res; - printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); + pr_info("Setting debug_guardpage_minorder to %lu\n", res); return 0; } __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); @@ -4073,8 +4073,7 @@ static int __parse_numa_zonelist_order(char *s) } else if (*s == 'z' || *s == 'Z') { user_zonelist_order = ZONELIST_ORDER_ZONE; } else { - printk(KERN_WARNING - "Ignoring invalid numa_zonelist_order value: %s\n", s); + pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); return -EINVAL; } return 0; @@ -5458,8 +5457,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) " %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else - printk(KERN_WARNING - " %s zone: %lu pages exceeds freesize %lu\n", + pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", zone_names[j], memmap_pages, freesize); } @@ -5667,8 +5665,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) min_pfn = min(min_pfn, start_pfn); if (min_pfn == ULONG_MAX) { - printk(KERN_WARNING - "Could not find start_pfn for node %d\n", nid); + pr_warn("Could not find start_pfn for node %d\n", nid); return 0; } @@ -6686,11 +6683,8 @@ void *__init alloc_large_system_hash(const char *tablename, if (!table) panic("Failed to allocate %s hash table\n", tablename); - printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", - tablename, - (1UL << log2qty), - ilog2(size) - PAGE_SHIFT, - size); + pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n", + tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size); if (_hash_shift) *_hash_shift = log2qty; @@ -7191,8 +7185,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(!PageBuddy(page)); order = page_order(page); #ifdef CONFIG_DEBUG_VM - printk(KERN_INFO "remove from free list %lx %d %lx\n", - pfn, 1 << order, end_pfn); + pr_info("remove from free list %lx %d %lx\n", + pfn, 1 << order, end_pfn); #endif list_del(&page->lru); rmv_page_order(page); diff --git a/mm/page_io.c b/mm/page_io.c index b995a5b..ff74e51 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -56,10 +56,10 @@ void end_swap_bio_write(struct bio *bio) * Also clear PG_reclaim to avoid rotate_reclaimable_page() */ set_page_dirty(page); - printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_iter.bi_sector); + pr_alert("Write-error on swap-device (%u:%u:%llu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_iter.bi_sector); ClearPageReclaim(page); } 
end_page_writeback(page); @@ -73,10 +73,10 @@ static void end_swap_bio_read(struct bio *bio) if (bio->bi_error) { SetPageError(page); ClearPageUptodate(page); - printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_iter.bi_sector); + pr_alert("Read-error on swap-device (%u:%u:%llu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_iter.bi_sector); goto out; } @@ -216,7 +216,7 @@ reprobe: out: return ret; bad_bmap: - printk(KERN_ERR "swapon: swapfile has holes\n"); + pr_err("swapon: swapfile has holes\n"); ret = -EINVAL; goto out; } @@ -290,8 +290,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, */ set_page_dirty(page); ClearPageReclaim(page); - pr_err_ratelimited("Write error on dio swapfile (%Lu)\n", - page_file_offset(page)); + pr_err_ratelimited("Write error on dio swapfile (%llu)\n", + page_file_offset(page)); } end_page_writeback(page); return ret; diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 10e3d0b..0db94b7 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -95,7 +95,7 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) /* all units must be in a single group */ if (ai->nr_groups != 1) { - printk(KERN_CRIT "percpu: can't handle more than one groups\n"); + pr_crit("percpu: can't handle more than one groups\n"); return -EINVAL; } @@ -103,8 +103,8 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) alloc_pages = roundup_pow_of_two(nr_pages); if (alloc_pages > nr_pages) - printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n", - alloc_pages - nr_pages); + pr_warn("percpu: wasting %zu pages per chunk\n", + alloc_pages - nr_pages); return 0; } diff --git a/mm/percpu.c b/mm/percpu.c index 1571547..c987fd4 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1449,20 +1449,20 @@ static void pcpu_dump_alloc_info(const char *lvl, for (alloc_end += gi->nr_units / upa; alloc < alloc_end; alloc++) { if (!(alloc % apl)) { - printk(KERN_CONT "\n"); + pr_cont("\n"); printk("%spcpu-alloc: ", lvl); } - printk(KERN_CONT "[%0*d] ", group_width, group); + pr_cont("[%0*d] ", group_width, group); for (unit_end += upa; unit < unit_end; unit++) if (gi->cpu_map[unit] != NR_CPUS) - printk(KERN_CONT "%0*d ", cpu_width, - gi->cpu_map[unit]); + pr_cont("%0*d ", + cpu_width, gi->cpu_map[unit]); else - printk(KERN_CONT "%s ", empty_str); + pr_cont("%s ", empty_str); } } - printk(KERN_CONT "\n"); + pr_cont("\n"); } /** diff --git a/mm/shmem.c b/mm/shmem.c index 1acfdbc..c484f68 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2823,9 +2823,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if ((value = strchr(this_char,'=')) != NULL) { *value++ = 0; } else { - printk(KERN_ERR - "tmpfs: No value for mount option '%s'\n", - this_char); + pr_err("tmpfs: No value for mount option '%s'\n", + this_char); goto error; } @@ -2880,8 +2879,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if (mpol_parse_str(value, &mpol)) goto bad_val; } else { - printk(KERN_ERR "tmpfs: Bad mount option %s\n", - this_char); + pr_err("tmpfs: Bad mount option %s\n", this_char); goto error; } } @@ -2889,7 +2887,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, return 0; bad_val: - printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", + pr_err("tmpfs: Bad value '%s' for mount option '%s'\n", value, this_char); 
error: mpol_put(mpol); @@ -3286,14 +3284,14 @@ int __init shmem_init(void) error = register_filesystem(&shmem_fs_type); if (error) { - printk(KERN_ERR "Could not register tmpfs\n"); + pr_err("Could not register tmpfs\n"); goto out2; } shm_mnt = kern_mount(&shmem_fs_type); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); - printk(KERN_ERR "Could not kern_mount tmpfs\n"); + pr_err("Could not kern_mount tmpfs\n"); goto out1; } return 0; diff --git a/mm/slab.c b/mm/slab.c index e558f85..e719a5c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -474,7 +474,7 @@ static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) { - printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", + pr_err("slab error in %s(): cache `%s': %s\n", function, cachep->name, msg); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); @@ -1553,7 +1553,7 @@ static void dump_line(char *data, int offset, int limit) unsigned char error = 0; int bad_count = 0; - printk(KERN_ERR "%03x: ", offset); + pr_err("%03x: ", offset); for (i = 0; i < limit; i++) { if (data[offset + i] != POISON_FREE) { error = data[offset + i]; @@ -1566,11 +1566,11 @@ static void dump_line(char *data, int offset, int limit) if (bad_count == 1) { error ^= POISON_FREE; if (!(error & (error - 1))) { - printk(KERN_ERR "Single bit error detected. Probably bad RAM.\n"); + pr_err("Single bit error detected. Probably bad RAM.\n"); #ifdef CONFIG_X86 - printk(KERN_ERR "Run memtest86+ or a similar memory test tool.\n"); + pr_err("Run memtest86+ or a similar memory test tool.\n"); #else - printk(KERN_ERR "Run a memory test tool.\n"); + pr_err("Run a memory test tool.\n"); #endif } } @@ -1585,13 +1585,13 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) char *realobj; if (cachep->flags & SLAB_RED_ZONE) { - printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", - *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + pr_err("Redzone: 0x%llx/0x%llx\n", + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } if (cachep->flags & SLAB_STORE_USER) { - printk(KERN_ERR "Last user: [<%p>](%pSR)\n", + pr_err("Last user: [<%p>](%pSR)\n", *dbg_userword(cachep, objp), *dbg_userword(cachep, objp)); } @@ -1627,9 +1627,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Mismatch ! 
*/ /* Print header */ if (lines == 0) { - printk(KERN_ERR - "Slab corruption (%s): %s start=%p, len=%d\n", - print_tainted(), cachep->name, realobj, size); + pr_err("Slab corruption (%s): %s start=%p, len=%d\n", + print_tainted(), cachep->name, + realobj, size); print_objinfo(cachep, objp, 0); } /* Hexdump the affected line */ @@ -1656,15 +1656,13 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) if (objnr) { objp = index_to_obj(cachep, page, objnr - 1); realobj = (char *)objp + obj_offset(cachep); - printk(KERN_ERR "Prev obj: start=%p, len=%d\n", - realobj, size); + pr_err("Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { objp = index_to_obj(cachep, page, objnr + 1); realobj = (char *)objp + obj_offset(cachep); - printk(KERN_ERR "Next obj: start=%p, len=%d\n", - realobj, size); + pr_err("Next obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } } @@ -2463,7 +2461,7 @@ static void slab_put_obj(struct kmem_cache *cachep, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { - printk(KERN_ERR "slab: double free detected in cache '%s', objp %p\n", + pr_err("slab: double free detected in cache '%s', objp %p\n", cachep->name, objp); BUG(); } @@ -2583,7 +2581,7 @@ failed: static void kfree_debugcheck(const void *objp) { if (!virt_addr_valid(objp)) { - printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", + pr_err("kfree_debugcheck: out of range ptr %lxh\n", (unsigned long)objp); BUG(); } @@ -2607,8 +2605,8 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) else slab_error(cache, "memory outside object was overwritten"); - printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", - obj, redzone1, redzone2); + pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + obj, redzone1, redzone2); } static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, @@ -2896,10 +2894,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { slab_error(cachep, "double free, or memory outside object was overwritten"); - printk(KERN_ERR - "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", - objp, *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; @@ -2910,7 +2907,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, cachep->ctor(objp); if (ARCH_SLAB_MINALIGN && ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { - printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", + pr_err("0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", objp, (int)ARCH_SLAB_MINALIGN); } return objp; @@ -3837,7 +3834,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) skip_setup: err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); if (err) - printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", + pr_err("enable_cpucache failed for %s, error %d\n", cachep->name, -err); return err; } @@ -3993,7 +3990,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) name = cachep->name; if (error) - printk(KERN_ERR "slab: cache %s error: %s\n", name, error); + pr_err("slab: cache %s error: %s\n", name, error); sinfo->active_objs = active_objs; 
sinfo->num_objs = num_objs; diff --git a/mm/slab_common.c b/mm/slab_common.c index e885e11..b2e3796 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -442,7 +442,7 @@ out_unlock: panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", name, err); else { - printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d", + pr_warn("kmem_cache_create(%s) failed with error %d\n", name, err); dump_stack(); } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index d3511f9..68885dc 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -166,8 +166,8 @@ void __meminit vmemmap_verify(pte_t *pte, int node, int actual_node = early_pfn_to_nid(pfn); if (node_distance(actual_node, node) > LOCAL_DISTANCE) - printk(KERN_WARNING "[%lx-%lx] potential offnode page_structs\n", - start, end - 1); + pr_warn("[%lx-%lx] potential offnode page_structs\n", + start, end - 1); } pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) @@ -292,7 +292,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (map_map[pnum]) continue; ms = __nr_to_section(pnum); - printk(KERN_ERR "%s: sparsemem memory map backing failed some memory will not be available.\n", + pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); ms->section_mem_map = 0; } diff --git a/mm/sparse.c b/mm/sparse.c index 7cdb27d..5d0cf45 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -313,9 +313,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr)); if (usemap_nid != nid) { - printk(KERN_INFO - "node %d must be removed before remove section %ld\n", - nid, usemap_snr); + pr_info("node %d must be removed before remove section %ld\n", + nid, usemap_snr); return; } /* @@ -324,10 +323,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) * gather other removable sections for dynamic partitioning. * Just notify un-removable section's number here. 
*/ - printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr, - pgdat_snr, nid); - printk(KERN_CONT - " have a circular dependency on usemap and pgdat allocations\n"); + pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n", + usemap_snr, pgdat_snr, nid); } #else static unsigned long * __init @@ -355,7 +352,7 @@ static void __init sparse_early_usemaps_alloc_node(void *data, usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), size * usemap_count); if (!usemap) { - printk(KERN_WARNING "%s: allocation failed\n", __func__); + pr_warn("%s: allocation failed\n", __func__); return; } @@ -428,7 +425,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (map_map[pnum]) continue; ms = __nr_to_section(pnum); - printk(KERN_ERR "%s: sparsemem memory map backing failed some memory will not be available.\n", + pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); ms->section_mem_map = 0; } @@ -456,7 +453,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) if (map) return map; - printk(KERN_ERR "%s: sparsemem memory map backing failed some memory will not be available.\n", + pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", __func__); ms->section_mem_map = 0; return NULL; diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index b5f7f24..310ac0b 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -174,9 +174,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) return 0; nomem: - printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); - printk(KERN_INFO - "swap_cgroup can be disabled by swapaccount=0 boot option\n"); + pr_info("couldn't allocate enough memory for swap_cgroup\n"); + pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n"); return -ENOMEM; } -- cgit v1.1 From 870d4b12ad15d21c5db67b373bdc2f62cfe2ec64 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 17 Mar 2016 14:19:53 -0700 Subject: mm: percpu: use pr_fmt to prefix output Use the normal mechanism to make the logging output consistently "percpu:" instead of a mix of "PERCPU:" and "percpu:" Signed-off-by: Joe Perches Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/percpu-km.c | 4 ++-- mm/percpu.c | 20 +++++++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 0db94b7..d66911f 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -95,7 +95,7 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) /* all units must be in a single group */ if (ai->nr_groups != 1) { - pr_crit("percpu: can't handle more than one groups\n"); + pr_crit("can't handle more than one group\n"); return -EINVAL; } @@ -103,7 +103,7 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) alloc_pages = roundup_pow_of_two(nr_pages); if (alloc_pages > nr_pages) - pr_warn("percpu: wasting %zu pages per chunk\n", + pr_warn("wasting %zu pages per chunk\n", alloc_pages - nr_pages); return 0; diff --git a/mm/percpu.c b/mm/percpu.c index c987fd4..0c59684 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -53,6 +53,8 @@ * setup the first chunk containing the kernel static percpu area */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -1033,11 +1035,11 @@ fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: if (!is_atomic && warn_limit) { - pr_warn("PERCPU: 
allocation failed, size=%zu align=%zu atomic=%d, %s\n", + pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", size, align, is_atomic, err); dump_stack(); if (!--warn_limit) - pr_info("PERCPU: limit reached, disable warning\n"); + pr_info("limit reached, disable warning\n"); } if (is_atomic) { /* see the flag handling in pcpu_blance_workfn() */ @@ -1538,8 +1540,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ - pr_emerg("PERCPU: failed to initialize, %s", #cond); \ - pr_emerg("PERCPU: cpu_possible_mask=%*pb\n", \ + pr_emerg("failed to initialize, %s\n", #cond); \ + pr_emerg("cpu_possible_mask=%*pb\n", \ cpumask_pr_args(cpu_possible_mask)); \ pcpu_dump_alloc_info(KERN_EMERG, ai); \ BUG(); \ @@ -1723,7 +1725,7 @@ static int __init percpu_alloc_setup(char *str) pcpu_chosen_fc = PCPU_FC_PAGE; #endif else - pr_warn("PERCPU: unknown allocator %s specified\n", str); + pr_warn("unknown allocator %s specified\n", str); return 0; } @@ -2016,7 +2018,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, /* warn if maximum distance is further than 75% of vmalloc space */ if (max_distance > VMALLOC_TOTAL * 3 / 4) { - pr_warn("PERCPU: max_distance=0x%zx too large for vmalloc space 0x%lx\n", + pr_warn("max_distance=0x%zx too large for vmalloc space 0x%lx\n", max_distance, VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ @@ -2025,7 +2027,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, #endif } - pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", + pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); @@ -2099,7 +2101,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { - pr_warn("PERCPU: failed to allocate %s page for cpu%u\n", + pr_warn("failed to allocate %s page for cpu%u\n", psize_str, cpu); goto enomem; } @@ -2139,7 +2141,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, } /* we're ready, commit */ - pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", + pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n", unit_pages, psize_str, vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size); -- cgit v1.1 From 99490f16f8b287a028306e4092cc85393907075f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 17 Mar 2016 14:19:58 -0700 Subject: mm: ZONE_DEVICE depends on SPARSEMEM_VMEMMAP The primary use case for devm_memremap_pages() is to allocate an memmap array from persistent memory. That capabilty requires vmem_altmap which requires SPARSEMEM_VMEMMAP. Also, without SPARSEMEM_VMEMMAP the addition of ZONE_DEVICE expands ZONES_WIDTH and triggers the: "Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid." ...warning in mm/memory.c. SPARSEMEM_VMEMMAP=n && ZONE_DEVICE=y is not a configuration we should worry about supporting. Signed-off-by: Dan Williams Reported-by: Vlastimil Babka Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 520dae6..05efa6a5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -653,6 +653,7 @@ config ZONE_DEVICE bool "Device memory (pmem, etc...) 
hotplug support" if EXPERT depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE + depends on SPARSEMEM_VMEMMAP depends on X86_64 #arch_add_memory() comprehends device memory help -- cgit v1.1 From b97731992d00f09456726bfc5ab6641c07773038 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:01 -0700 Subject: rmap: introduce rmap_walk_locked() This patchset rewrites freeze_page() and unfreeze_page() using try_to_unmap() and remove_migration_ptes(). Result is much simpler, but somewhat slower. Migration 8GiB worth of PMD-mapped THP: Baseline 20.21 +/- 0.393 Patched 20.73 +/- 0.082 Slowdown 1.03x It's 3% slower, comparing to 14% in v1. I don't it should be a stopper. Splitting of PTE-mapped pages slowed more. But this is not a common case. Migration 8GiB worth of PMD-mapped THP: Baseline 20.39 +/- 0.225 Patched 22.43 +/- 0.496 Slowdown 1.10x rmap_walk_locked() is the same as rmap_walk(), but the caller takes care of the relevant rmap lock. This is preparation for switching THP splitting from custom rmap walk in freeze_page()/unfreeze_page() to the generic one. There is no support for KSM pages for now: not clear which lock is implied. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 02f0bfc..30b739c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1715,14 +1715,21 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) +static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, + bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - anon_vma = rmap_walk_anon_lock(page, rwc); + if (locked) { + anon_vma = page_anon_vma(page); + /* anon_vma disappear under us? */ + VM_BUG_ON_PAGE(!anon_vma, page); + } else { + anon_vma = rmap_walk_anon_lock(page, rwc); + } if (!anon_vma) return ret; @@ -1742,7 +1749,9 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) if (rwc->done && rwc->done(page)) break; } - anon_vma_unlock_read(anon_vma); + + if (!locked) + anon_vma_unlock_read(anon_vma); return ret; } @@ -1759,9 +1768,10 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. 
*/ -static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) +static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, + bool locked) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); pgoff_t pgoff; struct vm_area_struct *vma; int ret = SWAP_AGAIN; @@ -1778,7 +1788,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) return ret; pgoff = page_to_pgoff(page); - i_mmap_lock_read(mapping); + if (!locked) + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); @@ -1795,7 +1806,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) } done: - i_mmap_unlock_read(mapping); + if (!locked) + i_mmap_unlock_read(mapping); return ret; } @@ -1804,9 +1816,20 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc) if (unlikely(PageKsm(page))) return rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rwc); + return rmap_walk_anon(page, rwc, false); + else + return rmap_walk_file(page, rwc, false); +} + +/* Like rmap_walk, but caller holds relevant rmap lock */ +int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) +{ + /* no ksm support for now */ + VM_BUG_ON_PAGE(PageKsm(page), page); + if (PageAnon(page)) + return rmap_walk_anon(page, rwc, true); else - return rmap_walk_file(page, rwc); + return rmap_walk_file(page, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.1 From 2a52bcbcc688eecead2953143f7ef695b8e44575 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:04 -0700 Subject: rmap: extend try_to_unmap() to be usable by split_huge_page() Add support for two ttu_flags: - TTU_SPLIT_HUGE_PMD would split PMD if it's there, before trying to unmap page; - TTU_RMAP_LOCKED indicates that caller holds relevant rmap lock; Also, change rwc->done to !page_mapcount() instead of !page_mapped(). try_to_unmap() works on pte level, so we are really interested in the mappedness of this small page rather than of the compound page it's a part of. Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 +---- mm/rmap.c | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e1a177c..11d1567 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3006,15 +3006,12 @@ out: } } -static void split_huge_pmd_address(struct vm_area_struct *vma, - unsigned long address) +void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pgd = pgd_offset(vma->vm_mm, address); if (!pgd_present(*pgd)) return; diff --git a/mm/rmap.c b/mm/rmap.c index 30b739c..945933a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1431,6 +1431,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) goto out; + if (flags & TTU_SPLIT_HUGE_PMD) + split_huge_pmd_address(vma, address); pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -1576,10 +1578,10 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return is_vma_temporary_stack(vma); } -static int page_not_mapped(struct page *page) +static int page_mapcount_is_zero(struct page *page) { - return !page_mapped(page); -}; + return !page_mapcount(page); +} /** * try_to_unmap - try to remove all page table mappings to a page @@ -1606,12 +1608,10 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = &rp, - .done = page_not_mapped, + .done = page_mapcount_is_zero, .anon_lock = page_lock_anon_vma_read, }; - VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page); - /* * During exec, a temporary VMA is setup and later moved. * The VMA is moved under the anon_vma lock but not the @@ -1623,9 +1623,12 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) rwc.invalid_vma = invalid_migration_vma; - ret = rmap_walk(page, &rwc); + if (flags & TTU_RMAP_LOCKED) + ret = rmap_walk_locked(page, &rwc); + else + ret = rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapped(page)) { + if (ret != SWAP_MLOCK && !page_mapcount(page)) { ret = SWAP_SUCCESS; if (rp.lazyfreed && !PageDirty(page)) ret = SWAP_LZFREE; @@ -1633,6 +1636,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) return ret; } +static int page_not_mapped(struct page *page) +{ + return !page_mapped(page); +}; + /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked -- cgit v1.1 From e388466de4a2a1a50c43bfaeacc0c8254d9e7cb2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:07 -0700 Subject: mm: make remove_migration_ptes() beyond mm/migration.c Make remove_migration_ptes() available to be used in split_huge_page(). New parameter 'locked' added: as with try_to_umap() we need a way to indicate that caller holds rmap lock. We also shouldn't try to mlock() pte-mapped huge pages: pte-mapeed THP pages are never mlocked. Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 577c94b..6c822a7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -172,7 +172,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, else page_add_file_rmap(new); - if (vma->vm_flags & VM_LOCKED) + if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); /* No need to invalidate - it was non-present before */ @@ -187,14 +187,17 @@ out: * Get rid of all migration entries and replace them by * references to the indicated page. */ -static void remove_migration_ptes(struct page *old, struct page *new) +void remove_migration_ptes(struct page *old, struct page *new, bool locked) { struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, }; - rmap_walk(new, &rwc); + if (locked) + rmap_walk_locked(new, &rwc); + else + rmap_walk(new, &rwc); } /* @@ -702,7 +705,7 @@ static int writeout(struct address_space *mapping, struct page *page) * At this point we know that the migration attempt cannot * be successful. */ - remove_migration_ptes(page, page); + remove_migration_ptes(page, page, false); rc = mapping->a_ops->writepage(page, &wbc); @@ -900,7 +903,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (page_was_mapped) remove_migration_ptes(page, - rc == MIGRATEPAGE_SUCCESS ? newpage : page); + rc == MIGRATEPAGE_SUCCESS ? newpage : page, false); out_unlock_both: unlock_page(newpage); @@ -1070,7 +1073,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (page_was_mapped) remove_migration_ptes(hpage, - rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage); + rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); unlock_page(new_hpage); -- cgit v1.1 From fec89c109f3a7737fe3a7bf0f40d1fb7709d353b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:10 -0700 Subject: thp: rewrite freeze_page()/unfreeze_page() with generic rmap walkers freeze_page() and unfreeze_page() helpers evolved in rather complex beasts. It would be nice to cut complexity of this code. This patch rewrites freeze_page() using standard try_to_unmap(). unfreeze_page() is rewritten with remove_migration_ptes(). The result is much simpler. But the new variant is somewhat slower for PTE-mapped THPs. Current helpers iterates over VMAs the compound page is mapped to, and then over ptes within this VMA. New helpers iterates over small page, then over VMA the small page mapped to, and only then find relevant pte. We have short cut for PMD-mapped THP: we directly install migration entries on PMD split. I don't think the slowdown is critical, considering how much simpler result is and that split_huge_page() is quite rare nowadays. It only happens due memory pressure or migration. Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 210 ++++++++++--------------------------------------------- mm/rmap.c | 10 ++- 2 files changed, 44 insertions(+), 176 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 11d1567..4a58fa1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2977,7 +2977,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address) + unsigned long address, bool freeze) { spinlock_t *ptl; struct mm_struct *mm = vma->vm_mm; @@ -2994,7 +2994,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, page = NULL; } else if (!pmd_devmap(*pmd)) goto out; - __split_huge_pmd_locked(vma, pmd, haddr, false); + __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); @@ -3006,7 +3006,8 @@ out: } } -void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address) +void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, + bool freeze, struct page *page) { pgd_t *pgd; pud_t *pud; @@ -3023,11 +3024,20 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address) pmd = pmd_offset(pud, address); if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd))) return; + + /* + * If caller asks to setup a migration entries, we need a page to check + * pmd against. Otherwise we can end up replacing wrong page. + */ + VM_BUG_ON(freeze && !page); + if (page && page != pmd_page(*pmd)) + return; + /* * Caller holds the mmap_sem write mode, so a huge pmd cannot * materialize from under us. 
*/ - split_huge_pmd(vma, pmd, address); + __split_huge_pmd(vma, pmd, address, freeze); } void vma_adjust_trans_huge(struct vm_area_struct *vma, @@ -3043,7 +3053,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (start & ~HPAGE_PMD_MASK && (start & HPAGE_PMD_MASK) >= vma->vm_start && (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, start); + split_huge_pmd_address(vma, start, false, NULL); /* * If the new end address isn't hpage aligned and it could @@ -3053,7 +3063,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (end & ~HPAGE_PMD_MASK && (end & HPAGE_PMD_MASK) >= vma->vm_start && (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, end); + split_huge_pmd_address(vma, end, false, NULL); /* * If we're also updating the vma->vm_next->vm_start, if the new @@ -3067,184 +3077,36 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (nstart & ~HPAGE_PMD_MASK && (nstart & HPAGE_PMD_MASK) >= next->vm_start && (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) - split_huge_pmd_address(next, nstart); + split_huge_pmd_address(next, nstart, false, NULL); } } -static void freeze_page_vma(struct vm_area_struct *vma, struct page *page, - unsigned long address) +static void freeze_page(struct page *page) { - unsigned long haddr = address & HPAGE_PMD_MASK; - spinlock_t *ptl; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int i, nr = HPAGE_PMD_NR; - - /* Skip pages which doesn't belong to the VMA */ - if (address < vma->vm_start) { - int off = (vma->vm_start - address) >> PAGE_SHIFT; - page += off; - nr -= off; - address = vma->vm_start; - } - - pgd = pgd_offset(vma->vm_mm, address); - if (!pgd_present(*pgd)) - return; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return; - pmd = pmd_offset(pud, address); - ptl = pmd_lock(vma->vm_mm, pmd); - if (!pmd_present(*pmd)) { - spin_unlock(ptl); - return; - } - if (pmd_trans_huge(*pmd)) { - if (page == pmd_page(*pmd)) - __split_huge_pmd_locked(vma, pmd, haddr, true); - spin_unlock(ptl); - return; - } - spin_unlock(ptl); - - pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); - for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { - pte_t entry, swp_pte; - swp_entry_t swp_entry; - - /* - * We've just crossed page table boundary: need to map next one. - * It can happen if THP was mremaped to non PMD-aligned address. 
- */ - if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { - pte_unmap_unlock(pte - 1, ptl); - pmd = mm_find_pmd(vma->vm_mm, address); - if (!pmd) - return; - pte = pte_offset_map_lock(vma->vm_mm, pmd, - address, &ptl); - } - - if (!pte_present(*pte)) - continue; - if (page_to_pfn(page) != pte_pfn(*pte)) - continue; - flush_cache_page(vma, address, page_to_pfn(page)); - entry = ptep_clear_flush(vma, address, pte); - if (pte_dirty(entry)) - SetPageDirty(page); - swp_entry = make_migration_entry(page, pte_write(entry)); - swp_pte = swp_entry_to_pte(swp_entry); - if (pte_soft_dirty(entry)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - set_pte_at(vma->vm_mm, address, pte, swp_pte); - page_remove_rmap(page, false); - put_page(page); - } - pte_unmap_unlock(pte - 1, ptl); -} - -static void freeze_page(struct anon_vma *anon_vma, struct page *page) -{ - struct anon_vma_chain *avc; - pgoff_t pgoff = page_to_pgoff(page); + enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | + TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED; + int i, ret; VM_BUG_ON_PAGE(!PageHead(page), page); - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, - pgoff + HPAGE_PMD_NR - 1) { - unsigned long address = __vma_address(page, avc->vma); - - mmu_notifier_invalidate_range_start(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - freeze_page_vma(avc->vma, page, address); - mmu_notifier_invalidate_range_end(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - } -} - -static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page, - unsigned long address) -{ - spinlock_t *ptl; - pmd_t *pmd; - pte_t *pte, entry; - swp_entry_t swp_entry; - unsigned long haddr = address & HPAGE_PMD_MASK; - int i, nr = HPAGE_PMD_NR; - - /* Skip pages which doesn't belong to the VMA */ - if (address < vma->vm_start) { - int off = (vma->vm_start - address) >> PAGE_SHIFT; - page += off; - nr -= off; - address = vma->vm_start; - } - - pmd = mm_find_pmd(vma->vm_mm, address); - if (!pmd) - return; - - pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); - for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { - /* - * We've just crossed page table boundary: need to map next one. - * It can happen if THP was mremaped to non-PMD aligned address. 
- */ - if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { - pte_unmap_unlock(pte - 1, ptl); - pmd = mm_find_pmd(vma->vm_mm, address); - if (!pmd) - return; - pte = pte_offset_map_lock(vma->vm_mm, pmd, - address, &ptl); - } - - if (!is_swap_pte(*pte)) - continue; - - swp_entry = pte_to_swp_entry(*pte); - if (!is_migration_entry(swp_entry)) - continue; - if (migration_entry_to_page(swp_entry) != page) - continue; - - get_page(page); - page_add_anon_rmap(page, vma, address, false); - - entry = pte_mkold(mk_pte(page, vma->vm_page_prot)); - if (PageDirty(page)) - entry = pte_mkdirty(entry); - if (is_write_migration_entry(swp_entry)) - entry = maybe_mkwrite(entry, vma); - - flush_dcache_page(page); - set_pte_at(vma->vm_mm, address, pte, entry); + /* We only need TTU_SPLIT_HUGE_PMD once */ + ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD); + for (i = 1; !ret && i < HPAGE_PMD_NR; i++) { + /* Cut short if the page is unmapped */ + if (page_count(page) == 1) + return; - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, pte); + ret = try_to_unmap(page + i, ttu_flags); } - pte_unmap_unlock(pte - 1, ptl); + VM_BUG_ON(ret); } -static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) +static void unfreeze_page(struct page *page) { - struct anon_vma_chain *avc; - pgoff_t pgoff = page_to_pgoff(page); - - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, - pgoff, pgoff + HPAGE_PMD_NR - 1) { - unsigned long address = __vma_address(page, avc->vma); + int i; - mmu_notifier_invalidate_range_start(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - unfreeze_page_vma(avc->vma, page, address); - mmu_notifier_invalidate_range_end(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - } + for (i = 0; i < HPAGE_PMD_NR; i++) + remove_migration_ptes(page + i, page + i, true); } static void __split_huge_page_tail(struct page *head, int tail, @@ -3322,7 +3184,7 @@ static void __split_huge_page(struct page *page, struct list_head *list) ClearPageCompound(head); spin_unlock_irq(&zone->lru_lock); - unfreeze_page(page_anon_vma(head), head); + unfreeze_page(head); for (i = 0; i < HPAGE_PMD_NR; i++) { struct page *subpage = head + i; @@ -3418,7 +3280,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } mlocked = PageMlocked(page); - freeze_page(anon_vma, head); + freeze_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head); /* Make sure the page is not on per-CPU pagevec as it takes pin */ @@ -3447,7 +3309,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) BUG(); } else { spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); - unfreeze_page(anon_vma, head); + unfreeze_page(head); ret = -EBUSY; } diff --git a/mm/rmap.c b/mm/rmap.c index 945933a..c399a0d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1431,8 +1431,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) goto out; - if (flags & TTU_SPLIT_HUGE_PMD) - split_huge_pmd_address(vma, address); + if (flags & TTU_SPLIT_HUGE_PMD) { + split_huge_pmd_address(vma, address, + flags & TTU_MIGRATION, page); + /* check if we have anything to do after split */ + if (page_mapcount(page) == 0) + goto out; + } + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; -- cgit v1.1 From 5f7377147ce0b5d521c0ee2aab2ec3ead51297db Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Thu, 17 Mar 2016 14:20:13 -0700 Subject: thp: fix deadlock in split_huge_pmd() split_huge_pmd() tries to munlock page with munlock_vma_page(). That requires the page to locked. If the is locked by caller, we would get a deadlock: Unable to find swap-space signature INFO: task trinity-c85:1907 blocked for more than 120 seconds. Not tainted 4.4.0-00032-gf19d0bdced41-dirty #1606 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. trinity-c85 D ffff88084d997608 0 1907 309 0x00000000 Call Trace: schedule+0x9f/0x1c0 schedule_timeout+0x48e/0x600 io_schedule_timeout+0x1c3/0x390 bit_wait_io+0x29/0xd0 __wait_on_bit_lock+0x94/0x140 __lock_page+0x1d4/0x280 __split_huge_pmd+0x5a8/0x10f0 split_huge_pmd_address+0x1d9/0x230 try_to_unmap_one+0x540/0xc70 rmap_walk_anon+0x284/0x810 rmap_walk_locked+0x11e/0x190 try_to_unmap+0x1b1/0x4b0 split_huge_page_to_list+0x49d/0x18a0 follow_page_mask+0xa36/0xea0 SyS_move_pages+0xaf3/0x1570 entry_SYSCALL_64_fastpath+0x12/0x6b 2 locks held by trinity-c85/1907: #0: (&mm->mmap_sem){++++++}, at: SyS_move_pages+0x933/0x1570 #1: (&anon_vma->rwsem){++++..}, at: split_huge_page_to_list+0x402/0x18a0 I don't think the deadlock is triggerable without split_huge_page() simplifilcation patchset. But munlock_vma_page() here is wrong: we want to munlock the page unconditionally, no need in rmap lookup, that munlock_vma_page() does. Let's use clear_page_mlock() instead. It can be called under ptl. Fixes: e90309c9f772 ("thp: allow mlocked THP again") Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4a58fa1..021db17 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2981,29 +2981,20 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { spinlock_t *ptl; struct mm_struct *mm = vma->vm_mm; - struct page *page = NULL; unsigned long haddr = address & HPAGE_PMD_MASK; mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); ptl = pmd_lock(mm, pmd); if (pmd_trans_huge(*pmd)) { - page = pmd_page(*pmd); + struct page *page = pmd_page(*pmd); if (PageMlocked(page)) - get_page(page); - else - page = NULL; + clear_page_mlock(page); } else if (!pmd_devmap(*pmd)) goto out; __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); - if (page) { - lock_page(page); - munlock_vma_page(page); - unlock_page(page); - put_page(page); - } } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, -- cgit v1.1 From 987b3095c2a76ea1b90438f5ace6f9cee354a47d Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 17 Mar 2016 14:20:16 -0700 Subject: mm: meminit: initialise more memory for inode/dentry hash tables in early boot Upstream has supported page parallel initialisation for X86 and the boot time is improved greately. Some tests have been done for Power. Here is the result I have done with different memory size. * 4GB memory: boot time is as the following: with patch vs without patch: 10.4s vs 24.5s boot time is improved 57% * 200GB memory: boot time looks the same with and without patches. boot time is about 38s * 32TB memory: boot time looks the same with and without patches boot time is about 160s. The boot time is much shorter than X86 with 24TB memory. From community discussion, it costs about 694s for X86 24T system. 
Parallel initialisation improves the performance by deferring memory initilisation to kswap with N kthreads, it should improve the performance therotically. In testing on X86, performance is improved greatly with huge memory. But on Power platform, it is improved greatly with less than 100GB memory. For huge memory, it is not improved greatly. But it saves the time with several threads at least, as the following information shows(32TB system log): [ 22.648169] node 9 initialised, 16607461 pages in 280ms [ 22.783772] node 3 initialised, 23937243 pages in 410ms [ 22.858877] node 6 initialised, 29179347 pages in 490ms [ 22.863252] node 2 initialised, 29179347 pages in 490ms [ 22.907545] node 0 initialised, 32049614 pages in 540ms [ 22.920891] node 15 initialised, 32212280 pages in 550ms [ 22.923236] node 4 initialised, 32306127 pages in 550ms [ 22.923384] node 12 initialised, 32314319 pages in 550ms [ 22.924754] node 8 initialised, 32314319 pages in 550ms [ 22.940780] node 13 initialised, 33353677 pages in 570ms [ 22.940796] node 11 initialised, 33353677 pages in 570ms [ 22.941700] node 5 initialised, 33353677 pages in 570ms [ 22.941721] node 10 initialised, 33353677 pages in 570ms [ 22.941876] node 7 initialised, 33353677 pages in 570ms [ 22.944946] node 14 initialised, 33353677 pages in 570ms [ 22.946063] node 1 initialised, 33345485 pages in 580ms It saves the time about 550*16 ms at least, although it can be ignore to compare the boot time about 160 seconds. What's more, the boot time is much shorter on Power even without patches than x86 for huge memory machine. So this patchset is still necessary to be enabled for Power. This patch (of 2): This patch is based on Mel Gorman's old patch in the mailing list, https://lkml.org/lkml/2015/5/5/280 which is discussed but it is fixed with a completion to wait for all memory initialised in page_alloc_init_late(). It is to fix the OOM problem on X86 with 24TB memory which allocates memory in late initialisation. But for Power platform with 32TB memory, it causes a call trace in vfs_caches_init->inode_init() and inode hash table needs more memory. So this patch allocates 1GB for 0.25TB/node for large system as it is mentioned in https://lkml.org/lkml/2015/5/1/627 This call trace is found on Power with 32TB memory, 1024CPUs, 16nodes. Currently, it only allocates 2GB*16=32GB for early initialisation. But Dentry cache hash table needes 16GB and Inode cache hash table needs 16GB. So the system have no enough memory for it. 
The log from dmesg as the following: Dentry cache hash table entries: 2147483648 (order: 18,17179869184 bytes) vmalloc: allocation failure, allocated 16021913600 of 17179934720 bytes swapper/0: page allocation failure: order:0,mode:0x2080020 CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.4.0-0-ppc64 Call Trace: .dump_stack+0xb4/0xb664 (unreliable) .warn_alloc_failed+0x114/0x160 .__vmalloc_area_node+0x1a4/0x2b0 .__vmalloc_node_range+0xe4/0x110 .__vmalloc_node+0x40/0x50 .alloc_large_system_hash+0x134/0x2a4 .inode_init+0xa4/0xf0 .vfs_caches_init+0x80/0x144 .start_kernel+0x40c/0x4e0 start_here_common+0x20/0x4a4 Signed-off-by: Li Zhang Acked-by: Mel Gorman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a9eaec..3583f71 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -308,13 +308,20 @@ static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) { + unsigned long max_initialise; + /* Always populate low zones for address-contrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; + /* + * Initialise at least 2G of a node but also take into account that + * two large system hashes that can take up 1GB for 0.25TB/node. + */ + max_initialise = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); - /* Initialise at least 2G of the highest zone */ (*nr_initialised)++; - if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && + if ((*nr_initialised > max_initialise) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; -- cgit v1.1 From 588083bb37a3cea8533c392370a554417c8f29cb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:20:25 -0700 Subject: mm: memcontrol: reclaim when shrinking memory.high below usage When setting memory.high below usage, nothing happens until the next charge comes along, and then it will only reclaim its own charge and not the now potentially huge excess of the new memory.high. This can cause groups to stay in excess of their memory.high indefinitely. To fix that, when shrinking memory.high, kick off a reclaim cycle that goes after the delta. 
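In outline, the behaviour described above amounts to: record the new high limit first, then, if current usage already exceeds it, reclaim the difference right away rather than waiting for the next charge. The following is a minimal userspace model of that control flow, for illustration only; memcg_model, reclaim() and write_high() are invented stand-ins rather than kernel API, and the real change to memory_high_write() is in the hunk below.

#include <stdio.h>

/* Toy stand-ins for a memory cgroup and its reclaim path. */
struct memcg_model {
	unsigned long usage;	/* pages currently charged */
	unsigned long high;	/* memory.high, in pages */
};

/* Pretend reclaim: frees up to nr_pages and reports how much was freed. */
static unsigned long reclaim(struct memcg_model *m, unsigned long nr_pages)
{
	unsigned long freed = nr_pages < m->usage ? nr_pages : m->usage;

	m->usage -= freed;
	return freed;
}

/* Model of the new write handler: set the limit, then go after the delta. */
static void write_high(struct memcg_model *m, unsigned long new_high)
{
	m->high = new_high;
	if (m->usage > m->high)
		reclaim(m, m->usage - m->high);
}

int main(void)
{
	struct memcg_model m = { .usage = 1000, .high = 2000 };

	write_high(&m, 600);	/* shrink below usage: the excess is reclaimed now */
	printf("high=%lu usage=%lu\n", m.high, m.usage);
	return 0;
}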
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8615b06..f7c9b4c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4992,6 +4992,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long nr_pages; unsigned long high; int err; @@ -5002,6 +5003,11 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, memcg->high = high; + nr_pages = page_counter_read(&memcg->memory); + if (nr_pages > high) + try_to_free_mem_cgroup_pages(memcg, nr_pages - high, + GFP_KERNEL, true); + memcg_wb_domain_size_changed(memcg); return nbytes; } -- cgit v1.1 From b6e6edcfa40561e9c8abe5eecf1c96f8e5fd9c6f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:20:28 -0700 Subject: mm: memcontrol: reclaim and OOM kill when shrinking memory.max below usage Setting the original memory.limit_in_bytes hardlimit is subject to a race condition when the desired value is below the current usage. The code tries a few times to first reclaim and then see if the usage has dropped to where we would like it to be, but there is no locking, and the workload is free to continue making new charges up to the old limit. Thus, attempting to shrink a workload relies on pure luck and hope that the workload happens to cooperate. To fix this in the cgroup2 memory.max knob, do it the other way round: set the limit first, then try enforcement. And if reclaim is not able to succeed, trigger OOM kills in the group. Keep going until the new limit is met, we run out of OOM victims and there's only unreclaimable memory left, or the task writing to memory.max is killed. This allows users to shrink groups reliably, and the behavior is consistent with what happens when new charges are attempted in excess of memory.max. 
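In outline: publish the new limit first, then keep applying pressure until usage fits. A toy userspace model of that control flow (clearly not kernel code; the reclaim and OOM-kill helpers below are fake stand-ins) may help make the ordering concrete:

#include <stdbool.h>
#include <stdio.h>

static unsigned long usage = 1000;      /* pages currently charged (toy value) */

static unsigned long reclaim(unsigned long want)
{
        unsigned long freed = want / 2; /* pretend reclaim is half effective */

        usage -= freed;
        return freed;
}

static bool oom_kill_one(void)
{
        if (usage < 100)
                return false;           /* no victims left */
        usage -= 100;                   /* pretend a victim held 100 pages */
        return true;
}

/* Enforcement loop run after the new max has already been published. */
static int shrink_to_max(unsigned long max)
{
        int retries = 5;                /* assumed retry budget */

        while (usage > max) {
                if (retries) {
                        if (!reclaim(usage - max))
                                retries--;      /* only futile passes count */
                        continue;
                }
                if (!oom_kill_one())
                        return -1;      /* unreclaimable and unkillable */
        }
        return 0;
}

int main(void)
{
        int ret = shrink_to_max(300);

        printf("result=%d usage=%lu\n", ret, usage);
        return 0;
}

In the real handler the writer can also be interrupted by a pending signal, and the group stays capped either way, because the limit was set before the enforcement loop started.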
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f7c9b4c..8614e0d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1236,7 +1236,7 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) return limit; } -static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, +static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { struct oom_control oc = { @@ -1314,6 +1314,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, } unlock: mutex_unlock(&oom_lock); + return chosen; } #if MAX_NUMNODES > 1 @@ -5029,6 +5030,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES; + bool drained = false; unsigned long max; int err; @@ -5037,9 +5040,36 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; - err = mem_cgroup_resize_limit(memcg, max); - if (err) - return err; + xchg(&memcg->memory.limit, max); + + for (;;) { + unsigned long nr_pages = page_counter_read(&memcg->memory); + + if (nr_pages <= max) + break; + + if (signal_pending(current)) { + err = -EINTR; + break; + } + + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + + if (nr_reclaims) { + if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, + GFP_KERNEL, true)) + nr_reclaims--; + continue; + } + + mem_cgroup_events(memcg, MEMCG_OOM, 1); + if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) + break; + } memcg_wb_domain_size_changed(memcg); return nbytes; -- cgit v1.1 From 8b5926560fafe80fe764af2aade152d490a96546 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:20:31 -0700 Subject: mm: memcontrol: clarify the uncharge_list() loop uncharge_list() does an unusual list walk because the function can take regular lists with dedicated list_heads as well as singleton lists where a single page is passed via the page->lru list node. This can sometimes lead to confusion as well as suggestions to replace the loop with a list_for_each_entry(), which wouldn't work. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8614e0d..fa7bf35 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5420,6 +5420,10 @@ static void uncharge_list(struct list_head *page_list) struct list_head *next; struct page *page; + /* + * Note that the list can be a single page->lru; hence the + * do-while loop instead of a simple list_for_each_entry(). + */ next = page_list->next; do { unsigned int nr_pages = 1; -- cgit v1.1 From e0775d10f1e65548fcbadf614a6bf3d216165021 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:20:34 -0700 Subject: mm: memcontrol: zap oom_info_lock mem_cgroup_print_oom_info is always called under oom_lock, so oom_info_lock is redundant. 
Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fa7bf35..36db05f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1150,12 +1150,9 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) */ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - /* oom_info_lock ensures that parallel ooms do not interleave */ - static DEFINE_MUTEX(oom_info_lock); struct mem_cgroup *iter; unsigned int i; - mutex_lock(&oom_info_lock); rcu_read_lock(); if (p) { @@ -1199,7 +1196,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_cont("\n"); } - mutex_unlock(&oom_info_lock); } /* -- cgit v1.1 From a1c0b1a074c0095492a956b4fe0067b74a82cfe3 Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Thu, 17 Mar 2016 14:20:37 -0700 Subject: mm/vmalloc: use PAGE_ALIGNED() to check PAGE_SIZE alignment We have PAGE_ALIGNED() in mm.h, so let's use it instead of IS_ALIGNED() for checking the PAGE_SIZE-aligned case. Signed-off-by: Shawn Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e86c24e..ae7d20b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1085,7 +1085,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); - BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE)); + BUG_ON(!PAGE_ALIGNED(addr)); debug_check_no_locks_freed(mem, size); vmap_debug_free_range(addr, addr+size); -- cgit v1.1 From a82cbf07131b56e7e51fbb008b82ef769af08790 Mon Sep 17 00:00:00 2001 From: YiPing Xu Date: Thu, 17 Mar 2016 14:20:39 -0700 Subject: zsmalloc: drop unused member 'mapping_area->huge' When unmapping a huge class page in zs_unmap_object, the page is unmapped by kunmap_atomic. The "!area->huge" branch in __zs_unmap_object is always true, and no code sets "area->huge" now, so we can drop it. Signed-off-by: YiPing Xu Reviewed-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2d7c4c1..43e4cbc 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -281,7 +281,6 @@ struct mapping_area { #endif char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ - bool huge; }; static int create_handle_cache(struct zs_pool *pool) @@ -1127,11 +1126,9 @@ static void __zs_unmap_object(struct mapping_area *area, goto out; buf = area->vm_buf; - if (!area->huge) { - buf = buf + ZS_HANDLE_SIZE; - size -= ZS_HANDLE_SIZE; - off += ZS_HANDLE_SIZE; - } + buf = buf + ZS_HANDLE_SIZE; + size -= ZS_HANDLE_SIZE; + off += ZS_HANDLE_SIZE; sizes[0] = PAGE_SIZE - off; sizes[1] = size - sizes[0]; -- cgit v1.1 From 1120ed5483941d9cd2cf52cb9644a4311dbd1011 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 17 Mar 2016 14:20:42 -0700 Subject: mm/zsmalloc: add `freeable' column to pool stat Add a new column to pool stats, which will tell how many pages ideally can be freed by class compaction, so it will be easier to analyze zsmalloc fragmentation.
At the moment, we have only numbers of FULL and ALMOST_EMPTY classes, but they don't tell us how badly the class is fragmented internally. The new /sys/kernel/debug/zsmalloc/zramX/classes output looks as follows:

class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
[..]
12 224 0 2 146 5 8 4 4
13 240 0 0 0 0 0 1 0
14 256 1 13 1840 1672 115 1 10
15 272 0 0 0 0 0 1 0
[..]
49 816 0 3 745 735 149 1 2
51 848 3 4 361 306 76 4 8
52 864 12 14 378 268 81 3 21
54 896 1 12 117 57 26 2 12
57 944 0 0 0 0 0 3 0
[..]
Total 26 131 12709 10994 1071 134

For example, from this particular output we can easily conclude that class-896 is heavily fragmented -- it occupies 26 pages, 12 of which can be freed by compaction.

Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 43e4cbc..e72efb10 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -494,6 +494,8 @@ static void __exit zs_stat_exit(void) debugfs_remove_recursive(zs_stat_root); } +static unsigned long zs_can_compact(struct size_class *class); + static int zs_stats_size_show(struct seq_file *s, void *v) { int i; @@ -501,14 +503,15 @@ static int zs_stats_size_show(struct seq_file *s, void *v) struct size_class *class; int objs_per_zspage; unsigned long class_almost_full, class_almost_empty; - unsigned long obj_allocated, obj_used, pages_used; + unsigned long obj_allocated, obj_used, pages_used, freeable; unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; + unsigned long total_freeable = 0; - seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n", + seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n", "class", "size", "almost_full", "almost_empty", "obj_allocated", "obj_used", "pages_used", - "pages_per_zspage"); + "pages_per_zspage", "freeable"); for (i = 0; i < zs_size_classes; i++) { class = pool->size_class[i]; @@ -521,6 +524,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v) class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); obj_used = zs_stat_get(class, OBJ_USED); + freeable = zs_can_compact(class); spin_unlock(&class->lock); objs_per_zspage = get_maxobj_per_zspage(class->size, @@ -528,23 +532,25 @@ static int zs_stats_size_show(struct seq_file *s, void *v) pages_used = obj_allocated / objs_per_zspage * class->pages_per_zspage; - seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n", + seq_printf(s, " %5u %5u %11lu %12lu %13lu" + " %10lu %10lu %16d %8lu\n", i, class->size, class_almost_full, class_almost_empty, obj_allocated, obj_used, pages_used, - class->pages_per_zspage); + class->pages_per_zspage, freeable); total_class_almost_full += class_almost_full; total_class_almost_empty += class_almost_empty; total_objs += obj_allocated; total_used_objs += obj_used; total_pages += pages_used; + total_freeable += freeable; } seq_puts(s, "\n"); - seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n", + seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n", "Total", "", total_class_almost_full, total_class_almost_empty, total_objs, - total_used_objs, total_pages); + total_used_objs, total_pages, "", total_freeable); return 0; } -- cgit v1.1 From 6afcf2895e6f229476bd0dc95fe915e639ae0a49 Mon Sep 17 00:00:00 2001 From:
Tetsuo Handa Date: Thu, 17 Mar 2016 14:20:45 -0700 Subject: mm,oom: make oom_killer_disable() killable While oom_killer_disable() is called by freeze_processes() after all user threads except the current thread are frozen, it is possible that kernel threads invoke the OOM killer and send SIGKILL to the current thread due to sharing the thawed victim's memory. Therefore, checking for SIGKILL is preferable to checking TIF_MEMDIE. Signed-off-by: Tetsuo Handa Cc: Tetsuo Handa Cc: David Rientjes Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index fde3d37..06f7e17 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -455,15 +455,11 @@ void exit_oom_victim(void) bool oom_killer_disable(void) { /* - * Make sure to not race with an ongoing OOM killer - * and that the current is not the victim. + * Make sure to not race with an ongoing OOM killer. Check that the + * current is not killed (possibly due to sharing the victim's memory). */ - mutex_lock(&oom_lock); - if (test_thread_flag(TIF_MEMDIE)) { - mutex_unlock(&oom_lock); + if (mutex_lock_killable(&oom_lock)) return false; - } - oom_killer_disabled = true; mutex_unlock(&oom_lock); -- cgit v1.1 From 0a687aace3b8e215e08eed8a67014f5b8f133ab0 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 17 Mar 2016 14:20:48 -0700 Subject: mm,oom: do not loop !__GFP_FS allocation if the OOM killer is disabled After the OOM killer is disabled during a suspend operation, any !__GFP_NOFAIL && __GFP_FS allocations are forced to fail. Thus, any !__GFP_NOFAIL && !__GFP_FS allocations should be forced to fail as well. Signed-off-by: Tetsuo Handa Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3583f71..a762be5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2858,8 +2858,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, * XXX: Page reclaim didn't yield anything, * and the OOM killer can't be invoked, but * keep looping as per tradition. + * + * But do not keep looping if oom_killer_disable() + * was already called, for the system is trying to + * enter a quiescent state during suspend. */ - *did_some_progress = 1; + *did_some_progress = !oom_killer_disabled; goto out; } if (pm_suspended_storage()) -- cgit v1.1 From 93e205a728e6cb8d7d11f6836e289798a1de25e2 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 17 Mar 2016 14:21:15 -0700 Subject: fix Christoph's email addresses There are various email addresses for me throughout the kernel. Use the one that will always be valid. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 2 +- mm/quicklist.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 5fbdd36..f4259e4 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Qumranet, Inc. * Copyright (C) 2008 SGI - * Christoph Lameter + * Christoph Lameter * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory.
diff --git a/mm/quicklist.c b/mm/quicklist.c index 9422129..daf6ff6 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -8,7 +8,7 @@ * improved on it. * * Copyright (C) 2007 SGI, - * Christoph Lameter + * Christoph Lameter * Generalized, added support for multiple lists and * constructors / destructors. */ -- cgit v1.1 From e61452365372570253b2b1de84bab0cdb2e62c64 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 Mar 2016 14:21:54 -0700 Subject: radix_tree: add support for multi-order entries With huge pages, it is convenient to have the radix tree be able to return an entry that covers multiple indices. Previous attempts to deal with the problem have involved inserting N duplicate entries, which is a waste of memory and leads to problems trying to handle aliased tags, or probing the tree multiple times to find alternative entries which might cover the requested index. This approach inserts one canonical entry into the tree for a given range of indices, and may also insert other entries in order to ensure that lookups find the canonical entry. This solution only tolerates inserting powers of two that are greater than the fanout of the tree. If we wish to expand the radix tree's abilities to support large-ish pages that are less than the fanout at the penultimate level of the tree, then we would need to add one more step in lookup to ensure that any sibling nodes in the final level of the tree are dereferenced and we return the canonical entry that they reference. Signed-off-by: Matthew Wilcox Cc: Johannes Weiner Cc: Matthew Wilcox Cc: "Kirill A. Shutemov" Cc: Ross Zwisler Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 61b441b..084ad0f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -586,7 +586,7 @@ static int page_cache_tree_insert(struct address_space *mapping, void **slot; int error; - error = __radix_tree_create(&mapping->page_tree, page->index, + error = __radix_tree_create(&mapping->page_tree, page->index, 0, &node, &slot); if (error) return error; -- cgit v1.1 From 2cf938aae17203426a89b5955bd1c9668657bfa8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 Mar 2016 14:22:03 -0700 Subject: mm: use radix_tree_iter_retry() Instead of a 'goto restart', we can now use radix_tree_iter_retry() to restart from our current position. This will make a difference when there are more ways to happen across an indirect pointer. And it eliminates some confusing gotos.
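The resulting iteration idiom looks roughly like the following kernel-context sketch (not a standalone program; the function name and the elided per-page handling are placeholders):

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>

/* Sketch only: retry the current slot in place instead of goto restart. */
static void scan_mapping(struct address_space *mapping, pgoff_t start)
{
        struct radix_tree_iter iter;
        void **slot;

        rcu_read_lock();
        radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
                struct page *page = radix_tree_deref_slot(slot);

                if (unlikely(!page))
                        continue;
                if (radix_tree_exception(page)) {
                        if (radix_tree_deref_retry(page)) {
                                /* Slot moved under us: re-read it, no restart. */
                                slot = radix_tree_iter_retry(&iter);
                                continue;
                        }
                        continue;       /* shadow/swap entry: skipped in this sketch */
                }
                /* ... normal page handling would go here ... */
        }
        rcu_read_unlock();
}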
[vbabka@suse.cz: remove now-obsolete-and-misleading comment] Signed-off-by: Matthew Wilcox Cc: Hugh Dickins Cc: Konstantin Khlebnikov Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 53 +++++++++++++++++------------------------------------ mm/shmem.c | 23 ++++++++++++----------- 2 files changed, 29 insertions(+), 47 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 084ad0f..7c00f10 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1255,7 +1255,6 @@ unsigned find_get_entries(struct address_space *mapping, return 0; rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: @@ -1263,8 +1262,10 @@ repeat: if (unlikely(!page)) continue; if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } /* * A shadow entry of a recently evicted page, a swap * entry from shmem/tmpfs or a DAX entry. Return it @@ -1317,7 +1318,6 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: @@ -1327,13 +1327,8 @@ repeat: if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - WARN_ON(iter.index); - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* * A shadow entry of a recently evicted page, @@ -1384,7 +1379,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, return 0; rcu_read_lock(); -restart: radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { struct page *page; repeat: @@ -1395,12 +1389,8 @@ repeat: if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* * A shadow entry of a recently evicted page, @@ -1460,7 +1450,6 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, return 0; rcu_read_lock(); -restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { struct page *page; @@ -1471,12 +1460,8 @@ repeat: if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* * A shadow entry of a recently evicted page. @@ -1539,7 +1524,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); -restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, tag) { struct page *page; @@ -1549,12 +1533,8 @@ repeat: continue; if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. 
- */ - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* @@ -2171,10 +2151,11 @@ repeat: if (unlikely(!page)) goto next; if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - break; - else - goto next; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + goto next; } if (!page_cache_get_speculative(page)) diff --git a/mm/shmem.c b/mm/shmem.c index c484f68..91c0dad 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -383,13 +383,10 @@ restart: page = radix_tree_deref_slot(slot); - /* - * This should only be possible to happen at index 0, so we - * don't need to reset the counter, nor do we risk infinite - * restarts. - */ - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } if (radix_tree_exceptional_entry(page)) swapped++; @@ -1951,8 +1948,10 @@ restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { page = radix_tree_deref_slot(slot); if (!page || radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } } else if (page_count(page) - page_mapcount(page) > 1) { spin_lock_irq(&mapping->tree_lock); radix_tree_tag_set(&mapping->page_tree, iter.index, @@ -2006,8 +2005,10 @@ restart: page = radix_tree_deref_slot(slot); if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } page = NULL; } -- cgit v1.1 From 7165092fe5ca70bae722ac6cd78421cfd0eec18d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 Mar 2016 14:22:06 -0700 Subject: radix-tree,shmem: introduce radix_tree_iter_next() shmem likes to occasionally drop the lock, schedule, then reacquire the lock and continue with the iteration from the last place it left off. This is currently done with a pretty ugly goto. Introduce radix_tree_iter_next() and use it throughout shmem.c.
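The new helper turns the drop-the-lock-and-continue pattern into a plain loop. A kernel-context sketch of the idiom (not a standalone program; the function name is a placeholder and the per-slot work is elided):

#include <linux/fs.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Sketch only: reschedule inside a long walk and resume after iter.index. */
static void walk_with_resched(struct address_space *mapping, pgoff_t start)
{
        struct radix_tree_iter iter;
        void **slot;

        rcu_read_lock();
        radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
                /* ... per-slot work would go here ... */

                if (need_resched()) {
                        cond_resched_rcu();
                        /* Continue from the next index instead of goto restart. */
                        slot = radix_tree_iter_next(&iter);
                }
        }
        rcu_read_unlock();
}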
[koct9i@gmail.com: fix bug in radix_tree_iter_next() for tagged iteration] Signed-off-by: Matthew Wilcox Cc: Hugh Dickins Signed-off-by: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 91c0dad..9428c51 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -376,7 +376,6 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { if (iter.index >= end) break; @@ -393,8 +392,7 @@ restart: if (need_resched()) { cond_resched_rcu(); - start = iter.index + 1; - goto restart; + slot = radix_tree_iter_next(&iter); } } @@ -1944,7 +1942,6 @@ static void shmem_tag_pins(struct address_space *mapping) start = 0; rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { page = radix_tree_deref_slot(slot); if (!page || radix_tree_exception(page)) { @@ -1961,8 +1958,7 @@ restart: if (need_resched()) { cond_resched_rcu(); - start = iter.index + 1; - goto restart; + slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -1999,7 +1995,6 @@ static int shmem_wait_for_pins(struct address_space *mapping) start = 0; rcu_read_lock(); -restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, SHMEM_TAG_PINNED) { @@ -2033,8 +2028,7 @@ restart: continue_resched: if (need_resched()) { cond_resched_rcu(); - start = iter.index + 1; - goto restart; + slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); -- cgit v1.1