From 7c5f64f84483bd13886348edda8b3e7b799a7fdb Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 7 Oct 2016 16:57:23 -0700 Subject: mm: oom: deduplicate victim selection code for memcg and global oom When selecting an oom victim, we use the same heuristic for both memory cgroup and global oom. The only difference is the scope of tasks to select the victim from. So we could just export an iterator over all memcg tasks and keep all oom related logic in oom_kill.c, but instead we duplicate pieces of it in memcontrol.c reusing some initially private functions of oom_kill.c in order to not duplicate all of it. That looks ugly and error prone, because any modification of select_bad_process should also be propagated to mem_cgroup_out_of_memory. Let's rework this as follows: keep all oom heuristic related code private to oom_kill.c and make oom_kill.c use exported memcg functions when it's really necessary (like in case of iterating over memcg tasks). Link: http://lkml.kernel.org/r/1470056933-7505-1-git-send-email-vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 114 ++++++++++++-------------------- mm/oom_kill.c | 200 ++++++++++++++++++++++++++++++-------------------------- 2 files changed, 148 insertions(+), 166 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4be518d..48747ef 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -921,6 +921,43 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) iter = mem_cgroup_iter(NULL, iter, NULL)) /** + * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy + * @memcg: hierarchy root + * @fn: function to call for each task + * @arg: argument passed to @fn + * + * This function iterates over tasks attached to @memcg or to any of its + * descendants and calls @fn for each task. 
If @fn returns a non-zero + * value, the function breaks the iteration loop and returns the value. + * Otherwise, it will iterate over all tasks and return 0. + * + * This function must not be called for the root memory cgroup. + */ +int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, + int (*fn)(struct task_struct *, void *), void *arg) +{ + struct mem_cgroup *iter; + int ret = 0; + + BUG_ON(memcg == root_mem_cgroup); + + for_each_mem_cgroup_tree(iter, memcg) { + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&iter->css, &it); + while (!ret && (task = css_task_iter_next(&it))) + ret = fn(task, arg); + css_task_iter_end(&it); + if (ret) { + mem_cgroup_iter_break(memcg, iter); + break; + } + } + return ret; +} + +/** * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page * @zone: zone of the page @@ -1178,7 +1215,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) /* * Return the memory (and swap, if configured) limit for a memcg. */ -static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) { unsigned long limit; @@ -1205,79 +1242,12 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, .gfp_mask = gfp_mask, .order = order, }; - struct mem_cgroup *iter; - unsigned long chosen_points = 0; - unsigned long totalpages; - unsigned int points = 0; - struct task_struct *chosen = NULL; + bool ret; mutex_lock(&oom_lock); - - /* - * If current has a pending SIGKILL or is exiting, then automatically - * select it. The goal is to allow it to allocate so that it may - * quickly exit and free its memory. - */ - if (task_will_free_mem(current)) { - mark_oom_victim(current); - wake_oom_reaper(current); - goto unlock; - } - - check_panic_on_oom(&oc, CONSTRAINT_MEMCG); - totalpages = mem_cgroup_get_limit(memcg) ? 
: 1; - for_each_mem_cgroup_tree(iter, memcg) { - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&iter->css, &it); - while ((task = css_task_iter_next(&it))) { - switch (oom_scan_process_thread(&oc, task)) { - case OOM_SCAN_SELECT: - if (chosen) - put_task_struct(chosen); - chosen = task; - chosen_points = ULONG_MAX; - get_task_struct(chosen); - /* fall through */ - case OOM_SCAN_CONTINUE: - continue; - case OOM_SCAN_ABORT: - css_task_iter_end(&it); - mem_cgroup_iter_break(memcg, iter); - if (chosen) - put_task_struct(chosen); - /* Set a dummy value to return "true". */ - chosen = (void *) 1; - goto unlock; - case OOM_SCAN_OK: - break; - }; - points = oom_badness(task, memcg, NULL, totalpages); - if (!points || points < chosen_points) - continue; - /* Prefer thread group leaders for display purposes */ - if (points == chosen_points && - thread_group_leader(chosen)) - continue; - - if (chosen) - put_task_struct(chosen); - chosen = task; - chosen_points = points; - get_task_struct(chosen); - } - css_task_iter_end(&it); - } - - if (chosen) { - points = chosen_points * 1000 / totalpages; - oom_kill_process(&oc, chosen, points, totalpages, - "Memory cgroup out of memory"); - } -unlock: + ret = out_of_memory(&oc); mutex_unlock(&oom_lock); - return chosen; + return ret; } #if MAX_NUMNODES > 1 @@ -1600,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle) if (!memcg) return false; - if (!handle || oom_killer_disabled) + if (!handle) goto cleanup; owait.memcg = memcg; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d53a9aa..ef17551 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -132,6 +132,11 @@ static inline bool is_sysrq_oom(struct oom_control *oc) return oc->order == -1; } +static inline bool is_memcg_oom(struct oom_control *oc) +{ + return oc->memcg != NULL; +} + /* return true if the task is not adequate as candidate victim task. 
*/ static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask) @@ -213,12 +218,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, return points > 0 ? points : 1; } +enum oom_constraint { + CONSTRAINT_NONE, + CONSTRAINT_CPUSET, + CONSTRAINT_MEMORY_POLICY, + CONSTRAINT_MEMCG, +}; + /* * Determine the type of allocation constraint. */ -#ifdef CONFIG_NUMA -static enum oom_constraint constrained_alloc(struct oom_control *oc, - unsigned long *totalpages) +static enum oom_constraint constrained_alloc(struct oom_control *oc) { struct zone *zone; struct zoneref *z; @@ -226,8 +236,16 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, bool cpuset_limited = false; int nid; + if (is_memcg_oom(oc)) { + oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1; + return CONSTRAINT_MEMCG; + } + /* Default to all available memory */ - *totalpages = totalram_pages + total_swap_pages; + oc->totalpages = totalram_pages + total_swap_pages; + + if (!IS_ENABLED(CONFIG_NUMA)) + return CONSTRAINT_NONE; if (!oc->zonelist) return CONSTRAINT_NONE; @@ -246,9 +264,9 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, */ if (oc->nodemask && !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { - *totalpages = total_swap_pages; + oc->totalpages = total_swap_pages; for_each_node_mask(nid, *oc->nodemask) - *totalpages += node_spanned_pages(nid); + oc->totalpages += node_spanned_pages(nid); return CONSTRAINT_MEMORY_POLICY; } @@ -259,27 +277,21 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, cpuset_limited = true; if (cpuset_limited) { - *totalpages = total_swap_pages; + oc->totalpages = total_swap_pages; for_each_node_mask(nid, cpuset_current_mems_allowed) - *totalpages += node_spanned_pages(nid); + oc->totalpages += node_spanned_pages(nid); return CONSTRAINT_CPUSET; } return CONSTRAINT_NONE; } -#else -static enum oom_constraint constrained_alloc(struct 
oom_control *oc, - unsigned long *totalpages) -{ - *totalpages = totalram_pages + total_swap_pages; - return CONSTRAINT_NONE; -} -#endif -enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, - struct task_struct *task) +static int oom_evaluate_task(struct task_struct *task, void *arg) { + struct oom_control *oc = arg; + unsigned long points; + if (oom_unkillable_task(task, NULL, oc->nodemask)) - return OOM_SCAN_CONTINUE; + goto next; /* * This task already has access to memory reserves and is being killed. @@ -289,68 +301,67 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, */ if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) { struct task_struct *p = find_lock_task_mm(task); - enum oom_scan_t ret = OOM_SCAN_ABORT; + bool reaped = false; if (p) { - if (test_bit(MMF_OOM_REAPED, &p->mm->flags)) - ret = OOM_SCAN_CONTINUE; + reaped = test_bit(MMF_OOM_REAPED, &p->mm->flags); task_unlock(p); } - - return ret; + if (reaped) + goto next; + goto abort; } /* * If task is allocating a lot of memory and has been marked to be * killed first if it triggers an oom, then select it. */ - if (oom_task_origin(task)) - return OOM_SCAN_SELECT; + if (oom_task_origin(task)) { + points = ULONG_MAX; + goto select; + } - return OOM_SCAN_OK; + points = oom_badness(task, NULL, oc->nodemask, oc->totalpages); + if (!points || points < oc->chosen_points) + goto next; + + /* Prefer thread group leaders for display purposes */ + if (points == oc->chosen_points && thread_group_leader(oc->chosen)) + goto next; +select: + if (oc->chosen) + put_task_struct(oc->chosen); + get_task_struct(task); + oc->chosen = task; + oc->chosen_points = points; +next: + return 0; +abort: + if (oc->chosen) + put_task_struct(oc->chosen); + oc->chosen = (void *)-1UL; + return 1; } /* - * Simple selection loop. We chose the process with the highest - * number of 'points'. Returns -1 on scan abort. + * Simple selection loop. 
We choose the process with the highest number of + * 'points'. In case scan was aborted, oc->chosen is set to -1. */ -static struct task_struct *select_bad_process(struct oom_control *oc, - unsigned int *ppoints, unsigned long totalpages) +static void select_bad_process(struct oom_control *oc) { - struct task_struct *p; - struct task_struct *chosen = NULL; - unsigned long chosen_points = 0; - - rcu_read_lock(); - for_each_process(p) { - unsigned int points; - - switch (oom_scan_process_thread(oc, p)) { - case OOM_SCAN_SELECT: - chosen = p; - chosen_points = ULONG_MAX; - /* fall through */ - case OOM_SCAN_CONTINUE: - continue; - case OOM_SCAN_ABORT: - rcu_read_unlock(); - return (struct task_struct *)(-1UL); - case OOM_SCAN_OK: - break; - }; - points = oom_badness(p, NULL, oc->nodemask, totalpages); - if (!points || points < chosen_points) - continue; + if (is_memcg_oom(oc)) + mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc); + else { + struct task_struct *p; - chosen = p; - chosen_points = points; + rcu_read_lock(); + for_each_process(p) + if (oom_evaluate_task(p, oc)) + break; + rcu_read_unlock(); } - if (chosen) - get_task_struct(chosen); - rcu_read_unlock(); - *ppoints = chosen_points * 1000 / totalpages; - return chosen; + oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages; } /** @@ -419,7 +430,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) static atomic_t oom_victims = ATOMIC_INIT(0); static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); -bool oom_killer_disabled __read_mostly; +static bool oom_killer_disabled __read_mostly; #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -627,7 +638,7 @@ static int oom_reaper(void *unused) return 0; } -void wake_oom_reaper(struct task_struct *tsk) +static void wake_oom_reaper(struct task_struct *tsk) { if (!oom_reaper_th) return; @@ -656,7 +667,11 @@ static int __init oom_init(void) return 0; } subsys_initcall(oom_init) -#endif +#else +static inline void wake_oom_reaper(struct task_struct 
*tsk) +{ +} +#endif /* CONFIG_MMU */ /** * mark_oom_victim - mark the given task as OOM victim @@ -665,7 +680,7 @@ subsys_initcall(oom_init) * Has to be called with oom_lock held and never after * oom has been disabled already. */ -void mark_oom_victim(struct task_struct *tsk) +static void mark_oom_victim(struct task_struct *tsk) { WARN_ON(oom_killer_disabled); /* OOM killer might race with memcg OOM */ @@ -760,7 +775,7 @@ static inline bool __task_will_free_mem(struct task_struct *task) * Caller has to make sure that task->mm is stable (hold task_lock or * it operates on the current). */ -bool task_will_free_mem(struct task_struct *task) +static bool task_will_free_mem(struct task_struct *task) { struct mm_struct *mm = task->mm; struct task_struct *p; @@ -806,14 +821,10 @@ bool task_will_free_mem(struct task_struct *task) return ret; } -/* - * Must be called while holding a reference to p, which will be released upon - * returning. - */ -void oom_kill_process(struct oom_control *oc, struct task_struct *p, - unsigned int points, unsigned long totalpages, - const char *message) +static void oom_kill_process(struct oom_control *oc, const char *message) { + struct task_struct *p = oc->chosen; + unsigned int points = oc->chosen_points; struct task_struct *victim = p; struct task_struct *child; struct task_struct *t; @@ -860,7 +871,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, * oom_badness() returns 0 if the thread is unkillable */ child_points = oom_badness(child, - oc->memcg, oc->nodemask, totalpages); + oc->memcg, oc->nodemask, oc->totalpages); if (child_points > victim_points) { put_task_struct(victim); victim = child; @@ -942,7 +953,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. 
*/ -void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint) +static void check_panic_on_oom(struct oom_control *oc, + enum oom_constraint constraint) { if (likely(!sysctl_panic_on_oom)) return; @@ -988,19 +1000,18 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); */ bool out_of_memory(struct oom_control *oc) { - struct task_struct *p; - unsigned long totalpages; unsigned long freed = 0; - unsigned int uninitialized_var(points); enum oom_constraint constraint = CONSTRAINT_NONE; if (oom_killer_disabled) return false; - blocking_notifier_call_chain(&oom_notify_list, 0, &freed); - if (freed > 0) - /* Got some memory back in the last second. */ - return true; + if (!is_memcg_oom(oc)) { + blocking_notifier_call_chain(&oom_notify_list, 0, &freed); + if (freed > 0) + /* Got some memory back in the last second. */ + return true; + } /* * If current has a pending SIGKILL or is exiting, then automatically @@ -1024,37 +1035,38 @@ bool out_of_memory(struct oom_control *oc) /* * Check if there were limitations on the allocation (only relevant for - * NUMA) that may require different handling. + * NUMA and memcg) that may require different handling. */ - constraint = constrained_alloc(oc, &totalpages); + constraint = constrained_alloc(oc); if (constraint != CONSTRAINT_MEMORY_POLICY) oc->nodemask = NULL; check_panic_on_oom(oc, constraint); - if (sysctl_oom_kill_allocating_task && current->mm && - !oom_unkillable_task(current, NULL, oc->nodemask) && + if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && + current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); - oom_kill_process(oc, current, 0, totalpages, - "Out of memory (oom_kill_allocating_task)"); + oc->chosen = current; + oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); return true; } - p = select_bad_process(oc, &points, totalpages); + select_bad_process(oc); /* Found nothing?!?! 
Either we hang forever, or we panic. */ - if (!p && !is_sysrq_oom(oc)) { + if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { dump_header(oc, NULL); panic("Out of memory and no killable processes...\n"); } - if (p && p != (void *)-1UL) { - oom_kill_process(oc, p, points, totalpages, "Out of memory"); + if (oc->chosen && oc->chosen != (void *)-1UL) { + oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : + "Memory cgroup out of memory"); /* * Give the killed process a good chance to exit before trying * to allocate memory again. */ schedule_timeout_killable(1); } - return true; + return !!oc->chosen; } /* -- cgit v1.1 From 252e5c6e2e5b4557599ef86ea5d02b0395e9056c Mon Sep 17 00:00:00 2001 From: zijun_hu Date: Fri, 7 Oct 2016 16:57:26 -0700 Subject: mm/vmalloc.c: fix align value calculation error The alignment requirement computed by __get_vm_area_node() is doubled if the size parameter is a power of 2 and VM_IOREMAP is set in the flags parameter, for example size=0x10000 -> fls_long(0x10000)=17 -> align=0x20000. get_count_order_long() is implemented and can be used instead of fls_long() to fix the bug, for example size=0x10000 -> get_count_order_long(0x10000)=16 -> align=0x10000. [akpm@linux-foundation.org: s/get_order_long()/get_count_order_long()/] [zijun_hu@zoho.com: fixes] Link: http://lkml.kernel.org/r/57AABC8B.1040409@zoho.com [akpm@linux-foundation.org: locate get_count_order_long() next to get_count_order()] [akpm@linux-foundation.org: move get_count_order[_long] definitions to pick up fls_long()] [zijun_hu@htc.com: move out get_count_order[_long]() from __KERNEL__ scope] Link: http://lkml.kernel.org/r/57B2C4CE.80303@zoho.com Link: http://lkml.kernel.org/r/fc045ecf-20fa-0722-b3ac-9a6140488fad@zoho.com Signed-off-by: zijun_hu Cc: Tejun Heo Cc: Johannes Weiner Cc: Minchan Kim Cc: David Rientjes Signed-off-by: zijun_hu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited
to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 91f44e7..80660a0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1359,14 +1359,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, struct vm_struct *area; BUG_ON(in_interrupt()); - if (flags & VM_IOREMAP) - align = 1ul << clamp_t(int, fls_long(size), - PAGE_SHIFT, IOREMAP_MAX_ORDER); - size = PAGE_ALIGN(size); if (unlikely(!size)) return NULL; + if (flags & VM_IOREMAP) + align = 1ul << clamp_t(int, get_count_order_long(size), + PAGE_SHIFT, IOREMAP_MAX_ORDER); + area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!area)) return NULL; -- cgit v1.1 From 58fa2a5512d9f224775fb01433f195e639953c5f Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 7 Oct 2016 16:57:29 -0700 Subject: mm: memcontrol: add sanity checks for memcg->id.ref on get/put Link: http://lkml.kernel.org/r/1c5ddb1c171dbdfc3262252769d6138a29b35b70.1470219853.git.vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 48747ef..5579e76 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4062,11 +4062,13 @@ static DEFINE_IDR(mem_cgroup_idr); static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { + VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); atomic_add(n, &memcg->id.ref); } static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { + VM_BUG_ON(atomic_read(&memcg->id.ref) < n); if (atomic_sub_and_test(n, &memcg->id.ref)) { idr_remove(&mem_cgroup_idr, memcg->id.id); memcg->id.id = 0; @@ -4255,8 +4257,10 @@ fail: static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + /* Online state pins memcg ID, memcg ID pins CSS */ - 
mem_cgroup_id_get(mem_cgroup_from_css(css)); + atomic_set(&memcg->id.ref, 1); css_get(css); return 0; } -- cgit v1.1 From 5870c2e1d78b043b69de3199469c056ca3b05102 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:57:32 -0700 Subject: mm/oom_kill.c: fix task_will_free_mem() comment Attempt to demystify the task_will_free_mem() loop. Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ef17551..463cdd2 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -803,8 +803,9 @@ static bool task_will_free_mem(struct task_struct *task) return true; /* - * This is really pessimistic but we do not have any reliable way - * to check that external processes share with our mm + * Make sure that all tasks which share the mm with the given tasks + * are dying as well to make sure that a) nobody pins its mm and + * b) the task is also reapable by the oom reaper. */ rcu_read_lock(); for_each_process(p) { -- cgit v1.1 From 06ed29989f39f5129d4f76f4a2d7ce2efa46a6a1 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:35 -0700 Subject: mm, compaction: make whole_zone flag ignore cached scanner positions Patch series "make direct compaction more deterministic". This is mostly a followup to Michal's oom detection rework, which highlighted the need for direct compaction to provide better feedback in the reclaim/compaction loop, so that it can reliably recognize when compaction cannot make further progress, and allocation should invoke OOM killer or fail. We've discussed this at LSF/MM [1] where I proposed expanding the async/sync migration mode used in compaction to more general "priorities". This patchset adds one new priority that just overrides all the heuristics and makes compaction fully scan all zones. I don't currently think that we need more fine-grained priorities, but we'll see.
Other than that there are some smaller fixes and cleanups, mainly related to the THP-specific hacks. I've tested this with stress-highalloc in GFP_KERNEL order-4 and THP-like order-9 scenarios. There's some improvement for compaction stats for the order-4, which is likely due to the better watermarks handling. In the previous version I reported mostly noise wrt compaction stats, and decreased direct reclaim - now the reclaim is without difference. I believe this is due to the less aggressive compaction priority increase in patch 6. "before" is a mmotm tree prior to 4.7 release plus the first part of the series that was sent and merged separately

                                  before        after
order-4:
Compaction stalls                  27216        30759
Compaction success                 19598        25475
Compaction failures                 7617         5283
Page migrate success              370510       464919
Page migrate failure               25712        27987
Compaction pages isolated         849601      1041581
Compaction migrate scanned     143146541    101084990
Compaction free scanned        208355124    144863510
Compaction cost                     1403         1210

order-9:
Compaction stalls                   7311         7401
Compaction success                  1634         1683
Compaction failures                 5677         5718
Page migrate success              194657       183988
Page migrate failure                4753         4170
Compaction pages isolated         498790       456130
Compaction migrate scanned        565371       524174
Compaction free scanned          4230296      4250744
Compaction cost                      215          203

[1] https://lwn.net/Articles/684611/

This patch (of 11): A recent patch has added the whole_zone flag that compaction sets when scanning starts from the zone boundary, in order to report that the zone has been fully scanned in one attempt. For allocations that want to try really hard or cannot fail, we will want to introduce a mode where scanning the whole zone is guaranteed regardless of the cached positions. This patch reuses the whole_zone flag in a way that if it's already passed true to compaction, the cached scanner positions are ignored. Employing this flag during the reclaim/compaction loop will be done in the next patch.
This patch however converts compaction invoked from userspace via procfs to use this flag. Before this patch, the cached positions were first reset to zone boundaries and then read back from struct zone, so there was a window where a parallel compaction could replace the reset values, making the manual compaction less effective. Using the flag instead of performing reset is more robust. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20160810091226.6709-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 43 +++++++++++++++++++++---------------------- mm/internal.h | 2 +- 2 files changed, 22 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 9affb29..c684ca1 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1492,23 +1492,29 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro /* * Setup to move all movable pages to the end of the zone. Used cached - * information on where the scanners should start but check that it - * is initialised by ensuring the values are within zone boundaries. + * information on where the scanners should start (unless we explicitly + * want to compact the whole zone), but check that it is initialised + * by ensuring the values are within zone boundaries. 
*/ - cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; - cc->free_pfn = zone->compact_cached_free_pfn; - if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { - cc->free_pfn = pageblock_start_pfn(end_pfn - 1); - zone->compact_cached_free_pfn = cc->free_pfn; - } - if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { + if (cc->whole_zone) { cc->migrate_pfn = start_pfn; - zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; - zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; - } + cc->free_pfn = pageblock_start_pfn(end_pfn - 1); + } else { + cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; + cc->free_pfn = zone->compact_cached_free_pfn; + if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { + cc->free_pfn = pageblock_start_pfn(end_pfn - 1); + zone->compact_cached_free_pfn = cc->free_pfn; + } + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { + cc->migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; + } - if (cc->migrate_pfn == start_pfn) - cc->whole_zone = true; + if (cc->migrate_pfn == start_pfn) + cc->whole_zone = true; + } cc->last_migrated_pfn = 0; @@ -1747,14 +1753,6 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) INIT_LIST_HEAD(&cc->freepages); INIT_LIST_HEAD(&cc->migratepages); - /* - * When called via /proc/sys/vm/compact_memory - * this makes sure we compact the whole zone regardless of - * cached scanner positions. 
- */ - if (is_via_compact_memory(cc->order)) - __reset_isolation_suitable(zone); - if (is_via_compact_memory(cc->order) || !compaction_deferred(zone, cc->order)) compact_zone(zone, cc); @@ -1790,6 +1788,7 @@ static void compact_node(int nid) .order = -1, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, + .whole_zone = true, }; __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/internal.h b/mm/internal.h index 1501304..5214bf8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -179,7 +179,7 @@ struct compact_control { enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool direct_compaction; /* False from kcompactd or /proc/... */ - bool whole_zone; /* Whole zone has been scanned */ + bool whole_zone; /* Whole zone should/has been scanned */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ -- cgit v1.1 From 791cae9620e35d18df2cedf2bd444920c3ecf04a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:38 -0700 Subject: mm, compaction: cleanup unused functions Since kswapd compaction moved to kcompactd, compact_pgdat() is not called anymore, so we remove it. The only caller of __compact_pgdat() is compact_node(), so we merge them and remove code that was only reachable from kswapd. 
Link: http://lkml.kernel.org/r/20160810091226.6709-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 60 ++++++++++++++++----------------------------------------- 1 file changed, 17 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index c684ca1..8e32778 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1736,10 +1736,18 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, /* Compact all zones within a node */ -static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) +static void compact_node(int nid) { + pg_data_t *pgdat = NODE_DATA(nid); int zoneid; struct zone *zone; + struct compact_control cc = { + .order = -1, + .mode = MIGRATE_SYNC, + .ignore_skip_hint = true, + .whole_zone = true, + }; + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { @@ -1747,53 +1755,19 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) if (!populated_zone(zone)) continue; - cc->nr_freepages = 0; - cc->nr_migratepages = 0; - cc->zone = zone; - INIT_LIST_HEAD(&cc->freepages); - INIT_LIST_HEAD(&cc->migratepages); - - if (is_via_compact_memory(cc->order) || - !compaction_deferred(zone, cc->order)) - compact_zone(zone, cc); - - VM_BUG_ON(!list_empty(&cc->freepages)); - VM_BUG_ON(!list_empty(&cc->migratepages)); + cc.nr_freepages = 0; + cc.nr_migratepages = 0; + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); - if (is_via_compact_memory(cc->order)) - continue; + compact_zone(zone, &cc); - if (zone_watermark_ok(zone, cc->order, - low_wmark_pages(zone), 0, 0)) - compaction_defer_reset(zone, cc->order, false); + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); } } -void compact_pgdat(pg_data_t *pgdat, int order) 
-{ - struct compact_control cc = { - .order = order, - .mode = MIGRATE_ASYNC, - }; - - if (!order) - return; - - __compact_pgdat(pgdat, &cc); -} - -static void compact_node(int nid) -{ - struct compact_control cc = { - .order = -1, - .mode = MIGRATE_SYNC, - .ignore_skip_hint = true, - .whole_zone = true, - }; - - __compact_pgdat(NODE_DATA(nid), &cc); -} - /* Compact all nodes in the system */ static void compact_nodes(void) { -- cgit v1.1 From cf378319d335663b6722e74db0211b8af55049d5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:41 -0700 Subject: mm, compaction: rename COMPACT_PARTIAL to COMPACT_SUCCESS COMPACT_PARTIAL has historically meant that compaction returned after doing some work without fully compacting a zone. It however didn't distinguish if compaction terminated because it succeeded in creating the requested high-order page. This has changed recently and now we only return COMPACT_PARTIAL when compaction thinks it succeeded, or the high-order watermark check in compaction_suitable() passes and no compaction needs to be done. So at this point we can make the return value clearer by renaming it to COMPACT_SUCCESS. The next patch will remove some redundant tests for success where compaction just returned COMPACT_SUCCESS. 
Link: http://lkml.kernel.org/r/20160810091226.6709-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 12 ++++++------ mm/vmscan.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 8e32778..335eeee 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1329,13 +1329,13 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ /* Job done if page is free of the right migratetype */ if (!list_empty(&area->free_list[migratetype])) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; #ifdef CONFIG_CMA /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ if (migratetype == MIGRATE_MOVABLE && !list_empty(&area->free_list[MIGRATE_CMA])) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; #endif /* * Job done if allocation would steal freepages from @@ -1343,7 +1343,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ */ if (find_suitable_fallback(area, order, migratetype, true, &can_steal) != -1) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; } return COMPACT_NO_SUITABLE_PAGE; @@ -1367,7 +1367,7 @@ static enum compact_result compact_finished(struct zone *zone, * compaction_suitable: Is this suitable to run compaction on this zone now? 
* Returns * COMPACT_SKIPPED - If there are too few free pages for compaction - * COMPACT_PARTIAL - If the allocation would succeed without compaction + * COMPACT_SUCCESS - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ static enum compact_result __compaction_suitable(struct zone *zone, int order, @@ -1388,7 +1388,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, */ if (zone_watermark_ok(zone, order, watermark, classzone_idx, alloc_flags)) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; /* * Watermarks for order-0 must be met for compaction. Note the 2UL. @@ -1477,7 +1477,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); /* Compaction is likely to fail */ - if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED) + if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) return ret; /* huh, compaction_suitable is returning something unexpected */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 0fe8b71..981fc84 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2495,7 +2495,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, continue; switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { - case COMPACT_PARTIAL: + case COMPACT_SUCCESS: case COMPACT_CONTINUE: return false; default: -- cgit v1.1 From 7ceb009a22517297ae0e32863eb86ec766782263 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:44 -0700 Subject: mm, compaction: don't recheck watermarks after COMPACT_SUCCESS Joonsoo has reminded me that in a later patch changing watermark checks throughout compaction I forgot to update checks in try_to_compact_pages() and kcompactd_do_work(). Closer inspection however shows that they are redundant now in the success case, because compact_zone() now reliably reports this with COMPACT_SUCCESS.
So effectively the checks just repeat (a subset) of checks that have just passed. So instead of checking watermarks again, just test the return value. Note it's also possible that compaction would declare failure e.g. because its find_suitable_fallback() is more strict than simple watermark check, and then the watermark check we are removing would then still succeed. After this patch this is not possible and it's arguably better, because for long-term fragmentation avoidance we should rather try a different zone than allocate with the unsuitable fallback. If compaction of all zones fail and the allocation is important enough, it will retry and succeed anyway. Also remove the stray "bool success" variable from kcompactd_do_work(). Link: http://lkml.kernel.org/r/20160810091226.6709-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Joonsoo Kim Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 335eeee..2e1113f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1698,9 +1698,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, alloc_flags, ac_classzone_idx(ac)); rc = max(status, rc); - /* If a normal allocation would succeed, stop compacting */ - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags)) { + /* The allocation should succeed, stop compacting */ + if (status == COMPACT_SUCCESS) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. 
The caller @@ -1873,8 +1872,6 @@ static void kcompactd_do_work(pg_data_t *pgdat) .ignore_skip_hint = true, }; - bool success = false; - trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, cc.classzone_idx); count_vm_event(KCOMPACTD_WAKE); @@ -1903,9 +1900,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) return; status = compact_zone(zone, &cc); - if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone), - cc.classzone_idx, 0)) { - success = true; + if (status == COMPACT_SUCCESS) { compaction_defer_reset(zone, cc.order, false); } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) { /* -- cgit v1.1 From a8e025e55b35f7eaf6c6c011de1f98d47ddf0843 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:47 -0700 Subject: mm, compaction: add the ultimate direct compaction priority During reclaim/compaction loop, it's desirable to get a final answer from unsuccessful compaction so we can either fail the allocation or invoke the OOM killer. However, heuristics such as deferred compaction or pageblock skip bits can cause compaction to skip parts or whole zones and lead to premature OOM's, failures or excessive reclaim/compaction retries. To remedy this, we introduce a new direct compaction priority called COMPACT_PRIO_SYNC_FULL, which instructs direct compaction to: - ignore deferred compaction status for a zone - ignore pageblock skip hints - ignore cached scanner positions and scan the whole zone The new priority should get eventually picked up by should_compact_retry() and this should improve success rates for costly allocations using __GFP_REPEAT, such as hugetlbfs allocations, and reduce some corner-case OOM's for non-costly allocations. 
Link: http://lkml.kernel.org/r/20160810091226.6709-6-vbabka@suse.cz [vbabka@suse.cz: use the MIN_COMPACT_PRIORITY alias] Link: http://lkml.kernel.org/r/d443b884-87e7-1c93-8684-3a3a35759fb1@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 2e1113f..2104030 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1644,6 +1644,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .alloc_flags = alloc_flags, .classzone_idx = classzone_idx, .direct_compaction = true, + .whole_zone = (prio == MIN_COMPACT_PRIORITY), + .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY) }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1689,7 +1691,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, ac->nodemask) { enum compact_result status; - if (compaction_deferred(zone, order)) { + if (prio > MIN_COMPACT_PRIORITY + && compaction_deferred(zone, order)) { rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); continue; } -- cgit v1.1 From f2b8228c5f99a92bc07efd36f8dc840e0705a266 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:50 -0700 Subject: mm, compaction: use correct watermark when checking compaction success The __compact_finished() function uses low watermark in a check that has to pass if the direct compaction is to finish and allocation should succeed. This is too pessimistic, as the allocation will typically use min watermark. It may happen that during compaction, we drop below the low watermark (due to parallel activity), but still form the target high-order page. By checking against low watermark, we might needlessly continue compaction. 
Similarly, __compaction_suitable() uses low watermark in a check whether allocation can succeed without compaction. Again, this is unnecessarily pessimistic. After this patch, these checks will use direct compactor's alloc_flags to determine the watermark, which is effectively the min watermark. Link: http://lkml.kernel.org/r/20160810091226.6709-8-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 2104030..e2618ac 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1316,7 +1316,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ return COMPACT_CONTINUE; /* Compaction run is not finished if the watermark is not met */ - watermark = low_wmark_pages(zone); + watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, cc->alloc_flags)) @@ -1381,7 +1381,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, if (is_via_compact_memory(order)) return COMPACT_CONTINUE; - watermark = low_wmark_pages(zone); + watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; /* * If watermarks for high-order allocation are already met, there * should be no need for compaction at all.
@@ -1395,7 +1395,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * This is because during migration, copies of pages need to be * allocated and for a short time, the footprint is higher */ - watermark += (2UL << order); + watermark = low_wmark_pages(zone) + (2UL << order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags, wmark_target)) return COMPACT_SKIPPED; -- cgit v1.1 From 9861a62c335cd34a2b6b25aaaf5898e8370299ec Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:53 -0700 Subject: mm, compaction: create compact_gap wrapper Compaction uses a watermark gap of (2UL << order) pages at various places and it's not immediately obvious why. Abstract it through a compact_gap() wrapper to create a single place with a thorough explanation. [vbabka@suse.cz: clarify the comment of compact_gap()] Link: http://lkml.kernel.org/r/7b6aed1f-fdf8-2063-9ff4-bbe4de712d37@suse.cz Link: http://lkml.kernel.org/r/20160810091226.6709-9-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 7 +++---- mm/vmscan.c | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index e2618ac..bbf41ee 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1391,11 +1391,10 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, return COMPACT_SUCCESS; /* - * Watermarks for order-0 must be met for compaction. Note the 2UL. - * This is because during migration, copies of pages need to be - * allocated and for a short time, the footprint is higher + * Watermarks for order-0 must be met for compaction to be able to + * isolate free pages for migration targets. 
*/ - watermark = low_wmark_pages(zone) + (2UL << order); + watermark = low_wmark_pages(zone) + compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags, wmark_target)) return COMPACT_SKIPPED; diff --git a/mm/vmscan.c b/mm/vmscan.c index 981fc84..2a6978a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2480,7 +2480,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, * If we have not reclaimed enough pages for compaction and the * inactive lists are large enough, continue reclaiming */ - pages_for_compaction = (2UL << sc->order); + pages_for_compaction = compact_gap(sc->order); inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); if (get_nr_swap_pages() > 0) inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); @@ -2612,7 +2612,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * there is a buffer of free pages available to give compaction * a reasonable chance of completing and allocating the page */ - watermark = high_wmark_pages(zone) + (2UL << sc->order); + watermark = high_wmark_pages(zone) + compact_gap(sc->order); watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); /* @@ -3169,7 +3169,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * excessive reclaim. Assume that a process requested a high-order * can direct reclaim/compact. */ - if (sc->order && sc->nr_reclaimed >= 2UL << sc->order) + if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) sc->order = 0; return sc->nr_scanned >= sc->nr_to_reclaim; -- cgit v1.1 From 984fdba6a32e4e9819ebc06ca3acec6582ffd99f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:57:57 -0700 Subject: mm, compaction: use proper alloc_flags in __compaction_suitable() The __compaction_suitable() function checks the low watermark plus a compact_gap() gap to decide if there's enough free memory to perform compaction. 
This check uses direct compactor's alloc_flags, but that's wrong, since these flags are not applicable for freepage isolation. For example, alloc_flags may indicate access to memory reserves, making compaction proceed, and then fail watermark check during the isolation. A similar problem exists for ALLOC_CMA, which may be part of alloc_flags, but not during freepage isolation. In this case however it makes sense to use ALLOC_CMA both in __compaction_suitable() and __isolate_free_page(), since there's actually nothing preventing the freepage scanner to isolate from CMA pageblocks, with the assumption that a page that could be migrated once by compaction can be migrated also later by CMA allocation. Thus we should count pages in CMA pageblocks when considering compaction suitability and when isolating freepages. To sum up, this patch should remove some false positives from __compaction_suitable(), and allow compaction to proceed when free pages required for compaction reside in the CMA pageblocks. Link: http://lkml.kernel.org/r/20160810091226.6709-10-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Cc: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 12 ++++++++++-- mm/page_alloc.c | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index bbf41ee..658c009 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1392,11 +1392,19 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, /* * Watermarks for order-0 must be met for compaction to be able to - * isolate free pages for migration targets. + * isolate free pages for migration targets. This means that the + * watermark and alloc_flags have to match, or be more pessimistic than + * the check in __isolate_free_page(). 
We don't use the direct + * compactor's alloc_flags, as they are not relevant for freepage + * isolation. We however do use the direct compactor's classzone_idx to + * skip over zones where lowmem reserves would prevent allocation even + * if compaction succeeds. + * ALLOC_CMA is used, as pages in CMA pageblocks are considered + * suitable migration targets */ watermark = low_wmark_pages(zone) + compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, - alloc_flags, wmark_target)) + ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a2214c6..637b0e9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2491,7 +2491,7 @@ int __isolate_free_page(struct page *page, unsigned int order) if (!is_migrate_isolate(mt)) { /* Obey watermarks as if the page was being allocated */ watermark = low_wmark_pages(zone) + (1 << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; __mod_zone_freepage_state(zone, -(1UL << order), mt); -- cgit v1.1 From 8348faf91f56371d4bada6fc5915e19580a15ffe Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:58:00 -0700 Subject: mm, compaction: require only min watermarks for non-costly orders The __compaction_suitable() function checks the low watermark plus a compact_gap() gap to decide if there's enough free memory to perform compaction. Then __isolate_free_page uses low watermark check to decide if particular free page can be isolated. In the latter case, using low watermark is needlessly pessimistic, as the free page isolations are only temporary. For __compaction_suitable() the higher watermark makes sense for high-order allocations where more freepages increase the chance of success, and we can typically fail with some order-0 fallback when the system is struggling to reach that watermark. But for low-order allocation, forming the page should not be that hard. 
So using low watermark here might just prevent compaction from even trying, and eventually lead to OOM killer even if we are above min watermarks. So after this patch, we use min watermark for non-costly orders in __compaction_suitable(), and for all orders in __isolate_free_page(). [vbabka@suse.cz: clarify __isolate_free_page() comment] Link: http://lkml.kernel.org/r/7ae4baec-4eca-e70b-2a69-94bea4fb19fa@suse.cz Link: http://lkml.kernel.org/r/20160810091226.6709-11-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 6 +++++- mm/page_alloc.c | 9 +++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 658c009..29f6c49 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1399,10 +1399,14 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * isolation. We however do use the direct compactor's classzone_idx to * skip over zones where lowmem reserves would prevent allocation even * if compaction succeeds. + * For costly orders, we require low watermark instead of min for + * compaction to proceed to increase its chances. * ALLOC_CMA is used, as pages in CMA pageblocks are considered * suitable migration targets */ - watermark = low_wmark_pages(zone) + compact_gap(order); + watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? 
+ low_wmark_pages(zone) : min_wmark_pages(zone); + watermark += compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 637b0e9..c988d32 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2489,8 +2489,13 @@ int __isolate_free_page(struct page *page, unsigned int order) mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt)) { - /* Obey watermarks as if the page was being allocated */ - watermark = low_wmark_pages(zone) + (1 << order); + /* + * Obey watermarks as if the page was being allocated. We can + * emulate a high-order watermark check with a raised order-0 + * watermark, because we already know our high-order page + * exists. + */ + watermark = min_wmark_pages(zone) + (1UL << order); if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; -- cgit v1.1 From fdd4c6149a71ff1da98317adb6f18c28f75a6e3f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 16:58:03 -0700 Subject: mm, vmscan: make compaction_ready() more accurate and readable The compaction_ready() is used during direct reclaim for costly order allocations to skip reclaim for zones where compaction should be attempted instead. It's combining the standard compaction_suitable() check with its own watermark check based on high watermark with extra gap, and the result is confusing at best. This patch attempts to better structure and document the checks involved. First, compaction_suitable() can determine that the allocation should either succeed already, or that compaction doesn't have enough free pages to proceed. The third possibility is that compaction has enough free pages, but we still decide to reclaim first - unless we are already above the high watermark with gap. This does not mean that the reclaim will actually reach this watermark during single attempt, this is rather an over-reclaim protection. So document the code as such. 
The check for compaction_deferred() is removed completely, as it in fact had no proper role here. The result after this patch is mainly less confusing code. We also skip some over-reclaim in cases where the allocation should already succeed. Link: http://lkml.kernel.org/r/20160810091226.6709-12-vbabka@suse.cz Signed-off-by: Vlastimil Babka Tested-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2a6978a..f406e6f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2598,38 +2598,35 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } /* - * Returns true if compaction should go ahead for a high-order request, or - * the high-order allocation would succeed without compaction. + * Returns true if compaction should go ahead for a costly-order request, or + * the allocation would already succeed without compaction. Return false if we + * should reclaim first. */ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; - bool watermark_ok; + enum compact_result suitable; - /* - * Compaction takes time to run and there are potentially other - * callers using the pages just freed.
Continue reclaiming until - * there is a buffer of free pages available to give compaction - * a reasonable chance of completing and allocating the page - */ - watermark = high_wmark_pages(zone) + compact_gap(sc->order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); - - /* - * If compaction is deferred, reclaim up to a point where - * compaction will have a chance of success when re-enabled - */ - if (compaction_deferred(zone, sc->order)) - return watermark_ok; + suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); + if (suitable == COMPACT_SUCCESS) + /* Allocation should succeed already. Don't reclaim. */ + return true; + if (suitable == COMPACT_SKIPPED) + /* Compaction cannot yet proceed. Do reclaim. */ + return false; /* - * If compaction is not ready to start and allocation is not likely - * to succeed without it, then keep reclaiming. + * Compaction is already possible, but it takes time to run and there + * are potentially other callers using the pages just freed. So proceed + * with reclaim to make a buffer of free pages available to give + * compaction a reasonable chance of completing and allocating the page. + * Note that we won't actually reclaim the whole buffer in one attempt + * as the target watermark in should_continue_reclaim() is lower. But if + * we are already above the high+gap watermark, don't reclaim at all. */ - if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED) - return false; + watermark = high_wmark_pages(zone) + compact_gap(sc->order); - return watermark_ok; + return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); } /* -- cgit v1.1 From e506b99696a296e9aba2e5f3bc5768aa7d8e2396 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Fri, 7 Oct 2016 16:58:06 -0700 Subject: mem-hotplug: fix node spanned pages when we have a movable node Commit 342332e6a925 ("mm/page_alloc.c: introduce kernelcore=mirror option") rewrote the calculation of node spanned pages. 
But when we have a movable node, the size of node spanned pages is double added. That's because we have an empty normal zone, the present pages is zero, but its spanned pages is not zero. e.g. Zone ranges: DMA [mem 0x0000000000001000-0x0000000000ffffff] DMA32 [mem 0x0000000001000000-0x00000000ffffffff] Normal [mem 0x0000000100000000-0x0000007c7fffffff] Movable zone start for each node Node 1: 0x0000001080000000 Node 2: 0x0000002080000000 Node 3: 0x0000003080000000 Node 4: 0x0000003c80000000 Node 5: 0x0000004c80000000 Node 6: 0x0000005c80000000 Early memory node ranges node 0: [mem 0x0000000000001000-0x000000000009ffff] node 0: [mem 0x0000000000100000-0x000000007552afff] node 0: [mem 0x000000007bd46000-0x000000007bd46fff] node 0: [mem 0x000000007bdcd000-0x000000007bffffff] node 0: [mem 0x0000000100000000-0x000000107fffffff] node 1: [mem 0x0000001080000000-0x000000207fffffff] node 2: [mem 0x0000002080000000-0x000000307fffffff] node 3: [mem 0x0000003080000000-0x0000003c7fffffff] node 4: [mem 0x0000003c80000000-0x0000004c7fffffff] node 5: [mem 0x0000004c80000000-0x0000005c7fffffff] node 6: [mem 0x0000005c80000000-0x0000006c7fffffff] node 7: [mem 0x0000006c80000000-0x0000007c7fffffff] node1: Normal, start=0x1080000, present=0x0, spanned=0x1000000 Movable, start=0x1080000, present=0x1000000, spanned=0x1000000 pgdat, start=0x1080000, present=0x1000000, spanned=0x2000000 After this patch, the problem is fixed. node1: Normal, start=0x0, present=0x0, spanned=0x0 Movable, start=0x1080000, present=0x1000000, spanned=0x1000000 pgdat, start=0x1080000, present=0x1000000, spanned=0x1000000 Link: http://lkml.kernel.org/r/57A325E8.6070100@huawei.com Signed-off-by: Xishi Qiu Cc: Taku Izumi Cc: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Cc: David Rientjes Cc: Joonsoo Kim Cc: "Kirill A . 
Shutemov" Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 54 +++++++++++++++++++++++------------------------------- 1 file changed, 23 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c988d32..26246fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5005,15 +5005,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * If not mirrored_kernelcore and ZONE_MOVABLE exists, range - * from zone_movable_pfn[nid] to end of each node should be - * ZONE_MOVABLE not ZONE_NORMAL. skip it. - */ - if (!mirrored_kernelcore && zone_movable_pfn[nid]) - if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid]) - continue; - - /* * Check given memblock attribute by firmware which can affect * kernel memory layout. If zone==ZONE_MOVABLE but memory is * mirrored, it's an overlapped memmap init. skip it. @@ -5456,6 +5447,12 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, *zone_end_pfn = min(node_end_pfn, arch_zone_highest_possible_pfn[movable_zone]); + /* Adjust for ZONE_MOVABLE starting within this range */ + } else if (!mirrored_kernelcore && + *zone_start_pfn < zone_movable_pfn[nid] && + *zone_end_pfn > zone_movable_pfn[nid]) { + *zone_end_pfn = zone_movable_pfn[nid]; + /* Check if this whole range is within ZONE_MOVABLE */ } else if (*zone_start_pfn >= zone_movable_pfn[nid]) *zone_start_pfn = *zone_end_pfn; @@ -5559,28 +5556,23 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages * and vice versa. 
*/ - if (zone_movable_pfn[nid]) { - if (mirrored_kernelcore) { - unsigned long start_pfn, end_pfn; - struct memblock_region *r; - - for_each_memblock(memory, r) { - start_pfn = clamp(memblock_region_memory_base_pfn(r), - zone_start_pfn, zone_end_pfn); - end_pfn = clamp(memblock_region_memory_end_pfn(r), - zone_start_pfn, zone_end_pfn); - - if (zone_type == ZONE_MOVABLE && - memblock_is_mirror(r)) - nr_absent += end_pfn - start_pfn; - - if (zone_type == ZONE_NORMAL && - !memblock_is_mirror(r)) - nr_absent += end_pfn - start_pfn; - } - } else { - if (zone_type == ZONE_NORMAL) - nr_absent += node_end_pfn - zone_movable_pfn[nid]; + if (mirrored_kernelcore && zone_movable_pfn[nid]) { + unsigned long start_pfn, end_pfn; + struct memblock_region *r; + + for_each_memblock(memory, r) { + start_pfn = clamp(memblock_region_memory_base_pfn(r), + zone_start_pfn, zone_end_pfn); + end_pfn = clamp(memblock_region_memory_end_pfn(r), + zone_start_pfn, zone_end_pfn); + + if (zone_type == ZONE_MOVABLE && + memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + + if (zone_type == ZONE_NORMAL && + !memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; } } -- cgit v1.1 From e780149bcd4be171421535db0514fa9ff556cb87 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Fri, 7 Oct 2016 16:58:09 -0700 Subject: mm: fix set pageblock migratetype in deferred struct page init On x86_64 MAX_ORDER_NR_PAGES is usually 4M, and a pageblock is usually 2M, so we only set one pageblock's migratetype in deferred_free_range() if pfn is aligned to MAX_ORDER_NR_PAGES. That means it causes uninitialized migratetype blocks, you can see from "cat /proc/pagetypeinfo", almost half blocks are Unmovable. Also we missed freeing the last block in deferred_init_memmap(), it causes memory leak. 
Fixes: ac5d2539b238 ("mm: meminit: reduce number of times pageblocks are set during struct page init") Link: http://lkml.kernel.org/r/57A3260F.4050709@huawei.com Signed-off-by: Xishi Qiu Cc: Taku Izumi Cc: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Cc: David Rientjes Cc: Joonsoo Kim Cc: "Kirill A . Shutemov" Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 26246fd..0c34633 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1393,15 +1393,18 @@ static void __init deferred_free_range(struct page *page, return; /* Free a large naturally-aligned chunk if possible */ - if (nr_pages == MAX_ORDER_NR_PAGES && - (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { + if (nr_pages == pageblock_nr_pages && + (pfn & (pageblock_nr_pages - 1)) == 0) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, MAX_ORDER-1); + __free_pages_boot_core(page, pageblock_order); return; } - for (i = 0; i < nr_pages; i++, page++) + for (i = 0; i < nr_pages; i++, page++, pfn++) { + if ((pfn & (pageblock_nr_pages - 1)) == 0) + set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_boot_core(page, 0); + } } /* Completion tracking for deferred_init_memmap() threads */ @@ -1469,9 +1472,9 @@ static int __init deferred_init_memmap(void *data) /* * Ensure pfn_valid is checked every - * MAX_ORDER_NR_PAGES for memory holes + * pageblock_nr_pages for memory holes */ - if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if ((pfn & (pageblock_nr_pages - 1)) == 0) { if (!pfn_valid(pfn)) { page = NULL; goto free_range; @@ -1484,7 +1487,7 @@ static int __init deferred_init_memmap(void *data) } /* Minimise pfn page lookups and scheduler checks */ - if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) { + if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { page++; } else { nr_pages += 
nr_to_free; @@ -1520,6 +1523,9 @@ free_range: free_base_page = NULL; free_base_pfn = nr_to_free = 0; } + /* Free the last block of pages to allocator */ + nr_pages += nr_to_free; + deferred_free_range(free_base_page, free_base_pfn, nr_to_free); first_init_pfn = max(end_pfn, first_init_pfn); } -- cgit v1.1 From bf48438354a79df50fadd2e1c0b81baa2619a8b6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:58:12 -0700 Subject: mm, vmscan: get rid of throttle_vm_writeout throttle_vm_writeout() was introduced back in 2005 to fix OOMs caused by excessive pageout activity during the reclaim. Too many pages could be put under writeback therefore LRUs would be full of unreclaimable pages until the IO completes and in turn the OOM killer could be invoked. There have been some important changes introduced since then in the reclaim path though. Writers are throttled by balance_dirty_pages when initiating the buffered IO and later during the memory pressure, the direct reclaim is throttled by wait_iff_congested if the node is considered congested by dirty pages on LRUs and the underlying bdi is congested by the queued IO. The kswapd is throttled as well if it encounters pages marked for immediate reclaim or under writeback which signals that there are too many pages under writeback already. Finally should_reclaim_retry does congestion_wait if the reclaim cannot make any progress and there are too many dirty/writeback pages. Another important aspect is that we do not issue any IO from the direct reclaim context anymore. In a heavy parallel load this could queue a lot of IO which would be very scattered and thus inefficient which would just make the problem worse. These three mechanisms should throttle and keep the amount of IO in a steady state even under heavy IO and memory pressure so yet another throttling point doesn't really seem helpful.
Quite the contrary, Mikulas Patocka has reported that swap backed by dm-crypt doesn't work properly because the swapout IO cannot make sufficient progress as the writeout path depends on dm_crypt worker which has to allocate memory to perform the encryption. In order to guarantee forward progress it relies on the mempool allocator. mempool_alloc(), however, prefers to use the underlying (usually page) allocator before it grabs objects from the pool. Such an allocation can dive into the memory reclaim and consequently to throttle_vm_writeout. If there are too many dirty pages or pages under writeback it will get throttled even though it is in fact a flusher to clear pending pages. kworker/u4:0 D ffff88003df7f438 10488 6 2 0x00000000 Workqueue: kcryptd kcryptd_crypt [dm_crypt] Call Trace: schedule+0x3c/0x90 schedule_timeout+0x1d8/0x360 io_schedule_timeout+0xa4/0x110 congestion_wait+0x86/0x1f0 throttle_vm_writeout+0x44/0xd0 shrink_zone_memcg+0x613/0x720 shrink_zone+0xe0/0x300 do_try_to_free_pages+0x1ad/0x450 try_to_free_pages+0xef/0x300 __alloc_pages_nodemask+0x879/0x1210 alloc_pages_current+0xa1/0x1f0 new_slab+0x2d7/0x6a0 ___slab_alloc+0x3fb/0x5c0 __slab_alloc+0x51/0x90 kmem_cache_alloc+0x27b/0x310 mempool_alloc_slab+0x1d/0x30 mempool_alloc+0x91/0x230 bio_alloc_bioset+0xbd/0x260 kcryptd_crypt+0x114/0x3b0 [dm_crypt] Let's just drop throttle_vm_writeout altogether. It is not very helpful anymore. I have tried to test a potential writeback IO runaway similar to the one described in the original patch which has introduced that [1]. Small virtual machine (512MB RAM, 4 CPUs, 2G of swap space and disk image on a rather slow NFS in a sync mode on the host) with 8 parallel writers each writing 1G worth of data. As soon as the pagecache fills up and the direct reclaim hits then I start an anon memory consumer in a loop (allocating 300M and exiting after populating it) in the background to make the memory pressure even stronger as well as to disrupt the steady state for the IO.
The direct reclaim is throttled because of the congestion as well as kswapd hitting congestion_wait due to nr_immediate but throttle_vm_writeout doesn't ever trigger the sleep throughout the test. Dirty+writeback are close to nr_dirty_threshold with some fluctuations caused by the anon consumer. [1] https://www2.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.9-rc1/2.6.9-rc1-mm3/broken-out/vm-pageout-throttling.patch Link: http://lkml.kernel.org/r/1471171473-21418-1-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Mikulas Patocka Cc: Marcelo Tosatti Cc: NeilBrown Cc: Ondrej Kozina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 30 ------------------------------ mm/vmscan.c | 2 -- 2 files changed, 32 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 28d6f36..5ed3381 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1965,36 +1965,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) return false; } -void throttle_vm_writeout(gfp_t gfp_mask) -{ - unsigned long background_thresh; - unsigned long dirty_thresh; - - for ( ; ; ) { - global_dirty_limits(&background_thresh, &dirty_thresh); - dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh); - - /* - * Boost the allowable dirty threshold a bit for page - * allocators so they don't get DoS'ed by heavy writers - */ - dirty_thresh += dirty_thresh / 10; /* wheeee... */ - - if (global_node_page_state(NR_UNSTABLE_NFS) + - global_node_page_state(NR_WRITEBACK) <= dirty_thresh) - break; - congestion_wait(BLK_RW_ASYNC, HZ/10); - - /* - * The caller might hold locks which can prevent IO completion - * or progress in the filesystem. So we cannot just sit here - * waiting for IO to complete. 
- */ - if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) - break; - } -} - /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ diff --git a/mm/vmscan.c b/mm/vmscan.c index f406e6f..d3715c1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2418,8 +2418,6 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc if (inactive_list_is_low(lruvec, false, sc)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); - - throttle_vm_writeout(sc->gfp_mask); } /* Use reclaim/compaction for costly allocs or under memory pressure */ -- cgit v1.1 From acbc15a4b397f86d39416df143e30982b1da528b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 7 Oct 2016 16:58:15 -0700 Subject: mm/debug_pagealloc.c: clean-up guard page handling code Patch series "Reduce memory waste by page extension user". This patchset tries to reduce memory waste by page extension user. First case is architecture supported debug_pagealloc. It doesn't require additional memory if guard page isn't used. 8 bytes per page will be saved in this case. Second case is related to page owner feature. Until now, if page_ext users want to use their own fields on page_ext, fields should be defined in struct page_ext by hard-coding. This has the following problem. struct page_ext { #ifdef CONFIG_A int a; #endif #ifdef CONFIG_B int b; #endif }; Assume that kernel is built with both CONFIG_A and CONFIG_B. Even if we enable feature A and don't enable feature B at runtime, each entry of struct page_ext takes two int rather than one int. It's an undesirable waste so this patch tries to reduce it. By this patchset, we can save 20 bytes per page dedicated for page owner feature in some configurations. This patch (of 6): We can make code clean by moving decision condition for set_page_guard() into set_page_guard() itself. It will help code readability. There is no functional change.
Link: http://lkml.kernel.org/r/1471315879-32294-2-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Michal Hocko Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0c34633..e150ba9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -637,17 +637,20 @@ static int __init debug_guardpage_minorder_setup(char *buf) } __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); -static inline void set_page_guard(struct zone *zone, struct page *page, +static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { struct page_ext *page_ext; if (!debug_guardpage_enabled()) - return; + return false; + + if (order >= debug_guardpage_minorder()) + return false; page_ext = lookup_page_ext(page); if (unlikely(!page_ext)) - return; + return false; __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); @@ -655,6 +658,8 @@ static inline void set_page_guard(struct zone *zone, struct page *page, set_page_private(page, order); /* Guard pages are not available for any usage */ __mod_zone_freepage_state(zone, -(1 << order), migratetype); + + return true; } static inline void clear_page_guard(struct zone *zone, struct page *page, @@ -677,8 +682,8 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, } #else struct page_ext_operations debug_guardpage_ops = { NULL, }; -static inline void set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) {} +static inline bool set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) {} #endif @@ -1622,18 
+1627,15 @@ static inline void expand(struct zone *zone, struct page *page, size >>= 1; VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && - debug_guardpage_enabled() && - high < debug_guardpage_minorder()) { - /* - * Mark as guard pages (or page), that will allow to - * merge back to allocator when buddy will be freed. - * Corresponding page table entries will not be touched, - * pages will stay not present in virtual address space - */ - set_page_guard(zone, &page[size], high, migratetype); + /* + * Mark as guard pages (or page), that will allow to + * merge back to allocator when buddy will be freed. + * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ + if (set_page_guard(zone, &page[size], high, migratetype)) continue; - } + list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); -- cgit v1.1 From f1c1e9f7b5b3ddce6b4f1986939ec87b27515086 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 7 Oct 2016 16:58:18 -0700 Subject: mm/debug_pagealloc.c: don't allocate page_ext if we don't use guard page What debug_pagealloc does is just mapping/unmapping page table. Basically, it doesn't need additional memory space to memorize something. But, with guard page feature, it requires additional memory to distinguish if the page is for guard or not. Guard page is only used when debug_guardpage_minorder is non-zero so this patch removes additional memory allocation (page_ext) if debug_guardpage_minorder is zero. It saves memory if we just use debug_pagealloc and not guard page. 
Link: http://lkml.kernel.org/r/1471315879-32294-3-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Reviewed-by: Sergey Senozhatsky Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e150ba9..06ea805 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -607,6 +607,9 @@ static bool need_debug_guardpage(void) if (!debug_pagealloc_enabled()) return false; + if (!debug_guardpage_minorder()) + return false; + return true; } @@ -615,6 +618,9 @@ static void init_debug_guardpage(void) if (!debug_pagealloc_enabled()) return; + if (!debug_guardpage_minorder()) + return; + _debug_guardpage_enabled = true; } @@ -635,7 +641,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) pr_info("Setting debug_guardpage_minorder to %lu\n", res); return 0; } -__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); +early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) -- cgit v1.1 From e2f612e673f61931b2fe62722832cf5fcf6b3313 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 7 Oct 2016 16:58:21 -0700 Subject: mm/page_owner: move page_owner specific function to page_owner.c There is no reason that page_owner specific function resides on vmstat.c. 
Link: http://lkml.kernel.org/r/1471315879-32294-4-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Reviewed-by: Sergey Senozhatsky Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/vmstat.c | 79 --------------------------------------------------------- 2 files changed, 77 insertions(+), 79 deletions(-) (limited to 'mm') diff --git a/mm/page_owner.c b/mm/page_owner.c index ec6dc18..0f4246d 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "internal.h" @@ -214,6 +215,82 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) __set_bit(PAGE_EXT_OWNER, &new_ext->flags); } +void pagetypeinfo_showmixedcount_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + struct page *page; + struct page_ext *page_ext; + unsigned long pfn = zone->zone_start_pfn, block_end_pfn; + unsigned long end_pfn = pfn + zone->spanned_pages; + unsigned long count[MIGRATE_TYPES] = { 0, }; + int pageblock_mt, page_mt; + int i; + + /* Scan block by block. First and last block may be incomplete */ + pfn = zone->zone_start_pfn; + + /* + * Walk the zone in pageblock_nr_pages steps. If a page block spans + * a zone boundary, it will be double counted between zones. 
This does + * not matter as the mixed block count will still be correct + */ + for (; pfn < end_pfn; ) { + if (!pfn_valid(pfn)) { + pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + continue; + } + + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = min(block_end_pfn, end_pfn); + + page = pfn_to_page(pfn); + pageblock_mt = get_pageblock_migratetype(page); + + for (; pfn < block_end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + continue; + + page = pfn_to_page(pfn); + + if (page_zone(page) != zone) + continue; + + if (PageBuddy(page)) { + pfn += (1UL << page_order(page)) - 1; + continue; + } + + if (PageReserved(page)) + continue; + + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + continue; + + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) + continue; + + page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); + if (pageblock_mt != page_mt) { + if (is_migrate_cma(pageblock_mt)) + count[MIGRATE_MOVABLE]++; + else + count[pageblock_mt]++; + + pfn = block_end_pfn; + break; + } + pfn += (1UL << page_ext->order) - 1; + } + } + + /* Print counts */ + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (i = 0; i < MIGRATE_TYPES; i++) + seq_printf(m, "%12lu ", count[i]); + seq_putc(m, '\n'); +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_ext *page_ext, diff --git a/mm/vmstat.c b/mm/vmstat.c index 89cec42..dc04e76 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1254,85 +1254,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) return 0; } -#ifdef CONFIG_PAGE_OWNER -static void pagetypeinfo_showmixedcount_print(struct seq_file *m, - pg_data_t *pgdat, - struct zone *zone) -{ - struct page *page; - struct page_ext *page_ext; - unsigned long pfn = zone->zone_start_pfn, block_end_pfn; - unsigned long end_pfn = pfn + zone->spanned_pages; - unsigned long count[MIGRATE_TYPES] = { 0, }; - int pageblock_mt, page_mt; - int i; - - /* Scan block 
by block. First and last block may be incomplete */ - pfn = zone->zone_start_pfn; - - /* - * Walk the zone in pageblock_nr_pages steps. If a page block spans - * a zone boundary, it will be double counted between zones. This does - * not matter as the mixed block count will still be correct - */ - for (; pfn < end_pfn; ) { - if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); - continue; - } - - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); - block_end_pfn = min(block_end_pfn, end_pfn); - - page = pfn_to_page(pfn); - pageblock_mt = get_pageblock_migratetype(page); - - for (; pfn < block_end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) - continue; - - page = pfn_to_page(pfn); - - if (page_zone(page) != zone) - continue; - - if (PageBuddy(page)) { - pfn += (1UL << page_order(page)) - 1; - continue; - } - - if (PageReserved(page)) - continue; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - continue; - - if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) - continue; - - page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); - if (pageblock_mt != page_mt) { - if (is_migrate_cma(pageblock_mt)) - count[MIGRATE_MOVABLE]++; - else - count[pageblock_mt]++; - - pfn = block_end_pfn; - break; - } - pfn += (1UL << page_ext->order) - 1; - } - } - - /* Print counts */ - seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (i = 0; i < MIGRATE_TYPES; i++) - seq_printf(m, "%12lu ", count[i]); - seq_putc(m, '\n'); -} -#endif /* CONFIG_PAGE_OWNER */ - /* * Print out the number of pageblocks for each migratetype that contain pages * of other types. This gives an indication of how well fallbacks are being -- cgit v1.1 From 0b06bb3f6075803a92a0075ba4eb44888dd8a68a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 7 Oct 2016 16:58:24 -0700 Subject: mm/page_ext: rename offset to index Here, 'offset' means entry index in page_ext array. 
Following patch will use 'offset' for field offset in each entry so rename current 'offset' to prevent confusion. Link: http://lkml.kernel.org/r/1471315879-32294-5-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Michal Hocko Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_ext.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_ext.c b/mm/page_ext.c index 44a4c02..1629282 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -102,7 +102,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); - unsigned long offset; + unsigned long index; struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; @@ -119,9 +119,9 @@ struct page_ext *lookup_page_ext(struct page *page) if (unlikely(!base)) return NULL; #endif - offset = pfn - round_down(node_start_pfn(page_to_nid(page)), + index = pfn - round_down(node_start_pfn(page_to_nid(page)), MAX_ORDER_NR_PAGES); - return base + offset; + return base + index; } static int __init alloc_node_page_ext(int nid) -- cgit v1.1 From 980ac1672e7edaa927557a5186f1967cd45afcf5 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 7 Oct 2016 16:58:27 -0700 Subject: mm/page_ext: support extra space allocation by page_ext user Until now, if some page_ext users want to use it's own field on page_ext, it should be defined in struct page_ext by hard-coding. It has a problem that wastes memory in following situation. struct page_ext { #ifdef CONFIG_A int a; #endif #ifdef CONFIG_B int b; #endif }; Assume that kernel is built with both CONFIG_A and CONFIG_B. Even if we enable feature A and doesn't enable feature B at runtime, each entry of struct page_ext takes two int rather than one int. It's undesirable result so this patch tries to fix it. 
To solve the above problem, this patch implements support for extra space allocation at runtime. When need() callback returns true, its extra memory requirement is added to the entry size of page_ext. Also, offset for each user's extra memory space is returned. With this offset, user can use this extra space and there is no need to define needed field on page_ext by hard-coding. This patch only implements an infrastructure. Following patch will use it for page_owner which is the only user having its own fields on page_ext. Link: http://lkml.kernel.org/r/1471315879-32294-6-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Michal Hocko Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/page_ext.c | 41 +++++++++++++++++++++++++++++++---------- 2 files changed, 32 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 06ea805..b0f133f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -687,7 +687,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, __mod_zone_freepage_state(zone, (1 << order), migratetype); } #else -struct page_ext_operations debug_guardpage_ops = { NULL, }; +struct page_ext_operations debug_guardpage_ops; static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, diff --git a/mm/page_ext.c b/mm/page_ext.c index 1629282..121dcff 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -42,6 +42,11 @@ * and page extension core can skip to allocate memory. As result, * none of memory is wasted. * + * When need callback returns true, page_ext checks if there is a request for + * extra memory through size in struct page_ext_operations.
If it is non-zero, + * extra space is allocated for each page_ext entry and offset is returned to + * user through offset in struct page_ext_operations. + * * The init callback is used to do proper initialization after page extension * is completely initialized. In sparse memory system, extra memory is * allocated some time later than memmap is allocated. In other words, lifetime @@ -66,18 +71,24 @@ static struct page_ext_operations *page_ext_ops[] = { }; static unsigned long total_usage; +static unsigned long extra_mem; static bool __init invoke_need_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); + bool need = false; for (i = 0; i < entries; i++) { - if (page_ext_ops[i]->need && page_ext_ops[i]->need()) - return true; + if (page_ext_ops[i]->need && page_ext_ops[i]->need()) { + page_ext_ops[i]->offset = sizeof(struct page_ext) + + extra_mem; + extra_mem += page_ext_ops[i]->size; + need = true; + } } - return false; + return need; } static void __init invoke_init_callbacks(void) @@ -91,6 +102,16 @@ static void __init invoke_init_callbacks(void) } } +static unsigned long get_entry_size(void) +{ + return sizeof(struct page_ext) + extra_mem; +} + +static inline struct page_ext *get_entry(void *base, unsigned long index) +{ + return base + get_entry_size() * index; +} + #if !defined(CONFIG_SPARSEMEM) @@ -121,7 +142,7 @@ struct page_ext *lookup_page_ext(struct page *page) #endif index = pfn - round_down(node_start_pfn(page_to_nid(page)), MAX_ORDER_NR_PAGES); - return base + index; + return get_entry(base, index); } static int __init alloc_node_page_ext(int nid) @@ -143,7 +164,7 @@ static int __init alloc_node_page_ext(int nid) !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) nr_pages += MAX_ORDER_NR_PAGES; - table_size = sizeof(struct page_ext) * nr_pages; + table_size = get_entry_size() * nr_pages; base = memblock_virt_alloc_try_nid_nopanic( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), @@ -196,7 +217,7 @@ struct page_ext *lookup_page_ext(struct 
page *page) if (!section->page_ext) return NULL; #endif - return section->page_ext + pfn; + return get_entry(section->page_ext, pfn); } static void *__meminit alloc_page_ext(size_t size, int nid) @@ -229,7 +250,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid) if (section->page_ext) return 0; - table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; + table_size = get_entry_size() * PAGES_PER_SECTION; base = alloc_page_ext(table_size, nid); /* @@ -249,7 +270,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid) * we need to apply a mask. */ pfn &= PAGE_SECTION_MASK; - section->page_ext = base - pfn; + section->page_ext = (void *)base - get_entry_size() * pfn; total_usage += table_size; return 0; } @@ -262,7 +283,7 @@ static void free_page_ext(void *addr) struct page *page = virt_to_page(addr); size_t table_size; - table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; + table_size = get_entry_size() * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); free_pages_exact(addr, table_size); @@ -277,7 +298,7 @@ static void __free_page_ext(unsigned long pfn) ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; - base = ms->page_ext + pfn; + base = get_entry(ms->page_ext, pfn); free_page_ext(base); ms->page_ext = NULL; } -- cgit v1.1 From 9300d8dfd282bd1473395c5c4c76bfdc90b05978 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 7 Oct 2016 16:58:30 -0700 Subject: mm/page_owner: don't define fields on struct page_ext by hard-coding There is a memory waste problem if we define field on struct page_ext by hard-coding. Entry size of struct page_ext includes the size of those fields even if it is disabled at runtime. Now, extra memory request at runtime is possible so page_owner don't need to define it's own fields by hard-coding. This patch removes hard-coded define and uses extra memory for storing page_owner information in page_owner. Most of code are just mechanical changes. 
Link: http://lkml.kernel.org/r/1471315879-32294-7-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Michal Hocko Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 83 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/page_owner.c b/mm/page_owner.c index 0f4246d..60634dc 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -18,6 +18,13 @@ */ #define PAGE_OWNER_STACK_DEPTH (16) +struct page_owner { + unsigned int order; + gfp_t gfp_mask; + int last_migrate_reason; + depot_stack_handle_t handle; +}; + static bool page_owner_disabled = true; DEFINE_STATIC_KEY_FALSE(page_owner_inited); @@ -86,10 +93,16 @@ static void init_page_owner(void) } struct page_ext_operations page_owner_ops = { + .size = sizeof(struct page_owner), .need = need_page_owner, .init = init_page_owner, }; +static inline struct page_owner *get_page_owner(struct page_ext *page_ext) +{ + return (void *)page_ext + page_owner_ops.offset; +} + void __reset_page_owner(struct page *page, unsigned int order) { int i; @@ -156,14 +169,16 @@ noinline void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { struct page_ext *page_ext = lookup_page_ext(page); + struct page_owner *page_owner; if (unlikely(!page_ext)) return; - page_ext->handle = save_stack(gfp_mask); - page_ext->order = order; - page_ext->gfp_mask = gfp_mask; - page_ext->last_migrate_reason = -1; + page_owner = get_page_owner(page_ext); + page_owner->handle = save_stack(gfp_mask); + page_owner->order = order; + page_owner->gfp_mask = gfp_mask; + page_owner->last_migrate_reason = -1; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } @@ -171,21 +186,26 @@ noinline void __set_page_owner(struct page *page, unsigned int order, void __set_page_owner_migrate_reason(struct page *page, int reason) { struct page_ext *page_ext = 
lookup_page_ext(page); + struct page_owner *page_owner; + if (unlikely(!page_ext)) return; - page_ext->last_migrate_reason = reason; + page_owner = get_page_owner(page_ext); + page_owner->last_migrate_reason = reason; } void __split_page_owner(struct page *page, unsigned int order) { int i; struct page_ext *page_ext = lookup_page_ext(page); + struct page_owner *page_owner; if (unlikely(!page_ext)) return; - page_ext->order = 0; + page_owner = get_page_owner(page_ext); + page_owner->order = 0; for (i = 1; i < (1 << order); i++) __copy_page_owner(page, page + i); } @@ -194,14 +214,18 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) { struct page_ext *old_ext = lookup_page_ext(oldpage); struct page_ext *new_ext = lookup_page_ext(newpage); + struct page_owner *old_page_owner, *new_page_owner; if (unlikely(!old_ext || !new_ext)) return; - new_ext->order = old_ext->order; - new_ext->gfp_mask = old_ext->gfp_mask; - new_ext->last_migrate_reason = old_ext->last_migrate_reason; - new_ext->handle = old_ext->handle; + old_page_owner = get_page_owner(old_ext); + new_page_owner = get_page_owner(new_ext); + new_page_owner->order = old_page_owner->order; + new_page_owner->gfp_mask = old_page_owner->gfp_mask; + new_page_owner->last_migrate_reason = + old_page_owner->last_migrate_reason; + new_page_owner->handle = old_page_owner->handle; /* * We don't clear the bit on the oldpage as it's going to be freed @@ -220,6 +244,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, { struct page *page; struct page_ext *page_ext; + struct page_owner *page_owner; unsigned long pfn = zone->zone_start_pfn, block_end_pfn; unsigned long end_pfn = pfn + zone->spanned_pages; unsigned long count[MIGRATE_TYPES] = { 0, }; @@ -270,7 +295,9 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; - page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); + page_owner = get_page_owner(page_ext); + page_mt = 
gfpflags_to_migratetype( + page_owner->gfp_mask); if (pageblock_mt != page_mt) { if (is_migrate_cma(pageblock_mt)) count[MIGRATE_MOVABLE]++; @@ -280,7 +307,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, pfn = block_end_pfn; break; } - pfn += (1UL << page_ext->order) - 1; + pfn += (1UL << page_owner->order) - 1; } } @@ -293,7 +320,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, - struct page *page, struct page_ext *page_ext, + struct page *page, struct page_owner *page_owner, depot_stack_handle_t handle) { int ret; @@ -313,15 +340,15 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, ret = snprintf(kbuf, count, "Page allocated via order %u, mask %#x(%pGg)\n", - page_ext->order, page_ext->gfp_mask, - &page_ext->gfp_mask); + page_owner->order, page_owner->gfp_mask, + &page_owner->gfp_mask); if (ret >= count) goto err; /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); - page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); + page_mt = gfpflags_to_migratetype(page_owner->gfp_mask); ret += snprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, @@ -338,10 +365,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (ret >= count) goto err; - if (page_ext->last_migrate_reason != -1) { + if (page_owner->last_migrate_reason != -1) { ret += snprintf(kbuf + ret, count - ret, "Page has been migrated, last migrate reason: %s\n", - migrate_reason_names[page_ext->last_migrate_reason]); + migrate_reason_names[page_owner->last_migrate_reason]); if (ret >= count) goto err; } @@ -364,6 +391,7 @@ err: void __dump_page_owner(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); + struct page_owner *page_owner; unsigned long entries[PAGE_OWNER_STACK_DEPTH]; struct stack_trace trace = { .nr_entries = 0, @@ -379,7 
+407,9 @@ void __dump_page_owner(struct page *page) pr_alert("There is not page extension available.\n"); return; } - gfp_mask = page_ext->gfp_mask; + + page_owner = get_page_owner(page_ext); + gfp_mask = page_owner->gfp_mask; mt = gfpflags_to_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { @@ -387,7 +417,7 @@ void __dump_page_owner(struct page *page) return; } - handle = READ_ONCE(page_ext->handle); + handle = READ_ONCE(page_owner->handle); if (!handle) { pr_alert("page_owner info is not active (free page?)\n"); return; @@ -395,12 +425,12 @@ void __dump_page_owner(struct page *page) depot_fetch_stack(handle, &trace); pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", - page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask); + page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); print_stack_trace(&trace, 0); - if (page_ext->last_migrate_reason != -1) + if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", - migrate_reason_names[page_ext->last_migrate_reason]); + migrate_reason_names[page_owner->last_migrate_reason]); } static ssize_t @@ -409,6 +439,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) unsigned long pfn; struct page *page; struct page_ext *page_ext; + struct page_owner *page_owner; depot_stack_handle_t handle; if (!static_branch_unlikely(&page_owner_inited)) @@ -458,11 +489,13 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; + page_owner = get_page_owner(page_ext); + /* * Access to page_ext->handle isn't synchronous so we should * be careful to access it. 
*/ - handle = READ_ONCE(page_ext->handle); + handle = READ_ONCE(page_owner->handle); if (!handle) continue; @@ -470,7 +503,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) *ppos = (pfn - min_low_pfn) + 1; return print_page_owner(buf, count, pfn, page, - page_ext, handle); + page_owner, handle); } return 0; -- cgit v1.1 From c4b209a426847b55c40360c1d04dc7986b55ddc7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 7 Oct 2016 16:58:33 -0700 Subject: do_generic_file_read(): fail immediately if killed If a fatal signal has been received, fail immediately instead of trying to read more data. If wait_on_page_locked_killable() was interrupted then this page is most likely not PageUptodate() and in this case do_generic_file_read() will fail after lock_page_killable(). See also commit ebded02788b5 ("mm: filemap: avoid unnecessary calls to lock_page when waiting for IO to complete during a read") [oleg@redhat.com: changelog addition] Link: http://lkml.kernel.org/r/63068e8e-8bee-b208-8441-a3c39a9d9eb6@sandisk.com Signed-off-by: Bart Van Assche Reviewed-by: Jan Kara Acked-by: Oleg Nesterov Acked-by: Michal Hocko Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 68f1813..1b05f75 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1721,7 +1721,9 @@ find_page: * wait_on_page_locked is used to avoid unnecessarily * serialisations and why it's safe.
*/ - wait_on_page_locked_killable(page); + error = wait_on_page_locked_killable(page); + if (unlikely(error)) + goto readpage_error; if (PageUptodate(page)) goto page_ok; -- cgit v1.1 From 131ddc5c7d814d61f945b6322019e5148f6d39f0 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 7 Oct 2016 16:58:39 -0700 Subject: mm: unrig VMA cache hit ratio Current code doesn't count first FIND operation after VMA cache flush (which happen surprisingly often) artificially increasing cache hit ratio. On my regular setup the difference is: Before After ========================================================== * boot, login into KDE vmacache_find_calls 446216 vmacache_find_calls 492741 vmacache_find_hits 277596 vmacache_find_hits 276096 ~62.2% ~56.0% * rebuild kernel (no changes to code, usual config) vmacache_find_calls 1943007 vmacache_find_calls 2083718 vmacache_find_hits 1246123 vmacache_find_hits 1244146 ~64.1% ~59.7% * rebuild kernel (full rebuild, usual config) vmacache_find_calls 32163155 vmacache_find_calls 33677183 vmacache_find_hits 27889956 vmacache_find_hits 27877591 ~88.2% ~84.3% Total: ~4% cache hit ratio. If someone is counting _relative_ cache _miss_ ratio, misreporting is much higher. 
Link: http://lkml.kernel.org/r/20160822225009.GA3934@p183.telecom.by Signed-off-by: Alexey Dobriyan Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmacache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmacache.c b/mm/vmacache.c index fd09dc9..035fdeb 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -87,11 +87,11 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) { int i; + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + if (!vmacache_valid(mm)) return NULL; - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - for (i = 0; i < VMACACHE_SIZE; i++) { struct vm_area_struct *vma = current->vmacache[i]; @@ -115,11 +115,11 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, { int i; + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + if (!vmacache_valid(mm)) return NULL; - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - for (i = 0; i < VMACACHE_SIZE; i++) { struct vm_area_struct *vma = current->vmacache[i]; -- cgit v1.1 From 6b53491598a4d9694318e6e2b11d8c9988a483d4 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 7 Oct 2016 16:58:42 -0700 Subject: mm, swap: add swap_cluster_list This is a code cleanup patch without functional changes. The swap_cluster_list data structure and its operations are introduced to provide some better encapsulation for the free cluster and discard cluster list operations. This avoids some code duplication, improves the code readability, and reduces the total number of lines.
[akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1472067356-16004-1-git-send-email-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Minchan Kim Acked-by: Rik van Riel Cc: Tim Chen Cc: Hugh Dickins Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 133 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 63 insertions(+), 70 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 2657acc..134c085 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -257,6 +257,53 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } +static inline bool cluster_list_empty(struct swap_cluster_list *list) +{ + return cluster_is_null(&list->head); +} + +static inline unsigned int cluster_list_first(struct swap_cluster_list *list) +{ + return cluster_next(&list->head); +} + +static void cluster_list_init(struct swap_cluster_list *list) +{ + cluster_set_null(&list->head); + cluster_set_null(&list->tail); +} + +static void cluster_list_add_tail(struct swap_cluster_list *list, + struct swap_cluster_info *ci, + unsigned int idx) +{ + if (cluster_list_empty(list)) { + cluster_set_next_flag(&list->head, idx, 0); + cluster_set_next_flag(&list->tail, idx, 0); + } else { + unsigned int tail = cluster_next(&list->tail); + + cluster_set_next(&ci[tail], idx); + cluster_set_next_flag(&list->tail, idx, 0); + } +} + +static unsigned int cluster_list_del_first(struct swap_cluster_list *list, + struct swap_cluster_info *ci) +{ + unsigned int idx; + + idx = cluster_next(&list->head); + if (cluster_next(&list->tail) == idx) { + cluster_set_null(&list->head); + cluster_set_null(&list->tail); + } else + cluster_set_next_flag(&list->head, + cluster_next(&ci[idx]), 0); + + return idx; +} + /* Add a cluster to discard list and schedule it to do discard */ static void swap_cluster_schedule_discard(struct swap_info_struct *si, unsigned int idx) @@ 
-270,17 +317,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD, SWAPFILE_CLUSTER); - if (cluster_is_null(&si->discard_cluster_head)) { - cluster_set_next_flag(&si->discard_cluster_head, - idx, 0); - cluster_set_next_flag(&si->discard_cluster_tail, - idx, 0); - } else { - unsigned int tail = cluster_next(&si->discard_cluster_tail); - cluster_set_next(&si->cluster_info[tail], idx); - cluster_set_next_flag(&si->discard_cluster_tail, - idx, 0); - } + cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); schedule_work(&si->discard_work); } @@ -296,15 +333,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) info = si->cluster_info; - while (!cluster_is_null(&si->discard_cluster_head)) { - idx = cluster_next(&si->discard_cluster_head); - - cluster_set_next_flag(&si->discard_cluster_head, - cluster_next(&info[idx]), 0); - if (cluster_next(&si->discard_cluster_tail) == idx) { - cluster_set_null(&si->discard_cluster_head); - cluster_set_null(&si->discard_cluster_tail); - } + while (!cluster_list_empty(&si->discard_clusters)) { + idx = cluster_list_del_first(&si->discard_clusters, info); spin_unlock(&si->lock); discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, @@ -312,19 +342,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) spin_lock(&si->lock); cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); - if (cluster_is_null(&si->free_cluster_head)) { - cluster_set_next_flag(&si->free_cluster_head, - idx, 0); - cluster_set_next_flag(&si->free_cluster_tail, - idx, 0); - } else { - unsigned int tail; - - tail = cluster_next(&si->free_cluster_tail); - cluster_set_next(&info[tail], idx); - cluster_set_next_flag(&si->free_cluster_tail, - idx, 0); - } + cluster_list_add_tail(&si->free_clusters, info, idx); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); } @@ -353,13 +371,8 @@ static void inc_cluster_info_page(struct swap_info_struct 
*p, if (!cluster_info) return; if (cluster_is_free(&cluster_info[idx])) { - VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx); - cluster_set_next_flag(&p->free_cluster_head, - cluster_next(&cluster_info[idx]), 0); - if (cluster_next(&p->free_cluster_tail) == idx) { - cluster_set_null(&p->free_cluster_tail); - cluster_set_null(&p->free_cluster_head); - } + VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx); + cluster_list_del_first(&p->free_clusters, cluster_info); cluster_set_count_flag(&cluster_info[idx], 0, 0); } @@ -398,14 +411,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p, } cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); - if (cluster_is_null(&p->free_cluster_head)) { - cluster_set_next_flag(&p->free_cluster_head, idx, 0); - cluster_set_next_flag(&p->free_cluster_tail, idx, 0); - } else { - unsigned int tail = cluster_next(&p->free_cluster_tail); - cluster_set_next(&cluster_info[tail], idx); - cluster_set_next_flag(&p->free_cluster_tail, idx, 0); - } + cluster_list_add_tail(&p->free_clusters, cluster_info, idx); } } @@ -421,8 +427,8 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, bool conflict; offset /= SWAPFILE_CLUSTER; - conflict = !cluster_is_null(&si->free_cluster_head) && - offset != cluster_next(&si->free_cluster_head) && + conflict = !cluster_list_empty(&si->free_clusters) && + offset != cluster_list_first(&si->free_clusters) && cluster_is_free(&si->cluster_info[offset]); if (!conflict) @@ -447,11 +453,11 @@ static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, new_cluster: cluster = this_cpu_ptr(si->percpu_cluster); if (cluster_is_null(&cluster->index)) { - if (!cluster_is_null(&si->free_cluster_head)) { - cluster->index = si->free_cluster_head; + if (!cluster_list_empty(&si->free_clusters)) { + cluster->index = si->free_clusters.head; cluster->next = cluster_next(&cluster->index) * SWAPFILE_CLUSTER; - } else if (!cluster_is_null(&si->discard_cluster_head)) { + } else if 
(!cluster_list_empty(&si->discard_clusters)) { /* * we don't have free cluster but have some clusters in * discarding, do discard now and reclaim them @@ -2292,10 +2298,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, nr_good_pages = maxpages - 1; /* omit header page */ - cluster_set_null(&p->free_cluster_head); - cluster_set_null(&p->free_cluster_tail); - cluster_set_null(&p->discard_cluster_head); - cluster_set_null(&p->discard_cluster_tail); + cluster_list_init(&p->free_clusters); + cluster_list_init(&p->discard_clusters); for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; @@ -2341,19 +2345,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, for (i = 0; i < nr_clusters; i++) { if (!cluster_count(&cluster_info[idx])) { cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); - if (cluster_is_null(&p->free_cluster_head)) { - cluster_set_next_flag(&p->free_cluster_head, - idx, 0); - cluster_set_next_flag(&p->free_cluster_tail, - idx, 0); - } else { - unsigned int tail; - - tail = cluster_next(&p->free_cluster_tail); - cluster_set_next(&cluster_info[tail], idx); - cluster_set_next_flag(&p->free_cluster_tail, - idx, 0); - } + cluster_list_add_tail(&p->free_clusters, cluster_info, + idx); } idx++; if (idx == nr_clusters) -- cgit v1.1 From 7ebffa45551fe7db86a2b32bf586f124ef484e6e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 7 Oct 2016 16:58:45 -0700 Subject: mm,oom_reaper: reduce find_lock_task_mm() usage Patch series "fortify oom killer even more", v2. This patch (of 9): __oom_reap_task() can be simplified a bit if it receives a valid mm from oom_reap_task() which also uses that mm when __oom_reap_task() failed. We can drop one find_lock_task_mm() call and also make the __oom_reap_task() code flow easier to follow. Moreover, this will make later patch in the series easier to review. 
Pinning mm's mm_count for longer time is not really harmful because this will not pin much memory. This patch doesn't introduce any functional change. Link: http://lkml.kernel.org/r/1472119394-11342-2-git-send-email-mhocko@kernel.org Signed-off-by: Tetsuo Handa Signed-off-by: Michal Hocko Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 81 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 38 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 463cdd2..87fad95 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -463,12 +463,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -static bool __oom_reap_task(struct task_struct *tsk) +static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; - struct mm_struct *mm = NULL; - struct task_struct *p; struct zap_details details = {.check_swap_entries = true, .ignore_dirty = true}; bool ret = true; @@ -476,7 +474,7 @@ static bool __oom_reap_task(struct task_struct *tsk) /* * We have to make sure to not race with the victim exit path * and cause premature new oom victim selection: - * __oom_reap_task exit_mm + * __oom_reap_task_mm exit_mm * mmget_not_zero * mmput * atomic_dec_and_test @@ -489,22 +487,9 @@ static bool __oom_reap_task(struct task_struct *tsk) */ mutex_lock(&oom_lock); - /* - * Make sure we find the associated mm_struct even when the particular - * thread has already terminated and cleared its mm. - * We might have race with exit path so consider our work done if there - * is no mm. 
- */ - p = find_lock_task_mm(tsk); - if (!p) - goto unlock_oom; - mm = p->mm; - atomic_inc(&mm->mm_count); - task_unlock(p); - if (!down_read_trylock(&mm->mmap_sem)) { ret = false; - goto mm_drop; + goto unlock_oom; } /* @@ -514,7 +499,7 @@ static bool __oom_reap_task(struct task_struct *tsk) */ if (!mmget_not_zero(mm)) { up_read(&mm->mmap_sem); - goto mm_drop; + goto unlock_oom; } tlb_gather_mmu(&tlb, mm, 0, -1); @@ -562,8 +547,6 @@ static bool __oom_reap_task(struct task_struct *tsk) * put the oom_reaper out of the way. */ mmput_async(mm); -mm_drop: - mmdrop(mm); unlock_oom: mutex_unlock(&oom_lock); return ret; @@ -573,36 +556,45 @@ unlock_oom: static void oom_reap_task(struct task_struct *tsk) { int attempts = 0; + struct mm_struct *mm = NULL; + struct task_struct *p = find_lock_task_mm(tsk); + + /* + * Make sure we find the associated mm_struct even when the particular + * thread has already terminated and cleared its mm. + * We might have race with exit path so consider our work done if there + * is no mm. + */ + if (!p) + goto done; + mm = p->mm; + atomic_inc(&mm->mm_count); + task_unlock(p); /* Retry the down_read_trylock(mmap_sem) a few times */ - while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk)) + while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm)) schedule_timeout_idle(HZ/10); - if (attempts > MAX_OOM_REAP_RETRIES) { - struct task_struct *p; + if (attempts <= MAX_OOM_REAP_RETRIES) + goto done; - pr_info("oom_reaper: unable to reap pid:%d (%s)\n", - task_pid_nr(tsk), tsk->comm); + pr_info("oom_reaper: unable to reap pid:%d (%s)\n", + task_pid_nr(tsk), tsk->comm); - /* - * If we've already tried to reap this task in the past and - * failed it probably doesn't make much sense to try yet again - * so hide the mm from the oom killer so that it can move on - * to another task with a different mm struct. 
- */ - p = find_lock_task_mm(tsk); - if (p) { - if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) { - pr_info("oom_reaper: giving up pid:%d (%s)\n", - task_pid_nr(tsk), tsk->comm); - set_bit(MMF_OOM_REAPED, &p->mm->flags); - } - task_unlock(p); - } - - debug_show_all_locks(); + /* + * If we've already tried to reap this task in the past and + * failed it probably doesn't make much sense to try yet again + * so hide the mm from the oom killer so that it can move on + * to another task with a different mm struct. + */ + if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &mm->flags)) { + pr_info("oom_reaper: giving up pid:%d (%s)\n", + task_pid_nr(tsk), tsk->comm); + set_bit(MMF_OOM_REAPED, &mm->flags); } + debug_show_all_locks(); +done: /* * Clear TIF_MEMDIE because the task shouldn't be sitting on a * reasonably reclaimable memory anymore or it is not a good candidate @@ -614,6 +606,9 @@ static void oom_reap_task(struct task_struct *tsk) /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); + /* Drop a reference taken above. */ + if (mm) + mmdrop(mm); } static int oom_reaper(void *unused) -- cgit v1.1 From 8496afaba93ece80a83cbd096f0675a1020ddfc4 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 7 Oct 2016 16:58:48 -0700 Subject: mm,oom_reaper: do not attempt to reap a task twice "mm, oom_reaper: do not attempt to reap a task twice" tried to give the OOM reaper one more chance to retry using MMF_OOM_NOT_REAPABLE flag. But the usefulness of the flag is rather limited and actually never shown in practice. If the flag is set, it means that the holder of mm->mmap_sem cannot call up_write() due to presumably being blocked at unkillable wait waiting for other thread's memory allocation. But since one of threads sharing that mm will queue that mm immediately via task_will_free_mem() shortcut (otherwise, oom_badness() will select the same mm again due to oom_score_adj value unchanged), retrying MMF_OOM_NOT_REAPABLE mm is unlikely helpful. 
Let's always set MMF_OOM_REAPED. Link: http://lkml.kernel.org/r/1472119394-11342-3-git-send-email-mhocko@kernel.org Signed-off-by: Tetsuo Handa Signed-off-by: Michal Hocko Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 87fad95..45097f5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -578,20 +578,11 @@ static void oom_reap_task(struct task_struct *tsk) if (attempts <= MAX_OOM_REAP_RETRIES) goto done; + /* Ignore this mm because somebody can't call up_write(mmap_sem). */ + set_bit(MMF_OOM_REAPED, &mm->flags); + pr_info("oom_reaper: unable to reap pid:%d (%s)\n", task_pid_nr(tsk), tsk->comm); - - /* - * If we've already tried to reap this task in the past and - * failed it probably doesn't make much sense to try yet again - * so hide the mm from the oom killer so that it can move on - * to another task with a different mm struct. - */ - if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &mm->flags)) { - pr_info("oom_reaper: giving up pid:%d (%s)\n", - task_pid_nr(tsk), tsk->comm); - set_bit(MMF_OOM_REAPED, &mm->flags); - } debug_show_all_locks(); done: -- cgit v1.1 From 26db62f179d112d345031e14926a4cda9cd40d6e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:58:51 -0700 Subject: oom: keep mm of the killed task available oom_reap_task has to call exit_oom_victim in order to make sure that the oom victim will not block the oom killer forever. This is, however, opening new problems (e.g. oom_killer_disable exclusion - see commit 74070542099c ("oom, suspend: fix oom_reaper vs. oom_killer_disable race")). exit_oom_victim should be only called from the victim's context ideally. One way to achieve this would be to rely on per mm_struct flags.
We already have MMF_OOM_REAPED to hide a task from the oom killer since "mm, oom: hide mm which is shared with kthread or global init". The problem is that the exit path: do_exit exit_mm tsk->mm = NULL; mmput __mmput exit_oom_victim doesn't guarantee that exit_oom_victim will get called in a bounded amount of time. At least exit_aio depends on IO which might get blocked due to lack of memory and who knows what else is lurking there. This patch takes a different approach. We remember tsk->mm into the signal_struct and bind it to the signal struct life time for all oom victims. __oom_reap_task_mm as well as oom_scan_process_thread do not have to rely on find_lock_task_mm anymore and they will have a reliable reference to the mm struct. As a result all the oom specific communication inside the OOM killer can be done via tsk->signal->oom_mm. Increasing the signal_struct for something as unlikely as the oom killer is far from ideal but this approach will make the code much more reasonable and long term we even might want to move task->mm into the signal_struct anyway. In the next step we might want to make the oom killer exclusion and access to memory reserves completely independent which would be also nice. Link: http://lkml.kernel.org/r/1472119394-11342-4-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 51 +++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 45097f5..f16ec08 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -300,14 +300,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) * any memory is quite low. 
*/ if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) { - struct task_struct *p = find_lock_task_mm(task); - bool reaped = false; - - if (p) { - reaped = test_bit(MMF_OOM_REAPED, &p->mm->flags); - task_unlock(p); - } - if (reaped) + if (test_bit(MMF_OOM_REAPED, &task->signal->oom_mm->flags)) goto next; goto abort; } @@ -537,11 +530,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) up_read(&mm->mmap_sem); /* - * This task can be safely ignored because we cannot do much more - * to release its memory. - */ - set_bit(MMF_OOM_REAPED, &mm->flags); - /* * Drop our reference but make sure the mmput slow path is called from a * different context because we shouldn't risk we get stuck there and * put the oom_reaper out of the way. @@ -556,20 +544,7 @@ unlock_oom: static void oom_reap_task(struct task_struct *tsk) { int attempts = 0; - struct mm_struct *mm = NULL; - struct task_struct *p = find_lock_task_mm(tsk); - - /* - * Make sure we find the associated mm_struct even when the particular - * thread has already terminated and cleared its mm. - * We might have race with exit path so consider our work done if there - * is no mm. - */ - if (!p) - goto done; - mm = p->mm; - atomic_inc(&mm->mm_count); - task_unlock(p); + struct mm_struct *mm = tsk->signal->oom_mm; /* Retry the down_read_trylock(mmap_sem) a few times */ while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm)) @@ -578,8 +553,6 @@ static void oom_reap_task(struct task_struct *tsk) if (attempts <= MAX_OOM_REAP_RETRIES) goto done; - /* Ignore this mm because somebody can't call up_write(mmap_sem). */ - set_bit(MMF_OOM_REAPED, &mm->flags); pr_info("oom_reaper: unable to reap pid:%d (%s)\n", task_pid_nr(tsk), tsk->comm); @@ -595,11 +568,14 @@ done: tsk->oom_reaper_list = NULL; exit_oom_victim(tsk); + /* + * Hide this mm from OOM killer because it has been either reaped or + * somebody can't call up_write(mmap_sem). 
+ */ + set_bit(MMF_OOM_REAPED, &mm->flags); + /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); - /* Drop a reference taken above. */ - if (mm) - mmdrop(mm); } static int oom_reaper(void *unused) @@ -665,14 +641,25 @@ static inline void wake_oom_reaper(struct task_struct *tsk) * * Has to be called with oom_lock held and never after * oom has been disabled already. + * + * tsk->mm has to be non NULL and caller has to guarantee it is stable (either + * under task_lock or operate on the current). */ static void mark_oom_victim(struct task_struct *tsk) { + struct mm_struct *mm = tsk->mm; + WARN_ON(oom_killer_disabled); /* OOM killer might race with memcg OOM */ if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) return; + atomic_inc(&tsk->signal->oom_victims); + + /* oom_mm is bound to the signal struct life time. */ + if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) + atomic_inc(&tsk->signal->oom_mm->mm_count); + /* * Make sure that the task is woken up from uninterruptible sleep * if it is frozen because OOM killer wouldn't be able to free -- cgit v1.1 From 862e3073b3eed13f17bd6be6ca6052db15c0b728 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:58:57 -0700 Subject: mm, oom: get rid of signal_struct::oom_victims After "oom: keep mm of the killed task available" we can safely detect an oom victim by checking task->signal->oom_mm so we do not need the signal_struct counter anymore so let's get rid of it. This alone wouldn't be sufficient for nommu archs because exit_oom_victim doesn't hide the process from the oom killer anymore. We can, however, mark the mm with a MMF flag in __mmput. We can reuse MMF_OOM_REAPED and rename it to a more generic MMF_OOM_SKIP. 
Link: http://lkml.kernel.org/r/1472119394-11342-6-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f16ec08..e2a2c35 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -186,7 +186,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, */ adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN || - test_bit(MMF_OOM_REAPED, &p->mm->flags) || + test_bit(MMF_OOM_SKIP, &p->mm->flags) || in_vfork(p)) { task_unlock(p); return 0; @@ -296,11 +296,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) /* * This task already has access to memory reserves and is being killed. * Don't allow any other task to have access to the reserves unless - * the task has MMF_OOM_REAPED because chances that it would release + * the task has MMF_OOM_SKIP because chances that it would release * any memory is quite low. */ - if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) { - if (test_bit(MMF_OOM_REAPED, &task->signal->oom_mm->flags)) + if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { + if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) goto next; goto abort; } @@ -572,7 +572,7 @@ done: * Hide this mm from OOM killer because it has been either reaped or * somebody can't call up_write(mmap_sem). */ - set_bit(MMF_OOM_REAPED, &mm->flags); + set_bit(MMF_OOM_SKIP, &mm->flags); /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); @@ -654,8 +654,6 @@ static void mark_oom_victim(struct task_struct *tsk) if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) return; - atomic_inc(&tsk->signal->oom_victims); - /* oom_mm is bound to the signal struct life time. 
*/ if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) atomic_inc(&tsk->signal->oom_mm->mm_count); @@ -677,7 +675,6 @@ void exit_oom_victim(struct task_struct *tsk) { if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) return; - atomic_dec(&tsk->signal->oom_victims); if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); @@ -769,7 +766,7 @@ static bool task_will_free_mem(struct task_struct *task) * This task has already been drained by the oom reaper so there are * only small chances it will free some more */ - if (test_bit(MMF_OOM_REAPED, &mm->flags)) + if (test_bit(MMF_OOM_SKIP, &mm->flags)) return false; if (atomic_read(&mm->mm_users) <= 1) @@ -906,7 +903,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) * killer to guarantee OOM forward progress. */ can_oom_reap = false; - set_bit(MMF_OOM_REAPED, &mm->flags); + set_bit(MMF_OOM_SKIP, &mm->flags); pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", task_pid_nr(victim), victim->comm, task_pid_nr(p), p->comm); -- cgit v1.1 From 7d2e7a22cf27e7569e6816ccc05dd74248048b30 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:59:00 -0700 Subject: oom, suspend: fix oom_killer_disable vs. pm suspend properly Commit 74070542099c ("oom, suspend: fix oom_reaper vs. oom_killer_disable race") worked around an existing race between oom_killer_disable and oom_reaper by adding another round of try_to_freeze_tasks after the oom killer was disabled. This was the easiest thing to do for a late 4.7 fix. Let's fix it properly now. After "oom: keep mm of the killed task available" we no longer have to call exit_oom_victim from the oom reaper because we have stable mm available and hide the oom_reaped mm by MMF_OOM_SKIP flag. So let's remove exit_oom_victim and the race described in the above commit doesn't exist anymore.
Unfortunately this alone is not sufficient for the oom_killer_disable usecase because now we do not have any reliable way to reach exit_oom_victim (the victim might get stuck on a way to exit for an unbounded amount of time). OOM killer can cope with that by checking mm flags and move on to another victim but we cannot do the same for oom_killer_disable as we would lose the guarantee of no further interference of the victim with the rest of the system. What we can do instead is to cap the maximum time the oom_killer_disable waits for victims. The only current user of this function (pm suspend) already has a concept of timeout for back off so we can reuse the same value there. Let's drop set_freezable for the oom_reaper kthread because it is no longer needed as the reaper doesn't wake or thaw any processes. Link: http://lkml.kernel.org/r/1472119394-11342-7-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e2a2c35..895a51f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -559,14 +559,7 @@ static void oom_reap_task(struct task_struct *tsk) debug_show_all_locks(); done: - /* - * Clear TIF_MEMDIE because the task shouldn't be sitting on a - * reasonably reclaimable memory anymore or it is not a good candidate - * for the oom victim right now because it cannot release its memory - * itself nor by the oom reaper. 
- */ tsk->oom_reaper_list = NULL; - exit_oom_victim(tsk); /* * Hide this mm from OOM killer because it has been either reaped or @@ -580,8 +573,6 @@ done: static int oom_reaper(void *unused) { - set_freezable(); - while (true) { struct task_struct *tsk = NULL; @@ -681,10 +672,20 @@ void exit_oom_victim(struct task_struct *tsk) } /** + * oom_killer_enable - enable OOM killer + */ +void oom_killer_enable(void) +{ + oom_killer_disabled = false; +} + +/** * oom_killer_disable - disable OOM killer + * @timeout: maximum timeout to wait for oom victims in jiffies * * Forces all page allocations to fail rather than trigger OOM killer. - * Will block and wait until all OOM victims are killed. + * Will block and wait until all OOM victims are killed or the given + * timeout expires. * * The function cannot be called when there are runnable user tasks because * the userspace would see unexpected allocation failures as a result. Any @@ -693,8 +694,10 @@ void exit_oom_victim(struct task_struct *tsk) * Returns true if successful and false if the OOM killer cannot be * disabled. */ -bool oom_killer_disable(void) +bool oom_killer_disable(signed long timeout) { + signed long ret; + /* * Make sure to not race with an ongoing OOM killer. Check that the * current is not killed (possibly due to sharing the victim's memory). 
@@ -704,19 +707,16 @@ bool oom_killer_disable(void) oom_killer_disabled = true; mutex_unlock(&oom_lock); - wait_event(oom_victims_wait, !atomic_read(&oom_victims)); + ret = wait_event_interruptible_timeout(oom_victims_wait, + !atomic_read(&oom_victims), timeout); + if (ret <= 0) { + oom_killer_enable(); + return false; + } return true; } -/** - * oom_killer_enable - enable OOM killer - */ -void oom_killer_enable(void) -{ - oom_killer_disabled = false; -} - static inline bool __task_will_free_mem(struct task_struct *task) { struct signal_struct *sig = task->signal; -- cgit v1.1 From 38531201c12144cd7d96abfdfe7449c2b01375e8 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 7 Oct 2016 16:59:03 -0700 Subject: mm, oom: enforce exit_oom_victim on current task There are no users of exit_oom_victim on !current task anymore so enforce the API to always work on the current. Link: http://lkml.kernel.org/r/1472119394-11342-8-git-send-email-mhocko@kernel.org Signed-off-by: Tetsuo Handa Signed-off-by: Michal Hocko Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 895a51f..3b99054 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -662,10 +662,9 @@ static void mark_oom_victim(struct task_struct *tsk) /** * exit_oom_victim - note the exit of an OOM victim */ -void exit_oom_victim(struct task_struct *tsk) +void exit_oom_victim(void) { - if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) - return; + clear_thread_flag(TIF_MEMDIE); if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); -- cgit v1.1 From 3f70dc38cec2ad6e5355f80c4c7a15a3f7e97a19 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:59:06 -0700 Subject: mm: make sure that kthreads will not refault oom reaped memory There are only few use_mm() users in the kernel right now. 
Most of them write to the target memory but vhost driver relies on copy_from_user/get_user from a kernel thread context. This makes it impossible to reap the memory of an oom victim which shares the mm with the vhost kernel thread because it could see a zero page unexpectedly and theoretically make an incorrect decision visible outside of the killed task context. To quote Michael S. Tsirkin: : Getting an error from __get_user and friends is handled gracefully. : Getting zero instead of a real value will cause userspace : memory corruption. The vhost kernel thread is bound to an open fd of the vhost device which is not tied to the mm owner life cycle in general. The device fd can be inherited or passed over to another process which means that we really have to be careful about unexpected memory corruption because unlike for normal oom victims the result will be visible outside of the oom victim context. Make sure that no kthread context (users of use_mm) can ever see corrupted data because of the oom reaper and hook into the page fault path by checking MMF_UNSTABLE mm flag. __oom_reap_task_mm will set the flag before it starts unmapping the address space while the flag is checked after the page fault has been handled. If the flag is set then SIGBUS is triggered so any g-u-p user will get an error code. Regular tasks do not need this protection because all tasks which share the mm are killed when the mm is reaped and so the corruption will not outlive them. This patch shouldn't have any visible effect at this moment because the OOM killer doesn't invoke oom reaper for tasks with mm shared with kthreads yet. Link: http://lkml.kernel.org/r/1472119394-11342-9-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: "Michael S. 
Tsirkin" Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 13 +++++++++++++ mm/oom_kill.c | 8 ++++++++ 2 files changed, 21 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index f1a6804..4bfc3a9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3658,6 +3658,19 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, mem_cgroup_oom_synchronize(false); } + /* + * This mm has been already reaped by the oom reaper and so the + * refault cannot be trusted in general. Anonymous refaults would + * lose data and give a zero page instead e.g. This is especially + * problem for use_mm() because regular tasks will just die and + * the corrupted data will not be visible anywhere while kthread + * will outlive the oom victim and potentially propagate the date + * further. + */ + if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR) + && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) + ret = VM_FAULT_SIGBUS; + return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3b99054..5a3ba96 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -495,6 +495,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) goto unlock_oom; } + /* + * Tell all users of get_user/copy_from_user etc... that the content + * is no longer stable. No barriers really needed because unmapping + * should imply barriers already and the reader would hit a page fault + * if it stumbled over a reaped memory. 
+ */ + set_bit(MMF_UNSTABLE, &mm->flags); + tlb_gather_mmu(&tlb, mm, 0, -1); for (vma = mm->mmap ; vma; vma = vma->vm_next) { if (is_vm_hugetlb_page(vma)) -- cgit v1.1 From 1b51e65eab64fac72cab009691e8ca9915624876 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:59:09 -0700 Subject: oom, oom_reaper: allow to reap mm shared by the kthreads oom reaper was skipped for an mm which is shared with the kernel thread (aka use_mm()). The primary concern was that such a kthread might want to read from the userspace memory and see zero page as a result of the oom reaper action. This is no longer a problem after "mm: make sure that kthreads will not refault oom reaped memory" because any attempt to fault in when the MMF_UNSTABLE is set will result in SIGBUS and so the target user should see an error. This means that we can finally allow oom reaper also to tasks which share their mm with kthreads. Link: http://lkml.kernel.org/r/1472119394-11342-10-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5a3ba96..10f6869 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -902,13 +902,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) continue; if (same_thread_group(p, victim)) continue; - if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) { - /* - * We cannot use oom_reaper for the mm shared by this - * process because it wouldn't get killed and so the - * memory might be still used. Hide the mm from the oom - * killer to guarantee OOM forward progress. 
- */ + if (is_global_init(p)) { can_oom_reap = false; set_bit(MMF_OOM_SKIP, &mm->flags); pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", @@ -916,6 +910,12 @@ static void oom_kill_process(struct oom_control *oc, const char *message) task_pid_nr(p), p->comm); continue; } + /* + * No use_mm() user needs to read from the userspace so we are + * ok to reap it. + */ + if (unlikely(p->flags & PF_KTHREAD)) + continue; do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } rcu_read_unlock(); -- cgit v1.1 From c9634cf012321243ee8e4ea0fb0709904cd58395 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 7 Oct 2016 16:59:12 -0700 Subject: mm: use zonelist name instead of using hardcoded index Use the existing enums instead of hardcoded index when looking at the zonelist. This makes it more readable. No functionality change by this patch. Link: http://lkml.kernel.org/r/1472227078-24852-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- mm/page_alloc.c | 8 ++++---- mm/vmscan.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2da72a5..ad1c96a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1749,7 +1749,7 @@ unsigned int mempolicy_slab_node(void) */ struct zonelist *zonelist; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); - zonelist = &NODE_DATA(node)->node_zonelists[0]; + zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes); return z->zone ? 
z->zone->node : node; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b0f133f..f6a5a22 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4574,7 +4574,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) int j; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; j = build_zonelists_node(NODE_DATA(node), zonelist, j); @@ -4590,7 +4590,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) int j; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[1]; + zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; j = build_zonelists_node(pgdat, zonelist, 0); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; @@ -4611,7 +4611,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) struct zone *z; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; pos = 0; for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { for (j = 0; j < nr_nodes; j++) { @@ -4746,7 +4746,7 @@ static void build_zonelists(pg_data_t *pgdat) local_node = pgdat->node_id; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; j = build_zonelists_node(pgdat, zonelist, 0); /* diff --git a/mm/vmscan.c b/mm/vmscan.c index d3715c1..744f926 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3036,7 +3036,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, */ nid = mem_cgroup_select_victim_node(memcg); - zonelist = NODE_DATA(nid)->node_zonelists; + zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; trace_mm_vmscan_memcg_reclaim_begin(0, sc.may_writepage, -- cgit v1.1 From f6f34b4387d9e18304451a131b35d7c4f27a0b5a Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 7 Oct 2016 16:59:15 -0700 Subject: mm: introduce arch_reserved_kernel_pages() 
Currently arch specific code can reserve memory blocks but alloc_large_system_hash() may not take it into consideration when sizing the hashes. This can lead to bigger hash than required and lead to no available memory for other purposes. This is specifically true for systems with CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled. One approach to solve this problem would be to walk through the memblock regions and calculate the available memory and base the size of hash system on the available memory. The other approach would be to depend on the architecture to provide the number of pages that are reserved. This change provides hooks to allow the architecture to provide the required info. Link: http://lkml.kernel.org/r/1472476010-4709-2-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Srikar Dronamraju Suggested-by: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Cc: Michael Ellerman Cc: Mahesh Salgaonkar Cc: Hari Bathini Cc: Dave Hansen Cc: Balbir Singh Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f6a5a22..e00f545 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6940,6 +6940,17 @@ static int __init set_hashdist(char *str) __setup("hashdist=", set_hashdist); #endif +#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES +/* + * Returns the number of pages that arch has reserved but + * is not known to alloc_large_system_hash(). 
+ */ +static unsigned long __init arch_reserved_kernel_pages(void) +{ + return 0; +} +#endif + /* * allocate a large system hash table from bootmem * - it is assumed that the hash table must contain an exact power-of-2 @@ -6964,6 +6975,7 @@ void *__init alloc_large_system_hash(const char *tablename, if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; + numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ if (PAGE_SHIFT < 20) -- cgit v1.1 From 8907de5dc6e9d5925cf3b0a698cc3a4272fda073 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 7 Oct 2016 16:59:18 -0700 Subject: mm/memblock.c: expose total reserved memory The total reserved memory in a system is accounted but not available for use use outside mm/memblock.c. By exposing the total reserved memory, systems can better calculate the size of large hashes. Link: http://lkml.kernel.org/r/1472476010-4709-3-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Srikar Dronamraju Suggested-by: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Cc: Michael Ellerman Cc: Mahesh Salgaonkar Cc: Hari Bathini Cc: Dave Hansen Cc: Balbir Singh Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 483197e..c8dfa43 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1438,6 +1438,11 @@ phys_addr_t __init_memblock memblock_phys_mem_size(void) return memblock.memory.total_size; } +phys_addr_t __init_memblock memblock_reserved_size(void) +{ + return memblock.reserved.total_size; +} + phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) { unsigned long pages = 0; -- cgit v1.1 From 2382705f22c1436a153800cf6051b08f0ea14838 Mon Sep 17 00:00:00 2001 From: zijun_hu Date: Fri, 7 Oct 2016 16:59:24 -0700 Subject: mm/nobootmem.c: remove duplicate macro ARCH_LOW_ADDRESS_LIMIT 
statements Fix the following bugs: - the same ARCH_LOW_ADDRESS_LIMIT statements are duplicated between header and relevant source - don't ensure ARCH_LOW_ADDRESS_LIMIT perhaps defined by ARCH in asm/processor.h is preferred over default in linux/bootmem.h completely since the former header isn't included by the latter Link: http://lkml.kernel.org/r/e046aeaa-e160-6d9e-dc1b-e084c2fd999f@zoho.com Signed-off-by: zijun_hu Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bd05a70..490d46a 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -11,18 +11,21 @@ #include #include #include -#include #include #include #include #include +#include #include #include -#include #include "internal.h" +#ifndef CONFIG_HAVE_MEMBLOCK +#error CONFIG_HAVE_MEMBLOCK not defined +#endif + #ifndef CONFIG_NEED_MULTIPLE_NODES struct pglist_data __refdata contig_page_data; EXPORT_SYMBOL(contig_page_data); @@ -395,9 +398,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, return __alloc_bootmem_node(pgdat, size, align, goal); } -#ifndef ARCH_LOW_ADDRESS_LIMIT -#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL -#endif /** * __alloc_bootmem_low - allocate low boot memory -- cgit v1.1 From 1d8bf926f8739bd35d054097907fef35d881e403 Mon Sep 17 00:00:00 2001 From: zijun_hu Date: Fri, 7 Oct 2016 16:59:27 -0700 Subject: mm/bootmem.c: replace kzalloc() by kzalloc_node() In ___alloc_bootmem_node_nopanic(), replace kzalloc() by kzalloc_node() in order to allocate memory within given node preferentially when slab is available Link: http://lkml.kernel.org/r/1f487f12-6af4-5e4f-a28c-1de2361cdcd8@zoho.com Signed-off-by: zijun_hu Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'mm') diff --git 
a/mm/bootmem.c b/mm/bootmem.c index 0aa7dda..a869f84 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -11,15 +11,12 @@ #include #include #include -#include #include #include #include -#include #include #include - -#include +#include #include "internal.h" @@ -712,7 +709,7 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, void *ptr; if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); again: /* do not panic in alloc_bootmem_bdata() */ @@ -738,9 +735,6 @@ again: void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); } @@ -812,10 +806,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, } -#ifndef ARCH_LOW_ADDRESS_LIMIT -#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL -#endif - /** * __alloc_bootmem_low - allocate low boot memory * @size: size of the request in bytes -- cgit v1.1 From 371a096edf43a8c71844cf71c20765c8b21d07d9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 7 Oct 2016 16:59:30 -0700 Subject: mm: don't use radix tree writeback tags for pages in swap cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit File pages use a set of radix tree tags (DIRTY, TOWRITE, WRITEBACK, etc.) to accelerate finding the pages with a specific tag in the radix tree during inode writeback. But for anonymous pages in the swap cache, there is no inode writeback. So there is no need to find the pages with some writeback tags in the radix tree. It is not necessary to touch radix tree writeback tags for pages in the swap cache. Per Rik van Riel's suggestion, a new flag AS_NO_WRITEBACK_TAGS is introduced for address spaces which don't need to update the writeback tags. 
The flag is set for swap caches. It may be used for DAX file systems, etc. With this patch, the swap out bandwidth improved 22.3% (from ~1.2GB/s to ~1.48GBps) in the vm-scalability swap-w-seq test case with 8 processes. The test is done on a Xeon E5 v3 system. The swap device used is a RAM simulated PMEM (persistent memory) device. The improvement comes from the reduced contention on the swap cache radix tree lock. To test sequential swapping out, the test case uses 8 processes, which sequentially allocate and write to the anonymous pages until RAM and part of the swap device is used up. Details of comparison is as follow, base base+patch ---------------- -------------------------- %stddev %change %stddev \ | \ 2506952 ± 2% +28.1% 3212076 ± 7% vm-scalability.throughput 1207402 ± 7% +22.3% 1476578 ± 6% vmstat.swap.so 10.86 ± 12% -23.4% 8.31 ± 16% perf-profile.cycles-pp._raw_spin_lock_irq.__add_to_swap_cache.add_to_swap_cache.add_to_swap.shrink_page_list 10.82 ± 13% -33.1% 7.24 ± 14% perf-profile.cycles-pp._raw_spin_lock_irqsave.__remove_mapping.shrink_page_list.shrink_inactive_list.shrink_zone_memcg 10.36 ± 11% -100.0% 0.00 ± -1% perf-profile.cycles-pp._raw_spin_lock_irqsave.__test_set_page_writeback.bdev_write_page.__swap_writepage.swap_writepage 10.52 ± 12% -100.0% 0.00 ± -1% perf-profile.cycles-pp._raw_spin_lock_irqsave.test_clear_page_writeback.end_page_writeback.page_endio.pmem_rw_page Link: http://lkml.kernel.org/r/1472578089-5560-1-git-send-email-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Rik van Riel Cc: Hugh Dickins Cc: Shaohua Li Cc: Minchan Kim Cc: Mel Gorman Cc: Tejun Heo Cc: Wu Fengguang Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 4 ++-- mm/swap_state.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5ed3381..439cc63 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2716,7 +2716,7 
@@ int test_clear_page_writeback(struct page *page) int ret; lock_page_memcg(page); - if (mapping) { + if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; @@ -2759,7 +2759,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) int ret; lock_page_memcg(page); - if (mapping) { + if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; diff --git a/mm/swap_state.c b/mm/swap_state.c index c8310a3..268b819 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -37,6 +37,8 @@ struct address_space swapper_spaces[MAX_SWAPFILES] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, + /* swap cache doesn't use writeback related tags */ + .flags = 1 << AS_NO_WRITEBACK_TAGS, } }; -- cgit v1.1 From 9254990fb9f0f15f25605748da20cfbeced7c816 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:59:33 -0700 Subject: oom: warn if we go OOM for higher order and compaction is disabled Since the lumpy reclaim is gone there is no source of higher order pages if CONFIG_COMPACTION=n except for the order-0 pages reclaim which is unreliable for that purpose to say the least. Hitting an OOM for !costly higher order requests is therefore all not that hard to imagine. We are trying hard to not invoke OOM killer as much as possible but there is simply no reliable way to detect whether more reclaim retries make sense. Disabling COMPACTION is not widespread but it seems that some users might have disable the feature without realizing full consequences (mostly along with disabling THP because compaction used to be THP mainly thing). This patch just adds a note if the OOM killer was triggered by higher order request with compaction disabled. 
This will help us identify possible misconfiguration right from the oom report which is easier than to always keep in mind that somebody might have disabled COMPACTION without a good reason. Link: http://lkml.kernel.org/r/20160830111632.GD23963@dhcp22.suse.cz Signed-off-by: Michal Hocko Cc: David Rientjes Cc: Johannes Weiner Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 10f6869..0034baf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -406,6 +406,8 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, current->signal->oom_score_adj); + if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) + pr_warn("COMPACTION is disabled!!!\n"); cpuset_print_current_mems_allowed(); dump_stack(); -- cgit v1.1 From 0cf2f6f6dc605e587d2c1120f295934c77e810e8 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Fri, 7 Oct 2016 16:59:36 -0700 Subject: mm: mlock: check against vma for actual mlock() size In do_mlock(), the check against locked memory limitation has a hole which will fail following cases at step 3): 1) User has a memory chunk from addressA with 50k, and user mem lock rlimit is 64k. 2) mlock(addressA, 30k) 3) mlock(addressA, 40k) The 3rd step should have been allowed since the 40k request is intersected with the previous 30k at step 2), and the 3rd step is actually for mlock on the extra 10k memory. This patch checks vma to calculate the actual "new" mlock size, if necessary, and adjust the logic to fix this issue. 
[akpm@linux-foundation.org: clean up comment layout] [wei.guo.simon@gmail.com: correct a typo in count_mm_mlocked_page_nr()] Link: http://lkml.kernel.org/r/1473325970-11393-2-git-send-email-wei.guo.simon@gmail.com Link: http://lkml.kernel.org/r/1472554781-9835-2-git-send-email-wei.guo.simon@gmail.com Signed-off-by: Simon Guo Cc: Alexey Klimov Cc: Eric B Munson Cc: Geert Uytterhoeven Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Michal Hocko Cc: Shuah Khan Cc: Simon Guo Cc: Thierry Reding Cc: Vlastimil Babka Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 14645be..b1fec89 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -617,6 +617,45 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, return error; } +/* + * Go through vma areas and sum size of mlocked + * vma pages, as return value. + * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT) + * is also counted. 
+ * Return value: previously mlocked page counts + */ +static int count_mm_mlocked_page_nr(struct mm_struct *mm, + unsigned long start, size_t len) +{ + struct vm_area_struct *vma; + int count = 0; + + if (mm == NULL) + mm = current->mm; + + vma = find_vma(mm, start); + if (vma == NULL) + vma = mm->mmap; + + for (; vma ; vma = vma->vm_next) { + if (start >= vma->vm_end) + continue; + if (start + len <= vma->vm_start) + break; + if (vma->vm_flags & VM_LOCKED) { + if (start > vma->vm_start) + count -= (start - vma->vm_start); + if (start + len < vma->vm_end) { + count += start + len - vma->vm_start; + break; + } + count += vma->vm_end - vma->vm_start; + } + } + + return count >> PAGE_SHIFT; +} + static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; @@ -639,6 +678,16 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla return -EINTR; locked += current->mm->locked_vm; + if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) { + /* + * It is possible that the regions requested intersect with + * previously mlocked areas, that part area in "mm->locked_vm" + * should not be counted to new mlock increment count. So check + * and adjust locked count if necessary. + */ + locked -= count_mm_mlocked_page_nr(current->mm, + start, len); + } /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) -- cgit v1.1 From b155b4fde5bdde9fed439cd1f5ea07173df2ed31 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Fri, 7 Oct 2016 16:59:40 -0700 Subject: mm: mlock: avoid increase mm->locked_vm on mlock() when already mlock2(,MLOCK_ONFAULT) When one vma was with flag VM_LOCKED|VM_LOCKONFAULT (by invoking mlock2(,MLOCK_ONFAULT)), it can again be populated with mlock() with VM_LOCKED flag only. There is a hole in mlock_fixup() which increase mm->locked_vm twice even the two operations are on the same vma and both with VM_LOCKED flags. 
The issue can be reproduced by the following code: mlock2(p, 1024 * 64, MLOCK_ONFAULT); //VM_LOCKED|VM_LOCKONFAULT mlock(p, 1024 * 64); //VM_LOCKED Then check the increased VmLck field in /proc/pid/status(to 128k). When vma is set with different vm_flags, and the new vm_flags is with VM_LOCKED, it is not necessarily a "new locked" vma. This patch corrects this bug by preventing mm->locked_vm from being incremented when old vm_flags is already VM_LOCKED. Link: http://lkml.kernel.org/r/1472554781-9835-3-git-send-email-wei.guo.simon@gmail.com Signed-off-by: Simon Guo Acked-by: Kirill A. Shutemov Cc: Alexey Klimov Cc: Eric B Munson Cc: Geert Uytterhoeven Cc: Mel Gorman Cc: Michal Hocko Cc: Shuah Khan Cc: Simon Guo Cc: Thierry Reding Cc: Vlastimil Babka Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index b1fec89..145a425 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -516,6 +516,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, int nr_pages; int ret = 0; int lock = !!(newflags & VM_LOCKED); + vm_flags_t old_flags = vma->vm_flags; if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) @@ -550,6 +551,8 @@ success: nr_pages = (end - start) >> PAGE_SHIFT; if (!lock) nr_pages = -nr_pages; + else if (old_flags & VM_LOCKED) + nr_pages = 0; mm->locked_vm += nr_pages; /* -- cgit v1.1 From 74d2fad1334d12bac8fe017aba598dd66c86628b Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 7 Oct 2016 16:59:56 -0700 Subject: thp, dax: add thp_get_unmapped_area for pmd mappings When CONFIG_FS_DAX_PMD is set, DAX supports mmap() using pmd page size. This feature relies on both mmap virtual address and FS block (i.e. physical address) to be aligned by the pmd page size. Users can use mkfs options to specify FS to align block allocations. 
However, aligning mmap address requires code changes to existing applications for providing a pmd-aligned address to mmap(). For instance, fio with "ioengine=mmap" performs I/Os with mmap() [1]. It calls mmap() with a NULL address, which needs to be changed to provide a pmd-aligned address for testing with DAX pmd mappings. Changing all applications that call mmap() with NULL is undesirable. Add thp_get_unmapped_area(), which can be called by filesystem's get_unmapped_area to align an mmap address by the pmd size for a DAX file. It calls the default handler, mm->get_unmapped_area(), to find a range and then aligns it for a DAX file. The patch is based on Matthew Wilcox's change that allows adding support of the pud page size easily. [1]: https://github.com/axboe/fio/blob/master/engines/mmap.c Link: http://lkml.kernel.org/r/1472497881-9323-2-git-send-email-toshi.kani@hpe.com Signed-off-by: Toshi Kani Reviewed-by: Dan Williams Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Kirill A. Shutemov Cc: Dave Chinner Cc: Jan Kara Cc: Theodore Ts'o Cc: Andreas Dilger Cc: Mike Kravetz Cc: "Kirill A. 
Shutemov" Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 283583f..a0b0e56 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -469,6 +469,49 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } +unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len, + loff_t off, unsigned long flags, unsigned long size) +{ + unsigned long addr; + loff_t off_end = off + len; + loff_t off_align = round_up(off, size); + unsigned long len_pad; + + if (off_end <= off_align || (off_end - off_align) < size) + return 0; + + len_pad = len + size; + if (len_pad < len || (off + len_pad) < off) + return 0; + + addr = current->mm->get_unmapped_area(filp, 0, len_pad, + off >> PAGE_SHIFT, flags); + if (IS_ERR_VALUE(addr)) + return 0; + + addr += (off - addr) & (size - 1); + return addr; +} + +unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + loff_t off = (loff_t)pgoff << PAGE_SHIFT; + + if (addr) + goto out; + if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD)) + goto out; + + addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE); + if (addr) + return addr; + + out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} +EXPORT_SYMBOL_GPL(thp_get_unmapped_area); + static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, gfp_t gfp) { -- cgit v1.1 From 03e86dba5b628a13a58adae62e5b918b969ae93e Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 7 Oct 2016 17:00:02 -0700 Subject: cpu: fix node state for whether it contains CPU In current kernel code, we only call node_set_state(cpu_to_node(cpu), N_CPU) when a cpu is hot plugged. 
But we do not set the node state for N_CPU when the cpus are brought online during boot. So this could lead to failure when we check to see if a node contains cpu with node_state(node_id, N_CPU). One use case is in the node_reclaim function: /* * Only run node reclaim on the local node or on nodes that do * not * have associated processors. This will favor the local * processor * over remote processors and spread off node memory allocations * as wide as possible. */ if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) return NODE_RECLAIM_NOSCAN; I instrumented the kernel to call the function below after boot and it always returns 0 on an x86 desktop machine until I apply the attached patch. int num_cpu_node(void) { int i, nr_cpu_nodes = 0; for_each_node(i) { if (node_state(i, N_CPU)) ++ nr_cpu_nodes; } return nr_cpu_nodes; } Fix this by checking each node for online CPU when we initialize vmstat that's responsible for maintaining node state. Link: http://lkml.kernel.org/r/20160829175922.GA21775@linux.intel.com Signed-off-by: Tim Chen Acked-by: David Rientjes Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Tim Chen Cc: Cc: Ying Cc: Andi Kleen Cc: Dave Hansen Cc: Dan Williams Cc: "Rafael J.
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index dc04e76..73aab31 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1715,6 +1715,16 @@ static void __init start_shepherd_timer(void) round_jiffies_relative(sysctl_stat_interval)); } +static void __init init_cpu_node_state(void) +{ + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + node_set_state(cpu_to_node(cpu), N_CPU); + put_online_cpus(); +} + static void vmstat_cpu_dead(int node) { int cpu; @@ -1772,6 +1782,7 @@ static int __init setup_vmstat(void) #ifdef CONFIG_SMP cpu_notifier_register_begin(); __register_cpu_notifier(&vmstat_notifier); + init_cpu_node_state(); start_shepherd_timer(); cpu_notifier_register_done(); -- cgit v1.1 From 6fcb52a56ff60d240f06296b12827e7f20d45f63 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Fri, 7 Oct 2016 17:00:08 -0700 Subject: thp: reduce usage of huge zero page's atomic counter The global zero page is used to satisfy an anonymous read fault. If THP(Transparent HugePage) is enabled then the global huge zero page is used. The global huge zero page uses an atomic counter for reference counting and is allocated/freed dynamically according to its counter value. CPU time spent on that counter will greatly increase if there are a lot of processes doing anonymous read faults. This patch proposes a way to reduce the access to the global counter so that the CPU load can be reduced accordingly. To do this, a new flag of the mm_struct is introduced: MMF_HUGE_ZERO_PAGE. With this flag, the process only needs to touch the global counter in two cases: 1 The first time it uses the global huge zero page; 2 The time when mm_users of its mm_struct reaches zero. Note that right now, the huge zero page is eligible to be freed as soon as its last use goes away.
With this patch, the page will not be eligible to be freed until the exit of the last process from which it was ever used. And with the use of mm_users, the kthread is not eligible to use huge zero page either. Since no kthread is using huge zero page today, there is no difference after applying this patch. But if that is not desired, I can change it to when mm_count reaches zero. Case used for test on Haswell EP: usemem -n 72 --readonly -j 0x200000 100G Which spawns 72 processes and each will mmap 100G anonymous space and then do read only access to that space sequentially with a step of 2MB. CPU cycles from perf report for base commit: 54.03% usemem [kernel.kallsyms] [k] get_huge_zero_page CPU cycles from perf report for this commit: 0.11% usemem [kernel.kallsyms] [k] mm_get_huge_zero_page Performance(throughput) of the workload for base commit: 1784430792 Performance(throughput) of the workload for this commit: 4726928591 164% increase. Runtime of the workload for base commit: 707592 us Runtime of the workload for this commit: 303970 us 50% drop. Link: http://lkml.kernel.org/r/fe51a88f-446a-4622-1363-ad1282d71385@intel.com Signed-off-by: Aaron Lu Cc: Sergey Senozhatsky Cc: "Kirill A.
Shutemov" Cc: Dave Hansen Cc: Tim Chen Cc: Huang Ying Cc: Vlastimil Babka Cc: Jerome Marchand Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Ebru Akagunduz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 36 +++++++++++++++++++++++++----------- mm/swap.c | 4 +--- mm/swap_state.c | 4 +--- 3 files changed, 27 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a0b0e56..12b9f1a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker; static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; -struct page *get_huge_zero_page(void) +static struct page *get_huge_zero_page(void) { struct page *zero_page; retry: @@ -86,7 +86,7 @@ retry: return READ_ONCE(huge_zero_page); } -void put_huge_zero_page(void) +static void put_huge_zero_page(void) { /* * Counter should never go to zero here. Only shrinker can put @@ -95,6 +95,26 @@ void put_huge_zero_page(void) BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); } +struct page *mm_get_huge_zero_page(struct mm_struct *mm) +{ + if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + return READ_ONCE(huge_zero_page); + + if (!get_huge_zero_page()) + return NULL; + + if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + put_huge_zero_page(); + + return READ_ONCE(huge_zero_page); +} + +void mm_put_huge_zero_page(struct mm_struct *mm) +{ + if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + put_huge_zero_page(); +} + static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -644,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; - zero_page = get_huge_zero_page(); + zero_page = mm_get_huge_zero_page(vma->vm_mm); if (unlikely(!zero_page)) { pte_free(vma->vm_mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); @@ -666,10 +686,8 @@ int 
do_huge_pmd_anonymous_page(struct fault_env *fe) } } else spin_unlock(fe->ptl); - if (!set) { + if (!set) pte_free(vma->vm_mm, pgtable); - put_huge_zero_page(); - } return ret; } gfp = alloc_hugepage_direct_gfpmask(vma); @@ -823,7 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, * since we already have a zero page to copy. It just takes a * reference. */ - zero_page = get_huge_zero_page(); + zero_page = mm_get_huge_zero_page(dst_mm); set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, zero_page); ret = 0; @@ -1081,7 +1099,6 @@ alloc: update_mmu_cache_pmd(vma, fe->address, fe->pmd); if (!page) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - put_huge_zero_page(); } else { VM_BUG_ON_PAGE(!PageHead(page), page); page_remove_rmap(page, true); @@ -1542,7 +1559,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, } smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - put_huge_zero_page(); } static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, @@ -1565,8 +1581,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (!vma_is_anonymous(vma)) { _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); - if (is_huge_zero_pmd(_pmd)) - put_huge_zero_page(); if (vma_is_dax(vma)) return; page = pmd_page(_pmd); diff --git a/mm/swap.c b/mm/swap.c index 75c63bb..4dcf852 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -748,10 +748,8 @@ void release_pages(struct page **pages, int nr, bool cold) locked_pgdat = NULL; } - if (is_huge_zero_page(page)) { - put_huge_zero_page(); + if (is_huge_zero_page(page)) continue; - } page = compound_head(page); if (!put_page_testzero(page)) diff --git a/mm/swap_state.c b/mm/swap_state.c index 268b819..8679c99 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -254,9 +254,7 @@ static inline void free_swap_cache(struct page *page) void free_page_and_swap_cache(struct page *page) { free_swap_cache(page); - if 
(is_huge_zero_page(page)) - put_huge_zero_page(); - else + if (!is_huge_zero_page(page)) put_page(page); } -- cgit v1.1 From 0247f3f4d78a475cd3181dc9fc162fdef773aaaa Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 7 Oct 2016 17:00:12 -0700 Subject: mm/memcontrol.c: make the walk_page_range() limit obvious mem_cgroup_count_precharge() and mem_cgroup_move_charge() both call walk_page_range() on the range 0 to ~0UL, neither provide a pte_hole callback, which causes the current implementation to skip non-vma regions. This is all fine but follow up changes would like to make walk_page_range more generic so it is better to be explicit about which range to traverse so let's use highest_vm_end to explicitly traverse only user mmaped memory. [mhocko@kernel.org: rewrote changelog] Link: http://lkml.kernel.org/r/1472655897-22532-1-git-send-email-james.morse@arm.com Signed-off-by: James Morse Acked-by: Naoya Horiguchi Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5579e76..0739d41 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4681,7 +4681,8 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) .mm = mm, }; down_read(&mm->mmap_sem); - walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk); + walk_page_range(0, mm->highest_vm_end, + &mem_cgroup_count_precharge_walk); up_read(&mm->mmap_sem); precharge = mc.precharge; @@ -4969,7 +4970,8 @@ retry: * When we have consumed all precharges and failed in doing * additional charge, the page walk just aborts. 
*/ - walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); + walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk); + up_read(&mc.mm->mmap_sem); atomic_dec(&mc.from->moving_account); } -- cgit v1.1 From 87744ab3832b83ba71b931f86f9cfdb000d07da5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 7 Oct 2016 17:00:18 -0700 Subject: mm: fix cache mode tracking in vm_insert_mixed() vm_insert_mixed() unlike vm_insert_pfn_prot() and vmf_insert_pfn_pmd(), fails to check the pgprot_t it uses for the mapping against the one recorded in the memtype tracking tree. Add the missing call to track_pfn_insert() to preclude cases where incompatible aliased mappings are established for a given physical address range. Link: http://lkml.kernel.org/r/147328717909.35069.14256589123570653697.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Cc: David Airlie Cc: Matthew Wilcox Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 4bfc3a9..fc1987d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1649,10 +1649,14 @@ EXPORT_SYMBOL(vm_insert_pfn_prot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { + pgprot_t pgprot = vma->vm_page_prot; + BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; + if (track_pfn_insert(vma, &pgprot, pfn)) + return -EINVAL; /* * If we don't have pte special, then we have to use the pfn_valid() @@ -1670,9 +1674,9 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * result in pfn_t_has_page() == false. 
*/ page = pfn_to_page(pfn_t_to_pfn(pfn)); - return insert_page(vma, addr, page, vma->vm_page_prot); + return insert_page(vma, addr, page, pgprot); } - return insert_pfn(vma, addr, pfn, vma->vm_page_prot); + return insert_pfn(vma, addr, pfn, pgprot); } EXPORT_SYMBOL(vm_insert_mixed); -- cgit v1.1 From f6ab1f7f6b2d8e48c5fc47746a67363b20d79a1d Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 7 Oct 2016 17:00:21 -0700 Subject: mm, swap: use offset of swap entry as key of swap cache This patch is to improve the performance of swap cache operations when the type of the swap device is not 0. Originally, the whole swap entry value is used as the key of the swap cache, even though there is one radix tree for each swap device. If the type of the swap device is not 0, the height of the radix tree of the swap cache will be increased unnecessarily, especially on 64-bit architectures. For example, for a 1GB swap device on the x86_64 architecture, the height of the radix tree of the swap cache is 11. But if the offset of the swap entry is used as the key of the swap cache, the height of the radix tree of the swap cache is 4. The increased height causes unnecessary radix tree descending and increased cache footprint. This patch reduces the height of the radix tree of the swap cache via using the offset of the swap entry instead of the whole swap entry value as the key of the swap cache. In 32 processes sequential swap out test case on a Xeon E5 v3 system with RAM disk as swap, the lock contention for the spinlock of the swap cache is reduced from 20.15% to 12.19%, when the type of the swap device is 1.
Use the whole swap entry as key, perf-profile.calltrace.cycles-pp._raw_spin_lock_irq.__add_to_swap_cache.add_to_swap_cache.add_to_swap.shrink_page_list: 10.37, perf-profile.calltrace.cycles-pp._raw_spin_lock_irqsave.__remove_mapping.shrink_page_list.shrink_inactive_list.shrink_node_memcg: 9.78, Use the swap offset as key, perf-profile.calltrace.cycles-pp._raw_spin_lock_irq.__add_to_swap_cache.add_to_swap_cache.add_to_swap.shrink_page_list: 6.25, perf-profile.calltrace.cycles-pp._raw_spin_lock_irqsave.__remove_mapping.shrink_page_list.shrink_inactive_list.shrink_node_memcg: 5.94, Link: http://lkml.kernel.org/r/1473270649-27229-1-git-send-email-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: "Kirill A. Shutemov" Cc: Dave Hansen Cc: Dan Williams Cc: Joonsoo Kim Cc: Hugh Dickins Cc: Mel Gorman Cc: Minchan Kim Cc: Aaron Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 +++-- mm/mincore.c | 5 +++-- mm/swap_state.c | 8 ++++---- mm/swapfile.c | 4 ++-- 4 files changed, 12 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0739d41..60bb830 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4408,7 +4408,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * Because lookup_swap_cache() updates some statistics counter, * we call find_get_page() with swapper_space directly. 
*/ - page = find_get_page(swap_address_space(ent), ent.val); + page = find_get_page(swap_address_space(ent), swp_offset(ent)); if (do_memsw_account()) entry->val = ent.val; @@ -4446,7 +4446,8 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, swp_entry_t swp = radix_to_swp_entry(page); if (do_memsw_account()) *entry = swp; - page = find_get_page(swap_address_space(swp), swp.val); + page = find_get_page(swap_address_space(swp), + swp_offset(swp)); } } else page = find_get_page(mapping, pgoff); diff --git a/mm/mincore.c b/mm/mincore.c index c0b5ba9..bfb8664 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -66,7 +66,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) */ if (radix_tree_exceptional_entry(page)) { swp_entry_t swp = radix_to_swp_entry(page); - page = find_get_page(swap_address_space(swp), swp.val); + page = find_get_page(swap_address_space(swp), + swp_offset(swp)); } } else page = find_get_page(mapping, pgoff); @@ -150,7 +151,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } else { #ifdef CONFIG_SWAP *vec = mincore_page(swap_address_space(entry), - entry.val); + swp_offset(entry)); #else WARN_ON(1); *vec = 1; diff --git a/mm/swap_state.c b/mm/swap_state.c index 8679c99..35d7e0e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -94,7 +94,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) address_space = swap_address_space(entry); spin_lock_irq(&address_space->tree_lock); error = radix_tree_insert(&address_space->page_tree, - entry.val, page); + swp_offset(entry), page); if (likely(!error)) { address_space->nrpages++; __inc_node_page_state(page, NR_FILE_PAGES); @@ -145,7 +145,7 @@ void __delete_from_swap_cache(struct page *page) entry.val = page_private(page); address_space = swap_address_space(entry); - radix_tree_delete(&address_space->page_tree, page_private(page)); + radix_tree_delete(&address_space->page_tree, swp_offset(entry)); 
set_page_private(page, 0); ClearPageSwapCache(page); address_space->nrpages--; @@ -283,7 +283,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) { struct page *page; - page = find_get_page(swap_address_space(entry), entry.val); + page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (page) { INC_CACHE_INFO(find_success); @@ -310,7 +310,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ - found_page = find_get_page(swapper_space, entry.val); + found_page = find_get_page(swapper_space, swp_offset(entry)); if (found_page) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index 134c085..2210de2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -105,7 +105,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) struct page *page; int ret = 0; - page = find_get_page(swap_address_space(entry), entry.val); + page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (!page) return 0; /* @@ -1005,7 +1005,7 @@ int free_swap_and_cache(swp_entry_t entry) if (p) { if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { page = find_get_page(swap_address_space(entry), - entry.val); + swp_offset(entry)); if (page && !trylock_page(page)) { put_page(page); page = NULL; -- cgit v1.1 From 3250845d0526407330592dd43b9f1354b6fe7a14 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:28 -0700 Subject: Revert "mm, oom: prevent premature OOM killer invocation for high order request" Patch series "reintroduce compaction feedback for OOM decisions". After several people reported OOM's for order-2 allocations in 4.7 due to Michal Hocko's OOM rework, he reverted the part that considered compaction feedback [1] in the decisions to retry reclaim/compaction. 
This was to provide a fix quickly for 4.8 rc and 4.7 stable series, while mmotm had an almost complete solution that instead improved compaction reliability. This series completes the mmotm solution and reintroduces the compaction feedback into OOM decisions. The first two patches restore the state of mmotm before the temporary solution was merged, the last patch should be the missing piece for reliability. The third patch restricts the hardened compaction to non-costly orders, since costly orders don't result in OOMs in the first place. [1] http://marc.info/?i=20160822093249.GA14916%40dhcp22.suse.cz%3E This patch (of 4): Commit 6b4e3181d7bd ("mm, oom: prevent premature OOM killer invocation for high order request") was intended as a quick fix of OOM regressions for 4.8 and stable 4.7.x kernels. For a better long-term solution, we still want to consider compaction feedback, which should be possible after some more improvements in the following patches. This reverts commit 6b4e3181d7bd5ca5ab6f45929e4a5ffa7ab4ab7f. 
Link: http://lkml.kernel.org/r/20160906135258.18335-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e00f545..634806f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3156,6 +3156,54 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; } +static inline bool +should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, + enum compact_result compact_result, + enum compact_priority *compact_priority, + int compaction_retries) +{ + int max_retries = MAX_COMPACT_RETRIES; + + if (!order) + return false; + + /* + * compaction considers all the zone as desperately out of memory + * so it doesn't really make much sense to retry except when the + * failure could be caused by insufficient priority + */ + if (compaction_failed(compact_result)) { + if (*compact_priority > MIN_COMPACT_PRIORITY) { + (*compact_priority)--; + return true; + } + return false; + } + + /* + * make sure the compaction wasn't deferred or didn't bail out early + * due to locks contention before we declare that we should give up. + * But do not retry if the given zonelist is not suitable for + * compaction. + */ + if (compaction_withdrawn(compact_result)) + return compaction_zonelist_suitable(ac, order, alloc_flags); + + /* + * !costly requests are much more important than __GFP_REPEAT + * costly ones because they are de facto nofail and invoke OOM + * killer to move on while costly can fail and users are ready + * to cope with that. 1/4 retries is rather arbitrary but we + * would need much more detailed feedback from compaction to + * make a better decision. 
+ */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + max_retries /= 4; + if (compaction_retries <= max_retries) + return true; + + return false; +} #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, @@ -3166,8 +3214,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; } -#endif /* CONFIG_COMPACTION */ - static inline bool should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, enum compact_result compact_result, @@ -3194,6 +3240,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla } return false; } +#endif /* CONFIG_COMPACTION */ /* Perform direct synchronous page reclaim */ static int -- cgit v1.1 From d943649831aba0fcdda37a0e9e25b332a634cf5e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:31 -0700 Subject: mm, compaction: more reliably increase direct compaction priority During reclaim/compaction loop, compaction priority can be increased by the should_compact_retry() function, but the current code is not optimal. Priority is only increased when compaction_failed() is true, which means that compaction has scanned the whole zone. This may not happen even after multiple attempts with a lower priority due to parallel activity, so we might needlessly struggle on the lower priorities and possibly run out of compaction retry attempts in the process. After this patch we are guaranteed at least one attempt at the highest compaction priority even if we exhaust all retries at the lower priorities. 
Link: http://lkml.kernel.org/r/20160906135258.18335-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 634806f..a8703b5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3160,25 +3160,23 @@ static inline bool should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, - int compaction_retries) + int *compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; if (!order) return false; + if (compaction_made_progress(compact_result)) + (*compaction_retries)++; + /* * compaction considers all the zone as desperately out of memory * so it doesn't really make much sense to retry except when the * failure could be caused by insufficient priority */ - if (compaction_failed(compact_result)) { - if (*compact_priority > MIN_COMPACT_PRIORITY) { - (*compact_priority)--; - return true; - } - return false; - } + if (compaction_failed(compact_result)) + goto check_priority; /* * make sure the compaction wasn't deferred or didn't bail out early @@ -3199,9 +3197,19 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, */ if (order > PAGE_ALLOC_COSTLY_ORDER) max_retries /= 4; - if (compaction_retries <= max_retries) + if (*compaction_retries <= max_retries) return true; + /* + * Make sure there are attempts at the highest priority if we exhausted + * all retries or failed at the lower priorities. 
+ */ +check_priority: + if (*compact_priority > MIN_COMPACT_PRIORITY) { + (*compact_priority)--; + *compaction_retries = 0; + return true; + } return false; } #else @@ -3218,7 +3226,7 @@ static inline bool should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, - int compaction_retries) + int *compaction_retries) { struct zone *zone; struct zoneref *z; @@ -3620,9 +3628,6 @@ retry: if (page) goto got_pg; - if (order && compaction_made_progress(compact_result)) - compaction_retries++; - /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) goto nopage; @@ -3657,7 +3662,7 @@ retry: if (did_some_progress > 0 && should_compact_retry(ac, order, alloc_flags, compact_result, &compact_priority, - compaction_retries)) + &compaction_retries)) goto retry; /* Reclaim has failed us, start killing things */ -- cgit v1.1 From c2033b00dbe856909fcaccf038e4e0d3dcfb85af Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:34 -0700 Subject: mm, compaction: restrict full priority to non-costly orders The new ultimate compaction priority disables some heuristics, which may result in excessive cost. This is fine for non-costly orders where we want to try hard before resorting to OOM, but might be disruptive for costly orders which do not trigger OOM and should generally have some fallback. Thus, we disable the full priority for costly orders.
Suggested-by: Michal Hocko Link: http://lkml.kernel.org/r/20160906135258.18335-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a8703b5..891e388 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3163,6 +3163,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, int *compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; + int min_priority; if (!order) return false; @@ -3205,7 +3206,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, * all retries or failed at the lower priorities. */ check_priority: - if (*compact_priority > MIN_COMPACT_PRIORITY) { + min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? + MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; + if (*compact_priority > min_priority) { (*compact_priority)--; *compaction_retries = 0; return true; -- cgit v1.1 From 9f7e3387939b036faacf4e7f32de7bb92a6635d6 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:37 -0700 Subject: mm, compaction: make full priority ignore pageblock suitability Several people have reported premature OOMs for order-2 allocations (stack) due to OOM rework in 4.7. In the scenario (parallel kernel build and dd writing to two drives) many pageblocks get marked as Unmovable and the compaction free scanner struggles to isolate free pages. Joonsoo Kim pointed out that the free scanner skips pageblocks that are not movable to prevent filling them and forcing non-movable allocations to fall back to other pageblocks. Such a heuristic makes sense to help prevent long-term fragmentation, but premature OOMs are a relatively more urgent problem.
As a compromise, this patch disables the heuristic only for the ultimate compaction priority. Link: http://lkml.kernel.org/r/20160906135258.18335-5-vbabka@suse.cz Reported-by: Ralf-Peter Rohbeck Reported-by: Arkadiusz Miskiewicz Reported-by: Olaf Hering Suggested-by: Joonsoo Kim Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 11 ++++++++--- mm/internal.h | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 29f6c49..86d4d0b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -997,8 +997,12 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, #ifdef CONFIG_COMPACTION /* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct page *page) +static bool suitable_migration_target(struct compact_control *cc, + struct page *page) { + if (cc->ignore_block_suitable) + return true; + /* If the page is a large free page, then disallow migration */ if (PageBuddy(page)) { /* @@ -1083,7 +1087,7 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Check the block is suitable for migration */ - if (!suitable_migration_target(page)) + if (!suitable_migration_target(cc, page)) continue; /* If isolation recently failed, do not retry */ @@ -1656,7 +1660,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .classzone_idx = classzone_idx, .direct_compaction = true, .whole_zone = (prio == MIN_COMPACT_PRIORITY), - .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY) + .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), + .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY) }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/internal.h b/mm/internal.h index 
5214bf8..537ac99 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -178,6 +178,7 @@ struct compact_control { unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool whole_zone; /* Whole zone should/has been scanned */ int order; /* order a direct compactor needs */ -- cgit v1.1 From 423b452e1553e3d19b632880bf2adf1f058ab267 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:40 -0700 Subject: mm, page_alloc: pull no_progress_loops update to should_reclaim_retry() The should_reclaim_retry() makes decisions based on no_progress_loops, so it makes sense to also update the counter there. It will be also consistent with should_compact_retry() and compaction_retries. No functional change. [hillf.zj@alibaba-inc.com: fix missing pointer dereferences] Link: http://lkml.kernel.org/r/20160926162025.21555-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 891e388..bcfa647 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3402,16 +3402,26 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) static inline bool should_reclaim_retry(gfp_t gfp_mask, unsigned order, struct alloc_context *ac, int alloc_flags, - bool did_some_progress, int no_progress_loops) + bool did_some_progress, int *no_progress_loops) { struct zone *zone; struct zoneref *z; /* + * Costly allocations might have made a progress but this doesn't mean + * their order will become available 
due to high fragmentation so + * always increment the no progress counter for them + */ + if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) + *no_progress_loops = 0; + else + (*no_progress_loops)++; + + /* * Make sure we converge to OOM if we cannot make any progress * several times in the row. */ - if (no_progress_loops > MAX_RECLAIM_RETRIES) + if (*no_progress_loops > MAX_RECLAIM_RETRIES) return false; /* @@ -3426,7 +3436,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, unsigned long reclaimable; available = reclaimable = zone_reclaimable_pages(zone); - available -= DIV_ROUND_UP(no_progress_loops * available, + available -= DIV_ROUND_UP((*no_progress_loops) * available, MAX_RECLAIM_RETRIES); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); @@ -3642,18 +3652,8 @@ retry: if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) goto nopage; - /* - * Costly allocations might have made a progress but this doesn't mean - * their order will become available due to high fragmentation so - * always increment the no progress counter for them - */ - if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) - no_progress_loops = 0; - else - no_progress_loops++; - if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, - did_some_progress > 0, no_progress_loops)) + did_some_progress > 0, &no_progress_loops)) goto retry; /* -- cgit v1.1 From cc5c9f098fe48a8736add8a23c983524ca16cea5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:43 -0700 Subject: mm, compaction: ignore fragindex from compaction_zonelist_suitable() The compaction_zonelist_suitable() function tries to determine if compaction will be able to proceed after sufficient reclaim, i.e. whether there are enough reclaimable pages to provide enough order-0 freepages for compaction. This addition of reclaimable pages to the free pages works well for the order-0 watermark check, but in the fragmentation index check we only consider truly free pages. 
Thus we can get a fragindex value close to 0 which indicates failure due to lack of memory, and wrongly decide that compaction won't be suitable even after reclaim. Instead of trying to somehow adjust fragindex for reclaimable pages, let's just skip it from compaction_zonelist_suitable(). Link: http://lkml.kernel.org/r/20160926162025.21555-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 86d4d0b..b918bdb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1379,7 +1379,6 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, int classzone_idx, unsigned long wmark_target) { - int fragindex; unsigned long watermark; if (is_via_compact_memory(order)) @@ -1415,6 +1414,18 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; + return COMPACT_CONTINUE; +} + +enum compact_result compaction_suitable(struct zone *zone, int order, + unsigned int alloc_flags, + int classzone_idx) +{ + enum compact_result ret; + int fragindex; + + ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, + zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to * low memory or external fragmentation @@ -1426,21 +1437,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * * Only compact if a failure would be due to fragmentation.
*/ - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - return COMPACT_NOT_SUITABLE_ZONE; - - return COMPACT_CONTINUE; -} - -enum compact_result compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, - int classzone_idx) -{ - enum compact_result ret; + if (ret == COMPACT_CONTINUE) { + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + ret = COMPACT_NOT_SUITABLE_ZONE; + } - ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, - zone_page_state(zone, NR_FREE_PAGES)); trace_mm_compaction_suitable(zone, order, ret); if (ret == COMPACT_NOT_SUITABLE_ZONE) ret = COMPACT_SKIPPED; @@ -1473,8 +1475,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, available += zone_page_state_snapshot(zone, NR_FREE_PAGES); compact_result = __compaction_suitable(zone, order, alloc_flags, ac_classzone_idx(ac), available); - if (compact_result != COMPACT_SKIPPED && - compact_result != COMPACT_NOT_SUITABLE_ZONE) + if (compact_result != COMPACT_SKIPPED) return true; } -- cgit v1.1 From 20311420282f3402888f1d9b8b80d924d491aadf Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 7 Oct 2016 17:00:46 -0700 Subject: mm, compaction: restrict fragindex to costly orders Fragmentation index and the vm.extfrag_threshold sysctl is meant as a heuristic to prevent excessive compaction for costly orders (i.e. THP). It's unlikely to make any difference for non-costly orders, especially with the default threshold. But we cannot afford any uncertainty for the non-costly orders where the only alternative to successful reclaim/compaction is OOM. After the recent patches we are guaranteed maximum effort without heuristics from compaction before deciding OOM, and fragindex is the last remaining heuristic. Therefore skip fragindex altogether for non-costly orders. 
Suggested-by: Michal Hocko Link: http://lkml.kernel.org/r/20160926162025.21555-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index b918bdb..0409a4a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1435,9 +1435,14 @@ enum compact_result compaction_suitable(struct zone *zone, int order, * index towards 0 implies failure is due to lack of memory * index towards 1000 implies failure is due to fragmentation * - * Only compact if a failure would be due to fragmentation. + * Only compact if a failure would be due to fragmentation. Also + * ignore fragindex for non-costly orders where the alternative to + * a successful reclaim/compaction is OOM. Fragindex and the + * vm.extfrag_threshold sysctl is meant as a heuristic to prevent + * excessive compaction for costly orders, but it should not be at the + * expense of system stability. */ - if (ret == COMPACT_CONTINUE) { + if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) { fragindex = fragmentation_index(zone, order); if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) ret = COMPACT_NOT_SUITABLE_ZONE; -- cgit v1.1 From a104808e212a9ee97e6b9cb6945185e50905f009 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 7 Oct 2016 17:00:49 -0700 Subject: mm: don't emit warning from pagefault_out_of_memory() Commit c32b3cbe0d06 ("oom, PM: make OOM detection in the freezer path raceless") inserted a WARN_ON() into pagefault_out_of_memory() in order to warn when we raced with disabling the OOM killer. Now, patch "oom, suspend: fix oom_killer_disable vs. pm suspend properly" introduced a timeout for oom_killer_disable(). 
Even if we raced with disabling the OOM killer and the system is OOM livelocked, the OOM killer will be enabled eventually (in 20 seconds by default) and the OOM livelock will be solved. Therefore, we no longer need to warn when we raced with disabling the OOM killer. Link: http://lkml.kernel.org/r/1473442120-7246-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0034baf..f284e92 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1069,16 +1069,6 @@ void pagefault_out_of_memory(void) if (!mutex_trylock(&oom_lock)) return; - - if (!out_of_memory(&oc)) { - /* - * There shouldn't be any user tasks runnable while the - * OOM killer is disabled, so the current task has to - * be a racing OOM victim for which oom_killer_disable() - * is waiting for. - */ - WARN_ON(test_thread_flag(TIF_MEMDIE)); - } - + out_of_memory(&oc); mutex_unlock(&oom_lock); } -- cgit v1.1 From cc30c5d6461a2813406f7f84d581643781922a82 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 7 Oct 2016 17:00:52 -0700 Subject: mm/page_io.c: replace some BUG_ON()s with VM_BUG_ON_PAGE() So they are CONFIG_DEBUG_VM-only and more informative. Cc: Al Viro Cc: David S. 
Miller Cc: Hugh Dickins Cc: Jens Axboe Cc: Joe Perches Cc: Mel Gorman Cc: Michal Hocko Cc: Peter Zijlstra Cc: Rik van Riel Cc: Santosh Shilimkar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index eafe5dd..a2651f5 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -264,7 +264,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, int ret; struct swap_info_struct *sis = page_swap_info(page); - BUG_ON(!PageSwapCache(page)); + VM_BUG_ON_PAGE(!PageSwapCache(page), page); if (sis->flags & SWP_FILE) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; @@ -338,7 +338,7 @@ int swap_readpage(struct page *page) int ret = 0; struct swap_info_struct *sis = page_swap_info(page); - BUG_ON(!PageSwapCache(page)); + VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageUptodate(page), page); if (frontswap_load(page) == 0) { @@ -388,7 +388,8 @@ int swap_set_page_dirty(struct page *page) if (sis->flags & SWP_FILE) { struct address_space *mapping = sis->swap_file->f_mapping; - BUG_ON(!PageSwapCache(page)); + + VM_BUG_ON_PAGE(!PageSwapCache(page), page); return mapping->a_ops->set_page_dirty(page); } else { return __set_page_dirty_no_writeback(page); -- cgit v1.1 From 2d75807383459c04d457bf2d295fa6ad858507d2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 7 Oct 2016 17:00:58 -0700 Subject: mm: memcontrol: consolidate cgroup socket tracking The cgroup core and the memory controller need to track socket ownership for different purposes, but the tracking sites being entirely different is kind of ugly. Be a better citizen and rename the memory controller callbacks to match the cgroup core callbacks, then move them to the same place. 
[akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20160914194846.11153-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Cc: "David S. Miller" Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 60bb830..ae052b5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2939,16 +2939,16 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) /* * The active flag needs to be written after the static_key * update. This is what guarantees that the socket activation - * function is the last one to run. See sock_update_memcg() for - * details, and note that we don't mark any socket as belonging - * to this memcg until that flag is up. + * function is the last one to run. See mem_cgroup_sk_alloc() + * for details, and note that we don't mark any socket as + * belonging to this memcg until that flag is up. * * We need to do this, because static_keys will span multiple * sites, but we can't control their order. If we mark a socket * as accounted, but the accounting functions are not patched in * yet, we'll lose accounting. * - * We never race with the readers in sock_update_memcg(), + * We never race with the readers in mem_cgroup_sk_alloc(), * because when this value change, the code to process it is not * patched in yet. 
*/ @@ -5651,11 +5651,15 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); EXPORT_SYMBOL(memcg_sockets_enabled_key); -void sock_update_memcg(struct sock *sk) +void mem_cgroup_sk_alloc(struct sock *sk) { struct mem_cgroup *memcg; - /* Socket cloning can throw us here with sk_cgrp already + if (!mem_cgroup_sockets_enabled) + return; + + /* + * Socket cloning can throw us here with sk_memcg already * filled. It won't however, necessarily happen from * process context. So the test for root memcg given * the current task's memcg won't help us in this case. @@ -5680,12 +5684,11 @@ void sock_update_memcg(struct sock *sk) out: rcu_read_unlock(); } -EXPORT_SYMBOL(sock_update_memcg); -void sock_release_memcg(struct sock *sk) +void mem_cgroup_sk_free(struct sock *sk) { - WARN_ON(!sk->sk_memcg); - css_put(&sk->sk_memcg->css); + if (sk->sk_memcg) + css_put(&sk->sk_memcg->css); } /** -- cgit v1.1 From 19938e350adc60f3b9381ae6fc68da40f7d1a9f6 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 7 Oct 2016 17:01:01 -0700 Subject: mm/shmem.c: constify anon_ops Every other dentry_operations instance is const, and this one might as well be. 
Link: http://lkml.kernel.org/r/1473890528-7009-1-git-send-email-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 971fc83..dee0631 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4078,7 +4078,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); /* common code */ -static struct dentry_operations anon_ops = { +static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; -- cgit v1.1 From 914a051654c5401cb216a939e214e17ec018b6a9 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Fri, 7 Oct 2016 17:01:04 -0700 Subject: mm: nobootmem: move the comment of free_all_bootmem Commit b4def3509d18 ("mm, nobootmem: clean-up of free_low_memory_core_early()") removed the unnecessary nodeid argument; after that, this comment became more confusing. We should move it to the right place.
Fixes: b4def3509d18c1db9 ("mm, nobootmem: clean-up of free_low_memory_core_early()") Link: http://lkml.kernel.org/r/1473996082-14603-1-git-send-email-wanlong.gao@gmail.com Signed-off-by: Wanlong Gao Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 490d46a..ba609b6 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -137,6 +137,11 @@ static unsigned long __init free_low_memory_core_early(void) for_each_reserved_mem_region(i, &start, &end) reserve_bootmem_region(start, end); + /* + * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id + * because in some case like Node0 doesn't have RAM installed + * low ram will be on Node1 + */ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) count += __free_memory_core(start, end); @@ -194,11 +199,6 @@ unsigned long __init free_all_bootmem(void) reset_all_zones_managed_pages(); - /* - * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id - * because in some case like Node0 doesn't have RAM installed - * low ram will be on Node1 - */ pages = free_low_memory_core_early(); totalram_pages += pages; -- cgit v1.1 From 2247bb335ab9c40058484cac36ea74ee652f3b7b Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Fri, 7 Oct 2016 17:01:07 -0700 Subject: mm/hugetlb: fix memory offline with hugepage size > memory block size Patch series "mm/hugetlb: memory offline issues with hugepages", v4. This addresses several issues with hugepages and memory offline. While the first patch fixes a panic, and is therefore rather important, the last patch is just a performance optimization. The second patch fixes a theoretical issue with reserved hugepages, while still leaving some ugly usability issue, see description. 
This patch (of 3): dissolve_free_huge_pages() will either run into the VM_BUG_ON() or a list corruption and addressing exception when trying to set a memory block offline that is part (but not the first part) of a "gigantic" hugetlb page with a size > memory block size. When no other smaller hugetlb page sizes are present, the VM_BUG_ON() will trigger directly. In the other case we will run into an addressing exception later, because dissolve_free_huge_page() will not work on the head page of the compound hugetlb page which will result in a NULL hstate from page_hstate(). To fix this, first remove the VM_BUG_ON() because it is wrong, and then use the compound head page in dissolve_free_huge_page(). This means that an unused pre-allocated gigantic page that has any part of itself inside the memory block that is going offline will be dissolved completely. Losing an unused gigantic hugepage is preferable to failing the memory offline, for example in the situation where a (possibly faulty) memory DIMM needs to go offline. Fixes: c8721bbb ("mm: memory-hotplug: enable memory hotplug to handle hugepage") Link: http://lkml.kernel.org/r/20160926172811.94033-2-gerald.schaefer@de.ibm.com Signed-off-by: Gerald Schaefer Acked-by: Michal Hocko Acked-by: Naoya Horiguchi Cc: "Kirill A . Shutemov" Cc: Vlastimil Babka Cc: Mike Kravetz Cc: "Aneesh Kumar K . 
V" Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Rui Teng Cc: Dave Hansen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 87e11d8..603bdd0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1443,13 +1443,14 @@ static void dissolve_free_huge_page(struct page *page) { spin_lock(&hugetlb_lock); if (PageHuge(page) && !page_count(page)) { - struct hstate *h = page_hstate(page); - int nid = page_to_nid(page); - list_del(&page->lru); + struct page *head = compound_head(page); + struct hstate *h = page_hstate(head); + int nid = page_to_nid(head); + list_del(&head->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; h->max_huge_pages--; - update_and_free_page(h, page); + update_and_free_page(h, head); } spin_unlock(&hugetlb_lock); } @@ -1457,7 +1458,8 @@ static void dissolve_free_huge_page(struct page *page) /* * Dissolve free hugepages in a given pfn range. Used by memory hotplug to * make specified memory blocks removable from the system. - * Note that start_pfn should aligned with (minimum) hugepage size. + * Note that this will dissolve a free gigantic hugepage completely, if any + * part of it lies within the given range. 
*/ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) { @@ -1466,7 +1468,6 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) if (!hugepages_supported()) return; - VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order)); for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) dissolve_free_huge_page(pfn_to_page(pfn)); } -- cgit v1.1 From 082d5b6b60e9f25e1511557fcfcb21eedd267446 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Fri, 7 Oct 2016 17:01:10 -0700 Subject: mm/hugetlb: check for reserved hugepages during memory offline In dissolve_free_huge_pages(), free hugepages will be dissolved without making sure that there are enough of them left to satisfy hugepage reservations. Fix this by adding a return value to dissolve_free_huge_pages() and checking h->free_huge_pages vs. h->resv_huge_pages. Note that this may lead to the situation where dissolve_free_huge_page() returns an error and all free hugepages that were dissolved before that error are lost, while the memory block still cannot be set offline. Fixes: c8721bbb ("mm: memory-hotplug: enable memory hotplug to handle hugepage") Link: http://lkml.kernel.org/r/20160926172811.94033-3-gerald.schaefer@de.ibm.com Signed-off-by: Gerald Schaefer Acked-by: Michal Hocko Acked-by: Naoya Horiguchi Cc: "Kirill A . Shutemov" Cc: Vlastimil Babka Cc: Mike Kravetz Cc: "Aneesh Kumar K . V" Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Rui Teng Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 26 +++++++++++++++++++++----- mm/memory_hotplug.c | 4 +++- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 603bdd0..91ae1f5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1437,22 +1437,32 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, /* * Dissolve a given free hugepage into free buddy pages. 
This function does - * nothing for in-use (including surplus) hugepages. + * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the + * number of free hugepages would be reduced below the number of reserved + * hugepages. */ -static void dissolve_free_huge_page(struct page *page) +static int dissolve_free_huge_page(struct page *page) { + int rc = 0; + spin_lock(&hugetlb_lock); if (PageHuge(page) && !page_count(page)) { struct page *head = compound_head(page); struct hstate *h = page_hstate(head); int nid = page_to_nid(head); + if (h->free_huge_pages - h->resv_huge_pages == 0) { + rc = -EBUSY; + goto out; + } list_del(&head->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; h->max_huge_pages--; update_and_free_page(h, head); } +out: spin_unlock(&hugetlb_lock); + return rc; } /* @@ -1460,16 +1470,22 @@ static void dissolve_free_huge_page(struct page *page) * make specified memory blocks removable from the system. * Note that this will dissolve a free gigantic hugepage completely, if any * part of it lies within the given range. + * Also note that if dissolve_free_huge_page() returns with an error, all + * free hugepages that were dissolved before that error are lost. */ -void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) +int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; + int rc = 0; if (!hugepages_supported()) - return; + return rc; for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) - dissolve_free_huge_page(pfn_to_page(pfn)); + if (rc = dissolve_free_huge_page(pfn_to_page(pfn))) + break; + + return rc; } /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9d29ba0..9629273 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1945,7 +1945,9 @@ repeat: * dissolve free hugepages in the memory block before doing offlining * actually in order to make hugetlbfs's object counting consistent. 
*/ - dissolve_free_huge_pages(start_pfn, end_pfn); + ret = dissolve_free_huge_pages(start_pfn, end_pfn); + if (ret) + goto failed_removal; /* check again */ offlined_pages = check_pages_isolated(start_pfn, end_pfn); if (offlined_pages < 0) { -- cgit v1.1 From eb03aa008561004257900983193d024e57abdd96 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Fri, 7 Oct 2016 17:01:13 -0700 Subject: mm/hugetlb: improve locking in dissolve_free_huge_pages() For every pfn aligned to minimum_order, dissolve_free_huge_pages() will call dissolve_free_huge_page() which takes the hugetlb spinlock, even if the page is not huge at all or a hugepage that is in-use. Improve this by doing the PageHuge() and page_count() checks already in dissolve_free_huge_pages() before calling dissolve_free_huge_page(). In dissolve_free_huge_page(), when holding the spinlock, those checks need to be revalidated. Link: http://lkml.kernel.org/r/20160926172811.94033-4-gerald.schaefer@de.ibm.com Signed-off-by: Gerald Schaefer Acked-by: Michal Hocko Acked-by: Naoya Horiguchi Cc: "Kirill A . Shutemov" Cc: Vlastimil Babka Cc: Mike Kravetz Cc: "Aneesh Kumar K . 
V" Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Rui Teng Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 91ae1f5..770d83e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1476,14 +1476,20 @@ out: int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; + struct page *page; int rc = 0; if (!hugepages_supported()) return rc; - for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) - if (rc = dissolve_free_huge_page(pfn_to_page(pfn))) - break; + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) { + page = pfn_to_page(pfn); + if (PageHuge(page) && !page_count(page)) { + rc = dissolve_free_huge_page(page); + if (rc) + break; + } + } return rc; } -- cgit v1.1 From ac34dcd263a3afe9a2e4d58a2d93bb66d700ac7c Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Fri, 7 Oct 2016 17:01:16 -0700 Subject: mm/page_isolation: fix typo: "paes" -> "pages" Fix typo in comment. Link: http://lkml.kernel.org/r/1474788764-5774-1-git-send-email-ysxie@foxmail.com Signed-off-by: Yisheng Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 064b7fb..a5594bf 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -55,7 +55,7 @@ static int set_migratetype_isolate(struct page *page, ret = 0; /* - * immobile means "not-on-lru" paes. If immobile is larger than + * immobile means "not-on-lru" pages. If immobile is larger than * removable-by-driver pages reported by notifier, we'll fail. 
*/ -- cgit v1.1 From 6213055f2c068b63078649457391ecea9b489ea3 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 7 Oct 2016 17:01:19 -0700 Subject: mm,ksm: add __GFP_HIGH to the allocation in alloc_stable_node() According to Hugh's suggestion, alloc_stable_node() with GFP_KERNEL can in rare cases cause a hung task warning. At present, if alloc_stable_node() allocation fails, two break_cows may want to allocate a couple of pages, and the issue will come up when free memory is under pressure. We fix it by adding __GFP_HIGH to GFP, to grant access to memory reserves, increasing the likelihood of allocation success. [akpm@linux-foundation.org: tweak comment] Link: http://lkml.kernel.org/r/1474354484-58233-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Suggested-by: Hugh Dickins Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 5048083..9ae6011 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -299,7 +299,12 @@ static inline void free_rmap_item(struct rmap_item *rmap_item) static inline struct stable_node *alloc_stable_node(void) { - return kmem_cache_alloc(stable_node_cache, GFP_KERNEL); + /* + * The allocation can take too long with GFP_KERNEL when memory is under + * pressure, which may lead to hung task warnings. Adding __GFP_HIGH + * grants access to memory reserves, helping to avoid this problem. + */ + return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH); } static inline void free_stable_node(struct stable_node *stable_node) -- cgit v1.1 From 6d2329f8872f23e46a19d240930571510ce525eb Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 7 Oct 2016 17:01:22 -0700 Subject: mm: vm_page_prot: update with WRITE_ONCE/READ_ONCE vma->vm_page_prot is read lockless from the rmap_walk, it may be updated concurrently and this prevents the risk of reading intermediate values. 
Link: http://lkml.kernel.org/r/1474660305-19222-1-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: Rik van Riel Cc: Hugh Dickins Cc: Mel Gorman Cc: Jan Vorlicek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- mm/migrate.c | 2 +- mm/mmap.c | 16 +++++++++------- mm/mprotect.c | 2 +- 4 files changed, 12 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 12b9f1a..cdcd25c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1620,7 +1620,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (soft_dirty) entry = pte_swp_mksoft_dirty(entry); } else { - entry = mk_pte(page + i, vma->vm_page_prot); + entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); entry = maybe_mkwrite(entry, vma); if (!write) entry = pte_wrprotect(entry); diff --git a/mm/migrate.c b/mm/migrate.c index f7ee04a..99250ae 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -234,7 +234,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, goto unlock; get_page(new); - pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(*ptep)) pte = pte_mksoft_dirty(pte); diff --git a/mm/mmap.c b/mm/mmap.c index 7a0707a..b3b74cc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -116,13 +116,15 @@ static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) void vma_set_page_prot(struct vm_area_struct *vma) { unsigned long vm_flags = vma->vm_flags; + pgprot_t vm_page_prot; - vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); - if (vma_wants_writenotify(vma)) { + vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); + if (vma_wants_writenotify(vma, vm_page_prot)) { vm_flags &= ~VM_SHARED; - vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, - vm_flags); + vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags); } + /* 
remove_protection_ptes reads vma->vm_page_prot without mmap_sem */ + WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } /* @@ -1386,7 +1388,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * to the private version (using protection_map[] without the * VM_SHARED bit). */ -int vma_wants_writenotify(struct vm_area_struct *vma) +int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { vm_flags_t vm_flags = vma->vm_flags; const struct vm_operations_struct *vm_ops = vma->vm_ops; @@ -1401,8 +1403,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma) /* The open routine did something to the protections that pgprot_modify * won't preserve? */ - if (pgprot_val(vma->vm_page_prot) != - pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags))) + if (pgprot_val(vm_page_prot) != + pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags))) return 0; /* Do we need to track softdirty? */ diff --git a/mm/mprotect.c b/mm/mprotect.c index a4830f0..063bbed 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -327,7 +327,7 @@ success: * held in write mode. */ vma->vm_flags = newflags; - dirty_accountable = vma_wants_writenotify(vma); + dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot); vma_set_page_prot(vma); change_protection(vma, start, end, vma->vm_page_prot, -- cgit v1.1 From fb8c41e9ad1f356b06b46a63ada10b7dce2a5d94 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 7 Oct 2016 17:01:25 -0700 Subject: mm: vma_adjust: remove superfluous confusing update in remove_next == 1 case mm->highest_vm_end doesn't need any update. 
After finally removing the oddness from vma_merge case 8 that was causing: 1) constant risk of trouble whenever anybody would check vma fields from rmap_walks, like it happened when page migration was introduced and it read the vma->vm_page_prot from a rmap_walk 2) the callers of vma_merge to re-initialize any value different from the current vma, instead of vma_merge() more reliably returning a vma that already matches all fields passed as parameter .. it is also worth to take the opportunity of cleaning up superfluous code in vma_adjust(), that if not removed adds up to the hard readability of the function. Link: http://lkml.kernel.org/r/1474492522-2261-5-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: Rik van Riel Cc: Hugh Dickins Cc: Mel Gorman Cc: Jan Vorlicek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index b3b74cc..183694b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -817,8 +817,28 @@ again: } else if (next) vma_gap_update(next); - else - mm->highest_vm_end = end; + else { + /* + * If remove_next == 2 we obviously can't + * reach this path. + * + * If remove_next == 3 we can't reach this + * path because pre-swap() next is always not + * NULL. pre-swap() "next" is not being + * removed and its next->vm_end is not altered + * (and furthermore "end" already matches + * next->vm_end in remove_next == 3). + * + * We reach this only in the remove_next == 1 + * case if the "next" vma that was removed was + * the highest vma of the mm. However in such + * case next->vm_end == "end" and the extended + * "vma" has vma->vm_end == next->vm_end so + * mm->highest_vm_end doesn't need any update + * in remove_next == 1 case. 
+ */ + VM_WARN_ON(mm->highest_vm_end != end); + } } if (insert && file) uprobe_mmap(insert); -- cgit v1.1 From e86f15ee64d8ee46255d964d55f74f5ba9af8c36 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 7 Oct 2016 17:01:28 -0700 Subject: mm: vma_merge: fix vm_page_prot SMP race condition against rmap_walk The rmap_walk can access vm_page_prot (and potentially vm_flags in the pte/pmd manipulations). So it's not safe to wait the caller to update the vm_page_prot/vm_flags after vma_merge returned potentially removing the "next" vma and extending the "current" vma over the next->vm_start,vm_end range, but still with the "current" vma vm_page_prot, after releasing the rmap locks. The vm_page_prot/vm_flags must be transferred from the "next" vma to the current vma while vma_merge still holds the rmap locks. The side effect of this race condition is pte corruption during migrate as remove_migration_ptes when run on a address of the "next" vma that got removed, used the vm_page_prot of the current vma. migrate mprotect ------------ ------------- migrating in "next" vma vma_merge() # removes "next" vma and # extends "current" vma # current vma is not with # vm_page_prot updated remove_migration_ptes read vm_page_prot of current "vma" establish pte with wrong permissions vm_set_page_prot(vma) # too late! change_protection in the old vma range only, next range is not updated This caused segmentation faults and potentially memory corruption in heavy mprotect loads with some light page migration caused by compaction in the background. Hugh Dickins pointed out the comment about the Odd case 8 in vma_merge which confirms the case 8 is only buggy one where the race can trigger, in all other vma_merge cases the above cannot happen. 
This fix removes the oddness factor from case 8 and it converts it from: AAAA PPPPNNNNXXXX -> PPPPNNNNNNNN to: AAAA PPPPNNNNXXXX -> PPPPXXXXXXXX XXXX has the right vma properties for the whole merged vma returned by vma_adjust, so it solves the problem fully. It has the added benefits that the callers could stop updating vma properties when vma_merge succeeds however the callers are not updated by this patch (there are bits like VM_SOFTDIRTY that still need special care for the whole range, as the vma merging ignores them, but as long as they're not processed by rmap walks and instead they're accessed with the mmap_sem at least for reading, they are fine not to be updated within vma_adjust before releasing the rmap_locks). Link: http://lkml.kernel.org/r/1474309513-20313-1-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Aditya Mandaleeka Cc: Rik van Riel Cc: Hugh Dickins Cc: Mel Gorman Cc: Jan Vorlicek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 157 ++++++++++++++++++++++++++++++++++++++++++++++++---------- mm/mprotect.c | 1 + 2 files changed, 131 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 183694b..e53637f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -601,14 +601,24 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) mm->map_count++; } -static inline void -__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev) +static __always_inline void __vma_unlink_common(struct mm_struct *mm, + struct vm_area_struct *vma, + struct vm_area_struct *prev, + bool has_prev) { struct vm_area_struct *next; vma_rb_erase(vma, &mm->mm_rb); - prev->vm_next = next = vma->vm_next; + next = vma->vm_next; + if (has_prev) + prev->vm_next = next; + else { + prev = vma->vm_prev; + if (prev) + prev->vm_next = next; + else + mm->mmap = next; + } if (next) next->vm_prev = prev; @@ -616,6 +626,19 @@ __vma_unlink(struct 
mm_struct *mm, struct vm_area_struct *vma, vmacache_invalidate(mm); } +static inline void __vma_unlink_prev(struct mm_struct *mm, + struct vm_area_struct *vma, + struct vm_area_struct *prev) +{ + __vma_unlink_common(mm, vma, prev, true); +} + +static inline void __vma_unlink(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + __vma_unlink_common(mm, vma, NULL, false); +} + /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. @@ -623,11 +646,12 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, * are necessary. The "insert" vma (if any) is to be inserted * before we drop the necessary locks. */ -int vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) +int __vma_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, + struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next = vma->vm_next; + struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; struct address_space *mapping = NULL; struct rb_root *root = NULL; struct anon_vma *anon_vma = NULL; @@ -643,9 +667,38 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, /* * vma expands, overlapping all the next, and * perhaps the one after too (mprotect case 6). + * The only two other cases that gets here are + * case 1, case 7 and case 8. */ - remove_next = 1 + (end > next->vm_end); - end = next->vm_end; + if (next == expand) { + /* + * The only case where we don't expand "vma" + * and we expand "next" instead is case 8. + */ + VM_WARN_ON(end != next->vm_end); + /* + * remove_next == 3 means we're + * removing "vma" and that to do so we + * swapped "vma" and "next". 
+ */ + remove_next = 3; + VM_WARN_ON(file != next->vm_file); + swap(vma, next); + } else { + VM_WARN_ON(expand != vma); + /* + * case 1, 6, 7, remove_next == 2 is case 6, + * remove_next == 1 is case 1 or 7. + */ + remove_next = 1 + (end > next->vm_end); + VM_WARN_ON(remove_next == 2 && + end != next->vm_next->vm_end); + VM_WARN_ON(remove_next == 1 && + end != next->vm_end); + /* trim end to next, for case 6 first pass */ + end = next->vm_end; + } + exporter = next; importer = vma; @@ -664,6 +717,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, adjust_next = (end - next->vm_start) >> PAGE_SHIFT; exporter = next; importer = vma; + VM_WARN_ON(expand != importer); } else if (end < vma->vm_end) { /* * vma shrinks, and !insert tells it's not @@ -673,6 +727,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); exporter = vma; importer = next; + VM_WARN_ON(expand != importer); } /* @@ -690,7 +745,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, } } again: - vma_adjust_trans_huge(vma, start, end, adjust_next); + vma_adjust_trans_huge(orig_vma, start, end, adjust_next); if (file) { mapping = file->f_mapping; @@ -716,8 +771,8 @@ again: if (!anon_vma && adjust_next) anon_vma = next->anon_vma; if (anon_vma) { - VM_BUG_ON_VMA(adjust_next && next->anon_vma && - anon_vma != next->anon_vma, next); + VM_WARN_ON(adjust_next && next->anon_vma && + anon_vma != next->anon_vma); anon_vma_lock_write(anon_vma); anon_vma_interval_tree_pre_update_vma(vma); if (adjust_next) @@ -757,7 +812,11 @@ again: * vma_merge has merged next into vma, and needs * us to remove next before dropping the locks. 
*/ - __vma_unlink(mm, next, vma); + if (remove_next != 3) + __vma_unlink_prev(mm, next, vma); + else + /* vma is not before next if they've been swapped */ + __vma_unlink(mm, next); if (file) __remove_shared_vm_struct(next, file, mapping); } else if (insert) { @@ -809,7 +868,27 @@ again: * we must remove another next too. It would clutter * up the code too much to do both in one go. */ - next = vma->vm_next; + if (remove_next != 3) { + /* + * If "next" was removed and vma->vm_end was + * expanded (up) over it, in turn + * "next->vm_prev->vm_end" changed and the + * "vma->vm_next" gap must be updated. + */ + next = vma->vm_next; + } else { + /* + * For the scope of the comment "next" and + * "vma" considered pre-swap(): if "vma" was + * removed, next->vm_start was expanded (down) + * over it and the "next" gap must be updated. + * Because of the swap() the post-swap() "vma" + * actually points to pre-swap() "next" + * (post-swap() "next" as opposed is now a + * dangling pointer). + */ + next = vma; + } if (remove_next == 2) { remove_next = 1; end = next->vm_end; @@ -958,13 +1037,24 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * cannot merge might become might become might become * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or - * mremap move: PPPPNNNNNNNN 8 + * mremap move: PPPPXXXXXXXX 8 * AAAA * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN * might become case 1 below case 2 below case 3 below * - * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: - * mprotect_fixup updates vm_flags & vm_page_prot on successful return. + * It is important for case 8 that the the vma NNNN overlapping the + * region AAAA is never going to extended over XXXX. Instead XXXX must + * be extended in region AAAA and NNNN must be removed. 
This way in + * all cases where vma_merge succeeds, the moment vma_adjust drops the + * rmap_locks, the properties of the merged vma will be already + * correct for the whole merged range. Some of those properties like + * vm_page_prot/vm_flags may be accessed by rmap_walks and they must + * be correct for the whole merged range immediately after the + * rmap_locks are released. Otherwise if XXXX would be removed and + * NNNN would be extended over the XXXX range, remove_migration_ptes + * or other rmap walkers (if working on addresses beyond the "end" + * parameter) may establish ptes with the wrong permissions of NNNN + * instead of the right permissions of XXXX. */ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, @@ -989,9 +1079,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, else next = mm->mmap; area = next; - if (next && next->vm_end == end) /* cases 6, 7, 8 */ + if (area && area->vm_end == end) /* cases 6, 7, 8 */ next = next->vm_next; + /* verify some invariant that must be enforced by the caller */ + VM_WARN_ON(prev && addr <= prev->vm_start); + VM_WARN_ON(area && end > area->vm_end); + VM_WARN_ON(addr >= end); + /* * Can it merge with the predecessor? 
*/ @@ -1012,11 +1107,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ - err = vma_adjust(prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL); + err = __vma_adjust(prev, prev->vm_start, + next->vm_end, prev->vm_pgoff, NULL, + prev); } else /* cases 2, 5, 7 */ - err = vma_adjust(prev, prev->vm_start, - end, prev->vm_pgoff, NULL); + err = __vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL, prev); if (err) return NULL; khugepaged_enter_vma_merge(prev, vm_flags); @@ -1032,11 +1128,18 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, anon_vma, file, pgoff+pglen, vm_userfaultfd_ctx)) { if (prev && addr < prev->vm_end) /* case 4 */ - err = vma_adjust(prev, prev->vm_start, - addr, prev->vm_pgoff, NULL); - else /* cases 3, 8 */ - err = vma_adjust(area, addr, next->vm_end, - next->vm_pgoff - pglen, NULL); + err = __vma_adjust(prev, prev->vm_start, + addr, prev->vm_pgoff, NULL, next); + else { /* cases 3, 8 */ + err = __vma_adjust(area, addr, next->vm_end, + next->vm_pgoff - pglen, NULL, next); + /* + * In case 3 area is already equal to next and + * this is a noop, but in case 8 "area" has + * been removed and next was expanded over it. + */ + area = next; + } if (err) return NULL; khugepaged_enter_vma_merge(area, vm_flags); diff --git a/mm/mprotect.c b/mm/mprotect.c index 063bbed..ec91dfd 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -304,6 +304,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, vma->vm_userfaultfd_ctx); if (*pprev) { vma = *pprev; + VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); goto success; } -- cgit v1.1 From 97a42cd4398162aba77da55b568d85e5ec6b7705 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 7 Oct 2016 17:01:31 -0700 Subject: mm: vma_adjust: remove superfluous check for next not NULL If next would be NULL we couldn't reach such code path. 
Link: http://lkml.kernel.org/r/1474309513-20313-2-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: Rik van Riel Cc: Hugh Dickins Cc: Mel Gorman Cc: Jan Vorlicek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index e53637f..aa29d43 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -706,7 +706,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * If next doesn't have anon_vma, import from vma after * next, if the vma overlaps with it. */ - if (remove_next == 2 && next && !next->anon_vma) + if (remove_next == 2 && !next->anon_vma) exporter = next->vm_next; } else if (end > next->vm_start) { -- cgit v1.1 From 86d12e471d9f152217744f2054e63e3742949879 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 7 Oct 2016 17:01:34 -0700 Subject: mm: vma_adjust: minor comment correction The cases are three not two. Link: http://lkml.kernel.org/r/1474492522-2261-3-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: Rik van Riel Cc: Hugh Dickins Cc: Mel Gorman Cc: Jan Vorlicek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index aa29d43..4dc65be 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -667,7 +667,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, /* * vma expands, overlapping all the next, and * perhaps the one after too (mprotect case 6). - * The only two other cases that gets here are + * The only other cases that gets here are * case 1, case 7 and case 8. 
*/ if (next == expand) { -- cgit v1.1 From 8f26e0b176f3484c49d55d88fe6083a9cf9ff443 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 7 Oct 2016 17:01:37 -0700 Subject: mm: vma_merge: correct false positive from __vma_unlink->validate_mm_rb The old code was always doing: vma->vm_end = next->vm_end vma_rb_erase(next) // in __vma_unlink vma->vm_next = next->vm_next // in __vma_unlink next = vma->vm_next vma_gap_update(next) The new code still does the above for remove_next == 1 and 2, but for remove_next == 3 it has been changed and it does: next->vm_start = vma->vm_start vma_rb_erase(vma) // in __vma_unlink vma_gap_update(next) In the latter case, while unlinking "vma", validate_mm_rb() is told to ignore "vma" that is being removed, but next->vm_start was reduced instead. So for the new case, to avoid the false positive from validate_mm_rb, it should be "next" that is ignored when "vma" is being unlinked. "vma" and "next" in the above comment, considered pre-swap(). Link: http://lkml.kernel.org/r/1474492522-2261-4-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Tested-by: Shaun Tancheff Cc: Rik van Riel Cc: Hugh Dickins Cc: Mel Gorman Cc: Jan Vorlicek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 59 +++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 4dc65be..1af87c1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -402,15 +402,9 @@ static inline void vma_rb_insert(struct vm_area_struct *vma, rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); } -static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) +static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) { /* - * All rb_subtree_gap values must be consistent prior to erase, - * with the possible exception of the vma being erased. 
- */ - validate_mm_rb(root, vma); - - /* * Note rb_erase_augmented is a fairly large inline function, * so make sure we instantiate it only once with our desired * augmented rbtree callbacks. @@ -418,6 +412,32 @@ static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); } +static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma, + struct rb_root *root, + struct vm_area_struct *ignore) +{ + /* + * All rb_subtree_gap values must be consistent prior to erase, + * with the possible exception of the "next" vma being erased if + * next->vm_start was reduced. + */ + validate_mm_rb(root, ignore); + + __vma_rb_erase(vma, root); +} + +static __always_inline void vma_rb_erase(struct vm_area_struct *vma, + struct rb_root *root) +{ + /* + * All rb_subtree_gap values must be consistent prior to erase, + * with the possible exception of the vma being erased. + */ + validate_mm_rb(root, vma); + + __vma_rb_erase(vma, root); +} + /* * vma has some anon_vma assigned, and is already inserted on that * anon_vma's interval trees. 
@@ -604,11 +624,12 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) static __always_inline void __vma_unlink_common(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, - bool has_prev) + bool has_prev, + struct vm_area_struct *ignore) { struct vm_area_struct *next; - vma_rb_erase(vma, &mm->mm_rb); + vma_rb_erase_ignore(vma, &mm->mm_rb, ignore); next = vma->vm_next; if (has_prev) prev->vm_next = next; @@ -630,13 +651,7 @@ static inline void __vma_unlink_prev(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev) { - __vma_unlink_common(mm, vma, prev, true); -} - -static inline void __vma_unlink(struct mm_struct *mm, - struct vm_area_struct *vma) -{ - __vma_unlink_common(mm, vma, NULL, false); + __vma_unlink_common(mm, vma, prev, true, vma); } /* @@ -815,8 +830,16 @@ again: if (remove_next != 3) __vma_unlink_prev(mm, next, vma); else - /* vma is not before next if they've been swapped */ - __vma_unlink(mm, next); + /* + * vma is not before next if they've been + * swapped. + * + * pre-swap() next->vm_start was reduced so + * tell validate_mm_rb to ignore pre-swap() + * "next" (which is stored in post-swap() + * "vma"). + */ + __vma_unlink_common(mm, next, NULL, false, vma); if (file) __remove_shared_vm_struct(next, file, mapping); } else if (insert) { -- cgit v1.1 From 9996f05eac09815121bb718249f21914a667791f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 7 Oct 2016 17:01:40 -0700 Subject: mm: clarify why we avoid page_mapcount() for slab pages in dump_page() Let's add comment on why we skip page_mapcount() for sl[aou]b pages. Link: http://lkml.kernel.org/r/20160922105532.GB24593@node Signed-off-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/debug.c b/mm/debug.c index 74c7cae..9feb699 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -42,6 +42,11 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { + /* + * Avoid VM_BUG_ON() in page_mapcount(). + * page->_mapcount space in struct page is used by sl[aou]b pages to + * encode own info. + */ int mapcount = PageSlab(page) ? 0 : page_mapcount(page); pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", -- cgit v1.1 From 82e7d3abec86cba9df945a765bba384f8ac113a7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 17:01:43 -0700 Subject: oom: print nodemask in the oom report We have received a hard to explain oom report from a customer. The oom triggered regardless there is a lot of free memory: PoolThread invoked oom-killer: gfp_mask=0x280da, order=0, oom_adj=0, oom_score_adj=0 PoolThread cpuset=/ mems_allowed=0-7 Pid: 30055, comm: PoolThread Tainted: G E X 3.0.101-80-default #1 Call Trace: dump_trace+0x75/0x300 dump_stack+0x69/0x6f dump_header+0x8e/0x110 oom_kill_process+0xa6/0x350 out_of_memory+0x2b7/0x310 __alloc_pages_slowpath+0x7dd/0x820 __alloc_pages_nodemask+0x1e9/0x200 alloc_pages_vma+0xe1/0x290 do_anonymous_page+0x13e/0x300 do_page_fault+0x1fd/0x4c0 page_fault+0x25/0x30 [...] active_anon:1135959151 inactive_anon:1051962 isolated_anon:0 active_file:13093 inactive_file:222506 isolated_file:0 unevictable:262144 dirty:2 writeback:0 unstable:0 free:432672819 slab_reclaimable:7917 slab_unreclaimable:95308 mapped:261139 shmem:166297 pagetables:2228282 bounce:0 [...] 
Node 0 DMA free:15896kB min:0kB low:0kB high:0kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15672kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes lowmem_reserve[]: 0 2892 775542 775542 Node 0 DMA32 free:2783784kB min:28kB low:32kB high:40kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:2961572kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes lowmem_reserve[]: 0 0 772650 772650 Node 0 Normal free:8120kB min:8160kB low:10200kB high:12240kB active_anon:779334960kB inactive_anon:2198744kB active_file:0kB inactive_file:180kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:791193600kB mlocked:131072kB dirty:0kB writeback:0kB mapped:372940kB shmem:361480kB slab_reclaimable:4536kB slab_unreclaimable:68472kB kernel_stack:10104kB pagetables:1414820kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:2280 all_unreclaimable? yes lowmem_reserve[]: 0 0 0 0 Node 1 Normal free:476718144kB min:8192kB low:10240kB high:12288kB active_anon:307623696kB inactive_anon:283620kB active_file:10392kB inactive_file:69908kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:4kB writeback:0kB mapped:257208kB shmem:189896kB slab_reclaimable:3868kB slab_unreclaimable:44756kB kernel_stack:1848kB pagetables:1369432kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? 
no lowmem_reserve[]: 0 0 0 0 Node 2 Normal free:386002452kB min:8192kB low:10240kB high:12288kB active_anon:398563752kB inactive_anon:68184kB active_file:10292kB inactive_file:29936kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:0kB writeback:0kB mapped:32084kB shmem:776kB slab_reclaimable:6888kB slab_unreclaimable:60056kB kernel_stack:8208kB pagetables:1282880kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 Node 3 Normal free:196406760kB min:8192kB low:10240kB high:12288kB active_anon:587445640kB inactive_anon:164396kB active_file:5716kB inactive_file:709844kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:0kB writeback:0kB mapped:291776kB shmem:111416kB slab_reclaimable:5152kB slab_unreclaimable:44516kB kernel_stack:2168kB pagetables:1455956kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 Node 4 Normal free:425338880kB min:8192kB low:10240kB high:12288kB active_anon:359695204kB inactive_anon:43216kB active_file:5748kB inactive_file:14772kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:0kB writeback:0kB mapped:24708kB shmem:1120kB slab_reclaimable:1884kB slab_unreclaimable:41060kB kernel_stack:1856kB pagetables:1100208kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 Node 5 Normal free:11140kB min:8192kB low:10240kB high:12288kB active_anon:784240872kB inactive_anon:1217164kB active_file:28kB inactive_file:48kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:0kB writeback:0kB mapped:11408kB shmem:0kB slab_reclaimable:2008kB slab_unreclaimable:49220kB kernel_stack:1360kB pagetables:531600kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:1202 all_unreclaimable? 
yes lowmem_reserve[]: 0 0 0 0 Node 6 Normal free:243395332kB min:8192kB low:10240kB high:12288kB active_anon:542015544kB inactive_anon:40208kB active_file:968kB inactive_file:8484kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:0kB writeback:0kB mapped:19992kB shmem:496kB slab_reclaimable:1672kB slab_unreclaimable:37052kB kernel_stack:2088kB pagetables:750264kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 Node 7 Normal free:10768kB min:8192kB low:10240kB high:12288kB active_anon:784916936kB inactive_anon:192316kB active_file:19228kB inactive_file:56852kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:4kB writeback:0kB mapped:34440kB shmem:4kB slab_reclaimable:5660kB slab_unreclaimable:36100kB kernel_stack:1328kB pagetables:1007968kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 So all nodes but Node 0 have a lot of free memory which should suggest that there is an available memory especially when mems_allowed=0-7. One could speculate that a massive process has managed to terminate and free up a lot of memory while racing with the above allocation request. Although this is highly unlikely it cannot be ruled out. Further debugging, however, showed that the faulting process had mempolicy (not cpuset) to bind to Node 0. We cannot see that information from the report though. mems_allowed turned out to be more confusing than really helpful. Fix this by always printing the nodemask. It is either mempolicy mask (and non-null) or the one defined by the cpusets.
The new output for the above oom report would be PoolThread invoked oom-killer: gfp_mask=0x280da(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=0, order=0, oom_adj=0, oom_score_adj=0 This patch doesn't touch show_mem and the node filtering based on the cpuset node mask because mempolicy is always a subset of cpusets and seeing the full cpuset oom context might be helpful for tuning more specific mempolicies inside cpusets (e.g. when they turn out to be too restrictive). To avoid ugly ifdefs the mask is printed even for !NUMA configurations but this should be OK (a single node will be printed). Link: http://lkml.kernel.org/r/20160930214146.28600-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Sellami Abdelkader Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Sellami Abdelkader Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f284e92..ec9f11d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -403,8 +403,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", - current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, + nodemask_t *nm = (oc->nodemask) ?
oc->nodemask : &cpuset_current_mems_allowed; + + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, + nodemask_pr_args(nm), oc->order, current->signal->oom_score_adj); if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) pr_warn("COMPACTION is disabled!!!\n"); -- cgit v1.1 From 461a7184320a1b4d2c12ad538354062fef4ee0f1 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Fri, 7 Oct 2016 17:01:46 -0700 Subject: mm/hugetlb: introduce ARCH_HAS_GIGANTIC_PAGE Avoid making ifdef get pretty unwieldy if many ARCHs support gigantic page. No functional change with this patch. Link: http://lkml.kernel.org/r/1475227569-63446-2-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Suggested-by: Michal Hocko Acked-by: Michal Hocko Acked-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: Hanjun Guo Cc: Will Deacon Cc: Dave Hansen Cc: Sudeep Holla Cc: Catalin Marinas Cc: Mark Rutland Cc: Rob Herring Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 770d83e..e4a4500 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1022,7 +1022,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \ +#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \ ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \ defined(CONFIG_CMA)) static void destroy_compound_gigantic_page(struct page *page, -- cgit v1.1 From c2a9737f45e27d8263ff9643f994bda9bac0b944 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Fri, 7 Oct 2016 17:01:52 -0700 Subject: vfs,mm: fix a dead loop in truncate_inode_pages_range() We triggered a deadloop in truncate_inode_pages_range() on 32 bits architecture with the test case 
below: ... fd = open(); write(fd, buf, 4096); preadv64(fd, &iovec, 1, 0xffffffff000); ftruncate(fd, 0); ... Then ftruncate() will never return. The filesystem used in this case is ubifs, but it can be triggered on many other filesystems. When preadv64() is called with offset=0xffffffff000, a page with index=0xffffffff will be added to the radix tree of ->mapping. Then this page can be found in ->mapping with pagevec_lookup(). After that, truncate_inode_pages_range(), which is called in ftruncate(), will fall into an infinite loop: - find a page with index=0xffffffff, since index>=end, this page won't be truncated - index++, and index becomes 0 - the page with index=0xffffffff will be found again The data type of index is unsigned long, so index won't overflow to 0 on 64-bit architectures in this case, and the dead loop won't happen. Since truncate_inode_pages_range() is executed while holding inode->i_rwsem, any operation that needs this lock will be blocked, and a hung task will happen, e.g.: INFO: task truncate_test:3364 blocked for more than 120 seconds. ... call_rwsem_down_write_failed+0x17/0x30 generic_file_write_iter+0x32/0x1c0 ubifs_write_iter+0xcc/0x170 __vfs_write+0xc4/0x120 vfs_write+0xb2/0x1b0 SyS_write+0x46/0xa0 The page with index=0xffffffff added to ->mapping is useless. Fix this by checking the read position before allocating pages.
Link: http://lkml.kernel.org/r/1475151010-40166-1-git-send-email-fangwei1@huawei.com Signed-off-by: Wei Fang Cc: Christoph Hellwig Cc: Dave Chinner Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 1b05f75..2f7b778 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1687,6 +1687,10 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, unsigned int prev_offset; int error = 0; + if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) + return -EINVAL; + iov_iter_truncate(iter, inode->i_sb->s_maxbytes); + index = *ppos >> PAGE_SHIFT; prev_index = ra->prev_pos >> PAGE_SHIFT; prev_offset = ra->prev_pos & (PAGE_SIZE-1); -- cgit v1.1 From 7877cdcc3893c1bd9a833b2f0398e7320794c6e6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 17:01:55 -0700 Subject: mm: consolidate warn_alloc_failed users warn_alloc_failed is currently used from the page and vmalloc allocators. This is a good reuse of the code except that vmalloc would appreciate a slightly different warning message. This is already handled by the fmt parameter except that "%s: page allocation failure: order:%u, mode:%#x(%pGg)" is printed anyway. This might be quite misleading because it might be a vmalloc failure which leads to the warning while the page allocator is not the culprit here. Fix this by always using the fmt string and only print the context that makes sense for the particular context (e.g. order makes only very little sense for the vmalloc context). Rename the function to not miss any user and also because a later patch will reuse it also for !failure cases. 
Link: http://lkml.kernel.org/r/20160929084407.7004-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Tetsuo Handa Cc: Johannes Weiner Cc: Mel Gorman Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 27 ++++++++++++--------------- mm/vmalloc.c | 14 ++++++-------- 2 files changed, 18 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bcfa647..5ab2e30 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2979,9 +2979,11 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); -void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) +void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) { unsigned int filter = SHOW_MEM_FILTER_NODES; + struct va_format vaf; + va_list args; if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || debug_guardpage_minorder() > 0) @@ -2999,22 +3001,16 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) 
if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; - if (fmt) { - struct va_format vaf; - va_list args; + pr_warn("%s: ", current->comm); - va_start(args, fmt); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_cont("%pV", &vaf); + va_end(args); - vaf.fmt = fmt; - vaf.va = &args; + pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); - pr_warn("%pV", &vaf); - - va_end(args); - } - - pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n", - current->comm, order, gfp_mask, &gfp_mask); dump_stack(); if (!should_suppress_show_mem()) show_mem(filter); @@ -3680,7 +3676,8 @@ retry: } nopage: - warn_alloc_failed(gfp_mask, order, NULL); + warn_alloc(gfp_mask, + "page allocation failure: order:%u", order); got_pg: return page; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 80660a0..f2481cb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1601,7 +1601,6 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { - const int order = 0; struct page **pages; unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; @@ -1629,9 +1628,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; if (node == NUMA_NO_NODE) - page = alloc_pages(alloc_mask, order); + page = alloc_page(alloc_mask); else - page = alloc_pages_node(node, alloc_mask, order); + page = alloc_pages_node(node, alloc_mask, 0); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1648,8 +1647,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - warn_alloc_failed(gfp_mask, order, - "vmalloc: allocation failure, allocated %ld of %ld bytes\n", + warn_alloc(gfp_mask, + "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); 
vfree(area->addr); return NULL; @@ -1710,9 +1709,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr; fail: - warn_alloc_failed(gfp_mask, 0, - "vmalloc: allocation failure: %lu bytes\n", - real_size); + warn_alloc(gfp_mask, + "vmalloc: allocation failure: %lu bytes", real_size); return NULL; } -- cgit v1.1 From 63f53dea0c9866e93802d50a230c460a024c44e5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 17:01:58 -0700 Subject: mm: warn about allocations which stall for too long Currently we do warn only about allocation failures but small allocations are basically nofail and they might loop in the page allocator for a long time. Especially when the reclaim cannot make any progress - e.g. GFP_NOFS cannot invoke the oom killer and rely on a different context to make a forward progress in case there is a lot memory used by filesystems. Give us at least a clue when something like this happens and warn about allocations which take more than 10s. Print the basic allocation context information along with the cumulative time spent in the allocation as well as the allocation stack. Repeat the warning after every 10 seconds so that we know that the problem is permanent rather than ephemeral. 
Link: http://lkml.kernel.org/r/20160929084407.7004-3-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Vlastimil Babka Cc: Tetsuo Handa Cc: Johannes Weiner Cc: Mel Gorman Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5ab2e30..ca423cc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3493,6 +3493,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, enum compact_result compact_result; int compaction_retries = 0; int no_progress_loops = 0; + unsigned long alloc_start = jiffies; + unsigned int stall_timeout = 10 * HZ; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3648,6 +3650,14 @@ retry: if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) goto nopage; + /* Make sure we know about allocations which stall for too long */ + if (time_after(jiffies, alloc_start + stall_timeout)) { + warn_alloc(gfp_mask, + "page alloction stalls for %ums, order:%u\n", + jiffies_to_msecs(jiffies-alloc_start), order); + stall_timeout += 10 * HZ; + } + if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, did_some_progress > 0, &no_progress_loops)) goto retry; -- cgit v1.1 From 72e2936c04f7d2a4bf87d7f72d3bf11cf91ebb47 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 7 Oct 2016 17:02:01 -0700 Subject: mm: remove unnecessary condition in remove_inode_hugepages When the huge page is added to the page cache (huge_add_to_page_cache), the page private flag will be cleared. Since this code (remove_inode_hugepages) will only be called for pages in the page cache, PagePrivate(page) will always be false. The patch removes the code without any functional change.
Link: http://lkml.kernel.org/r/1475113323-29368-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Reviewed-by: Naoya Horiguchi Reviewed-by: Mike Kravetz Tested-by: Mike Kravetz Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e4a4500..ec49d9e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -567,13 +567,13 @@ retry: * appear as a "reserved" entry instead of simply dangling with incorrect * counts. */ -void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve) +void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; rsv_adjust = hugepage_subpool_get_pages(spool, 1); - if (restore_reserve && rsv_adjust) { + if (rsv_adjust) { struct hstate *h = hstate_inode(inode); hugetlb_acct_memory(h, 1); -- cgit v1.1 From 68ba0326b4e14988f9e0c24a6e12a85cf2acd1ca Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 7 Oct 2016 17:02:14 -0700 Subject: proc: much faster /proc/vmstat Every current KDE system has process named ksysguardd polling files below once in several seconds: $ strace -e trace=open -p $(pidof ksysguardd) Process 1812 attached open("/etc/mtab", O_RDONLY|O_CLOEXEC) = 8 open("/etc/mtab", O_RDONLY|O_CLOEXEC) = 8 open("/proc/net/dev", O_RDONLY) = 8 open("/proc/net/wireless", O_RDONLY) = -1 ENOENT (No such file or directory) open("/proc/stat", O_RDONLY) = 8 open("/proc/vmstat", O_RDONLY) = 8 Hell knows what it is doing but speed up reading /proc/vmstat by 33%! Benchmark is open+read+close 1.000.000 times. 
BEFORE $ perf stat -r 10 taskset -c 3 ./proc-vmstat Performance counter stats for 'taskset -c 3 ./proc-vmstat' (10 runs): 13146.768464 task-clock (msec) # 0.960 CPUs utilized ( +- 0.60% ) 15 context-switches # 0.001 K/sec ( +- 1.41% ) 1 cpu-migrations # 0.000 K/sec ( +- 11.11% ) 104 page-faults # 0.008 K/sec ( +- 0.57% ) 45,489,799,349 cycles # 3.460 GHz ( +- 0.03% ) 9,970,175,743 stalled-cycles-frontend # 21.92% frontend cycles idle ( +- 0.10% ) 2,800,298,015 stalled-cycles-backend # 6.16% backend cycles idle ( +- 0.32% ) 79,241,190,850 instructions # 1.74 insn per cycle # 0.13 stalled cycles per insn ( +- 0.00% ) 17,616,096,146 branches # 1339.956 M/sec ( +- 0.00% ) 176,106,232 branch-misses # 1.00% of all branches ( +- 0.18% ) 13.691078109 seconds time elapsed ( +- 0.03% ) ^^^^^^^^^^^^ AFTER $ perf stat -r 10 taskset -c 3 ./proc-vmstat Performance counter stats for 'taskset -c 3 ./proc-vmstat' (10 runs): 8688.353749 task-clock (msec) # 0.950 CPUs utilized ( +- 1.25% ) 10 context-switches # 0.001 K/sec ( +- 2.13% ) 1 cpu-migrations # 0.000 K/sec 104 page-faults # 0.012 K/sec ( +- 0.56% ) 30,384,010,730 cycles # 3.497 GHz ( +- 0.07% ) 12,296,259,407 stalled-cycles-frontend # 40.47% frontend cycles idle ( +- 0.13% ) 3,370,668,651 stalled-cycles-backend # 11.09% backend cycles idle ( +- 0.69% ) 28,969,052,879 instructions # 0.95 insn per cycle # 0.42 stalled cycles per insn ( +- 0.01% ) 6,308,245,891 branches # 726.058 M/sec ( +- 0.00% ) 214,685,502 branch-misses # 3.40% of all branches ( +- 0.26% ) 9.146081052 seconds time elapsed ( +- 0.07% ) ^^^^^^^^^^^ vsnprintf() is slow because: 1. format_decode() is busy looking for format specifier: 2 branches per character (not in this case, but in others) 2. approximately million branches while parsing format mini language and everywhere 3. 
just look at what string() does /proc/vmstat is good case because most of its content are strings Link: http://lkml.kernel.org/r/20160806125455.GA1187@p183.telecom.by Signed-off-by: Alexey Dobriyan Cc: Joe Perches Cc: Andi Kleen Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 73aab31..8857e0e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1513,7 +1513,10 @@ static int vmstat_show(struct seq_file *m, void *arg) { unsigned long *l = arg; unsigned long off = l - (unsigned long *)m->private; - seq_printf(m, "%s %lu\n", vmstat_text[off], *l); + + seq_puts(m, vmstat_text[off]); + seq_put_decimal_ull(m, ' ', *l); + seq_putc(m, '\n'); return 0; } -- cgit v1.1 From 75ba1d07fd6a494851db5132612944a9d4773f9c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 7 Oct 2016 17:02:20 -0700 Subject: seq/proc: modify seq_put_decimal_[u]ll to take a const char *, not char Allow some seq_puts removals by taking a string instead of a single char. [akpm@linux-foundation.org: update vmstat_show(), per Joe] Link: http://lkml.kernel.org/r/667e1cf3d436de91a5698170a1e98d882905e956.1470704995.git.joe@perches.com Signed-off-by: Joe Perches Cc: Joe Perches Cc: Andi Kleen Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 8857e0e..604f26a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1515,7 +1515,7 @@ static int vmstat_show(struct seq_file *m, void *arg) unsigned long off = l - (unsigned long *)m->private; seq_puts(m, vmstat_text[off]); - seq_put_decimal_ull(m, ' ', *l); + seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); return 0; } -- cgit v1.1