diff options
-rw-r--r-- | include/linux/mempolicy.h | 13 | ||||
-rw-r--r-- | mm/mempolicy.c | 44 | ||||
-rw-r--r-- | mm/oom_kill.c | 104 |
3 files changed, 124 insertions, 37 deletions
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 7b9ef6b..31ac26c 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -210,6 +210,8 @@ extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); extern bool init_nodemask_of_mempolicy(nodemask_t *mask); +extern bool mempolicy_nodemask_intersects(struct task_struct *tsk, + const nodemask_t *mask); extern unsigned slab_node(struct mempolicy *policy); extern enum zone_type policy_zone; @@ -338,7 +340,16 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, return node_zonelist(0, gfp_flags); } -static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; } +static inline bool init_nodemask_of_mempolicy(nodemask_t *m) +{ + return false; +} + +static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk, + const nodemask_t *mask) +{ + return false; +} static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5bc0a96..8a73708 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1712,6 +1712,50 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) } #endif +/* + * mempolicy_nodemask_intersects + * + * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default + * policy. Otherwise, check for intersection between mask and the policy + * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local' + * policy, always return true since it may allocate elsewhere on fallback. + * + * Takes task_lock(tsk) to prevent freeing of its mempolicy. 
+ */ +bool mempolicy_nodemask_intersects(struct task_struct *tsk, + const nodemask_t *mask) +{ + struct mempolicy *mempolicy; + bool ret = true; + + if (!mask) + return ret; + task_lock(tsk); + mempolicy = tsk->mempolicy; + if (!mempolicy) + goto out; + + switch (mempolicy->mode) { + case MPOL_PREFERRED: + /* + * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to + * allocate from, they may fall back to other nodes when oom. + * Thus, it's possible for tsk to have allocated memory from + * nodes in mask. + */ + break; + case MPOL_BIND: + case MPOL_INTERLEAVE: + ret = nodes_intersects(mempolicy->v.nodes, *mask); + break; + default: + BUG(); + } +out: + task_unlock(tsk); + return ret; +} + /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7c8488f..13ceed7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -27,6 +27,7 @@ #include <linux/module.h> #include <linux/notifier.h> #include <linux/memcontrol.h> +#include <linux/mempolicy.h> #include <linux/security.h> int sysctl_panic_on_oom; @@ -35,23 +36,57 @@ int sysctl_oom_dump_tasks; static DEFINE_SPINLOCK(zone_scan_lock); /* #define DEBUG */ -/* - * Is all threads of the target process nodes overlap ours? +#ifdef CONFIG_NUMA +/** + * has_intersects_mems_allowed() - check task eligibility for kill + * @tsk: task struct of which task to consider + * @mask: nodemask passed to page allocator for mempolicy ooms + * + * Task eligibility is determined by whether or not a candidate task, @tsk, + * shares the same mempolicy nodes as current if it is bound by such a policy + * and whether or not it has the same set of allowed cpuset nodes. 
*/ -static int has_intersects_mems_allowed(struct task_struct *tsk) +static bool has_intersects_mems_allowed(struct task_struct *tsk, + const nodemask_t *mask) { - struct task_struct *t; + struct task_struct *start = tsk; - t = tsk; do { - if (cpuset_mems_allowed_intersects(current, t)) - return 1; - t = next_thread(t); - } while (t != tsk); - - return 0; + if (mask) { + /* + * If this is a mempolicy constrained oom, tsk's + * cpuset is irrelevant. Only return true if its + * mempolicy intersects current, otherwise it may be + * needlessly killed. + */ + if (mempolicy_nodemask_intersects(tsk, mask)) + return true; + } else { + /* + * This is not a mempolicy constrained oom, so only + * check the mems of tsk's cpuset. + */ + if (cpuset_mems_allowed_intersects(current, tsk)) + return true; + } + tsk = next_thread(tsk); + } while (tsk != start); + return false; +} +#else +static bool has_intersects_mems_allowed(struct task_struct *tsk, + const nodemask_t *mask) +{ + return true; } +#endif /* CONFIG_NUMA */ +/* + * The process p may have detached its own ->mm while exiting or through + * use_mm(), but one or more of its subthreads may still have a valid + * pointer. Return p, or any of its subthreads with a valid ->mm, with + * task_lock() held. + */ static struct task_struct *find_lock_task_mm(struct task_struct *p) { struct task_struct *t = p; @@ -106,10 +141,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * The memory size of the process is the basis for the badness. 
*/ points = p->mm->total_vm; - - /* - * After this unlock we can no longer dereference local variable `mm' - */ task_unlock(p); /* @@ -253,7 +284,8 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, * (not docbooked, we don't want this one cluttering up the manual) */ static struct task_struct *select_bad_process(unsigned long *ppoints, - struct mem_cgroup *mem) + struct mem_cgroup *mem, enum oom_constraint constraint, + const nodemask_t *mask) { struct task_struct *p; struct task_struct *chosen = NULL; @@ -269,7 +301,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, continue; if (mem && !task_in_mem_cgroup(p, mem)) continue; - if (!has_intersects_mems_allowed(p)) + if (!has_intersects_mems_allowed(p, + constraint == CONSTRAINT_MEMORY_POLICY ? mask : + NULL)) continue; /* @@ -497,7 +531,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) panic("out of memory(memcg). panic_on_oom is selected.\n"); read_lock(&tasklist_lock); retry: - p = select_bad_process(&points, mem); + p = select_bad_process(&points, mem, CONSTRAINT_NONE, NULL); if (!p || PTR_ERR(p) == -1UL) goto out; @@ -576,7 +610,8 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) /* * Must be called with tasklist_lock held for read. */ -static void __out_of_memory(gfp_t gfp_mask, int order) +static void __out_of_memory(gfp_t gfp_mask, int order, + enum oom_constraint constraint, const nodemask_t *mask) { struct task_struct *p; unsigned long points; @@ -590,7 +625,7 @@ retry: * Rambo mode: Shoot down a process and hope it solves whatever * issues we may have. */ - p = select_bad_process(&points, NULL); + p = select_bad_process(&points, NULL, constraint, mask); if (PTR_ERR(p) == -1UL) return; @@ -624,7 +659,8 @@ void pagefault_out_of_memory(void) panic("out of memory from page fault. 
panic_on_oom is selected.\n"); read_lock(&tasklist_lock); - __out_of_memory(0, 0); /* unknown gfp_mask and order */ + /* unknown gfp_mask and order */ + __out_of_memory(0, 0, CONSTRAINT_NONE, NULL); read_unlock(&tasklist_lock); /* @@ -640,6 +676,7 @@ void pagefault_out_of_memory(void) * @zonelist: zonelist pointer * @gfp_mask: memory allocation flags * @order: amount of memory being requested as a power of 2 + * @nodemask: nodemask passed to page allocator * * If we run out of memory, we have the choice between either * killing a random task (bad), letting the system crash (worse) @@ -678,24 +715,19 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ constraint = constrained_alloc(zonelist, gfp_mask, nodemask); read_lock(&tasklist_lock); - - switch (constraint) { - case CONSTRAINT_MEMORY_POLICY: - oom_kill_process(current, gfp_mask, order, 0, NULL, - "No available memory (MPOL_BIND)"); - break; - - case CONSTRAINT_NONE: - if (sysctl_panic_on_oom) { + if (unlikely(sysctl_panic_on_oom)) { + /* + * panic_on_oom only affects CONSTRAINT_NONE, the kernel + * should not panic for cpuset or mempolicy induced memory + * failures. + */ + if (constraint == CONSTRAINT_NONE) { dump_header(NULL, gfp_mask, order, NULL); - panic("out of memory. panic_on_oom is selected\n"); + read_unlock(&tasklist_lock); + panic("Out of memory: panic_on_oom is enabled\n"); } - /* Fall-through */ - case CONSTRAINT_CPUSET: - __out_of_memory(gfp_mask, order); - break; } - + __out_of_memory(gfp_mask, order, constraint, nodemask); read_unlock(&tasklist_lock); /* |