From a63d83f427fbce97a6cea0db2e64b0eb8435cd10 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 9 Aug 2010 17:19:46 -0700 Subject: oom: badness heuristic rewrite This a complete rewrite of the oom killer's badness() heuristic which is used to determine which task to kill in oom conditions. The goal is to make it as simple and predictable as possible so the results are better understood and we end up killing the task which will lead to the most memory freeing while still respecting the fine-tuning from userspace. Instead of basing the heuristic on mm->total_vm for each task, the task's rss and swap space is used instead. This is a better indication of the amount of memory that will be freeable if the oom killed task is chosen and subsequently exits. This helps specifically in cases where KDE or GNOME is chosen for oom kill on desktop systems instead of a memory hogging task. The baseline for the heuristic is a proportion of memory that each task is currently using in memory plus swap compared to the amount of "allowable" memory. "Allowable," in this sense, means the system-wide resources for unconstrained oom conditions, the set of mempolicy nodes, the mems attached to current's cpuset, or a memory controller's limit. The proportion is given on a scale of 0 (never kill) to 1000 (always kill), roughly meaning that if a task has a badness() score of 500 that the task consumes approximately 50% of allowable memory resident in RAM or in swap space. The proportion is always relative to the amount of "allowable" memory and not the total amount of RAM systemwide so that mempolicies and cpusets may operate in isolation; they shall not need to know the true size of the machine on which they are running if they are bound to a specific set of nodes or mems, respectively. Root tasks are given 3% extra memory just like __vm_enough_memory() provides in LSMs. In the event of two tasks consuming similar amounts of memory, it is generally better to save root's task. Because of the change in the badness() heuristic's baseline, it is also necessary to introduce a new user interface to tune it. It's not possible to redefine the meaning of /proc/pid/oom_adj with a new scale since the ABI cannot be changed for backward compatability. Instead, a new tunable, /proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may be used to polarize the heuristic such that certain tasks are never considered for oom kill while others may always be considered. The value is added directly into the badness() score so a value of -500, for example, means to discount 50% of its memory consumption in comparison to other tasks either on the system, bound to the mempolicy, in the cpuset, or sharing the same memory controller. /proc/pid/oom_adj is changed so that its meaning is rescaled into the units used by /proc/pid/oom_score_adj, and vice versa. Changing one of these per-task tunables will rescale the value of the other to an equivalent meaning. Although /proc/pid/oom_adj was originally defined as a bitshift on the badness score, it now shares the same linear growth as /proc/pid/oom_score_adj but with different granularity. This is required so the ABI is not broken with userspace applications and allows oom_adj to be deprecated for future removal. Signed-off-by: David Rientjes Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Oleg Nesterov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 5949d0a..f923b72 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -430,12 +431,11 @@ static const struct file_operations proc_lstats_operations = { static int proc_oom_score(struct task_struct *task, char *buffer) { unsigned long points = 0; - struct timespec uptime; - do_posix_clock_monotonic_gettime(&uptime); read_lock(&tasklist_lock); if (pid_alive(task)) - points = badness(task, NULL, NULL, uptime.tv_sec); + points = oom_badness(task, NULL, NULL, + totalram_pages + total_swap_pages); read_unlock(&tasklist_lock); return sprintf(buffer, "%lu\n", points); } @@ -1038,7 +1038,15 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, } task->signal->oom_adj = oom_adjust; - + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + * value is always attainable. + */ + if (task->signal->oom_adj == OOM_ADJUST_MAX) + task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX; + else + task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / + -OOM_DISABLE; unlock_task_sighand(task, &flags); put_task_struct(task); @@ -1051,6 +1059,82 @@ static const struct file_operations proc_oom_adjust_operations = { .llseek = generic_file_llseek, }; +static ssize_t oom_score_adj_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + int oom_score_adj = OOM_SCORE_ADJ_MIN; + unsigned long flags; + size_t len; + + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + oom_score_adj = task->signal->oom_score_adj; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + unsigned long flags; + long oom_score_adj; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); + if (err) + return -EINVAL; + if (oom_score_adj < OOM_SCORE_ADJ_MIN || + oom_score_adj > OOM_SCORE_ADJ_MAX) + return -EINVAL; + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) + return -ESRCH; + if (!lock_task_sighand(task, &flags)) { + put_task_struct(task); + return -ESRCH; + } + if (oom_score_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + unlock_task_sighand(task, &flags); + put_task_struct(task); + return -EACCES; + } + + task->signal->oom_score_adj = oom_score_adj; + /* + * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is + * always attainable. + */ + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + task->signal->oom_adj = OOM_DISABLE; + else + task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / + OOM_SCORE_ADJ_MAX; + unlock_task_sighand(task, &flags); + put_task_struct(task); + return count; +} + +static const struct file_operations proc_oom_score_adj_operations = { + .read = oom_score_adj_read, + .write = oom_score_adj_write, +}; + #ifdef CONFIG_AUDITSYSCALL #define TMPBUFLEN 21 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, @@ -2623,6 +2707,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif INF("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), @@ -2957,6 +3042,7 @@ static const struct pid_entry tid_base_stuff[] = { #endif INF("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUSR, proc_sessionid_operations), -- cgit v1.1