From 1f7dd3e5a6e4f093017fff12232572ee1aa4639b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Dec 2015 10:18:21 -0500 Subject: cgroup: fix handling of multi-destination migration from subtree_control enabling Consider the following v2 hierarchy. P0 (+memory) --- P1 (-memory) --- A \- B P0 has memory enabled in its subtree_control while P1 doesn't. If both A and B contain processes, they would belong to the memory css of P1. Now if memory is enabled on P1's subtree_control, memory csses should be created on both A and B and A's processes should be moved to the former and B's processes the latter. IOW, enabling controllers can cause atomic migrations into different csses. The core cgroup migration logic has been updated accordingly but the controller migration methods haven't and still assume that all tasks migrate to a single target css; furthermore, the methods were fed the css in which subtree_control was updated which is the parent of the target csses. pids controller depends on the migration methods to move charges and this made the controller attribute charges to the wrong csses often triggering the following warning by driving a counter negative. WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40() Modules linked in: CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29 ... ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000 ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00 ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8 Call Trace: [] dump_stack+0x4e/0x82 [] warn_slowpath_common+0x82/0xc0 [] warn_slowpath_null+0x1a/0x20 [] pids_cancel.constprop.6+0x31/0x40 [] pids_can_attach+0x6d/0xf0 [] cgroup_taskset_migrate+0x6c/0x330 [] cgroup_migrate+0xf5/0x190 [] cgroup_attach_task+0x176/0x200 [] __cgroup_procs_write+0x2ad/0x460 [] cgroup_procs_write+0x14/0x20 [] cgroup_file_write+0x35/0x1c0 [] kernfs_fop_write+0x141/0x190 [] __vfs_write+0x28/0xe0 [] vfs_write+0xac/0x1a0 [] SyS_write+0x49/0xb0 [] entry_SYSCALL_64_fastpath+0x12/0x76 This patch fixes the bug by removing @css parameter from the three migration methods, ->can_attach, ->cancel_attach() and ->attach() and updating cgroup_taskset iteration helpers also return the destination css in addition to the task being migrated. All controllers are updated accordingly. * Controllers which don't care whether there are one or multiple target csses can be converted trivially. cpu, io, freezer, perf, netclassid and netprio fall in this category. * cpuset's current implementation assumes that there's single source and destination and thus doesn't support v2 hierarchy already. The only change made by this patchset is how that single destination css is obtained. * memory migration path already doesn't do anything on v2. How the single destination css is obtained is updated and the prep stage of mem_cgroup_can_attach() is reordered to accomodate the change. * pids is the only controller which was affected by this bug. It now correctly handles multi-destination migrations and no longer causes counter underflow from incorrect accounting. Signed-off-by: Tejun Heo Reported-and-tested-by: Daniel Wagner Cc: Aleksa Sarai --- kernel/sched/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d568ac9..a9db4819 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8217,12 +8217,12 @@ static void cpu_cgroup_fork(struct task_struct *task, void *private) sched_move_task(task); } -static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -8235,12 +8235,12 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpu_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } -- cgit v1.1 From 119d6f6a3be8b424b200dcee56e74484d5445f7e Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 30 Nov 2015 20:34:20 -0500 Subject: sched/core: Remove false-positive warning from wake_up_process() Because wakeups can (fundamentally) be late, a task might not be in the expected state. Therefore testing against a task's state is racy, and can yield false positives. Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: oleg@redhat.com Fixes: 9067ac85d533 ("wake_up_process() should be never used to wakeup a TASK_STOPPED/TRACED task") Link: http://lkml.kernel.org/r/1448933660-23082-1-git-send-email-sasha.levin@oracle.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d568ac9..fc8c987 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2039,7 +2039,6 @@ out: */ int wake_up_process(struct task_struct *p) { - WARN_ON(task_is_stopped_or_traced(p)); return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); -- cgit v1.1 From 8295c69925ad53ec32ca54ac9fc194ff21bc40e2 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 2 Dec 2015 19:52:59 +0800 Subject: sched/core: Clear the root_domain cpumasks in init_rootdomain() root_domain::rto_mask allocated through alloc_cpumask_var() contains garbage data, this may cause problems. For instance, When doing pull_rt_task(), it may do useless iterations if rto_mask retains some extra garbage bits. Worse still, this violates the isolated domain rule for clustered scheduling using cpuset, because the tasks(with all the cpus allowed) belongs to one root domain can be pulled away into another root domain. The patch cleans the garbage by using zalloc_cpumask_var() instead of alloc_cpumask_var() for root_domain::rto_mask allocation, thereby addressing the issues. Do the same thing for root_domain's other cpumask memembers: dlo_mask, span, and online. Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1449057179-29321-1-git-send-email-xlpang@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fc8c987..eee4ee6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5846,13 +5846,13 @@ static int init_rootdomain(struct root_domain *rd) { memset(rd, 0, sizeof(*rd)); - if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) goto free_online; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; init_dl_bw(&rd->dl_bw); -- cgit v1.1 From b75a22531588e77aa8c2daf228c9723916ae2cd0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Oct 2015 14:36:17 +0200 Subject: sched/core: Better document the try_to_wake_up() barriers Explain how the control dependency and smp_rmb() end up providing ACQUIRE semantics and pair with smp_store_release() in finish_lock_switch(). Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index eee4ee6..b64f163 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1953,7 +1953,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) while (p->on_cpu) cpu_relax(); /* - * Pairs with the smp_wmb() in finish_lock_switch(). + * Combined with the control dependency above, we have an effective + * smp_load_acquire() without the need for full barriers. + * + * Pairs with the smp_store_release() in finish_lock_switch(). + * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. */ smp_rmb(); -- cgit v1.1 From ecf7d01c229d11a44609c0067889372c91fb4f36 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Oct 2015 14:14:13 +0200 Subject: sched/core: Fix an SMP ordering race in try_to_wake_up() vs. schedule() Oleg noticed that its possible to falsely observe p->on_cpu == 0 such that we'll prematurely continue with the wakeup and effectively run p on two CPUs at the same time. Even though the overlap is very limited; the task is in the middle of being scheduled out; it could still result in corruption of the scheduler data structures. CPU0 CPU1 set_current_state(...) context_switch(X, Y) prepare_lock_switch(Y) Y->on_cpu = 1; finish_lock_switch(X) store_release(X->on_cpu, 0); try_to_wake_up(X) LOCK(p->pi_lock); t = X->on_cpu; // 0 context_switch(Y, X) prepare_lock_switch(X) X->on_cpu = 1; finish_lock_switch(Y) store_release(Y->on_cpu, 0); schedule(); deactivate_task(X); X->on_rq = 0; if (X->on_rq) // false if (t) while (X->on_cpu) cpu_relax(); context_switch(X, ..) finish_lock_switch(X) store_release(X->on_cpu, 0); Avoid the load of X->on_cpu being hoisted over the X->on_rq load. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b64f163..7063c6a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1947,6 +1947,25 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #ifdef CONFIG_SMP /* + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be + * possible to, falsely, observe p->on_cpu == 0. + * + * One must be running (->on_cpu == 1) in order to remove oneself + * from the runqueue. + * + * [S] ->on_cpu = 1; [L] ->on_rq + * UNLOCK rq->lock + * RMB + * LOCK rq->lock + * [S] ->on_rq = 0; [L] ->on_cpu + * + * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock + * from the consecutive calls to schedule(); the first switching to our + * task, the second putting it to sleep. + */ + smp_rmb(); + + /* * If the owning (remote) cpu is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. */ -- cgit v1.1