From a1f84a3ab8e002159498814eaa7e48c33752b04b Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 27 Oct 2009 15:35:38 +0100 Subject: sched: Check for an idle shared cache in select_task_rq_fair() When waking affine, check for an idle shared cache, and if found, wake to that CPU/sibling instead of the waker's CPU. This improves pgsql+oltp ramp up by roughly 8%. Possibly more for other loads, depending on overlap. The trade-off is a roughly 1% peak downturn if tasks are truly synchronous. Signed-off-by: Mike Galbraith Cc: Arjan van de Ven Cc: Peter Zijlstra Cc: LKML-Reference: <1256654138.17752.7.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4e777b4..da87385 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1372,11 +1372,36 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag want_sd = 0; } - if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && - cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) { + int candidate = -1, i; - affine_sd = tmp; - want_affine = 0; + if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) + candidate = cpu; + + /* + * Check for an idle shared cache. 
+ */ + if (tmp->flags & SD_PREFER_SIBLING) { + if (candidate == cpu) { + if (!cpu_rq(prev_cpu)->cfs.nr_running) + candidate = prev_cpu; + } + + if (candidate == -1 || candidate == cpu) { + for_each_cpu(i, sched_domain_span(tmp)) { + if (!cpu_rq(i)->cfs.nr_running) { + candidate = i; + break; + } + } + } + } + + if (candidate >= 0) { + affine_sd = tmp; + want_affine = 0; + cpu = candidate; + } } if (!want_sd && !want_affine) -- cgit v1.1 From fd210738f6601d0fb462df9a2fe5a41896ff6a8f Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 5 Nov 2009 10:57:46 +0100 Subject: sched: Fix affinity logic in select_task_rq_fair() Ingo Molnar reported: [ 26.804000] BUG: using smp_processor_id() in preemptible [00000000] code: events/1/10 [ 26.808000] caller is vmstat_update+0x26/0x70 [ 26.812000] Pid: 10, comm: events/1 Not tainted 2.6.32-rc5 #6887 [ 26.816000] Call Trace: [ 26.820000] [] ? printk+0x28/0x3c [ 26.824000] [] debug_smp_processor_id+0xf0/0x110 [ 26.824000] mount used greatest stack depth: 1464 bytes left [ 26.828000] [] vmstat_update+0x26/0x70 [ 26.832000] [] worker_thread+0x188/0x310 [ 26.836000] [] ? worker_thread+0x127/0x310 [ 26.840000] [] ? autoremove_wake_function+0x0/0x60 [ 26.844000] [] ? worker_thread+0x0/0x310 [ 26.848000] [] kthread+0x7c/0x90 [ 26.852000] [] ? kthread+0x0/0x90 [ 26.856000] [] kernel_thread_helper+0x7/0x10 [ 26.860000] BUG: using smp_processor_id() in preemptible [00000000] code: events/1/10 [ 26.864000] caller is vmstat_update+0x3c/0x70 Because this commit: a1f84a3: sched: Check for an idle shared cache in select_task_rq_fair() broke ->cpus_allowed. 
Signed-off-by: Mike Galbraith Cc: Peter Zijlstra Cc: arjan@infradead.org Cc: LKML-Reference: <1257415066.12867.1.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index da87385..e4d4483 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1389,6 +1389,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag if (candidate == -1 || candidate == cpu) { for_each_cpu(i, sched_domain_span(tmp)) { + if (!cpumask_test_cpu(i, &p->cpus_allowed)) + continue; if (!cpu_rq(i)->cfs.nr_running) { candidate = i; break; -- cgit v1.1 From a50bde5130f65733142b32975616427d0ea50856 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Nov 2009 15:55:28 +0100 Subject: sched: Cleanup select_task_rq_fair() Clean up the new affine to idle sibling bits while trying to grok them. Should not have any functional differences. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091112145610.832503781@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 73 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 22 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e4d4483..a32df15 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1319,6 +1319,41 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) } /* + * Try and locate an idle CPU in the sched_domain. + */ +static int +select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) +{ + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int i; + + /* + * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE + * test in select_task_rq_fair) and the prev_cpu is idle then that's + * always a better target than the current cpu. 
+ */ + if (target == cpu) { + if (!cpu_rq(prev_cpu)->cfs.nr_running) + target = prev_cpu; + } + + /* + * Otherwise, iterate the domain and find an elegible idle cpu. + */ + if (target == -1 || target == cpu) { + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (!cpu_rq(i)->cfs.nr_running) { + target = i; + break; + } + } + } + + return target; +} + +/* * sched_balance_self: balance the current task (running on cpu) in domains * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and * SD_BALANCE_EXEC. @@ -1373,36 +1408,30 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) { - int candidate = -1, i; + int target = -1; + /* + * If both cpu and prev_cpu are part of this domain, + * cpu is a valid SD_WAKE_AFFINE target. + */ if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) - candidate = cpu; + target = cpu; /* - * Check for an idle shared cache. + * If there's an idle sibling in this domain, make that + * the wake_affine target instead of the current cpu. + * + * XXX: should we possibly do this outside of + * WAKE_AFFINE, in case the shared cache domain is + * smaller than the WAKE_AFFINE domain? 
*/ - if (tmp->flags & SD_PREFER_SIBLING) { - if (candidate == cpu) { - if (!cpu_rq(prev_cpu)->cfs.nr_running) - candidate = prev_cpu; - } - - if (candidate == -1 || candidate == cpu) { - for_each_cpu(i, sched_domain_span(tmp)) { - if (!cpumask_test_cpu(i, &p->cpus_allowed)) - continue; - if (!cpu_rq(i)->cfs.nr_running) { - candidate = i; - break; - } - } - } - } + if (tmp->flags & SD_PREFER_SIBLING) + target = select_idle_sibling(p, tmp, target); - if (candidate >= 0) { + if (target >= 0) { affine_sd = tmp; want_affine = 0; - cpu = candidate; + cpu = target; } } -- cgit v1.1 From fe3bcfe1f6c1fc4ea7706ac2d05e579fd9092682 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Nov 2009 15:55:29 +0100 Subject: sched: More generic WAKE_AFFINE vs select_idle_sibling() Instead of only considering SD_WAKE_AFFINE | SD_PREFER_SIBLING domains also allow all SD_PREFER_SIBLING domains below a SD_WAKE_AFFINE domain to change the affinity target. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091112145610.909723612@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a32df15..f28a267 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1333,20 +1333,16 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) * test in select_task_rq_fair) and the prev_cpu is idle then that's * always a better target than the current cpu. */ - if (target == cpu) { - if (!cpu_rq(prev_cpu)->cfs.nr_running) - target = prev_cpu; - } + if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) + return prev_cpu; /* * Otherwise, iterate the domain and find an elegible idle cpu. 
*/ - if (target == -1 || target == cpu) { - for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { - if (!cpu_rq(i)->cfs.nr_running) { - target = i; - break; - } + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (!cpu_rq(i)->cfs.nr_running) { + target = i; + break; } } @@ -1407,7 +1403,12 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag want_sd = 0; } - if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) { + /* + * While iterating the domains looking for a spanning + * WAKE_AFFINE domain, adjust the affine target to any idle cpu + * in cache sharing domains along the way. + */ + if (want_affine) { int target = -1; /* @@ -1420,17 +1421,15 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag /* * If there's an idle sibling in this domain, make that * the wake_affine target instead of the current cpu. - * - * XXX: should we possibly do this outside of - * WAKE_AFFINE, in case the shared cache domain is - * smaller than the WAKE_AFFINE domain? 
*/ if (tmp->flags & SD_PREFER_SIBLING) target = select_idle_sibling(p, tmp, target); if (target >= 0) { - affine_sd = tmp; - want_affine = 0; + if (tmp->flags & SD_WAKE_AFFINE) { + affine_sd = tmp; + want_affine = 0; + } cpu = target; } } -- cgit v1.1 From 36ace27e3e60d44ea69ce394b2e45386ae98d9d9 Mon Sep 17 00:00:00 2001 From: Tim Blechmann Date: Tue, 24 Nov 2009 11:55:45 +0100 Subject: sched: Optimize branch hint in pick_next_task_fair() Branch hint profiling on my nehalem machine showed 90% incorrect branch hints: 15728471 158903754 90 pick_next_task_fair sched_fair.c 1555 Signed-off-by: Tim Blechmann Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <4B0BBBB1.2050100@klingt.org> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched_fair.c') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f28a267..24086e7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1704,7 +1704,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; - if (unlikely(!cfs_rq->nr_running)) + if (!cfs_rq->nr_running) return NULL; do { -- cgit v1.1