summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/sched/topology.h8
-rw-r--r--kernel/sched/fair.c126
-rw-r--r--kernel/sched/features.h1
3 files changed, 16 insertions, 119 deletions
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index d7b6dab..7d065ab 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -71,14 +71,6 @@ struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
-
- /*
- * Some variables from the most recent sd_lb_stats for this domain,
- * used by wake_affine().
- */
- unsigned long nr_running;
- unsigned long load;
- unsigned long capacity;
};
struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e..28cabed 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,115 +5356,36 @@ static int wake_wide(struct task_struct *p)
return 1;
}
-struct llc_stats {
- unsigned long nr_running;
- unsigned long load;
- unsigned long capacity;
- int has_capacity;
-};
-
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
-{
- struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
- if (!sds)
- return false;
-
- stats->nr_running = READ_ONCE(sds->nr_running);
- stats->load = READ_ONCE(sds->load);
- stats->capacity = READ_ONCE(sds->capacity);
- stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
-
- return true;
-}
-
/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
*
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ * will be) idle.
*/
+
static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int prev_cpu, int sync)
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
{
- struct llc_stats prev_stats, this_stats;
- s64 this_eff_load, prev_eff_load;
- unsigned long task_load;
-
- if (!get_llc_stats(&prev_stats, prev_cpu) ||
- !get_llc_stats(&this_stats, this_cpu))
- return false;
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current LLC.
- */
- if (sync) {
- unsigned long current_load = task_h_load(current);
-
- /* in this case load hits 0 and this LLC is considered 'idle' */
- if (current_load > this_stats.load)
- return true;
-
- this_stats.load -= current_load;
- }
-
- /*
- * The has_capacity stuff is not SMT aware, but by trying to balance
- * the nr_running on both ends we try and fill the domain at equal
- * rates, thereby first consuming cores before siblings.
- */
-
- /* if the old cache has capacity, stay there */
- if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
- return false;
-
- /* if this cache has capacity, come here */
- if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
+ if (idle_cpu(this_cpu))
return true;
- /*
- * Check to see if we can move the load without causing too much
- * imbalance.
- */
- task_load = task_h_load(p);
-
- this_eff_load = 100;
- this_eff_load *= prev_stats.capacity;
-
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= this_stats.capacity;
-
- this_eff_load *= this_stats.load + task_load;
- prev_eff_load *= prev_stats.load - task_load;
+ if (sync && cpu_rq(this_cpu)->nr_running == 1)
+ return true;
- return this_eff_load <= prev_eff_load;
+ return false;
}
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
- bool affine;
-
- /*
- * Default to no affine wakeups; wake_affine() should not effect a task
- * placement the load-balancer feels inclined to undo. The conservative
- * option is therefore to not move tasks when they wake up.
- */
- affine = false;
+ bool affine = false;
- /*
- * If the wakeup is across cache domains, try to evaluate if movement
- * makes sense, otherwise rely on select_idle_siblings() to do
- * placement inside the cache domain.
- */
- if (!cpus_share_cache(prev_cpu, this_cpu))
- affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+ if (sched_feat(WA_IDLE) && !affine)
+ affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (affine) {
@@ -7600,7 +7521,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- struct sched_domain_shared *shared = env->sd->shared;
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7592,6 @@ next_group:
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
}
-
- if (!shared)
- return;
-
- /*
- * Since these are sums over groups they can contain some CPUs
- * multiple times for the NUMA domains.
- *
- * Currently only wake_affine_llc() and find_busiest_group()
- * uses these numbers, only the last is affected by this problem.
- *
- * XXX fix that.
- */
- WRITE_ONCE(shared->nr_running, sds->total_running);
- WRITE_ONCE(shared->load, sds->total_load);
- WRITE_ONCE(shared->capacity, sds->total_capacity);
}
/**
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb155..0a519f8 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
+SCHED_FEAT(WA_IDLE, true)
OpenPOWER on IntegriCloud