diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2010-11-15 15:47:00 -0800 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2010-11-18 13:27:46 +0100 |
commit | 2069dd75c7d0f49355939e5586daf5a9ab216db7 (patch) | |
tree | c221747420e47b194a2a634024438a55420224d5 /kernel/sched_fair.c | |
parent | 48c5ccae88dcd989d9de507e8510313c6cbd352b (diff) | |
download | op-kernel-dev-2069dd75c7d0f49355939e5586daf5a9ab216db7.zip op-kernel-dev-2069dd75c7d0f49355939e5586daf5a9ab216db7.tar.gz |
sched: Rewrite tg_shares_up)
By tracking a per-cpu load-avg for each cfs_rq and folding it into a
global task_group load on each tick we can rework tg_shares_up to be
strictly per-cpu.
This should improve cpu-cgroup performance for smp systems
significantly.
[ Paul: changed to use queueing cfs_rq + bug fixes ]
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234937.580480400@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r-- | kernel/sched_fair.c | 164 |
1 files changed, 107 insertions, 57 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f4f6a83..d86544b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write, WRT_SYSCTL(sched_min_granularity); WRT_SYSCTL(sched_latency); WRT_SYSCTL(sched_wakeup_granularity); - WRT_SYSCTL(sched_shares_ratelimit); #undef WRT_SYSCTL return 0; @@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_add(&se->group_node, &cfs_rq->tasks); } cfs_rq->nr_running++; - se->on_rq = 1; } static void @@ -647,9 +645,89 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_del_init(&se->group_node); } cfs_rq->nr_running--; - se->on_rq = 0; } +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void update_cfs_load(struct cfs_rq *cfs_rq) +{ + u64 period = sched_avg_period(); + u64 now, delta; + + if (!cfs_rq) + return; + + now = rq_of(cfs_rq)->clock; + delta = now - cfs_rq->load_stamp; + + cfs_rq->load_stamp = now; + cfs_rq->load_period += delta; + cfs_rq->load_avg += delta * cfs_rq->load.weight; + + while (cfs_rq->load_period > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (cfs_rq->load_period)); + cfs_rq->load_period /= 2; + cfs_rq->load_avg /= 2; + } +} + +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->on_rq) + account_entity_dequeue(cfs_rq, se); + + update_load_set(&se->load, weight); + + if (se->on_rq) + account_entity_enqueue(cfs_rq, se); +} + +static void update_cfs_shares(struct cfs_rq *cfs_rq) +{ + struct task_group *tg; + struct sched_entity *se; + long load_weight, load, shares; + + if (!cfs_rq) + return; + + tg = cfs_rq->tg; + se = tg->se[cpu_of(rq_of(cfs_rq))]; + if (!se) + return; + + load = cfs_rq->load.weight; + + load_weight = atomic_read(&tg->load_weight); + load_weight -= cfs_rq->load_contribution; + load_weight += load; + + shares = (tg->shares * load); + if (load_weight) + shares /= load_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + if (shares > tg->shares) + shares = tg->shares; + + reweight_entity(cfs_rq_of(se), se, shares); +} +#else /* CONFIG_FAIR_GROUP_SCHED */ +static inline void update_cfs_load(struct cfs_rq *cfs_rq) +{ +} + +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS @@ -771,7 +849,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_cfs_load(cfs_rq); account_entity_enqueue(cfs_rq, se); + update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -782,6 +862,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); + se->on_rq = 1; } static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -825,8 +906,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); + se->on_rq = 0; + update_cfs_load(cfs_rq); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); + update_cfs_shares(cfs_rq); /* * Normalize the entity after updating the min_vruntime because the @@ -1055,6 +1139,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) flags = ENQUEUE_WAKEUP; } + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq); + update_cfs_shares(cfs_rq); + } + hrtick_update(rq); } @@ -1071,12 +1162,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; flags |= DEQUEUE_SLEEP; } + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq); + update_cfs_shares(cfs_rq); + } + hrtick_update(rq); } @@ -1143,51 +1242,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p) * Adding load to a group doesn't make a group heavier, but can cause movement * of group shares between cpus. Assuming the shares were perfectly aligned one * can calculate the shift in shares. - * - * The problem is that perfectly aligning the shares is rather expensive, hence - * we try to avoid doing that too often - see update_shares(), which ratelimits - * this change. - * - * We compensate this by not only taking the current delta into account, but - * also considering the delta between when the shares were last adjusted and - * now. - * - * We still saw a performance dip, some tracing learned us that between - * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased - * significantly. Therefore try to bias the error in direction of failing - * the affine wakeup. - * */ -static long effective_load(struct task_group *tg, int cpu, - long wl, long wg) +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; if (!tg->parent) return wl; - /* - * By not taking the decrease of shares on the other cpu into - * account our error leans towards reducing the affine wakeups. - */ - if (!wl && sched_feat(ASYM_EFF_LOAD)) - return wl; - for_each_sched_entity(se) { long S, rw, s, a, b; - long more_w; - - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; S = se->my_q->tg->shares; - s = se->my_q->shares; - rw = se->my_q->rq_weight; + s = se->load.weight; + rw = se->my_q->load.weight; a = S*(rw + wl); b = S*rw + s*wg; @@ -1508,23 +1576,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ sd = tmp; } -#ifdef CONFIG_FAIR_GROUP_SCHED - if (sched_feat(LB_SHARES_UPDATE)) { - /* - * Pick the largest domain to update shares over - */ - tmp = sd; - if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) - tmp = affine_sd; - - if (tmp) { - raw_spin_unlock(&rq->lock); - update_shares(tmp); - raw_spin_lock(&rq->lock); - } - } -#endif - if (affine_sd) { if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) return select_idle_sibling(p, cpu); @@ -3014,7 +3065,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: - update_shares(sd); group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, cpus, balance); @@ -3156,8 +3206,6 @@ out_one_pinned: else ld_moved = 0; out: - if (ld_moved) - update_shares(sd); return ld_moved; } @@ -3549,6 +3597,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) int update_next_balance = 0; int need_serialize; + update_shares(cpu); + for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; |