diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2008-02-25 17:34:02 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-03-04 17:54:06 +0100 |
commit | 62fb185130e4d420f71a30ff59d8b16b74ef5d2b (patch) | |
tree | 474c0824a5bf90950b0a430a11a52b358c9e1f31 /kernel/sched.c | |
parent | 976dde010e513a9c7c3117a32b7b015f84b37430 (diff) | |
download | op-kernel-dev-62fb185130e4d420f71a30ff59d8b16b74ef5d2b.zip op-kernel-dev-62fb185130e4d420f71a30ff59d8b16b74ef5d2b.tar.gz |
sched: revert load_balance_monitor() changes
The following commits cause a number of regressions:
commit 58e2d4ca581167c2a079f4ee02be2f0bc52e8729
Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Date: Fri Jan 25 21:08:00 2008 +0100
sched: group scheduling, change how cpu load is calculated
commit 6b2d7700266b9402e12824e11e0099ae6a4a6a79
Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Date: Fri Jan 25 21:08:00 2008 +0100
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
Namely:
- very frequent wakeups on SMP, reported by PowerTop users.
- cacheline thrashing on (large) SMP
- some latencies larger than 500ms
While there is a mergeable patch to fix the latency problem, the wakeup and
cacheline issues are not fixable in a manner suitable for .25 (we're at -rc3 now).
Hence we revert them and try again in v2.6.26.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Tested-by: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 283 |
1 files changed, 35 insertions, 248 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index f06950c..dcd553c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -174,41 +174,6 @@ struct task_group { struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; - - /* - * shares assigned to a task group governs how much of cpu bandwidth - * is allocated to the group. The more shares a group has, the more is - * the cpu bandwidth allocated to it. - * - * For ex, lets say that there are three task groups, A, B and C which - * have been assigned shares 1000, 2000 and 3000 respectively. Then, - * cpu bandwidth allocated by the scheduler to task groups A, B and C - * should be: - * - * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% - * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% - * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% - * - * The weight assigned to a task group's schedulable entities on every - * cpu (task_group.se[a_cpu]->load.weight) is derived from the task - * group's shares. For ex: lets say that task group A has been - * assigned shares of 1000 and there are two CPUs in a system. Then, - * - * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; - * - * Note: It's not necessary that each of a task's group schedulable - * entity have the same weight on all CPUs. If the group - * has 2 of its tasks on CPU0 and 1 task on CPU1, then a - * better distribution of weight could be: - * - * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 - * tg_A->se[1]->load.weight = 1/2 * 2000 = 667 - * - * rebalance_shares() is responsible for distributing the shares of a - * task groups like this among the group's schedulable entities across - * cpus. 
- * - */ unsigned long shares; #endif @@ -250,22 +215,12 @@ static DEFINE_SPINLOCK(task_group_lock); static DEFINE_MUTEX(doms_cur_mutex); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP -/* kernel thread that runs rebalance_shares() periodically */ -static struct task_struct *lb_monitor_task; -static int load_balance_monitor(void *unused); -#endif - -static void set_se_shares(struct sched_entity *se, unsigned long shares); - #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) #else # define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif -#define MIN_GROUP_SHARES 2 - static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif @@ -1245,16 +1200,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif -static inline void inc_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_add(&rq->load, load); -} - -static inline void dec_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_sub(&rq->load, load); -} - #ifdef CONFIG_SMP static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); @@ -1272,14 +1217,26 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #define sched_class_highest (&rt_sched_class) -static void inc_nr_running(struct rq *rq) +static inline void inc_load(struct rq *rq, const struct task_struct *p) +{ + update_load_add(&rq->load, p->se.load.weight); +} + +static inline void dec_load(struct rq *rq, const struct task_struct *p) +{ + update_load_sub(&rq->load, p->se.load.weight); +} + +static void inc_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running++; + inc_load(rq, p); } -static void dec_nr_running(struct rq *rq) +static void dec_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running--; + dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -1371,7 +1328,7 @@ static void activate_task(struct rq *rq, struct 
task_struct *p, int wakeup) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup); - inc_nr_running(rq); + inc_nr_running(p, rq); } /* @@ -1383,7 +1340,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); - dec_nr_running(rq); + dec_nr_running(p, rq); } /** @@ -2023,7 +1980,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); - inc_nr_running(rq); + inc_nr_running(p, rq); } check_preempt_curr(rq, p); #ifdef CONFIG_SMP @@ -4362,8 +4319,10 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) + if (on_rq) { dequeue_task(rq, p, 0); + dec_load(rq, p); + } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4373,6 +4332,7 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0); + inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -7087,21 +7047,6 @@ void __init sched_init_smp(void) if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); sched_init_granularity(); - -#ifdef CONFIG_FAIR_GROUP_SCHED - if (nr_cpu_ids == 1) - return; - - lb_monitor_task = kthread_create(load_balance_monitor, NULL, - "group_balance"); - if (!IS_ERR(lb_monitor_task)) { - lb_monitor_task->flags |= PF_NOFREEZE; - wake_up_process(lb_monitor_task); - } else { - printk(KERN_ERR "Could not create load balance monitor thread" - "(error = %ld) \n", PTR_ERR(lb_monitor_task)); - } -#endif } #else void __init sched_init_smp(void) @@ -7424,157 +7369,6 @@ void set_curr_task(int cpu, struct task_struct *p) #ifdef CONFIG_GROUP_SCHED -#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP -/* - * distribute shares of all task groups among their schedulable entities, - * to reflect load distribution across cpus. 
- */ -static int rebalance_shares(struct sched_domain *sd, int this_cpu) -{ - struct cfs_rq *cfs_rq; - struct rq *rq = cpu_rq(this_cpu); - cpumask_t sdspan = sd->span; - int balanced = 1; - - /* Walk thr' all the task groups that we have */ - for_each_leaf_cfs_rq(rq, cfs_rq) { - int i; - unsigned long total_load = 0, total_shares; - struct task_group *tg = cfs_rq->tg; - - /* Gather total task load of this group across cpus */ - for_each_cpu_mask(i, sdspan) - total_load += tg->cfs_rq[i]->load.weight; - - /* Nothing to do if this group has no load */ - if (!total_load) - continue; - - /* - * tg->shares represents the number of cpu shares the task group - * is eligible to hold on a single cpu. On N cpus, it is - * eligible to hold (N * tg->shares) number of cpu shares. - */ - total_shares = tg->shares * cpus_weight(sdspan); - - /* - * redistribute total_shares across cpus as per the task load - * distribution. - */ - for_each_cpu_mask(i, sdspan) { - unsigned long local_load, local_shares; - - local_load = tg->cfs_rq[i]->load.weight; - local_shares = (local_load * total_shares) / total_load; - if (!local_shares) - local_shares = MIN_GROUP_SHARES; - if (local_shares == tg->se[i]->load.weight) - continue; - - spin_lock_irq(&cpu_rq(i)->lock); - set_se_shares(tg->se[i], local_shares); - spin_unlock_irq(&cpu_rq(i)->lock); - balanced = 0; - } - } - - return balanced; -} - -/* - * How frequently should we rebalance_shares() across cpus? - * - * The more frequently we rebalance shares, the more accurate is the fairness - * of cpu bandwidth distribution between task groups. However higher frequency - * also implies increased scheduling overhead. - * - * sysctl_sched_min_bal_int_shares represents the minimum interval between - * consecutive calls to rebalance_shares() in the same sched domain. - * - * sysctl_sched_max_bal_int_shares represents the maximum interval between - * consecutive calls to rebalance_shares() in the same sched domain. 
- * - * These settings allows for the appropriate trade-off between accuracy of - * fairness and the associated overhead. - * - */ - -/* default: 8ms, units: milliseconds */ -const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; - -/* default: 128ms, units: milliseconds */ -const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; - -/* kernel thread that runs rebalance_shares() periodically */ -static int load_balance_monitor(void *unused) -{ - unsigned int timeout = sysctl_sched_min_bal_int_shares; - struct sched_param schedparm; - int ret; - - /* - * We don't want this thread's execution to be limited by the shares - * assigned to default group (init_task_group). Hence make it run - * as a SCHED_RR RT task at the lowest priority. - */ - schedparm.sched_priority = 1; - ret = sched_setscheduler(current, SCHED_RR, &schedparm); - if (ret) - printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" - " monitor thread (error = %d) \n", ret); - - while (!kthread_should_stop()) { - int i, cpu, balanced = 1; - - /* Prevent cpus going down or coming up */ - get_online_cpus(); - /* lockout changes to doms_cur[] array */ - lock_doms_cur(); - /* - * Enter a rcu read-side critical section to safely walk rq->sd - * chain on various cpus and to walk task group list - * (rq->leaf_cfs_rq_list) in rebalance_shares(). - */ - rcu_read_lock(); - - for (i = 0; i < ndoms_cur; i++) { - cpumask_t cpumap = doms_cur[i]; - struct sched_domain *sd = NULL, *sd_prev = NULL; - - cpu = first_cpu(cpumap); - - /* Find the highest domain at which to balance shares */ - for_each_domain(cpu, sd) { - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - sd_prev = sd; - } - - sd = sd_prev; - /* sd == NULL? 
No load balance reqd in this domain */ - if (!sd) - continue; - - balanced &= rebalance_shares(sd, cpu); - } - - rcu_read_unlock(); - - unlock_doms_cur(); - put_online_cpus(); - - if (!balanced) - timeout = sysctl_sched_min_bal_int_shares; - else if (timeout < sysctl_sched_max_bal_int_shares) - timeout *= 2; - - msleep_interruptible(timeout); - } - - return 0; -} -#endif /* CONFIG_SMP */ - #ifdef CONFIG_FAIR_GROUP_SCHED static void free_fair_sched_group(struct task_group *tg) { @@ -7841,29 +7635,25 @@ void sched_move_task(struct task_struct *tsk) } #ifdef CONFIG_FAIR_GROUP_SCHED -/* rq->lock to be locked by caller */ static void set_se_shares(struct sched_entity *se, unsigned long shares) { struct cfs_rq *cfs_rq = se->cfs_rq; struct rq *rq = cfs_rq->rq; int on_rq; - if (!shares) - shares = MIN_GROUP_SHARES; + spin_lock_irq(&rq->lock); on_rq = se->on_rq; - if (on_rq) { + if (on_rq) dequeue_entity(cfs_rq, se, 0); - dec_cpu_load(rq, se->load.weight); - } se->load.weight = shares; se->load.inv_weight = div64_64((1ULL<<32), shares); - if (on_rq) { + if (on_rq) enqueue_entity(cfs_rq, se, 0); - inc_cpu_load(rq, se->load.weight); - } + + spin_unlock_irq(&rq->lock); } static DEFINE_MUTEX(shares_mutex); @@ -7873,18 +7663,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) int i; unsigned long flags; + /* + * A weight of 0 or 1 can cause arithmetics problems. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ + if (shares < 2) + shares = 2; + mutex_lock(&shares_mutex); if (tg->shares == shares) goto done; - if (shares < MIN_GROUP_SHARES) - shares = MIN_GROUP_SHARES; - - /* - * Prevent any load balance activity (rebalance_shares, - * load_balance_fair) from referring to this group first, - * by taking it off the rq->leaf_cfs_rq_list on each cpu. 
- */ spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) unregister_fair_sched_group(tg, i); @@ -7898,11 +7688,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * w/o tripping rebalance_share or load_balance_fair. */ tg->shares = shares; - for_each_possible_cpu(i) { - spin_lock_irq(&cpu_rq(i)->lock); + for_each_possible_cpu(i) set_se_shares(tg->se[i], shares); - spin_unlock_irq(&cpu_rq(i)->lock); - } /* * Enable load balance activity on this group, by inserting it back on |