From e2b245f89ee3f5b03fb42d843a79a58cf4773181 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Mon, 1 Aug 2011 11:03:28 +0200 Subject: sched: Remove rq->avg_load_per_task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit a2d47777 ("sched: fix stale value in average load per task") the variable rq->avg_load_per_task is no longer required. Remove it. Signed-off-by: Jan H. Schönherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1312189408-17172-1-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ccacdbd..b1e8f0e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -520,8 +520,6 @@ struct rq { int cpu; int online; - unsigned long avg_load_per_task; - u64 rt_avg; u64 age_stamp; u64 idle_stamp; @@ -1569,11 +1567,9 @@ static unsigned long cpu_avg_load_per_task(int cpu) unsigned long nr_running = ACCESS_ONCE(rq->nr_running); if (nr_running) - rq->avg_load_per_task = rq->load.weight / nr_running; - else - rq->avg_load_per_task = 0; + return rq->load.weight / nr_running; - return rq->avg_load_per_task; + return 0; } #ifdef CONFIG_PREEMPT -- cgit v1.1 From c350a04efd1c89cd256b2abc8f07a21d0d53ff24 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 27 Jul 2011 17:14:55 +0200 Subject: sched: fix broken SCHED_RESET_ON_FORK handling Setting child->prio = current->normal_prio _after_ SCHED_RESET_ON_FORK has been handled for an RT parent gives birth to a deranged mutant child with non-RT policy, but RT prio and sched_class. Move PI leakage protection up, always set priorities and weight, and if the child is leaving RT class, reset rt_priority to the proper value. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1311779695.8691.2.camel@marge.simson.net Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b1e8f0e..cf427bb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2844,19 +2844,23 @@ void sched_fork(struct task_struct *p) p->state = TASK_RUNNING; /* + * Make sure we do not leak PI boosting priority to the child. + */ + p->prio = current->normal_prio; + + /* * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { + if (task_has_rt_policy(p)) { p->policy = SCHED_NORMAL; - p->normal_prio = p->static_prio; - } - - if (PRIO_TO_NICE(p->static_prio) < 0) { p->static_prio = NICE_TO_PRIO(0); - p->normal_prio = p->static_prio; - set_load_weight(p); - } + p->rt_priority = 0; + } else if (PRIO_TO_NICE(p->static_prio) < 0) + p->static_prio = NICE_TO_PRIO(0); + + p->prio = p->normal_prio = __normal_prio(p); + set_load_weight(p); /* * We don't need the reset flag anymore after the fork. It has @@ -2865,11 +2869,6 @@ void sched_fork(struct task_struct *p) p->sched_reset_on_fork = 0; } - /* - * Make sure we do not leak PI boosting priority to the child. 
- */ - p->prio = current->normal_prio; - if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; -- cgit v1.1 From 953bfcd10e6f3697233e8e5128c611d275da39c1 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:27 -0700 Subject: sched: Implement hierarchical task accounting for SCHED_OTHER Introduce hierarchical task accounting for the group scheduling case in CFS, as well as promoting the responsibility for maintaining rq->nr_running to the scheduling classes. The primary motivation for this is that with scheduling classes supporting bandwidth throttling it is possible for entities participating in throttled sub-trees to not have root visible changes in rq->nr_running across activate and de-activate operations. This in turn leads to incorrect idle and weight-per-task load balance decisions. This also allows us to make a small fixlet to the fastpath in pick_next_task() under group scheduling. Note: this issue also exists with the existing sched_rt throttling mechanism. This patch does not address that. Signed-off-by: Paul Turner Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184756.878333391@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index cf427bb..cd1a531 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -311,7 +311,7 @@ struct task_group root_task_group; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned long nr_running; + unsigned long nr_running, h_nr_running; u64 exec_clock; u64 min_vruntime; @@ -1802,7 +1802,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags) rq->nr_uninterruptible--; enqueue_task(rq, p, flags); - inc_nr_running(rq); } /* @@ -1814,7 +1813,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) rq->nr_uninterruptible++; dequeue_task(rq, p, flags); - dec_nr_running(rq); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -4258,7 +4256,7 @@ pick_next_task(struct rq *rq) * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(rq->nr_running == rq->cfs.nr_running)) { + if (likely(rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq); if (likely(p)) return p; -- cgit v1.1 From ab84d31e15502fb626169ba2663381e34bf965b2 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:28 -0700 Subject: sched: Introduce primitives to account for CFS bandwidth tracking In this patch we introduce the notion of CFS bandwidth, partitioned into globally unassigned bandwidth, and locally claimed bandwidth. - The global bandwidth is per task_group, it represents a pool of unclaimed bandwidth that cfs_rqs can allocate from. - The local bandwidth is tracked per-cfs_rq, this represents allotments from the global pool bandwidth assigned to a specific cpu. Bandwidth is managed via cgroupfs, adding two new interfaces to the cpu subsystem: - cpu.cfs_period_us : the bandwidth period in usecs - cpu.cfs_quota_us : the cpu bandwidth (in usecs) that this tg will be allowed to consume over period above. 
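(Illustration only, not part of the patch: a minimal userspace sketch of driving the two new interfaces. The mount point /sys/fs/cgroup/cpu and the group name "a" are assumptions.)

#include <stdio.h>

/* cap group "a" at 50ms of cpu time per 100ms period, i.e. half a cpu */
static int set_cfs_bandwidth_example(void)
{
	FILE *f;

	f = fopen("/sys/fs/cgroup/cpu/a/cpu.cfs_period_us", "w");
	if (!f)
		return -1;
	fprintf(f, "100000\n");		/* period: 100ms */
	fclose(f);

	f = fopen("/sys/fs/cgroup/cpu/a/cpu.cfs_quota_us", "w");
	if (!f)
		return -1;
	fprintf(f, "50000\n");		/* quota: 50ms per period */
	fclose(f);

	return 0;
}

Writing -1 to cpu.cfs_quota_us returns the group to unconstrained (RUNTIME_INF) operation, as implemented by tg_set_cfs_quota() below.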
Signed-off-by: Paul Turner Signed-off-by: Nikhil Rao Signed-off-by: Bharata B Rao Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184756.972636699@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 196 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 192 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index cd1a531..f08cb23 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -247,6 +247,14 @@ struct cfs_rq; static LIST_HEAD(task_groups); +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH + raw_spinlock_t lock; + ktime_t period; + u64 quota; +#endif +}; + /* task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -278,6 +286,8 @@ struct task_group { #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup *autogroup; #endif + + struct cfs_bandwidth cfs_bandwidth; }; /* task_group_lock serializes the addition/removal of task groups */ @@ -377,9 +387,48 @@ struct cfs_rq { unsigned long load_contribution; #endif +#ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; + s64 runtime_remaining; +#endif #endif }; +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_CFS_BANDWIDTH +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ + return &tg->cfs_bandwidth; +} + +static inline u64 default_cfs_period(void); + +static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + raw_spin_lock_init(&cfs_b->lock); + cfs_b->quota = RUNTIME_INF; + cfs_b->period = ns_to_ktime(default_cfs_period()); +} + +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + cfs_rq->runtime_enabled = 0; +} + +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{} +#else +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} + +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ + return NULL; +} +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; @@ -7971,6 +8020,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, /* allow initial update_cfs_load() to truncate */ cfs_rq->load_stamp = 1; #endif + init_cfs_rq_runtime(cfs_rq); tg->cfs_rq[cpu] = cfs_rq; tg->se[cpu] = se; @@ -8110,6 +8160,7 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). 
*/ + init_cfs_bandwidth(&root_task_group.cfs_bandwidth); init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -8351,6 +8402,8 @@ static void free_fair_sched_group(struct task_group *tg) { int i; + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); @@ -8378,6 +8431,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; + init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + for_each_possible_cpu(i) { cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); @@ -8753,7 +8808,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) return walk_tg_tree(tg_schedulable, tg_nop, &data); } -static int tg_set_bandwidth(struct task_group *tg, +static int tg_set_rt_bandwidth(struct task_group *tg, u64 rt_period, u64 rt_runtime) { int i, err = 0; @@ -8792,7 +8847,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) if (rt_runtime_us < 0) rt_runtime = RUNTIME_INF; - return tg_set_bandwidth(tg, rt_period, rt_runtime); + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } long sched_group_rt_runtime(struct task_group *tg) @@ -8817,7 +8872,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) if (rt_period == 0) return -EINVAL; - return tg_set_bandwidth(tg, rt_period, rt_runtime); + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } long sched_group_rt_period(struct task_group *tg) @@ -9007,6 +9062,128 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) return (u64) scale_load_down(tg->shares); } + +#ifdef CONFIG_CFS_BANDWIDTH +const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ +const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ + +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) +{ + int i; + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + static DEFINE_MUTEX(mutex); + + if (tg == &root_task_group) + return -EINVAL; + + /* + * Ensure we have at some amount of bandwidth every period. This is + * to prevent reaching a state of large arrears when throttled via + * entity_tick() resulting in prolonged exit starvation. + */ + if (quota < min_cfs_quota_period || period < min_cfs_quota_period) + return -EINVAL; + + /* + * Likewise, bound things on the otherside by preventing insane quota + * periods. This also allows us to normalize in computing quota + * feasibility. 
+ */ + if (period > max_cfs_quota_period) + return -EINVAL; + + mutex_lock(&mutex); + raw_spin_lock_irq(&cfs_b->lock); + cfs_b->period = ns_to_ktime(period); + cfs_b->quota = quota; + raw_spin_unlock_irq(&cfs_b->lock); + + for_each_possible_cpu(i) { + struct cfs_rq *cfs_rq = tg->cfs_rq[i]; + struct rq *rq = rq_of(cfs_rq); + + raw_spin_lock_irq(&rq->lock); + cfs_rq->runtime_enabled = quota != RUNTIME_INF; + cfs_rq->runtime_remaining = 0; + raw_spin_unlock_irq(&rq->lock); + } + mutex_unlock(&mutex); + + return 0; +} + +int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +{ + u64 quota, period; + + period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); + if (cfs_quota_us < 0) + quota = RUNTIME_INF; + else + quota = (u64)cfs_quota_us * NSEC_PER_USEC; + + return tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_quota(struct task_group *tg) +{ + u64 quota_us; + + if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) + return -1; + + quota_us = tg_cfs_bandwidth(tg)->quota; + do_div(quota_us, NSEC_PER_USEC); + + return quota_us; +} + +int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +{ + u64 quota, period; + + period = (u64)cfs_period_us * NSEC_PER_USEC; + quota = tg_cfs_bandwidth(tg)->quota; + + if (period <= 0) + return -EINVAL; + + return tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_period(struct task_group *tg) +{ + u64 cfs_period_us; + + cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); + do_div(cfs_period_us, NSEC_PER_USEC); + + return cfs_period_us; +} + +static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_quota(cgroup_tg(cgrp)); +} + +static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, + s64 cfs_quota_us) +{ + return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); +} + +static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_period(cgroup_tg(cgrp)); +} + +static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, + u64 cfs_period_us) +{ + return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); +} + +#endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED @@ -9041,6 +9218,18 @@ static struct cftype cpu_files[] = { .write_u64 = cpu_shares_write_u64, }, #endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .name = "cfs_quota_us", + .read_s64 = cpu_cfs_quota_read_s64, + .write_s64 = cpu_cfs_quota_write_s64, + }, + { + .name = "cfs_period_us", + .read_u64 = cpu_cfs_period_read_u64, + .write_u64 = cpu_cfs_period_write_u64, + }, +#endif #ifdef CONFIG_RT_GROUP_SCHED { .name = "rt_runtime_us", @@ -9350,4 +9539,3 @@ struct cgroup_subsys cpuacct_subsys = { .subsys_id = cpuacct_subsys_id, }; #endif /* CONFIG_CGROUP_CPUACCT */ - -- cgit v1.1 From a790de99599a29ad3f18667530cf4b9f4b7e3234 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:29 -0700 Subject: sched: Validate CFS quota hierarchies Add constraints validation for CFS bandwidth hierarchies. Validate that: max(child bandwidth) <= parent_bandwidth In a quota limited hierarchy, an unconstrained entity (e.g. bandwidth==RUNTIME_INF) inherits the bandwidth of its parent. This constraint is chosen over sum(child_bandwidth) as notion of over-commit is valuable within SCHED_OTHER. Some basic code from the RT case is re-factored for reuse. 
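(Illustration of the constraint being added, not kernel code: a toy userspace check mirroring the max(child) <= parent rule and the "unconstrained child inherits the parent" behaviour. The 2^20 fixed-point scale matches the to_ratio() helper reused below; the names here are made up.)

#include <stdint.h>

#define QUOTA_INF	UINT64_MAX	/* stand-in for RUNTIME_INF */

static uint64_t bw_ratio(uint64_t period_us, uint64_t quota_us)
{
	return (quota_us << 20) / period_us;	/* 1 << 20 == one full cpu */
}

/* returns 1 if a child asking for quota/period fits under its parent */
static int child_fits(uint64_t parent_ratio,
		      uint64_t child_period_us, uint64_t child_quota_us)
{
	uint64_t child_ratio;

	if (child_quota_us == QUOTA_INF)
		child_ratio = parent_ratio;	/* inherit when unlimited */
	else
		child_ratio = bw_ratio(child_period_us, child_quota_us);

	return parent_ratio == QUOTA_INF || child_ratio <= parent_ratio;
}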
Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.083774572@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 98 insertions(+), 14 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f08cb23..ea6850d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -252,6 +252,7 @@ struct cfs_bandwidth { raw_spinlock_t lock; ktime_t period; u64 quota; + s64 hierarchal_quota; #endif }; @@ -1518,7 +1519,8 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) +#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ + (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) typedef int (*tg_visitor)(struct task_group *, void *); /* @@ -8708,12 +8710,7 @@ unsigned long sched_group_shares(struct task_group *tg) } #endif -#ifdef CONFIG_RT_GROUP_SCHED -/* - * Ensure that the real time constraints are schedulable. - */ -static DEFINE_MUTEX(rt_constraints_mutex); - +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) @@ -8721,6 +8718,13 @@ static unsigned long to_ratio(u64 period, u64 runtime) return div64_u64(runtime << 20, period); } +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +/* + * Ensure that the real time constraints are schedulable. + */ +static DEFINE_MUTEX(rt_constraints_mutex); /* Must be called with tasklist_lock held */ static inline int tg_has_rt_tasks(struct task_group *tg) @@ -8741,7 +8745,7 @@ struct rt_schedulable_data { u64 rt_runtime; }; -static int tg_schedulable(struct task_group *tg, void *data) +static int tg_rt_schedulable(struct task_group *tg, void *data) { struct rt_schedulable_data *d = data; struct task_group *child; @@ -8805,7 +8809,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) .rt_runtime = runtime, }; - return walk_tg_tree(tg_schedulable, tg_nop, &data); + return walk_tg_tree(tg_rt_schedulable, tg_nop, &data); } static int tg_set_rt_bandwidth(struct task_group *tg, @@ -9064,14 +9068,17 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) } #ifdef CONFIG_CFS_BANDWIDTH +static DEFINE_MUTEX(cfs_constraints_mutex); + const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); + static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) { - int i; + int i, ret = 0; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - static DEFINE_MUTEX(mutex); if (tg == &root_task_group) return -EINVAL; @@ -9092,7 +9099,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) if (period > max_cfs_quota_period) return -EINVAL; - mutex_lock(&mutex); + mutex_lock(&cfs_constraints_mutex); + ret = __cfs_schedulable(tg, period, quota); + if (ret) + goto out_unlock; + raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; @@ -9107,9 +9118,10 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) cfs_rq->runtime_remaining = 0; raw_spin_unlock_irq(&rq->lock); } - mutex_unlock(&mutex); +out_unlock: + mutex_unlock(&cfs_constraints_mutex); - return 0; + return ret; } int 
tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) @@ -9183,6 +9195,78 @@ static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); } +struct cfs_schedulable_data { + struct task_group *tg; + u64 period, quota; +}; + +/* + * normalize group quota/period to be quota/max_period + * note: units are usecs + */ +static u64 normalize_cfs_quota(struct task_group *tg, + struct cfs_schedulable_data *d) +{ + u64 quota, period; + + if (tg == d->tg) { + period = d->period; + quota = d->quota; + } else { + period = tg_get_cfs_period(tg); + quota = tg_get_cfs_quota(tg); + } + + /* note: these should typically be equivalent */ + if (quota == RUNTIME_INF || quota == -1) + return RUNTIME_INF; + + return to_ratio(period, quota); +} + +static int tg_cfs_schedulable_down(struct task_group *tg, void *data) +{ + struct cfs_schedulable_data *d = data; + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + s64 quota = 0, parent_quota = -1; + + if (!tg->parent) { + quota = RUNTIME_INF; + } else { + struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); + + quota = normalize_cfs_quota(tg, d); + parent_quota = parent_b->hierarchal_quota; + + /* + * ensure max(child_quota) <= parent_quota, inherit when no + * limit is set + */ + if (quota == RUNTIME_INF) + quota = parent_quota; + else if (parent_quota != RUNTIME_INF && quota > parent_quota) + return -EINVAL; + } + cfs_b->hierarchal_quota = quota; + + return 0; +} + +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) +{ + struct cfs_schedulable_data data = { + .tg = tg, + .period = period, + .quota = quota, + }; + + if (quota != RUNTIME_INF) { + do_div(data.period, NSEC_PER_USEC); + do_div(data.quota, NSEC_PER_USEC); + } + + return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); +} #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.1 From ec12cb7f31e28854efae7dd6f9544e0a66379040 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:30 -0700 Subject: sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth Account bandwidth usage on the cfs_rq level versus the task_groups to which they belong. Whether we are tracking bandwidth on a given cfs_rq is maintained under cfs_rq->runtime_enabled. cfs_rq's which belong to a bandwidth constrained task_group have their runtime accounted via the update_curr() path, which withdraws bandwidth from the global pool as desired. Updates involving the global pool are currently protected under cfs_bandwidth->lock, local runtime is protected by rq->lock. This patch only assigns and tracks quota, no action is taken in the case that cfs_rq->runtime_used exceeds cfs_rq->runtime_assigned. 
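(A rough sketch of the consumer side of this accounting; the real code is added to kernel/sched_fair.c and is therefore not visible in this kernel/sched.c log. sched_cfs_bandwidth_slice() is assumed here as the name of the per-refill slice.)

static void assign_cfs_rq_runtime_sketch(struct cfs_rq *cfs_rq)
{
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
	u64 amount = 0, min_amount;

	/* top up the local pool to one slice beyond what we still owe */
	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;

	raw_spin_lock(&cfs_b->lock);
	if (cfs_b->quota == RUNTIME_INF) {
		amount = min_amount;
	} else if (cfs_b->runtime > 0) {
		/* withdraw from the global pool under cfs_b->lock */
		amount = min(cfs_b->runtime, min_amount);
		cfs_b->runtime -= amount;
	}
	raw_spin_unlock(&cfs_b->lock);

	/* local runtime is then consumed by update_curr() under rq->lock */
	cfs_rq->runtime_remaining += amount;
}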
Signed-off-by: Paul Turner Signed-off-by: Nikhil Rao Signed-off-by: Bharata B Rao Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.179386821@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ea6850d..35561c6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -251,7 +251,7 @@ struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH raw_spinlock_t lock; ktime_t period; - u64 quota; + u64 quota, runtime; s64 hierarchal_quota; #endif }; @@ -407,6 +407,7 @@ static inline u64 default_cfs_period(void); static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { raw_spin_lock_init(&cfs_b->lock); + cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; cfs_b->period = ns_to_ktime(default_cfs_period()); } @@ -9107,6 +9108,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; + cfs_b->runtime = quota; raw_spin_unlock_irq(&cfs_b->lock); for_each_possible_cpu(i) { -- cgit v1.1 From 58088ad0152ba4b7997388c93d0ca208ec1ece75 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:31 -0700 Subject: sched: Add a timer to handle CFS bandwidth refresh This patch adds a per-task_group timer which handles the refresh of the global CFS bandwidth pool. Since the RT pool is using a similar timer there's some small refactoring to share this support. Signed-off-by: Paul Turner Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.277271273@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 21 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 35561c6..34bf8e6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { - ktime_t now; + unsigned long delta; + ktime_t soft, hard, now; + for (;;) { + if (hrtimer_active(period_timer)) + break; + + now = hrtimer_cb_get_time(period_timer); + hrtimer_forward(period_timer, now, period); + + soft = hrtimer_get_softexpires(period_timer); + hard = hrtimer_get_expires(period_timer); + delta = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(period_timer, soft, delta, + HRTIMER_MODE_ABS_PINNED, 0); + } +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return; @@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) return; raw_spin_lock(&rt_b->rt_runtime_lock); - for (;;) { - unsigned long delta; - ktime_t soft, hard; - - if (hrtimer_active(&rt_b->rt_period_timer)) - break; - - now = hrtimer_cb_get_time(&rt_b->rt_period_timer); - hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); - - soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); - hard = hrtimer_get_expires(&rt_b->rt_period_timer); - delta = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, - HRTIMER_MODE_ABS_PINNED, 0); - } + start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); 
raw_spin_unlock(&rt_b->rt_runtime_lock); } @@ -253,6 +256,9 @@ struct cfs_bandwidth { ktime_t period; u64 quota, runtime; s64 hierarchal_quota; + + int idle, timer_active; + struct hrtimer period_timer; #endif }; @@ -403,6 +409,28 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) } static inline u64 default_cfs_period(void); +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); + +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, cfs_b->period); + + if (!overrun) + break; + + idle = do_sched_cfs_period_timer(cfs_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { @@ -410,6 +438,9 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; cfs_b->period = ns_to_ktime(default_cfs_period()); + + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->period_timer.function = sched_cfs_period_timer; } static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) @@ -417,8 +448,34 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_rq->runtime_enabled = 0; } +/* requires cfs_b->lock, may release to reprogram timer */ +static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + /* + * The timer may be active because we're trying to set a new bandwidth + * period or because we're racing with the tear-down path + * (timer_active==0 becomes visible before the hrtimer call-back + * terminates). 
In either case we ensure that it's re-programmed + */ + while (unlikely(hrtimer_active(&cfs_b->period_timer))) { + raw_spin_unlock(&cfs_b->lock); + /* ensure cfs_b->lock is available while we wait */ + hrtimer_cancel(&cfs_b->period_timer); + + raw_spin_lock(&cfs_b->lock); + /* if someone else restarted the timer then we're done */ + if (cfs_b->timer_active) + return; + } + + cfs_b->timer_active = 1; + start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); +} + static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -{} +{ + hrtimer_cancel(&cfs_b->period_timer); +} #else static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} @@ -9078,7 +9135,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) { - int i, ret = 0; + int i, ret = 0, runtime_enabled; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); if (tg == &root_task_group) @@ -9105,10 +9162,18 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) if (ret) goto out_unlock; + runtime_enabled = quota != RUNTIME_INF; raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; cfs_b->runtime = quota; + + /* restart the period timer (if active) to handle new period expiry */ + if (runtime_enabled && cfs_b->timer_active) { + /* force a reprogram */ + cfs_b->timer_active = 0; + __start_cfs_bandwidth(cfs_b); + } raw_spin_unlock_irq(&cfs_b->lock); for_each_possible_cpu(i) { @@ -9116,7 +9181,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) struct rq *rq = rq_of(cfs_rq); raw_spin_lock_irq(&rq->lock); - cfs_rq->runtime_enabled = quota != RUNTIME_INF; + cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; raw_spin_unlock_irq(&rq->lock); } -- cgit v1.1 From a9cf55b2861057a213e610da2fec52125439a11d Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:32 -0700 Subject: sched: Expire invalid runtime Since quota is managed using a global state but consumed on a per-cpu basis we need to ensure that our per-cpu state is appropriately synchronized. Most importantly, runtime that is state (from a previous period) should not be locally consumable. We take advantage of existing sched_clock synchronization about the jiffy to efficiently detect whether we have (globally) crossed a quota boundary above. One catch is that the direction of spread on sched_clock is undefined, specifically, we don't know whether our local clock is behind or ahead of the one responsible for the current expiration time. Fortunately we can differentiate these by considering whether the global deadline has advanced. If it has not, then we assume our clock to be "fast" and advance our local expiration; otherwise, we know the deadline has truly passed and we expire our local runtime. 
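(Sketch of the expiration check this message describes; the actual implementation lands in kernel/sched_fair.c and is not shown in this log, and the function name is made up for illustration.)

static void expire_cfs_rq_runtime_sketch(struct cfs_rq *cfs_rq)
{
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
	struct rq *rq = rq_of(cfs_rq);

	/* nothing to do while the local deadline has not been reached */
	if (rq->clock < cfs_rq->runtime_expires)
		return;

	if (cfs_rq->runtime_expires == cfs_b->runtime_expires) {
		/* global deadline unchanged: our clock is merely "fast" */
		cfs_rq->runtime_expires += TICK_NSEC;
	} else {
		/* global deadline advanced: local runtime is truly stale */
		cfs_rq->runtime_remaining = 0;
	}
}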
Signed-off-by: Paul Turner Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.379275352@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 34bf8e6..a2d5514 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -256,6 +256,7 @@ struct cfs_bandwidth { ktime_t period; u64 quota, runtime; s64 hierarchal_quota; + u64 runtime_expires; int idle, timer_active; struct hrtimer period_timer; @@ -396,6 +397,7 @@ struct cfs_rq { #endif #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; + u64 runtime_expires; s64 runtime_remaining; #endif #endif @@ -9166,8 +9168,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; - cfs_b->runtime = quota; + __refill_cfs_bandwidth_runtime(cfs_b); /* restart the period timer (if active) to handle new period expiry */ if (runtime_enabled && cfs_b->timer_active) { /* force a reprogram */ -- cgit v1.1 From 85dac906bec3bb41bfaa7ccaa65c4706de5cfdf8 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:33 -0700 Subject: sched: Add support for throttling group entities Now that consumption is tracked (via update_curr()) we add support to throttle group entities (and their corresponding cfs_rqs) in the case where this is no run-time remaining. Throttled entities are dequeued to prevent scheduling, additionally we mark them as throttled (using cfs_rq->throttled) to prevent them from becoming re-enqueued until they are unthrottled. A list of a task_group's throttled entities are maintained on the cfs_bandwidth structure. Note: While the machinery for throttling is added in this patch the act of throttling an entity exceeding its bandwidth is deferred until later within the series. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.480608533@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a2d5514..044260a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -260,6 +260,8 @@ struct cfs_bandwidth { int idle, timer_active; struct hrtimer period_timer; + struct list_head throttled_cfs_rq; + #endif }; @@ -399,6 +401,9 @@ struct cfs_rq { int runtime_enabled; u64 runtime_expires; s64 runtime_remaining; + + int throttled; + struct list_head throttled_list; #endif #endif }; @@ -441,6 +446,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->quota = RUNTIME_INF; cfs_b->period = ns_to_ktime(default_cfs_period()); + INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->period_timer.function = sched_cfs_period_timer; } @@ -448,6 +454,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; + INIT_LIST_HEAD(&cfs_rq->throttled_list); } /* requires cfs_b->lock, may release to reprogram timer */ -- cgit v1.1 From 671fd9dabe5239ad218c7eb48b2b9edee50250e6 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:34 -0700 Subject: sched: Add support for unthrottling group entities At the start of each period we refresh the global bandwidth pool. 
At this time we must also unthrottle any cfs_rq entities who are now within bandwidth once more (as quota permits). Unthrottled entities have their corresponding cfs_rq->throttled flag cleared and their entities re-enqueued. Signed-off-by: Paul Turner Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.574628950@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 044260a..4bbabc2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9192,6 +9192,9 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) raw_spin_lock_irq(&rq->lock); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; + + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); raw_spin_unlock_irq(&rq->lock); } out_unlock: -- cgit v1.1 From 8277434ef1202ce30315f8edb3fc760aa6e74493 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:35 -0700 Subject: sched: Allow for positional tg_tree walks Extend walk_tg_tree to accept a positional argument static int walk_tg_tree_from(struct task_group *from, tg_visitor down, tg_visitor up, void *data) Existing semantics are preserved, caller must hold rcu_lock() or sufficient analogue. Signed-off-by: Paul Turner Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.677889157@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 50 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 13 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 4bbabc2..8ec1e7a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1591,20 +1591,23 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) typedef int (*tg_visitor)(struct task_group *, void *); /* - * Iterate the full tree, calling @down when first entering a node and @up when - * leaving it for the final time. + * Iterate task_group tree rooted at *from, calling @down when first entering a + * node and @up when leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. */ -static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +static int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; int ret; - rcu_read_lock(); - parent = &root_task_group; + parent = from; + down: ret = (*down)(parent, data); if (ret) - goto out_unlock; + goto out; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1613,19 +1616,29 @@ up: continue; } ret = (*up)(parent, data); - if (ret) - goto out_unlock; + if (ret || parent == from) + goto out; child = parent; parent = parent->parent; if (parent) goto up; -out_unlock: - rcu_read_unlock(); - +out: return ret; } +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. 
+ */ + +static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ + return walk_tg_tree_from(&root_task_group, down, up, data); +} + static int tg_nop(struct task_group *tg, void *data) { return 0; @@ -8870,13 +8883,19 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { + int ret; + struct rt_schedulable_data data = { .tg = tg, .rt_period = period, .rt_runtime = runtime, }; - return walk_tg_tree(tg_rt_schedulable, tg_nop, &data); + rcu_read_lock(); + ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); + rcu_read_unlock(); + + return ret; } static int tg_set_rt_bandwidth(struct task_group *tg, @@ -9333,6 +9352,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) { + int ret; struct cfs_schedulable_data data = { .tg = tg, .period = period, @@ -9344,7 +9364,11 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) do_div(data.quota, NSEC_PER_USEC); } - return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); + rcu_read_lock(); + ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); + rcu_read_unlock(); + + return ret; } #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.1 From 64660c864f46202b932b911a69deb09805bdbaf8 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:36 -0700 Subject: sched: Prevent interactions with throttled entities From the perspective of load-balance and shares distribution, throttled entities should be invisible. However, both of these operations work on 'active' lists and are not inherently aware of what group hierarchies may be present. In some cases this may be side-stepped (e.g. we could sideload via tg_load_down in load balance) while in others (e.g. update_shares()) it is more difficult to compute without incurring some O(n^2) costs. Instead, track hierarchicaal throttled state at time of transition. This allows us to easily identify whether an entity belongs to a throttled hierarchy and avoid incorrect interactions with it. Also, when an entity leaves a throttled hierarchy we need to advance its time averaging for shares averaging so that the elapsed throttled time is not considered as part of the cfs_rq's operation. We also use this information to prevent buddy interactions in the wakeup and yield_to() paths. Signed-off-by: Paul Turner Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.777916795@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8ec1e7a..5db05f6f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -402,7 +402,7 @@ struct cfs_rq { u64 runtime_expires; s64 runtime_remaining; - int throttled; + int throttled, throttle_count; struct list_head throttled_list; #endif #endif -- cgit v1.1 From 8cb120d3e41a0464a559d639d519cef563717a4e Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:38 -0700 Subject: sched: Migrate throttled tasks on HOTPLUG Throttled tasks are invisisble to cpu-offline since they are not eligible for selection by pick_next_task(). The regular 'escape' path for a thread that is blocked at offline is via ttwu->select_task_rq, however this will not handle a throttled group since there are no individual thread wakeups on an unthrottle. 
Resolve this by unthrottling offline cpus so that threads can be migrated. Signed-off-by: Paul Turner Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184757.989000590@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5db05f6f..3973172 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6335,6 +6335,30 @@ static void calc_global_load_remove(struct rq *rq) rq->calc_load_active = 0; } +#ifdef CONFIG_CFS_BANDWIDTH +static void unthrottle_offline_cfs_rqs(struct rq *rq) +{ + struct cfs_rq *cfs_rq; + + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + + if (!cfs_rq->runtime_enabled) + continue; + + /* + * clock_task is not advancing so we just need to make sure + * there's some valid quota amount + */ + cfs_rq->runtime_remaining = cfs_b->quota; + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); + } +} +#else +static void unthrottle_offline_cfs_rqs(struct rq *rq) {} +#endif + /* * Migrate all tasks from the rq, sleeping tasks will be migrated by * try_to_wake_up()->select_task_rq(). @@ -6360,6 +6384,9 @@ static void migrate_tasks(unsigned int dead_cpu) */ rq->stop = NULL; + /* Ensure any throttled groups are reachable by pick_next_task */ + unthrottle_offline_cfs_rqs(rq); + for ( ; ; ) { /* * There's this thread running, bail when that's the only -- cgit v1.1 From e8da1b18b32064c43881bceef0f051c2110c9ab9 Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Thu, 21 Jul 2011 09:43:40 -0700 Subject: sched: Add exports tracking cfs bandwidth control statistics This change introduces statistics exports for the cpu sub-system, these are added through the use of a stat file similar to that exported by other subsystems. 
The following exports are included: nr_periods: number of periods in which execution occurred nr_throttled: the number of periods above in which execution was throttle throttled_time: cumulative wall-time that any cpus have been throttled for this group Signed-off-by: Paul Turner Signed-off-by: Nikhil Rao Signed-off-by: Bharata B Rao Reviewed-by: Hidetoshi Seto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184758.198901931@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3973172..35c9185 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -262,6 +262,9 @@ struct cfs_bandwidth { struct hrtimer period_timer; struct list_head throttled_cfs_rq; + /* statistics */ + int nr_periods, nr_throttled; + u64 throttled_time; #endif }; @@ -402,6 +405,7 @@ struct cfs_rq { u64 runtime_expires; s64 runtime_remaining; + u64 throttled_timestamp; int throttled, throttle_count; struct list_head throttled_list; #endif @@ -9397,6 +9401,19 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } + +static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct task_group *tg = cgroup_tg(cgrp); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + + cb->fill(cb, "nr_periods", cfs_b->nr_periods); + cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); + cb->fill(cb, "throttled_time", cfs_b->throttled_time); + + return 0; +} #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -9443,6 +9460,10 @@ static struct cftype cpu_files[] = { .read_u64 = cpu_cfs_period_read_u64, .write_u64 = cpu_cfs_period_write_u64, }, + { + .name = "stat", + .read_map = cpu_stats_show, + }, #endif #ifdef CONFIG_RT_GROUP_SCHED { -- cgit v1.1 From d8b4986d3dbc4fabc2054d63f1d31d6ed2fb1ca8 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 21 Jul 2011 09:43:41 -0700 Subject: sched: Return unused runtime on group dequeue When a local cfs_rq blocks we return the majority of its remaining quota to the global bandwidth pool for use by other runqueues. We do this only when the quota is current and there is more than min_cfs_rq_quota [1ms by default] of runtime remaining on the rq. In the case where there are throttled runqueues and we have sufficient bandwidth to meter out a slice, a second timer is kicked off to handle this delivery, unthrottling where appropriate. Using a 'worst case' antagonist which executes on each cpu for 1ms before moving onto the next on a fairly large machine: no quota generations: 197.47 ms /cgroup/a/cpuacct.usage 199.46 ms /cgroup/a/cpuacct.usage 205.46 ms /cgroup/a/cpuacct.usage 198.46 ms /cgroup/a/cpuacct.usage 208.39 ms /cgroup/a/cpuacct.usage Since we are allowed to use "stale" quota our usage is effectively bounded by the rate of input into the global pool and performance is relatively stable. with quota generations [1s increments]: 119.58 ms /cgroup/a/cpuacct.usage 119.65 ms /cgroup/a/cpuacct.usage 119.64 ms /cgroup/a/cpuacct.usage 119.63 ms /cgroup/a/cpuacct.usage 119.60 ms /cgroup/a/cpuacct.usage The large deficit here is due to quota generations (/intentionally/) preventing us from now using previously stranded slack quota. The cost is that this quota becomes unavailable. 
with quota generations and quota return: 200.09 ms /cgroup/a/cpuacct.usage 200.09 ms /cgroup/a/cpuacct.usage 198.09 ms /cgroup/a/cpuacct.usage 200.09 ms /cgroup/a/cpuacct.usage 200.06 ms /cgroup/a/cpuacct.usage By returning unused quota we're able to both stably consume our desired quota and prevent unintentional overages due to the abuse of slack quota from previous quota periods (especially on a large machine). Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110721184758.306848658@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 35c9185..6baade0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -259,7 +259,7 @@ struct cfs_bandwidth { u64 runtime_expires; int idle, timer_active; - struct hrtimer period_timer; + struct hrtimer period_timer, slack_timer; struct list_head throttled_cfs_rq; /* statistics */ @@ -421,6 +421,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) static inline u64 default_cfs_period(void); static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); +static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); + +static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, slack_timer); + do_sched_cfs_slack_timer(cfs_b); + + return HRTIMER_NORESTART; +} static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { @@ -453,6 +463,8 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->period_timer.function = sched_cfs_period_timer; + hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->slack_timer.function = sched_cfs_slack_timer; } static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) @@ -488,6 +500,7 @@ static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { hrtimer_cancel(&cfs_b->period_timer); + hrtimer_cancel(&cfs_b->slack_timer); } #else static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} -- cgit v1.1 From fa14ff4accfb24e59d2473f3d864d6648d80563b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 Sep 2011 13:06:17 +0200 Subject: sched: Convert to struct llist Use the generic llist primitives. We had a private lockless list implementation in the scheduler in the wake-list code, now that we have a generic llist implementation that provides all required operations, switch to it. This patch is not expected to change any behavior. 
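(Minimal illustration, not part of the patch, of the llist pattern the wake-list code switches to: producers add nodes locklessly and the consumer grabs the whole list atomically. The structure and names below are made up for the example.)

#include <linux/kernel.h>
#include <linux/llist.h>

struct wake_item {
	struct llist_node node;
	int payload;
};

static LLIST_HEAD(example_list);

static void example_producer(struct wake_item *item)
{
	/* llist_add() returns true if the list was empty beforehand */
	if (llist_add(&item->node, &example_list))
		pr_debug("list was empty, kick the consumer\n");
}

static void example_consumer(void)
{
	struct llist_node *n = llist_del_all(&example_list);
	struct wake_item *item;

	while (n) {
		item = llist_entry(n, struct wake_item, node);
		n = llist_next(n);
		/* process item->payload */
	}
}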
Signed-off-by: Peter Zijlstra Cc: Huang Ying Cc: Andrew Morton Link: http://lkml.kernel.org/r/1315836353.26517.42.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 48 ++++++++++-------------------------------------- 1 file changed, 10 insertions(+), 38 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index c5cf15e..1874c74 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -702,7 +702,7 @@ struct rq { #endif #ifdef CONFIG_SMP - struct task_struct *wake_list; + struct llist_head wake_list; #endif }; @@ -2698,42 +2698,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -static void sched_ttwu_do_pending(struct task_struct *list) +static void sched_ttwu_pending(void) { struct rq *rq = this_rq(); + struct llist_node *llist = llist_del_all(&rq->wake_list); + struct task_struct *p; raw_spin_lock(&rq->lock); - while (list) { - struct task_struct *p = list; - list = list->wake_entry; + while (llist) { + p = llist_entry(llist, struct task_struct, wake_entry); + llist = llist_next(llist); ttwu_do_activate(rq, p, 0); } raw_spin_unlock(&rq->lock); } -#ifdef CONFIG_HOTPLUG_CPU - -static void sched_ttwu_pending(void) -{ - struct rq *rq = this_rq(); - struct task_struct *list = xchg(&rq->wake_list, NULL); - - if (!list) - return; - - sched_ttwu_do_pending(list); -} - -#endif /* CONFIG_HOTPLUG_CPU */ - void scheduler_ipi(void) { - struct rq *rq = this_rq(); - struct task_struct *list = xchg(&rq->wake_list, NULL); - - if (!list) + if (llist_empty(&this_rq()->wake_list)) return; /* @@ -2750,25 +2734,13 @@ void scheduler_ipi(void) * somewhat pessimize the simple resched case. */ irq_enter(); - sched_ttwu_do_pending(list); + sched_ttwu_pending(); irq_exit(); } static void ttwu_queue_remote(struct task_struct *p, int cpu) { - struct rq *rq = cpu_rq(cpu); - struct task_struct *next = rq->wake_list; - - for (;;) { - struct task_struct *old = next; - - p->wake_entry = next; - next = cmpxchg(&rq->wake_list, old, p); - if (next == old) - break; - } - - if (!next) + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) smp_send_reschedule(cpu); } -- cgit v1.1 From 908a3283728d92df36e0c7cd63304fd35e93a8a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2011 15:32:06 +0200 Subject: sched: Fix idle_cpu() On -rt we observed hackbench waking all 400 tasks to a single cpu. This is because of select_idle_sibling()'s interaction with the new ipi based wakeup scheme. The existing idle_cpu() test only checks to see if the current task on that cpu is the idle task, it does not take already queued tasks into account, nor does it take queued to be woken tasks into account. If the remote wakeup IPIs come hard enough, there won't be time to schedule away from the idle task, and would thus keep thinking the cpu was in fact idle, regardless of the fact that there were already several hundred tasks runnable. We couldn't reproduce on mainline, but there's no reason it couldn't happen. 
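(Hypothetical illustration of the kind of caller this protects, loosely modelled on select_idle_sibling(); the function and flow are simplified and not taken from the patch.)

static int pick_wake_cpu_sketch(int prev_cpu, int sibling)
{
	/*
	 * With the stronger idle_cpu() check, a cpu that merely runs the
	 * idle task but already has tasks queued (or wake_list entries
	 * pending) is no longer considered idle, so wakeups stop piling
	 * onto it.
	 */
	if (idle_cpu(sibling))
		return sibling;

	return prev_cpu;
}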
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-3o30p18b2paswpc9ohy2gltp@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 1874c74..4cdc91c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5138,7 +5138,20 @@ EXPORT_SYMBOL(task_nice); */ int idle_cpu(int cpu) { - return cpu_curr(cpu) == cpu_rq(cpu)->idle; + struct rq *rq = cpu_rq(cpu); + + if (rq->curr != rq->idle) + return 0; + + if (rq->nr_running) + return 0; + +#ifdef CONFIG_SMP + if (!llist_empty(&rq->wake_list)) + return 0; +#endif + + return 1; } /** -- cgit v1.1 From ca38062e57e97791c2f62e3dbd06caf3ebb5721c Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 3 Oct 2011 15:09:00 -0700 Subject: sched: Use resched IPI to kick off the nohz idle balance Current use of smp call function to kick the nohz idle balance can deadlock in this scenario. 1. cpu-A did a generic_exec_single() to cpu-B and after queuing its call single data (csd) to the call single queue, cpu-A took a timer interrupt. Actual IPI to cpu-B to process the call single queue is not yet sent. 2. As part of the timer interrupt handler, cpu-A decided to kick cpu-B for the idle load balancing (sets cpu-B's rq->nohz_balance_kick to 1) and __smp_call_function_single() with nowait will queue the csd to the cpu-B's queue. But the generic_exec_single() won't send an IPI to cpu-B as the call single queue was not empty. 3. cpu-A is busy with lot of interrupts 4. Meanwhile cpu-B is entering and exiting idle and noticed that it has it's rq->nohz_balance_kick set to '1'. So it will go ahead and do the idle load balancer and clear its rq->nohz_balance_kick. 5. At this point, csd queued as part of the step-2 above is still locked and waiting to be serviced on cpu-B. 6. cpu-A is still busy with interrupt load and now it got another timer interrupt and as part of it decided to kick cpu-B for another idle load balancing (as it finds cpu-B's rq->nohz_balance_kick cleared in step-4 above) and does __smp_call_function_single() with the same csd that is still locked. 7. And we get a deadlock waiting for the csd_lock() in the __smp_call_function_single(). Main issue here is that cpu-B can service the idle load balancer kick request from cpu-A even with out receiving the IPI and this lead to doing multiple __smp_call_function_single() on the same csd leading to deadlock. To kick a cpu, scheduler already has the reschedule vector reserved. Use that mechanism (kick_process()) instead of using the generic smp call function mechanism to kick off the nohz idle load balancing and avoid the deadlock. [ This issue is present from 2.6.35+ kernels, but marking it -stable only from v3.0+ as the proposed fix depends on the scheduler_ipi() that is introduced recently. 
] Reported-by: Prarit Bhargava Signed-off-by: Suresh Siddha Cc: stable@kernel.org # v3.0+ Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111003220934.834943260@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 4cdc91c..9e49af0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1404,6 +1404,18 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } +static inline bool got_nohz_idle_kick(void) +{ + return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; +} + +#else /* CONFIG_NO_HZ */ + +static inline bool got_nohz_idle_kick(void) +{ + return false; +} + #endif /* CONFIG_NO_HZ */ static u64 sched_avg_period(void) @@ -2717,7 +2729,7 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { - if (llist_empty(&this_rq()->wake_list)) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) return; /* @@ -2735,6 +2747,12 @@ void scheduler_ipi(void) */ irq_enter(); sched_ttwu_pending(); + + /* + * Check if someone kicked us for doing the nohz idle load balance. + */ + if (unlikely(got_nohz_idle_kick() && !need_resched())) + raise_softirq_irqoff(SCHED_SOFTIRQ); irq_exit(); } @@ -8288,7 +8306,6 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ rq->nohz_balance_kick = 0; - init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); #endif #endif init_rq_hrtick(rq); -- cgit v1.1 From 6eb57e0d65ebd99a71d435dc96d83e725752eef8 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 3 Oct 2011 15:09:01 -0700 Subject: sched: Request for idle balance during nohz idle load balance rq's idle_at_tick is set to idle/busy during the timer tick depending on the cpu was idle or not. This will be used later in the load balance that will be done in the softirq context (which is a process context in -RT kernels). For nohz kernels, for the cpu doing nohz idle load balance on behalf of all the idle cpu's, its rq->idle_at_tick might have a stale value (which is recorded when it got the timer tick presumably when it is busy). As the nohz idle load balancing is also being done at the same place as the regular load balancing, nohz idle load balancing was bailing out when it sees rq's idle_at_tick not set. Thus leading to poor system utilization. Rename rq's idle_at_tick to idle_balance and set it when someone requests for nohz idle balance on an idle cpu. Reported-by: Srivatsa Vaddagiri Signed-off-by: Suresh Siddha Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111003220934.892350549@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 9e49af0..7bc9b0e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -644,7 +644,7 @@ struct rq { unsigned long cpu_power; - unsigned char idle_at_tick; + unsigned char idle_balance; /* For active balancing */ int post_schedule; int active_balance; @@ -2751,8 +2751,10 @@ void scheduler_ipi(void) /* * Check if someone kicked us for doing the nohz idle load balance. 
*/ - if (unlikely(got_nohz_idle_kick() && !need_resched())) + if (unlikely(got_nohz_idle_kick() && !need_resched())) { + this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); + } irq_exit(); } @@ -4247,7 +4249,7 @@ void scheduler_tick(void) perf_event_task_tick(); #ifdef CONFIG_SMP - rq->idle_at_tick = idle_cpu(cpu); + rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif } -- cgit v1.1 From fa17b507f142d37aeac322a95f6f7c6375f25601 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 2011 12:23:22 +0200 Subject: sched: Wrap scheduler p->cpus_allowed access This task is preparatory for the migrate_disable() implementation, but stands on its own and provides a cleanup. It currently only converts those sites required for task-placement. Kosaki-san once mentioned replacing cpus_allowed with a proper cpumask_t instead of the NR_CPUS sized array it currently is, that would also require something like this. Signed-off-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: KOSAKI Motohiro Link: http://lkml.kernel.org/n/tip-e42skvaddos99psip0vce41o@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7bc9b0e..45174ca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2544,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* Look for allowed, online CPU in same node. */ for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); + dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); if (dest_cpu < nr_cpu_ids) return dest_cpu; @@ -2585,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) * [ this allows ->select_task() to simply return task_cpu(p) and * not worry about this generic constraint ] */ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || + if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || !cpu_online(cpu))) cpu = select_fallback_rq(task_cpu(p), p); @@ -6262,7 +6262,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (task_cpu(p) != src_cpu) goto done; /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) goto fail; /* -- cgit v1.1 From 4939602a2441306008c6dca38216b741d4e09a42 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 25 Jun 2011 15:45:46 +0200 Subject: sched: Unify the ->cpus_allowed mask copy Currently every sched_class::set_cpus_allowed() implementation has to copy the cpumask into task_struct::cpus_allowed, this is pointless, put this copy in the generic code. 
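(Illustration only: with the copy centralized in do_set_cpus_allowed(), a sched_class hook merely maintains its class-private state. The class name and helper below are hypothetical.)

static void example_set_cpus_allowed(struct task_struct *p,
				     const struct cpumask *new_mask)
{
	int weight = cpumask_weight(new_mask);

	/*
	 * Only class-private bookkeeping here; the generic code copies
	 * new_mask into p->cpus_allowed and updates rt.nr_cpus_allowed
	 * right after calling this hook.
	 */
	example_update_migratable(p, weight);	/* hypothetical helper */
}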
Signed-off-by: Peter Zijlstra Acked-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-jhl5s9fckd9ptw1fzbqqlrd3@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 45174ca..ce9a9e7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6161,10 +6161,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { if (p->sched_class && p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); - else { - cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); - } + + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); } /* -- cgit v1.1 From 510f5acc4f4fb07f3f075900dc468d6e380beff6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 20:47:54 +0200 Subject: sched: Don't use tasklist_lock for debug prints Avoid taking locks from debug prints, this avoids latencies on -rt, and improves reliability of the debug code. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ce9a9e7..24637c7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6021,7 +6021,7 @@ void show_state_filter(unsigned long state_filter) printk(KERN_INFO " task PC stack pid father\n"); #endif - read_lock(&tasklist_lock); + rcu_read_lock(); do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow @@ -6037,7 +6037,7 @@ void show_state_filter(unsigned long state_filter) #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif - read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * Only show locks if all tasks are dumped: */ -- cgit v1.1
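(Closing illustration of the pattern the last hunk adopts: walking the task list under rcu_read_lock() rather than tasklist_lock, so a debug dump cannot add lock-induced latencies. The filter below is simplified.)

static void show_running_tasks_sketch(void)
{
	struct task_struct *g, *p;

	rcu_read_lock();
	do_each_thread(g, p) {
		if (p->state == TASK_RUNNING)
			printk(KERN_INFO "%-15.15s %5d\n", p->comm, p->pid);
	} while_each_thread(g, p);
	rcu_read_unlock();
}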