summaryrefslogtreecommitdiffstats
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-07-03 13:08:04 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-07-03 13:08:04 -0700
commit9bd42183b951051f73de121f7ee17091e7d26fbb (patch)
treec85c680126a0548a3c5f083e35f5b1cadce636f6 /kernel/sched/core.c
parent7447d56217e215e50317f308aee1ed293ac4f749 (diff)
parent72298e5c92c50edd8cb7cfda4519483ce65fa166 (diff)
downloadop-kernel-dev-9bd42183b951051f73de121f7ee17091e7d26fbb.zip
op-kernel-dev-9bd42183b951051f73de121f7ee17091e7d26fbb.tar.gz
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "The main changes in this cycle were: - Add the SYSTEM_SCHEDULING bootup state to move various scheduler debug checks earlier into the bootup. This turns silent and sporadically deadly bugs into nice, deterministic splats. Fix some of the splats that triggered. (Thomas Gleixner) - A round of restructuring and refactoring of the load-balancing and topology code (Peter Zijlstra) - Another round of consolidating ~20 of incremental scheduler code history: this time in terms of wait-queue nomenclature. (I didn't get much feedback on these renaming patches, and we can still easily change any names I might have misplaced, so if anyone hates a new name, please holler and I'll fix it.) (Ingo Molnar) - sched/numa improvements, fixes and updates (Rik van Riel) - Another round of x86/tsc scheduler clock code improvements, in hope of making it more robust (Peter Zijlstra) - Improve NOHZ behavior (Frederic Weisbecker) - Deadline scheduler improvements and fixes (Luca Abeni, Daniel Bristot de Oliveira) - Simplify and optimize the topology setup code (Lauro Ramos Venancio) - Debloat and decouple scheduler code some more (Nicolas Pitre) - Simplify code by making better use of llist primitives (Byungchul Park) - ... plus other fixes and improvements" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (103 commits) sched/cputime: Refactor the cputime_adjust() code sched/debug: Expose the number of RT/DL tasks that can migrate sched/numa: Hide numa_wake_affine() from UP build sched/fair: Remove effective_load() sched/numa: Implement NUMA node level wake_affine() sched/fair: Simplify wake_affine() for the single socket case sched/numa: Override part of migrate_degrades_locality() when idle balancing sched/rt: Move RT related code from sched/core.c to sched/rt.c sched/deadline: Move DL related code from sched/core.c to sched/deadline.c sched/cpuset: Only offer CONFIG_CPUSETS if SMP is enabled sched/fair: Spare idle load balancing on nohz_full CPUs nohz: Move idle balancer registration to the idle path sched/loadavg: Generalize "_idle" naming to "_nohz" sched/core: Drop the unused try_get_task_struct() helper function sched/fair: WARN() and refuse to set buddy when !se->on_rq sched/debug: Fix SCHED_WARN_ON() to return a value on !CONFIG_SCHED_DEBUG as well sched/wait: Disambiguate wq_entry->task_list and wq_head->task_list naming sched/wait: Move bit_wait_table[] and related functionality from sched/core.c to sched/wait_bit.c sched/wait: Split out the wait_bit*() APIs from <linux/wait.h> into <linux/wait_bit.h> sched/wait: Re-adjust macro line continuation backslashes in <linux/wait.h> ...
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c772
1 files changed, 55 insertions, 717 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5b60f3a..17c667b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10,6 +10,7 @@
#include <uapi/linux/sched/types.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/hotplug.h>
+#include <linux/wait_bit.h>
#include <linux/cpuset.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
@@ -788,36 +789,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-void sched_set_stop_task(int cpu, struct task_struct *stop)
-{
- struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
- struct task_struct *old_stop = cpu_rq(cpu)->stop;
-
- if (stop) {
- /*
- * Make it appear like a SCHED_FIFO task, its something
- * userspace knows about and won't get confused about.
- *
- * Also, it will make PI more or less work without too
- * much confusion -- but then, stop work should not
- * rely on PI working anyway.
- */
- sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
-
- stop->sched_class = &stop_sched_class;
- }
-
- cpu_rq(cpu)->stop = stop;
-
- if (old_stop) {
- /*
- * Reset it back to a normal scheduling class so that
- * it can die in pieces.
- */
- old_stop->sched_class = &rt_sched_class;
- }
-}
-
/*
* __normal_prio - return the priority that is based on the static prio
*/
@@ -1588,6 +1559,36 @@ static void update_avg(u64 *avg, u64 sample)
*avg += diff >> 3;
}
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+ if (stop) {
+ /*
+ * Make it appear like a SCHED_FIFO task, its something
+ * userspace knows about and won't get confused about.
+ *
+ * Also, it will make PI more or less work without too
+ * much confusion -- but then, stop work should not
+ * rely on PI working anyway.
+ */
+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+ stop->sched_class = &stop_sched_class;
+ }
+
+ cpu_rq(cpu)->stop = stop;
+
+ if (old_stop) {
+ /*
+ * Reset it back to a normal scheduling class so that
+ * it can die in pieces.
+ */
+ old_stop->sched_class = &rt_sched_class;
+ }
+}
+
#else
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
@@ -1731,7 +1732,7 @@ void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
- struct task_struct *p;
+ struct task_struct *p, *t;
struct rq_flags rf;
if (!llist)
@@ -1740,17 +1741,8 @@ void sched_ttwu_pending(void)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- while (llist) {
- int wake_flags = 0;
-
- p = llist_entry(llist, struct task_struct, wake_entry);
- llist = llist_next(llist);
-
- if (p->sched_remote_wakeup)
- wake_flags = WF_MIGRATED;
-
- ttwu_do_activate(rq, p, wake_flags, &rf);
- }
+ llist_for_each_entry_safe(p, t, llist, wake_entry)
+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
rq_unlock_irqrestore(rq, &rf);
}
@@ -2148,23 +2140,6 @@ int wake_up_state(struct task_struct *p, unsigned int state)
}
/*
- * This function clears the sched_dl_entity static params.
- */
-void __dl_clear_params(struct task_struct *p)
-{
- struct sched_dl_entity *dl_se = &p->dl;
-
- dl_se->dl_runtime = 0;
- dl_se->dl_deadline = 0;
- dl_se->dl_period = 0;
- dl_se->flags = 0;
- dl_se->dl_bw = 0;
-
- dl_se->dl_throttled = 0;
- dl_se->dl_yielded = 0;
-}
-
-/*
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
*
@@ -2193,6 +2168,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
RB_CLEAR_NODE(&p->dl.rb_node);
init_dl_task_timer(&p->dl);
+ init_dl_inactive_task_timer(&p->dl);
__dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
@@ -2430,7 +2406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
- return 1ULL << 20;
+ return BW_UNIT;
/*
* Doing this here saves a lot of checks in all
@@ -2440,93 +2416,9 @@ unsigned long to_ratio(u64 period, u64 runtime)
if (period == 0)
return 0;
- return div64_u64(runtime << 20, period);
+ return div64_u64(runtime << BW_SHIFT, period);
}
-#ifdef CONFIG_SMP
-inline struct dl_bw *dl_bw_of(int i)
-{
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- "sched RCU must be held");
- return &cpu_rq(i)->rd->dl_bw;
-}
-
-static inline int dl_bw_cpus(int i)
-{
- struct root_domain *rd = cpu_rq(i)->rd;
- int cpus = 0;
-
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- "sched RCU must be held");
- for_each_cpu_and(i, rd->span, cpu_active_mask)
- cpus++;
-
- return cpus;
-}
-#else
-inline struct dl_bw *dl_bw_of(int i)
-{
- return &cpu_rq(i)->dl.dl_bw;
-}
-
-static inline int dl_bw_cpus(int i)
-{
- return 1;
-}
-#endif
-
-/*
- * We must be sure that accepting a new task (or allowing changing the
- * parameters of an existing one) is consistent with the bandwidth
- * constraints. If yes, this function also accordingly updates the currently
- * allocated bandwidth to reflect the new situation.
- *
- * This function is called while holding p's rq->lock.
- *
- * XXX we should delay bw change until the task's 0-lag point, see
- * __setparam_dl().
- */
-static int dl_overflow(struct task_struct *p, int policy,
- const struct sched_attr *attr)
-{
-
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
- u64 period = attr->sched_period ?: attr->sched_deadline;
- u64 runtime = attr->sched_runtime;
- u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
- int cpus, err = -1;
-
- /* !deadline task may carry old deadline bandwidth */
- if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
- return 0;
-
- /*
- * Either if a task, enters, leave, or stays -deadline but changes
- * its parameters, we may need to update accordingly the total
- * allocated bandwidth of the container.
- */
- raw_spin_lock(&dl_b->lock);
- cpus = dl_bw_cpus(task_cpu(p));
- if (dl_policy(policy) && !task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, 0, new_bw)) {
- __dl_add(dl_b, new_bw);
- err = 0;
- } else if (dl_policy(policy) && task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
- __dl_clear(dl_b, p->dl.dl_bw);
- __dl_add(dl_b, new_bw);
- err = 0;
- } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
- __dl_clear(dl_b, p->dl.dl_bw);
- err = 0;
- }
- raw_spin_unlock(&dl_b->lock);
-
- return err;
-}
-
-extern void init_dl_bw(struct dl_bw *dl_b);
-
/*
* wake_up_new_task - wake up a newly created task for the first time.
*
@@ -3687,7 +3579,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
exception_exit(prev_state);
}
-int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
return try_to_wake_up(curr->private, mode, wake_flags);
@@ -4009,46 +3901,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
}
/*
- * This function initializes the sched_dl_entity of a newly becoming
- * SCHED_DEADLINE task.
- *
- * Only the static values are considered here, the actual runtime and the
- * absolute deadline will be properly calculated when the task is enqueued
- * for the first time with its new policy.
- */
-static void
-__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
-{
- struct sched_dl_entity *dl_se = &p->dl;
-
- dl_se->dl_runtime = attr->sched_runtime;
- dl_se->dl_deadline = attr->sched_deadline;
- dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
- dl_se->flags = attr->sched_flags;
- dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
-
- /*
- * Changing the parameters of a task is 'tricky' and we're not doing
- * the correct thing -- also see task_dead_dl() and switched_from_dl().
- *
- * What we SHOULD do is delay the bandwidth release until the 0-lag
- * point. This would include retaining the task_struct until that time
- * and change dl_overflow() to not immediately decrement the current
- * amount.
- *
- * Instead we retain the current runtime/deadline and let the new
- * parameters take effect after the current reservation period lapses.
- * This is safe (albeit pessimistic) because the 0-lag point is always
- * before the current scheduling deadline.
- *
- * We can still have temporary overloads because we do not delay the
- * change in bandwidth until that time; so admission control is
- * not on the safe side. It does however guarantee tasks will never
- * consume more than promised.
- */
-}
-
-/*
* sched_setparam() passes in -1 for its policy, to let the functions
* it calls know not to change it.
*/
@@ -4101,59 +3953,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
p->sched_class = &fair_sched_class;
}
-static void
-__getparam_dl(struct task_struct *p, struct sched_attr *attr)
-{
- struct sched_dl_entity *dl_se = &p->dl;
-
- attr->sched_priority = p->rt_priority;
- attr->sched_runtime = dl_se->dl_runtime;
- attr->sched_deadline = dl_se->dl_deadline;
- attr->sched_period = dl_se->dl_period;
- attr->sched_flags = dl_se->flags;
-}
-
-/*
- * This function validates the new parameters of a -deadline task.
- * We ask for the deadline not being zero, and greater or equal
- * than the runtime, as well as the period of being zero or
- * greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution of 1us (we
- * check sched_runtime only since it is always the smaller one) and
- * below 2^63 ns (we have to check both sched_deadline and
- * sched_period, as the latter can be zero).
- */
-static bool
-__checkparam_dl(const struct sched_attr *attr)
-{
- /* deadline != 0 */
- if (attr->sched_deadline == 0)
- return false;
-
- /*
- * Since we truncate DL_SCALE bits, make sure we're at least
- * that big.
- */
- if (attr->sched_runtime < (1ULL << DL_SCALE))
- return false;
-
- /*
- * Since we use the MSB for wrap-around and sign issues, make
- * sure it's not set (mind that period can be equal to zero).
- */
- if (attr->sched_deadline & (1ULL << 63) ||
- attr->sched_period & (1ULL << 63))
- return false;
-
- /* runtime <= deadline <= period (if period != 0) */
- if ((attr->sched_period != 0 &&
- attr->sched_period < attr->sched_deadline) ||
- attr->sched_deadline < attr->sched_runtime)
- return false;
-
- return true;
-}
-
/*
* Check the target process has a UID that matches the current process's:
*/
@@ -4170,19 +3969,6 @@ static bool check_same_owner(struct task_struct *p)
return match;
}
-static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
-{
- struct sched_dl_entity *dl_se = &p->dl;
-
- if (dl_se->dl_runtime != attr->sched_runtime ||
- dl_se->dl_deadline != attr->sched_deadline ||
- dl_se->dl_period != attr->sched_period ||
- dl_se->flags != attr->sched_flags)
- return true;
-
- return false;
-}
-
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
@@ -4197,8 +3983,8 @@ static int __sched_setscheduler(struct task_struct *p,
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
- /* May grab non-irq protected spin_locks: */
- BUG_ON(in_interrupt());
+ /* The pi code expects interrupts enabled */
+ BUG_ON(pi && in_interrupt());
recheck:
/* Double check policy once rq lock held: */
if (policy < 0) {
@@ -4211,7 +3997,8 @@ recheck:
return -EINVAL;
}
- if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
+ if (attr->sched_flags &
+ ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
return -EINVAL;
/*
@@ -4362,7 +4149,7 @@ change:
* of a SCHED_DEADLINE task) we need to check if enough bandwidth
* is available.
*/
- if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
+ if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
task_rq_unlock(rq, p, &rf);
return -EBUSY;
}
@@ -5463,26 +5250,17 @@ void init_idle(struct task_struct *idle, int cpu)
#endif
}
+#ifdef CONFIG_SMP
+
int cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
- int ret = 1, trial_cpus;
- struct dl_bw *cur_dl_b;
- unsigned long flags;
+ int ret = 1;
if (!cpumask_weight(cur))
return ret;
- rcu_read_lock_sched();
- cur_dl_b = dl_bw_of(cpumask_any(cur));
- trial_cpus = cpumask_weight(trial);
-
- raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
- if (cur_dl_b->bw != -1 &&
- cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
- ret = 0;
- raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
- rcu_read_unlock_sched();
+ ret = dl_cpuset_cpumask_can_shrink(cur, trial);
return ret;
}
@@ -5506,43 +5284,14 @@ int task_can_attach(struct task_struct *p,
goto out;
}
-#ifdef CONFIG_SMP
if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
- cs_cpus_allowed)) {
- unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
- cs_cpus_allowed);
- struct dl_bw *dl_b;
- bool overflow;
- int cpus;
- unsigned long flags;
-
- rcu_read_lock_sched();
- dl_b = dl_bw_of(dest_cpu);
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(dest_cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
- if (overflow)
- ret = -EBUSY;
- else {
- /*
- * We reserve space for this task in the destination
- * root_domain, as we can't fail after this point.
- * We will free resources in the source root_domain
- * later on (see set_cpus_allowed_dl()).
- */
- __dl_add(dl_b, p->dl.dl_bw);
- }
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
- rcu_read_unlock_sched();
+ cs_cpus_allowed))
+ ret = dl_task_can_attach(p, cs_cpus_allowed);
- }
-#endif
out:
return ret;
}
-#ifdef CONFIG_SMP
-
bool sched_smp_initialized __read_mostly;
#ifdef CONFIG_NUMA_BALANCING
@@ -5805,23 +5554,8 @@ static void cpuset_cpu_active(void)
static int cpuset_cpu_inactive(unsigned int cpu)
{
- unsigned long flags;
- struct dl_bw *dl_b;
- bool overflow;
- int cpus;
-
if (!cpuhp_tasks_frozen) {
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
-
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, 0);
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
- rcu_read_unlock_sched();
-
- if (overflow)
+ if (dl_cpu_busy(cpu))
return -EBUSY;
cpuset_update_active_cpus();
} else {
@@ -5952,7 +5686,6 @@ void __init sched_init_smp(void)
cpumask_var_t non_isolated_cpus;
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
- alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
sched_init_numa();
@@ -5962,7 +5695,7 @@ void __init sched_init_smp(void)
* happen.
*/
mutex_lock(&sched_domains_mutex);
- init_sched_domains(cpu_active_mask);
+ sched_init_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -5978,7 +5711,6 @@ void __init sched_init_smp(void)
init_sched_dl_class();
sched_init_smt();
- sched_clock_init_late();
sched_smp_initialized = true;
}
@@ -5994,7 +5726,6 @@ early_initcall(migration_init);
void __init sched_init_smp(void)
{
sched_init_granularity();
- sched_clock_init_late();
}
#endif /* CONFIG_SMP */
@@ -6020,28 +5751,13 @@ static struct kmem_cache *task_group_cache __read_mostly;
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
-#define WAIT_TABLE_BITS 8
-#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
-static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
-
-wait_queue_head_t *bit_waitqueue(void *word, int bit)
-{
- const int shift = BITS_PER_LONG == 32 ? 5 : 6;
- unsigned long val = (unsigned long)word << shift | bit;
-
- return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
-}
-EXPORT_SYMBOL(bit_waitqueue);
-
void __init sched_init(void)
{
int i, j;
unsigned long alloc_size = 0, ptr;
sched_clock_init();
-
- for (i = 0; i < WAIT_TABLE_SIZE; i++)
- init_waitqueue_head(bit_wait_table + i);
+ wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
@@ -6193,7 +5909,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
- zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
@@ -6245,8 +5960,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
!is_idle_task(current)) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
+ oops_in_progress)
return;
+
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
@@ -6501,385 +6218,6 @@ void sched_move_task(struct task_struct *tsk)
task_rq_unlock(rq, tsk, &rf);
}
-#endif /* CONFIG_CGROUP_SCHED */
-
-#ifdef CONFIG_RT_GROUP_SCHED
-/*
- * Ensure that the real time constraints are schedulable.
- */
-static DEFINE_MUTEX(rt_constraints_mutex);
-
-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
-{
- struct task_struct *g, *p;
-
- /*
- * Autogroups do not have RT tasks; see autogroup_create().
- */
- if (task_group_is_autogroup(tg))
- return 0;
-
- for_each_process_thread(g, p) {
- if (rt_task(p) && task_group(p) == tg)
- return 1;
- }
-
- return 0;
-}
-
-struct rt_schedulable_data {
- struct task_group *tg;
- u64 rt_period;
- u64 rt_runtime;
-};
-
-static int tg_rt_schedulable(struct task_group *tg, void *data)
-{
- struct rt_schedulable_data *d = data;
- struct task_group *child;
- unsigned long total, sum = 0;
- u64 period, runtime;
-
- period = ktime_to_ns(tg->rt_bandwidth.rt_period);
- runtime = tg->rt_bandwidth.rt_runtime;
-
- if (tg == d->tg) {
- period = d->rt_period;
- runtime = d->rt_runtime;
- }
-
- /*
- * Cannot have more runtime than the period.
- */
- if (runtime > period && runtime != RUNTIME_INF)
- return -EINVAL;
-
- /*
- * Ensure we don't starve existing RT tasks.
- */
- if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
- return -EBUSY;
-
- total = to_ratio(period, runtime);
-
- /*
- * Nobody can have more than the global setting allows.
- */
- if (total > to_ratio(global_rt_period(), global_rt_runtime()))
- return -EINVAL;
-
- /*
- * The sum of our children's runtime should not exceed our own.
- */
- list_for_each_entry_rcu(child, &tg->children, siblings) {
- period = ktime_to_ns(child->rt_bandwidth.rt_period);
- runtime = child->rt_bandwidth.rt_runtime;
-
- if (child == d->tg) {
- period = d->rt_period;
- runtime = d->rt_runtime;
- }
-
- sum += to_ratio(period, runtime);
- }
-
- if (sum > total)
- return -EINVAL;
-
- return 0;
-}
-
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
- int ret;
-
- struct rt_schedulable_data data = {
- .tg = tg,
- .rt_period = period,
- .rt_runtime = runtime,
- };
-
- rcu_read_lock();
- ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
- rcu_read_unlock();
-
- return ret;
-}
-
-static int tg_set_rt_bandwidth(struct task_group *tg,
- u64 rt_period, u64 rt_runtime)
-{
- int i, err = 0;
-
- /*
- * Disallowing the root group RT runtime is BAD, it would disallow the
- * kernel creating (and or operating) RT threads.
- */
- if (tg == &root_task_group && rt_runtime == 0)
- return -EINVAL;
-
- /* No period doesn't make any sense. */
- if (rt_period == 0)
- return -EINVAL;
-
- mutex_lock(&rt_constraints_mutex);
- read_lock(&tasklist_lock);
- err = __rt_schedulable(tg, rt_period, rt_runtime);
- if (err)
- goto unlock;
-
- raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
- tg->rt_bandwidth.rt_runtime = rt_runtime;
-
- for_each_possible_cpu(i) {
- struct rt_rq *rt_rq = tg->rt_rq[i];
-
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_runtime = rt_runtime;
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
- raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
-unlock:
- read_unlock(&tasklist_lock);
- mutex_unlock(&rt_constraints_mutex);
-
- return err;
-}
-
-static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
-{
- u64 rt_runtime, rt_period;
-
- rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
- rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
- if (rt_runtime_us < 0)
- rt_runtime = RUNTIME_INF;
-
- return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
-}
-
-static long sched_group_rt_runtime(struct task_group *tg)
-{
- u64 rt_runtime_us;
-
- if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
- return -1;
-
- rt_runtime_us = tg->rt_bandwidth.rt_runtime;
- do_div(rt_runtime_us, NSEC_PER_USEC);
- return rt_runtime_us;
-}
-
-static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
-{
- u64 rt_runtime, rt_period;
-
- rt_period = rt_period_us * NSEC_PER_USEC;
- rt_runtime = tg->rt_bandwidth.rt_runtime;
-
- return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
-}
-
-static long sched_group_rt_period(struct task_group *tg)
-{
- u64 rt_period_us;
-
- rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
- do_div(rt_period_us, NSEC_PER_USEC);
- return rt_period_us;
-}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-#ifdef CONFIG_RT_GROUP_SCHED
-static int sched_rt_global_constraints(void)
-{
- int ret = 0;
-
- mutex_lock(&rt_constraints_mutex);
- read_lock(&tasklist_lock);
- ret = __rt_schedulable(NULL, 0, 0);
- read_unlock(&tasklist_lock);
- mutex_unlock(&rt_constraints_mutex);
-
- return ret;
-}
-
-static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
-{
- /* Don't accept realtime tasks when there is no way for them to run */
- if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
- return 0;
-
- return 1;
-}
-
-#else /* !CONFIG_RT_GROUP_SCHED */
-static int sched_rt_global_constraints(void)
-{
- unsigned long flags;
- int i;
-
- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
- for_each_possible_cpu(i) {
- struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_runtime = global_rt_runtime();
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
-
- return 0;
-}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-static int sched_dl_global_validate(void)
-{
- u64 runtime = global_rt_runtime();
- u64 period = global_rt_period();
- u64 new_bw = to_ratio(period, runtime);
- struct dl_bw *dl_b;
- int cpu, ret = 0;
- unsigned long flags;
-
- /*
- * Here we want to check the bandwidth not being set to some
- * value smaller than the currently allocated bandwidth in
- * any of the root_domains.
- *
- * FIXME: Cycling on all the CPUs is overdoing, but simpler than
- * cycling on root_domains... Discussion on different/better
- * solutions is welcome!
- */
- for_each_possible_cpu(cpu) {
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
-
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- if (new_bw < dl_b->total_bw)
- ret = -EBUSY;
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
- rcu_read_unlock_sched();
-
- if (ret)
- break;
- }
-
- return ret;
-}
-
-static void sched_dl_do_global(void)
-{
- u64 new_bw = -1;
- struct dl_bw *dl_b;
- int cpu;
- unsigned long flags;
-
- def_dl_bandwidth.dl_period = global_rt_period();
- def_dl_bandwidth.dl_runtime = global_rt_runtime();
-
- if (global_rt_runtime() != RUNTIME_INF)
- new_bw = to_ratio(global_rt_period(), global_rt_runtime());
-
- /*
- * FIXME: As above...
- */
- for_each_possible_cpu(cpu) {
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
-
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- dl_b->bw = new_bw;
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
- rcu_read_unlock_sched();
- }
-}
-
-static int sched_rt_global_validate(void)
-{
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
- if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
- (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
- return -EINVAL;
-
- return 0;
-}
-
-static void sched_rt_do_global(void)
-{
- def_rt_bandwidth.rt_runtime = global_rt_runtime();
- def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
-}
-
-int sched_rt_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- int old_period, old_runtime;
- static DEFINE_MUTEX(mutex);
- int ret;
-
- mutex_lock(&mutex);
- old_period = sysctl_sched_rt_period;
- old_runtime = sysctl_sched_rt_runtime;
-
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
-
- if (!ret && write) {
- ret = sched_rt_global_validate();
- if (ret)
- goto undo;
-
- ret = sched_dl_global_validate();
- if (ret)
- goto undo;
-
- ret = sched_rt_global_constraints();
- if (ret)
- goto undo;
-
- sched_rt_do_global();
- sched_dl_do_global();
- }
- if (0) {
-undo:
- sysctl_sched_rt_period = old_period;
- sysctl_sched_rt_runtime = old_runtime;
- }
- mutex_unlock(&mutex);
-
- return ret;
-}
-
-int sched_rr_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- int ret;
- static DEFINE_MUTEX(mutex);
-
- mutex_lock(&mutex);
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
- /*
- * Make sure that internally we keep jiffies.
- * Also, writing zero resets the timeslice to default:
- */
- if (!ret && write) {
- sched_rr_timeslice =
- sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
- msecs_to_jiffies(sysctl_sched_rr_timeslice);
- }
- mutex_unlock(&mutex);
- return ret;
-}
-
-#ifdef CONFIG_CGROUP_SCHED
static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
{
OpenPOWER on IntegriCloud