summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c4
-rw-r--r--kernel/cgroup.c10
-rw-r--r--kernel/cgroup_freezer.c116
-rw-r--r--kernel/context_tracking.c2
-rw-r--r--kernel/cpu.c6
-rw-r--r--kernel/hrtimer.c30
-rw-r--r--kernel/irq/irqdesc.c7
-rw-r--r--kernel/irq/manage.c17
-rw-r--r--kernel/locking/lockdep.c2
-rw-r--r--kernel/locking/locktorture.c2
-rw-r--r--kernel/module.c6
-rw-r--r--kernel/power/snapshot.c2
-rw-r--r--kernel/power/suspend.c5
-rw-r--r--kernel/printk/printk.c4
-rw-r--r--kernel/sched/core.c576
-rw-r--r--kernel/sched/cpudeadline.c37
-rw-r--r--kernel/sched/cpudeadline.h6
-rw-r--r--kernel/sched/cpupri.c10
-rw-r--r--kernel/sched/cpupri.h2
-rw-r--r--kernel/sched/cputime.c32
-rw-r--r--kernel/sched/deadline.c24
-rw-r--r--kernel/sched/fair.c644
-rw-r--r--kernel/sched/features.h8
-rw-r--r--kernel/sched/idle.c170
-rw-r--r--kernel/sched/rt.c129
-rw-r--r--kernel/sched/sched.h59
-rw-r--r--kernel/sched/stop_task.c4
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/softirq.c9
-rw-r--r--kernel/sys.c6
-rw-r--r--kernel/time/tick-common.c2
-rw-r--r--kernel/time/tick-sched.c5
-rw-r--r--kernel/timer.c2
-rw-r--r--kernel/trace/ftrace.c27
-rw-r--r--kernel/trace/trace_events_trigger.c2
-rw-r--r--kernel/trace/trace_functions.c16
-rw-r--r--kernel/trace/trace_uprobe.c6
-rw-r--r--kernel/tracepoint.c4
-rw-r--r--kernel/watchdog.c6
-rw-r--r--kernel/workqueue.c42
40 files changed, 1204 insertions, 839 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 7c28936..47845c5 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -643,13 +643,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
if ((task_active_pid_ns(current) != &init_pid_ns))
return -EPERM;
- if (!capable(CAP_AUDIT_CONTROL))
+ if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
err = -EPERM;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
- if (!capable(CAP_AUDIT_WRITE))
+ if (!netlink_capable(skb, CAP_AUDIT_WRITE))
err = -EPERM;
break;
default: /* bad msg */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9fcdaa7..3f1ca93 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -348,7 +348,7 @@ struct cgrp_cset_link {
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
-static struct css_set init_css_set = {
+struct css_set init_css_set = {
.refcount = ATOMIC_INIT(1),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
@@ -1495,7 +1495,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
*/
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
-retry:
+
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex);
@@ -1503,7 +1503,7 @@ retry:
ret = parse_cgroupfs_options(data, &opts);
if (ret)
goto out_unlock;
-
+retry:
/* look for a matching existing root */
if (!opts.subsys_mask && !opts.none && !opts.name) {
cgrp_dfl_root_visible = true;
@@ -1562,9 +1562,9 @@ retry:
if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
- kfree(opts.release_agent);
- kfree(opts.name);
msleep(10);
+ mutex_lock(&cgroup_tree_mutex);
+ mutex_lock(&cgroup_mutex);
goto retry;
}
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2bc4a22..345628c 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -21,6 +21,7 @@
#include <linux/uaccess.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
+#include <linux/mutex.h>
/*
* A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
@@ -42,9 +43,10 @@ enum freezer_state_flags {
struct freezer {
struct cgroup_subsys_state css;
unsigned int state;
- spinlock_t lock;
};
+static DEFINE_MUTEX(freezer_mutex);
+
static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct freezer, css) : NULL;
@@ -93,7 +95,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
if (!freezer)
return ERR_PTR(-ENOMEM);
- spin_lock_init(&freezer->lock);
return &freezer->css;
}
@@ -110,14 +111,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
struct freezer *freezer = css_freezer(css);
struct freezer *parent = parent_freezer(freezer);
- /*
- * The following double locking and freezing state inheritance
- * guarantee that @cgroup can never escape ancestors' freezing
- * states. See css_for_each_descendant_pre() for details.
- */
- if (parent)
- spin_lock_irq(&parent->lock);
- spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
+ mutex_lock(&freezer_mutex);
freezer->state |= CGROUP_FREEZER_ONLINE;
@@ -126,10 +120,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
atomic_inc(&system_freezing_cnt);
}
- spin_unlock(&freezer->lock);
- if (parent)
- spin_unlock_irq(&parent->lock);
-
+ mutex_unlock(&freezer_mutex);
return 0;
}
@@ -144,14 +135,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
{
struct freezer *freezer = css_freezer(css);
- spin_lock_irq(&freezer->lock);
+ mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
atomic_dec(&system_freezing_cnt);
freezer->state = 0;
- spin_unlock_irq(&freezer->lock);
+ mutex_unlock(&freezer_mutex);
}
static void freezer_css_free(struct cgroup_subsys_state *css)
@@ -175,7 +166,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
struct task_struct *task;
bool clear_frozen = false;
- spin_lock_irq(&freezer->lock);
+ mutex_lock(&freezer_mutex);
/*
* Make the new tasks conform to the current state of @new_css.
@@ -197,21 +188,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
}
}
- spin_unlock_irq(&freezer->lock);
-
- /*
- * Propagate FROZEN clearing upwards. We may race with
- * update_if_frozen(), but as long as both work bottom-up, either
- * update_if_frozen() sees child's FROZEN cleared or we clear the
- * parent's FROZEN later. No parent w/ !FROZEN children can be
- * left FROZEN.
- */
+ /* propagate FROZEN clearing upwards */
while (clear_frozen && (freezer = parent_freezer(freezer))) {
- spin_lock_irq(&freezer->lock);
freezer->state &= ~CGROUP_FROZEN;
clear_frozen = freezer->state & CGROUP_FREEZING;
- spin_unlock_irq(&freezer->lock);
}
+
+ mutex_unlock(&freezer_mutex);
}
/**
@@ -228,9 +211,6 @@ static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
- rcu_read_lock();
- freezer = task_freezer(task);
-
/*
* The root cgroup is non-freezable, so we can skip locking the
* freezer. This is safe regardless of race with task migration.
@@ -238,24 +218,18 @@ static void freezer_fork(struct task_struct *task)
* to do. If we lost and root is the new cgroup, noop is still the
* right thing to do.
*/
- if (!parent_freezer(freezer))
- goto out;
+ if (task_css_is_root(task, freezer_cgrp_id))
+ return;
- /*
- * Grab @freezer->lock and freeze @task after verifying @task still
- * belongs to @freezer and it's freezing. The former is for the
- * case where we have raced against task migration and lost and
- * @task is already in a different cgroup which may not be frozen.
- * This isn't strictly necessary as freeze_task() is allowed to be
- * called spuriously but let's do it anyway for, if nothing else,
- * documentation.
- */
- spin_lock_irq(&freezer->lock);
- if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+
+ freezer = task_freezer(task);
+ if (freezer->state & CGROUP_FREEZING)
freeze_task(task);
- spin_unlock_irq(&freezer->lock);
-out:
+
rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
/**
@@ -281,22 +255,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
struct css_task_iter it;
struct task_struct *task;
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- spin_lock_irq(&freezer->lock);
+ lockdep_assert_held(&freezer_mutex);
if (!(freezer->state & CGROUP_FREEZING) ||
(freezer->state & CGROUP_FROZEN))
- goto out_unlock;
+ return;
/* are all (live) children frozen? */
+ rcu_read_lock();
css_for_each_child(pos, css) {
struct freezer *child = css_freezer(pos);
if ((child->state & CGROUP_FREEZER_ONLINE) &&
- !(child->state & CGROUP_FROZEN))
- goto out_unlock;
+ !(child->state & CGROUP_FROZEN)) {
+ rcu_read_unlock();
+ return;
+ }
}
+ rcu_read_unlock();
/* are all tasks frozen? */
css_task_iter_start(css, &it);
@@ -317,21 +293,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
freezer->state |= CGROUP_FROZEN;
out_iter_end:
css_task_iter_end(&it);
-out_unlock:
- spin_unlock_irq(&freezer->lock);
}
static int freezer_read(struct seq_file *m, void *v)
{
struct cgroup_subsys_state *css = seq_css(m), *pos;
+ mutex_lock(&freezer_mutex);
rcu_read_lock();
/* update states bottom-up */
- css_for_each_descendant_post(pos, css)
+ css_for_each_descendant_post(pos, css) {
+ if (!css_tryget(pos))
+ continue;
+ rcu_read_unlock();
+
update_if_frozen(pos);
+ rcu_read_lock();
+ css_put(pos);
+ }
+
rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
seq_puts(m, freezer_state_strs(css_freezer(css)->state));
seq_putc(m, '\n');
@@ -373,7 +357,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
unsigned int state)
{
/* also synchronizes against task migration, see freezer_attach() */
- lockdep_assert_held(&freezer->lock);
+ lockdep_assert_held(&freezer_mutex);
if (!(freezer->state & CGROUP_FREEZER_ONLINE))
return;
@@ -414,31 +398,29 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
* descendant will try to inherit its parent's FREEZING state as
* CGROUP_FREEZING_PARENT.
*/
+ mutex_lock(&freezer_mutex);
rcu_read_lock();
css_for_each_descendant_pre(pos, &freezer->css) {
struct freezer *pos_f = css_freezer(pos);
struct freezer *parent = parent_freezer(pos_f);
- spin_lock_irq(&pos_f->lock);
+ if (!css_tryget(pos))
+ continue;
+ rcu_read_unlock();
- if (pos_f == freezer) {
+ if (pos_f == freezer)
freezer_apply_state(pos_f, freeze,
CGROUP_FREEZING_SELF);
- } else {
- /*
- * Our update to @parent->state is already visible
- * which is all we need. No need to lock @parent.
- * For more info on synchronization, see
- * freezer_post_create().
- */
+ else
freezer_apply_state(pos_f,
parent->state & CGROUP_FREEZING,
CGROUP_FREEZING_PARENT);
- }
- spin_unlock_irq(&pos_f->lock);
+ rcu_read_lock();
+ css_put(pos);
}
rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 6cb20d2..019d450 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
-asmlinkage void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
{
enum ctx_state prev_ctx;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a9e710e..247979a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -726,10 +726,12 @@ void set_cpu_present(unsigned int cpu, bool present)
void set_cpu_online(unsigned int cpu, bool online)
{
- if (online)
+ if (online) {
cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
- else
+ cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+ } else {
cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+ }
}
void set_cpu_active(unsigned int cpu, bool active)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index d55092c..e0501fe 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -234,6 +234,11 @@ again:
goto again;
}
timer->base = new_base;
+ } else {
+ if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
+ cpu = this_cpu;
+ goto again;
+ }
}
return new_base;
}
@@ -569,6 +574,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
cpu_base->expires_next.tv64 = expires_next.tv64;
+ /*
+ * If a hang was detected in the last timer interrupt then we
+ * leave the hang delay active in the hardware. We want the
+ * system to make progress. That also prevents the following
+ * scenario:
+ * T1 expires 50ms from now
+ * T2 expires 5s from now
+ *
+ * T1 is removed, so this code is called and would reprogram
+ * the hardware to 5s from now. Any hrtimer_start after that
+ * will not reprogram the hardware due to hang_detected being
+ * set. So we'd effectivly block all timers until the T2 event
+ * fires.
+ */
+ if (cpu_base->hang_detected)
+ return;
+
if (cpu_base->expires_next.tv64 != KTIME_MAX)
tick_program_event(cpu_base->expires_next, 1);
}
@@ -968,11 +990,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/* Remove an active timer from the queue: */
ret = remove_hrtimer(timer, base);
- /* Switch the timer base, if necessary: */
- new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
-
if (mode & HRTIMER_MODE_REL) {
- tim = ktime_add_safe(tim, new_base->get_time());
+ tim = ktime_add_safe(tim, base->get_time());
/*
* CONFIG_TIME_LOW_RES is a temporary way for architectures
* to signal that they simply return xtime in
@@ -987,6 +1006,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
hrtimer_set_expires_range_ns(timer, tim, delta_ns);
+ /* Switch the timer base, if necessary: */
+ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+
timer_stats_hrtimer_set_start_info(timer);
leftmost = enqueue_hrtimer(timer, new_base);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index a7174617..bb07f29 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -363,6 +363,13 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
if (from > irq)
return -EINVAL;
from = irq;
+ } else {
+ /*
+ * For interrupts which are freely allocated the
+ * architecture can force a lower bound to the @from
+ * argument. x86 uses this to exclude the GSI space.
+ */
+ from = arch_dynirq_lower_bound(from);
}
mutex_lock(&sparse_irq_lock);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2486a4c..d34131c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -180,7 +180,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
struct irq_chip *chip = irq_data_get_irq_chip(data);
int ret;
- ret = chip->irq_set_affinity(data, mask, false);
+ ret = chip->irq_set_affinity(data, mask, force);
switch (ret) {
case IRQ_SET_MASK_OK:
cpumask_copy(data->affinity, mask);
@@ -192,7 +192,8 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
return ret;
}
-int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
+int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
+ bool force)
{
struct irq_chip *chip = irq_data_get_irq_chip(data);
struct irq_desc *desc = irq_data_to_desc(data);
@@ -202,7 +203,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
return -EINVAL;
if (irq_can_move_pcntxt(data)) {
- ret = irq_do_set_affinity(data, mask, false);
+ ret = irq_do_set_affinity(data, mask, force);
} else {
irqd_set_move_pending(data);
irq_copy_pending(desc, mask);
@@ -217,13 +218,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
return ret;
}
-/**
- * irq_set_affinity - Set the irq affinity of a given irq
- * @irq: Interrupt to set affinity
- * @mask: cpumask
- *
- */
-int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
+int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
@@ -233,7 +228,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
return -EINVAL;
raw_spin_lock_irqsave(&desc->lock, flags);
- ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask);
+ ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);
raw_spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index b0e9467..d24e433 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4188,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task)
}
EXPORT_SYMBOL_GPL(debug_show_held_locks);
-asmlinkage void lockdep_sys_exit(void)
+asmlinkage __visible void lockdep_sys_exit(void)
{
struct task_struct *curr = current;
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index f26b1a1..23343be 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -216,7 +216,7 @@ static int lock_torture_writer(void *arg)
static DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
- set_user_nice(current, 19);
+ set_user_nice(current, MAX_NICE);
do {
schedule_timeout_uninterruptible(1);
diff --git a/kernel/module.c b/kernel/module.c
index 1186940..079c461 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -815,9 +815,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
return -EFAULT;
name[MODULE_NAME_LEN-1] = '\0';
- if (!(flags & O_NONBLOCK))
- pr_warn("waiting module removal not supported: please upgrade\n");
-
if (mutex_lock_interruptible(&module_mutex) != 0)
return -EINTR;
@@ -3271,6 +3268,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
dynamic_debug_setup(info->debug, info->num_debug);
+ /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
+ ftrace_module_init(mod);
+
/* Finally it's fully formed, ready to start executing. */
err = complete_formation(mod, info);
if (err)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 18fb7a2..1ea328a 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1586,7 +1586,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
return -ENOMEM;
}
-asmlinkage int swsusp_save(void)
+asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c3ad9ca..155721f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -14,6 +14,7 @@
#include <linux/init.h>
#include <linux/console.h>
#include <linux/cpu.h>
+#include <linux/cpuidle.h>
#include <linux/syscalls.h>
#include <linux/gfp.h>
#include <linux/io.h>
@@ -53,7 +54,11 @@ static void freeze_begin(void)
static void freeze_enter(void)
{
+ cpuidle_use_deepest_state(true);
+ cpuidle_resume();
wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+ cpuidle_pause();
+ cpuidle_use_deepest_state(false);
}
void freeze_wake(void)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a45b509..7228258 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1674,7 +1674,7 @@ EXPORT_SYMBOL(printk_emit);
*
* See the vsnprintf() documentation for format string extensions over C99.
*/
-asmlinkage int printk(const char *fmt, ...)
+asmlinkage __visible int printk(const char *fmt, ...)
{
va_list args;
int r;
@@ -1737,7 +1737,7 @@ void early_vprintk(const char *fmt, va_list ap)
}
}
-asmlinkage void early_printk(const char *fmt, ...)
+asmlinkage __visible void early_printk(const char *fmt, ...)
{
va_list ap;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8a70ec0..54f5722 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -522,6 +522,71 @@ static inline void init_hrtick(void)
#endif /* CONFIG_SCHED_HRTICK */
/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, val) \
+({ typeof(*(ptr)) __old, __val = *(ptr); \
+ for (;;) { \
+ __old = cmpxchg((ptr), __val, __val | (val)); \
+ if (__old == __val) \
+ break; \
+ __val = __old; \
+ } \
+ __old; \
+})
+
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
+ *
+ * If this returns true, then the idle task promises to call
+ * sched_ttwu_pending() and reschedule soon.
+ */
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+
+ for (;;) {
+ if (!(val & _TIF_POLLING_NRFLAG))
+ return false;
+ if (val & _TIF_NEED_RESCHED)
+ return true;
+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+ if (old == val)
+ break;
+ val = old;
+ }
+ return true;
+}
+
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ set_tsk_need_resched(p);
+ return true;
+}
+
+#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ return false;
+}
+#endif
+#endif
+
+/*
* resched_task - mark a task 'to be rescheduled now'.
*
* On UP this means the setting of the need_resched flag, on SMP it
@@ -537,18 +602,18 @@ void resched_task(struct task_struct *p)
if (test_tsk_need_resched(p))
return;
- set_tsk_need_resched(p);
-
cpu = task_cpu(p);
+
if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(p);
set_preempt_need_resched();
return;
}
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(p))
+ if (set_nr_and_not_polling(p))
smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
void resched_cpu(int cpu)
@@ -611,27 +676,10 @@ static void wake_up_idle_cpu(int cpu)
if (cpu == smp_processor_id())
return;
- /*
- * This is safe, as this function is called with the timer
- * wheel base lock of (cpu) held. When the CPU is on the way
- * to idle and has not yet set rq->curr to idle then it will
- * be serialized on the timer wheel base lock and take the new
- * timer into account automatically.
- */
- if (rq->curr != rq->idle)
- return;
-
- /*
- * We can set TIF_RESCHED on the idle task of the other CPU
- * lockless. The worst case is that the other CPU runs the
- * idle task through an additional NOOP schedule()
- */
- set_tsk_need_resched(rq->idle);
-
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(rq->idle))
+ if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
static bool wake_up_full_nohz_cpu(int cpu)
@@ -857,7 +905,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->clock_task += delta;
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
sched_rt_avg_update(rq, irq_delta + steal);
#endif
}
@@ -1490,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p;
+ unsigned long flags;
- raw_spin_lock(&rq->lock);
+ if (!llist)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
while (llist) {
p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1504,7 +1556,7 @@ static void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, 0);
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
void scheduler_ipi(void)
@@ -1550,8 +1602,14 @@ void scheduler_ipi(void)
static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
- smp_send_reschedule(cpu);
+ struct rq *rq = cpu_rq(cpu);
+
+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
+ if (!set_nr_if_polling(rq->idle))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+ }
}
bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -2208,7 +2266,7 @@ static inline void post_schedule(struct rq *rq)
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
*/
-asmlinkage void schedule_tail(struct task_struct *prev)
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
struct rq *rq = this_rq();
@@ -2608,8 +2666,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
if (likely(prev->sched_class == class &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev);
- if (likely(p && p != RETRY_TASK))
- return p;
+ if (unlikely(p == RETRY_TASK))
+ goto again;
+
+ /* assumes fair_sched_class->next == idle_sched_class */
+ if (unlikely(!p))
+ p = idle_sched_class.pick_next_task(rq, prev);
+
+ return p;
}
again:
@@ -2757,7 +2821,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
blk_schedule_flush_plug(tsk);
}
-asmlinkage void __sched schedule(void)
+asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;
@@ -2767,7 +2831,7 @@ asmlinkage void __sched schedule(void)
EXPORT_SYMBOL(schedule);
#ifdef CONFIG_CONTEXT_TRACKING
-asmlinkage void __sched schedule_user(void)
+asmlinkage __visible void __sched schedule_user(void)
{
/*
* If we come here after a random call to set_need_resched(),
@@ -2799,7 +2863,7 @@ void __sched schedule_preempt_disabled(void)
* off of preempt_enable. Kernel preemptions off return from interrupt
* occur there and call schedule directly.
*/
-asmlinkage void __sched notrace preempt_schedule(void)
+asmlinkage __visible void __sched notrace preempt_schedule(void)
{
/*
* If there is a non-zero preempt_count or interrupts are disabled,
@@ -2829,7 +2893,7 @@ EXPORT_SYMBOL(preempt_schedule);
* Note, that this is called and return with irqs disabled. This will
* protect us against recursive calling from irq.
*/
-asmlinkage void __sched preempt_schedule_irq(void)
+asmlinkage __visible void __sched preempt_schedule_irq(void)
{
enum ctx_state prev_state;
@@ -3012,7 +3076,7 @@ EXPORT_SYMBOL(set_user_nice);
int can_nice(const struct task_struct *p, const int nice)
{
/* convert nice value [19,-20] to rlimit style value [1,40] */
- int nice_rlim = 20 - nice;
+ int nice_rlim = nice_to_rlimit(nice);
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
capable(CAP_SYS_NICE));
@@ -3036,17 +3100,10 @@ SYSCALL_DEFINE1(nice, int, increment)
* We don't have to worry. Conceptually one call occurs first
* and we have a single winner.
*/
- if (increment < -40)
- increment = -40;
- if (increment > 40)
- increment = 40;
-
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
nice = task_nice(current) + increment;
- if (nice < MIN_NICE)
- nice = MIN_NICE;
- if (nice > MAX_NICE)
- nice = MAX_NICE;
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
if (increment < 0 && !can_nice(current, nice))
return -EPERM;
@@ -3140,6 +3197,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_throttled = 0;
dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
}
static void __setscheduler_params(struct task_struct *p,
@@ -3204,17 +3262,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
* We ask for the deadline not being zero, and greater or equal
* than the runtime, as well as the period of being zero or
* greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution (1us); we
- * check sched_runtime only since it is always the smaller one.
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
*/
static bool
__checkparam_dl(const struct sched_attr *attr)
{
- return attr && attr->sched_deadline != 0 &&
- (attr->sched_period == 0 ||
- (s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
- (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
- attr->sched_runtime >= (2 << (DL_SCALE - 1));
+ /* deadline != 0 */
+ if (attr->sched_deadline == 0)
+ return false;
+
+ /*
+ * Since we truncate DL_SCALE bits, make sure we're at least
+ * that big.
+ */
+ if (attr->sched_runtime < (1ULL << DL_SCALE))
+ return false;
+
+ /*
+ * Since we use the MSB for wrap-around and sign issues, make
+ * sure it's not set (mind that period can be equal to zero).
+ */
+ if (attr->sched_deadline & (1ULL << 63) ||
+ attr->sched_period & (1ULL << 63))
+ return false;
+
+ /* runtime <= deadline <= period (if period != 0) */
+ if ((attr->sched_period != 0 &&
+ attr->sched_period < attr->sched_deadline) ||
+ attr->sched_deadline < attr->sched_runtime)
+ return false;
+
+ return true;
}
/*
@@ -3612,13 +3693,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
*/
attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
-out:
- return ret;
+ return 0;
err_size:
put_user(sizeof(*attr), &uattr->size);
- ret = -E2BIG;
- goto out;
+ return -E2BIG;
}
/**
@@ -3655,6 +3734,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
* sys_sched_setattr - same as above, but with extended sched_attr
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
*/
SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, flags)
@@ -3666,8 +3746,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
if (!uattr || pid < 0 || flags)
return -EINVAL;
- if (sched_copy_attr(uattr, &attr))
- return -EFAULT;
+ retval = sched_copy_attr(uattr, &attr);
+ if (retval)
+ return retval;
+
+ if (attr.sched_policy < 0)
+ return -EINVAL;
rcu_read_lock();
retval = -ESRCH;
@@ -3717,7 +3801,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
*/
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
- struct sched_param lp;
+ struct sched_param lp = { .sched_priority = 0 };
struct task_struct *p;
int retval;
@@ -3734,11 +3818,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
if (retval)
goto out_unlock;
- if (task_has_dl_policy(p)) {
- retval = -EINVAL;
- goto out_unlock;
- }
- lp.sched_priority = p->rt_priority;
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
rcu_read_unlock();
/*
@@ -3776,7 +3857,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
for (; addr < end; addr++) {
if (*addr)
- goto err_size;
+ return -EFBIG;
}
attr->size = usize;
@@ -3786,12 +3867,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
if (ret)
return -EFAULT;
-out:
- return ret;
-
-err_size:
- ret = -E2BIG;
- goto out;
+ return 0;
}
/**
@@ -3799,6 +3875,7 @@ err_size:
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
* @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
*/
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, size, unsigned int, flags)
@@ -4161,7 +4238,7 @@ EXPORT_SYMBOL(yield);
* false (0) if we failed to boost the target.
* -ESRCH if there's no task to yield to.
*/
-bool __sched yield_to(struct task_struct *p, bool preempt)
+int __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
@@ -5055,11 +5132,20 @@ static struct notifier_block migration_notifier = {
.priority = CPU_PRI_MIGRATION,
};
+static void __cpuinit set_cpu_rq_start_time(void)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ rq->age_stamp = sched_clock_cpu(cpu);
+}
+
static int sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_STARTING:
+ set_cpu_rq_start_time();
+ return NOTIFY_OK;
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
@@ -5178,14 +5264,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
}
/*
- * Even though we initialize ->power to something semi-sane,
- * we leave power_orig unset. This allows us to detect if
+ * Even though we initialize ->capacity to something semi-sane,
+ * we leave capacity_orig unset. This allows us to detect if
* domain iteration is still funny without causing /0 traps.
*/
- if (!group->sgp->power_orig) {
+ if (!group->sgc->capacity_orig) {
printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: domain->cpu_power not "
- "set\n");
+ printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
break;
}
@@ -5207,9 +5292,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
- if (group->sgp->power != SCHED_POWER_SCALE) {
- printk(KERN_CONT " (cpu_power = %d)",
- group->sgp->power);
+ if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
+ printk(KERN_CONT " (cpu_capacity = %d)",
+ group->sgc->capacity);
}
group = group->next;
@@ -5267,8 +5352,9 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
- SD_SHARE_CPUPOWER |
- SD_SHARE_PKG_RESOURCES)) {
+ SD_SHARE_CPUCAPACITY |
+ SD_SHARE_PKG_RESOURCES |
+ SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
return 0;
}
@@ -5297,9 +5383,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
- SD_SHARE_CPUPOWER |
+ SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING);
+ SD_PREFER_SIBLING |
+ SD_SHARE_POWERDOMAIN);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -5421,7 +5508,7 @@ static struct root_domain *alloc_rootdomain(void)
return rd;
}
-static void free_sched_groups(struct sched_group *sg, int free_sgp)
+static void free_sched_groups(struct sched_group *sg, int free_sgc)
{
struct sched_group *tmp, *first;
@@ -5432,8 +5519,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)
do {
tmp = sg->next;
- if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
- kfree(sg->sgp);
+ if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
+ kfree(sg->sgc);
kfree(sg);
sg = tmp;
@@ -5451,7 +5538,7 @@ static void free_sched_domain(struct rcu_head *rcu)
if (sd->flags & SD_OVERLAP) {
free_sched_groups(sd->groups, 1);
} else if (atomic_dec_and_test(&sd->groups->ref)) {
- kfree(sd->groups->sgp);
+ kfree(sd->groups->sgc);
kfree(sd->groups);
}
kfree(sd);
@@ -5573,17 +5660,6 @@ static int __init isolated_cpu_setup(char *str)
__setup("isolcpus=", isolated_cpu_setup);
-static const struct cpumask *cpu_cpu_mask(int cpu)
-{
- return cpumask_of_node(cpu_to_node(cpu));
-}
-
-struct sd_data {
- struct sched_domain **__percpu sd;
- struct sched_group **__percpu sg;
- struct sched_group_power **__percpu sgp;
-};
-
struct s_data {
struct sched_domain ** __percpu sd;
struct root_domain *rd;
@@ -5596,21 +5672,6 @@ enum s_alloc {
sa_none,
};
-struct sched_domain_topology_level;
-
-typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-
-#define SDTL_OVERLAP 0x01
-
-struct sched_domain_topology_level {
- sched_domain_init_f init;
- sched_domain_mask_f mask;
- int flags;
- int numa_level;
- struct sd_data data;
-};
-
/*
* Build an iteration mask that can exclude certain CPUs from the upwards
* domain traversal.
@@ -5688,17 +5749,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_or(covered, covered, sg_span);
- sg->sgp = *per_cpu_ptr(sdd->sgp, i);
- if (atomic_inc_return(&sg->sgp->ref) == 1)
+ sg->sgc = *per_cpu_ptr(sdd->sgc, i);
+ if (atomic_inc_return(&sg->sgc->ref) == 1)
build_group_mask(sd, sg);
/*
- * Initialize sgp->power such that even if we mess up the
+ * Initialize sgc->capacity such that even if we mess up the
* domains and no possible iteration will get us here, we won't
* die on a /0 trap.
*/
- sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
- sg->sgp->power_orig = sg->sgp->power;
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->capacity_orig = sg->sgc->capacity;
/*
* Make sure the first group of this domain contains the
@@ -5736,8 +5797,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
if (sg) {
*sg = *per_cpu_ptr(sdd->sg, cpu);
- (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
- atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+ (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
}
return cpu;
@@ -5746,7 +5807,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
/*
* build_sched_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_power to 0.
+ * and ->cpu_capacity to 0.
*
* Assumes the sched_domain tree is fully constructed
*/
@@ -5778,8 +5839,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
continue;
group = get_group(i, sdd, &sg);
- cpumask_clear(sched_group_cpus(sg));
- sg->sgp->power = 0;
cpumask_setall(sched_group_mask(sg));
for_each_cpu(j, span) {
@@ -5802,16 +5861,16 @@ build_sched_groups(struct sched_domain *sd, int cpu)
}
/*
- * Initialize sched groups cpu_power.
+ * Initialize sched groups cpu_capacity.
*
- * cpu_power indicates the capacity of sched group, which is used while
+ * cpu_capacity indicates the capacity of sched group, which is used while
* distributing the load between different sched groups in a sched domain.
- * Typically cpu_power for all the groups in a sched domain will be same unless
- * there are asymmetries in the topology. If there are asymmetries, group
- * having more cpu_power will pickup more load compared to the group having
- * less cpu_power.
+ * Typically cpu_capacity for all the groups in a sched domain will be same
+ * unless there are asymmetries in the topology. If there are asymmetries,
+ * group having more cpu_capacity will pickup more load compared to the
+ * group having less cpu_capacity.
*/
-static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
@@ -5825,13 +5884,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
if (cpu != group_balance_cpu(sg))
return;
- update_group_power(sd, cpu);
- atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
-}
-
-int __weak arch_sd_sibling_asym_packing(void)
-{
- return 0*SD_ASYM_PACKING;
+ update_group_capacity(sd, cpu);
+ atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
}
/*
@@ -5839,34 +5893,6 @@ int __weak arch_sd_sibling_asym_packing(void)
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(sd, type) sd->name = #type
-#else
-# define SD_INIT_NAME(sd, type) do { } while (0)
-#endif
-
-#define SD_INIT_FUNC(type) \
-static noinline struct sched_domain * \
-sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
-{ \
- struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
- *sd = SD_##type##_INIT; \
- SD_INIT_NAME(sd, type); \
- sd->private = &tl->data; \
- return sd; \
-}
-
-SD_INIT_FUNC(CPU)
-#ifdef CONFIG_SCHED_SMT
- SD_INIT_FUNC(SIBLING)
-#endif
-#ifdef CONFIG_SCHED_MC
- SD_INIT_FUNC(MC)
-#endif
-#ifdef CONFIG_SCHED_BOOK
- SD_INIT_FUNC(BOOK)
-#endif
-
static int default_relax_domain_level = -1;
int sched_domain_level_max;
@@ -5950,101 +5976,158 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL;
- if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
- *per_cpu_ptr(sdd->sgp, cpu) = NULL;
-}
-
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *cpu_smt_mask(int cpu)
-{
- return topology_thread_cpumask(cpu);
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+ *per_cpu_ptr(sdd->sgc, cpu) = NULL;
}
-#endif
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
- { sd_init_SIBLING, cpu_smt_mask, },
-#endif
-#ifdef CONFIG_SCHED_MC
- { sd_init_MC, cpu_coregroup_mask, },
-#endif
-#ifdef CONFIG_SCHED_BOOK
- { sd_init_BOOK, cpu_book_mask, },
-#endif
- { sd_init_CPU, cpu_cpu_mask, },
- { NULL, },
-};
-
-static struct sched_domain_topology_level *sched_domain_topology = default_topology;
-
-#define for_each_sd_topology(tl) \
- for (tl = sched_domain_topology; tl->init; tl++)
#ifdef CONFIG_NUMA
-
static int sched_domains_numa_levels;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
+#endif
-static inline int sd_local_flags(int level)
-{
- if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
- return 0;
-
- return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
-}
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS \
+ (SD_SHARE_CPUCAPACITY | \
+ SD_SHARE_PKG_RESOURCES | \
+ SD_NUMA | \
+ SD_ASYM_PACKING | \
+ SD_SHARE_POWERDOMAIN)
static struct sched_domain *
-sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl, int cpu)
{
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
- int level = tl->numa_level;
- int sd_weight = cpumask_weight(
- sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+ int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+ /*
+ * Ugly hack to pass state to sd_numa_mask()...
+ */
+ sched_domains_curr_level = tl->numa_level;
+#endif
+
+ sd_weight = cpumask_weight(tl->mask(cpu));
+
+ if (tl->sd_flags)
+ sd_flags = (*tl->sd_flags)();
+ if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+ "wrong sd_flags in topology description\n"))
+ sd_flags &= ~TOPOLOGY_SD_FLAGS;
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
.busy_factor = 32,
.imbalance_pct = 125,
- .cache_nice_tries = 2,
- .busy_idx = 3,
- .idle_idx = 2,
+
+ .cache_nice_tries = 0,
+ .busy_idx = 0,
+ .idle_idx = 0,
.newidle_idx = 0,
.wake_idx = 0,
.forkexec_idx = 0,
.flags = 1*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
- | 0*SD_BALANCE_EXEC
- | 0*SD_BALANCE_FORK
+ | 1*SD_BALANCE_EXEC
+ | 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
- | 0*SD_WAKE_AFFINE
- | 0*SD_SHARE_CPUPOWER
+ | 1*SD_WAKE_AFFINE
+ | 0*SD_SHARE_CPUCAPACITY
| 0*SD_SHARE_PKG_RESOURCES
- | 1*SD_SERIALIZE
+ | 0*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
- | 1*SD_NUMA
- | sd_local_flags(level)
+ | 0*SD_NUMA
+ | sd_flags
,
+
.last_balance = jiffies,
.balance_interval = sd_weight,
+ .smt_gain = 0,
+ .max_newidle_lb_cost = 0,
+ .next_decay_max_lb_cost = jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+ .name = tl->name,
+#endif
};
- SD_INIT_NAME(sd, NUMA);
- sd->private = &tl->data;
/*
- * Ugly hack to pass state to sd_numa_mask()...
+ * Convert topological properties into behaviour.
*/
- sched_domains_curr_level = tl->numa_level;
+
+ if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->imbalance_pct = 110;
+ sd->smt_gain = 1178; /* ~15% */
+
+ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->imbalance_pct = 117;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+ } else if (sd->flags & SD_NUMA) {
+ sd->cache_nice_tries = 2;
+ sd->busy_idx = 3;
+ sd->idle_idx = 2;
+
+ sd->flags |= SD_SERIALIZE;
+ if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ sd->flags &= ~(SD_BALANCE_EXEC |
+ SD_BALANCE_FORK |
+ SD_WAKE_AFFINE);
+ }
+
+#endif
+ } else {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+ sd->idle_idx = 1;
+ }
+
+ sd->private = &tl->data;
return sd;
}
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
+struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+#define for_each_sd_topology(tl) \
+ for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+ sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
static const struct cpumask *sd_numa_mask(int cpu)
{
return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
@@ -6188,7 +6271,10 @@ static void sched_init_numa(void)
}
}
- tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+ /* Compute default topology size */
+ for (i = 0; sched_domain_topology[i].mask; i++);
+
+ tl = kzalloc((i + level + 1) *
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
if (!tl)
return;
@@ -6196,18 +6282,19 @@ static void sched_init_numa(void)
/*
* Copy the default topology bits..
*/
- for (i = 0; default_topology[i].init; i++)
- tl[i] = default_topology[i];
+ for (i = 0; sched_domain_topology[i].mask; i++)
+ tl[i] = sched_domain_topology[i];
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 0; j < level; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
- .init = sd_numa_init,
.mask = sd_numa_mask,
+ .sd_flags = cpu_numa_flags,
.flags = SDTL_OVERLAP,
.numa_level = j,
+ SD_INIT_NAME(NUMA)
};
}
@@ -6292,14 +6379,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sg)
return -ENOMEM;
- sdd->sgp = alloc_percpu(struct sched_group_power *);
- if (!sdd->sgp)
+ sdd->sgc = alloc_percpu(struct sched_group_capacity *);
+ if (!sdd->sgc)
return -ENOMEM;
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
struct sched_group *sg;
- struct sched_group_power *sgp;
+ struct sched_group_capacity *sgc;
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
@@ -6317,12 +6404,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
*per_cpu_ptr(sdd->sg, j) = sg;
- sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
+ sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
- if (!sgp)
+ if (!sgc)
return -ENOMEM;
- *per_cpu_ptr(sdd->sgp, j) = sgp;
+ *per_cpu_ptr(sdd->sgc, j) = sgc;
}
}
@@ -6349,15 +6436,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
if (sdd->sg)
kfree(*per_cpu_ptr(sdd->sg, j));
- if (sdd->sgp)
- kfree(*per_cpu_ptr(sdd->sgp, j));
+ if (sdd->sgc)
+ kfree(*per_cpu_ptr(sdd->sgc, j));
}
free_percpu(sdd->sd);
sdd->sd = NULL;
free_percpu(sdd->sg);
sdd->sg = NULL;
- free_percpu(sdd->sgp);
- sdd->sgp = NULL;
+ free_percpu(sdd->sgc);
+ sdd->sgc = NULL;
}
}
@@ -6365,7 +6452,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = tl->init(tl, cpu);
+ struct sched_domain *sd = sd_init(tl, cpu);
if (!sd)
return child;
@@ -6427,14 +6514,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
}
}
- /* Calculate CPU power for physical packages and nodes */
+ /* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))
continue;
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
claim_allocations(i, sd);
- init_sched_groups_power(i, sd);
+ init_sched_groups_capacity(i, sd);
}
}
@@ -6877,7 +6964,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_power = SCHED_POWER_SCALE;
+ rq->cpu_capacity = SCHED_CAPACITY_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -6935,6 +7022,7 @@ void __init sched_init(void)
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
+ set_cpu_rq_start_time();
#endif
init_sched_fair_class();
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b9bb42..bd95963 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -13,6 +13,7 @@
#include <linux/gfp.h>
#include <linux/kernel.h>
+#include <linux/slab.h>
#include "cpudeadline.h"
static inline int parent(int i)
@@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b)
{
int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
- swap(cp->elements[a], cp->elements[b]);
- swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
+ swap(cp->elements[a].cpu, cp->elements[b].cpu);
+ swap(cp->elements[a].dl , cp->elements[b].dl );
+
+ swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
}
static void cpudl_heapify(struct cpudl *cp, int idx)
@@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
WARN_ON(!cpu_present(cpu));
raw_spin_lock_irqsave(&cp->lock, flags);
- old_idx = cp->cpu_to_idx[cpu];
+ old_idx = cp->elements[cpu].idx;
if (!is_valid) {
/* remove item */
if (old_idx == IDX_INVALID) {
@@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
cp->elements[old_idx].cpu = new_cpu;
cp->size--;
- cp->cpu_to_idx[new_cpu] = old_idx;
- cp->cpu_to_idx[cpu] = IDX_INVALID;
+ cp->elements[new_cpu].idx = old_idx;
+ cp->elements[cpu].idx = IDX_INVALID;
while (old_idx > 0 && dl_time_before(
cp->elements[parent(old_idx)].dl,
cp->elements[old_idx].dl)) {
@@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
cp->size++;
cp->elements[cp->size - 1].dl = 0;
cp->elements[cp->size - 1].cpu = cpu;
- cp->cpu_to_idx[cpu] = cp->size - 1;
+ cp->elements[cpu].idx = cp->size - 1;
cpudl_change_key(cp, cp->size - 1, dl);
cpumask_clear_cpu(cpu, cp->free_cpus);
} else {
@@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp)
memset(cp, 0, sizeof(*cp));
raw_spin_lock_init(&cp->lock);
cp->size = 0;
- for (i = 0; i < NR_CPUS; i++)
- cp->cpu_to_idx[i] = IDX_INVALID;
- if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
+
+ cp->elements = kcalloc(nr_cpu_ids,
+ sizeof(struct cpudl_item),
+ GFP_KERNEL);
+ if (!cp->elements)
+ return -ENOMEM;
+
+ if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+ kfree(cp->elements);
return -ENOMEM;
+ }
+
+ for_each_possible_cpu(i)
+ cp->elements[i].idx = IDX_INVALID;
+
cpumask_setall(cp->free_cpus);
return 0;
@@ -210,7 +224,6 @@ int cpudl_init(struct cpudl *cp)
*/
void cpudl_cleanup(struct cpudl *cp)
{
- /*
- * nothing to do for the moment
- */
+ free_cpumask_var(cp->free_cpus);
+ kfree(cp->elements);
}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index a202789..538c979 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -5,17 +5,17 @@
#define IDX_INVALID -1
-struct array_item {
+struct cpudl_item {
u64 dl;
int cpu;
+ int idx;
};
struct cpudl {
raw_spinlock_t lock;
int size;
- int cpu_to_idx[NR_CPUS];
- struct array_item elements[NR_CPUS];
cpumask_var_t free_cpus;
+ struct cpudl_item *elements;
};
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 746bc93..981fcd7 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -30,6 +30,7 @@
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
+#include <linux/slab.h>
#include "cpupri.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
int idx = 0;
int task_pri = convert_prio(p->prio);
- if (task_pri >= MAX_RT_PRIO)
- return 0;
+ BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
for (idx = 0; idx < task_pri; idx++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
@@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp)
goto cleanup;
}
+ cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
+ if (!cp->cpu_to_pri)
+ goto cleanup;
+
for_each_possible_cpu(i)
cp->cpu_to_pri[i] = CPUPRI_INVALID;
+
return 0;
cleanup:
@@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp)
{
int i;
+ kfree(cp->cpu_to_pri);
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
free_cpumask_var(cp->pri_to_cpu[i].mask);
}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index f6d7561..6b03334 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -17,7 +17,7 @@ struct cpupri_vec {
struct cpupri {
struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
- int cpu_to_pri[NR_CPUS];
+ int *cpu_to_pri;
};
#ifdef CONFIG_SMP
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a95097c..72fdf06 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -332,50 +332,50 @@ out:
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq)
+ struct rq *rq, int ticks)
{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+ u64 cputime = (__force u64) cputime_one_jiffy;
u64 *cpustat = kcpustat_this_cpu->cpustat;
if (steal_account_process_tick())
return;
+ cputime *= ticks;
+ scaled *= ticks;
+
if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+ cpustat[CPUTIME_IRQ] += cputime;
} else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+ cpustat[CPUTIME_SOFTIRQ] += cputime;
} else if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
* Also, p->stime needs to be updated for ksoftirqd.
*/
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SOFTIRQ);
+ __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
} else if (user_tick) {
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_user_time(p, cputime, scaled);
} else if (p == rq->idle) {
- account_idle_time(cputime_one_jiffy);
+ account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
- account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_guest_time(p, cputime, scaled);
} else {
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SYSTEM);
+ __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
}
}
static void irqtime_account_idle_ticks(int ticks)
{
- int i;
struct rq *rq = this_rq();
- for (i = 0; i < ticks; i++)
- irqtime_account_process_tick(current, 0, rq);
+ irqtime_account_process_tick(current, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) {}
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq) {}
+ struct rq *rq, int nr_ticks) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
@@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq);
+ irqtime_account_process_tick(p, user_tick, rq, 1);
return;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 27ef409..0d6b170 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
dl_b->dl_runtime = runtime;
}
-extern unsigned long to_ratio(u64 period, u64 runtime);
-
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);
@@ -520,7 +518,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* We need to take care of a possible races here. In fact, the
* task might have changed its scheduling policy to something
* different from SCHED_DEADLINE or changed its reservation
- * parameters (through sched_setscheduler()).
+ * parameters (through sched_setattr()).
*/
if (!dl_task(p) || dl_se->dl_new)
goto unlock;
@@ -528,6 +526,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
sched_clock_tick();
update_rq_clock(rq);
dl_se->dl_throttled = 0;
+ dl_se->dl_yielded = 0;
if (p->on_rq) {
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (task_has_dl_policy(rq->curr))
@@ -740,7 +739,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
- inc_nr_running(rq_of_dl_rq(dl_rq));
+ add_nr_running(rq_of_dl_rq(dl_rq), 1);
inc_dl_deadline(dl_rq, deadline);
inc_dl_migration(dl_se, dl_rq);
@@ -754,7 +753,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
- dec_nr_running(rq_of_dl_rq(dl_rq));
+ sub_nr_running(rq_of_dl_rq(dl_rq), 1);
dec_dl_deadline(dl_rq, dl_se->deadline);
dec_dl_migration(dl_se, dl_rq);
@@ -893,10 +892,10 @@ static void yield_task_dl(struct rq *rq)
* We make the task go to sleep until its current deadline by
* forcing its runtime to zero. This way, update_curr_dl() stops
* it and the bandwidth timer will wake it up and will give it
- * new scheduling parameters (thanks to dl_new=1).
+ * new scheduling parameters (thanks to dl_yielded=1).
*/
if (p->dl.runtime > 0) {
- rq->curr->dl.dl_new = 1;
+ rq->curr->dl.dl_yielded = 1;
p->dl.runtime = 0;
}
update_curr_dl(rq);
@@ -1021,8 +1020,17 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
dl_rq = &rq->dl;
- if (need_pull_dl_task(rq, prev))
+ if (need_pull_dl_task(rq, prev)) {
pull_dl_task(rq);
+ /*
+ * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * means a stop task can slip in, in which case we need to
+ * re-start task selection.
+ */
+ if (rq->stop && rq->stop->on_rq)
+ return RETRY_TASK;
+ }
+
/*
* When prev is DL, we may throttle it in put_prev_task().
* So, we update time before we check for dl_nr_running.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7e9bd0b..d3c73122 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1017,7 +1017,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
-static unsigned long power_of(int cpu);
+static unsigned long capacity_of(int cpu);
static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
/* Cached statistics for all CPUs within a node */
@@ -1026,11 +1026,11 @@ struct numa_stats {
unsigned long load;
/* Total compute capacity of CPUs on a node */
- unsigned long power;
+ unsigned long compute_capacity;
/* Approximate capacity in terms of runnable tasks on a node */
- unsigned long capacity;
- int has_capacity;
+ unsigned long task_capacity;
+ int has_free_capacity;
};
/*
@@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
ns->nr_running += rq->nr_running;
ns->load += weighted_cpuload(cpu);
- ns->power += power_of(cpu);
+ ns->compute_capacity += capacity_of(cpu);
cpus++;
}
@@ -1056,15 +1056,16 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
* the @ns structure is NULL'ed and task_numa_compare() will
* not find this node attractive.
*
- * We'll either bail at !has_capacity, or we'll detect a huge imbalance
- * and bail there.
+ * We'll either bail at !has_free_capacity, or we'll detect a huge
+ * imbalance and bail there.
*/
if (!cpus)
return;
- ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
- ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
- ns->has_capacity = (ns->nr_running < ns->capacity);
+ ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
+ ns->task_capacity =
+ DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
+ ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
}
struct task_numa_env {
@@ -1095,6 +1096,34 @@ static void task_numa_assign(struct task_numa_env *env,
env->best_cpu = env->dst_cpu;
}
+static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
+ long src_load, long dst_load,
+ struct task_numa_env *env)
+{
+ long imb, old_imb;
+
+ /* We care about the slope of the imbalance, not the direction. */
+ if (dst_load < src_load)
+ swap(dst_load, src_load);
+
+ /* Is the difference below the threshold? */
+ imb = dst_load * 100 - src_load * env->imbalance_pct;
+ if (imb <= 0)
+ return false;
+
+ /*
+ * The imbalance is above the allowed threshold.
+ * Compare it with the old imbalance.
+ */
+ if (orig_dst_load < orig_src_load)
+ swap(orig_dst_load, orig_src_load);
+
+ old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+
+ /* Would this change make things worse? */
+ return (old_imb > imb);
+}
+
/*
* This checks if the overall compute and NUMA accesses of the system would
* be improved if the source tasks was migrated to the target dst_cpu taking
@@ -1107,7 +1136,8 @@ static void task_numa_compare(struct task_numa_env *env,
struct rq *src_rq = cpu_rq(env->src_cpu);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
struct task_struct *cur;
- long dst_load, src_load;
+ long orig_src_load, src_load;
+ long orig_dst_load, dst_load;
long load;
long imp = (groupimp > 0) ? groupimp : taskimp;
@@ -1166,8 +1196,8 @@ static void task_numa_compare(struct task_numa_env *env,
if (!cur) {
/* Is there capacity at our destination? */
- if (env->src_stats.has_capacity &&
- !env->dst_stats.has_capacity)
+ if (env->src_stats.has_free_capacity &&
+ !env->dst_stats.has_free_capacity)
goto unlock;
goto balance;
@@ -1181,13 +1211,13 @@ static void task_numa_compare(struct task_numa_env *env,
* In the overloaded case, try and keep the load balanced.
*/
balance:
- dst_load = env->dst_stats.load;
- src_load = env->src_stats.load;
+ orig_dst_load = env->dst_stats.load;
+ orig_src_load = env->src_stats.load;
- /* XXX missing power terms */
+ /* XXX missing capacity terms */
load = task_h_load(env->p);
- dst_load += load;
- src_load -= load;
+ dst_load = orig_dst_load + load;
+ src_load = orig_src_load - load;
if (cur) {
load = task_h_load(cur);
@@ -1195,11 +1225,8 @@ balance:
src_load += load;
}
- /* make src_load the smaller */
- if (dst_load < src_load)
- swap(dst_load, src_load);
-
- if (src_load * env->imbalance_pct < dst_load * 100)
+ if (load_too_imbalanced(orig_src_load, orig_dst_load,
+ src_load, dst_load, env))
goto unlock;
assign:
@@ -1275,8 +1302,8 @@ static int task_numa_migrate(struct task_struct *p)
groupimp = group_weight(p, env.dst_nid) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
- /* If the preferred nid has capacity, try to use it. */
- if (env.dst_stats.has_capacity)
+ /* If the preferred nid has free capacity, try to use it. */
+ if (env.dst_stats.has_free_capacity)
task_numa_find_cpu(&env, taskimp, groupimp);
/* No space available on the preferred nid. Look elsewhere. */
@@ -1301,7 +1328,16 @@ static int task_numa_migrate(struct task_struct *p)
if (env.best_cpu == -1)
return -EAGAIN;
- sched_setnuma(p, env.dst_nid);
+ /*
+ * If the task is part of a workload that spans multiple NUMA nodes,
+ * and is migrating into one of the workload's active nodes, remember
+ * this node as the task's preferred numa node, so the workload can
+ * settle down.
+ * A task that migrated to a second choice node will be better off
+ * trying for a better one later. Do not set the preferred node here.
+ */
+ if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
+ sched_setnuma(p, env.dst_nid);
/*
* Reset the scan period if the task is being rescheduled on an
@@ -1326,12 +1362,15 @@ static int task_numa_migrate(struct task_struct *p)
/* Attempt to migrate a task to a CPU on the preferred node. */
static void numa_migrate_preferred(struct task_struct *p)
{
+ unsigned long interval = HZ;
+
/* This task has no NUMA fault statistics yet */
if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
return;
/* Periodically retry migrating the task to the preferred node */
- p->numa_migrate_retry = jiffies + HZ;
+ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+ p->numa_migrate_retry = jiffies + interval;
/* Success if task is already running on preferred CPU */
if (task_node(p) == p->numa_preferred_nid)
@@ -1497,7 +1536,7 @@ static void task_numa_placement(struct task_struct *p)
/* If the task is part of a group prevent parallel updates to group stats */
if (p->numa_group) {
group_lock = &p->numa_group->lock;
- spin_lock(group_lock);
+ spin_lock_irq(group_lock);
}
/* Find the node with the highest number of faults */
@@ -1572,7 +1611,7 @@ static void task_numa_placement(struct task_struct *p)
}
}
- spin_unlock(group_lock);
+ spin_unlock_irq(group_lock);
}
/* Preferred node as the node with the most faults */
@@ -1677,7 +1716,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!join)
return;
- double_lock(&my_grp->lock, &grp->lock);
+ BUG_ON(irqs_disabled());
+ double_lock_irq(&my_grp->lock, &grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
my_grp->faults[i] -= p->numa_faults_memory[i];
@@ -1691,7 +1731,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
grp->nr_tasks++;
spin_unlock(&my_grp->lock);
- spin_unlock(&grp->lock);
+ spin_unlock_irq(&grp->lock);
rcu_assign_pointer(p->numa_group, grp);
@@ -1710,14 +1750,14 @@ void task_numa_free(struct task_struct *p)
void *numa_faults = p->numa_faults_memory;
if (grp) {
- spin_lock(&grp->lock);
+ spin_lock_irq(&grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] -= p->numa_faults_memory[i];
grp->total_faults -= p->total_numa_faults;
list_del(&p->numa_entry);
grp->nr_tasks--;
- spin_unlock(&grp->lock);
+ spin_unlock_irq(&grp->lock);
rcu_assign_pointer(p->numa_group, NULL);
put_numa_group(grp);
}
@@ -1737,6 +1777,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
struct task_struct *p = current;
bool migrated = flags & TNF_MIGRATED;
int cpu_node = task_node(current);
+ int local = !!(flags & TNF_FAULT_LOCAL);
int priv;
if (!numabalancing_enabled)
@@ -1785,6 +1826,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
task_numa_group(p, last_cpupid, flags, &priv);
}
+ /*
+ * If a workload spans multiple NUMA nodes, a shared fault that
+ * occurs wholly within the set of nodes that the workload is
+ * actively using should be counted as local. This allows the
+ * scan rate to slow down when a workload has settled down.
+ */
+ if (!priv && !local && p->numa_group &&
+ node_isset(cpu_node, p->numa_group->active_nodes) &&
+ node_isset(mem_node, p->numa_group->active_nodes))
+ local = 1;
+
task_numa_placement(p);
/*
@@ -1799,7 +1851,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
- p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
+ p->numa_faults_locality[local] += pages;
}
static void reset_ptenuma_scan(struct task_struct *p)
@@ -3173,10 +3225,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
* has not truly expired.
*
* Fortunately we can check determine whether this the case by checking
- * whether the global deadline has advanced.
+ * whether the global deadline has advanced. It is valid to compare
+ * cfs_b->runtime_expires without any locks since we only care about
+ * exact equality, so a partial write will still work.
*/
- if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
+ if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
/* extend local deadline, drift is bounded above by 2 ticks */
cfs_rq->runtime_expires += TICK_NSEC;
} else {
@@ -3300,7 +3354,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
}
if (!se)
- rq->nr_running -= task_delta;
+ sub_nr_running(rq, task_delta);
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
@@ -3351,7 +3405,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
}
if (!se)
- rq->nr_running += task_delta;
+ add_nr_running(rq, task_delta);
/* determine whether we need to wake up potentially idle cpu */
if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -3405,21 +3459,21 @@ next:
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
u64 runtime, runtime_expires;
- int idle = 1, throttled;
+ int throttled;
- raw_spin_lock(&cfs_b->lock);
/* no need to continue the timer with no bandwidth constraint */
if (cfs_b->quota == RUNTIME_INF)
- goto out_unlock;
+ goto out_deactivate;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
- /* idle depends on !throttled (for the case of a large deficit) */
- idle = cfs_b->idle && !throttled;
cfs_b->nr_periods += overrun;
- /* if we're going inactive then everything else can be deferred */
- if (idle)
- goto out_unlock;
+ /*
+ * idle depends on !throttled (for the case of a large deficit), and if
+ * we're going inactive then everything else can be deferred
+ */
+ if (cfs_b->idle && !throttled)
+ goto out_deactivate;
/*
* if we have relooped after returning idle once, we need to update our
@@ -3433,7 +3487,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
- goto out_unlock;
+ return 0;
}
/* account preceding periods in which throttling occurred */
@@ -3473,12 +3527,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
* timer to remain active while there are any throttled entities.)
*/
cfs_b->idle = 0;
-out_unlock:
- if (idle)
- cfs_b->timer_active = 0;
- raw_spin_unlock(&cfs_b->lock);
- return idle;
+ return 0;
+
+out_deactivate:
+ cfs_b->timer_active = 0;
+ return 1;
}
/* a cfs_rq won't donate quota below this amount */
@@ -3655,6 +3709,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
int overrun;
int idle = 0;
+ raw_spin_lock(&cfs_b->lock);
for (;;) {
now = hrtimer_cb_get_time(timer);
overrun = hrtimer_forward(timer, now, cfs_b->period);
@@ -3664,6 +3719,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
idle = do_sched_cfs_period_timer(cfs_b, overrun);
}
+ raw_spin_unlock(&cfs_b->lock);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -3723,8 +3779,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
struct cfs_rq *cfs_rq;
for_each_leaf_cfs_rq(rq, cfs_rq) {
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
if (!cfs_rq->runtime_enabled)
continue;
@@ -3732,7 +3786,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
* clock_task is not advancing so we just need to make sure
* there's some valid quota amount
*/
- cfs_rq->runtime_remaining = cfs_b->quota;
+ cfs_rq->runtime_remaining = 1;
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
}
@@ -3883,7 +3937,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) {
update_rq_runnable_avg(rq, rq->nr_running);
- inc_nr_running(rq);
+ add_nr_running(rq, 1);
}
hrtick_update(rq);
}
@@ -3943,7 +3997,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
if (!se) {
- dec_nr_running(rq);
+ sub_nr_running(rq, 1);
update_rq_runnable_avg(rq, 1);
}
hrtick_update(rq);
@@ -3989,9 +4043,9 @@ static unsigned long target_load(int cpu, int type)
return max(rq->cpu_load[type-1], total);
}
-static unsigned long power_of(int cpu)
+static unsigned long capacity_of(int cpu)
{
- return cpu_rq(cpu)->cpu_power;
+ return cpu_rq(cpu)->cpu_capacity;
}
static unsigned long cpu_avg_load_per_task(int cpu)
@@ -4013,8 +4067,8 @@ static void record_wakee(struct task_struct *p)
* about the boundary, really active task won't care
* about the loss.
*/
- if (jiffies > current->wakee_flip_decay_ts + HZ) {
- current->wakee_flips = 0;
+ if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
+ current->wakee_flips >>= 1;
current->wakee_flip_decay_ts = jiffies;
}
@@ -4234,12 +4288,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
s64 this_eff_load, prev_eff_load;
this_eff_load = 100;
- this_eff_load *= power_of(prev_cpu);
+ this_eff_load *= capacity_of(prev_cpu);
this_eff_load *= this_load +
effective_load(tg, this_cpu, weight, weight);
prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= power_of(this_cpu);
+ prev_eff_load *= capacity_of(this_cpu);
prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
balanced = this_eff_load <= prev_eff_load;
@@ -4315,8 +4369,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
avg_load += load;
}
- /* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
+ /* Adjust by relative CPU capacity of the group */
+ avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
if (local_group) {
this_load = avg_load;
@@ -4448,10 +4502,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
sd = tmp;
}
- if (affine_sd) {
- if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
- prev_cpu = cpu;
+ if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ prev_cpu = cpu;
+ if (sd_flag & SD_BALANCE_WAKE) {
new_cpu = select_idle_sibling(p, prev_cpu);
goto unlock;
}
@@ -4519,6 +4573,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
atomic_long_add(se->avg.load_avg_contrib,
&cfs_rq->removed_load);
}
+
+ /* We have migrated, no longer consider this task hot */
+ se->exec_start = 0;
}
#endif /* CONFIG_SMP */
@@ -4893,14 +4950,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
*
- * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
+ * C_i is the compute capacity of cpu i, typically it is the
* fraction of 'recent' time available for SCHED_OTHER task execution. But it
* can also include other factors [XXX].
*
* To achieve this balance we define a measure of imbalance which follows
* directly from (1):
*
- * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
+ * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
*
* We them move tasks around to minimize the imbalance. In the continuous
* function space it is obvious this converges, in the discrete case we get
@@ -5069,6 +5126,7 @@ task_hot(struct task_struct *p, u64 now)
/* Returns true if the destination node has incurred more faults */
static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
{
+ struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
@@ -5082,21 +5140,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
- /* Always encourage migration to the preferred node. */
- if (dst_nid == p->numa_preferred_nid)
- return true;
+ if (numa_group) {
+ /* Task is already in the group's interleave set. */
+ if (node_isset(src_nid, numa_group->active_nodes))
+ return false;
+
+ /* Task is moving into the group's interleave set. */
+ if (node_isset(dst_nid, numa_group->active_nodes))
+ return true;
- /* If both task and group weight improve, this move is a winner. */
- if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
- group_weight(p, dst_nid) > group_weight(p, src_nid))
+ return group_faults(p, dst_nid) > group_faults(p, src_nid);
+ }
+
+ /* Encourage migration to the preferred node. */
+ if (dst_nid == p->numa_preferred_nid)
return true;
- return false;
+ return task_faults(p, dst_nid) > task_faults(p, src_nid);
}
static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
+ struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5111,16 +5177,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
+ if (numa_group) {
+ /* Task is moving within/into the group's interleave set. */
+ if (node_isset(dst_nid, numa_group->active_nodes))
+ return false;
+
+ /* Task is moving out of the group's interleave set. */
+ if (node_isset(src_nid, numa_group->active_nodes))
+ return true;
+
+ return group_faults(p, dst_nid) < group_faults(p, src_nid);
+ }
+
/* Migrating away from the preferred node is always bad. */
if (src_nid == p->numa_preferred_nid)
return true;
- /* If either task or group weight get worse, don't do it. */
- if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
- group_weight(p, dst_nid) < group_weight(p, src_nid))
- return true;
-
- return false;
+ return task_faults(p, dst_nid) < task_faults(p, src_nid);
}
#else
@@ -5459,13 +5532,13 @@ struct sg_lb_stats {
unsigned long group_load; /* Total load over the CPUs of the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
- unsigned long group_power;
+ unsigned long group_capacity;
unsigned int sum_nr_running; /* Nr tasks running in the group */
- unsigned int group_capacity;
+ unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
int group_imb; /* Is there an imbalance in the group ? */
- int group_has_capacity; /* Is there extra capacity in the group? */
+ int group_has_free_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -5480,7 +5553,7 @@ struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
unsigned long total_load; /* Total load of all groups in sd */
- unsigned long total_pwr; /* Total power of all groups in sd */
+ unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -5499,7 +5572,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
.busiest = NULL,
.local = NULL,
.total_load = 0UL,
- .total_pwr = 0UL,
+ .total_capacity = 0UL,
.busiest_stat = {
.avg_load = 0UL,
},
@@ -5534,17 +5607,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
{
- return SCHED_POWER_SCALE;
+ return SCHED_CAPACITY_SCALE;
}
-unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
+unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
{
- return default_scale_freq_power(sd, cpu);
+ return default_scale_capacity(sd, cpu);
}
-static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
unsigned long smt_gain = sd->smt_gain;
@@ -5554,15 +5627,16 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
return smt_gain;
}
-unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)
{
- return default_scale_smt_power(sd, cpu);
+ return default_scale_smt_capacity(sd, cpu);
}
-static unsigned long scale_rt_power(int cpu)
+static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
u64 total, available, age_stamp, avg;
+ s64 delta;
/*
* Since we're reading these variables without serialization make sure
@@ -5571,74 +5645,78 @@ static unsigned long scale_rt_power(int cpu)
age_stamp = ACCESS_ONCE(rq->age_stamp);
avg = ACCESS_ONCE(rq->rt_avg);
- total = sched_avg_period() + (rq_clock(rq) - age_stamp);
+ delta = rq_clock(rq) - age_stamp;
+ if (unlikely(delta < 0))
+ delta = 0;
+
+ total = sched_avg_period() + delta;
if (unlikely(total < avg)) {
- /* Ensures that power won't end up being negative */
+ /* Ensures that capacity won't end up being negative */
available = 0;
} else {
available = total - avg;
}
- if (unlikely((s64)total < SCHED_POWER_SCALE))
- total = SCHED_POWER_SCALE;
+ if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
+ total = SCHED_CAPACITY_SCALE;
- total >>= SCHED_POWER_SHIFT;
+ total >>= SCHED_CAPACITY_SHIFT;
return div_u64(available, total);
}
-static void update_cpu_power(struct sched_domain *sd, int cpu)
+static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
- unsigned long power = SCHED_POWER_SCALE;
+ unsigned long capacity = SCHED_CAPACITY_SCALE;
struct sched_group *sdg = sd->groups;
- if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
- if (sched_feat(ARCH_POWER))
- power *= arch_scale_smt_power(sd, cpu);
+ if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) {
+ if (sched_feat(ARCH_CAPACITY))
+ capacity *= arch_scale_smt_capacity(sd, cpu);
else
- power *= default_scale_smt_power(sd, cpu);
+ capacity *= default_scale_smt_capacity(sd, cpu);
- power >>= SCHED_POWER_SHIFT;
+ capacity >>= SCHED_CAPACITY_SHIFT;
}
- sdg->sgp->power_orig = power;
+ sdg->sgc->capacity_orig = capacity;
- if (sched_feat(ARCH_POWER))
- power *= arch_scale_freq_power(sd, cpu);
+ if (sched_feat(ARCH_CAPACITY))
+ capacity *= arch_scale_freq_capacity(sd, cpu);
else
- power *= default_scale_freq_power(sd, cpu);
+ capacity *= default_scale_capacity(sd, cpu);
- power >>= SCHED_POWER_SHIFT;
+ capacity >>= SCHED_CAPACITY_SHIFT;
- power *= scale_rt_power(cpu);
- power >>= SCHED_POWER_SHIFT;
+ capacity *= scale_rt_capacity(cpu);
+ capacity >>= SCHED_CAPACITY_SHIFT;
- if (!power)
- power = 1;
+ if (!capacity)
+ capacity = 1;
- cpu_rq(cpu)->cpu_power = power;
- sdg->sgp->power = power;
+ cpu_rq(cpu)->cpu_capacity = capacity;
+ sdg->sgc->capacity = capacity;
}
-void update_group_power(struct sched_domain *sd, int cpu)
+void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long power, power_orig;
+ unsigned long capacity, capacity_orig;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
interval = clamp(interval, 1UL, max_load_balance_interval);
- sdg->sgp->next_update = jiffies + interval;
+ sdg->sgc->next_update = jiffies + interval;
if (!child) {
- update_cpu_power(sd, cpu);
+ update_cpu_capacity(sd, cpu);
return;
}
- power_orig = power = 0;
+ capacity_orig = capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -5647,31 +5725,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
*/
for_each_cpu(cpu, sched_group_cpus(sdg)) {
- struct sched_group_power *sgp;
+ struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
/*
- * build_sched_domains() -> init_sched_groups_power()
+ * build_sched_domains() -> init_sched_groups_capacity()
* gets here before we've attached the domains to the
* runqueues.
*
- * Use power_of(), which is set irrespective of domains
- * in update_cpu_power().
+ * Use capacity_of(), which is set irrespective of domains
+ * in update_cpu_capacity().
*
- * This avoids power/power_orig from being 0 and
+ * This avoids capacity/capacity_orig from being 0 and
* causing divide-by-zero issues on boot.
*
- * Runtime updates will correct power_orig.
+ * Runtime updates will correct capacity_orig.
*/
if (unlikely(!rq->sd)) {
- power_orig += power_of(cpu);
- power += power_of(cpu);
+ capacity_orig += capacity_of(cpu);
+ capacity += capacity_of(cpu);
continue;
}
- sgp = rq->sd->groups->sgp;
- power_orig += sgp->power_orig;
- power += sgp->power;
+ sgc = rq->sd->groups->sgc;
+ capacity_orig += sgc->capacity_orig;
+ capacity += sgc->capacity;
}
} else {
/*
@@ -5681,14 +5759,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- power_orig += group->sgp->power_orig;
- power += group->sgp->power;
+ capacity_orig += group->sgc->capacity_orig;
+ capacity += group->sgc->capacity;
group = group->next;
} while (group != child->groups);
}
- sdg->sgp->power_orig = power_orig;
- sdg->sgp->power = power;
+ sdg->sgc->capacity_orig = capacity_orig;
+ sdg->sgc->capacity = capacity;
}
/*
@@ -5702,15 +5780,15 @@ static inline int
fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
{
/*
- * Only siblings can have significantly less than SCHED_POWER_SCALE
+ * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
*/
- if (!(sd->flags & SD_SHARE_CPUPOWER))
+ if (!(sd->flags & SD_SHARE_CPUCAPACITY))
return 0;
/*
- * If ~90% of the cpu_power is still there, we're good.
+ * If ~90% of the cpu_capacity is still there, we're good.
*/
- if (group->sgp->power * 32 > group->sgp->power_orig * 29)
+ if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
return 1;
return 0;
@@ -5747,34 +5825,35 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
static inline int sg_imbalanced(struct sched_group *group)
{
- return group->sgp->imbalance;
+ return group->sgc->imbalance;
}
/*
- * Compute the group capacity.
+ * Compute the group capacity factor.
*
- * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
+ * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
* first dividing out the smt factor and computing the actual number of cores
- * and limit power unit capacity with that.
+ * and limit unit capacity with that.
*/
-static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
+static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
{
- unsigned int capacity, smt, cpus;
- unsigned int power, power_orig;
+ unsigned int capacity_factor, smt, cpus;
+ unsigned int capacity, capacity_orig;
- power = group->sgp->power;
- power_orig = group->sgp->power_orig;
+ capacity = group->sgc->capacity;
+ capacity_orig = group->sgc->capacity_orig;
cpus = group->group_weight;
- /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
- smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
- capacity = cpus / smt; /* cores */
+ /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
+ smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
+ capacity_factor = cpus / smt; /* cores */
- capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
- if (!capacity)
- capacity = fix_small_capacity(env->sd, group);
+ capacity_factor = min_t(unsigned,
+ capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
+ if (!capacity_factor)
+ capacity_factor = fix_small_capacity(env->sd, group);
- return capacity;
+ return capacity_factor;
}
/**
@@ -5814,9 +5893,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->idle_cpus++;
}
- /* Adjust by relative CPU power of the group */
- sgs->group_power = group->sgp->power;
- sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
+ /* Adjust by relative CPU capacity of the group */
+ sgs->group_capacity = group->sgc->capacity;
+ sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
if (sgs->sum_nr_running)
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
@@ -5824,10 +5903,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_weight = group->group_weight;
sgs->group_imb = sg_imbalanced(group);
- sgs->group_capacity = sg_capacity(env, group);
+ sgs->group_capacity_factor = sg_capacity_factor(env, group);
- if (sgs->group_capacity > sgs->sum_nr_running)
- sgs->group_has_capacity = 1;
+ if (sgs->group_capacity_factor > sgs->sum_nr_running)
+ sgs->group_has_free_capacity = 1;
}
/**
@@ -5851,7 +5930,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->avg_load <= sds->busiest_stat.avg_load)
return false;
- if (sgs->sum_nr_running > sgs->group_capacity)
+ if (sgs->sum_nr_running > sgs->group_capacity_factor)
return true;
if (sgs->group_imb)
@@ -5931,8 +6010,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sgs = &sds->local_stat;
if (env->idle != CPU_NEWLY_IDLE ||
- time_after_eq(jiffies, sg->sgp->next_update))
- update_group_power(env->sd, env->dst_cpu);
+ time_after_eq(jiffies, sg->sgc->next_update))
+ update_group_capacity(env->sd, env->dst_cpu);
}
update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
@@ -5942,17 +6021,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity to one so that we'll try
+ * first, lower the sg capacity factor to one so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
- * these excess tasks, i.e. nr_running < group_capacity. The
+ * these excess tasks, i.e. nr_running < group_capacity_factor. The
* extra check prevents the case where you always pull from the
* heaviest group when it is already under-utilized (possible
* with a large weight task outweighs the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_capacity)
- sgs->group_capacity = min(sgs->group_capacity, 1U);
+ sds->local_stat.group_has_free_capacity)
+ sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -5962,7 +6041,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
next_group:
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
- sds->total_pwr += sgs->group_power;
+ sds->total_capacity += sgs->group_capacity;
sg = sg->next;
} while (sg != env->sd->groups);
@@ -6009,8 +6088,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
return 0;
env->imbalance = DIV_ROUND_CLOSEST(
- sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
- SCHED_POWER_SCALE);
+ sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
+ SCHED_CAPACITY_SCALE);
return 1;
}
@@ -6025,7 +6104,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
static inline
void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
- unsigned long tmp, pwr_now = 0, pwr_move = 0;
+ unsigned long tmp, capa_now = 0, capa_move = 0;
unsigned int imbn = 2;
unsigned long scaled_busy_load_per_task;
struct sg_lb_stats *local, *busiest;
@@ -6039,8 +6118,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
imbn = 1;
scaled_busy_load_per_task =
- (busiest->load_per_task * SCHED_POWER_SCALE) /
- busiest->group_power;
+ (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
+ busiest->group_capacity;
if (busiest->avg_load + scaled_busy_load_per_task >=
local->avg_load + (scaled_busy_load_per_task * imbn)) {
@@ -6050,38 +6129,38 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
/*
* OK, we don't have enough imbalance to justify moving tasks,
- * however we may be able to increase total CPU power used by
+ * however we may be able to increase total CPU capacity used by
* moving them.
*/
- pwr_now += busiest->group_power *
+ capa_now += busiest->group_capacity *
min(busiest->load_per_task, busiest->avg_load);
- pwr_now += local->group_power *
+ capa_now += local->group_capacity *
min(local->load_per_task, local->avg_load);
- pwr_now /= SCHED_POWER_SCALE;
+ capa_now /= SCHED_CAPACITY_SCALE;
/* Amount of load we'd subtract */
if (busiest->avg_load > scaled_busy_load_per_task) {
- pwr_move += busiest->group_power *
+ capa_move += busiest->group_capacity *
min(busiest->load_per_task,
busiest->avg_load - scaled_busy_load_per_task);
}
/* Amount of load we'd add */
- if (busiest->avg_load * busiest->group_power <
- busiest->load_per_task * SCHED_POWER_SCALE) {
- tmp = (busiest->avg_load * busiest->group_power) /
- local->group_power;
+ if (busiest->avg_load * busiest->group_capacity <
+ busiest->load_per_task * SCHED_CAPACITY_SCALE) {
+ tmp = (busiest->avg_load * busiest->group_capacity) /
+ local->group_capacity;
} else {
- tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
- local->group_power;
+ tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
+ local->group_capacity;
}
- pwr_move += local->group_power *
+ capa_move += local->group_capacity *
min(local->load_per_task, local->avg_load + tmp);
- pwr_move /= SCHED_POWER_SCALE;
+ capa_move /= SCHED_CAPACITY_SCALE;
/* Move if we gain throughput */
- if (pwr_move > pwr_now)
+ if (capa_move > capa_now)
env->imbalance = busiest->load_per_task;
}
@@ -6111,7 +6190,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/*
* In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below
- * its cpu_power, while calculating max_load..)
+ * its cpu_capacity, while calculating max_load..)
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
@@ -6126,10 +6205,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* have to drop below capacity to reach cpu-load equilibrium.
*/
load_above_capacity =
- (busiest->sum_nr_running - busiest->group_capacity);
+ (busiest->sum_nr_running - busiest->group_capacity_factor);
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
- load_above_capacity /= busiest->group_power;
+ load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
+ load_above_capacity /= busiest->group_capacity;
}
/*
@@ -6144,9 +6223,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/* How much load to actually move to equalise the imbalance */
env->imbalance = min(
- max_pull * busiest->group_power,
- (sds->avg_load - local->avg_load) * local->group_power
- ) / SCHED_POWER_SCALE;
+ max_pull * busiest->group_capacity,
+ (sds->avg_load - local->avg_load) * local->group_capacity
+ ) / SCHED_CAPACITY_SCALE;
/*
* if *imbalance is less than the average load per runnable task
@@ -6200,7 +6279,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
- sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
+ sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
+ / sds.total_capacity;
/*
* If the busiest group is imbalanced the below checks don't
@@ -6211,8 +6291,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
- !busiest->group_has_capacity)
+ if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
+ !busiest->group_has_free_capacity)
goto force_balance;
/*
@@ -6266,11 +6346,11 @@ static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
- unsigned long busiest_load = 0, busiest_power = 1;
+ unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- unsigned long power, capacity, wl;
+ unsigned long capacity, capacity_factor, wl;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -6298,34 +6378,34 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
- power = power_of(i);
- capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
- if (!capacity)
- capacity = fix_small_capacity(env->sd, group);
+ capacity = capacity_of(i);
+ capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
+ if (!capacity_factor)
+ capacity_factor = fix_small_capacity(env->sd, group);
wl = weighted_cpuload(i);
/*
* When comparing with imbalance, use weighted_cpuload()
- * which is not scaled with the cpu power.
+ * which is not scaled with the cpu capacity.
*/
- if (capacity && rq->nr_running == 1 && wl > env->imbalance)
+ if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
continue;
/*
* For the load comparisons with the other cpu's, consider
- * the weighted_cpuload() scaled with the cpu power, so that
- * the load can be moved away from the cpu that is potentially
- * running at a lower capacity.
+ * the weighted_cpuload() scaled with the cpu capacity, so
+ * that the load can be moved away from the cpu that is
+ * potentially running at a lower capacity.
*
- * Thus we're looking for max(wl_i / power_i), crosswise
+ * Thus we're looking for max(wl_i / capacity_i), crosswise
* multiplication to rid ourselves of the division works out
- * to: wl_i * power_j > wl_j * power_i; where j is our
- * previous maximum.
+ * to: wl_i * capacity_j > wl_j * capacity_i; where j is
+ * our previous maximum.
*/
- if (wl * busiest_power > busiest_load * power) {
+ if (wl * busiest_capacity > busiest_load * capacity) {
busiest_load = wl;
- busiest_power = power;
+ busiest_capacity = capacity;
busiest = rq;
}
}
@@ -6533,7 +6613,7 @@ more_balance:
* We failed to reach balance because of affinity.
*/
if (sd_parent) {
- int *group_imbalance = &sd_parent->groups->sgp->imbalance;
+ int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
*group_imbalance = 1;
@@ -6639,27 +6719,62 @@ out:
return ld_moved;
}
+static inline unsigned long
+get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
+{
+ unsigned long interval = sd->balance_interval;
+
+ if (cpu_busy)
+ interval *= sd->busy_factor;
+
+ /* scale ms to jiffies */
+ interval = msecs_to_jiffies(interval);
+ interval = clamp(interval, 1UL, max_load_balance_interval);
+
+ return interval;
+}
+
+static inline void
+update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+{
+ unsigned long interval, next;
+
+ interval = get_sd_balance_interval(sd, cpu_busy);
+ next = sd->last_balance + interval;
+
+ if (time_after(*next_balance, next))
+ *next_balance = next;
+}
+
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
static int idle_balance(struct rq *this_rq)
{
+ unsigned long next_balance = jiffies + HZ;
+ int this_cpu = this_rq->cpu;
struct sched_domain *sd;
int pulled_task = 0;
- unsigned long next_balance = jiffies + HZ;
u64 curr_cost = 0;
- int this_cpu = this_rq->cpu;
idle_enter_fair(this_rq);
+
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
*/
this_rq->idle_stamp = rq_clock(this_rq);
- if (this_rq->avg_idle < sysctl_sched_migration_cost)
+ if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (sd)
+ update_next_balance(sd, 0, &next_balance);
+ rcu_read_unlock();
+
goto out;
+ }
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6669,20 +6784,20 @@ static int idle_balance(struct rq *this_rq)
update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
- unsigned long interval;
int continue_balancing = 1;
u64 t0, domain_cost;
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
- if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
+ if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
+ update_next_balance(sd, 0, &next_balance);
break;
+ }
if (sd->flags & SD_BALANCE_NEWIDLE) {
t0 = sched_clock_cpu(this_cpu);
- /* If we've pulled tasks over stop searching: */
pulled_task = load_balance(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
&continue_balancing);
@@ -6694,41 +6809,37 @@ static int idle_balance(struct rq *this_rq)
curr_cost += domain_cost;
}
- interval = msecs_to_jiffies(sd->balance_interval);
- if (time_after(next_balance, sd->last_balance + interval))
- next_balance = sd->last_balance + interval;
- if (pulled_task)
+ update_next_balance(sd, 0, &next_balance);
+
+ /*
+ * Stop searching for tasks to pull if there are
+ * now runnable tasks on this rq.
+ */
+ if (pulled_task || this_rq->nr_running > 0)
break;
}
rcu_read_unlock();
raw_spin_lock(&this_rq->lock);
+ if (curr_cost > this_rq->max_idle_balance_cost)
+ this_rq->max_idle_balance_cost = curr_cost;
+
/*
- * While browsing the domains, we released the rq lock.
- * A task could have be enqueued in the meantime
+ * While browsing the domains, we released the rq lock, a task could
+ * have been enqueued in the meantime. Since we're not going idle,
+ * pretend we pulled a task.
*/
- if (this_rq->cfs.h_nr_running && !pulled_task) {
+ if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
- goto out;
- }
- if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
- /*
- * We are going idle. next_balance may be set based on
- * a busy processor. So reset next_balance.
- */
+out:
+ /* Move the next balance forward */
+ if (time_after(this_rq->next_balance, next_balance))
this_rq->next_balance = next_balance;
- }
- if (curr_cost > this_rq->max_idle_balance_cost)
- this_rq->max_idle_balance_cost = curr_cost;
-
-out:
/* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
- (this_rq->dl.dl_nr_running ||
- (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
+ if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
if (pulled_task) {
@@ -6889,7 +7000,7 @@ static inline void set_cpu_sd_state_busy(void)
goto unlock;
sd->nohz_idle = 0;
- atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+ atomic_inc(&sd->groups->sgc->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -6906,7 +7017,7 @@ void set_cpu_sd_state_idle(void)
goto unlock;
sd->nohz_idle = 1;
- atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+ atomic_dec(&sd->groups->sgc->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -7009,16 +7120,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
break;
}
- interval = sd->balance_interval;
- if (idle != CPU_IDLE)
- interval *= sd->busy_factor;
-
- /* scale ms to jiffies */
- interval = msecs_to_jiffies(interval);
- interval = clamp(interval, 1UL, max_load_balance_interval);
+ interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
need_serialize = sd->flags & SD_SERIALIZE;
-
if (need_serialize) {
if (!spin_trylock(&balancing))
goto out;
@@ -7034,6 +7138,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
}
sd->last_balance = jiffies;
+ interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
}
if (need_serialize)
spin_unlock(&balancing);
@@ -7091,12 +7196,17 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
rq = cpu_rq(balance_cpu);
- raw_spin_lock_irq(&rq->lock);
- update_rq_clock(rq);
- update_idle_cpu_load(rq);
- raw_spin_unlock_irq(&rq->lock);
-
- rebalance_domains(rq, CPU_IDLE);
+ /*
+ * If time for next balance is due,
+ * do the balance.
+ */
+ if (time_after_eq(jiffies, rq->next_balance)) {
+ raw_spin_lock_irq(&rq->lock);
+ update_rq_clock(rq);
+ update_idle_cpu_load(rq);
+ raw_spin_unlock_irq(&rq->lock);
+ rebalance_domains(rq, CPU_IDLE);
+ }
if (time_after(this_rq->next_balance, rq->next_balance))
this_rq->next_balance = rq->next_balance;
@@ -7111,7 +7221,7 @@ end:
* of an idle cpu is the system.
* - This rq has more than one task.
* - At any scheduler domain level, this cpu's scheduler group has multiple
- * busy cpu's exceeding the group's power.
+ * busy cpu's exceeding the group's capacity.
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
@@ -7119,7 +7229,7 @@ static inline int nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain *sd;
- struct sched_group_power *sgp;
+ struct sched_group_capacity *sgc;
int nr_busy, cpu = rq->cpu;
if (unlikely(rq->idle_balance))
@@ -7149,8 +7259,8 @@ static inline int nohz_kick_needed(struct rq *rq)
sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (sd) {
- sgp = sd->groups->sgp;
- nr_busy = atomic_read(&sgp->nr_busy_cpus);
+ sgc = sd->groups->sgc;
+ nr_busy = atomic_read(&sgc->nr_busy_cpus);
if (nr_busy > 1)
goto need_kick_unlock;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 5716929..90284d1 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
SCHED_FEAT(WAKEUP_PREEMPTION, true)
/*
- * Use arch dependent cpu power functions
+ * Use arch dependent cpu capacity functions
*/
-SCHED_FEAT(ARCH_POWER, true)
+SCHED_FEAT(ARCH_CAPACITY, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
/*
- * Decrement CPU power based on time not spent running tasks
+ * Decrement CPU capacity based on time not spent running tasks
*/
-SCHED_FEAT(NONTASK_POWER, true)
+SCHED_FEAT(NONTASK_CAPACITY, true)
/*
* Queue remote wakeups on the target CPU and process them
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f4390a..cf009fb 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -12,6 +12,8 @@
#include <trace/events/power.h>
+#include "sched.h"
+
static int __read_mostly cpu_idle_force_poll;
void cpu_idle_poll_ctrl(bool enable)
@@ -67,24 +69,25 @@ void __weak arch_cpu_idle(void)
* cpuidle_idle_call - the main idle function
*
* NOTE: no locks or semaphores should be used here
- * return non-zero on failure
+ *
+ * On archs that support TIF_POLLING_NRFLAG, is called with polling
+ * set, and it returns with polling set. If it ever stops polling, it
+ * must clear the polling bit.
*/
-static int cpuidle_idle_call(void)
+static void cpuidle_idle_call(void)
{
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
- int next_state, entered_state, ret;
+ int next_state, entered_state;
bool broadcast;
/*
* Check if the idle task must be rescheduled. If it is the
- * case, exit the function after re-enabling the local irq and
- * set again the polling flag
+ * case, exit the function after re-enabling the local irq.
*/
- if (current_clr_polling_and_test()) {
+ if (need_resched()) {
local_irq_enable();
- __current_set_polling();
- return 0;
+ return;
}
/*
@@ -101,104 +104,99 @@ static int cpuidle_idle_call(void)
rcu_idle_enter();
/*
- * Check if the cpuidle framework is ready, otherwise fallback
- * to the default arch specific idle method
+ * Ask the cpuidle framework to choose a convenient idle state.
+ * Fall back to the default arch idle method on errors.
*/
- ret = cpuidle_enabled(drv, dev);
-
- if (!ret) {
+ next_state = cpuidle_select(drv, dev);
+ if (next_state < 0) {
+use_default:
/*
- * Ask the governor to choose an idle state it thinks
- * it is convenient to go to. There is *always* a
- * convenient idle state
+ * We can't use the cpuidle framework, let's use the default
+ * idle routine.
*/
- next_state = cpuidle_select(drv, dev);
-
- /*
- * The idle task must be scheduled, it is pointless to
- * go to idle, just update no idle residency and get
- * out of this function
- */
- if (current_clr_polling_and_test()) {
- dev->last_residency = 0;
- entered_state = next_state;
+ if (current_clr_polling_and_test())
local_irq_enable();
- } else {
- broadcast = !!(drv->states[next_state].flags &
- CPUIDLE_FLAG_TIMER_STOP);
-
- if (broadcast)
- /*
- * Tell the time framework to switch
- * to a broadcast timer because our
- * local timer will be shutdown. If a
- * local timer is used from another
- * cpu as a broadcast timer, this call
- * may fail if it is not available
- */
- ret = clockevents_notify(
- CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
- &dev->cpu);
-
- if (!ret) {
- trace_cpu_idle_rcuidle(next_state, dev->cpu);
-
- /*
- * Enter the idle state previously
- * returned by the governor
- * decision. This function will block
- * until an interrupt occurs and will
- * take care of re-enabling the local
- * interrupts
- */
- entered_state = cpuidle_enter(drv, dev,
- next_state);
-
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
- dev->cpu);
-
- if (broadcast)
- clockevents_notify(
- CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
- &dev->cpu);
-
- /*
- * Give the governor an opportunity to reflect on the
- * outcome
- */
- cpuidle_reflect(dev, entered_state);
- }
- }
+ else
+ arch_cpu_idle();
+
+ goto exit_idle;
}
+
+ /*
+ * The idle task must be scheduled, it is pointless to
+ * go to idle, just update no idle residency and get
+ * out of this function
+ */
+ if (current_clr_polling_and_test()) {
+ dev->last_residency = 0;
+ entered_state = next_state;
+ local_irq_enable();
+ goto exit_idle;
+ }
+
+ broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
+
/*
- * We can't use the cpuidle framework, let's use the default
- * idle routine
+ * Tell the time framework to switch to a broadcast timer
+ * because our local timer will be shutdown. If a local timer
+ * is used from another cpu as a broadcast timer, this call may
+ * fail if it is not available
*/
- if (ret)
- arch_cpu_idle();
+ if (broadcast &&
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
+ goto use_default;
+ trace_cpu_idle_rcuidle(next_state, dev->cpu);
+
+ /*
+ * Enter the idle state previously returned by the governor decision.
+ * This function will block until an interrupt occurs and will take
+ * care of re-enabling the local interrupts
+ */
+ entered_state = cpuidle_enter(drv, dev, next_state);
+
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
+
+ if (broadcast)
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+
+ /*
+ * Give the governor an opportunity to reflect on the outcome
+ */
+ cpuidle_reflect(dev, entered_state);
+
+exit_idle:
__current_set_polling();
/*
- * It is up to the idle functions to enable back the local
- * interrupt
+ * It is up to the idle functions to reenable local interrupts
*/
if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
rcu_idle_exit();
start_critical_timings();
-
- return 0;
}
/*
* Generic idle loop implementation
+ *
+ * Called with polling cleared.
*/
static void cpu_idle_loop(void)
{
while (1) {
+ /*
+ * If the arch has a polling bit, we maintain an invariant:
+ *
+ * Our polling bit is clear if we're not scheduled (i.e. if
+ * rq->curr != rq->idle). This means that, if rq->idle has
+ * the polling bit set, then setting need_resched is
+ * guaranteed to cause the cpu to reschedule.
+ */
+
+ __current_set_polling();
tick_nohz_idle_enter();
while (!need_resched()) {
@@ -238,6 +236,17 @@ static void cpu_idle_loop(void)
*/
preempt_set_need_resched();
tick_nohz_idle_exit();
+ __current_clr_polling();
+
+ /*
+ * We promise to call sched_ttwu_pending and reschedule
+ * if need_resched is set while polling is set. That
+ * means that clearing polling needs to be visible
+ * before doing these things.
+ */
+ smp_mb__after_atomic();
+
+ sched_ttwu_pending();
schedule_preempt_disabled();
}
}
@@ -259,7 +268,6 @@ void cpu_startup_entry(enum cpuhp_state state)
*/
boot_init_stack_canary();
#endif
- __current_set_polling();
arch_cpu_idle_prepare();
cpu_idle_loop();
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d8cdf16..43406db 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#endif
+ /* We start is dequeued state, because no RT tasks are queued */
+ rt_rq->rt_queued = 0;
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
@@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
return rt_se->rt_rq;
}
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_se->rt_rq;
+
+ return rt_rq->rq;
+}
+
void free_rt_sched_group(struct task_group *tg)
{
int i;
@@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
return container_of(rt_rq, struct rq, rt);
}
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
struct task_struct *p = rt_task_of(rt_se);
- struct rq *rq = task_rq(p);
+
+ return task_rq(p);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ struct rq *rq = rq_of_rt_se(rt_se);
return &rq->rt;
}
@@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq)
}
#endif /* CONFIG_SMP */
+static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
return !list_empty(&rt_se->run_list);
@@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
if (rt_rq->rt_nr_running) {
- if (rt_se && !on_rt_rq(rt_se))
+ if (!rt_se)
+ enqueue_top_rt_rq(rt_rq);
+ else if (!on_rt_rq(rt_se))
enqueue_rt_entity(rt_se, false);
+
if (rt_rq->highest_prio.curr < curr->prio)
resched_task(curr);
}
@@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
- if (rt_se && on_rt_rq(rt_se))
+ if (!rt_se)
+ dequeue_top_rt_rq(rt_rq);
+ else if (on_rt_rq(rt_se))
dequeue_rt_entity(rt_se);
}
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
- if (rt_rq->rt_nr_running)
- resched_task(rq_of_rt_rq(rt_rq)->curr);
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (!rt_rq->rt_nr_running)
+ return;
+
+ enqueue_top_rt_rq(rt_rq);
+ resched_task(rq->curr);
}
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
+ dequeue_top_rt_rq(rt_rq);
+}
+
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled;
}
static inline const struct cpumask *sched_rt_period_mask(void)
@@ -885,7 +924,6 @@ static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
- struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
u64 delta_exec;
if (curr->sched_class != &rt_sched_class)
@@ -910,7 +948,7 @@ static void update_curr_rt(struct rq *rq)
return;
for_each_sched_rt_entity(rt_se) {
- rt_rq = rt_rq_of_se(rt_se);
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
@@ -922,6 +960,38 @@ static void update_curr_rt(struct rq *rq)
}
}
+static void
+dequeue_top_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ BUG_ON(&rq->rt != rt_rq);
+
+ if (!rt_rq->rt_queued)
+ return;
+
+ BUG_ON(!rq->nr_running);
+
+ sub_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 0;
+}
+
+static void
+enqueue_top_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ BUG_ON(&rq->rt != rt_rq);
+
+ if (rt_rq->rt_queued)
+ return;
+ if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+ return;
+
+ add_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 1;
+}
+
#if defined CONFIG_SMP
static void
@@ -1045,12 +1115,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
#endif /* CONFIG_RT_GROUP_SCHED */
static inline
+unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *group_rq = group_rt_rq(rt_se);
+
+ if (group_rq)
+ return group_rq->rt_nr_running;
+ else
+ return 1;
+}
+
+static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
int prio = rt_se_prio(rt_se);
WARN_ON(!rt_prio(prio));
- rt_rq->rt_nr_running++;
+ rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
inc_rt_prio(rt_rq, prio);
inc_rt_migration(rt_se, rt_rq);
@@ -1062,7 +1143,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running);
- rt_rq->rt_nr_running--;
+ rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
dec_rt_migration(rt_se, rt_rq);
@@ -1119,6 +1200,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
back = rt_se;
}
+ dequeue_top_rt_rq(rt_rq_of_se(back));
+
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se);
@@ -1127,13 +1210,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se)
__enqueue_rt_entity(rt_se, head);
+ enqueue_top_rt_rq(&rq->rt);
}
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se) {
@@ -1142,6 +1230,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
if (rt_rq && rt_rq->rt_nr_running)
__enqueue_rt_entity(rt_se, false);
}
+ enqueue_top_rt_rq(&rq->rt);
}
/*
@@ -1159,8 +1248,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
-
- inc_nr_running(rq);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1171,8 +1258,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
dequeue_rt_entity(rt_se);
dequeue_pushable_task(rq, p);
-
- dec_nr_running(rq);
}
/*
@@ -1362,10 +1447,11 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
pull_rt_task(rq);
/*
* pull_rt_task() can drop (and re-acquire) rq->lock; this
- * means a dl task can slip in, in which case we need to
- * re-start task selection.
+ * means a dl or stop task can slip in, in which case we need
+ * to re-start task selection.
*/
- if (unlikely(rq->dl.dl_nr_running))
+ if (unlikely((rq->stop && rq->stop->on_rq) ||
+ rq->dl.dl_nr_running))
return RETRY_TASK;
}
@@ -1376,10 +1462,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
if (prev->sched_class == &rt_sched_class)
update_curr_rt(rq);
- if (!rt_rq->rt_nr_running)
- return NULL;
-
- if (rt_rq_throttled(rt_rq))
+ if (!rt_rq->rt_queued)
return NULL;
put_prev_task(rq, prev);
@@ -1891,9 +1974,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (p->on_rq && rq->curr != p) {
#ifdef CONFIG_SMP
- if (rq->rt.overloaded && push_rt_task(rq) &&
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
/* Don't resched if we changed runqueues */
- rq != task_rq(p))
+ push_rt_task(rq) && rq != task_rq(p))
check_resched = 0;
#endif /* CONFIG_SMP */
if (check_resched && p->prio < rq->curr->prio)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c9007f2..2f86361 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -409,6 +409,8 @@ struct rt_rq {
int overloaded;
struct plist_head pushable_tasks;
#endif
+ int rt_queued;
+
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
@@ -423,18 +425,6 @@ struct rt_rq {
#endif
};
-#ifdef CONFIG_RT_GROUP_SCHED
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
-}
-#else
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_throttled;
-}
-#endif
-
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
@@ -577,7 +567,7 @@ struct rq {
struct root_domain *rd;
struct sched_domain *sd;
- unsigned long cpu_power;
+ unsigned long cpu_capacity;
unsigned char idle_balance;
/* For active balancing */
@@ -680,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
#ifdef CONFIG_SMP
+extern void sched_ttwu_pending(void);
+
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))
@@ -738,15 +730,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_busy);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
-struct sched_group_power {
+struct sched_group_capacity {
atomic_t ref;
/*
- * CPU power of this group, SCHED_LOAD_SCALE being max power for a
- * single CPU.
+ * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
+ * for a single CPU.
*/
- unsigned int power, power_orig;
+ unsigned int capacity, capacity_orig;
unsigned long next_update;
- int imbalance; /* XXX unrelated to power but shared group state */
+ int imbalance; /* XXX unrelated to capacity but shared group state */
/*
* Number of busy cpus in this group.
*/
@@ -760,7 +752,7 @@ struct sched_group {
atomic_t ref;
unsigned int group_weight;
- struct sched_group_power *sgp;
+ struct sched_group_capacity *sgc;
/*
* The CPUs this group covers.
@@ -783,7 +775,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
*/
static inline struct cpumask *sched_group_mask(struct sched_group *sg)
{
- return to_cpumask(sg->sgp->cpumask);
+ return to_cpumask(sg->sgc->cpumask);
}
/**
@@ -797,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
extern int group_balance_cpu(struct sched_group *sg);
+#else
+
+static inline void sched_ttwu_pending(void) { }
+
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -1177,7 +1173,7 @@ extern const struct sched_class idle_sched_class;
#ifdef CONFIG_SMP
-extern void update_group_power(struct sched_domain *sd, int cpu);
+extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
@@ -1216,12 +1212,14 @@ extern void update_idle_cpu_load(struct rq *this_rq);
extern void init_task_runnable_average(struct task_struct *p);
-static inline void inc_nr_running(struct rq *rq)
+static inline void add_nr_running(struct rq *rq, unsigned count)
{
- rq->nr_running++;
+ unsigned prev_nr = rq->nr_running;
+
+ rq->nr_running = prev_nr + count;
#ifdef CONFIG_NO_HZ_FULL
- if (rq->nr_running == 2) {
+ if (prev_nr < 2 && rq->nr_running >= 2) {
if (tick_nohz_full_cpu(rq->cpu)) {
/* Order rq->nr_running write against the IPI */
smp_wmb();
@@ -1231,9 +1229,9 @@ static inline void inc_nr_running(struct rq *rq)
#endif
}
-static inline void dec_nr_running(struct rq *rq)
+static inline void sub_nr_running(struct rq *rq, unsigned count)
{
- rq->nr_running--;
+ rq->nr_running -= count;
}
static inline void rq_last_tick_reset(struct rq *rq)
@@ -1385,6 +1383,15 @@ static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}
+static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ spin_lock_irq(l1);
+ spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
{
if (l1 > l2)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index d6ce65d..bfe0eda 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
- inc_nr_running(rq);
+ add_nr_running(rq, 1);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
- dec_nr_running(rq);
+ sub_nr_running(rq, 1);
}
static void yield_task_stop(struct rq *rq)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 590c379..b35c215 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -255,6 +255,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
goto free_prog;
/* Allocate a new seccomp_filter */
+ ret = -ENOMEM;
filter = kzalloc(sizeof(struct seccomp_filter) +
sizeof(struct sock_filter_int) * new_len,
GFP_KERNEL|__GFP_NOWARN);
@@ -264,6 +265,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len);
if (ret)
goto free_filter;
+ kfree(fp);
atomic_set(&filter->usage, 1);
filter->len = new_len;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b50990a..92f24f5 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -223,7 +223,7 @@ static inline bool lockdep_softirq_start(void) { return false; }
static inline void lockdep_softirq_end(bool in_hardirq) { }
#endif
-asmlinkage void __do_softirq(void)
+asmlinkage __visible void __do_softirq(void)
{
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
unsigned long old_flags = current->flags;
@@ -299,7 +299,7 @@ restart:
tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
-asmlinkage void do_softirq(void)
+asmlinkage __visible void do_softirq(void)
{
__u32 pending;
unsigned long flags;
@@ -779,3 +779,8 @@ int __init __weak arch_early_irq_init(void)
{
return 0;
}
+
+unsigned int __weak arch_dynirq_lower_bound(unsigned int from)
+{
+ return from;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index fba0f29..66a751e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
else
p = current;
if (p) {
- niceval = 20 - task_nice(p);
+ niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
}
@@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
else
pgrp = task_pgrp(current);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- niceval = 20 - task_nice(p);
+ niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
do_each_thread(g, p) {
if (uid_eq(task_uid(p), uid)) {
- niceval = 20 - task_nice(p);
+ niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 0156612..0a0608e 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -276,7 +276,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev,
bool tick_check_replacement(struct clock_event_device *curdev,
struct clock_event_device *newdev)
{
- if (tick_check_percpu(curdev, newdev, smp_processor_id()))
+ if (!tick_check_percpu(curdev, newdev, smp_processor_id()))
return false;
return tick_check_preferred(curdev, newdev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 9f8af69..6558b7a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -84,6 +84,9 @@ static void tick_do_update_jiffies64(ktime_t now)
/* Keep the tick_next_period variable up to date */
tick_next_period = ktime_add(last_jiffies_update, tick_period);
+ } else {
+ write_sequnlock(&jiffies_lock);
+ return;
}
write_sequnlock(&jiffies_lock);
update_wall_time();
@@ -967,7 +970,7 @@ static void tick_nohz_switch_to_nohz(void)
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
ktime_t next;
- if (!tick_nohz_active)
+ if (!tick_nohz_enabled)
return;
local_irq_disable();
diff --git a/kernel/timer.c b/kernel/timer.c
index 87bd529..3bb01a3 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -838,7 +838,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
bit = find_last_bit(&mask, BITS_PER_LONG);
- mask = (1 << bit) - 1;
+ mask = (1UL << bit) - 1;
expires_limit = expires_limit & ~(mask);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1fd4b94..4a54a25 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4330,16 +4330,11 @@ static void ftrace_init_module(struct module *mod,
ftrace_process_locs(mod, start, end);
}
-static int ftrace_module_notify_enter(struct notifier_block *self,
- unsigned long val, void *data)
+void ftrace_module_init(struct module *mod)
{
- struct module *mod = data;
-
- if (val == MODULE_STATE_COMING)
- ftrace_init_module(mod, mod->ftrace_callsites,
- mod->ftrace_callsites +
- mod->num_ftrace_callsites);
- return 0;
+ ftrace_init_module(mod, mod->ftrace_callsites,
+ mod->ftrace_callsites +
+ mod->num_ftrace_callsites);
}
static int ftrace_module_notify_exit(struct notifier_block *self,
@@ -4353,11 +4348,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self,
return 0;
}
#else
-static int ftrace_module_notify_enter(struct notifier_block *self,
- unsigned long val, void *data)
-{
- return 0;
-}
static int ftrace_module_notify_exit(struct notifier_block *self,
unsigned long val, void *data)
{
@@ -4365,11 +4355,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self,
}
#endif /* CONFIG_MODULES */
-struct notifier_block ftrace_module_enter_nb = {
- .notifier_call = ftrace_module_notify_enter,
- .priority = INT_MAX, /* Run before anything that can use kprobes */
-};
-
struct notifier_block ftrace_module_exit_nb = {
.notifier_call = ftrace_module_notify_exit,
.priority = INT_MIN, /* Run after anything that can remove kprobes */
@@ -4403,10 +4388,6 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
- ret = register_module_notifier(&ftrace_module_enter_nb);
- if (ret)
- pr_warning("Failed to register trace ftrace module enter notifier\n");
-
ret = register_module_notifier(&ftrace_module_exit_nb);
if (ret)
pr_warning("Failed to register trace ftrace module exit notifier\n");
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 925f537..4747b47 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -77,7 +77,7 @@ event_triggers_call(struct ftrace_event_file *file, void *rec)
data->ops->func(data);
continue;
}
- filter = rcu_dereference(data->filter);
+ filter = rcu_dereference_sched(data->filter);
if (filter && !filter_match_preds(filter, rec))
continue;
if (data->cmd_ops->post_trigger) {
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5b781d2..ffd5635 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -58,12 +58,16 @@ int ftrace_create_function_files(struct trace_array *tr,
{
int ret;
- /* The top level array uses the "global_ops". */
- if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) {
- ret = allocate_ftrace_ops(tr);
- if (ret)
- return ret;
- }
+ /*
+ * The top level array uses the "global_ops", and the files are
+ * created on boot up.
+ */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return 0;
+
+ ret = allocate_ftrace_ops(tr);
+ if (ret)
+ return ret;
ftrace_create_filter_files(tr->ops, parent);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 930e514..c082a74 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -732,9 +732,15 @@ static int uprobe_buffer_enable(void)
static void uprobe_buffer_disable(void)
{
+ int cpu;
+
BUG_ON(!mutex_is_locked(&event_mutex));
if (--uprobe_buffer_refcnt == 0) {
+ for_each_possible_cpu(cpu)
+ free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer,
+ cpu)->buf);
+
free_percpu(uprobe_cpu_buffer);
uprobe_cpu_buffer = NULL;
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index ac5b23c..6620e58 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -188,7 +188,6 @@ static int tracepoint_add_func(struct tracepoint *tp,
WARN_ON_ONCE(1);
return PTR_ERR(old);
}
- release_probes(old);
/*
* rcu_assign_pointer has a smp_wmb() which makes sure that the new
@@ -200,6 +199,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
rcu_assign_pointer(tp->funcs, tp_funcs);
if (!static_key_enabled(&tp->key))
static_key_slow_inc(&tp->key);
+ release_probes(old);
return 0;
}
@@ -221,7 +221,6 @@ static int tracepoint_remove_func(struct tracepoint *tp,
WARN_ON_ONCE(1);
return PTR_ERR(old);
}
- release_probes(old);
if (!tp_funcs) {
/* Removed last function */
@@ -232,6 +231,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
static_key_slow_dec(&tp->key);
}
rcu_assign_pointer(tp->funcs, tp_funcs);
+ release_probes(old);
return 0;
}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index e90089f..516203e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -138,7 +138,11 @@ static void __touch_watchdog(void)
void touch_softlockup_watchdog(void)
{
- __this_cpu_write(watchdog_touch_ts, 0);
+ /*
+ * Preemption can be enabled. It doesn't matter which CPU's timestamp
+ * gets zeroed here, so use the raw_ operation.
+ */
+ raw_cpu_write(watchdog_touch_ts, 0);
}
EXPORT_SYMBOL(touch_softlockup_watchdog);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0ee63af..a4bab46 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -100,10 +100,10 @@ enum {
/*
* Rescue workers are used only on emergencies and shared by
- * all cpus. Give -20.
+ * all cpus. Give MIN_NICE.
*/
- RESCUER_NICE_LEVEL = -20,
- HIGHPRI_NICE_LEVEL = -20,
+ RESCUER_NICE_LEVEL = MIN_NICE,
+ HIGHPRI_NICE_LEVEL = MIN_NICE,
WQ_NAME_LEN = 24,
};
@@ -1916,6 +1916,12 @@ static void send_mayday(struct work_struct *work)
/* mayday mayday mayday */
if (list_empty(&pwq->mayday_node)) {
+ /*
+ * If @pwq is for an unbound wq, its base ref may be put at
+ * any time due to an attribute change. Pin @pwq until the
+ * rescuer is done with it.
+ */
+ get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
wake_up_process(wq->rescuer->task);
}
@@ -2398,6 +2404,7 @@ static int rescuer_thread(void *__rescuer)
struct worker *rescuer = __rescuer;
struct workqueue_struct *wq = rescuer->rescue_wq;
struct list_head *scheduled = &rescuer->scheduled;
+ bool should_stop;
set_user_nice(current, RESCUER_NICE_LEVEL);
@@ -2409,11 +2416,15 @@ static int rescuer_thread(void *__rescuer)
repeat:
set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- rescuer->task->flags &= ~PF_WQ_WORKER;
- return 0;
- }
+ /*
+ * By the time the rescuer is requested to stop, the workqueue
+ * shouldn't have any work pending, but @wq->maydays may still have
+ * pwq(s) queued. This can happen by non-rescuer workers consuming
+ * all the work items before the rescuer got to them. Go through
+ * @wq->maydays processing before acting on should_stop so that the
+ * list is always empty on exit.
+ */
+ should_stop = kthread_should_stop();
/* see whether any pwq is asking for help */
spin_lock_irq(&wq_mayday_lock);
@@ -2445,6 +2456,12 @@ repeat:
process_scheduled_works(rescuer);
/*
+ * Put the reference grabbed by send_mayday(). @pool won't
+ * go away while we're holding its lock.
+ */
+ put_pwq(pwq);
+
+ /*
* Leave this pool. If keep_working() is %true, notify a
* regular worker; otherwise, we end up with 0 concurrency
* and stalling the execution.
@@ -2459,6 +2476,12 @@ repeat:
spin_unlock_irq(&wq_mayday_lock);
+ if (should_stop) {
+ __set_current_state(TASK_RUNNING);
+ rescuer->task->flags &= ~PF_WQ_WORKER;
+ return 0;
+ }
+
/* rescuers should never participate in concurrency management */
WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
schedule();
@@ -4100,7 +4123,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
if (!pwq) {
pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
wq->name);
- goto out_unlock;
+ mutex_lock(&wq->mutex);
+ goto use_dfl_pwq;
}
/*
OpenPOWER on IntegriCloud