From db66d756c74acb886c51f11b501c2fe622018a0a Mon Sep 17 00:00:00 2001
From: Masanari Iida
Date: Fri, 18 Apr 2014 01:59:15 +0900
Subject: sched/docbook: Fix 'make htmldocs' warnings caused by missing description

When the 'flags' argument to the sched_{set,get}attr() syscalls was added in:

  6d35ab48090b ("sched: Add 'flags' argument to sched_{set,get}attr() syscalls")

no description for 'flags' was added. This causes the following warnings on "make htmldocs":

  Warning(/kernel/sched/core.c:3645): No description found for parameter 'flags'
  Warning(/kernel/sched/core.c:3789): No description found for parameter 'flags'

Signed-off-by: Masanari Iida
Cc: peterz@infradead.org
Link: http://lkml.kernel.org/r/1397753955-2914-1-git-send-email-standby24x7@gmail.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 268a45e..9fe2190 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3639,6 +3639,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
  * sys_sched_setattr - same as above, but with extended sched_attr
  * @pid: the pid in question.
  * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
  */
 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
                                unsigned int, flags)
@@ -3783,6 +3784,7 @@ err_size:
  * @pid: the pid in question.
  * @uattr: structure containing the extended parameters.
  * @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
  */
 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
                 unsigned int, size, unsigned int, flags)
-- cgit v1.1

From 722a9f9299ca720a3f14660e7c0dce7b76a9cb42 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Fri, 2 May 2014 00:44:38 +0200
Subject: asmlinkage: Add explicit __visible to drivers/*, lib/*, kernel/*

As requested by Linus, add explicit __visible to the asmlinkage users. This marks functions visible to the assembler. Tree sweep for the rest of the tree.

Signed-off-by: Andi Kleen
Link: http://lkml.kernel.org/r/1398984278-29319-4-git-send-email-andi@firstfloor.org
Signed-off-by: H. Peter Anvin
---
 kernel/sched/core.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 268a45e..d9d8ece 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2192,7 +2192,7 @@ static inline void post_schedule(struct rq *rq)
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
  */
-asmlinkage void schedule_tail(struct task_struct *prev)
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
        __releases(rq->lock)
 {
        struct rq *rq = this_rq();
@@ -2741,7 +2741,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
                blk_schedule_flush_plug(tsk);
 }

-asmlinkage void __sched schedule(void)
+asmlinkage __visible void __sched schedule(void)
 {
        struct task_struct *tsk = current;

@@ -2751,7 +2751,7 @@ asmlinkage void __sched schedule(void)
 EXPORT_SYMBOL(schedule);

 #ifdef CONFIG_CONTEXT_TRACKING
-asmlinkage void __sched schedule_user(void)
+asmlinkage __visible void __sched schedule_user(void)
 {
        /*
         * If we come here after a random call to set_need_resched(),
@@ -2783,7 +2783,7 @@ void __sched schedule_preempt_disabled(void)
  * off of preempt_enable. Kernel preemptions off return from interrupt
  * occur there and call schedule directly.
  */
-asmlinkage void __sched notrace preempt_schedule(void)
+asmlinkage __visible void __sched notrace preempt_schedule(void)
 {
        /*
         * If there is a non-zero preempt_count or interrupts are disabled,
@@ -2813,7 +2813,7 @@ EXPORT_SYMBOL(preempt_schedule);
  * Note, that this is called and return with irqs disabled. This will
  * protect us against recursive calling from irq.
  */
-asmlinkage void __sched preempt_schedule_irq(void)
+asmlinkage __visible void __sched preempt_schedule_irq(void)
 {
        enum ctx_state prev_state;
-- cgit v1.1

From 2d513868e2a33e1d5315490ef4c861ee65babd65 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 2 May 2014 23:26:24 +0200
Subject: sched: Sanitize irq accounting madness

Russell reported that irqtime_account_idle_ticks() takes ages, due to:

    for (i = 0; i < ticks; i++)
            irqtime_account_process_tick(current, 0, rq);

It's sad that this code was written way _AFTER_ the NOHZ idle functionality was available. I plead guilty to not paying attention when that crap got merged with commit abb74cefa ("sched: Export ns irqtimes through /proc/stat").

So instead of looping nr_ticks times, just apply the whole thing at once.

As a side note: the whole cputime_t vs. u64 business in that context wants to be cleaned up as well. There is no point in having all these back and forth conversions. Let's standardise on u64 nsec for all kernel-internal accounting and be done with it. Everything else does not make sense at all for fine-grained accounting. Frederic, can you please take care of that?

Reported-by: Russell King
Signed-off-by: Thomas Gleixner
Reviewed-by: Paul E. McKenney
Signed-off-by: Peter Zijlstra
Cc: Venkatesh Pallipadi
Cc: Shaun Ruffell
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1405022307000.6261@ionos.tec.linutronix.de
Signed-off-by: Ingo Molnar
---
 kernel/sched/cputime.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a95097c..72fdf06 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -332,50 +332,50 @@ out:
  * softirq as those do not count in task exec_runtime any more.
  */
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                        struct rq *rq)
+                                        struct rq *rq, int ticks)
 {
-       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+       cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+       u64 cputime = (__force u64) cputime_one_jiffy;
        u64 *cpustat = kcpustat_this_cpu->cpustat;

        if (steal_account_process_tick())
                return;

+       cputime *= ticks;
+       scaled *= ticks;
+
        if (irqtime_account_hi_update()) {
-               cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+               cpustat[CPUTIME_IRQ] += cputime;
        } else if (irqtime_account_si_update()) {
-               cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+               cpustat[CPUTIME_SOFTIRQ] += cputime;
        } else if (this_cpu_ksoftirqd() == p) {
                /*
                 * ksoftirqd time do not get accounted in cpu_softirq_time.
                 * So, we have to handle it separately here.
                 * Also, p->stime needs to be updated for ksoftirqd.
                 */
-               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-                                       CPUTIME_SOFTIRQ);
+               __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
        } else if (user_tick) {
-               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+               account_user_time(p, cputime, scaled);
        } else if (p == rq->idle) {
-               account_idle_time(cputime_one_jiffy);
+               account_idle_time(cputime);
        } else if (p->flags & PF_VCPU) { /* System time or guest time */
-               account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+               account_guest_time(p, cputime, scaled);
        } else {
-               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-                                       CPUTIME_SYSTEM);
+               __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
        }
 }

 static void irqtime_account_idle_ticks(int ticks)
 {
-       int i;
        struct rq *rq = this_rq();

-       for (i = 0; i < ticks; i++)
-               irqtime_account_process_tick(current, 0, rq);
+       irqtime_account_process_tick(current, 0, rq, ticks);
 }
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 static inline void irqtime_account_idle_ticks(int ticks) {}
 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                               struct rq *rq) {}
+                                               struct rq *rq, int nr_ticks) {}
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */

 /*
@@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
                return;

        if (sched_clock_irqtime) {
-               irqtime_account_process_tick(p, user_tick, rq);
+               irqtime_account_process_tick(p, user_tick, rq, 1);
                return;
        }
-- cgit v1.1

From 5bfd126e80dca70431aef8fdbc1cf14535f3c338 Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Tue, 15 Apr 2014 13:49:04 +0200
Subject: sched/deadline: Fix sched_yield() behavior

yield_task_dl() is broken:

 o it forces current to be throttled by setting its runtime to zero;

 o it sets current's dl_se->dl_new to one, expecting that dl_task_timer() will queue it back with proper parameters at replenish time.

Unfortunately, dl_task_timer() has this check at the very beginning:

    if (!dl_task(p) || dl_se->dl_new)
            goto unlock;

So it just bails out and the task is never replenished; it effectively yields forever.

To fix this, introduce a new flag indicating that the task properly yielded the CPU before its current runtime expired. While this is a little more than strictly needed at the moment, the flag would be useful in the future to discriminate between "good" jobs (whose remaining runtime could be reclaimed, i.e. recycled) and "bad" jobs (for which the dl_throttled flag has been set) that need to be stopped.
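[Editor's note: a minimal userspace sketch (not part of the patch) of the behaviour this fix restores: a SCHED_DEADLINE task that calls sched_yield() gives up its remaining runtime and is woken again at its next replenishment instant. The struct layout follows the proposed sched_setattr(2) man page text; SYS_sched_setattr is assumed to be defined by your libc headers, the hardcoded policy value 6 is a fallback assumption, and running it requires CAP_SYS_NICE.]

    #define _GNU_SOURCE
    #include <stdint.h>
    #include <stdio.h>
    #include <sched.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef SCHED_DEADLINE
    #define SCHED_DEADLINE 6
    #endif

    /* Layout from the proposed sched_setattr(2) man page text. */
    struct sched_attr {
            uint32_t size;
            uint32_t sched_policy;
            uint64_t sched_flags;
            int32_t  sched_nice;
            uint32_t sched_priority;
            uint64_t sched_runtime;
            uint64_t sched_deadline;
            uint64_t sched_period;
    };

    int main(void)
    {
            struct sched_attr attr = {
                    .size           = sizeof(attr),
                    .sched_policy   = SCHED_DEADLINE,
                    .sched_runtime  = 10 * 1000 * 1000,   /* 10 ms */
                    .sched_deadline = 30 * 1000 * 1000,   /* 30 ms */
                    .sched_period   = 30 * 1000 * 1000,   /* 30 ms */
            };

            /* Needs CAP_SYS_NICE and a kernel >= 3.14. */
            if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1) {
                    perror("sched_setattr");
                    return 1;
            }

            for (;;) {
                    /* ... do this period's work ... */
                    sched_yield();  /* done early: sleep until replenishment */
            }
    }

Before this fix, the first sched_yield() above would throttle the task permanently; with dl_yielded, the bandwidth timer replenishes it as intended.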
Reported-by: yjay.kim
Signed-off-by: Juri Lelli
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/20140429103953.e68eba1b2ac3309214e3dc5a@gmail.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c     | 1 +
 kernel/sched/deadline.c | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9fe2190..e62c65a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3124,6 +3124,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
        dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
        dl_se->dl_throttled = 0;
        dl_se->dl_new = 1;
+       dl_se->dl_yielded = 0;
 }

 static void __setscheduler_params(struct task_struct *p,
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b080957..800e99b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -528,6 +528,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
        sched_clock_tick();
        update_rq_clock(rq);
        dl_se->dl_throttled = 0;
+       dl_se->dl_yielded = 0;
        if (p->on_rq) {
                enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
                if (task_has_dl_policy(rq->curr))
@@ -893,10 +894,10 @@ static void yield_task_dl(struct rq *rq)
         * We make the task go to sleep until its current deadline by
         * forcing its runtime to zero. This way, update_curr_dl() stops
         * it and the bandwidth timer will wake it up and will give it
-        * new scheduling parameters (thanks to dl_new=1).
+        * new scheduling parameters (thanks to dl_yielded=1).
         */
        if (p->dl.runtime > 0) {
-               rq->curr->dl.dl_new = 1;
+               rq->curr->dl.dl_yielded = 1;
                p->dl.runtime = 0;
        }
        update_curr_dl(rq);
-- cgit v1.1

From 6a7cd273dc4bc3246f37ebe874754a54ccb29141 Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Thu, 17 Apr 2014 10:05:02 +0800
Subject: sched/deadline: Fix memory leak

Free cpudl->free_cpus allocated in cpudl_init().

Signed-off-by: Li Zefan
Acked-by: Juri Lelli
Signed-off-by: Peter Zijlstra
Cc: # 3.14+
Link: http://lkml.kernel.org/r/534F36CE.2000409@huawei.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/cpudeadline.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b9bb42..ab001b5 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -210,7 +210,5 @@ int cpudl_init(struct cpudl *cp)
  */
 void cpudl_cleanup(struct cpudl *cp)
 {
-       /*
-        * nothing to do for the moment
-        */
+       free_cpumask_var(cp->free_cpus);
 }
-- cgit v1.1

From 6227cb00cc120f9a43ce8313bb0475ddabcb7d01 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Sun, 13 Apr 2014 09:34:53 -0400
Subject: sched: Use CPUPRI_NR_PRIORITIES instead of MAX_RT_PRIO in cpupri check

The check at the beginning of cpupri_find() makes sure that the task_pri variable does not exceed the cp->pri_to_cpu array length. But that length is CPUPRI_NR_PRIORITIES, not MAX_RT_PRIO, so the check misses the last two priorities in that array.

As task_pri is computed by convert_prio(), which should never return a value bigger than CPUPRI_NR_PRIORITIES, the check should cause a panic if it is ever hit.
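[Editor's note: for context, a sketch of the mapping behind this reasoning, following kernel/sched/cpupri.c as it reads around this series (treat the exact constants as assumptions): CPUPRI_NR_PRIORITIES == MAX_RT_PRIO + 2, and convert_prio() folds the 140-level task priority space into the 102-level cpupri space, so its result can legitimately reach MAX_RT_PRIO + 1. The old "task_pri >= MAX_RT_PRIO" test wrongly rejected exactly those top two values.]

    /* Convert a 140-based task->prio into a 102-based cpupri (sketch). */
    static int convert_prio(int prio)
    {
            int cpupri;

            if (prio == CPUPRI_INVALID)           /* -1: no priority */
                    cpupri = CPUPRI_INVALID;
            else if (prio == MAX_PRIO)            /* 140: idle */
                    cpupri = CPUPRI_IDLE;         /* 0 */
            else if (prio >= MAX_RT_PRIO)         /* 100..139: CFS */
                    cpupri = CPUPRI_NORMAL;       /* 1 */
            else                                  /* RT 0..99 */
                    cpupri = MAX_RT_PRIO - prio + 1;  /* maps to 101..2 */

            return cpupri;
    }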
Reported-by: Mike Galbraith
Signed-off-by: Steven Rostedt
Signed-off-by: Peter Zijlstra
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/1397015410.5212.13.camel@marge.simpson.net
Signed-off-by: Ingo Molnar
---
 kernel/sched/cpupri.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 8b836b3..3031bac 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -70,8 +70,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
        int idx = 0;
        int task_pri = convert_prio(p->prio);

-       if (task_pri >= MAX_RT_PRIO)
-               return 0;
+       BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);

        for (idx = 0; idx < task_pri; idx++) {
                struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
-- cgit v1.1

From 6ccdc84b81a0a6c09a7f0427761d2f8cecfc2218 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 24 Apr 2014 12:00:47 +0200
Subject: sched: Skip double execution of pick_next_task_fair()

Tim wrote:

  "The current code will call pick_next_task_fair a second time in the slow path if we did not pull any task in our first try. This is really unnecessary, as we already know no task can be pulled, and it doubles the delay for the cpu to enter idle.

   We instrumented some network workloads and saw that pick_next_task_fair is frequently called twice before a cpu enters idle. The call to pick_next_task_fair can add non-trivial latency, as it calls load_balance, which runs find_busiest_group on a hierarchy of sched domains spanning the cpus of a large system. For some 4-socket systems, we saw almost 0.25 msec spent per call of pick_next_task_fair before a cpu could be idled."

Optimize the second call away for the common case and document the dependency.

Reported-by: Tim Chen
Signed-off-by: Peter Zijlstra
Cc: Linus Torvalds
Cc: Len Brown
Link: http://lkml.kernel.org/r/20140424100047.GP11096@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e62c65a..28921ec 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2592,8 +2592,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
        if (likely(prev->sched_class == class &&
                   rq->nr_running == rq->cfs.h_nr_running)) {
                p = fair_sched_class.pick_next_task(rq, prev);
-               if (likely(p && p != RETRY_TASK))
-                       return p;
+               if (unlikely(p == RETRY_TASK))
+                       goto again;
+
+               /* assumes fair_sched_class->next == idle_sched_class */
+               if (unlikely(!p))
+                       p = idle_sched_class.pick_next_task(rq, prev);
+
+               return p;
        }

 again:
-- cgit v1.1

From 0e5b5337f0da073e1f17aec3c322ea7826975d0d Mon Sep 17 00:00:00 2001
From: Jason Low
Date: Mon, 28 Apr 2014 15:45:54 -0700
Subject: sched: Fix updating rq->max_idle_balance_cost and rq->next_balance in idle_balance()

The following commit:

  e5fc66119ec9 ("sched: Fix race in idle_balance()")

can potentially cause rq->max_idle_balance_cost to not be updated, even when load_balance(NEWLY_IDLE) is attempted and the per-sd max cost value is updated. Preeti noticed a similar issue with updating rq->next_balance.

In this patch, we fix this by making sure we still check/update those values even if a task gets enqueued while browsing the domains.
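[Editor's note: the shape of the fix, paraphrased from the diff that follows: the bookkeeping updates are hoisted above the early-exit logic so that they run on every path out of idle_balance().]

    raw_spin_lock(&this_rq->lock);

    /* Always record the measured cost, even if we end up not going idle. */
    if (curr_cost > this_rq->max_idle_balance_cost)
            this_rq->max_idle_balance_cost = curr_cost;

    /* A task may have been enqueued while the rq lock was dropped. */
    if (this_rq->cfs.h_nr_running && !pulled_task)
            pulled_task = 1;  /* not going idle; pretend we pulled one */

    if (pulled_task || time_after(jiffies, this_rq->next_balance))
            this_rq->next_balance = next_balance;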
Signed-off-by: Jason Low
Reviewed-by: Preeti U Murthy
Signed-off-by: Peter Zijlstra
Cc: morten.rasmussen@arm.com
Cc: aswin@hp.com
Cc: daniel.lezcano@linaro.org
Cc: alex.shi@linaro.org
Cc: efault@gmx.de
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1398725155-7591-2-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7570dd9..0fdb96d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6653,6 +6653,7 @@ static int idle_balance(struct rq *this_rq)
        int this_cpu = this_rq->cpu;

        idle_enter_fair(this_rq);
+
        /*
         * We must set idle_stamp _before_ calling idle_balance(), such that we
         * measure the duration of idle_balance() as idle time.
@@ -6705,14 +6706,16 @@ static int idle_balance(struct rq *this_rq)

        raw_spin_lock(&this_rq->lock);

+       if (curr_cost > this_rq->max_idle_balance_cost)
+               this_rq->max_idle_balance_cost = curr_cost;
+
        /*
-        * While browsing the domains, we released the rq lock.
-        * A task could have be enqueued in the meantime
+        * While browsing the domains, we released the rq lock, a task could
+        * have been enqueued in the meantime. Since we're not going idle,
+        * pretend we pulled a task.
         */
-       if (this_rq->cfs.h_nr_running && !pulled_task) {
+       if (this_rq->cfs.h_nr_running && !pulled_task)
                pulled_task = 1;
-               goto out;
-       }

        if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
                /*
@@ -6722,9 +6725,6 @@ static int idle_balance(struct rq *this_rq)
                this_rq->next_balance = next_balance;
        }

-       if (curr_cost > this_rq->max_idle_balance_cost)
-               this_rq->max_idle_balance_cost = curr_cost;
-
 out:
        /* Is there a task of a high priority class? */
        if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
-- cgit v1.1

From 2b4cfe64dee0d84506b951d81bf55d9891744d25 Mon Sep 17 00:00:00 2001
From: Jason Low
Date: Wed, 23 Apr 2014 18:30:34 -0700
Subject: sched/numa: Initialize newidle balance stats in sd_numa_init()

Also initialize the per-sd variables for newidle load balancing in sd_numa_init().

Signed-off-by: Jason Low
Acked-by: morten.rasmussen@arm.com
Cc: daniel.lezcano@linaro.org
Cc: alex.shi@linaro.org
Cc: preeti@linux.vnet.ibm.com
Cc: efault@gmx.de
Cc: vincent.guittot@linaro.org
Cc: aswin@hp.com
Cc: chegu_vinod@hp.com
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1398303035-18255-3-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 28921ec..13584f1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6026,6 +6026,8 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
                                        ,
                .last_balance           = jiffies,
                .balance_interval       = sd_weight,
+               .max_newidle_lb_cost    = 0,
+               .next_decay_max_lb_cost = jiffies,
        };
        SD_INIT_NAME(sd, NUMA);
        sd->private = &tl->data;
-- cgit v1.1

From 143cf23df25b7082cd706c3c53188e741e7881c3 Mon Sep 17 00:00:00 2001
From: Michael Kerrisk
Date: Fri, 9 May 2014 16:54:15 +0200
Subject: sched: Make sched_setattr() correctly return -EFBIG

The documented[1] behavior of sched_setattr() in the proposed man page text is:

    sched_attr::size must be set to the size of the structure, as in
    sizeof(struct sched_attr). If the provided structure is smaller
    than the kernel structure, any additional fields are assumed '0'.
    If the provided structure is larger than the kernel structure, the
    kernel verifies that all additional fields are '0'; if they are not,
    the syscall will fail with -E2BIG.

As currently implemented, sched_copy_attr() returns -EFBIG for this case, but the logic in sys_sched_setattr() converts that error to -EFAULT. This patch fixes the behavior.

[1] http://thread.gmane.org/gmane.linux.kernel/1615615/focus=1697760

Signed-off-by: Michael Kerrisk
Signed-off-by: Peter Zijlstra
Cc:
Cc: Linus Torvalds
Link: http://lkml.kernel.org/r/536CEC17.9070903@gmail.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 13584f1..f2205f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3658,8 +3658,9 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
        if (!uattr || pid < 0 || flags)
                return -EINVAL;

-       if (sched_copy_attr(uattr, &attr))
-               return -EFAULT;
+       retval = sched_copy_attr(uattr, &attr);
+       if (retval)
+               return retval;

        rcu_read_lock();
        retval = -ESRCH;
-- cgit v1.1

From dbdb22754fde671dc93d2fae06f8be113d47f2fb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 9 May 2014 10:49:03 +0200
Subject: sched: Disallow sched_attr::sched_policy < 0

The scheduler uses policy = -1 internally to preserve the current policy state when implementing sys_sched_setparam(). This got exposed to userspace by accident through sys_sched_setattr(); cure this.

Reported-by: Michael Kerrisk
Signed-off-by: Peter Zijlstra
Acked-by: Michael Kerrisk
Cc:
Cc: Linus Torvalds
Link: http://lkml.kernel.org/r/20140509085311.GJ30445@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f2205f0..cdefcf7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3662,6 +3662,9 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
        if (retval)
                return retval;

+       if (attr.sched_policy < 0)
+               return -EINVAL;
+
        rcu_read_lock();
        retval = -ESRCH;
        p = find_process_by_pid(pid);
-- cgit v1.1

From ce5f7f8200ca2504f6f290044393d73ca314965a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 12 May 2014 22:50:34 +0200
Subject: sched/deadline: Change sched_getparam() behaviour vs SCHED_DEADLINE

The way we read POSIX, one should only call sched_getparam() when sched_getscheduler() returns either SCHED_FIFO or SCHED_RR. Given that we currently return sched_param::sched_priority = 0 for all others, extend the same behaviour to SCHED_DEADLINE.
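[Editor's note: a minimal userspace sketch of the POSIX usage pattern this change aligns with: query the policy first, and only interpret sched_priority for the fixed-priority policies.]

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            struct sched_param sp;
            int policy = sched_getscheduler(0);

            if (policy == -1 || sched_getparam(0, &sp) == -1) {
                    perror("sched_get*");
                    return 1;
            }

            if (policy == SCHED_FIFO || policy == SCHED_RR)
                    printf("RT priority: %d\n", sp.sched_priority);
            else    /* SCHED_OTHER, SCHED_BATCH, SCHED_IDLE, SCHED_DEADLINE */
                    printf("sched_priority reads as %d (0 by convention)\n",
                           sp.sched_priority);
            return 0;
    }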
Requested-by: Michael Kerrisk
Signed-off-by: Peter Zijlstra
Acked-by: Michael Kerrisk
Cc: Dario Faggioli
Cc: linux-man
Cc: "Michael Kerrisk (man-pages)"
Cc: Juri Lelli
Cc: Linus Torvalds
Cc:
Link: http://lkml.kernel.org/r/20140512205034.GH13467@laptop.programming.kicks-ass.net
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cdefcf7..f3f08bf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3713,7 +3713,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
  */
 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 {
-       struct sched_param lp;
+       struct sched_param lp = { .sched_priority = 0 };
        struct task_struct *p;
        int retval;

@@ -3730,11 +3730,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
        if (retval)
                goto out_unlock;

-       if (task_has_dl_policy(p)) {
-               retval = -EINVAL;
-               goto out_unlock;
-       }
-       lp.sched_priority = p->rt_priority;
+       if (task_has_rt_policy(p))
+               lp.sched_priority = p->rt_priority;
        rcu_read_unlock();

        /*
-- cgit v1.1

From b0827819b0da4acfbc1df1e05edcf50efd07cbd1 Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Tue, 13 May 2014 14:11:31 +0200
Subject: sched/deadline: Restrict user params max value to 2^63 ns

Michael Kerrisk noticed that creating SCHED_DEADLINE reservations with certain parameters (e.g., a runtime of something near 2^64 ns) can cause a system freeze for some amount of time.

The problem is that in the interface we have

    u64 sched_runtime;

while internally we need to have a signed runtime (to cope with budget overruns):

    s64 runtime;

At the time we set up a new dl_entity, we copy the first value into the second. The cast produces negative values when sched_runtime is too big, and this causes the scheduler to go crazy right from the start.

Moreover, considering how we deal with deadline wraparound,

    (s64)(a - b) < 0

we also have to restrict acceptable values for sched_{deadline,period}.

This patch fixes the issue by checking that user parameters are always below 2^63 ns (still large enough for everyone). It also rewrites the other conditions that we check, since in __checkparam_dl() we don't have to deal with deadline wraparounds, and what we had before erroneously failed when the difference between values was too big.

Reported-by: Michael Kerrisk
Suggested-by: Peter Zijlstra
Signed-off-by: Juri Lelli
Signed-off-by: Peter Zijlstra
Cc:
Cc: Dario Faggioli
Cc: Dave Jones
Cc: Linus Torvalds
Link: http://lkml.kernel.org/r/20140513141131.20d944f81633ee937f256385@gmail.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f3f08bf..44e00ab 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3195,17 +3195,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
  * We ask for the deadline not being zero, and greater or equal
  * than the runtime, as well as the period of being zero or
  * greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution (1us); we
- * check sched_runtime only since it is always the smaller one.
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
  */
 static bool __checkparam_dl(const struct sched_attr *attr)
 {
-       return attr && attr->sched_deadline != 0 &&
-               (attr->sched_period == 0 ||
-               (s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
-               (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
-               attr->sched_runtime >= (2 << (DL_SCALE - 1));
+       /* deadline != 0 */
+       if (attr->sched_deadline == 0)
+               return false;
+
+       /*
+        * Since we truncate DL_SCALE bits, make sure we're at least
+        * that big.
+        */
+       if (attr->sched_runtime < (1ULL << DL_SCALE))
+               return false;
+
+       /*
+        * Since we use the MSB for wrap-around and sign issues, make
+        * sure it's not set (mind that period can be equal to zero).
+        */
+       if (attr->sched_deadline & (1ULL << 63) ||
+           attr->sched_period & (1ULL << 63))
+               return false;
+
+       /* runtime <= deadline <= period (if period != 0) */
+       if ((attr->sched_period != 0 &&
+            attr->sched_period < attr->sched_deadline) ||
+           attr->sched_deadline < attr->sched_runtime)
+               return false;
+
+       return true;
 }

 /*
-- cgit v1.1

From 944770ab54babaef29d9d1dc8189898b3ee8afcf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 14 May 2014 16:13:56 +0200
Subject: sched/deadline: Replace NR_CPUS arrays

Tejun reported that his resume was failing due to order-3 allocations from sched_domain building.

Replace the NR_CPUS arrays in there with a dynamically allocated array.

Reported-by: Tejun Heo
Signed-off-by: Peter Zijlstra
Acked-by: Juri Lelli
Cc: Johannes Weiner
Cc: Linus Torvalds
Link: http://lkml.kernel.org/n/tip-kat4gl1m5a6dwy6nzuqox45e@git.kernel.org
Signed-off-by: Ingo Molnar
---
 kernel/sched/cpudeadline.c | 33 ++++++++++++++++++++++++---------
 kernel/sched/cpudeadline.h |  6 +++---
 2 files changed, 27 insertions(+), 12 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index ab001b5..bd95963 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -13,6 +13,7 @@

 #include <linux/gfp.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include "cpudeadline.h"

 static inline int parent(int i)
@@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b)
 {
        int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;

-       swap(cp->elements[a], cp->elements[b]);
-       swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
+       swap(cp->elements[a].cpu, cp->elements[b].cpu);
+       swap(cp->elements[a].dl , cp->elements[b].dl );
+
+       swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
 }

 static void cpudl_heapify(struct cpudl *cp, int idx)
@@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
        WARN_ON(!cpu_present(cpu));

        raw_spin_lock_irqsave(&cp->lock, flags);
-       old_idx = cp->cpu_to_idx[cpu];
+       old_idx = cp->elements[cpu].idx;
        if (!is_valid) {
                /* remove item */
                if (old_idx == IDX_INVALID) {
@@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
                cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
                cp->elements[old_idx].cpu = new_cpu;
                cp->size--;
-               cp->cpu_to_idx[new_cpu] = old_idx;
-               cp->cpu_to_idx[cpu] = IDX_INVALID;
+               cp->elements[new_cpu].idx = old_idx;
+               cp->elements[cpu].idx = IDX_INVALID;
                while (old_idx > 0 && dl_time_before(
                                cp->elements[parent(old_idx)].dl,
                                cp->elements[old_idx].dl)) {
@@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
                cp->size++;
                cp->elements[cp->size - 1].dl = 0;
                cp->elements[cp->size - 1].cpu = cpu;
-               cp->cpu_to_idx[cpu] = cp->size - 1;
+               cp->elements[cpu].idx = cp->size - 1;
                cpudl_change_key(cp, cp->size - 1, dl);
                cpumask_clear_cpu(cpu, cp->free_cpus);
        } else {
@@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp)
        memset(cp, 0, sizeof(*cp));
        raw_spin_lock_init(&cp->lock);
        cp->size = 0;
-       for (i = 0; i < NR_CPUS; i++)
-               cp->cpu_to_idx[i] = IDX_INVALID;
-       if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
+
+       cp->elements = kcalloc(nr_cpu_ids,
+                              sizeof(struct cpudl_item),
+                              GFP_KERNEL);
+       if (!cp->elements)
+               return -ENOMEM;
+
+       if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+               kfree(cp->elements);
                return -ENOMEM;
+       }
+
+       for_each_possible_cpu(i)
+               cp->elements[i].idx = IDX_INVALID;
+
        cpumask_setall(cp->free_cpus);

        return 0;
@@ -211,4 +225,5 @@ int cpudl_init(struct cpudl *cp)
 void cpudl_cleanup(struct cpudl *cp)
 {
        free_cpumask_var(cp->free_cpus);
+       kfree(cp->elements);
 }
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index a202789..538c979 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -5,17 +5,17 @@

 #define IDX_INVALID -1

-struct array_item {
+struct cpudl_item {
        u64 dl;
        int cpu;
+       int idx;
 };

 struct cpudl {
        raw_spinlock_t lock;
        int size;
-       int cpu_to_idx[NR_CPUS];
-       struct array_item elements[NR_CPUS];
        cpumask_var_t free_cpus;
+       struct cpudl_item *elements;
 };
-- cgit v1.1

From 4dac0b638310d2e92f6e19958b73d4c97c9734bb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 14 May 2014 16:04:26 +0200
Subject: sched/cpupri: Replace NR_CPUS arrays

Tejun reported that his resume was failing due to order-3 allocations from sched_domain building.

Replace the NR_CPUS arrays in there with a dynamically allocated array.

Reported-by: Tejun Heo
Signed-off-by: Peter Zijlstra
Cc: Johannes Weiner
Cc: Steven Rostedt
Cc: Linus Torvalds
Link: http://lkml.kernel.org/n/tip-7cysnkw1gik45r864t1nkudh@git.kernel.org
Signed-off-by: Ingo Molnar
---
 kernel/sched/cpupri.c | 7 +++++++
 kernel/sched/cpupri.h | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 3031bac..8834243 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -30,6 +30,7 @@
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/sched/rt.h>
+#include <linux/slab.h>
 #include "cpupri.h"

 /* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -218,8 +219,13 @@ int cpupri_init(struct cpupri *cp)
                        goto cleanup;
        }

+       cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
+       if (!cp->cpu_to_pri)
+               goto cleanup;
+
        for_each_possible_cpu(i)
                cp->cpu_to_pri[i] = CPUPRI_INVALID;
+
        return 0;

 cleanup:
@@ -236,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp)
 {
        int i;

+       kfree(cp->cpu_to_pri);
        for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
                free_cpumask_var(cp->pri_to_cpu[i].mask);
 }
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index f6d7561..6b03334 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -17,7 +17,7 @@ struct cpupri_vec {

 struct cpupri {
        struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
-       int cpu_to_pri[NR_CPUS];
+       int *cpu_to_pri;
 };

 #ifdef CONFIG_SMP
-- cgit v1.1

From 6acbfb96976fc3350e30d964acb1dbbdf876d55e Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Fri, 16 May 2014 11:50:42 +0800
Subject: sched: Fix hotplug vs. set_cpus_allowed_ptr()

Lai found that:

  WARNING: CPU: 1 PID: 13 at arch/x86/kernel/smp.c:124 native_smp_send_reschedule+0x2d/0x4b()
  ...
  migration_cpu_stop+0x1d/0x22

was caused by set_cpus_allowed_ptr() assuming that cpu_active_mask is always a sub-set of cpu_online_mask. This isn't true since 5fbd036b552f ("sched: Cleanup cpu_active madness").

So set active and online at the same time to avoid this particular problem.

Fixes: 5fbd036b552f ("sched: Cleanup cpu_active madness")
Signed-off-by: Lai Jiangshan
Signed-off-by: Peter Zijlstra
Cc: Andrew Morton
Cc: Gautham R. Shenoy
Cc: Linus Torvalds
Cc: Michael wang
Cc: Paul Gortmaker
Cc: Rafael J. Wysocki
Cc: Srivatsa S. Bhat
Cc: Toshi Kani
Link: http://lkml.kernel.org/r/53758B12.8060609@cn.fujitsu.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44e00ab..86f3890 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5076,7 +5076,6 @@ static int sched_cpu_active(struct notifier_block *nfb,
                                        unsigned long action, void *hcpu)
 {
        switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_STARTING:
        case CPU_DOWN_FAILED:
                set_cpu_active((long)hcpu, true);
                return NOTIFY_OK;
-- cgit v1.1
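[Editor's note: a hedged, kernel-style sketch (illustration only, not part of the patch) of the invariant the last fix restores. set_cpus_allowed_ptr() relies on every active CPU also being online when it picks a migration target; in mask terms, cpumask_subset(cpu_active_mask, cpu_online_mask) must always hold.]

    /* Check that cpu_active_mask is a subset of cpu_online_mask. */
    static bool active_is_subset_of_online(void)
    {
            int cpu;

            for_each_cpu(cpu, cpu_active_mask)
                    if (!cpu_online(cpu))
                            return false;
            return true;
    }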