From 45a22f4c11fef4ecd5c61c0a299cd3f23d77be8e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Feb 2014 13:09:50 +0100 Subject: inotify: Fix reporting of cookies for inotify events My rework of handling of notification events (namely commit 7053aee26a35 "fsnotify: do not share events between notification groups") broke sending of cookies with inotify events. We didn't propagate the value passed to fsnotify() properly and passed 4 uninitialized bytes to userspace instead (so it is also an information leak). Sadly I didn't notice this during my testing because inotify cookies aren't used very much and LTP inotify tests ignore them. Fix the problem by passing the cookie value properly. Fixes: 7053aee26a3548ebaba046ae2e52396ccf56ac6c Reported-by: Vegard Nossum Signed-off-by: Jan Kara --- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 67ccf0e..135944a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -916,7 +916,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { return 0; } diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 2596fac..70b4554 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -471,7 +471,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *dname) + const unsigned char *dname, u32 cookie) { struct inode *inode; struct audit_parent *parent; -- cgit v1.1 From 5ae8aabeaec3fe69c4fb21cbe5b17b72b35b5892 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 17 Feb 2014 10:45:36 -0800 Subject: sched_clock: Prevent callers from seeing half-updated data The generic sched_clock registration function was previously done lockless, due to the fact that it was expected to be called only once. However, now there are systems that may register multiple sched_clock sources, for which the lack of locking has casued problems: If two sched_clock sources are registered we may end up in a situation where a call to sched_clock() may be accessing the epoch cycle count for the old counter and the cycle count for the new counter. This can lead to confusing results where sched_clock() values jump and then are reset to 0 (due to the way the registration function forces the epoch_ns to be 0). Fix this by reorganizing the registration function to hold the seqlock for as short a time as possible while we update the clock_data structure for a new counter. We also put any accumulated time into epoch_ns instead of resetting the time to 0 so that the clock doesn't reset after each successful registration. [jstultz: Added extra context to the commit message] Reported-by: Will Deacon Signed-off-by: Stephen Boyd Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Cc: Peter Zijlstra Cc: Josh Cartwright Link: http://lkml.kernel.org/r/1392662736-7803-2-git-send-email-john.stultz@linaro.org Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner --- kernel/time/sched_clock.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0abb364..4d23dc4 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -116,20 +116,42 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) void __init sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { + u64 res, wrap, new_mask, new_epoch, cyc, ns; + u32 new_mult, new_shift; + ktime_t new_wrap_kt; unsigned long r; - u64 res, wrap; char r_unit; if (cd.rate > rate) return; WARN_ON(!irqs_disabled()); - read_sched_clock = read; - sched_clock_mask = CLOCKSOURCE_MASK(bits); - cd.rate = rate; /* calculate the mult/shift to convert counter ticks to ns. */ - clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600); + clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); + + new_mask = CLOCKSOURCE_MASK(bits); + + /* calculate how many ns until we wrap */ + wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); + new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + + /* update epoch for new counter and update epoch_ns from old counter*/ + new_epoch = read(); + cyc = read_sched_clock(); + ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, + cd.mult, cd.shift); + + raw_write_seqcount_begin(&cd.seq); + read_sched_clock = read; + sched_clock_mask = new_mask; + cd.rate = rate; + cd.wrap_kt = new_wrap_kt; + cd.mult = new_mult; + cd.shift = new_shift; + cd.epoch_cyc = new_epoch; + cd.epoch_ns = ns; + raw_write_seqcount_end(&cd.seq); r = rate; if (r >= 4000000) { @@ -141,22 +163,12 @@ void __init sched_clock_register(u64 (*read)(void), int bits, } else r_unit = ' '; - /* calculate how many ns until we wrap */ - wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask); - cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); - /* calculate the ns resolution of this counter */ - res = cyc_to_ns(1ULL, cd.mult, cd.shift); + res = cyc_to_ns(1ULL, new_mult, new_shift); + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", bits, r, r_unit, res, wrap); - update_sched_clock(); - - /* - * Ensure that sched_clock() starts off at 0ns - */ - cd.epoch_ns = 0; - /* Enable IRQ time accounting if we have a fast enough sched_clock */ if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); -- cgit v1.1 From 3d5f35bdfdef5fd627afe9b4bf9c4f32d17f4593 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 20 Feb 2014 09:19:39 +0100 Subject: sched/deadline: Fix bad accounting of nr_running Rostedt writes: My test suite was locking up hard when enabling mmiotracer. This was due to the mmiotracer placing all but one CPU offline. I found this out when I was able to reproduce the bug with just my stress-cpu-hotplug test. This bug baffled me because it would not always trigger, and would only trigger on the first run after boot up. The stress-cpu-hotplug test would crash hard the first run, or never crash at all. But a new reboot may cause it to crash on the first run again. I spent all week bisecting this, as I couldn't find a consistent reproducer. I finally narrowed it down to the sched deadline patches, and even more peculiar, to the commit that added the sched deadline boot up self test to the latency tracer. Then it dawned on me to what the bug was. All it took was to run a task under sched deadline to screw up the CPU hot plugging. This explained why it would lock up only on the first run of the stress-cpu-hotplug test. The bug happened when the boot up self test of the schedule latency tracer would test a deadline task. The deadline task would corrupt something that would cause CPU hotplug to fail. If it didn't corrupt it, the stress test would always work (there's no other sched deadline tasks that would run to cause problems). If it did corrupt on boot up, the first test would lockup hard. I proved this theory by running my deadline test program on another box, and then run the stress-cpu-hotplug test, and it would now consistently lock up. I could run stress-cpu-hotplug over and over with no problem, but once I ran the deadline test, the next run of the stress-cpu-hotplug would lock hard. After adding lots of tracing to the code, I found the cause. The function tracer showed that migrate_tasks() was stuck in an infinite loop, where rq->nr_running never equaled 1 to break out of it. When I added a trace_printk() to see what that number was, it was 335 and never decrementing! Looking at the deadline code I found: static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { dequeue_dl_entity(&p->dl); dequeue_pushable_dl_task(rq, p); } static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); dec_nr_running(rq); } And this: if (dl_runtime_exceeded(rq, dl_se)) { __dequeue_task_dl(rq, curr, 0); if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) dl_se->dl_throttled = 1; else enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) resched_task(curr); } Notice how we call __dequeue_task_dl() and in the else case we call enqueue_task_dl()? Also notice that dequeue_task_dl() has underscores where enqueue_task_dl() does not. The enqueue_task_dl() calls inc_nr_running(rq), but __dequeue_task_dl() does not. This is where we get nr_running out of sync. [snip] Another point where nr_running can get out of sync is when the dl_timer fires: dl_se->dl_throttled = 0; if (p->on_rq) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (task_has_dl_policy(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_task(rq->curr); This patch does two things: - correctly accounts for throttled tasks (that are now considered !running); - fixes the bug, updating nr_running from {inc,dec}_dl_tasks(), since we risk to update it twice in some situations (e.g., a task is dequeued while it has exceeded its budget). Cc: mingo@redhat.com Cc: torvalds@linux-foundation.org Cc: akpm@linux-foundation.org Reported-by: Steven Rostedt Reviewed-by: Steven Rostedt Tested-by: Steven Rostedt Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392884379-13744-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Thomas Gleixner --- kernel/sched/deadline.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0dd5e09..b819577 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -717,6 +717,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; + inc_nr_running(rq_of_dl_rq(dl_rq)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -730,6 +731,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; + dec_nr_running(rq_of_dl_rq(dl_rq)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); @@ -836,8 +838,6 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); - - inc_nr_running(rq); } static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -850,8 +850,6 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); - - dec_nr_running(rq); } /* -- cgit v1.1 From 4df1638cfaf9b2b7ad993979a41965acab9cd156 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 19 Feb 2014 13:53:35 -0500 Subject: sched/deadline: Fix overflow to handle period==0 and deadline!=0 While debugging the crash with the bad nr_running accounting, I hit another bug where, after running my sched deadline test, I was getting failures to take a CPU offline. It was giving me a -EBUSY error. Adding a bunch of trace_printk()s around, I found that the cpu notifier that called sched_cpu_inactive() was returning a failure. The overflow value was coming up negative? Talking this over with Juri, the problem is that the total_bw update was suppose to be made by dl_overflow() which, during my tests, seemed to not be called. Adding more trace_printk()s, it wasn't that it wasn't called, but it exited out right away with the check of new_bw being equal to p->dl.dl_bw. The new_bw calculates the ratio between period and runtime. The bug is that if you set a deadline, you do not need to set a period if you plan on the period being equal to the deadline. That is, if period is zero and deadline is not, then the system call should set the period to be equal to the deadline. This is done elsewhere in the code. The fix is easy, check if period is set, and if it is not, then use the deadline. Cc: Juri Lelli Cc: Ingo Molnar Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140219135335.7e74abd4@gandalf.local.home Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131e..2491448 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1952,7 +1952,7 @@ static int dl_overflow(struct task_struct *p, int policy, { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - u64 period = attr->sched_period; + u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; int cpus, err = -1; -- cgit v1.1 From e9e7cb38c21c80c82af4b16608bb4c8c5ec6a28e Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 11 Feb 2014 09:24:26 +0100 Subject: sched/core: Fix sched_rt_global_validate Don't compare sysctl_sched_rt_runtime against sysctl_sched_rt_period if the former is equal to RUNTIME_INF, otherwise disabling -rt bandwidth management (with CONFIG_RT_GROUP_SCHED=n) fails. Cc: Ingo Molnar Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392107067-19907-2-git-send-email-juri.lelli@gmail.com Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2491448..98d33c1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7475,7 +7475,8 @@ static int sched_rt_global_validate(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; - if (sysctl_sched_rt_runtime > sysctl_sched_rt_period) + if ((sysctl_sched_rt_runtime != RUNTIME_INF) && + (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) return -EINVAL; return 0; -- cgit v1.1 From 495163420ab5398c84af96ca3eae2c6aa4a140da Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 11 Feb 2014 09:24:27 +0100 Subject: sched/core: Make dl_b->lock IRQ safe Fix this lockdep warning: [ 44.804600] ========================================================= [ 44.805746] [ INFO: possible irq lock inversion dependency detected ] [ 44.805746] 3.14.0-rc2-test+ #14 Not tainted [ 44.805746] --------------------------------------------------------- [ 44.805746] bash/3674 just changed the state of lock: [ 44.805746] (&dl_b->lock){+.....}, at: [] sched_rt_handler+0x132/0x248 [ 44.805746] but this lock was taken by another, HARDIRQ-safe lock in the past: [ 44.805746] (&rq->lock){-.-.-.} and interrupts could create inverse lock ordering between them. [ 44.805746] [ 44.805746] other info that might help us debug this: [ 44.805746] Possible interrupt unsafe locking scenario: [ 44.805746] [ 44.805746] CPU0 CPU1 [ 44.805746] ---- ---- [ 44.805746] lock(&dl_b->lock); [ 44.805746] local_irq_disable(); [ 44.805746] lock(&rq->lock); [ 44.805746] lock(&dl_b->lock); [ 44.805746] [ 44.805746] lock(&rq->lock); by making dl_b->lock acquiring always IRQ safe. Cc: Ingo Molnar Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392107067-19907-3-git-send-email-juri.lelli@gmail.com Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 98d33c1..33d030a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7422,6 +7422,7 @@ static int sched_dl_global_constraints(void) u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); int cpu, ret = 0; + unsigned long flags; /* * Here we want to check the bandwidth not being set to some @@ -7435,10 +7436,10 @@ static int sched_dl_global_constraints(void) for_each_possible_cpu(cpu) { struct dl_bw *dl_b = dl_bw_of(cpu); - raw_spin_lock(&dl_b->lock); + raw_spin_lock_irqsave(&dl_b->lock, flags); if (new_bw < dl_b->total_bw) ret = -EBUSY; - raw_spin_unlock(&dl_b->lock); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); if (ret) break; @@ -7451,6 +7452,7 @@ static void sched_dl_do_global(void) { u64 new_bw = -1; int cpu; + unsigned long flags; def_dl_bandwidth.dl_period = global_rt_period(); def_dl_bandwidth.dl_runtime = global_rt_runtime(); @@ -7464,9 +7466,9 @@ static void sched_dl_do_global(void) for_each_possible_cpu(cpu) { struct dl_bw *dl_b = dl_bw_of(cpu); - raw_spin_lock(&dl_b->lock); + raw_spin_lock_irqsave(&dl_b->lock, flags); dl_b->bw = new_bw; - raw_spin_unlock(&dl_b->lock); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); } } -- cgit v1.1 From 3cf1962cdbf6b3a9e3ef21116d215bbab350ea37 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 18 Feb 2014 17:12:44 -0500 Subject: sched,numa: add cond_resched to task_numa_work Normally task_numa_work scans over a fairly small amount of memory, but it is possible to run into a large unpopulated part of virtual memory, with no pages mapped. In that case, task_numa_work can run for a while, and it may make sense to reschedule as required. Cc: akpm@linux-foundation.org Cc: Andrea Arcangeli Signed-off-by: Rik van Riel Reported-by: Xing Gang Tested-by: Chegu Vinod Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392761566-24834-2-git-send-email-riel@redhat.com Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 966cc2b..7815709 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1757,6 +1757,8 @@ void task_numa_work(struct callback_head *work) start = end; if (pages <= 0) goto out; + + cond_resched(); } while (end != vma->vm_end); } -- cgit v1.1 From 4efbc454ba68def5ef285b26ebfcfdb605b52755 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sun, 16 Feb 2014 22:24:17 +0100 Subject: sched: Fix information leak in sys_sched_getattr() We're copying the on-stack structure to userspace, but forgot to give the right number of bytes to copy. This allows the calling process to obtain up to PAGE_SIZE bytes from the stack (and possibly adjacent kernel memory). This fix copies only as much as we actually have on the stack (attr->size defaults to the size of the struct) and leaves the rest of the userspace-provided buffer untouched. Found using kmemcheck + trinity. Fixes: d50dde5a10f30 ("sched: Add new scheduler syscalls to support an extended scheduling parameters ABI") Cc: Dario Faggioli Cc: Juri Lelli Cc: Ingo Molnar Signed-off-by: Vegard Nossum Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392585857-10725-1-git-send-email-vegard.nossum@oracle.com Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 33d030a..a6e7470 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3786,7 +3786,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, attr->size = usize; } - ret = copy_to_user(uattr, attr, usize); + ret = copy_to_user(uattr, attr, attr->size); if (ret) return -EFAULT; -- cgit v1.1 From 6d35ab48090b10c5ea5604ed5d6e91f302dc6060 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Feb 2014 17:19:29 +0100 Subject: sched: Add 'flags' argument to sched_{set,get}attr() syscalls Because of a recent syscall design debate; its deemed appropriate for each syscall to have a flags argument for future extension; without immediately requiring new syscalls. Cc: juri.lelli@gmail.com Cc: Ingo Molnar Suggested-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140214161929.GL27965@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a6e7470..6edbef2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3661,13 +3661,14 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) * @pid: the pid in question. * @uattr: structure containing the extended parameters. */ -SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, flags) { struct sched_attr attr; struct task_struct *p; int retval; - if (!uattr || pid < 0) + if (!uattr || pid < 0 || flags) return -EINVAL; if (sched_copy_attr(uattr, &attr)) @@ -3804,8 +3805,8 @@ err_size: * @uattr: structure containing the extended parameters. * @size: sizeof(attr) for fwd/bwd comp. */ -SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, size) +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, size, unsigned int, flags) { struct sched_attr attr = { .size = sizeof(struct sched_attr), @@ -3814,7 +3815,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, int retval; if (!uattr || pid < 0 || size > PAGE_SIZE || - size < SCHED_ATTR_SIZE_VER0) + size < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; rcu_read_lock(); -- cgit v1.1 From 82b95800b256205cff2eeab5bbd03430d2d0f20d Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 17 Feb 2014 09:12:33 -0500 Subject: sched/deadline: Test for CPU's presence explicitly A hot-removed CPU may have ID that is numerically larger than the number of existing CPUs in the system (e.g. we can unplug CPU 4 from a system that has CPUs 0, 1 and 4). Thus the WARN_ONs should check whether the CPU in question is currently present, not whether its ID value is less than num_present_cpus(). Cc: Ingo Molnar Cc: Juri Lelli Cc: Steven Rostedt Reported-by: Konrad Rzeszutek Wilk Signed-off-by: Boris Ostrovsky Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392646353-1874-1-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Thomas Gleixner --- kernel/sched/cpudeadline.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 045fc74..5b8838b 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx) static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) { - WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID); + WARN_ON(!cpu_present(idx) || idx == IDX_INVALID); if (dl_time_before(new_dl, cp->elements[idx].dl)) { cp->elements[idx].dl = new_dl; @@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } out: - WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1); + WARN_ON(!cpu_present(best_cpu) && best_cpu != -1); return best_cpu; } @@ -137,7 +137,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) int old_idx, new_cpu; unsigned long flags; - WARN_ON(cpu > num_present_cpus()); + WARN_ON(!cpu_present(cpu)); raw_spin_lock_irqsave(&cp->lock, flags); old_idx = cp->cpu_to_idx[cpu]; -- cgit v1.1 From 995b9ea440862def83e8fcb1b498e68f93d4af59 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 18 Feb 2014 02:24:13 +0400 Subject: sched/deadline: Remove useless dl_nr_total In deadline class we do not have group scheduling like in RT. dl_nr_total is the same as dl_nr_running. So, one of them should be removed. Cc: Ingo Molnar Cc: Juri Lelli Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/368631392675853@web20h.yandex.ru Signed-off-by: Thomas Gleixner --- kernel/sched/deadline.c | 4 +--- kernel/sched/sched.h | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b819577..15cbc17 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -121,7 +121,7 @@ static inline void dl_clear_overload(struct rq *rq) static void update_dl_migration(struct dl_rq *dl_rq) { - if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) { + if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { if (!dl_rq->overloaded) { dl_set_overload(rq_of_dl_rq(dl_rq)); dl_rq->overloaded = 1; @@ -137,7 +137,6 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) struct task_struct *p = dl_task_of(dl_se); dl_rq = &rq_of_dl_rq(dl_rq)->dl; - dl_rq->dl_nr_total++; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory++; @@ -149,7 +148,6 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) struct task_struct *p = dl_task_of(dl_se); dl_rq = &rq_of_dl_rq(dl_rq)->dl; - dl_rq->dl_nr_total--; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory--; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c2119fd..f964add 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -462,7 +462,6 @@ struct dl_rq { } earliest_dl; unsigned long dl_nr_migratory; - unsigned long dl_nr_total; int overloaded; /* -- cgit v1.1 From c685689fd24d310343ac33942e9a54a974ae9c43 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Mon, 24 Feb 2014 11:29:50 +0800 Subject: genirq: Remove racy waitqueue_active check We hit one rare case below: T1 calling disable_irq(), but hanging at synchronize_irq() always; The corresponding irq thread is in sleeping state; And all CPUs are in idle state; After analysis, we found there is one possible scenerio which causes T1 is waiting there forever: CPU0 CPU1 synchronize_irq() wait_event() spin_lock() atomic_dec_and_test(&threads_active) insert the __wait into queue spin_unlock() if(waitqueue_active) atomic_read(&threads_active) wake_up() Here after inserted the __wait into queue on CPU0, and before test if queue is empty on CPU1, there is no barrier, it maybe cause it is not visible for CPU1 immediately, although CPU0 has updated the queue list. It is similar for CPU0 atomic_read() threads_active also. So we'd need one smp_mb() before waitqueue_active.that, but removing the waitqueue_active() check solves it as wel l and it makes things simple and clear. Signed-off-by: Chuansheng Liu Cc: Xiaoming Wang Link: http://lkml.kernel.org/r/1393212590-32543-1-git-send-email-chuansheng.liu@intel.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 481a13c..d3bf660 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -802,8 +802,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, static void wake_threads_waitq(struct irq_desc *desc) { - if (atomic_dec_and_test(&desc->threads_active) && - waitqueue_active(&desc->wait_for_threads)) + if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); } -- cgit v1.1 From 791c9e0292671a3bfa95286bb5c08129d8605618 Mon Sep 17 00:00:00 2001 From: George McCollister Date: Tue, 18 Feb 2014 17:56:51 -0600 Subject: sched: Fix double normalization of vruntime dequeue_entity() is called when p->on_rq and sets se->on_rq = 0 which appears to guarentee that the !se->on_rq condition is met. If the task has done set_current_state(TASK_INTERRUPTIBLE) without schedule() the second condition will be met and vruntime will be incorrectly adjusted twice. In certain cases this can result in the task's vruntime never increasing past the vruntime of other tasks on the CFS' run queue, starving them of CPU time. This patch changes switched_from_fair() to use !p->on_rq instead of !se->on_rq. I'm able to cause a task with a priority of 120 to starve all other tasks with the same priority on an ARM platform running 3.2.51-rt72 PREEMPT RT by writing one character at time to a serial tty (16550 UART) in a tight loop. I'm also able to verify making this change corrects the problem on that platform and kernel version. Signed-off-by: George McCollister Signed-off-by: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1392767811-28916-1-git-send-email-george.mccollister@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7815709..9b4c4f3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7001,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when its + * Ensure the task's vruntime is normalized, so that when it's * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it was on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it was !on_rq, then only when + * If it's on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !on_rq, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!se->on_rq && p->state != TASK_RUNNING) { + if (!p->on_rq && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. -- cgit v1.1 From 3908ac13b550c93f97d8db136ff572be5495bc06 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 25 Feb 2014 19:52:23 +0400 Subject: sched/deadline: Cleanup RT leftovers from {inc/dec}_dl_migration In deadline class we do not have group scheduling. So, let's remove unnecessary X = X; equations. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: Juri Lelli Link: http://lkml.kernel.org/r/1393343543.4089.5.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 15cbc17..aecf930 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -135,7 +135,6 @@ static void update_dl_migration(struct dl_rq *dl_rq) static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory++; @@ -146,7 +145,6 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory--; -- cgit v1.1 From eec751ed41a0ae7e92a43c33a458d7bd1b941631 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 24 Feb 2014 11:47:12 +0100 Subject: sched/deadline: Switch CPU's presence test order Commit 82b9580 ("sched/deadline: Test for CPU's presence explicitly") changed how we check if a CPU returned by cpudeadline machinery is valid. But, we don't want to call cpu_present() if best_cpu is equal to -1. So, switch the order of tests inside WARN_ON(). Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: boris.ostrovsky@oracle.com Cc: konrad.wilk@oracle.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1393238832-9100-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5b8838b..5b9bb42 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx) static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) { - WARN_ON(!cpu_present(idx) || idx == IDX_INVALID); + WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); if (dl_time_before(new_dl, cp->elements[idx].dl)) { cp->elements[idx].dl = new_dl; @@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } out: - WARN_ON(!cpu_present(best_cpu) && best_cpu != -1); + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); return best_cpu; } -- cgit v1.1 From faa5993736d9b44b508cab4f1f3a77d66641c6f4 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 21 Feb 2014 11:37:15 +0100 Subject: sched/deadline: Prevent rt_time growth to infinity Kirill Tkhai noted: Since deadline tasks share rt bandwidth, we must care about bandwidth timer set. Otherwise rt_time may grow up to infinity in update_curr_dl(), if there are no other available RT tasks on top level bandwidth. RT task were in fact throttled right after they got enqueued, and never executed again (rt_time never again went below rt_runtime). Peter then proposed to accrue DL execution on rt_time only when rt timer is active, and proposed a patch (this patch is a slight modification of that) to implement that behavior. While this solves Kirill problem, it has a drawback. Indeed, Kirill noted again: It looks we may get into a situation, when all CPU time is shared between RT and DL tasks: rt_runtime = n rt_period = 2n | RT working, DL sleeping | DL working, RT sleeping | ----------------------------------------------------------- | (1) duration = n | (2) duration = n | (repeat) |--------------------------|------------------------------| | (rt_bw timer is running) | (rt_bw timer is not running) | No time for fair tasks at all. While this can happen during the first period, if rq is always backlogged, RT tasks won't have the opportunity to execute anymore: rt_time reached rt_runtime during (1), suppose after (2) RT is enqueued back, it gets throttled since rt timer didn't fire, replenishment is from now on eaten up by DL tasks that accrue their execution on rt_time (while rt timer is active - we have an RT task waiting for replenishment). FAIR tasks are not touched after this first period. Ok, this is not ideal, and the situation is even worse! What above (the nice case), practically never happens in reality, where your rt timer is not aligned to tasks periods, tasks are in general not periodic, etc.. Long story short, you always risk to overload your system. This patch is based on Peter's idea, but exploits an additional fact: if you don't have RT tasks enqueued, it makes little sense to continue incrementing rt_time once you reached the upper limit (DL tasks have their own mechanism for throttling). This cures both problems: - no matter how many DL instances in the past, you'll have an rt_time slightly above rt_runtime when an RT task is enqueued, and from that point on (after the first replenishment), the task will normally execute; - you can still eat up all bandwidth during the first period, but not anymore after that, remember that DL execution will increment rt_time till the upper limit is reached. The situation is still not perfect! But, we have a simple solution for now, that limits how much you can jeopardize your system, as we keep working towards the right answer: RT groups scheduled using deadline servers. Reported-by: Kirill Tkhai Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140225151515.617714e2f2cd6c558531ba61@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 8 ++++++-- kernel/sched/rt.c | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index aecf930..6e79b3f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -562,6 +562,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) return 1; } +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + /* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). @@ -625,11 +627,13 @@ static void update_curr_dl(struct rq *rq) struct rt_rq *rt_rq = &rq->rt; raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; /* * We'll let actual RT tasks worry about the overflow here, we - * have our own CBS to keep us inline -- see above. + * have our own CBS to keep us inline; only account when RT + * bandwidth is relevant. */ + if (sched_rt_bandwidth_account(rt_rq)) + rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a2740b7..1999021 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ +bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + return (hrtimer_active(&rt_b->rt_period_timer) || + rt_rq->rt_time < rt_b->rt_runtime); +} + #ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. -- cgit v1.1 From e3703f8cdfcf39c25c4338c3ad8e68891cca3731 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2014 12:06:12 +0100 Subject: perf: Fix hotplug splat Drew Richardson reported that he could make the kernel go *boom* when hotplugging while having perf events active. It turned out that when you have a group event, the code in __perf_event_exit_context() fails to remove the group siblings from the context. We then proceed with destroying and freeing the event, and when you re-plug the CPU and try and add another event to that CPU, things go *boom* because you've still got dead entries there. Reported-by: Drew Richardson Signed-off-by: Peter Zijlstra Cc: Will Deacon Cc: Link: http://lkml.kernel.org/n/tip-k6v5wundvusvcseqj1si0oz0@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 56003c6..fa0b2d4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7856,14 +7856,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) static void __perf_event_exit_context(void *__info) { struct perf_event_context *ctx = __info; - struct perf_event *event, *tmp; + struct perf_event *event; perf_pmu_rotate_stop(ctx->pmu); - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - __perf_remove_from_context(event); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) __perf_remove_from_context(event); + rcu_read_unlock(); } static void perf_event_exit_cpu_context(int cpu) @@ -7887,11 +7887,11 @@ static void perf_event_exit_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + perf_event_exit_cpu_context(cpu); + mutex_lock(&swhash->hlist_mutex); swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); - - perf_event_exit_cpu_context(cpu); } #else static inline void perf_event_exit_cpu(int cpu) { } -- cgit v1.1 From 64be38ab03e9b238a1299857fef8b3707c0ed045 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 27 Feb 2014 17:10:12 +0530 Subject: genirq: Include missing header file in irqdomain.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include appropriate header file include/linux/of_irq.h in kernel/irq/irqdomain.c because it contains prototype definition of function define in kernel/irq/irqdomain.c. This eliminates the following warning in kernel/irq/irqdomain.c: kernel/irq/irqdomain.c:468:14: warning: no previous prototype for ‘irq_create_of_mapping’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Cc: Benjamin Herrenschmidt Link: http://lkml.kernel.org/r/eb89aebea7ff1a46122918ac389ebecf8248be9a.1393493276.git.rashika.kheria@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf68bb3..f140337 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include -- cgit v1.1 From 4729583006772b9530404bc1bb7c3aa4a10ffd4d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 27 Feb 2014 18:19:03 +0800 Subject: cpuset: fix a locking issue in cpuset_migrate_mm() I can trigger a lockdep warning: # mount -t cgroup -o cpuset xxx /cgroup # mkdir /cgroup/cpuset # mkdir /cgroup/tmp # echo 0 > /cgroup/tmp/cpuset.cpus # echo 0 > /cgroup/tmp/cpuset.mems # echo 1 > /cgroup/tmp/cpuset.memory_migrate # echo $$ > /cgroup/tmp/tasks # echo 1 > /cgruop/tmp/cpuset.mems =============================== [ INFO: suspicious RCU usage. ] 3.14.0-rc1-0.1-default+ #32 Not tainted ------------------------------- include/linux/cgroup.h:682 suspicious rcu_dereference_check() usage! ... [] dump_stack+0x72/0x86 [] lockdep_rcu_suspicious+0x101/0x140 [] cpuset_migrate_mm+0xb1/0xe0 ... We used to hold cgroup_mutex when calling cpuset_migrate_mm(), but now we hold cpuset_mutex, which causes task_css() to complain. This is not a false-positive but a real issue. Holding cpuset_mutex won't prevent a task from migrating to another cpuset, and it won't prevent the original task->cgroup from destroying during this change. Fixes: 5d21cc2db040 (cpuset: replace cgroup_mutex locking with cpuset internal locking) Cc: # 3.9+ Signed-off-by: Li Zefan Sigend-off-by: Tejun Heo --- kernel/cpuset.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4410ac6..dba9e4a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -974,12 +974,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Temporarilly set tasks mems_allowed to target nodes of migration, * so that the migration code can allocate pages on these nodes. * - * Call holding cpuset_mutex, so current's cpuset won't change - * during this call, as manage_mutex holds off any cpuset_attach() - * calls. Therefore we don't need to take task_lock around the - * call to guarantee_online_mems(), as we know no one is changing - * our task's cpuset. - * * While the mm_struct we are migrating is typically from some * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that @@ -996,8 +990,10 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + rcu_read_lock(); mems_cs = effective_nodemask_cpuset(task_cs(tsk)); guarantee_online_mems(mems_cs, &tsk->mems_allowed); + rcu_read_unlock(); } /* -- cgit v1.1 From 99afb0fd5f05aac467ffa85c36778fec4396209b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 27 Feb 2014 18:19:36 +0800 Subject: cpuset: fix a race condition in __cpuset_node_allowed_softwall() It's not safe to access task's cpuset after releasing task_lock(). Holding callback_mutex won't help. Cc: Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index dba9e4a..e6b1b66 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2482,9 +2482,9 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) task_lock(current); cs = nearest_hardwall_ancestor(task_cs(current)); + allowed = node_isset(node, cs->mems_allowed); task_unlock(current); - allowed = node_isset(node, cs->mems_allowed); mutex_unlock(&callback_mutex); return allowed; } -- cgit v1.1 From 45ab2813d40d88fc575e753c38478de242d03f88 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 26 Feb 2014 13:37:38 -0500 Subject: tracing: Do not add event files for modules that fail tracepoints If a module fails to add its tracepoints due to module tainting, do not create the module event infrastructure in the debugfs directory. As the events will not work and worse yet, they will silently fail, making the user wonder why the events they enable do not display anything. Having a warning on module load and the events not visible to the users will make the cause of the problem much clearer. Link: http://lkml.kernel.org/r/20140227154923.265882695@goodmis.org Fixes: 6d723736e472 "tracing/events: add support for modules to TRACE_EVENT" Acked-by: Mathieu Desnoyers Cc: stable@vger.kernel.org # 2.6.31+ Cc: Rusty Russell Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 10 ++++++++++ kernel/tracepoint.c | 7 ++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e71ffd4..f3989ce 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1777,6 +1777,16 @@ static void trace_module_add_events(struct module *mod) { struct ftrace_event_call **call, **start, **end; + if (!mod->num_trace_events) + return; + + /* Don't add infrastructure for mods without tracepoints */ + if (trace_module_has_bad_taint(mod)) { + pr_err("%s: module has bad taint, not creating trace events\n", + mod->name); + return; + } + start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 29f2654..031cc56 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -631,6 +631,11 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter) EXPORT_SYMBOL_GPL(tracepoint_iter_reset); #ifdef CONFIG_MODULES +bool trace_module_has_bad_taint(struct module *mod) +{ + return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)); +} + static int tracepoint_module_coming(struct module *mod) { struct tp_module *tp_mod, *iter; @@ -641,7 +646,7 @@ static int tracepoint_module_coming(struct module *mod) * module headers (for forced load), to make sure we don't cause a crash. * Staging and out-of-tree GPL modules are fine. */ - if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) + if (trace_module_has_bad_taint(mod)) return 0; mutex_lock(&tracepoints_mutex); tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); -- cgit v1.1 From e97ca8e5b864f88b028c1759ba8536fa827d6d96 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 10 Mar 2014 15:49:43 -0700 Subject: mm: fix GFP_THISNODE callers and clarify GFP_THISNODE is for callers that implement their own clever fallback to remote nodes. It restricts the allocation to the specified node and does not invoke reclaim, assuming that the caller will take care of it when the fallback fails, e.g. through a subsequent allocation request without GFP_THISNODE set. However, many current GFP_THISNODE users only want the node exclusive aspect of the flag, without actually implementing their own fallback or triggering reclaim if necessary. This results in things like page migration failing prematurely even when there is easily reclaimable memory available, unless kswapd happens to be running already or a concurrent allocation attempt triggers the necessary reclaim. Convert all callsites that don't implement their own fallback strategy to __GFP_THISNODE. This restricts the allocation a single node too, but at the same time allows the allocator to enter the slowpath, wake kswapd, and invoke direct reclaim if necessary, to make the allocation happen when memory is full. Signed-off-by: Johannes Weiner Acked-by: Rik van Riel Cc: Jan Stancek Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 6631e1e..ebdd9c1 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -549,14 +549,14 @@ static int create_hash_tables(void) struct page *page; page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; -- cgit v1.1 From d44753b843e093f9e1f2f14806fbe106fff74898 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 3 Mar 2014 12:09:21 +0100 Subject: sched/deadline: Deny unprivileged users to set/change SCHED_DEADLINE policy Deny the use of SCHED_DEADLINE policy to unprivileged users. Even if root users can set the policy for normal users, we don't want the latter to be able to change their parameters (safest behavior). Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1393844961-18097-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6edbef2..f5c6635 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3338,6 +3338,15 @@ recheck: return -EPERM; } + /* + * Can't set/change SCHED_DEADLINE policy at all for now + * (safest behavior); in the future we would like to allow + * unprivileged DL tasks to increase their relative deadline + * or reduce their runtime (both ways reducing utilization) + */ + if (dl_policy(policy)) + return -EPERM; + /* * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. -- cgit v1.1 From 177c53d943368fc97644ebc0a250dc8e2d124250 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Feb 2014 13:39:05 +0100 Subject: stop_machine: Fix^2 race between stop_two_cpus() and stop_cpus() We must use smp_call_function_single(.wait=1) for the irq_cpu_stop_queue_work() to ensure the queueing is actually done under stop_cpus_lock. Without this we could have dropped the lock by the time we do the queueing and get the race we tried to fix. Fixes: 7053ea1a34fa ("stop_machine: Fix race between stop_two_cpus() and stop_cpus()") Signed-off-by: Peter Zijlstra Cc: Prarit Bhargava Cc: Rik van Riel Cc: Mel Gorman Cc: Christoph Hellwig Cc: Andrew Morton Link: http://lkml.kernel.org/r/20140228123905.GK3104@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 84571e0..01fbae5 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -293,7 +293,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * */ smp_call_function_single(min(cpu1, cpu2), &irq_cpu_stop_queue_work, - &call_args, 0); + &call_args, 1); lg_local_unlock(&stop_cpus_lock); preempt_enable(); -- cgit v1.1 From 96b3d28bf4b00f62fc8386ff5d487d1830793a3d Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Thu, 6 Mar 2014 14:25:28 +0900 Subject: sched/clock: Prevent tracing recursion in sched_clock_cpu() Prevent tracing of preempt_disable/enable() in sched_clock_cpu(). When CONFIG_DEBUG_PREEMPT is enabled, preempt_disable/enable() are traced and this causes trace_clock() users (and probably others) to go into an infinite recursion. Systems with a stable sched_clock() are not affected. This problem is similar to that fixed by upstream commit 95ef1e52922 ("KVM guest: prevent tracing recursion with kvmclock"). Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1394083528.4524.3.camel@nexus Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 43c2bcc..b30a292 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -301,14 +301,14 @@ u64 sched_clock_cpu(int cpu) if (unlikely(!sched_clock_running)) return 0ull; - preempt_disable(); + preempt_disable_notrace(); scd = cpu_sdc(cpu); if (cpu != smp_processor_id()) clock = sched_clock_remote(scd); else clock = sched_clock_local(scd); - preempt_enable(); + preempt_enable_notrace(); return clock; } -- cgit v1.1