From d689fe222a858c767cb8594faf280048e532b53f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Nov 2013 21:01:57 +0100 Subject: NOHZ: Check for nohz active instead of nohz enabled RCU and the fine grained idle time accounting functions check tick_nohz_enabled. But that variable is merily telling that NOHZ has been enabled in the config and not been disabled on the command line. But it does not tell anything about nohz being active. That's what all this should check for. Matthew reported, that the idle accounting on his old P1 machine showed bogus values, when he enabled NOHZ in the config and did not disable it on the kernel command line. The reason is that his machine uses (refined) jiffies as a clocksource which explains why the "fine" grained accounting went into lala land, because it depends on when the system goes and leaves idle relative to the jiffies increment. Provide a tick_nohz_active indicator and let RCU and the accounting code use this instead of tick_nohz_enable. Reported-and-tested-by: Matthew Whitehead Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Paul E. McKenney Cc: john.stultz@linaro.org Cc: mwhitehe@redhat.com Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1311132052240.30673@ionos.tec.linutronix.de --- kernel/rcu/tree_plugin.h | 4 ++-- kernel/time/tick-sched.c | 21 +++++++++------------ 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6abb03d..08a7652 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1632,7 +1632,7 @@ module_param(rcu_idle_gp_delay, int, 0644); static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; module_param(rcu_idle_lazy_gp_delay, int, 0644); -extern int tick_nohz_enabled; +extern int tick_nohz_active; /* * Try to advance callbacks for all flavors of RCU on the current CPU, but @@ -1729,7 +1729,7 @@ static void rcu_prepare_for_idle(int cpu) int tne; /* Handle nohz enablement switches conservatively. */ - tne = ACCESS_ONCE(tick_nohz_enabled); + tne = ACCESS_ONCE(tick_nohz_active); if (tne != rdtp->tick_nohz_enabled_snap) { if (rcu_cpu_has_callbacks(cpu, NULL)) invoke_rcu_core(); /* force nohz to see update. */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3612fc7..a12df5a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -361,8 +361,8 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -int tick_nohz_enabled __read_mostly = 1; - +static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ @@ -465,7 +465,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, idle; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return -1; now = ktime_get(); @@ -506,7 +506,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, iowait; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return -1; now = ktime_get(); @@ -799,11 +799,6 @@ void tick_nohz_idle_enter(void) local_irq_disable(); ts = &__get_cpu_var(tick_cpu_sched); - /* - * set ts->inidle unconditionally. even if the system did not - * switch to nohz mode the cpu frequency governers rely on the - * update of the idle time accounting in tick_nohz_start_idle(). - */ ts->inidle = 1; __tick_nohz_idle_enter(ts); @@ -973,7 +968,7 @@ static void tick_nohz_switch_to_nohz(void) struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t next; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return; local_irq_disable(); @@ -981,7 +976,7 @@ static void tick_nohz_switch_to_nohz(void) local_irq_enable(); return; } - + tick_nohz_active = 1; ts->nohz_mode = NOHZ_MODE_LOWRES; /* @@ -1139,8 +1134,10 @@ void tick_setup_sched_timer(void) } #ifdef CONFIG_NO_HZ_COMMON - if (tick_nohz_enabled) + if (tick_nohz_enabled) { ts->nohz_mode = NOHZ_MODE_HIGHRES; + tick_nohz_active = 1; + } #endif } #endif /* HIGH_RES_TIMERS */ -- cgit v1.1 From da554eba2e68c8ec051977db5ee1f42d384a01ed Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 15 Nov 2013 14:15:31 -0800 Subject: timer: Convert kmalloc_node(...GFP_ZERO...) to kzalloc_node(...) Use the helper function instead of __GFP_ZERO. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/timer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 6582b82..accfd24 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1518,9 +1518,8 @@ static int init_timers_cpu(int cpu) /* * The APs use this path later in boot */ - base = kmalloc_node(sizeof(*base), - GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); + base = kzalloc_node(sizeof(*base), GFP_KERNEL, + cpu_to_node(cpu)); if (!base) return -ENOMEM; -- cgit v1.1 From 050ded1bbaea3331745cf2782315f5bc2582d083 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 15 Nov 2013 14:15:33 -0800 Subject: tick: Document tick_do_timer_cpu Taken straight from a tglx email ;) Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 64522ec..162b03a 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,6 +33,21 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; + +/* + * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR + * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This + * variable has two functions: + * + * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the + * timekeeping lock all at once. Only the CPU which is assigned to do the + * update is handling it. + * + * 2) Hand off the duty in the NOHZ idle case by setting the value to + * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks + * at it will take over and keep the time keeping alive. The handover + * procedure also covers cpu hotplug. + */ int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; /* -- cgit v1.1 From d5b5f391d434c5cc8bcb1ab2d759738797b85f52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Nov 2013 16:23:04 +0100 Subject: ftrace, perf: Avoid infinite event generation loop Vince's perf-trinity fuzzer found yet another 'interesting' problem. When we sample the irq_work_exit tracepoint with period==1 (or PERF_SAMPLE_PERIOD) and we add an fasync SIGNAL handler we create an infinite event generation loop: ,-> | irq_work_exit() -> | trace_irq_work_exit() -> | ... | __perf_event_overflow() -> (due to fasync) | irq_work_queue() -> (irq_work_list must be empty) '--------- arch_irq_work_raise() Similar things can happen due to regular poll() wakeups if we exceed the ring-buffer wakeup watermark, or have an event_limit. To avoid this, dis-allow sampling this particular tracepoint. In order to achieve this, create a special perf_perm function pointer for each event and call this (when set) on trying to create a tracepoint perf event. [ roasted: use expr... to allow for ',' in your expression ] Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Cc: Dave Jones Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20131114152304.GC5364@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 78e27e3..630889f 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -24,6 +24,12 @@ static int total_ref_count; static int perf_trace_event_perm(struct ftrace_event_call *tp_event, struct perf_event *p_event) { + if (tp_event->perf_perm) { + int ret = tp_event->perf_perm(tp_event, p_event); + if (ret) + return ret; + } + /* The ftrace function trace is allowed only for root. */ if (ftrace_event_is_function(tp_event) && perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) -- cgit v1.1 From 06db0b21712f878b808480ef31097637013bbf0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Sep 2013 13:14:47 +0200 Subject: perf: Remove fragile swevent hlist optimization Currently we only allocate a single cpu hashtable for per-cpu swevents; do away with this optimization for it is fragile in the face of things like perf_pmu_migrate_context(). The easiest thing is to make sure all CPUs are consistent wrt state. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130913111447.GN31370@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d724e77..72348dc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5680,11 +5680,6 @@ static void swevent_hlist_put(struct perf_event *event) { int cpu; - if (event->cpu != -1) { - swevent_hlist_put_cpu(event, event->cpu); - return; - } - for_each_possible_cpu(cpu) swevent_hlist_put_cpu(event, cpu); } @@ -5718,9 +5713,6 @@ static int swevent_hlist_get(struct perf_event *event) int err; int cpu, failed_cpu; - if (event->cpu != -1) - return swevent_hlist_get_cpu(event, event->cpu); - get_online_cpus(); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(event, cpu); -- cgit v1.1 From 0022cedd4a7d8a87841351e2b018bb6794cf2e67 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 15 Nov 2013 12:39:45 -0500 Subject: perf/trace: Properly use u64 to hold event_id The 64-bit attr.config value for perf trace events was being copied into an "int" before doing a comparison, meaning the top 32 bits were being truncated. As far as I can tell this didn't cause any errors, but it did mean it was possible to create valid aliases for all the tracepoint ids which I don't think was intended. (For example, 0xffffffff00000018 and 0x18 both enable the same tracepoint). Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1311151236100.11932@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 630889f..e854f42 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -179,7 +179,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, int perf_trace_init(struct perf_event *p_event) { struct ftrace_event_call *tp_event; - int event_id = p_event->attr.config; + u64 event_id = p_event->attr.config; int ret = -EINVAL; mutex_lock(&event_mutex); -- cgit v1.1 From 9abf24d465180f5f2eb26a43545348262f16b771 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 12 Nov 2013 22:11:26 +0530 Subject: sched: Check sched_domain before computing group power After commit 863bffc80898 ("sched/fair: Fix group power_orig computation"), we can dereference rq->sd before it is set. Fix this by falling back to power_of() in this case and add a comment explaining things. Signed-off-by: Srikar Dronamraju [ Added comment and tweaked patch. ] Signed-off-by: Peter Zijlstra Cc: mikey@neuling.org Link: http://lkml.kernel.org/r/20131113151718.GN21461@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e8b652e..fd773ad 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5379,10 +5379,31 @@ void update_group_power(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_cpus(sdg)) { - struct sched_group *sg = cpu_rq(cpu)->sd->groups; + struct sched_group_power *sgp; + struct rq *rq = cpu_rq(cpu); - power_orig += sg->sgp->power_orig; - power += sg->sgp->power; + /* + * build_sched_domains() -> init_sched_groups_power() + * gets here before we've attached the domains to the + * runqueues. + * + * Use power_of(), which is set irrespective of domains + * in update_cpu_power(). + * + * This avoids power/power_orig from being 0 and + * causing divide-by-zero issues on boot. + * + * Runtime updates will correct power_orig. + */ + if (unlikely(!rq->sd)) { + power_orig += power_of(cpu); + power += power_of(cpu); + continue; + } + + sgp = rq->sd->groups->sgp; + power_orig += sgp->power_orig; + power += sgp->power; } } else { /* -- cgit v1.1 From 42eb088ed246a5a817bb45a8b32fe234cf1c0f8b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Nov 2013 16:41:49 +0100 Subject: sched: Avoid NULL dereference on sd_busy Commit 37dc6b50cee9 ("sched: Remove unnecessary iteration over sched domains to update nr_busy_cpus") forgot to clear 'sd_busy' under some conditions leading to a possible NULL deref in set_cpu_sd_state_idle(). Reported-by: Anton Blanchard Cc: Preeti U Murthy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131118113701.GF3866@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c180860..a1591ca 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4910,8 +4910,9 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent); + sd = sd->parent; /* sd_busy */ } + rcu_assign_pointer(per_cpu(sd_busy, cpu), sd); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; -- cgit v1.1 From 0515973ffb16c2852a1bb1df2ca1456556faaaa5 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sun, 17 Nov 2013 12:12:36 +0900 Subject: sched: Fix a trivial typo in comments Fix a trivial typo in rq_attach_root(). Signed-off-by: Shigeru Yoshida Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131117.121236.1990617639803941055.shigeru.yoshida@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a1591ca..718730d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4762,7 +4762,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) cpumask_clear_cpu(rq->cpu, old_rd->span); /* - * If we dont want to free the old_rt yet then + * If we dont want to free the old_rd yet then * set old_rd to NULL to skip the freeing later * in this function: */ -- cgit v1.1 From 4be77398ac9d948773116b6be4a3c91b3d6ea18c Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 22 Nov 2013 11:44:51 -0800 Subject: time: Fix 1ns/tick drift w/ GENERIC_TIME_VSYSCALL_OLD Since commit 1e75fa8be9f (time: Condense timekeeper.xtime into xtime_sec - merged in v3.6), there has been an problem with the error accounting in the timekeeping code, such that when truncating to nanoseconds, we round up to the next nsec, but the balancing adjustment to the ntp_error value was dropped. This causes 1ns per tick drift forward of the clock. In 3.7, this logic was isolated to only GENERIC_TIME_VSYSCALL_OLD architectures (s390, ia64, powerpc). The fix is simply to balance the accounting and to subtract the added nanosecond from ntp_error. This allows the internal long-term clock steering to keep the clock accurate. While this fix removes the regression added in 1e75fa8be9f, the ideal solution is to move away from GENERIC_TIME_VSYSCALL_OLD and use the new VSYSCALL method, which avoids entirely the nanosecond granular rounding, and the resulting short-term clock adjustment oscillation needed to keep long term accurate time. [ jstultz: Many thanks to Martin for his efforts identifying this subtle bug, and providing the fix. ] Originally-from: Martin Schwidefsky Cc: Tony Luck Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Andy Lutomirski Cc: Paul Turner Cc: Steven Rostedt Cc: Richard Cochran Cc: Prarit Bhargava Cc: Fenghua Yu Cc: Thomas Gleixner Cc: stable #v3.6+ Link: http://lkml.kernel.org/r/1385149491-20307-1-git-send-email-john.stultz@linaro.org Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3abf534..87b4f00 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1347,7 +1347,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) tk->xtime_nsec -= remainder; tk->xtime_nsec += 1ULL << tk->shift; tk->ntp_error += remainder << tk->ntp_error_shift; - + tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; } #else #define old_vsyscall_fixup(tk) -- cgit v1.1 From ac01810c9d2814238f08a227062e66a35a0e1ea2 Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Mon, 25 Nov 2013 19:39:47 +0530 Subject: irq: Enable all irqs unconditionally in irq_resume When the system enters suspend, it disables all interrupts in suspend_device_irqs(), including the interrupts marked EARLY_RESUME. On the resume side things are different. The EARLY_RESUME interrupts are reenabled in sys_core_ops->resume and the non EARLY_RESUME interrupts are reenabled in the normal system resume path. When suspend_noirq() failed or suspend is aborted for any other reason, we might omit the resume side call to sys_core_ops->resume() and therefor the interrupts marked EARLY_RESUME are not reenabled and stay disabled forever. To solve this, enable all irqs unconditionally in irq_resume() regardless whether interrupts marked EARLY_RESUMEhave been already enabled or not. This might try to reenable already enabled interrupts in the non failure case, but the only affected platform is XEN and it has been confirmed that it does not cause any side effects. [ tglx: Massaged changelog. ] Signed-off-by: Laxman Dewangan Acked-by-and-tested-by: Konrad Rzeszutek Wilk Acked-by: Heiko Stuebner Reviewed-by: Pavel Machek Cc: Cc: Cc: Cc: Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1385388587-16442-1-git-send-email-ldewangan@nvidia.com Signed-off-by: Thomas Gleixner --- kernel/irq/pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cb228bf..abcd6ca 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -50,7 +50,7 @@ static void resume_irqs(bool want_early) bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME; - if (is_early != want_early) + if (!is_early && want_early) continue; raw_spin_lock_irqsave(&desc->lock, flags); -- cgit v1.1 From 12997d1a999cd1b22e21a238c96780f2a55e4e13 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 18 Nov 2013 11:00:29 -0700 Subject: Revert "workqueue: allow work_on_cpu() to be called recursively" This reverts commit c2fda509667b0fda4372a237f5a59ea4570b1627. c2fda509667b removed lockdep annotation from work_on_cpu() to work around the PCI path that calls work_on_cpu() from within a work_on_cpu() work item (PF driver .probe() method -> pci_enable_sriov() -> add VFs -> VF driver .probe method). 961da7fb6b22 ("PCI: Avoid unnecessary CPU switch when calling driver .probe() method) avoids that recursive work_on_cpu() use in a different way, so this revert restores the work_on_cpu() lockdep annotation. Signed-off-by: Bjorn Helgaas Acked-by: Tejun Heo --- kernel/workqueue.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 987293d..5690b8e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2840,19 +2840,6 @@ already_gone: return false; } -static bool __flush_work(struct work_struct *work) -{ - struct wq_barrier barr; - - if (start_flush_work(work, &barr)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else { - return false; - } -} - /** * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush @@ -2866,10 +2853,18 @@ static bool __flush_work(struct work_struct *work) */ bool flush_work(struct work_struct *work) { + struct wq_barrier barr; + lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - return __flush_work(work); + if (start_flush_work(work, &barr)) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + return true; + } else { + return false; + } } EXPORT_SYMBOL_GPL(flush_work); @@ -4814,14 +4809,7 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); schedule_work_on(cpu, &wfc.work); - - /* - * The work item is on-stack and can't lead to deadlock through - * flushing. Use __flush_work() to avoid spurious lockdep warnings - * when work_on_cpu()s are nested. - */ - __flush_work(&wfc.work); - + flush_work(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); -- cgit v1.1 From 32e475d76a3e40879cd9ee4f69b19615062280d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Nov 2013 12:41:44 +0100 Subject: sched: Expose preempt_schedule_irq() Tony reported that aa0d53260596 ("ia64: Use preempt_schedule_irq") broke PREEMPT=n builds on ia64. Ok, wrapped my brain around it. I tripped over the magic asm foo which has a single need_resched check and schedule point for both sys call return and interrupt return. So you need the schedule_preempt_irq() for kernel preemption from interrupt return while on a normal syscall preemption a schedule would be sufficient. But using schedule_preempt_irq() is not harmful here in any way. It just sets the preempt_active bit also in cases where it would not be required. Even on preempt=n kernels adding the preempt_active bit is completely harmless. So instead of having an extra function, moving the existing one out of the ifdef PREEMPT looks like the sanest thing to do. It would also allow getting rid of various other sti/schedule/cli asm magic in other archs. Reported-and-Tested-by: Tony Luck Fixes: aa0d53260596 ("ia64: Use preempt_schedule_irq") Signed-off-by: Thomas Gleixner [slightly edited Changelog] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1311211230030.30673@ionos.tec.linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 718730d..e85cda2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2660,6 +2660,7 @@ asmlinkage void __sched notrace preempt_schedule(void) } while (need_resched()); } EXPORT_SYMBOL(preempt_schedule); +#endif /* CONFIG_PREEMPT */ /* * this is the entry point to schedule() from kernel preemption @@ -2693,8 +2694,6 @@ asmlinkage void __sched preempt_schedule_irq(void) exception_exit(prev_state); } -#endif /* CONFIG_PREEMPT */ - int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { -- cgit v1.1 From 0e576acbc1d9600cf2d9b4a141a2554639959d50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Nov 2013 12:18:13 +0100 Subject: nohz: Fix another inconsistency between CONFIG_NO_HZ=n and nohz=off If CONFIG_NO_HZ=n tick_nohz_get_sleep_length() returns NSEC_PER_SEC/HZ. If CONFIG_NO_HZ=y and the nohz functionality is disabled via the command line option "nohz=off" or not enabled due to missing hardware support, then tick_nohz_get_sleep_length() returns 0. That happens because ts->sleep_length is never set in that case. Set it to NSEC_PER_SEC/HZ when the NOHZ mode is inactive. Reported-by: Michal Hocko Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a12df5a..ea20f7d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -711,8 +711,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { + ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ }; return false; + } if (need_resched()) return false; -- cgit v1.1 From 3ccb01239201af06a07482ec686b14cd148102a5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Dec 2013 12:41:20 -0500 Subject: tracing: Only run synchronize_sched() at instance deletion time It has been reported that boot up with FTRACE_SELFTEST enabled can take a very long time. There can be stalls of over a minute. This was tracked down to the synchronize_sched() called when a system call event is disabled. As the self tests enable and disable thousands of events, this makes the synchronize_sched() get called thousands of times. The synchornize_sched() was added with d562aff93bfb53 "tracing: Add support for SOFT_DISABLE to syscall events" which caused this regression (added in 3.13-rc1). The synchronize_sched() is to protect against the events being accessed when a tracer instance is being deleted. When an instance is being deleted all the events associated to it are unregistered. The synchronize_sched() makes sure that no more users are running when it finishes. Instead of calling synchronize_sched() for all syscall events, we only need to call it once, after the events are unregistered and before the instance is deleted. The event_mutex is held during this action to prevent new users from enabling events. Link: http://lkml.kernel.org/r/20131203124120.427b9661@gandalf.local.home Reported-by: Petr Mladek Acked-by: Tom Zanussi Acked-by: Petr Mladek Tested-by: Petr Mladek Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 3 +++ kernel/trace/trace_syscalls.c | 10 ---------- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f919a2e..a11800a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2314,6 +2314,9 @@ int event_trace_del_tracer(struct trace_array *tr) /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + /* Access to events are within rcu_read_lock_sched() */ + synchronize_sched(); + down_write(&trace_event_sem); __trace_remove_event_dirs(tr); debugfs_remove_recursive(tr->event_dir); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index e4b6d11..ea90eb5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -431,11 +431,6 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); - /* - * Callers expect the event to be completely disabled on - * return, so wait for current handlers to finish. - */ - synchronize_sched(); } static int reg_event_syscall_exit(struct ftrace_event_file *file, @@ -474,11 +469,6 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, if (!tr->sys_refcount_exit) unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); - /* - * Callers expect the event to be completely disabled on - * return, so wait for current handlers to finish. - */ - synchronize_sched(); } static int __init init_syscall_trace(struct ftrace_event_call *call) -- cgit v1.1 From 4fc9bbf98fd66f879e628d8537ba7c240be2b58e Mon Sep 17 00:00:00 2001 From: Khalid Aziz Date: Wed, 27 Nov 2013 15:19:25 -0700 Subject: PCI: Disable Bus Master only on kexec reboot Add a flag to tell the PCI subsystem that kernel is shutting down in preparation to kexec a kernel. Add code in PCI subsystem to use this flag to clear Bus Master bit on PCI devices only in case of kexec reboot. This fixes a power-off problem on Acer Aspire V5-573G and likely other machines and avoids any other issues caused by clearing Bus Master bit on PCI devices in normal shutdown path. The problem was introduced by b566a22c2332 ("PCI: disable Bus Master on PCI device shutdown"). This patch is based on discussion at http://marc.info/?l=linux-pci&m=138425645204355&w=2 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63861 Reported-by: Chang Liu Signed-off-by: Khalid Aziz Signed-off-by: Bjorn Helgaas Acked-by: Konstantin Khlebnikov Cc: stable@vger.kernel.org # v3.5+ --- kernel/kexec.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 490afc0..d0d8fca 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -47,6 +47,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; size_t vmcoreinfo_size; size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); +/* Flag to indicate we are going to kexec a new kernel */ +bool kexec_in_progress = false; + /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", @@ -1675,6 +1678,7 @@ int kernel_kexec(void) } else #endif { + kexec_in_progress = true; kernel_restart_prepare(NULL); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); -- cgit v1.1 From 7cfe5b3310a1b45f385ff18647bddb487a6c5525 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 10 Dec 2013 17:42:50 +1030 Subject: Ignore generated file kernel/x509_certificate_list $ git status # On branch pending-rebases # Untracked files: # (use "git add ..." to include in what will be committed) # # kernel/x509_certificate_list nothing added to commit but untracked files present (use "git add" to track) $ Signed-off-by: Rusty Russell Signed-off-by: David Howells --- kernel/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/.gitignore b/kernel/.gitignore index b3097bd..790d83c 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -5,3 +5,4 @@ config_data.h config_data.gz timeconst.h hz.bc +x509_certificate_list -- cgit v1.1 From 62226983da070f7e51068ec2e3a4da34672964c7 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Thu, 5 Dec 2013 14:48:22 +0100 Subject: KEYS: correct alignment of system_certificate_list content in assembly file Apart from data-type specific alignment constraints, there are also architecture-specific alignment requirements. For example, on s390 symbols must be on even addresses implying a 2-byte alignment. If the system_certificate_list_end symbol is on an odd address and if this address is loaded, the least-significant bit is ignored. As a result, the load_system_certificate_list() fails to load the certificates because of a wrong certificate length calculation. To be safe, align system_certificate_list on an 8-byte boundary. Also improve the length calculation of the system_certificate_list content. Introduce a system_certificate_list_size (8-byte aligned because of unsigned long) variable that stores the length. Let the linker calculate this size by introducing a start and end label for the certificate content. Signed-off-by: Hendrik Brueckner Signed-off-by: David Howells --- kernel/system_certificates.S | 14 ++++++++++++-- kernel/system_keyring.c | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S index 4aef390..3e9868d47 100644 --- a/kernel/system_certificates.S +++ b/kernel/system_certificates.S @@ -3,8 +3,18 @@ __INITRODATA + .align 8 .globl VMLINUX_SYMBOL(system_certificate_list) VMLINUX_SYMBOL(system_certificate_list): +__cert_list_start: .incbin "kernel/x509_certificate_list" - .globl VMLINUX_SYMBOL(system_certificate_list_end) -VMLINUX_SYMBOL(system_certificate_list_end): +__cert_list_end: + + .align 8 + .globl VMLINUX_SYMBOL(system_certificate_list_size) +VMLINUX_SYMBOL(system_certificate_list_size): +#ifdef CONFIG_64BIT + .quad __cert_list_end - __cert_list_start +#else + .long __cert_list_end - __cert_list_start +#endif diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c index 564dd93..52ebc70 100644 --- a/kernel/system_keyring.c +++ b/kernel/system_keyring.c @@ -22,7 +22,7 @@ struct key *system_trusted_keyring; EXPORT_SYMBOL_GPL(system_trusted_keyring); extern __initconst const u8 system_certificate_list[]; -extern __initconst const u8 system_certificate_list_end[]; +extern __initconst const unsigned long system_certificate_list_size; /* * Load the compiled-in keys @@ -60,8 +60,8 @@ static __init int load_system_certificate_list(void) pr_notice("Loading compiled-in X.509 certificates\n"); - end = system_certificate_list_end; p = system_certificate_list; + end = p + system_certificate_list_size; while (p < end) { /* Each cert begins with an ASN.1 SEQUENCE tag and must be more * than 256 bytes in size. -- cgit v1.1 From f12d5bfceb7e1f9051563381ec047f7f13956c3c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 12 Dec 2013 09:38:42 -0800 Subject: futex: fix handling of read-only-mapped hugepages The hugepage code had the exact same bug that regular pages had in commit 7485d0d3758e ("futexes: Remove rw parameter from get_futex_key()"). The regular page case was fixed by commit 9ea71503a8ed ("futex: Fix regression with read only mappings"), but the transparent hugepage case (added in a5b338f2b0b1: "thp: update futex compound knowledge") case remained broken. Found by Dave Jones and his trinity tool. Reported-and-tested-by: Dave Jones Cc: stable@kernel.org # v2.6.38+ Acked-by: Thomas Gleixner Cc: Mel Gorman Cc: Darren Hart Cc: Andrea Arcangeli Cc: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 80ba086..02febad 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -288,7 +288,7 @@ again: put_page(page); /* serialize against __split_huge_page_splitting() */ local_irq_disable(); - if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { + if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { page_head = compound_head(page); /* * page_head is valid pointer but we must pin -- cgit v1.1 From 5cdec2d833748fbd27d3682f7209225c504c79c5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 12 Dec 2013 09:53:51 -0800 Subject: futex: move user address verification up to common code When debugging the read-only hugepage case, I was confused by the fact that get_futex_key() did an access_ok() only for the non-shared futex case, since the user address checking really isn't in any way specific to the private key handling. Now, it turns out that the shared key handling does effectively do the equivalent checks inside get_user_pages_fast() (it doesn't actually check the address range on x86, but does check the page protections for being a user page). So it wasn't actually a bug, but the fact that we treat the address differently for private and shared futexes threw me for a loop. Just move the check up, so that it gets done for both cases. Also, use the 'rw' parameter for the type, even if it doesn't actually matter any more (it's a historical artifact of the old racy i386 "page faults from kernel space don't check write protections"). Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/futex.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 02febad..f6ff019 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -251,6 +251,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) return -EINVAL; address -= key->both.offset; + if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) + return -EFAULT; + /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs @@ -259,8 +262,6 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) * but access_ok() should be faster than find_vma() */ if (!fshared) { - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) - return -EFAULT; key->private.mm = mm; key->private.address = address; get_futex_key_refs(key); -- cgit v1.1 From c606850407d9096415e226c75a871d0650404446 Mon Sep 17 00:00:00 2001 From: Masami Ichikawa Date: Thu, 19 Dec 2013 20:00:47 +0900 Subject: PM / sleep: Fix memory leak in pm_vt_switch_unregister(). kmemleak reported a memory leak as below. unreferenced object 0xffff880118f14700 (size 32): comm "swapper/0", pid 1, jiffies 4294877401 (age 123.283s) hex dump (first 32 bytes): 00 01 10 00 00 00 ad de 00 02 20 00 00 00 ad de .......... ..... 00 d4 d2 18 01 88 ff ff 01 00 00 00 00 04 00 00 ................ backtrace: [] kmemleak_alloc+0x4e/0xb0 [] kmem_cache_alloc_trace+0x1ec/0x260 [] pm_vt_switch_required+0x76/0xb0 [] register_framebuffer+0x195/0x320 [] efifb_probe+0x718/0x780 [] platform_drv_probe+0x45/0xb0 [] driver_probe_device+0x87/0x3a0 [] __driver_attach+0x93/0xa0 [] bus_for_each_dev+0x63/0xa0 [] driver_attach+0x1e/0x20 [] bus_add_driver+0x180/0x250 [] driver_register+0x64/0xf0 [] __platform_driver_register+0x4a/0x50 [] efifb_driver_init+0x12/0x14 [] do_one_initcall+0xfa/0x1b0 [] kernel_init_freeable+0x17b/0x201 In pm_vt_switch_required(), "entry" variable is allocated via kmalloc(). So, in pm_vt_switch_unregister(), it needs to call kfree() when object is deleted from list. Signed-off-by: Masami Ichikawa Reviewed-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/console.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/console.c b/kernel/power/console.c index 463aa673..eacb8bd 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -81,6 +81,7 @@ void pm_vt_switch_unregister(struct device *dev) list_for_each_entry(tmp, &pm_vt_switch_list, head) { if (tmp->dev == dev) { list_del(&tmp->head); + kfree(tmp); break; } } -- cgit v1.1