From 2a01bb3885c9145dbb7583d5aa5f5d5504f6f46f Mon Sep 17 00:00:00 2001 From: Kyle McMartin Date: Wed, 11 Apr 2012 08:15:29 -0400 Subject: panic: Make panic_on_oops configurable Several distros set this by default by patching panic_on_oops. It seems to fit with the BOOTPARAM_{HARD,SOFT}_PANIC options though, so let's add a Kconfig entry and reduce some more upstream delta. Signed-off-by: Kyle McMartin Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120411121529.GH26688@redacted.bos.redhat.com Signed-off-by: Ingo Molnar --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 8ed89a1..b6215b7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -27,7 +27,7 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 -int panic_on_oops; +int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; -- cgit v1.1 From 62be73eafaa045d3233337303fb140f7f8a61135 Mon Sep 17 00:00:00 2001 From: Seiji Aguchi Date: Tue, 15 May 2012 17:35:09 -0400 Subject: kdump: Execute kmsg_dump(KMSG_DUMP_PANIC) after smp_send_stop() This patch moves kmsg_dump(KMSG_DUMP_PANIC) below smp_send_stop(), to serialize the crash-logging process via smp_send_stop() and to thus retrieve a more stable crash image of all CPUs stopped. Signed-off-by: Seiji Aguchi Acked-by: Don Zickus Cc: dle-develop@lists.sourceforge.net Cc: Satoru Moriya Cc: Tony Luck Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/5C4C569E8A4B9B42A84A977CF070A35B2E4D7A5CE2@USINDEVS01.corp.hds.com Signed-off-by: Ingo Molnar --- kernel/panic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index b6215b7..d2a5f4e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -108,8 +108,6 @@ void panic(const char *fmt, ...) */ crash_kexec(NULL); - kmsg_dump(KMSG_DUMP_PANIC); - /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic @@ -117,6 +115,8 @@ void panic(const char *fmt, ...) */ smp_send_stop(); + kmsg_dump(KMSG_DUMP_PANIC); + atomic_notifier_call_chain(&panic_notifier_list, 0, buf); bust_spinlocks(0); -- cgit v1.1 From 967db0ea65b0bf8507a7643ac8f296c4f2c0a834 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Wed, 6 Jun 2012 18:51:35 -0700 Subject: cgroup: make sure that decisions in __css_put are atomic __css_put is using atomic_dec on the ref count, and then looking at the ref count to make decisions. This is prone to races, as someone else may decrement ref count between our decrement and our decision. Instead, we should base our decisions on the value that we decremented the ref count to. (This results in an actual race on Google's kernel which I haven't been able to reproduce on the upstream kernel. Having said that, it's still incorrect by inspection). Signed-off-by: Salman Qazi Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 72fcd30..ceeafe8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4984,8 +4984,7 @@ void __css_put(struct cgroup_subsys_state *css) struct cgroup *cgrp = css->cgroup; rcu_read_lock(); - atomic_dec(&css->refcnt); - switch (css_refcnt(css)) { + switch (atomic_dec_return(&css->refcnt)) { case 1: if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); -- cgit v1.1 From f2bf1f6f5f89d031245067512449fc889b2f4bb2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Jun 2012 19:50:40 -0400 Subject: tracing: Have tracing_off() actually turn tracing off A recent update to have tracing_on/off() only affect the ftrace ring buffers instead of all ring buffers had a cut and paste error. The tracing_off() did the exact same thing as tracing_on() and would not actually turn off tracing. Unfortunately, tracing_off() is more important to be working than tracing_on() as this is a key development tool, as it lets the developer turn off tracing as soon as a problem is discovered. It is also used by panic and oops code. This bug also breaks the 'echo func:traceoff > set_ftrace_filter' Cc: # 3.4 Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 68032c6..49249c2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on); void tracing_off(void) { if (global_trace.buffer) - ring_buffer_record_on(global_trace.buffer); + ring_buffer_record_off(global_trace.buffer); /* * This flag is only looked at when buffers haven't been * allocated yet. We don't really care about the race -- cgit v1.1 From 8f5af6f1f2d09fe5eac86a5dc1731a5917c1503a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 May 2012 08:31:53 -0700 Subject: rcu: RCU_FAST_NO_HZ detection of callback adoption In the present implementations of CPU hotplug, the outgoing CPU is guaranteed to run its stop-machine process on the way out, which will guarantee that RCU_FAST_NO_HZ forces the CPU out of dyntick-idle mode. However, new versions of CPU hotplug might not work this way. This commit therefore removes this design constraint by explicitly notifying CPUs when they adopt non-lazy RCU callbacks. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0da7b88..3b0f133 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1397,6 +1397,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) rdp->qlen_lazy += rsp->qlen_lazy; rdp->qlen += rsp->qlen; rdp->n_cbs_adopted += rsp->qlen; + if (rsp->qlen_lazy != rsp->qlen) + rcu_idle_count_callbacks_posted(); rsp->qlen_lazy = 0; rsp->qlen = 0; -- cgit v1.1 From fd4b352687fd8604d49c190c4c9ea9e369fd42d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 5 May 2012 19:10:35 -0700 Subject: rcu: Update RCU_FAST_NO_HZ tracing for lazy callbacks In the current code, a short dyntick-idle interval (where there is at least one non-lazy callback on the CPU) and a long dyntick-idle interval (where there are only lazy callbacks on the CPU) are traced identically, which can be less than helpful. This commit therefore emits different event traces in these two cases. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutree_plugin.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 2411000..5449f02 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2165,15 +2165,17 @@ static void rcu_prepare_for_idle(int cpu) !rcu_pending(cpu) && !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? */ - trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies + RCU_IDLE_GP_DELAY; - else + } else { per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies + RCU_IDLE_LAZY_GP_DELAY; + trace_rcu_prep_idle("Dyntick with lazy callbacks"); + } tp = &per_cpu(rcu_idle_gp_timer, cpu); mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); per_cpu(rcu_nonlazy_posted_snap, cpu) = -- cgit v1.1 From 5955f7eecd77d6b440db278b266cfecdb72ecd00 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2012 12:07:05 -0700 Subject: rcu: Move RCU_FAST_NO_HZ per-CPU variables to rcu_dynticks structure The RCU_FAST_NO_HZ code relies on a number of per-CPU variables. This works, but is hidden from someone scanning the data structures in rcutree.h. This commit therefore converts these per-CPU variables to fields in the per-CPU rcu_dynticks structures. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutree.h | 14 +++++++ kernel/rcutree_plugin.h | 99 ++++++++++++++++++++++--------------------------- 2 files changed, 58 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7f5d138..ea05649 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -84,6 +84,20 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ +#ifdef CONFIG_RCU_FAST_NO_HZ + int dyntick_drain; /* Prepare-for-idle state variable. */ + unsigned long dyntick_holdoff; + /* No retries for the jiffy of failure. */ + struct timer_list idle_gp_timer; + /* Wake up CPU sleeping with callbacks. */ + unsigned long idle_gp_timer_expires; + /* When to wake up CPU (for repost). */ + bool idle_first_pass; /* First pass of attempt to go idle? */ + unsigned long nonlazy_posted; + /* # times non-lazy CBs posted to CPU. */ + unsigned long nonlazy_posted_snap; + /* idle-period nonlazy_posted snapshot. */ +#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; /* RCU's kthread states for tracing. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5449f02..6bd9637 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1962,21 +1962,6 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -/* Loop counter for rcu_prepare_for_idle(). */ -static DEFINE_PER_CPU(int, rcu_dyntick_drain); -/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ -static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); -/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ -static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); -/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ -static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); -/* Enable special processing on first attempt to enter dyntick-idle mode. */ -static DEFINE_PER_CPU(bool, rcu_idle_first_pass); -/* Running count of non-lazy callbacks posted, never decremented. */ -static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); -/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ -static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); - /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no * callbacks on this CPU, (2) this CPU has not yet attempted to enter @@ -1988,13 +1973,15 @@ static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); */ int rcu_needs_cpu(int cpu) { + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + /* Flag a new idle sojourn to the idle-entry state machine. */ - per_cpu(rcu_idle_first_pass, cpu) = 1; + rdtp->idle_first_pass = 1; /* If no callbacks, RCU doesn't need the CPU. */ if (!rcu_cpu_has_callbacks(cpu)) return 0; /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ - return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; + return rdtp->dyntick_holdoff == jiffies; } /* @@ -2075,21 +2062,24 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in) */ static void rcu_prepare_for_idle_init(int cpu) { - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_timer_func, cpu); - per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; - per_cpu(rcu_idle_first_pass, cpu) = 1; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + rdtp->dyntick_holdoff = jiffies - 1; + setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); + rdtp->idle_gp_timer_expires = jiffies - 1; + rdtp->idle_first_pass = 1; } /* * Clean up for exit from idle. Because we are exiting from idle, there - * is no longer any point to rcu_idle_gp_timer, so cancel it. This will + * is no longer any point to ->idle_gp_timer, so cancel it. This will * do nothing if this timer is not active, so just cancel it unconditionally. */ static void rcu_cleanup_after_idle(int cpu) { - del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + del_timer(&rdtp->idle_gp_timer); trace_rcu_prep_idle("Cleanup after idle"); } @@ -2108,42 +2098,41 @@ static void rcu_cleanup_after_idle(int cpu) * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked - * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. + * later. The ->dyntick_drain field controls the sequencing. * * The caller must have disabled interrupts. */ static void rcu_prepare_for_idle(int cpu) { struct timer_list *tp; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); /* * If this is an idle re-entry, for example, due to use of * RCU_NONIDLE() or the new idle-loop tracing API within the idle * loop, then don't take any state-machine actions, unless the * momentary exit from idle queued additional non-lazy callbacks. - * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks + * Instead, repost the ->idle_gp_timer if this CPU has callbacks * pending. */ - if (!per_cpu(rcu_idle_first_pass, cpu) && - (per_cpu(rcu_nonlazy_posted, cpu) == - per_cpu(rcu_nonlazy_posted_snap, cpu))) { + if (!rdtp->idle_first_pass && + (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { if (rcu_cpu_has_callbacks(cpu)) { - tp = &per_cpu(rcu_idle_gp_timer, cpu); - mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); + tp = &rdtp->idle_gp_timer; + mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); } return; } - per_cpu(rcu_idle_first_pass, cpu) = 0; - per_cpu(rcu_nonlazy_posted_snap, cpu) = - per_cpu(rcu_nonlazy_posted, cpu) - 1; + rdtp->idle_first_pass = 0; + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; /* * If there are no callbacks on this CPU, enter dyntick-idle mode. * Also reset state to avoid prejudicing later attempts. */ if (!rcu_cpu_has_callbacks(cpu)) { - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - per_cpu(rcu_dyntick_drain, cpu) = 0; + rdtp->dyntick_holdoff = jiffies - 1; + rdtp->dyntick_drain = 0; trace_rcu_prep_idle("No callbacks"); return; } @@ -2152,38 +2141,37 @@ static void rcu_prepare_for_idle(int cpu) * If in holdoff mode, just return. We will presumably have * refrained from disabling the scheduling-clock tick. */ - if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { + if (rdtp->dyntick_holdoff == jiffies) { trace_rcu_prep_idle("In holdoff"); return; } - /* Check and update the rcu_dyntick_drain sequencing. */ - if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { + /* Check and update the ->dyntick_drain sequencing. */ + if (rdtp->dyntick_drain <= 0) { /* First time through, initialize the counter. */ - per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; - } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && + rdtp->dyntick_drain = RCU_IDLE_FLUSHES; + } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && !rcu_pending(cpu) && !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? */ - per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + rdtp->dyntick_drain = 0; + rdtp->dyntick_holdoff = jiffies; if (rcu_cpu_has_nonlazy_callbacks(cpu)) { trace_rcu_prep_idle("Dyntick with callbacks"); - per_cpu(rcu_idle_gp_timer_expires, cpu) = + rdtp->idle_gp_timer_expires = jiffies + RCU_IDLE_GP_DELAY; } else { - per_cpu(rcu_idle_gp_timer_expires, cpu) = + rdtp->idle_gp_timer_expires = jiffies + RCU_IDLE_LAZY_GP_DELAY; trace_rcu_prep_idle("Dyntick with lazy callbacks"); } - tp = &per_cpu(rcu_idle_gp_timer, cpu); - mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); - per_cpu(rcu_nonlazy_posted_snap, cpu) = - per_cpu(rcu_nonlazy_posted, cpu); + tp = &rdtp->idle_gp_timer; + mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; return; /* Nothing more to do immediately. */ - } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { + } else if (--(rdtp->dyntick_drain) <= 0) { /* We have hit the limit, so time to give up. */ - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + rdtp->dyntick_holdoff = jiffies; trace_rcu_prep_idle("Begin holdoff"); invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ return; @@ -2229,7 +2217,7 @@ static void rcu_prepare_for_idle(int cpu) */ static void rcu_idle_count_callbacks_posted(void) { - __this_cpu_add(rcu_nonlazy_posted, 1); + __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); } #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ @@ -2240,11 +2228,12 @@ static void rcu_idle_count_callbacks_posted(void) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { - struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct timer_list *tltp = &rdtp->idle_gp_timer; sprintf(cp, "drain=%d %c timer=%lu", - per_cpu(rcu_dyntick_drain, cpu), - per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', + rdtp->dyntick_drain, + rdtp->dyntick_holdoff == jiffies ? 'H' : '.', timer_pending(tltp) ? tltp->expires - jiffies : -1); } -- cgit v1.1 From aa9b16306e3243229580ff889cc59fd66bf77973 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 May 2012 16:41:44 -0700 Subject: rcu: Precompute RCU_FAST_NO_HZ timer offsets When a CPU is entering dyntick-idle mode, tick_nohz_stop_sched_tick() calls rcu_needs_cpu() see if RCU needs that CPU, and, if not, computes the next wakeup time based on the timer wheels. Only later, when actually entering the idle loop, rcu_prepare_for_idle() will be invoked. In some cases, rcu_prepare_for_idle() will post timers to wake the CPU back up. But all for naught: The next wakeup time for the CPU has already been computed, and posting a timer afterwards does not force that wakeup time to be recomputed. This means that rcu_prepare_for_idle()'s have no effect. This is not a problem on a busy system because something else will wake up the CPU soon enough. However, on lightly loaded systems, the CPU might stay asleep for a considerable length of time. If that CPU has a callback that the rest of the system is waiting on, the system might run very slowly or (in theory) even hang. This commit avoids this problem by having rcu_needs_cpu() give tick_nohz_stop_sched_tick() an estimate of when RCU will need the CPU to wake back up, which tick_nohz_stop_sched_tick() takes into account when programming the CPU's wakeup time. An alternative approach is for rcu_prepare_for_idle() to use hrtimers instead of normal timers, but timers are much more efficient than are hrtimers for frequently and repeatedly posting and cancelling a given timer, which is exactly what RCU_FAST_NO_HZ does. Reported-by: Pascal Chapperon Reported-by: Heiko Carstens Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutree_plugin.h | 66 +++++++++++++++++++++++++++++++----------------- kernel/time/tick-sched.c | 7 ++++- 2 files changed, 49 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6bd9637..5271a02 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1886,8 +1886,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. */ -int rcu_needs_cpu(int cpu) +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu); } @@ -1963,28 +1964,6 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ /* - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no - * callbacks on this CPU, (2) this CPU has not yet attempted to enter - * dyntick-idle mode, or (3) this CPU is in the process of attempting to - * enter dyntick-idle mode. Otherwise, if we have recently tried and failed - * to enter dyntick-idle mode, we refuse to try to enter it. After all, - * it is better to incur scheduling-clock interrupts than to spin - * continuously for the same time duration! - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - /* Flag a new idle sojourn to the idle-entry state machine. */ - rdtp->idle_first_pass = 1; - /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) - return 0; - /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ - return rdtp->dyntick_holdoff == jiffies; -} - -/* * Does the specified flavor of RCU have non-lazy callbacks pending on * the specified CPU? Both RCU flavor and CPU are specified by the * rcu_data structure. @@ -2027,6 +2006,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) } /* + * Allow the CPU to enter dyntick-idle mode if either: (1) There are no + * callbacks on this CPU, (2) this CPU has not yet attempted to enter + * dyntick-idle mode, or (3) this CPU is in the process of attempting to + * enter dyntick-idle mode. Otherwise, if we have recently tried and failed + * to enter dyntick-idle mode, we refuse to try to enter it. After all, + * it is better to incur scheduling-clock interrupts than to spin + * continuously for the same time duration! + * + * The delta_jiffies argument is used to store the time when RCU is + * going to need the CPU again if it still has callbacks. The reason + * for this is that rcu_prepare_for_idle() might need to post a timer, + * but if so, it will do so after tick_nohz_stop_sched_tick() has set + * the wakeup time for this CPU. This means that RCU's timer can be + * delayed until the wakeup time, which defeats the purpose of posting + * a timer. + */ +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +{ + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + /* Flag a new idle sojourn to the idle-entry state machine. */ + rdtp->idle_first_pass = 1; + /* If no callbacks, RCU doesn't need the CPU. */ + if (!rcu_cpu_has_callbacks(cpu)) { + *delta_jiffies = ULONG_MAX; + return 0; + } + if (rdtp->dyntick_holdoff == jiffies) { + /* RCU recently tried and failed, so don't try again. */ + *delta_jiffies = 1; + return 1; + } + /* Set up for the possibility that RCU will post a timer. */ + if (rcu_cpu_has_nonlazy_callbacks(cpu)) + *delta_jiffies = RCU_IDLE_GP_DELAY; + else + *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; + return 0; +} + +/* * Handler for smp_call_function_single(). The only point of this * handler is to wake the CPU up, so the handler does only tracing. */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9..52f5ebb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_stop_sched_tick(struct tick_sched *ts) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; + unsigned long rcu_delta_jiffies; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; @@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) time_delta = timekeeping_max_deferment(); } while (read_seqretry(&xtime_lock, seq)); - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || + if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; @@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) /* Get the next timer wheel timer */ next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; + if (rcu_delta_jiffies < delta_jiffies) { + next_jiffies = last_jiffies + rcu_delta_jiffies; + delta_jiffies = rcu_delta_jiffies; + } } /* * Do not stop the tick, if we are only one off -- cgit v1.1 From 6ebb017de9d59a18c3ff9648270e8f6abaa93438 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 5 Jun 2012 08:52:34 +0200 Subject: printk: Fix alignment of buf causing crash on ARM EABI Commit 7ff9554bb578ba02166071d2d487b7fc7d860d62, printk: convert byte-buffer to variable-length record buffer, causes systems using EABI to crash very early in the boot cycle. The first entry in struct log is a u64, which for EABI must be 8 byte aligned. Make use of __alignof__() so the compiler to decide the alignment, but allow it to be overridden using CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, for systems which can perform unaligned access and want to save a few bytes of space. Tested on Orion5x and Kirkwood. Signed-off-by: Andrew Lunn Tested-by: Stephen Warren Acked-by: Stephen Warren Acked-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 32462d2..f205c25 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -227,10 +227,10 @@ static u32 clear_idx; #define LOG_LINE_MAX 1024 /* record buffer */ -#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define LOG_ALIGN 4 #else -#define LOG_ALIGN 8 +#define LOG_ALIGN __alignof__(struct log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); -- cgit v1.1 From a70270468234749741c5893ae78e5bb524771402 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Wed, 13 Jun 2012 09:35:48 -0400 Subject: watchdog: Quiet down the boot messages A bunch of bugzillas have complained how noisy the nmi_watchdog is during boot-up especially with its expected failure cases (like virt and bios resource contention). This is my attempt to quiet them down and keep it less confusing for the end user. What I did is print the message for cpu0 and save it for future comparisons. If future cpus have an identical message as cpu0, then don't print the redundant info. However, if a future cpu has a different message, happily print that loudly. Before the change, you would see something like: ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 CPU0: Intel(R) Core(TM)2 Quad CPU Q9550 @ 2.83GHz stepping 0a Performance Events: PEBS fmt0+, Core2 events, Intel PMU driver. ... version: 2 ... bit width: 40 ... generic registers: 2 ... value mask: 000000ffffffffff ... max period: 000000007fffffff ... fixed-purpose events: 3 ... event mask: 0000000700000003 NMI watchdog enabled, takes one hw-pmu counter. Booting Node 0, Processors #1 NMI watchdog enabled, takes one hw-pmu counter. #2 NMI watchdog enabled, takes one hw-pmu counter. #3 Ok. NMI watchdog enabled, takes one hw-pmu counter. Brought up 4 CPUs Total of 4 processors activated (22607.24 BogoMIPS). After the change, it is simplified to: ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 CPU0: Intel(R) Core(TM)2 Quad CPU Q9550 @ 2.83GHz stepping 0a Performance Events: PEBS fmt0+, Core2 events, Intel PMU driver. ... version: 2 ... bit width: 40 ... generic registers: 2 ... value mask: 000000ffffffffff ... max period: 000000007fffffff ... fixed-purpose events: 3 ... event mask: 0000000700000003 NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. Booting Node 0, Processors #1 #2 #3 Ok. Brought up 4 CPUs V2: little changes based on Joe Perches' feedback V3: printk cleanup based on Ingo's feedback; checkpatch fix V4: keep printk as one long line V5: Ingo fix ups Reported-and-tested-by: Nathan Zimmer Signed-off-by: Don Zickus Cc: nzimmer@sgi.com Cc: joe@perches.com Link: http://lkml.kernel.org/r/1339594548-17227-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e5e1d85..4b1dfba 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -372,6 +372,13 @@ static int watchdog(void *unused) #ifdef CONFIG_HARDLOCKUP_DETECTOR +/* + * People like the simple clean cpu node info on boot. + * Reduce the watchdog noise by only printing messages + * that are different from what cpu0 displayed. + */ +static unsigned long cpu0_err; + static int watchdog_nmi_enable(int cpu) { struct perf_event_attr *wd_attr; @@ -390,11 +397,21 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + + /* save cpu0 error for future comparision */ + if (cpu == 0 && IS_ERR(event)) + cpu0_err = PTR_ERR(event); + if (!IS_ERR(event)) { - pr_info("enabled, takes one hw-pmu counter.\n"); + /* only print for cpu0 or different than cpu0 */ + if (cpu == 0 || cpu0_err) + pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); goto out_save; } + /* skip displaying the same error again */ + if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + return PTR_ERR(event); /* vary the KERN level based on the returned errno */ if (PTR_ERR(event) == -EOPNOTSUPP) -- cgit v1.1 From e2ae715d66bf4becfb85eb84b7150e23cf27df30 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 15 Jun 2012 14:07:51 +0200 Subject: kmsg - kmsg_dump() use iterator to receive log buffer content Provide an iterator to receive the log buffer content, and convert all kmsg_dump() users to it. The structured data in the kmsg buffer now contains binary data, which should no longer be copied verbatim to the kmsg_dump() users. The iterator should provide reliable access to the buffer data, and also supports proper log line-aware chunking of data while iterating. Signed-off-by: Kay Sievers Tested-by: Tony Luck Reported-by: Anton Vorontsov Tested-by: Anton Vorontsov Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 220 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 192 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index f205c25..ceb4a2f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -909,7 +909,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. - */ + */ seq = clear_seq; idx = clear_idx; while (seq < log_next_seq) { @@ -919,6 +919,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear) idx = log_next(idx); seq++; } + + /* move first record forward until length fits into the buffer */ seq = clear_seq; idx = clear_idx; while (len > size && seq < log_next_seq) { @@ -929,7 +931,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) seq++; } - /* last message in this dump */ + /* last message fitting into this dump */ next_seq = log_next_seq; len = 0; @@ -2300,48 +2302,210 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping * - * Iterate through each of the dump devices and call the oops/panic - * callbacks with the log buffer. + * Call each of the registered dumper's dump() callback, which can + * retrieve the kmsg records with kmsg_dump_get_line() or + * kmsg_dump_get_buffer(). */ void kmsg_dump(enum kmsg_dump_reason reason) { - u64 idx; struct kmsg_dumper *dumper; - const char *s1, *s2; - unsigned long l1, l2; unsigned long flags; if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) return; - /* Theoretically, the log could move on after we do this, but - there's not a lot we can do about that. The new messages - will overwrite the start of what we dump. */ + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) { + if (dumper->max_reason && reason > dumper->max_reason) + continue; + + /* initialize iterator with data about the stored records */ + dumper->active = true; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + + /* invoke dumper which will iterate over records */ + dumper->dump(dumper, reason); + + /* reset iterator */ + dumper->active = false; + } + rcu_read_unlock(); +} + +/** + * kmsg_dump_get_line - retrieve one kmsg log line + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the beginning of the kmsg buffer, with the oldest kmsg + * record, and copy one record into the provided buffer. + * + * Consecutive calls will return the next available record moving + * towards the end of the buffer with the youngest messages. + * + * A return value of FALSE indicates that there are no more records to + * read. + */ +bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) +{ + unsigned long flags; + struct log *msg; + size_t l = 0; + bool ret = false; + + if (!dumper->active) + goto out; raw_spin_lock_irqsave(&logbuf_lock, flags); - if (syslog_seq < log_first_seq) - idx = syslog_idx; - else - idx = log_first_idx; + if (dumper->cur_seq < log_first_seq) { + /* messages are gone, move to first available one */ + dumper->cur_seq = log_first_seq; + dumper->cur_idx = log_first_idx; + } - if (idx > log_next_idx) { - s1 = log_buf; - l1 = log_next_idx; + /* last entry */ + if (dumper->cur_seq >= log_next_seq) { + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + goto out; + } - s2 = log_buf + idx; - l2 = log_buf_len - idx; - } else { - s1 = ""; - l1 = 0; + msg = log_from_idx(dumper->cur_idx); + l = msg_print_text(msg, syslog, + line, size); + + dumper->cur_idx = log_next(dumper->cur_idx); + dumper->cur_seq++; + ret = true; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); +out: + if (len) + *len = l; + return ret; +} +EXPORT_SYMBOL_GPL(kmsg_dump_get_line); + +/** + * kmsg_dump_get_buffer - copy kmsg log lines + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the end of the kmsg buffer and fill the provided buffer + * with as many of the the *youngest* kmsg records that fit into it. + * If the buffer is large enough, all available kmsg records will be + * copied with a single call. + * + * Consecutive calls will fill the buffer with the next block of + * available older records, not including the earlier retrieved ones. + * + * A return value of FALSE indicates that there are no more records to + * read. + */ +bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + char *buf, size_t size, size_t *len) +{ + unsigned long flags; + u64 seq; + u32 idx; + u64 next_seq; + u32 next_idx; + size_t l = 0; + bool ret = false; + + if (!dumper->active) + goto out; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + if (dumper->cur_seq < log_first_seq) { + /* messages are gone, move to first available one */ + dumper->cur_seq = log_first_seq; + dumper->cur_idx = log_first_idx; + } + + /* last entry */ + if (dumper->cur_seq >= dumper->next_seq) { + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + goto out; + } + + /* calculate length of entire buffer */ + seq = dumper->cur_seq; + idx = dumper->cur_idx; + while (seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l += msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } + + /* move first record forward until length fits into the buffer */ + seq = dumper->cur_seq; + idx = dumper->cur_idx; + while (l > size && seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); - s2 = log_buf + idx; - l2 = log_next_idx - idx; + l -= msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; } + + /* last message in next interation */ + next_seq = seq; + next_idx = idx; + + l = 0; + while (seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l += msg_print_text(msg, syslog, + buf + l, size - l); + + idx = log_next(idx); + seq++; + } + + dumper->next_seq = next_seq; + dumper->next_idx = next_idx; + ret = true; raw_spin_unlock_irqrestore(&logbuf_lock, flags); +out: + if (len) + *len = l; + return ret; +} +EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); - rcu_read_lock(); - list_for_each_entry_rcu(dumper, &dump_list, list) - dumper->dump(dumper, reason, s1, l1, s2, l2); - rcu_read_unlock(); +/** + * kmsg_dump_rewind - reset the interator + * @dumper: registered kmsg dumper + * + * Reset the dumper's iterator so that kmsg_dump_get_line() and + * kmsg_dump_get_buffer() can be called again and used multiple + * times within the same dumper.dump() callback. + */ +void kmsg_dump_rewind(struct kmsg_dumper *dumper) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); } +EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #endif -- cgit v1.1 From 4a77a5a06ec66ed05199b301e7c25f42f979afdc Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Sat, 16 Jun 2012 21:21:51 +0800 Subject: printk: use mutex lock to stop syslog_seq from going wild Although syslog_seq and log_next_seq stuff are protected by logbuf_lock spin log, it's not enough. Say we have two processes A and B, and let syslog_seq = N, while log_next_seq = N + 1, and the two processes both come to syslog_print at almost the same time. And No matter which process get the spin lock first, it will increase syslog_seq by one, then release spin lock; thus later, another process increase syslog_seq by one again. In this case, syslog_seq is bigger than syslog_next_seq. And latter, it would make: wait_event_interruptiable(log_wait, syslog != log_next_seq) don't wait any more even there is no new write comes. Thus it introduce a infinite loop reading. I can easily see this kind of issue by the following steps: # cat /proc/kmsg # at meantime, I don't kill rsyslog # So they are the two processes. # xinit # I added drm.debug=6 in the kernel parameter line, # so that it will produce lots of message and let that # issue happen It's 100% reproducable on my side. And my disk will be filled up by /var/log/messages in a quite short time. So, introduce a mutex_lock to stop syslog_seq from going wild just like what devkmsg_read() does. It does fix this issue as expected. v2: use mutex_lock_interruptiable() instead (comments from Kay) Signed-off-by: Yuanhan Liu Reviewed-by: Fengguang Wu Acked-By: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index ceb4a2f..572730b 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -414,7 +414,9 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (!user) return -EBADF; - mutex_lock(&user->lock); + ret = mutex_lock_interruptible(&user->lock); + if (ret) + return ret; raw_spin_lock(&logbuf_lock); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { @@ -976,6 +978,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) { bool clear = false; static int saved_console_loglevel = -1; + static DEFINE_MUTEX(syslog_mutex); int error; error = check_syslog_permissions(type, from_file); @@ -1002,11 +1005,17 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) error = -EFAULT; goto out; } + error = mutex_lock_interruptible(&syslog_mutex); + if (error) + goto out; error = wait_event_interruptible(log_wait, syslog_seq != log_next_seq); - if (error) + if (error) { + mutex_unlock(&syslog_mutex); goto out; + } error = syslog_print(buf, len); + mutex_unlock(&syslog_mutex); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: -- cgit v1.1 From b56a39ac263e5b8cafedd551a49c2105e68b98c2 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Sat, 16 Jun 2012 12:40:55 +0800 Subject: printk: return -EINVAL if the message len is bigger than the buf size Just like what devkmsg_read() does, return -EINVAL if the message len is bigger than the buf size, or it will trigger a segfault error. Acked-by: Kay Sievers Acked-by: Fengguang Wu Signed-off-by: Yuanhan Liu Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 572730b..a2276b9 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -880,7 +880,9 @@ static int syslog_print(char __user *buf, int size) syslog_seq++; raw_spin_unlock_irq(&logbuf_lock); - if (len > 0 && copy_to_user(buf, text, len)) + if (len > size) + len = -EINVAL; + else if (len > 0 && copy_to_user(buf, text, len)) len = -EFAULT; kfree(text); -- cgit v1.1 From 9c5da09d266ca9b32eb16cf940f8161d949c2fe5 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Thu, 14 Jun 2012 15:31:09 -0700 Subject: perf: Use css_tryget() to avoid propping up css refcount An rmdir pushes css's ref count to zero. However, if the associated directory is open at the time, the dentry ref count is non-zero. If the fd for this directory is then passed into perf_event_open, it does a css_get(). This bounces the ref count back up from zero. This is a problem by itself. But what makes it turn into a crash is the fact that we end up doing an extra dput, since we perform a dput when css_put sees the ref count go down to zero. css_tryget() does not fall into that trap. So, we use that instead. Reproduction test-case for the bug: #include #include #include #include #include #include #include #include #include #define PERF_FLAG_PID_CGROUP (1U << 2) int perf_event_open(struct perf_event_attr *hw_event_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open,hw_event_uptr, pid, cpu, group_fd, flags); } /* * Directly poke at the perf_event bug, since it's proving hard to repro * depending on where in the kernel tree. what moved? */ int main(int argc, char **argv) { int fd; struct perf_event_attr attr; memset(&attr, 0, sizeof(attr)); attr.exclude_kernel = 1; attr.size = sizeof(attr); mkdir("/dev/cgroup/perf_event/blah", 0777); fd = open("/dev/cgroup/perf_event/blah", O_RDONLY); perror("open"); rmdir("/dev/cgroup/perf_event/blah"); sleep(2); perf_event_open(&attr, fd, 0, -1, PERF_FLAG_PID_CGROUP); perror("perf_event_open"); close(fd); return 0; } Signed-off-by: Salman Qazi Signed-off-by: Peter Zijlstra Acked-by: Tejun Heo Link: http://lkml.kernel.org/r/20120614223108.1025.2503.stgit@dungbeetle.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f85c015..d7d71d6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event) return !event->cgrp || event->cgrp == cpuctx->cgrp; } -static inline void perf_get_cgroup(struct perf_event *event) +static inline bool perf_tryget_cgroup(struct perf_event *event) { - css_get(&event->cgrp->css); + return css_tryget(&event->cgrp->css); } static inline void perf_put_cgroup(struct perf_event *event) @@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, event->cgrp = cgrp; /* must be done before we fput() the file */ - perf_get_cgroup(event); + if (!perf_tryget_cgroup(event)) { + event->cgrp = NULL; + ret = -ENOENT; + goto out; + } /* * all events in a group must monitor -- cgit v1.1 From 8e3bbf42c6d73881956863cc3305456afe2bc4ea Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Thu, 14 Jun 2012 14:55:30 -0700 Subject: cgroups: Account for CSS_DEACT_BIAS in __css_put When we fixed the race between atomic_dec and css_refcnt, we missed the fact that css_refcnt internally subtracts CSS_DEACT_BIAS to get the actual reference count. This can potentially cause a refcount leak if __css_put races with cgroup_clear_css_refs. Signed-off-by: Salman Qazi Acked-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ceeafe8..2097684 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -255,12 +255,17 @@ int cgroup_lock_is_held(void) EXPORT_SYMBOL_GPL(cgroup_lock_is_held); +static int css_unbias_refcnt(int refcnt) +{ + return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; +} + /* the current nr of refs, always >= 0 whether @css is deactivated or not */ static int css_refcnt(struct cgroup_subsys_state *css) { int v = atomic_read(&css->refcnt); - return v >= 0 ? v : v - CSS_DEACT_BIAS; + return css_unbias_refcnt(v); } /* convenient tests for these bits */ @@ -4982,9 +4987,12 @@ EXPORT_SYMBOL_GPL(__css_tryget); void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; + int v; rcu_read_lock(); - switch (atomic_dec_return(&css->refcnt)) { + v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); + + switch (v) { case 1: if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); -- cgit v1.1 From 4fe7efdbdfb1c7e7a7f31decfd831c0f31d37091 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 20 Jun 2012 12:53:01 -0700 Subject: mm: correctly synchronize rss-counters at exit/exec do_exit() and exec_mmap() call sync_mm_rss() before mm_release() does put_user(clear_child_tid) which can update task->rss_stat and thus make mm->rss_stat inconsistent. This triggers the "BUG:" printk in check_mm(). Let's fix this bug in the safest way, and optimize/cleanup this later. Reported-by: Markus Trippelsdorf Signed-off-by: Konstantin Khlebnikov Cc: Oleg Nesterov Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 34867cc..c0277d3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -643,6 +643,7 @@ static void exit_mm(struct task_struct * tsk) mm_release(tsk, mm); if (!mm) return; + sync_mm_rss(mm); /* * Serialize with any possible pending coredump. * We must hold mmap_sem around checking core_state -- cgit v1.1 From 6347e90091041e34bea625370794c92f4ce71228 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 20 Jun 2012 12:53:03 -0700 Subject: pidns: guarantee that the pidns init will be the last pidns process reaped Today we have a twofold bug. Sometimes release_task on pid == 1 in a pid namespace can run before other processes in a pid namespace have had release task called. With the result that pid_ns_release_proc can be called before the last proc_flus_task() is done using upid->ns->proc_mnt, resulting in the use of a stale pointer. This same set of circumstances can lead to waitpid(...) returning for a processes started with clone(CLONE_NEWPID) before the every process in the pid namespace has actually exited. To fix this modify zap_pid_ns_processess wait until all other processes in the pid namespace have exited, even EXIT_DEAD zombies. The delay_group_leader and related tests ensure that the thread gruop leader will be the last thread of a process group to be reaped, or to become EXIT_DEAD and self reap. With the change to zap_pid_ns_processes we get the guarantee that pid == 1 in a pid namespace will be the last task that release_task is called on. With pid == 1 being the last task to pass through release_task pid_ns_release_proc can no longer be called too early nor can wait return before all of the EXIT_DEAD tasks in a pid namespace have exited. Signed-off-by: Eric W. Biederman Signed-off-by: Oleg Nesterov Cc: Louis Rilling Cc: Mike Galbraith Acked-by: Pavel Emelyanov Tested-by: Andrew Wagin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 14 +++++++++++++- kernel/pid_namespace.c | 20 ++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c0277d3..a85efd2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -64,7 +64,6 @@ static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; - detach_pid(p, PIDTYPE_PID); if (group_dead) { detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); @@ -72,7 +71,20 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); + /* + * If we are the last child process in a pid namespace to be + * reaped, notify the reaper sleeping zap_pid_ns_processes(). + */ + if (IS_ENABLED(CONFIG_PID_NS)) { + struct task_struct *parent = p->real_parent; + + if ((task_active_pid_ns(p)->child_reaper == parent) && + list_empty(&parent->children) && + (parent->flags & PF_EXITING)) + wake_up_process(parent); + } } + detach_pid(p, PIDTYPE_PID); list_del_rcu(&p->thread_group); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 16b20e3..b3c7fd5 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) } read_unlock(&tasklist_lock); + /* Firstly reap the EXIT_ZOMBIE children we may have. */ do { clear_thread_flag(TIF_SIGPENDING); rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); + /* + * sys_wait4() above can't reap the TASK_DEAD children. + * Make sure they all go away, see __unhash_process(). + */ + for (;;) { + bool need_wait = false; + + read_lock(&tasklist_lock); + if (!list_empty(¤t->children)) { + __set_current_state(TASK_UNINTERRUPTIBLE); + need_wait = true; + } + read_unlock(&tasklist_lock); + + if (!need_wait) + break; + schedule(); + } + if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; -- cgit v1.1 From 50d75f8daead8a1f850c40a3b6c6575ab19b48cf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Jun 2012 12:53:04 -0700 Subject: pidns: find_new_reaper() can no longer switch to init_pid_ns.child_reaper find_new_reaper() changes pid_ns->child_reaper, see add0d4df ("pid_ns: zap_pid_ns_processes: fix the ->child_reaper changing"). The original reason has gone away after the previous patch, ->children list must be empty after zap_pid_ns_processes(). However now we can not switch to init_pid_ns.child_reaper. __unhash_process() relies on the "->child_reaper == parent" check, but this check does not work if the last exiting task is also the child reaper. As Eric sugested, we can change __unhash_process() to use the parent's pid_ns and remove this code. Also, with this change we can move detach_pid(PIDTYPE_PID) back, where it was before the previous fix. Signed-off-by: Oleg Nesterov Acked-by: "Eric W. Biederman" Cc: Louis Rilling Cc: Mike Galbraith Acked-by: Pavel Emelyanov Tested-by: Andrew Wagin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a85efd2..2f59cc3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -64,6 +64,7 @@ static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; + detach_pid(p, PIDTYPE_PID); if (group_dead) { detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); @@ -78,13 +79,12 @@ static void __unhash_process(struct task_struct *p, bool group_dead) if (IS_ENABLED(CONFIG_PID_NS)) { struct task_struct *parent = p->real_parent; - if ((task_active_pid_ns(p)->child_reaper == parent) && + if ((task_active_pid_ns(parent)->child_reaper == parent) && list_empty(&parent->children) && (parent->flags & PF_EXITING)) wake_up_process(parent); } } - detach_pid(p, PIDTYPE_PID); list_del_rcu(&p->thread_group); } @@ -732,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father) zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); - /* - * We can not clear ->child_reaper or leave it alone. - * There may by stealth EXIT_DEAD tasks on ->children, - * forget_original_parent() must move them somewhere. - */ - pid_ns->child_reaper = init_pid_ns.child_reaper; } else if (father->signal->has_child_subreaper) { struct task_struct *reaper; -- cgit v1.1 From 5702c5eeab959e86ee2d9b4fe7f2d87e65b25d46 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 20 Jun 2012 12:53:04 -0700 Subject: c/r: prctl: Move PR_GET_TID_ADDRESS to a proper place During merging of PR_GET_TID_ADDRESS patch the code has been misplaced (it happened to appear under PR_MCE_KILL) in result noone can use this option. Fix it by moving code snippet to a proper place. Signed-off-by: Cyrill Gorcunov Acked-by: Kees Cook Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Andrey Vagin Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index f0ec44d..e0c8ffc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2127,9 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; - case PR_GET_TID_ADDRESS: - error = prctl_get_tid_address(me, (int __user **)arg2); - break; default: return -EINVAL; } @@ -2147,6 +2144,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_MM: error = prctl_set_mm(arg2, arg3, arg4, arg5); break; + case PR_GET_TID_ADDRESS: + error = prctl_get_tid_address(me, (int __user **)arg2); + break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; error = 0; -- cgit v1.1