From 0b6b098efcddac2bf4e2a895c9b655560bbfcee4 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 25 Oct 2013 12:14:15 +0200 Subject: padata: make the sequence counter an atomic_t Using a spinlock to atomically increase a counter sounds wrong -- we've atomic_t for this! Also move 'seq_nr' to a different cache line than 'lock' to reduce cache line trashing. This has the nice side effect of decreasing the size of struct parallel_data from 192 to 128 bytes for a x86-64 build, e.g. occupying only two instead of three cache lines. Those changes results in a 5% performance increase on an IPsec test run using pcrypt. Btw. the seq_lock spinlock was never explicitly initialized -- one more reason to get rid of it. Signed-off-by: Mathias Krause Acked-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 07af2c9..2abd25d 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -46,6 +46,7 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) static int padata_cpu_hash(struct parallel_data *pd) { + unsigned int seq_nr; int cpu_index; /* @@ -53,10 +54,8 @@ static int padata_cpu_hash(struct parallel_data *pd) * seq_nr mod. number of cpus in use. */ - spin_lock(&pd->seq_lock); - cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); - pd->seq_nr++; - spin_unlock(&pd->seq_lock); + seq_nr = atomic_inc_return(&pd->seq_nr); + cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu); return padata_index_to_cpu(pd, cpu_index); } @@ -429,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_pqueues(pd); padata_init_squeues(pd); setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); - pd->seq_nr = 0; + atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); pd->pinst = pinst; -- cgit v1.1 From d689fe222a858c767cb8594faf280048e532b53f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Nov 2013 21:01:57 +0100 Subject: NOHZ: Check for nohz active instead of nohz enabled RCU and the fine grained idle time accounting functions check tick_nohz_enabled. But that variable is merily telling that NOHZ has been enabled in the config and not been disabled on the command line. But it does not tell anything about nohz being active. That's what all this should check for. Matthew reported, that the idle accounting on his old P1 machine showed bogus values, when he enabled NOHZ in the config and did not disable it on the kernel command line. The reason is that his machine uses (refined) jiffies as a clocksource which explains why the "fine" grained accounting went into lala land, because it depends on when the system goes and leaves idle relative to the jiffies increment. Provide a tick_nohz_active indicator and let RCU and the accounting code use this instead of tick_nohz_enable. Reported-and-tested-by: Matthew Whitehead Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Paul E. McKenney Cc: john.stultz@linaro.org Cc: mwhitehe@redhat.com Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1311132052240.30673@ionos.tec.linutronix.de --- kernel/rcu/tree_plugin.h | 4 ++-- kernel/time/tick-sched.c | 21 +++++++++------------ 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6abb03d..08a7652 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1632,7 +1632,7 @@ module_param(rcu_idle_gp_delay, int, 0644); static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; module_param(rcu_idle_lazy_gp_delay, int, 0644); -extern int tick_nohz_enabled; +extern int tick_nohz_active; /* * Try to advance callbacks for all flavors of RCU on the current CPU, but @@ -1729,7 +1729,7 @@ static void rcu_prepare_for_idle(int cpu) int tne; /* Handle nohz enablement switches conservatively. */ - tne = ACCESS_ONCE(tick_nohz_enabled); + tne = ACCESS_ONCE(tick_nohz_active); if (tne != rdtp->tick_nohz_enabled_snap) { if (rcu_cpu_has_callbacks(cpu, NULL)) invoke_rcu_core(); /* force nohz to see update. */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3612fc7..a12df5a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -361,8 +361,8 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -int tick_nohz_enabled __read_mostly = 1; - +static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ @@ -465,7 +465,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, idle; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return -1; now = ktime_get(); @@ -506,7 +506,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, iowait; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return -1; now = ktime_get(); @@ -799,11 +799,6 @@ void tick_nohz_idle_enter(void) local_irq_disable(); ts = &__get_cpu_var(tick_cpu_sched); - /* - * set ts->inidle unconditionally. even if the system did not - * switch to nohz mode the cpu frequency governers rely on the - * update of the idle time accounting in tick_nohz_start_idle(). - */ ts->inidle = 1; __tick_nohz_idle_enter(ts); @@ -973,7 +968,7 @@ static void tick_nohz_switch_to_nohz(void) struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t next; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return; local_irq_disable(); @@ -981,7 +976,7 @@ static void tick_nohz_switch_to_nohz(void) local_irq_enable(); return; } - + tick_nohz_active = 1; ts->nohz_mode = NOHZ_MODE_LOWRES; /* @@ -1139,8 +1134,10 @@ void tick_setup_sched_timer(void) } #ifdef CONFIG_NO_HZ_COMMON - if (tick_nohz_enabled) + if (tick_nohz_enabled) { ts->nohz_mode = NOHZ_MODE_HIGHRES; + tick_nohz_active = 1; + } #endif } #endif /* HIGH_RES_TIMERS */ -- cgit v1.1 From da554eba2e68c8ec051977db5ee1f42d384a01ed Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 15 Nov 2013 14:15:31 -0800 Subject: timer: Convert kmalloc_node(...GFP_ZERO...) to kzalloc_node(...) Use the helper function instead of __GFP_ZERO. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/timer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 6582b82..accfd24 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1518,9 +1518,8 @@ static int init_timers_cpu(int cpu) /* * The APs use this path later in boot */ - base = kmalloc_node(sizeof(*base), - GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); + base = kzalloc_node(sizeof(*base), GFP_KERNEL, + cpu_to_node(cpu)); if (!base) return -ENOMEM; -- cgit v1.1 From 050ded1bbaea3331745cf2782315f5bc2582d083 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 15 Nov 2013 14:15:33 -0800 Subject: tick: Document tick_do_timer_cpu Taken straight from a tglx email ;) Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 64522ec..162b03a 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,6 +33,21 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; + +/* + * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR + * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This + * variable has two functions: + * + * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the + * timekeeping lock all at once. Only the CPU which is assigned to do the + * update is handling it. + * + * 2) Hand off the duty in the NOHZ idle case by setting the value to + * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks + * at it will take over and keep the time keeping alive. The handover + * procedure also covers cpu hotplug. + */ int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; /* -- cgit v1.1 From d5b5f391d434c5cc8bcb1ab2d759738797b85f52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Nov 2013 16:23:04 +0100 Subject: ftrace, perf: Avoid infinite event generation loop Vince's perf-trinity fuzzer found yet another 'interesting' problem. When we sample the irq_work_exit tracepoint with period==1 (or PERF_SAMPLE_PERIOD) and we add an fasync SIGNAL handler we create an infinite event generation loop: ,-> | irq_work_exit() -> | trace_irq_work_exit() -> | ... | __perf_event_overflow() -> (due to fasync) | irq_work_queue() -> (irq_work_list must be empty) '--------- arch_irq_work_raise() Similar things can happen due to regular poll() wakeups if we exceed the ring-buffer wakeup watermark, or have an event_limit. To avoid this, dis-allow sampling this particular tracepoint. In order to achieve this, create a special perf_perm function pointer for each event and call this (when set) on trying to create a tracepoint perf event. [ roasted: use expr... to allow for ',' in your expression ] Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Cc: Dave Jones Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20131114152304.GC5364@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 78e27e3..630889f 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -24,6 +24,12 @@ static int total_ref_count; static int perf_trace_event_perm(struct ftrace_event_call *tp_event, struct perf_event *p_event) { + if (tp_event->perf_perm) { + int ret = tp_event->perf_perm(tp_event, p_event); + if (ret) + return ret; + } + /* The ftrace function trace is allowed only for root. */ if (ftrace_event_is_function(tp_event) && perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) -- cgit v1.1 From 06db0b21712f878b808480ef31097637013bbf0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Sep 2013 13:14:47 +0200 Subject: perf: Remove fragile swevent hlist optimization Currently we only allocate a single cpu hashtable for per-cpu swevents; do away with this optimization for it is fragile in the face of things like perf_pmu_migrate_context(). The easiest thing is to make sure all CPUs are consistent wrt state. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130913111447.GN31370@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d724e77..72348dc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5680,11 +5680,6 @@ static void swevent_hlist_put(struct perf_event *event) { int cpu; - if (event->cpu != -1) { - swevent_hlist_put_cpu(event, event->cpu); - return; - } - for_each_possible_cpu(cpu) swevent_hlist_put_cpu(event, cpu); } @@ -5718,9 +5713,6 @@ static int swevent_hlist_get(struct perf_event *event) int err; int cpu, failed_cpu; - if (event->cpu != -1) - return swevent_hlist_get_cpu(event, event->cpu); - get_online_cpus(); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(event, cpu); -- cgit v1.1 From 0022cedd4a7d8a87841351e2b018bb6794cf2e67 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 15 Nov 2013 12:39:45 -0500 Subject: perf/trace: Properly use u64 to hold event_id The 64-bit attr.config value for perf trace events was being copied into an "int" before doing a comparison, meaning the top 32 bits were being truncated. As far as I can tell this didn't cause any errors, but it did mean it was possible to create valid aliases for all the tracepoint ids which I don't think was intended. (For example, 0xffffffff00000018 and 0x18 both enable the same tracepoint). Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1311151236100.11932@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 630889f..e854f42 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -179,7 +179,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, int perf_trace_init(struct perf_event *p_event) { struct ftrace_event_call *tp_event; - int event_id = p_event->attr.config; + u64 event_id = p_event->attr.config; int ret = -EINVAL; mutex_lock(&event_mutex); -- cgit v1.1 From 9abf24d465180f5f2eb26a43545348262f16b771 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 12 Nov 2013 22:11:26 +0530 Subject: sched: Check sched_domain before computing group power After commit 863bffc80898 ("sched/fair: Fix group power_orig computation"), we can dereference rq->sd before it is set. Fix this by falling back to power_of() in this case and add a comment explaining things. Signed-off-by: Srikar Dronamraju [ Added comment and tweaked patch. ] Signed-off-by: Peter Zijlstra Cc: mikey@neuling.org Link: http://lkml.kernel.org/r/20131113151718.GN21461@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e8b652e..fd773ad 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5379,10 +5379,31 @@ void update_group_power(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_cpus(sdg)) { - struct sched_group *sg = cpu_rq(cpu)->sd->groups; + struct sched_group_power *sgp; + struct rq *rq = cpu_rq(cpu); - power_orig += sg->sgp->power_orig; - power += sg->sgp->power; + /* + * build_sched_domains() -> init_sched_groups_power() + * gets here before we've attached the domains to the + * runqueues. + * + * Use power_of(), which is set irrespective of domains + * in update_cpu_power(). + * + * This avoids power/power_orig from being 0 and + * causing divide-by-zero issues on boot. + * + * Runtime updates will correct power_orig. + */ + if (unlikely(!rq->sd)) { + power_orig += power_of(cpu); + power += power_of(cpu); + continue; + } + + sgp = rq->sd->groups->sgp; + power_orig += sgp->power_orig; + power += sgp->power; } } else { /* -- cgit v1.1 From 42eb088ed246a5a817bb45a8b32fe234cf1c0f8b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Nov 2013 16:41:49 +0100 Subject: sched: Avoid NULL dereference on sd_busy Commit 37dc6b50cee9 ("sched: Remove unnecessary iteration over sched domains to update nr_busy_cpus") forgot to clear 'sd_busy' under some conditions leading to a possible NULL deref in set_cpu_sd_state_idle(). Reported-by: Anton Blanchard Cc: Preeti U Murthy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131118113701.GF3866@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c180860..a1591ca 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4910,8 +4910,9 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent); + sd = sd->parent; /* sd_busy */ } + rcu_assign_pointer(per_cpu(sd_busy, cpu), sd); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; -- cgit v1.1 From 0515973ffb16c2852a1bb1df2ca1456556faaaa5 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sun, 17 Nov 2013 12:12:36 +0900 Subject: sched: Fix a trivial typo in comments Fix a trivial typo in rq_attach_root(). Signed-off-by: Shigeru Yoshida Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131117.121236.1990617639803941055.shigeru.yoshida@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a1591ca..718730d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4762,7 +4762,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) cpumask_clear_cpu(rq->cpu, old_rd->span); /* - * If we dont want to free the old_rt yet then + * If we dont want to free the old_rd yet then * set old_rd to NULL to skip the freeing later * in this function: */ -- cgit v1.1 From 4be77398ac9d948773116b6be4a3c91b3d6ea18c Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 22 Nov 2013 11:44:51 -0800 Subject: time: Fix 1ns/tick drift w/ GENERIC_TIME_VSYSCALL_OLD Since commit 1e75fa8be9f (time: Condense timekeeper.xtime into xtime_sec - merged in v3.6), there has been an problem with the error accounting in the timekeeping code, such that when truncating to nanoseconds, we round up to the next nsec, but the balancing adjustment to the ntp_error value was dropped. This causes 1ns per tick drift forward of the clock. In 3.7, this logic was isolated to only GENERIC_TIME_VSYSCALL_OLD architectures (s390, ia64, powerpc). The fix is simply to balance the accounting and to subtract the added nanosecond from ntp_error. This allows the internal long-term clock steering to keep the clock accurate. While this fix removes the regression added in 1e75fa8be9f, the ideal solution is to move away from GENERIC_TIME_VSYSCALL_OLD and use the new VSYSCALL method, which avoids entirely the nanosecond granular rounding, and the resulting short-term clock adjustment oscillation needed to keep long term accurate time. [ jstultz: Many thanks to Martin for his efforts identifying this subtle bug, and providing the fix. ] Originally-from: Martin Schwidefsky Cc: Tony Luck Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Andy Lutomirski Cc: Paul Turner Cc: Steven Rostedt Cc: Richard Cochran Cc: Prarit Bhargava Cc: Fenghua Yu Cc: Thomas Gleixner Cc: stable #v3.6+ Link: http://lkml.kernel.org/r/1385149491-20307-1-git-send-email-john.stultz@linaro.org Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3abf534..87b4f00 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1347,7 +1347,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) tk->xtime_nsec -= remainder; tk->xtime_nsec += 1ULL << tk->shift; tk->ntp_error += remainder << tk->ntp_error_shift; - + tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; } #else #define old_vsyscall_fixup(tk) -- cgit v1.1 From e5fca243abae1445afbfceebda5f08462ef869d3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 Nov 2013 17:14:39 -0500 Subject: cgroup: use a dedicated workqueue for cgroup destruction Since be44562613851 ("cgroup: remove synchronize_rcu() from cgroup_diput()"), cgroup destruction path makes use of workqueue. css freeing is performed from a work item from that point on and a later commit, ea15f8ccdb430 ("cgroup: split cgroup destruction into two steps"), moves css offlining to workqueue too. As cgroup destruction isn't depended upon for memory reclaim, the destruction work items were put on the system_wq; unfortunately, some controller may block in the destruction path for considerable duration while holding cgroup_mutex. As large part of destruction path is synchronized through cgroup_mutex, when combined with high rate of cgroup removals, this has potential to fill up system_wq's max_active of 256. Also, it turns out that memcg's css destruction path ends up queueing and waiting for work items on system_wq through work_on_cpu(). If such operation happens while system_wq is fully occupied by cgroup destruction work items, work_on_cpu() can't make forward progress because system_wq is full and other destruction work items on system_wq can't make forward progress because the work item waiting for work_on_cpu() is holding cgroup_mutex, leading to deadlock. This can be fixed by queueing destruction work items on a separate workqueue. This patch creates a dedicated workqueue - cgroup_destroy_wq - for this purpose. As these work items shouldn't have inter-dependencies and mostly serialized by cgroup_mutex anyway, giving high concurrency level doesn't buy anything and the workqueue's @max_active is set to 1 so that destruction work items are executed one by one on each CPU. Hugh Dickins: Because cgroup_init() is run before init_workqueues(), cgroup_destroy_wq can't be allocated from cgroup_init(). Do it from a separate core_initcall(). In the future, we probably want to reorder so that workqueue init happens before cgroup_init(). Signed-off-by: Tejun Heo Reported-by: Hugh Dickins Reported-by: Shawn Bohrer Link: http://lkml.kernel.org/r/20131111220626.GA7509@sbohrermbp13-local.rgmadvisors.com Link: http://lkml.kernel.org/g/alpine.LNX.2.00.1310301606080.2333@eggly.anvils Cc: stable@vger.kernel.org # v3.9+ --- kernel/cgroup.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4c62513..a7b98ee 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -90,6 +90,14 @@ static DEFINE_MUTEX(cgroup_mutex); static DEFINE_MUTEX(cgroup_root_mutex); /* + * cgroup destruction makes heavy use of work items and there can be a lot + * of concurrent destructions. Use a separate workqueue so that cgroup + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. + */ +static struct workqueue_struct *cgroup_destroy_wq; + +/* * Generate an array of cgroup subsystem pointers. At boot time, this is * populated with the built in subsystems, and modular subsystems are * registered after that. The mutable section of this array is protected by @@ -871,7 +879,7 @@ static void cgroup_free_rcu(struct rcu_head *head) struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); - schedule_work(&cgrp->destroy_work); + queue_work(cgroup_destroy_wq, &cgrp->destroy_work); } static void cgroup_diput(struct dentry *dentry, struct inode *inode) @@ -4249,7 +4257,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) * css_put(). dput() requires process context which we don't have. */ INIT_WORK(&css->destroy_work, css_free_work_fn); - schedule_work(&css->destroy_work); + queue_work(cgroup_destroy_wq, &css->destroy_work); } static void css_release(struct percpu_ref *ref) @@ -4539,7 +4547,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_killed_work_fn); - schedule_work(&css->destroy_work); + queue_work(cgroup_destroy_wq, &css->destroy_work); } /** @@ -5063,6 +5071,22 @@ out: return err; } +static int __init cgroup_wq_init(void) +{ + /* + * There isn't much point in executing destruction path in + * parallel. Good chunk is serialized with cgroup_mutex anyway. + * Use 1 for @max_active. + * + * We would prefer to do this in cgroup_init() above, but that + * is called before init_workqueues(): so leave this until after. + */ + cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); + BUG_ON(!cgroup_destroy_wq); + return 0; +} +core_initcall(cgroup_wq_init); + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy -- cgit v1.1 From 91151228065354a050fd0d190aefdd662a0580aa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 14 Nov 2013 12:56:18 +0100 Subject: workqueue: swap set_cpus_allowed_ptr() and PF_NO_SETAFFINITY Move the setting of PF_NO_SETAFFINITY up before set_cpus_allowed() in create_worker(). Otherwise userland can change ->cpus_allowed in between. Signed-off-by: Oleg Nesterov Signed-off-by: Tejun Heo --- kernel/workqueue.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 987293d..f894242 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1736,16 +1736,17 @@ static struct worker *create_worker(struct worker_pool *pool) if (IS_ERR(worker->task)) goto fail; + set_user_nice(worker->task, pool->attrs->nice); + + /* prevent userland from meddling with cpumask of workqueue workers */ + worker->task->flags |= PF_NO_SETAFFINITY; + /* * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any * online CPUs. It'll be re-applied when any of the CPUs come up. */ - set_user_nice(worker->task, pool->attrs->nice); set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); - /* prevent userland from meddling with cpumask of workqueue workers */ - worker->task->flags |= PF_NO_SETAFFINITY; - /* * The caller is responsible for ensuring %POOL_DISASSOCIATED * remains stable across this function. See the comments above the -- cgit v1.1 From 8a2b75384444488fc4f2cbb9f0921b6a0794838f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Sep 2013 12:30:04 -0400 Subject: workqueue: fix ordered workqueues in NUMA setups An ordered workqueue implements execution ordering by using single pool_workqueue with max_active == 1. On a given pool_workqueue, work items are processed in FIFO order and limiting max_active to 1 enforces the queued work items to be processed one by one. Unfortunately, 4c16bd327c ("workqueue: implement NUMA affinity for unbound workqueues") accidentally broke this guarantee by applying NUMA affinity to ordered workqueues too. On NUMA setups, an ordered workqueue would end up with separate pool_workqueues for different nodes. Each pool_workqueue still limits max_active to 1 but multiple work items may be executed concurrently and out of order depending on which node they are queued to. Fix it by using dedicated ordered_wq_attrs[] when creating ordered workqueues. The new attrs match the unbound ones except that no_numa is always set thus forcing all NUMA nodes to share the default pool_workqueue. While at it, add sanity check in workqueue creation path which verifies that an ordered workqueues has only the default pool_workqueue. Signed-off-by: Tejun Heo Reported-by: Libin Cc: stable@vger.kernel.org Cc: Lai Jiangshan --- kernel/workqueue.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f894242..bbb5e98 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -305,6 +305,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); /* I: attributes used when instantiating standard unbound pools on demand */ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; +/* I: attributes used when instantiating ordered pools on demand */ +static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; + struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -4107,7 +4110,7 @@ out_unlock: static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; - int cpu; + int cpu, ret; if (!(wq->flags & WQ_UNBOUND)) { wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); @@ -4127,6 +4130,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) mutex_unlock(&wq->mutex); } return 0; + } else if (wq->flags & __WQ_ORDERED) { + ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); + /* there should only be single pwq for ordering guarantee */ + WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || + wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), + "ordering guarantee broken for workqueue %s\n", wq->name); + return ret; } else { return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } @@ -5052,13 +5062,23 @@ static int __init init_workqueues(void) } } - /* create default unbound wq attrs */ + /* create default unbound and ordered wq attrs */ for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); attrs->nice = std_nice[i]; unbound_std_wq_attrs[i] = attrs; + + /* + * An ordered wq should have only one pwq as ordering is + * guaranteed by max_active which is enforced by pwqs. + * Turn off NUMA so that dfl_pwq is used for all nodes. + */ + BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); + attrs->nice = std_nice[i]; + attrs->no_numa = true; + ordered_wq_attrs[i] = attrs; } system_wq = alloc_workqueue("events", 0, 0); -- cgit v1.1 From 9ef28a73ff6a1598d6f915973c282fe28291f800 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Mon, 9 Sep 2013 13:13:58 +0800 Subject: workqueue: fix comment typo for __queue_work() It seems the "dying" should be "draining" here. Signed-off-by: Li Bin Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bbb5e98..73bdf3c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1323,7 +1323,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, debug_work_activate(work); - /* if dying, only works from the same workqueue are allowed */ + /* if draining, only works from the same workqueue are allowed */ if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; -- cgit v1.1 From 4e8b22bd1a37447712f1b1d96352fc53b463c6b3 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Tue, 10 Sep 2013 09:52:35 +0800 Subject: workqueue: fix pool ID allocation leakage and remove BUILD_BUG_ON() in init_workqueues When one work starts execution, the high bits of work's data contain pool ID. It can represent a maximum of WORK_OFFQ_POOL_NONE. Pool ID is assigned WORK_OFFQ_POOL_NONE when the work being initialized indicating that no pool is associated and get_work_pool() uses it to check the associated pool. So if worker_pool_assign_id() assigns a ID greater than or equal WORK_OFFQ_POOL_NONE to a pool, it triggers leakage, and it may break the non-reentrance guarantee. This patch fix this issue by modifying the worker_pool_assign_id() function calling idr_alloc() by setting @end param WORK_OFFQ_POOL_NONE. Furthermore, in the current implementation, the BUILD_BUG_ON() in init_workqueues makes no sense. The number of worker pools needed cannot be determined at compile time, because the number of backing pools for UNBOUND workqueues is dynamic based on the assigned custom attributes. So remove it. tj: Minor comment and indentation updates. Signed-off-by: Li Bin Signed-off-by: Tejun Heo --- kernel/workqueue.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 73bdf3c..c66912be 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -521,14 +521,21 @@ static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } #endif -/* allocate ID and assign it to @pool */ +/** + * worker_pool_assign_id - allocate ID and assing it to @pool + * @pool: the pool pointer of interest + * + * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned + * successfully, -errno on failure. + */ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; lockdep_assert_held(&wq_pool_mutex); - ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); + ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, + GFP_KERNEL); if (ret >= 0) { pool->id = ret; return 0; @@ -5020,10 +5027,6 @@ static int __init init_workqueues(void) int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int i, cpu; - /* make sure we have enough bits for OFFQ pool ID */ - BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < - WORK_CPU_END * NR_STD_WORKER_POOLS); - WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit v1.1 From ac01810c9d2814238f08a227062e66a35a0e1ea2 Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Mon, 25 Nov 2013 19:39:47 +0530 Subject: irq: Enable all irqs unconditionally in irq_resume When the system enters suspend, it disables all interrupts in suspend_device_irqs(), including the interrupts marked EARLY_RESUME. On the resume side things are different. The EARLY_RESUME interrupts are reenabled in sys_core_ops->resume and the non EARLY_RESUME interrupts are reenabled in the normal system resume path. When suspend_noirq() failed or suspend is aborted for any other reason, we might omit the resume side call to sys_core_ops->resume() and therefor the interrupts marked EARLY_RESUME are not reenabled and stay disabled forever. To solve this, enable all irqs unconditionally in irq_resume() regardless whether interrupts marked EARLY_RESUMEhave been already enabled or not. This might try to reenable already enabled interrupts in the non failure case, but the only affected platform is XEN and it has been confirmed that it does not cause any side effects. [ tglx: Massaged changelog. ] Signed-off-by: Laxman Dewangan Acked-by-and-tested-by: Konrad Rzeszutek Wilk Acked-by: Heiko Stuebner Reviewed-by: Pavel Machek Cc: Cc: Cc: Cc: Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1385388587-16442-1-git-send-email-ldewangan@nvidia.com Signed-off-by: Thomas Gleixner --- kernel/irq/pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cb228bf..abcd6ca 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -50,7 +50,7 @@ static void resume_irqs(bool want_early) bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME; - if (is_early != want_early) + if (!is_early && want_early) continue; raw_spin_lock_irqsave(&desc->lock, flags); -- cgit v1.1 From 12997d1a999cd1b22e21a238c96780f2a55e4e13 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 18 Nov 2013 11:00:29 -0700 Subject: Revert "workqueue: allow work_on_cpu() to be called recursively" This reverts commit c2fda509667b0fda4372a237f5a59ea4570b1627. c2fda509667b removed lockdep annotation from work_on_cpu() to work around the PCI path that calls work_on_cpu() from within a work_on_cpu() work item (PF driver .probe() method -> pci_enable_sriov() -> add VFs -> VF driver .probe method). 961da7fb6b22 ("PCI: Avoid unnecessary CPU switch when calling driver .probe() method) avoids that recursive work_on_cpu() use in a different way, so this revert restores the work_on_cpu() lockdep annotation. Signed-off-by: Bjorn Helgaas Acked-by: Tejun Heo --- kernel/workqueue.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 987293d..5690b8e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2840,19 +2840,6 @@ already_gone: return false; } -static bool __flush_work(struct work_struct *work) -{ - struct wq_barrier barr; - - if (start_flush_work(work, &barr)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else { - return false; - } -} - /** * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush @@ -2866,10 +2853,18 @@ static bool __flush_work(struct work_struct *work) */ bool flush_work(struct work_struct *work) { + struct wq_barrier barr; + lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - return __flush_work(work); + if (start_flush_work(work, &barr)) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + return true; + } else { + return false; + } } EXPORT_SYMBOL_GPL(flush_work); @@ -4814,14 +4809,7 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); schedule_work_on(cpu, &wfc.work); - - /* - * The work item is on-stack and can't lead to deadlock through - * flushing. Use __flush_work() to avoid spurious lockdep warnings - * when work_on_cpu()s are nested. - */ - __flush_work(&wfc.work); - + flush_work(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); -- cgit v1.1 From 8a56d7761d2d041ae5e8215d20b4167d8aa93f51 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 25 Nov 2013 20:59:46 -0500 Subject: ftrace: Fix function graph with loading of modules Commit 8c4f3c3fa9681 "ftrace: Check module functions being traced on reload" fixed module loading and unloading with respect to function tracing, but it missed the function graph tracer. If you perform the following # cd /sys/kernel/debug/tracing # echo function_graph > current_tracer # modprobe nfsd # echo nop > current_tracer You'll get the following oops message: ------------[ cut here ]------------ WARNING: CPU: 2 PID: 2910 at /linux.git/kernel/trace/ftrace.c:1640 __ftrace_hash_rec_update.part.35+0x168/0x1b9() Modules linked in: nfsd exportfs nfs_acl lockd ipt_MASQUERADE sunrpc ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables uinput snd_hda_codec_idt CPU: 2 PID: 2910 Comm: bash Not tainted 3.13.0-rc1-test #7 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007 0000000000000668 ffff8800787efcf8 ffffffff814fe193 ffff88007d500000 0000000000000000 ffff8800787efd38 ffffffff8103b80a 0000000000000668 ffffffff810b2b9a ffffffff81a48370 0000000000000001 ffff880037aea000 Call Trace: [] dump_stack+0x4f/0x7c [] warn_slowpath_common+0x81/0x9b [] ? __ftrace_hash_rec_update.part.35+0x168/0x1b9 [] warn_slowpath_null+0x1a/0x1c [] __ftrace_hash_rec_update.part.35+0x168/0x1b9 [] ? __mutex_lock_slowpath+0x364/0x364 [] ftrace_shutdown+0xd7/0x12b [] unregister_ftrace_graph+0x49/0x78 [] graph_trace_reset+0xe/0x10 [] tracing_set_tracer+0xa7/0x26a [] tracing_set_trace_write+0x8b/0xbd [] ? ftrace_return_to_handler+0xb2/0xde [] ? __sb_end_write+0x5e/0x5e [] vfs_write+0xab/0xf6 [] ftrace_graph_caller+0x85/0x85 [] SyS_write+0x59/0x82 [] ftrace_graph_caller+0x85/0x85 [] system_call_fastpath+0x16/0x1b ---[ end trace 940358030751eafb ]--- The above mentioned commit didn't go far enough. Well, it covered the function tracer by adding checks in __register_ftrace_function(). The problem is that the function graph tracer circumvents that (for a slight efficiency gain when function graph trace is running with a function tracer. The gain was not worth this). The problem came with ftrace_startup() which should always be called after __register_ftrace_function(), if you want this bug to be completely fixed. Anyway, this solution moves __register_ftrace_function() inside of ftrace_startup() and removes the need to call them both. Reported-by: Dave Wysochanski Fixes: ed926f9b35cd ("ftrace: Use counters to enable functions to trace") Cc: stable@vger.kernel.org # 3.0+ Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 64 ++++++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 22fa556..0e9f9ea 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -367,9 +367,6 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, static int __register_ftrace_function(struct ftrace_ops *ops) { - if (unlikely(ftrace_disabled)) - return -ENODEV; - if (FTRACE_WARN_ON(ops == &global_ops)) return -EINVAL; @@ -428,9 +425,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) { int ret; - if (ftrace_disabled) - return -ENODEV; - if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) return -EBUSY; @@ -2088,10 +2082,15 @@ static void ftrace_startup_enable(int command) static int ftrace_startup(struct ftrace_ops *ops, int command) { bool hash_enable = true; + int ret; if (unlikely(ftrace_disabled)) return -ENODEV; + ret = __register_ftrace_function(ops); + if (ret) + return ret; + ftrace_start_up++; command |= FTRACE_UPDATE_CALLS; @@ -2113,12 +2112,17 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return 0; } -static void ftrace_shutdown(struct ftrace_ops *ops, int command) +static int ftrace_shutdown(struct ftrace_ops *ops, int command) { bool hash_disable = true; + int ret; if (unlikely(ftrace_disabled)) - return; + return -ENODEV; + + ret = __unregister_ftrace_function(ops); + if (ret) + return ret; ftrace_start_up--; /* @@ -2153,9 +2157,10 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) } if (!command || !ftrace_enabled) - return; + return 0; ftrace_run_update_code(command); + return 0; } static void ftrace_startup_sysctl(void) @@ -3060,16 +3065,13 @@ static void __enable_ftrace_function_probe(void) if (i == FTRACE_FUNC_HASHSIZE) return; - ret = __register_ftrace_function(&trace_probe_ops); - if (!ret) - ret = ftrace_startup(&trace_probe_ops, 0); + ret = ftrace_startup(&trace_probe_ops, 0); ftrace_probe_registered = 1; } static void __disable_ftrace_function_probe(void) { - int ret; int i; if (!ftrace_probe_registered) @@ -3082,9 +3084,7 @@ static void __disable_ftrace_function_probe(void) } /* no more funcs left */ - ret = __unregister_ftrace_function(&trace_probe_ops); - if (!ret) - ftrace_shutdown(&trace_probe_ops, 0); + ftrace_shutdown(&trace_probe_ops, 0); ftrace_probe_registered = 0; } @@ -4366,12 +4366,15 @@ core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } /* Keep as macros so we do not need to define the commands */ -# define ftrace_startup(ops, command) \ - ({ \ - (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ - 0; \ +# define ftrace_startup(ops, command) \ + ({ \ + int ___ret = __register_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ + ___ret; \ }) -# define ftrace_shutdown(ops, command) do { } while (0) +# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops) + # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) @@ -4780,9 +4783,7 @@ int register_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_lock); - ret = __register_ftrace_function(ops); - if (!ret) - ret = ftrace_startup(ops, 0); + ret = ftrace_startup(ops, 0); mutex_unlock(&ftrace_lock); @@ -4801,9 +4802,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) int ret; mutex_lock(&ftrace_lock); - ret = __unregister_ftrace_function(ops); - if (!ret) - ftrace_shutdown(ops, 0); + ret = ftrace_shutdown(ops, 0); mutex_unlock(&ftrace_lock); return ret; @@ -4997,6 +4996,13 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, return NOTIFY_DONE; } +/* Just a place holder for function graph */ +static struct ftrace_ops fgraph_ops __read_mostly = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL | + FTRACE_OPS_FL_RECURSION_SAFE, +}; + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -5023,7 +5029,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_return = retfunc; ftrace_graph_entry = entryfunc; - ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); + ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); @@ -5040,7 +5046,7 @@ void unregister_ftrace_graph(void) ftrace_graph_active--; ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); + ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -- cgit v1.1 From 32e475d76a3e40879cd9ee4f69b19615062280d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Nov 2013 12:41:44 +0100 Subject: sched: Expose preempt_schedule_irq() Tony reported that aa0d53260596 ("ia64: Use preempt_schedule_irq") broke PREEMPT=n builds on ia64. Ok, wrapped my brain around it. I tripped over the magic asm foo which has a single need_resched check and schedule point for both sys call return and interrupt return. So you need the schedule_preempt_irq() for kernel preemption from interrupt return while on a normal syscall preemption a schedule would be sufficient. But using schedule_preempt_irq() is not harmful here in any way. It just sets the preempt_active bit also in cases where it would not be required. Even on preempt=n kernels adding the preempt_active bit is completely harmless. So instead of having an extra function, moving the existing one out of the ifdef PREEMPT looks like the sanest thing to do. It would also allow getting rid of various other sti/schedule/cli asm magic in other archs. Reported-and-Tested-by: Tony Luck Fixes: aa0d53260596 ("ia64: Use preempt_schedule_irq") Signed-off-by: Thomas Gleixner [slightly edited Changelog] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1311211230030.30673@ionos.tec.linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 718730d..e85cda2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2660,6 +2660,7 @@ asmlinkage void __sched notrace preempt_schedule(void) } while (need_resched()); } EXPORT_SYMBOL(preempt_schedule); +#endif /* CONFIG_PREEMPT */ /* * this is the entry point to schedule() from kernel preemption @@ -2693,8 +2694,6 @@ asmlinkage void __sched preempt_schedule_irq(void) exception_exit(prev_state); } -#endif /* CONFIG_PREEMPT */ - int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { -- cgit v1.1 From 0fc0287c9ed1ffd3706f8b4d9b314aa102ef1245 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Nov 2013 15:03:41 +0100 Subject: cpuset: Fix memory allocator deadlock Juri hit the below lockdep report: [ 4.303391] ====================================================== [ 4.303392] [ INFO: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected ] [ 4.303394] 3.12.0-dl-peterz+ #144 Not tainted [ 4.303395] ------------------------------------------------------ [ 4.303397] kworker/u4:3/689 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: [ 4.303399] (&p->mems_allowed_seq){+.+...}, at: [] new_slab+0x6c/0x290 [ 4.303417] [ 4.303417] and this task is already holding: [ 4.303418] (&(&q->__queue_lock)->rlock){..-...}, at: [] blk_execute_rq_nowait+0x5b/0x100 [ 4.303431] which would create a new lock dependency: [ 4.303432] (&(&q->__queue_lock)->rlock){..-...} -> (&p->mems_allowed_seq){+.+...} [ 4.303436] [ 4.303898] the dependencies between the lock to be acquired and SOFTIRQ-irq-unsafe lock: [ 4.303918] -> (&p->mems_allowed_seq){+.+...} ops: 2762 { [ 4.303922] HARDIRQ-ON-W at: [ 4.303923] [] __lock_acquire+0x65a/0x1ff0 [ 4.303926] [] lock_acquire+0x93/0x140 [ 4.303929] [] kthreadd+0x86/0x180 [ 4.303931] [] ret_from_fork+0x7c/0xb0 [ 4.303933] SOFTIRQ-ON-W at: [ 4.303933] [] __lock_acquire+0x68c/0x1ff0 [ 4.303935] [] lock_acquire+0x93/0x140 [ 4.303940] [] kthreadd+0x86/0x180 [ 4.303955] [] ret_from_fork+0x7c/0xb0 [ 4.303959] INITIAL USE at: [ 4.303960] [] __lock_acquire+0x344/0x1ff0 [ 4.303963] [] lock_acquire+0x93/0x140 [ 4.303966] [] kthreadd+0x86/0x180 [ 4.303969] [] ret_from_fork+0x7c/0xb0 [ 4.303972] } Which reports that we take mems_allowed_seq with interrupts enabled. A little digging found that this can only be from cpuset_change_task_nodemask(). This is an actual deadlock because an interrupt doing an allocation will hit get_mems_allowed()->...->__read_seqcount_begin(), which will spin forever waiting for the write side to complete. Cc: John Stultz Cc: Mel Gorman Reported-by: Juri Lelli Signed-off-by: Peter Zijlstra Tested-by: Juri Lelli Acked-by: Li Zefan Acked-by: Mel Gorman Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cpuset.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6bf981e..4772034 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1033,8 +1033,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, need_loop = task_has_mempolicy(tsk) || !nodes_intersects(*newmems, tsk->mems_allowed); - if (need_loop) + if (need_loop) { + local_irq_disable(); write_seqcount_begin(&tsk->mems_allowed_seq); + } nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); @@ -1042,8 +1044,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; - if (need_loop) + if (need_loop) { write_seqcount_end(&tsk->mems_allowed_seq); + local_irq_enable(); + } task_unlock(tsk); } -- cgit v1.1 From e605b36575e896edd8161534550c9ea021b03bc0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 27 Nov 2013 18:16:21 -0500 Subject: cgroup: fix cgroup_subsys_state leak for seq_files If a cgroup file implements either read_map() or read_seq_string(), such file is served using seq_file by overriding file->f_op to cgroup_seqfile_operations, which also overrides the release method to single_release() from cgroup_file_release(). Because cgroup_file_open() didn't use to acquire any resources, this used to be fine, but since f7d58818ba42 ("cgroup: pin cgroup_subsys_state when opening a cgroupfs file"), cgroup_file_open() pins the css (cgroup_subsys_state) which is put by cgroup_file_release(). The patch forgot to update the release path for seq_files and each open/release cycle leaks a css reference. Fix it by updating cgroup_file_release() to also handle seq_files and using it for seq_file release path too. Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v3.12 --- kernel/cgroup.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a7b98ee..8b729c2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -199,6 +199,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); +static int cgroup_file_release(struct inode *inode, struct file *file); /** * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -2429,7 +2430,7 @@ static const struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, - .release = single_release, + .release = cgroup_file_release, }; static int cgroup_file_open(struct inode *inode, struct file *file) @@ -2490,6 +2491,8 @@ static int cgroup_file_release(struct inode *inode, struct file *file) ret = cft->release(inode, file); if (css->ss) css_put(css); + if (file->f_op == &cgroup_seqfile_operations) + single_release(inode, file); return ret; } -- cgit v1.1 From 5ecbe3c3c690b5ab493c730c317475287a9e8b45 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 28 Nov 2013 09:16:33 +0100 Subject: kernel/extable: fix address-checks for core_kernel and init areas The init_kernel_text() and core_kernel_text() functions should not include the labels _einittext and _etext when checking if an address is inside the .text or .init sections. Signed-off-by: Helge Deller Signed-off-by: Linus Torvalds --- kernel/extable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 832cb28..763faf0 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) static inline int init_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_sinittext && - addr <= (unsigned long)_einittext) + addr < (unsigned long)_einittext) return 1; return 0; } @@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr) int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && - addr <= (unsigned long)_etext) + addr < (unsigned long)_etext) return 1; if (system_state == SYSTEM_BOOTING && -- cgit v1.1 From 0e576acbc1d9600cf2d9b4a141a2554639959d50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Nov 2013 12:18:13 +0100 Subject: nohz: Fix another inconsistency between CONFIG_NO_HZ=n and nohz=off If CONFIG_NO_HZ=n tick_nohz_get_sleep_length() returns NSEC_PER_SEC/HZ. If CONFIG_NO_HZ=y and the nohz functionality is disabled via the command line option "nohz=off" or not enabled due to missing hardware support, then tick_nohz_get_sleep_length() returns 0. That happens because ts->sleep_length is never set in that case. Set it to NSEC_PER_SEC/HZ when the NOHZ mode is inactive. Reported-by: Michal Hocko Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a12df5a..ea20f7d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -711,8 +711,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { + ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ }; return false; + } if (need_resched()) return false; -- cgit v1.1 From 3ccb01239201af06a07482ec686b14cd148102a5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Dec 2013 12:41:20 -0500 Subject: tracing: Only run synchronize_sched() at instance deletion time It has been reported that boot up with FTRACE_SELFTEST enabled can take a very long time. There can be stalls of over a minute. This was tracked down to the synchronize_sched() called when a system call event is disabled. As the self tests enable and disable thousands of events, this makes the synchronize_sched() get called thousands of times. The synchornize_sched() was added with d562aff93bfb53 "tracing: Add support for SOFT_DISABLE to syscall events" which caused this regression (added in 3.13-rc1). The synchronize_sched() is to protect against the events being accessed when a tracer instance is being deleted. When an instance is being deleted all the events associated to it are unregistered. The synchronize_sched() makes sure that no more users are running when it finishes. Instead of calling synchronize_sched() for all syscall events, we only need to call it once, after the events are unregistered and before the instance is deleted. The event_mutex is held during this action to prevent new users from enabling events. Link: http://lkml.kernel.org/r/20131203124120.427b9661@gandalf.local.home Reported-by: Petr Mladek Acked-by: Tom Zanussi Acked-by: Petr Mladek Tested-by: Petr Mladek Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 3 +++ kernel/trace/trace_syscalls.c | 10 ---------- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f919a2e..a11800a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2314,6 +2314,9 @@ int event_trace_del_tracer(struct trace_array *tr) /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + /* Access to events are within rcu_read_lock_sched() */ + synchronize_sched(); + down_write(&trace_event_sem); __trace_remove_event_dirs(tr); debugfs_remove_recursive(tr->event_dir); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index e4b6d11..ea90eb5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -431,11 +431,6 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); - /* - * Callers expect the event to be completely disabled on - * return, so wait for current handlers to finish. - */ - synchronize_sched(); } static int reg_event_syscall_exit(struct ftrace_event_file *file, @@ -474,11 +469,6 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, if (!tr->sys_refcount_exit) unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); - /* - * Callers expect the event to be completely disabled on - * return, so wait for current handlers to finish. - */ - synchronize_sched(); } static int __init init_syscall_trace(struct ftrace_event_call *call) -- cgit v1.1 From 266ccd505e8acb98717819cef9d91d66c7b237cc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Dec 2013 15:07:32 -0500 Subject: cgroup: fix cgroup_create() error handling path ae7f164a09 ("cgroup: move cgroup->subsys[] assignment to online_css()") moved cgroup->subsys[] assignements later in cgroup_create() but didn't update error handling path accordingly leading to the following oops and leaking later css's after an online_css() failure. The oops is from cgroup destruction path being invoked on the partially constructed cgroup which is not ready to handle empty slots in cgrp->subsys[] array. BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] cgroup_destroy_locked+0x118/0x2f0 PGD a780a067 PUD aadbe067 PMD 0 Oops: 0000 [#1] SMP Modules linked in: CPU: 6 PID: 7360 Comm: mkdir Not tainted 3.13.0-rc2+ #69 Hardware name: task: ffff8800b9dbec00 ti: ffff8800a781a000 task.ti: ffff8800a781a000 RIP: 0010:[] [] cgroup_destroy_locked+0x118/0x2f0 RSP: 0018:ffff8800a781bd98 EFLAGS: 00010282 RAX: ffff880586903878 RBX: ffff880586903800 RCX: ffff880586903820 RDX: ffff880586903860 RSI: ffff8800a781bdb0 RDI: ffff880586903820 RBP: ffff8800a781bde8 R08: ffff88060e0b8048 R09: ffffffff811d7bc1 R10: 000000000000008c R11: 0000000000000001 R12: ffff8800a72286c0 R13: 0000000000000000 R14: ffffffff81cf7a40 R15: 0000000000000001 FS: 00007f60ecda57a0(0000) GS:ffff8806272c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 00000000a7a03000 CR4: 00000000000007e0 Stack: ffff880586903860 ffff880586903910 ffff8800a72286c0 ffff880586903820 ffffffff81cf7a40 ffff880586903800 ffff88060e0b8018 ffffffff81cf7a40 ffff8800b9dbec00 ffff8800b9dbf098 ffff8800a781bec8 ffffffff810ef5bf Call Trace: [] cgroup_mkdir+0x55f/0x5f0 [] vfs_mkdir+0xee/0x140 [] SyS_mkdirat+0x6e/0xf0 [] SyS_mkdir+0x19/0x20 [] system_call_fastpath+0x16/0x1b This patch moves reference bumping inside online_css() loop, clears css_ar[] as css's are brought online successfully, and updates err_destroy path so that either a css is fully online and destroyed by cgroup_destroy_locked() or the error path frees it. This creates a duplicate css free logic in the error path but it will be cleaned up soon. v2: Li pointed out that cgroup_destroy_locked() would do NULL-deref if invoked with a cgroup which doesn't have all css's populated. Update cgroup_destroy_locked() so that it skips NULL css's. Signed-off-by: Tejun Heo Acked-by: Li Zefan Reported-by: Vladimir Davydov Cc: stable@vger.kernel.org # v3.12+ --- kernel/cgroup.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8b729c2..bcb1755 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4426,14 +4426,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; - /* each css holds a ref to the cgroup's dentry and the parent css */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; - - dget(dentry); - css_get(css->parent); - } - /* hold a ref to the parent's dentry */ dget(parent->dentry); @@ -4445,6 +4437,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err) goto err_destroy; + /* each css holds a ref to the cgroup's dentry and parent css */ + dget(dentry); + css_get(css->parent); + + /* mark it consumed for error path */ + css_ar[ss->subsys_id] = NULL; + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", @@ -4491,6 +4490,14 @@ err_free_cgrp: return err; err_destroy: + for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; + + if (css) { + percpu_ref_cancel_init(&css->refcnt); + ss->css_free(css); + } + } cgroup_destroy_locked(cgrp); mutex_unlock(&cgroup_mutex); mutex_unlock(&dentry->d_inode->i_mutex); @@ -4652,8 +4659,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * will be invoked to perform the rest of destruction once the * percpu refs of all css's are confirmed to be killed. */ - for_each_root_subsys(cgrp->root, ss) - kill_css(cgroup_css(cgrp, ss)); + for_each_root_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); + + if (css) + kill_css(css); + } /* * Mark @cgrp dead. This prevents further task migration and child -- cgit v1.1 From 4fc9bbf98fd66f879e628d8537ba7c240be2b58e Mon Sep 17 00:00:00 2001 From: Khalid Aziz Date: Wed, 27 Nov 2013 15:19:25 -0700 Subject: PCI: Disable Bus Master only on kexec reboot Add a flag to tell the PCI subsystem that kernel is shutting down in preparation to kexec a kernel. Add code in PCI subsystem to use this flag to clear Bus Master bit on PCI devices only in case of kexec reboot. This fixes a power-off problem on Acer Aspire V5-573G and likely other machines and avoids any other issues caused by clearing Bus Master bit on PCI devices in normal shutdown path. The problem was introduced by b566a22c2332 ("PCI: disable Bus Master on PCI device shutdown"). This patch is based on discussion at http://marc.info/?l=linux-pci&m=138425645204355&w=2 Link: https://bugzilla.kernel.org/show_bug.cgi?id=63861 Reported-by: Chang Liu Signed-off-by: Khalid Aziz Signed-off-by: Bjorn Helgaas Acked-by: Konstantin Khlebnikov Cc: stable@vger.kernel.org # v3.5+ --- kernel/kexec.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 490afc0..d0d8fca 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -47,6 +47,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; size_t vmcoreinfo_size; size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); +/* Flag to indicate we are going to kexec a new kernel */ +bool kexec_in_progress = false; + /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", @@ -1675,6 +1678,7 @@ int kernel_kexec(void) } else #endif { + kexec_in_progress = true; kernel_restart_prepare(NULL); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); -- cgit v1.1 From 7cfe5b3310a1b45f385ff18647bddb487a6c5525 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 10 Dec 2013 17:42:50 +1030 Subject: Ignore generated file kernel/x509_certificate_list $ git status # On branch pending-rebases # Untracked files: # (use "git add ..." to include in what will be committed) # # kernel/x509_certificate_list nothing added to commit but untracked files present (use "git add" to track) $ Signed-off-by: Rusty Russell Signed-off-by: David Howells --- kernel/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/.gitignore b/kernel/.gitignore index b3097bd..790d83c 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -5,3 +5,4 @@ config_data.h config_data.gz timeconst.h hz.bc +x509_certificate_list -- cgit v1.1 From 62226983da070f7e51068ec2e3a4da34672964c7 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Thu, 5 Dec 2013 14:48:22 +0100 Subject: KEYS: correct alignment of system_certificate_list content in assembly file Apart from data-type specific alignment constraints, there are also architecture-specific alignment requirements. For example, on s390 symbols must be on even addresses implying a 2-byte alignment. If the system_certificate_list_end symbol is on an odd address and if this address is loaded, the least-significant bit is ignored. As a result, the load_system_certificate_list() fails to load the certificates because of a wrong certificate length calculation. To be safe, align system_certificate_list on an 8-byte boundary. Also improve the length calculation of the system_certificate_list content. Introduce a system_certificate_list_size (8-byte aligned because of unsigned long) variable that stores the length. Let the linker calculate this size by introducing a start and end label for the certificate content. Signed-off-by: Hendrik Brueckner Signed-off-by: David Howells --- kernel/system_certificates.S | 14 ++++++++++++-- kernel/system_keyring.c | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S index 4aef390..3e9868d47 100644 --- a/kernel/system_certificates.S +++ b/kernel/system_certificates.S @@ -3,8 +3,18 @@ __INITRODATA + .align 8 .globl VMLINUX_SYMBOL(system_certificate_list) VMLINUX_SYMBOL(system_certificate_list): +__cert_list_start: .incbin "kernel/x509_certificate_list" - .globl VMLINUX_SYMBOL(system_certificate_list_end) -VMLINUX_SYMBOL(system_certificate_list_end): +__cert_list_end: + + .align 8 + .globl VMLINUX_SYMBOL(system_certificate_list_size) +VMLINUX_SYMBOL(system_certificate_list_size): +#ifdef CONFIG_64BIT + .quad __cert_list_end - __cert_list_start +#else + .long __cert_list_end - __cert_list_start +#endif diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c index 564dd93..52ebc70 100644 --- a/kernel/system_keyring.c +++ b/kernel/system_keyring.c @@ -22,7 +22,7 @@ struct key *system_trusted_keyring; EXPORT_SYMBOL_GPL(system_trusted_keyring); extern __initconst const u8 system_certificate_list[]; -extern __initconst const u8 system_certificate_list_end[]; +extern __initconst const unsigned long system_certificate_list_size; /* * Load the compiled-in keys @@ -60,8 +60,8 @@ static __init int load_system_certificate_list(void) pr_notice("Loading compiled-in X.509 certificates\n"); - end = system_certificate_list_end; p = system_certificate_list; + end = p + system_certificate_list_size; while (p < end) { /* Each cert begins with an ASN.1 SEQUENCE tag and must be more * than 256 bytes in size. -- cgit v1.1 From 8e8339a3a1069141985daaa2521ba304509ddecd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Dec 2013 11:09:53 +0100 Subject: sched: Initialize power_orig for overlapping groups Yinghai reported that he saw a /0 in sg_capacity on his EX parts. Make sure to always initialize power_orig now that we actually use it. Ideally build_sched_domains() -> init_sched_groups_power() would also initialize this; but for some yet unexplained reason some setups seem to miss updates there. Reported-by: Yinghai Lu Tested-by: Yinghai Lu Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-l8ng2m9uml6fhibln8wqpom7@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e85cda2..19af58f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5112,6 +5112,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. */ sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); + sg->sgp->power_orig = sg->sgp->power; /* * Make sure the first group of this domain contains the -- cgit v1.1 From 9dbdb155532395ba000c5d5d187658b0e17e529f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Nov 2013 18:27:06 +0100 Subject: sched/fair: Rework sched_fair time accounting Christian suffers from a bad BIOS that wrecks his i5's TSC sync. This results in him occasionally seeing time going backwards - which crashes the scheduler ... Most of our time accounting can actually handle that except the most common one; the tick time update of sched_fair. There is a further problem with that code; previously we assumed that because we get a tick every TICK_NSEC our time delta could never exceed 32bits and math was simpler. However, ever since Frederic managed to get NO_HZ_FULL merged; this is no longer the case since now a task can run for a long time indeed without getting a tick. It only takes about ~4.2 seconds to overflow our u32 in nanoseconds. This means we not only need to better deal with time going backwards; but also means we need to be able to deal with large deltas. This patch reworks the entire code and uses mul_u64_u32_shr() as proposed by Andy a long while ago. We express our virtual time scale factor in a u32 multiplier and shift right and the 32bit mul_u64_u32_shr() implementation reduces to a single 32x32->64 multiply if the time delta is still short (common case). For 64bit a 64x64->128 multiply can be used if ARCH_SUPPORTS_INT128. Reported-and-Tested-by: Christian Engelmayer Signed-off-by: Peter Zijlstra Cc: fweisbec@gmail.com Cc: Paul Turner Cc: Stanislaw Gruszka Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20131118172706.GI3866@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 144 +++++++++++++++++++++++----------------------------- 1 file changed, 64 insertions(+), 80 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fd773ad..9030da7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -178,59 +178,61 @@ void sched_init_granularity(void) update_sysctl(); } -#if BITS_PER_LONG == 32 -# define WMULT_CONST (~0UL) -#else -# define WMULT_CONST (1UL << 32) -#endif - +#define WMULT_CONST (~0U) #define WMULT_SHIFT 32 -/* - * Shift right and round: - */ -#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +static void __update_inv_weight(struct load_weight *lw) +{ + unsigned long w; + + if (likely(lw->inv_weight)) + return; + + w = scale_load_down(lw->weight); + + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) + lw->inv_weight = 1; + else if (unlikely(!w)) + lw->inv_weight = WMULT_CONST; + else + lw->inv_weight = WMULT_CONST / w; +} /* - * delta *= weight / lw + * delta_exec * weight / lw.weight + * OR + * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT + * + * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case + * we're guaranteed shift stays positive because inv_weight is guaranteed to + * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. + * + * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus + * weight/lw.weight <= 1, and therefore our shift will also be positive. */ -static unsigned long -calc_delta_mine(unsigned long delta_exec, unsigned long weight, - struct load_weight *lw) +static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) { - u64 tmp; + u64 fact = scale_load_down(weight); + int shift = WMULT_SHIFT; - /* - * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched - * entities since MIN_SHARES = 2. Treat weight as 1 if less than - * 2^SCHED_LOAD_RESOLUTION. - */ - if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) - tmp = (u64)delta_exec * scale_load_down(weight); - else - tmp = (u64)delta_exec; + __update_inv_weight(lw); - if (!lw->inv_weight) { - unsigned long w = scale_load_down(lw->weight); - - if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) - lw->inv_weight = 1; - else if (unlikely(!w)) - lw->inv_weight = WMULT_CONST; - else - lw->inv_weight = WMULT_CONST / w; + if (unlikely(fact >> 32)) { + while (fact >> 32) { + fact >>= 1; + shift--; + } } - /* - * Check whether we'd overflow the 64-bit multiplication: - */ - if (unlikely(tmp > WMULT_CONST)) - tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, - WMULT_SHIFT/2); - else - tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); + /* hint to use a 32x32->64 mul */ + fact = (u64)(u32)fact * lw->inv_weight; + + while (fact >> 32) { + fact >>= 1; + shift--; + } - return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); + return mul_u64_u32_shr(delta_exec, fact, shift); } @@ -443,7 +445,7 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #endif /* CONFIG_FAIR_GROUP_SCHED */ static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -612,11 +614,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write, /* * delta /= w */ -static inline unsigned long -calc_delta_fair(unsigned long delta, struct sched_entity *se) +static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) { if (unlikely(se->load.weight != NICE_0_LOAD)) - delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); + delta = __calc_delta(delta, NICE_0_LOAD, &se->load); return delta; } @@ -665,7 +666,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&lw, se->load.weight); load = &lw; } - slice = calc_delta_mine(slice, se->load.weight, load); + slice = __calc_delta(slice, se->load.weight, load); } return slice; } @@ -703,47 +704,32 @@ void init_task_runnable_average(struct task_struct *p) #endif /* - * Update the current task's runtime statistics. Skip current tasks that - * are not in our scheduling class. + * Update the current task's runtime statistics. */ -static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, - unsigned long delta_exec) -{ - unsigned long delta_exec_weighted; - - schedstat_set(curr->statistics.exec_max, - max((u64)delta_exec, curr->statistics.exec_max)); - - curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = calc_delta_fair(delta_exec, curr); - - curr->vruntime += delta_exec_weighted; - update_min_vruntime(cfs_rq); -} - static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; u64 now = rq_clock_task(rq_of(cfs_rq)); - unsigned long delta_exec; + u64 delta_exec; if (unlikely(!curr)) return; - /* - * Get the amount of time the current task was running - * since the last time we changed load (this cannot - * overflow on 32 bits): - */ - delta_exec = (unsigned long)(now - curr->exec_start); - if (!delta_exec) + delta_exec = now - curr->exec_start; + if (unlikely((s64)delta_exec <= 0)) return; - __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; + schedstat_set(curr->statistics.exec_max, + max(delta_exec, curr->statistics.exec_max)); + + curr->sum_exec_runtime += delta_exec; + schedstat_add(cfs_rq, exec_clock, delta_exec); + + curr->vruntime += calc_delta_fair(delta_exec, curr); + update_min_vruntime(cfs_rq); + if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); @@ -3015,8 +3001,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) } } -static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) +static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; @@ -3034,7 +3019,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, } static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) return; @@ -3574,8 +3559,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) return rq_clock_task(rq_of(cfs_rq)); } -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) {} +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} -- cgit v1.1 From f12d5bfceb7e1f9051563381ec047f7f13956c3c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 12 Dec 2013 09:38:42 -0800 Subject: futex: fix handling of read-only-mapped hugepages The hugepage code had the exact same bug that regular pages had in commit 7485d0d3758e ("futexes: Remove rw parameter from get_futex_key()"). The regular page case was fixed by commit 9ea71503a8ed ("futex: Fix regression with read only mappings"), but the transparent hugepage case (added in a5b338f2b0b1: "thp: update futex compound knowledge") case remained broken. Found by Dave Jones and his trinity tool. Reported-and-tested-by: Dave Jones Cc: stable@kernel.org # v2.6.38+ Acked-by: Thomas Gleixner Cc: Mel Gorman Cc: Darren Hart Cc: Andrea Arcangeli Cc: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 80ba086..02febad 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -288,7 +288,7 @@ again: put_page(page); /* serialize against __split_huge_page_splitting() */ local_irq_disable(); - if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { + if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { page_head = compound_head(page); /* * page_head is valid pointer but we must pin -- cgit v1.1 From 5cdec2d833748fbd27d3682f7209225c504c79c5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 12 Dec 2013 09:53:51 -0800 Subject: futex: move user address verification up to common code When debugging the read-only hugepage case, I was confused by the fact that get_futex_key() did an access_ok() only for the non-shared futex case, since the user address checking really isn't in any way specific to the private key handling. Now, it turns out that the shared key handling does effectively do the equivalent checks inside get_user_pages_fast() (it doesn't actually check the address range on x86, but does check the page protections for being a user page). So it wasn't actually a bug, but the fact that we treat the address differently for private and shared futexes threw me for a loop. Just move the check up, so that it gets done for both cases. Also, use the 'rw' parameter for the type, even if it doesn't actually matter any more (it's a historical artifact of the old racy i386 "page faults from kernel space don't check write protections"). Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/futex.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 02febad..f6ff019 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -251,6 +251,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) return -EINVAL; address -= key->both.offset; + if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) + return -EFAULT; + /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs @@ -259,8 +262,6 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) * but access_ok() should be faster than find_vma() */ if (!fshared) { - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) - return -EFAULT; key->private.mm = mm; key->private.address = address; get_futex_key_refs(key); -- cgit v1.1 From d7ec435fdd03cfee70dba934ee384acc87bd6d00 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2013 15:20:19 +0000 Subject: X.509: Fix certificate gathering Fix the gathering of certificates from both the source tree and the build tree to correctly calculate the pathnames of all the certificates. The problem was that if the default generated cert, signing_key.x509, didn't exist then it would not have a path attached and if it did, it would have a path attached. This means that the contents of kernel/.x509.list would change between the first compilation in a directory and the second. After the second it would remain stable because the signing_key.x509 file exists. The consequence was that the kernel would get relinked unconditionally on the second recompilation. The second recompilation would also show something like this: X.509 certificate list changed CERTS kernel/x509_certificate_list - Including cert /home/torvalds/v2.6/linux/signing_key.x509 AS kernel/system_certificates.o LD kernel/built-in.o which is why the relink would happen. Unfortunately, it isn't a simple matter of just sticking a path on the front of the filename of the certificate in the build directory as make can't then work out how to build it. So the path has to be prepended to the name for sorting and duplicate elimination and then removed for the make rule if it is in the build tree. Reported-by: Linus Torvalds Signed-off-by: David Howells --- kernel/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index bbaf7d5..c23bb0b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -137,9 +137,10 @@ $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE ############################################################################### ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) -X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509 -X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ +X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509 +X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ $(or $(realpath $(CERT)),$(CERT)))) +X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw)) ifeq ($(X509_CERTIFICATES),) $(warning *** No X.509 certificates found ***) -- cgit v1.1 From f46a3cbbebdaa5ca7b3ab23d7b81925dbe152bcb Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 10 Dec 2013 22:39:57 +0400 Subject: KEYS: Remove files generated when SYSTEM_TRUSTED_KEYRING=y Always remove generated SYSTEM_TRUSTED_KEYRING files while doing make mrproper. Signed-off-by: Kirill Tkhai Signed-off-by: David Howells --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index c23bb0b..bc010ee 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -165,9 +165,9 @@ $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list targets += $(obj)/.x509.list $(obj)/.x509.list: @echo $(X509_CERTIFICATES) >$@ +endif clean-files := x509_certificate_list .x509.list -endif ifeq ($(CONFIG_MODULE_SIG),y) ############################################################################### -- cgit v1.1 From 6bd364d82920be726c2d678e7ba9e27112686e11 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 13 Dec 2013 15:00:32 +0800 Subject: KEYS: fix uninitialized persistent_keyring_register_sem We run into this bug: [ 2736.063245] Unable to handle kernel paging request for data at address 0x00000000 [ 2736.063293] Faulting instruction address: 0xc00000000037efb0 [ 2736.063300] Oops: Kernel access of bad area, sig: 11 [#1] [ 2736.063303] SMP NR_CPUS=2048 NUMA pSeries [ 2736.063310] Modules linked in: sg nfsv3 rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache nf_conntrack_netbios_ns nf_conntrack_broadcast ipt_MASQUERADE ip6table_mangle ip6table_security ip6table_raw ip6t_REJECT iptable_nat nf_nat_ipv4 iptable_mangle iptable_security iptable_raw ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack ebtable_filter ebtables ip6table_filter iptable_filter ip_tables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 nf_nat nf_conntrack ip6_tables ibmveth pseries_rng nx_crypto nfsd auth_rpcgss nfs_acl lockd sunrpc binfmt_misc xfs libcrc32c dm_service_time sd_mod crc_t10dif crct10dif_common ibmvfc scsi_transport_fc scsi_tgt dm_mirror dm_region_hash dm_log dm_multipath dm_mod [ 2736.063383] CPU: 1 PID: 7128 Comm: ssh Not tainted 3.10.0-48.el7.ppc64 #1 [ 2736.063389] task: c000000131930120 ti: c0000001319a0000 task.ti: c0000001319a0000 [ 2736.063394] NIP: c00000000037efb0 LR: c0000000006c40f8 CTR: 0000000000000000 [ 2736.063399] REGS: c0000001319a3870 TRAP: 0300 Not tainted (3.10.0-48.el7.ppc64) [ 2736.063403] MSR: 8000000000009032 CR: 28824242 XER: 20000000 [ 2736.063415] SOFTE: 0 [ 2736.063418] CFAR: c00000000000908c [ 2736.063421] DAR: 0000000000000000, DSISR: 40000000 [ 2736.063425] GPR00: c0000000006c40f8 c0000001319a3af0 c000000001074788 c0000001319a3bf0 GPR04: 0000000000000000 0000000000000000 0000000000000020 000000000000000a GPR08: fffffffe00000002 00000000ffff0000 0000000080000001 c000000000924888 GPR12: 0000000028824248 c000000007e00400 00001fffffa0f998 0000000000000000 GPR16: 0000000000000022 00001fffffa0f998 0000010022e92470 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR24: 0000000000000000 c000000000f4a828 00003ffffe527108 0000000000000000 GPR28: c000000000f4a730 c000000000f4a828 0000000000000000 c0000001319a3bf0 [ 2736.063498] NIP [c00000000037efb0] .__list_add+0x30/0x110 [ 2736.063504] LR [c0000000006c40f8] .rwsem_down_write_failed+0x78/0x264 [ 2736.063508] PACATMSCRATCH [800000000280f032] [ 2736.063511] Call Trace: [ 2736.063516] [c0000001319a3af0] [c0000001319a3b80] 0xc0000001319a3b80 (unreliable) [ 2736.063523] [c0000001319a3b80] [c0000000006c40f8] .rwsem_down_write_failed+0x78/0x264 [ 2736.063530] [c0000001319a3c50] [c0000000006c1bb0] .down_write+0x70/0x78 [ 2736.063536] [c0000001319a3cd0] [c0000000002e5ffc] .keyctl_get_persistent+0x20c/0x320 [ 2736.063542] [c0000001319a3dc0] [c0000000002e2388] .SyS_keyctl+0x238/0x260 [ 2736.063548] [c0000001319a3e30] [c000000000009e7c] syscall_exit+0x0/0x7c [ 2736.063553] Instruction dump: [ 2736.063556] 7c0802a6 fba1ffe8 fbc1fff0 fbe1fff8 7cbd2b78 7c9e2378 7c7f1b78 f8010010 [ 2736.063566] f821ff71 e8a50008 7fa52040 40de00c0 7fbd2840 40de0094 7fbff040 [ 2736.063579] ---[ end trace 2708241785538296 ]--- It's caused by uninitialized persistent_keyring_register_sem. The bug was introduced by commit f36f8c75, two typos are in that commit: CONFIG_KEYS_KERBEROS_CACHE should be CONFIG_PERSISTENT_KEYRINGS and krb_cache_register_sem should be persistent_keyring_register_sem. Signed-off-by: Xiao Guangrong Signed-off-by: David Howells --- kernel/user.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index a3a0dbf..c006131 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,9 +51,9 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, -#ifdef CONFIG_KEYS_KERBEROS_CACHE - .krb_cache_register_sem = - __RWSEM_INITIALIZER(init_user_ns.krb_cache_register_sem), +#ifdef CONFIG_PERSISTENT_KEYRINGS + .persistent_keyring_register_sem = + __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), #endif }; EXPORT_SYMBOL_GPL(init_user_ns); -- cgit v1.1 From c4602c1c818bd6626178d6d3fcc152d9f2f48ac0 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 16 Dec 2013 15:20:01 +0800 Subject: ftrace: Initialize the ftrace profiler for each possible cpu Ftrace currently initializes only the online CPUs. This implementation has two problems: - If we online a CPU after we enable the function profile, and then run the test, we will lose the trace information on that CPU. Steps to reproduce: # echo 0 > /sys/devices/system/cpu/cpu1/online # cd /tracing/ # echo >> set_ftrace_filter # echo 1 > function_profile_enabled # echo 1 > /sys/devices/system/cpu/cpu1/online # run test - If we offline a CPU before we enable the function profile, we will not clear the trace information when we enable the function profile. It will trouble the users. Steps to reproduce: # cd /tracing/ # echo >> set_ftrace_filter # echo 1 > function_profile_enabled # run test # cat trace_stat/function* # echo 0 > /sys/devices/system/cpu/cpu1/online # echo 0 > function_profile_enabled # echo 1 > function_profile_enabled # cat trace_stat/function* # run test # cat trace_stat/function* So it is better that we initialize the ftrace profiler for each possible cpu every time we enable the function profile instead of just the online ones. Link: http://lkml.kernel.org/r/1387178401-10619-1-git-send-email-miaox@cn.fujitsu.com Cc: stable@vger.kernel.org # 2.6.31+ Signed-off-by: Miao Xie Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0e9f9ea..72a0f81 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -775,7 +775,7 @@ static int ftrace_profile_init(void) int cpu; int ret = 0; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { ret = ftrace_profile_init_cpu(cpu); if (ret) break; -- cgit v1.1 From c1a71504e9715812a2d15e7c03b5aa147ae70ded Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 17 Dec 2013 11:13:39 +0800 Subject: cgroup: don't recycle cgroup id until all csses' have been destroyed Hugh reported this bug: > CONFIG_MEMCG_SWAP is broken in 3.13-rc. Try something like this: > > mkdir -p /tmp/tmpfs /tmp/memcg > mount -t tmpfs -o size=1G tmpfs /tmp/tmpfs > mount -t cgroup -o memory memcg /tmp/memcg > mkdir /tmp/memcg/old > echo 512M >/tmp/memcg/old/memory.limit_in_bytes > echo $$ >/tmp/memcg/old/tasks > cp /dev/zero /tmp/tmpfs/zero 2>/dev/null > echo $$ >/tmp/memcg/tasks > rmdir /tmp/memcg/old > sleep 1 # let rmdir work complete > mkdir /tmp/memcg/new > umount /tmp/tmpfs > dmesg | grep WARNING > rmdir /tmp/memcg/new > umount /tmp/memcg > > Shows lots of WARNING: CPU: 1 PID: 1006 at kernel/res_counter.c:91 > res_counter_uncharge_locked+0x1f/0x2f() > > Breakage comes from 34c00c319ce7 ("memcg: convert to use cgroup id"). > > The lifetime of a cgroup id is different from the lifetime of the > css id it replaced: memsw's css_get()s do nothing to hold on to the > old cgroup id, it soon gets recycled to a new cgroup, which then > mysteriously inherits the old's swap, without any charge for it. Instead of removing cgroup id right after all the csses have been offlined, we should do that after csses have been destroyed. To make sure an invalid css pointer won't be returned after the css is destroyed, make sure css_from_id() returns NULL in this case. tj: Updated comment to note planned changes for cgrp->id. Reported-by: Hugh Dickins Signed-off-by: Li Zefan Reviewed-by: Michal Hocko Signed-off-by: Tejun Heo --- kernel/cgroup.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bcb1755..bc1dcab 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -890,6 +890,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) struct cgroup *cgrp = dentry->d_fsdata; BUG_ON(!(cgroup_is_dead(cgrp))); + + /* + * XXX: cgrp->id is only used to look up css's. As cgroup + * and css's lifetimes will be decoupled, it should be made + * per-subsystem and moved to css->id so that lookups are + * successful until the target css is released. + */ + idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + cgrp->id = -1; + call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } else { struct cfent *cfe = __d_cfe(dentry); @@ -4268,6 +4278,7 @@ static void css_release(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); + rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4733,14 +4744,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); - /* - * We should remove the cgroup object from idr before its grace - * period starts, so we won't be looking up a cgroup while the - * cgroup is being freed. - */ - idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - dput(d); set_bit(CGRP_RELEASABLE, &parent->flags); -- cgit v1.1 From 443772776c69ac9293d66b4d69fd9af16299cc2a Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Mon, 16 Dec 2013 14:17:36 +0200 Subject: perf: Disable all pmus on unthrottling and rescheduling Currently, only one PMU in a context gets disabled during unthrottling and event_sched_{out,in}(), however, events in one context may belong to different pmus, which results in PMUs being reprogrammed while they are still enabled. This means that mixed PMU use [which is rare in itself] resulted in potentially completely unreliable results: corrupted events, bogus results, etc. This patch temporarily disables PMUs that correspond to each event in the context while these events are being modified. Signed-off-by: Alexander Shishkin Reviewed-by: Andi Kleen Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Paul Mackerras Cc: Stephane Eranian Cc: Alexander Shishkin Link: http://lkml.kernel.org/r/1387196256-8030-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 72348dc..f574401 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1396,6 +1396,8 @@ event_sched_out(struct perf_event *event, if (event->state != PERF_EVENT_STATE_ACTIVE) return; + perf_pmu_disable(event->pmu); + event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; @@ -1412,6 +1414,8 @@ event_sched_out(struct perf_event *event, ctx->nr_freq--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; + + perf_pmu_enable(event->pmu); } static void @@ -1652,6 +1656,7 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { u64 tstamp = perf_event_time(event); + int ret = 0; if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -1674,10 +1679,13 @@ event_sched_in(struct perf_event *event, */ smp_wmb(); + perf_pmu_disable(event->pmu); + if (event->pmu->add(event, PERF_EF_START)) { event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; - return -EAGAIN; + ret = -EAGAIN; + goto out; } event->tstamp_running += tstamp - event->tstamp_stopped; @@ -1693,7 +1701,10 @@ event_sched_in(struct perf_event *event, if (event->attr.exclusive) cpuctx->exclusive = 1; - return 0; +out: + perf_pmu_enable(event->pmu); + + return ret; } static int @@ -2743,6 +2754,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; + perf_pmu_disable(event->pmu); + hwc = &event->hw; if (hwc->interrupts == MAX_INTERRUPTS) { @@ -2752,7 +2765,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, } if (!event->attr.freq || !event->attr.sample_freq) - continue; + goto next; /* * stop the event and update event->count @@ -2774,6 +2787,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); + next: + perf_pmu_enable(event->pmu); } perf_pmu_enable(ctx->pmu); -- cgit v1.1 From 5d4cf996cf134e8ddb4f906b8197feb9267c2b77 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 17 Dec 2013 09:21:25 +0000 Subject: sched: Assign correct scheduling domain to 'sd_llc' Commit 42eb088e (sched: Avoid NULL dereference on sd_busy) corrected a NULL dereference on sd_busy but the fix also altered what scheduling domain it used for the 'sd_llc' percpu variable. One impact of this is that a task selecting a runqueue may consider idle CPUs that are not cache siblings as candidates for running. Tasks are then running on CPUs that are not cache hot. This was found through bisection where ebizzy threads were not seeing equal performance and it looked like a scheduling fairness issue. This patch mitigates but does not completely fix the problem on all machines tested implying there may be an additional bug or a common root cause. Here are the average range of performance seen by individual ebizzy threads. It was tested on top of candidate patches related to x86 TLB range flushing. 4-core machine 3.13.0-rc3 3.13.0-rc3 vanilla fixsd-v3r3 Mean 1 0.00 ( 0.00%) 0.00 ( 0.00%) Mean 2 0.34 ( 0.00%) 0.10 ( 70.59%) Mean 3 1.29 ( 0.00%) 0.93 ( 27.91%) Mean 4 7.08 ( 0.00%) 0.77 ( 89.12%) Mean 5 193.54 ( 0.00%) 2.14 ( 98.89%) Mean 6 151.12 ( 0.00%) 2.06 ( 98.64%) Mean 7 115.38 ( 0.00%) 2.04 ( 98.23%) Mean 8 108.65 ( 0.00%) 1.92 ( 98.23%) 8-core machine Mean 1 0.00 ( 0.00%) 0.00 ( 0.00%) Mean 2 0.40 ( 0.00%) 0.21 ( 47.50%) Mean 3 23.73 ( 0.00%) 0.89 ( 96.25%) Mean 4 12.79 ( 0.00%) 1.04 ( 91.87%) Mean 5 13.08 ( 0.00%) 2.42 ( 81.50%) Mean 6 23.21 ( 0.00%) 69.46 (-199.27%) Mean 7 15.85 ( 0.00%) 101.72 (-541.77%) Mean 8 109.37 ( 0.00%) 19.13 ( 82.51%) Mean 12 124.84 ( 0.00%) 28.62 ( 77.07%) Mean 16 113.50 ( 0.00%) 24.16 ( 78.71%) It's eliminated for one machine and reduced for another. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Alex Shi Cc: Andrew Morton Cc: Fengguang Wu Cc: H Peter Anvin Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20131217092124.GV11295@suse.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 19af58f..a88f4a4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4902,6 +4902,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_asym); static void update_top_cache_domain(int cpu) { struct sched_domain *sd; + struct sched_domain *busy_sd = NULL; int id = cpu; int size = 1; @@ -4909,9 +4910,9 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - sd = sd->parent; /* sd_busy */ + busy_sd = sd->parent; /* sd_busy */ } - rcu_assign_pointer(per_cpu(sd_busy, cpu), sd); + rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; -- cgit v1.1 From 757dfcaa41844595964f1220f1d33182dae49976 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 27 Nov 2013 19:59:13 +0400 Subject: sched/rt: Fix rq's cpupri leak while enqueue/dequeue child RT entities This patch touches the RT group scheduling case. Functions inc_rt_prio_smp() and dec_rt_prio_smp() change (global) rq's priority, while rt_rq passed to them may be not the top-level rt_rq. This is wrong, because changing of priority on a child level does not guarantee that the priority is the highest all over the rq. So, this leak makes RT balancing unusable. The short example: the task having the highest priority among all rq's RT tasks (no one other task has the same priority) are waking on a throttle rt_rq. The rq's cpupri is set to the task's priority equivalent, but real rq->rt.highest_prio.curr is less. The patch below fixes the problem. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra CC: Steven Rostedt CC: stable@vger.kernel.org Link: http://lkml.kernel.org/r/49231385567953@web4m.yandex.ru Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7d57275..1c40655 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -901,6 +901,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Change rq's cpupri only if rt_rq is the top queue. + */ + if (&rq->rt != rt_rq) + return; +#endif if (rq->online && prio < prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, prio); } @@ -910,6 +917,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Change rq's cpupri only if rt_rq is the top queue. + */ + if (&rq->rt != rt_rq) + return; +#endif if (rq->online && rt_rq->highest_prio.curr != prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } -- cgit v1.1 From c97102ba96324da330078ad8619ba4dfe840dbe3 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 18 Dec 2013 17:08:31 -0800 Subject: kexec: migrate to reboot cpu Commit 1b3a5d02ee07 ("reboot: move arch/x86 reboot= handling to generic kernel") moved reboot= handling to generic code. In the process it also removed the code in native_machine_shutdown() which are moving reboot process to reboot_cpu/cpu0. I guess that thought must have been that all reboot paths are calling migrate_to_reboot_cpu(), so we don't need this special handling. But kexec reboot path (kernel_kexec()) is not calling migrate_to_reboot_cpu() so above change broke kexec. Now reboot can happen on non-boot cpu and when INIT is sent in second kerneo to bring up BP, it brings down the machine. So start calling migrate_to_reboot_cpu() in kexec reboot path to avoid this problem. Bisected by WANG Chao. Reported-by: Matthew Whitehead Reported-by: Dave Young Signed-off-by: Vivek Goyal Tested-by: Baoquan He Tested-by: WANG Chao Acked-by: H. Peter Anvin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 1 + kernel/reboot.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index d0d8fca..9c97016 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1680,6 +1680,7 @@ int kernel_kexec(void) { kexec_in_progress = true; kernel_restart_prepare(NULL); + migrate_to_reboot_cpu(); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); } diff --git a/kernel/reboot.c b/kernel/reboot.c index f813b34..662c83f 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,7 +104,7 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); -static void migrate_to_reboot_cpu(void) +void migrate_to_reboot_cpu(void) { /* The boot cpu is always logical cpu 0 */ int cpu = reboot_cpu; -- cgit v1.1 From 3c67f474558748b604e247d92b55dfe89654c81d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 18 Dec 2013 17:08:40 -0800 Subject: sched: numa: skip inaccessible VMAs Inaccessible VMA should not be trapping NUMA hint faults. Skip them. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/fair.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9030da7..c7395d9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1738,6 +1738,13 @@ void task_numa_work(struct callback_head *work) (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) continue; + /* + * Skip inaccessible VMAs to avoid any confusion between + * PROT_NONE and NUMA hinting ptes + */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + continue; + do { start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); -- cgit v1.1 From 20841405940e7be0617612d521e206e4b6b325db Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 18 Dec 2013 17:08:44 -0800 Subject: mm: fix TLB flush race between migration, and change_protection_range There are a few subtle races, between change_protection_range (used by mprotect and change_prot_numa) on one side, and NUMA page migration and compaction on the other side. The basic race is that there is a time window between when the PTE gets made non-present (PROT_NONE or NUMA), and the TLB is flushed. During that time, a CPU may continue writing to the page. This is fine most of the time, however compaction or the NUMA migration code may come in, and migrate the page away. When that happens, the CPU may continue writing, through the cached translation, to what is no longer the current memory location of the process. This only affects x86, which has a somewhat optimistic pte_accessible. All other architectures appear to be safe, and will either always flush, or flush whenever there is a valid mapping, even with no permissions (SPARC). The basic race looks like this: CPU A CPU B CPU C load TLB entry make entry PTE/PMD_NUMA fault on entry read/write old page start migrating page change PTE/PMD to new page read/write old page [*] flush TLB reload TLB from new entry read/write new page lose data [*] the old page may belong to a new user at this point! The obvious fix is to flush remote TLB entries, by making sure that pte_accessible aware of the fact that PROT_NONE and PROT_NUMA memory may still be accessible if there is a TLB flush pending for the mm. This should fix both NUMA migration and compaction. [mgorman@suse.de: fix build] Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Alex Thorlton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 728d5be..5721f0e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -537,6 +537,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) spin_lock_init(&mm->page_table_lock); mm_init_aio(mm); mm_init_owner(mm, p); + clear_tlb_flush_pending(mm); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; -- cgit v1.1 From 85fbd722ad0f5d64d1ad15888cd1eb2188bfb557 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 18 Dec 2013 07:07:32 -0500 Subject: libata, freezer: avoid block device removal while system is frozen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Freezable kthreads and workqueues are fundamentally problematic in that they effectively introduce a big kernel lock widely used in the kernel and have already been the culprit of several deadlock scenarios. This is the latest occurrence. During resume, libata rescans all the ports and revalidates all pre-existing devices. If it determines that a device has gone missing, the device is removed from the system which involves invalidating block device and flushing bdi while holding driver core layer locks. Unfortunately, this can race with the rest of device resume. Because freezable kthreads and workqueues are thawed after device resume is complete and block device removal depends on freezable workqueues and kthreads (e.g. bdi_wq, jbd2) to make progress, this can lead to deadlock - block device removal can't proceed because kthreads are frozen and kthreads can't be thawed because device resume is blocked behind block device removal. 839a8e8660b6 ("writeback: replace custom worker pool implementation with unbound workqueue") made this particular deadlock scenario more visible but the underlying problem has always been there - the original forker task and jbd2 are freezable too. In fact, this is highly likely just one of many possible deadlock scenarios given that freezer behaves as a big kernel lock and we don't have any debug mechanism around it. I believe the right thing to do is getting rid of freezable kthreads and workqueues. This is something fundamentally broken. For now, implement a funny workaround in libata - just avoid doing block device hot[un]plug while the system is frozen. Kernel engineering at its finest. :( v2: Add EXPORT_SYMBOL_GPL(pm_freezing) for cases where libata is built as a module. v3: Comment updated and polling interval changed to 10ms as suggested by Rafael. v4: Add #ifdef CONFIG_FREEZER around the hack as pm_freezing is not defined when FREEZER is not configured thus breaking build. Reported by kbuild test robot. Signed-off-by: Tejun Heo Reported-by: Tomaž Šolc Reviewed-by: "Rafael J. Wysocki" Link: https://bugzilla.kernel.org/show_bug.cgi?id=62801 Link: http://lkml.kernel.org/r/20131213174932.GA27070@htj.dyndns.org Cc: Greg Kroah-Hartman Cc: Len Brown Cc: Oleg Nesterov Cc: stable@vger.kernel.org Cc: kbuild test robot --- kernel/freezer.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index b462fa1..aa6a8aa 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -19,6 +19,12 @@ EXPORT_SYMBOL(system_freezing_cnt); bool pm_freezing; bool pm_nosig_freezing; +/* + * Temporary export for the deadlock workaround in ata_scsi_hotplug(). + * Remove once the hack becomes unnecessary. + */ +EXPORT_SYMBOL_GPL(pm_freezing); + /* protects freezing and frozen transitions */ static DEFINE_SPINLOCK(freezer_lock); -- cgit v1.1 From 597d795a2a786d22dd872332428e2b9439ede639 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 20 Dec 2013 13:35:58 +0200 Subject: mm: do not allocate page->ptl dynamically, if spinlock_t fits to long In struct page we have enough space to fit long-size page->ptl there, but we use dynamically-allocated page->ptl if size(spinlock_t) is larger than sizeof(int). It hurts 64-bit architectures with CONFIG_GENERIC_LOCKBREAK, where sizeof(spinlock_t) == 8, but it easily fits into struct page. Signed-off-by: Kirill A. Shutemov Acked-by: Hugh Dickins Signed-off-by: Linus Torvalds --- kernel/bounds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bounds.c b/kernel/bounds.c index 5253204..9fd4246 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -22,6 +22,6 @@ void foo(void) #ifdef CONFIG_SMP DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif - DEFINE(BLOATED_SPINLOCKS, sizeof(spinlock_t) > sizeof(int)); + DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); /* End of constants */ } -- cgit v1.1 From c606850407d9096415e226c75a871d0650404446 Mon Sep 17 00:00:00 2001 From: Masami Ichikawa Date: Thu, 19 Dec 2013 20:00:47 +0900 Subject: PM / sleep: Fix memory leak in pm_vt_switch_unregister(). kmemleak reported a memory leak as below. unreferenced object 0xffff880118f14700 (size 32): comm "swapper/0", pid 1, jiffies 4294877401 (age 123.283s) hex dump (first 32 bytes): 00 01 10 00 00 00 ad de 00 02 20 00 00 00 ad de .......... ..... 00 d4 d2 18 01 88 ff ff 01 00 00 00 00 04 00 00 ................ backtrace: [] kmemleak_alloc+0x4e/0xb0 [] kmem_cache_alloc_trace+0x1ec/0x260 [] pm_vt_switch_required+0x76/0xb0 [] register_framebuffer+0x195/0x320 [] efifb_probe+0x718/0x780 [] platform_drv_probe+0x45/0xb0 [] driver_probe_device+0x87/0x3a0 [] __driver_attach+0x93/0xa0 [] bus_for_each_dev+0x63/0xa0 [] driver_attach+0x1e/0x20 [] bus_add_driver+0x180/0x250 [] driver_register+0x64/0xf0 [] __platform_driver_register+0x4a/0x50 [] efifb_driver_init+0x12/0x14 [] do_one_initcall+0xfa/0x1b0 [] kernel_init_freeable+0x17b/0x201 In pm_vt_switch_required(), "entry" variable is allocated via kmalloc(). So, in pm_vt_switch_unregister(), it needs to call kfree() when object is deleted from list. Signed-off-by: Masami Ichikawa Reviewed-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/console.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/console.c b/kernel/power/console.c index 463aa673..eacb8bd 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -81,6 +81,7 @@ void pm_vt_switch_unregister(struct device *dev) list_for_each_entry(tmp, &pm_vt_switch_list, head) { if (tmp->dev == dev) { list_del(&tmp->head); + kfree(tmp); break; } } -- cgit v1.1