From 6e48b550d1f5f1919e6500547ae14a73fbf66c7b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 13 Apr 2012 09:52:59 +0100 Subject: tracing: Fix build breakage without CONFIG_PERF_EVENTS (again) Today's -next fails to link for me: kernel/built-in.o:(.data+0x178e50): undefined reference to `perf_ftrace_event_register' It looks like multiple fixes have been merged for the issue fixed by commit fa73dc9 (tracing: Fix build breakage without CONFIG_PERF_EVENTS) though I can't identify the other changes that have gone in at the minute, it's possible that the changes which caused the breakage fixed by the previous commit got dropped but the fix made it in. Link: http://lkml.kernel.org/r/1334307179-21255-1-git-send-email-broonie@opensource.wolfsonmicro.com Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Mark Brown Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 95059f0..f95d65d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -836,11 +836,11 @@ extern const char *__stop___trace_bprintk_fmt[]; filter) #include "trace_entries.h" -#ifdef CONFIG_FUNCTION_TRACER +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) int perf_ftrace_event_register(struct ftrace_event_call *call, enum trace_reg type, void *data); #else #define perf_ftrace_event_register NULL -#endif /* CONFIG_FUNCTION_TRACER */ +#endif #endif /* _LINUX_KERNEL_TRACE_H */ -- cgit v1.1 From 348f0fc238efb441a28e7644c51f9fd3001b228a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Apr 2012 15:41:28 -0400 Subject: tracing: Fix regression with tracing_on The change to make tracing_on affect only the ftrace ring buffer, caused a bug where it wont affect any ring buffer. The problem was that the buffer of the trace_array was passed to the write function and not the trace array itself. The trace_array can change the buffer when running a latency tracer. If this happens, then the buffer being disabled may not be the buffer currently used by ftrace. This will cause the tracing_on file to become useless. The simple fix is to pass the trace_array to the write function instead of the buffer. Then the actual buffer may be changed. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ed7b5d1..2a22255 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4629,7 +4629,8 @@ static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ring_buffer *buffer = filp->private_data; + struct trace_array *tr = filp->private_data; + struct ring_buffer *buffer = tr->buffer; char buf[64]; int r; @@ -4647,7 +4648,8 @@ static ssize_t rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ring_buffer *buffer = filp->private_data; + struct trace_array *tr = filp->private_data; + struct ring_buffer *buffer = tr->buffer; unsigned long val; int ret; @@ -4734,7 +4736,7 @@ static __init int tracer_init_debugfs(void) &trace_clock_fops); trace_create_file("tracing_on", 0644, d_tracer, - global_trace.buffer, &rb_simple_fops); + &global_trace, &rb_simple_fops); #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, -- cgit v1.1 From 92c38702e98e58438c3760ebb279c40bbca8bd5f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Apr 2012 12:12:09 -0700 Subject: rcu: Permit call_rcu() from CPU_DYING notifiers As of: 29494be71afe ("rcu,cleanup: simplify the code when cpu is dying") RCU adopts callbacks from the dying CPU in its CPU_DYING notifier, which means that any callbacks posted by later CPU_DYING notifiers are ignored until the CPU comes back online. A WARN_ON_ONCE() was added to __call_rcu() by: e56014000816 ("rcu: Simplify offline processing") to check for this condition. Although this condition did not trigger (at least as far as I know) during -next testing, it did recently trigger in mainline: https://lkml.org/lkml/2012/4/2/34 What is needed longer term is for RCU's CPU_DEAD notifier to adopt any callbacks that were posted by CPU_DYING notifiers, however, the Linux kernel has been running with this sort of thing happening for quite some time. So the only thing that qualifies as a regression is the WARN_ON_ONCE(), which this commit removes. Making RCU's CPU_DEAD notifier adopt callbacks posted by CPU_DYING notifiers is a topic for the 3.5 release of the Linux kernel. Reported-by: Sergey Senozhatsky Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1050d6d..d0c5baf 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1820,7 +1820,6 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * a quiescent state betweentimes. */ local_irq_save(flags); - WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ -- cgit v1.1 From b435092f70ec5ebbfb6d075d5bf3c631b49a51de Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Apr 2012 12:08:23 +0200 Subject: tick: Fix oneshot broadcast setup really Sven Joachim reported, that suspend/resume on rc3 trips over a NULL pointer dereference. Linus spotted the clockevent handler being NULL. commit fa4da365b(clockevents: tTack broadcast device mode change in tick_broadcast_switch_to_oneshot()) tried to fix a problem with the broadcast device setup, which was introduced in commit 77b0d60c5( clockevents: Leave the broadcast device in shutdown mode when not needed). The initial commit avoided to set up the broadcast device when no broadcast request bits were set, but that left the broadcast device disfunctional. In consequence deep idle states which need the broadcast device were not woken up. commit fa4da365b tried to fix that by initializing the state of the broadcast facility, but that missed the fact, that nothing initializes the event handler and some other state of the underlying clock event device. The fix is to revert both commits and make only the mode setting of the clock event device conditional on the state of active broadcast users. That initializes everything except the low level device mode, but this happens when the broadcast functionality is invoked by deep idle. Reported-and-tested-by: Sven Joachim Signed-off-by: Thomas Gleixner Cc: Rafael J. Wysocki Cc: Linus Torvalds Cc: Suresh Siddha Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1204181205540.2542@ionos --- kernel/time/tick-broadcast.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index bf57abd..119aca5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -531,7 +531,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); /* Take the do_timer update */ tick_do_timer_cpu = cpu; @@ -549,6 +548,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) to_cpumask(tmpmask)); if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); tick_broadcast_init_next_event(to_cpumask(tmpmask), tick_next_period); tick_broadcast_set_event(tick_next_period, 1); @@ -577,15 +577,10 @@ void tick_broadcast_switch_to_oneshot(void) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; - - if (cpumask_empty(tick_get_broadcast_mask())) - goto end; - bc = tick_broadcast_device.evtdev; if (bc) tick_broadcast_setup_oneshot(bc); -end: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -- cgit v1.1 From b9a6a23566960d0dd3f51e2e68b472cd61911078 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Apr 2012 17:31:58 +0200 Subject: tick: Ensure that the broadcast device is initialized Santosh found another trap when we avoid to initialize the broadcast device in the switch_to_oneshot code. The broadcast device might be still in SHUTDOWN state when we actually need to use it. That obviously breaks, as set_next_event() is called on a shutdown device. This did not break on x86, but Suresh analyzed it: From the review, most likely on Sven's system we are force enabling the hpet using the pci quirk's method very late. And in this case, hpet_clockevent (which will be global_clock_event) handler can be null, specifically as this platform might not be using deeper c-states and using the reliable APIC timer. Prior to commit 'fa4da365bc7772c', that handler will be set to 'tick_handle_oneshot_broadcast' when we switch the broadcast timer to oneshot mode, even though we don't use it. Post commit 'fa4da365bc7772c', we stopped switching the broadcast mode to oneshot as this is not really needed and his platform's global_clock_event's handler will remain null. While on my SNB laptop, same is set to 'clockevents_handle_noop' because hpet gets enabled very early. (noop handler on my platform set when the early enabled hpet timer gets replaced by the lapic timer). But the commit 'fa4da365bc7772c' tracked the broadcast timer mode in the SW as oneshot, even though it didn't touch the HW timer. During resume however, tick_resume_broadcast() saw the SW broadcast mode as oneshot and actually programmed the broadcast device also into oneshot mode. So this triggered the null pointer de-reference after the hpet wraps around and depending on what the hpet counter is set to. On the normal platforms where hpet gets enabled early we should be seeing a spurious interrupt (in my SNB laptop I see one spurious interrupt after around 5 minutes ;) which is 32-bit hpet counter wraparound time), but that's a separate issue. Enforce the mode setting when trying to set an event. Reported-and-tested-by: Santosh Shilimkar Signed-off-by: Thomas Gleixner Acked-by: Suresh Siddha Cc: torvalds@linux-foundation.org Cc: svenjoac@gmx.de Cc: rjw@sisk.pl Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1204181723350.2542@ionos --- kernel/time/tick-broadcast.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 119aca5..029531f 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -373,6 +373,9 @@ static int tick_broadcast_set_event(ktime_t expires, int force) { struct clock_event_device *bc = tick_broadcast_device.evtdev; + if (bc->mode != CLOCK_EVT_MODE_ONESHOT) + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + return clockevents_program_event(bc, expires, force); } -- cgit v1.1 From a6371f80230eaaafd7eef7efeedaa9509bdc982d Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 18 Apr 2012 19:27:39 -0700 Subject: tick: Fix the spurious broadcast timer ticks after resume During resume, tick_resume_broadcast() programs the broadcast timer in oneshot mode unconditionally. On the platforms where broadcast timer is not really required, this will generate spurious broadcast timer ticks upon resume. For example, on the always running apic timer platforms with HPET, I see spurious hpet tick once every ~5minutes (which is the 32-bit hpet counter wraparound time). Similar to boot time, during resume make the oneshot mode setting of the broadcast clock event device conditional on the state of active broadcast users. Signed-off-by: Suresh Siddha Tested-by: Santosh Shilimkar Tested-by: svenjoac@gmx.de Cc: torvalds@linux-foundation.org Cc: rjw@sisk.pl Link: http://lkml.kernel.org/r/1334802459.28674.209.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 029531f..f113755 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -346,7 +346,8 @@ int tick_resume_broadcast(void) tick_get_broadcast_mask()); break; case TICKDEV_MODE_ONESHOT: - broadcast = tick_resume_broadcast_oneshot(bc); + if (!cpumask_empty(tick_get_broadcast_mask())) + broadcast = tick_resume_broadcast_oneshot(bc); break; } } -- cgit v1.1 From db4c75cbebd7e5910cd3bcb6790272fcc3042857 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Apr 2012 10:31:47 -0400 Subject: tracing: Fix stacktrace of latency tracers (irqsoff and friends) While debugging a latency with someone on IRC (mirage335) on #linux-rt (OFTC), we discovered that the stacktrace output of the latency tracers (preemptirqsoff) was empty. This bug was caused by the creation of the dynamic length stack trace again (like commit 12b5da3 "tracing: Fix ent_size in trace output" was). This bug is caused by the latency tracers requiring the next event to determine the time between the current event and the next. But by grabbing the next event, the iter->ent_size is set to the next event instead of the current one. As the stacktrace event is the last event, this makes the ent_size zero and causes nothing to be printed for the stack trace. The dynamic stacktrace uses the ent_size to determine how much of the stack can be printed. The ent_size of zero means no stack. The simple fix is to save the iter->ent_size before finding the next event. Note, mirage335 asked to remain anonymous from LKML and git, so I will not add the Reported-by and Tested-by tags, even though he did report the issue and tested the fix. Cc: stable@vger.kernel.org # 3.1+ Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 859fae6b..df611a0 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -652,6 +652,8 @@ int trace_print_lat_context(struct trace_iterator *iter) { u64 next_ts; int ret; + /* trace_find_next_entry will reset ent_size */ + int ent_size = iter->ent_size; struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent, *next_entry = trace_find_next_entry(iter, NULL, @@ -660,6 +662,9 @@ int trace_print_lat_context(struct trace_iterator *iter) unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); unsigned long rel_usecs; + /* Restore the original ent_size */ + iter->ent_size = ent_size; + if (!next_entry) next_ts = iter->ts; rel_usecs = ns2usecs(next_ts - iter->ts); -- cgit v1.1 From 9f3045eca89a2e6fdd1901aafb9e28231d3f31fb Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 18 Apr 2012 16:29:57 -0400 Subject: irq: hide debug macros so they don't collide with others. The file kernel/irq/debug.h temporarily defines P, PS, PD and then undefines them. However these names aren't really "internal" enough, and collide with other more legit users such as the ones in the xtensa arch, causing: In file included from kernel/irq/internals.h:58:0, from kernel/irq/irqdesc.c:18: kernel/irq/debug.h:8:0: warning: "PS" redefined [enabled by default] arch/xtensa/include/asm/regs.h:59:0: note: this is the location of the previous definition Add a handful of underscores to do a better job of hiding these temporary macros. Cc: Thomas Gleixner Signed-off-by: Paul Gortmaker --- kernel/irq/debug.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index 97a8bfa..e75e29e 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -4,10 +4,10 @@ #include -#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) -#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) +#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) +#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f) /* FIXME */ -#define PD(f) do { } while (0) +#define ___PD(f) do { } while (0) static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { @@ -23,23 +23,23 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) print_symbol("%s\n", (unsigned long)desc->action->handler); } - P(IRQ_LEVEL); - P(IRQ_PER_CPU); - P(IRQ_NOPROBE); - P(IRQ_NOREQUEST); - P(IRQ_NOTHREAD); - P(IRQ_NOAUTOEN); + ___P(IRQ_LEVEL); + ___P(IRQ_PER_CPU); + ___P(IRQ_NOPROBE); + ___P(IRQ_NOREQUEST); + ___P(IRQ_NOTHREAD); + ___P(IRQ_NOAUTOEN); - PS(IRQS_AUTODETECT); - PS(IRQS_REPLAY); - PS(IRQS_WAITING); - PS(IRQS_PENDING); + ___PS(IRQS_AUTODETECT); + ___PS(IRQS_REPLAY); + ___PS(IRQS_WAITING); + ___PS(IRQS_PENDING); - PD(IRQS_INPROGRESS); - PD(IRQS_DISABLED); - PD(IRQS_MASKED); + ___PD(IRQS_INPROGRESS); + ___PD(IRQS_DISABLED); + ___PD(IRQS_MASKED); } -#undef P -#undef PS -#undef PD +#undef ___P +#undef ___PS +#undef ___PD -- cgit v1.1 From f8262d476823a7ea1eb497ff9676d1eab2393c75 Mon Sep 17 00:00:00 2001 From: Bojan Smojver Date: Tue, 24 Apr 2012 23:53:28 +0200 Subject: PM / Hibernate: fix the number of pages used for hibernate/thaw buffering Hibernation regression fix, since 3.2. Calculate the number of required free pages based on non-high memory pages only, because that is where the buffers will come from. Commit 081a9d043c983f161b78fdc4671324d1342b86bc introduced a new buffer page allocation logic during hibernation, in order to improve the performance. The amount of pages allocated was calculated based on total amount of pages available, although only non-high memory pages are usable for this purpose. This caused hibernation code to attempt to over allocate pages on platforms that have high memory, which led to hangs. Signed-off-by: Bojan Smojver Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8742fd0..eef311a 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -51,6 +51,23 @@ #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) +/* + * Number of free pages that are not high. + */ +static inline unsigned long low_free_pages(void) +{ + return nr_free_pages() - nr_free_highpages(); +} + +/* + * Number of pages required to be kept free while writing the image. Always + * half of all available low pages before the writing starts. + */ +static inline unsigned long reqd_free_pages(void) +{ + return low_free_pages() / 2; +} + struct swap_map_page { sector_t entries[MAP_PAGE_ENTRIES]; sector_t next_swap; @@ -72,7 +89,7 @@ struct swap_map_handle { sector_t cur_swap; sector_t first_sector; unsigned int k; - unsigned long nr_free_pages, written; + unsigned long reqd_free_pages; u32 crc32; }; @@ -316,8 +333,7 @@ static int get_swap_writer(struct swap_map_handle *handle) goto err_rel; } handle->k = 0; - handle->nr_free_pages = nr_free_pages() >> 1; - handle->written = 0; + handle->reqd_free_pages = reqd_free_pages(); handle->first_sector = handle->cur_swap; return 0; err_rel: @@ -352,11 +368,11 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, handle->cur_swap = offset; handle->k = 0; } - if (bio_chain && ++handle->written > handle->nr_free_pages) { + if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { error = hib_wait_on_bio_chain(bio_chain); if (error) goto out; - handle->written = 0; + handle->reqd_free_pages = reqd_free_pages(); } out: return error; @@ -618,7 +634,7 @@ static int save_image_lzo(struct swap_map_handle *handle, * Adjust number of free pages after all allocations have been done. * We don't want to run out of pages when writing. */ - handle->nr_free_pages = nr_free_pages() >> 1; + handle->reqd_free_pages = reqd_free_pages(); /* * Start the CRC32 thread. -- cgit v1.1 From eb95308ee2a69403909e111837b9068c64cfc349 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Apr 2012 13:38:40 +0200 Subject: sched: Fix more load-balancing fallout Commits 367456c756a6 ("sched: Ditch per cgroup task lists for load-balancing") and 5d6523ebd ("sched: Fix load-balance wreckage") left some more wreckage. By setting loop_max unconditionally to ->nr_running load-balancing could take a lot of time on very long runqueues (hackbench!). So keep the sysctl as max limit of the amount of tasks we'll iterate. Furthermore, the min load filter for migration completely fails with cgroups since inequality in per-cpu state can easily lead to such small loads :/ Furthermore the change to add new tasks to the tail of the queue instead of the head seems to have some effect.. not quite sure I understand why. Combined these fixes solve the huge hackbench regression reported by Tim when hackbench is ran in a cgroup. Reported-by: Tim Chen Acked-by: Tim Chen Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1335365763.28150.267.camel@twins [ got rid of the CONFIG_PREEMPT tuning and made small readability edits ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 ++++++++++-------- kernel/sched/features.h | 1 + 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0d97ebd..e955364 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) - list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); + list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); #endif cfs_rq->nr_running++; } @@ -3215,6 +3215,8 @@ static int move_one_task(struct lb_env *env) static unsigned long task_h_load(struct task_struct *p); +static const unsigned int sched_nr_migrate_break = 32; + /* * move_tasks tries to move up to load_move weighted load from busiest to * this_rq, as part of a balancing operation within domain "sd". @@ -3242,7 +3244,7 @@ static int move_tasks(struct lb_env *env) /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { - env->loop_break += sysctl_sched_nr_migrate; + env->loop_break += sched_nr_migrate_break; env->flags |= LBF_NEED_BREAK; break; } @@ -3252,7 +3254,7 @@ static int move_tasks(struct lb_env *env) load = task_h_load(p); - if (load < 16 && !env->sd->nr_balance_failed) + if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) goto next; if ((load / 2) > env->load_move) @@ -4407,7 +4409,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .dst_cpu = this_cpu, .dst_rq = this_rq, .idle = idle, - .loop_break = sysctl_sched_nr_migrate, + .loop_break = sched_nr_migrate_break, }; cpumask_copy(cpus, cpu_active_mask); @@ -4445,10 +4447,10 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.load_move = imbalance; - env.src_cpu = busiest->cpu; - env.src_rq = busiest; - env.loop_max = busiest->nr_running; + env.load_move = imbalance; + env.src_cpu = busiest->cpu; + env.src_rq = busiest; + env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running); more_balance: local_irq_save(flags); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e61fd73..de00a48 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -68,3 +68,4 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(LB_MIN, false) -- cgit v1.1 From fb2cf2c660971bea0ad86a9a5c19ad39eab61344 Mon Sep 17 00:00:00 2001 From: "he, bo" Date: Wed, 25 Apr 2012 19:59:21 +0800 Subject: sched: Fix OOPS when build_sched_domains() percpu allocation fails Under extreme memory used up situations, percpu allocation might fail. We hit it when system goes to suspend-to-ram, causing a kworker panic: EIP: [] build_sched_domains+0x23a/0xad0 Kernel panic - not syncing: Fatal exception Pid: 3026, comm: kworker/u:3 3.0.8-137473-gf42fbef #1 Call Trace: [] panic+0x66/0x16c [...] [] partition_sched_domains+0x287/0x4b0 [] cpuset_update_active_cpus+0x1fe/0x210 [] cpuset_cpu_inactive+0x1d/0x30 [...] With this fix applied build_sched_domains() will return -ENOMEM and the suspend attempt fails. Signed-off-by: he, bo Reviewed-by: Zhang, Yanmin Reviewed-by: Srivatsa S. Bhat Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Link: http://lkml.kernel.org/r/1335355161.5892.17.camel@hebo [ So, we fail to deallocate a CPU because we cannot allocate RAM :-/ I don't like that kind of sad behavior but nevertheless it should not crash under high memory load. ] Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4603b9d..0533a68 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6405,16 +6405,26 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sd_data *sdd = &tl->data; for_each_cpu(j, cpu_map) { - struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); - if (sd && (sd->flags & SD_OVERLAP)) - free_sched_groups(sd->groups, 0); - kfree(*per_cpu_ptr(sdd->sd, j)); - kfree(*per_cpu_ptr(sdd->sg, j)); - kfree(*per_cpu_ptr(sdd->sgp, j)); + struct sched_domain *sd; + + if (sdd->sd) { + sd = *per_cpu_ptr(sdd->sd, j); + if (sd && (sd->flags & SD_OVERLAP)) + free_sched_groups(sd->groups, 0); + kfree(*per_cpu_ptr(sdd->sd, j)); + } + + if (sdd->sg) + kfree(*per_cpu_ptr(sdd->sg, j)); + if (sdd->sgp) + kfree(*per_cpu_ptr(sdd->sgp, j)); } free_percpu(sdd->sd); + sdd->sd = NULL; free_percpu(sdd->sg); + sdd->sg = NULL; free_percpu(sdd->sgp); + sdd->sgp = NULL; } } -- cgit v1.1 From 724b6daa13e100067c30cfc4d1ad06629609dc4e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 11 Apr 2012 11:54:13 +1000 Subject: perf: Fix perf_event_for_each() to use sibling In perf_event_for_each() we call a function on an event, and then iterate over the siblings of the event. However we don't call the function on the siblings, we call it repeatedly on the original event - it seems "obvious" that we should be calling it with sibling as the argument. It looks like this broke in commit 75f937f24bd9 ("Fix ctx->mutex vs counter->mutex inversion"). The only effect of the bug is that the PERF_IOC_FLAG_GROUP parameter to the ioctls doesn't work. Signed-off-by: Michael Ellerman Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1334109253-31329-1-git-send-email-michael@ellerman.id.au Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a6a9ec4..fd126f8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3183,7 +3183,7 @@ static void perf_event_for_each(struct perf_event *event, perf_event_for_each_child(event, func); func(event); list_for_each_entry(sibling, &event->sibling_list, group_entry) - perf_event_for_each_child(event, func); + perf_event_for_each_child(sibling, func); mutex_unlock(&ctx->mutex); } -- cgit v1.1 From b7dafa0ef3145c31d7753be0a08b3cbda51f0209 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 10 May 2012 10:04:36 -0300 Subject: compat: Fix RT signal mask corruption via sigprocmask compat_sys_sigprocmask reads a smaller signal mask from userspace than sigprogmask accepts for setting. So the high word of blocked.sig[0] will be cleared, releasing any potentially blocked RT signal. This was discovered via userspace code that relies on get/setcontext. glibc's i386 versions of those functions use sigprogmask instead of rt_sigprogmask to save/restore signal mask and caused RT signal unblocking this way. As suggested by Linus, this replaces the sys_sigprocmask based compat version with one that open-codes the required logic, including the merge of the existing blocked set with the new one provided on SIG_SETMASK. Signed-off-by: Jan Kiszka Signed-off-by: Linus Torvalds --- kernel/compat.c | 63 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 74ff849..d2c67aa 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -372,25 +372,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) #ifdef __ARCH_WANT_SYS_SIGPROCMASK -asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, - compat_old_sigset_t __user *oset) +/* + * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the + * blocked set of signals to the supplied signal set + */ +static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set) { - old_sigset_t s; - long ret; - mm_segment_t old_fs; + memcpy(blocked->sig, &set, sizeof(set)); +} - if (set && get_user(s, set)) - return -EFAULT; - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_sigprocmask(how, - set ? (old_sigset_t __user *) &s : NULL, - oset ? (old_sigset_t __user *) &s : NULL); - set_fs(old_fs); - if (ret == 0) - if (oset) - ret = put_user(s, oset); - return ret; +asmlinkage long compat_sys_sigprocmask(int how, + compat_old_sigset_t __user *nset, + compat_old_sigset_t __user *oset) +{ + old_sigset_t old_set, new_set; + sigset_t new_blocked; + + old_set = current->blocked.sig[0]; + + if (nset) { + if (get_user(new_set, nset)) + return -EFAULT; + new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); + + new_blocked = current->blocked; + + switch (how) { + case SIG_BLOCK: + sigaddsetmask(&new_blocked, new_set); + break; + case SIG_UNBLOCK: + sigdelsetmask(&new_blocked, new_set); + break; + case SIG_SETMASK: + compat_sig_setmask(&new_blocked, new_set); + break; + default: + return -EINVAL; + } + + set_current_blocked(&new_blocked); + } + + if (oset) { + if (put_user(old_set, oset)) + return -EFAULT; + } + + return 0; } #endif -- cgit v1.1 From 5e2bf0142231194d36fdc9596b36a261ed2b9fe7 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 10 May 2012 13:01:45 -0700 Subject: namespaces, pid_ns: fix leakage on fork() failure Fork() failure post namespace creation for a child cloned with CLONE_NEWPID leaks pid_namespace/mnt_cache due to proc being mounted during creation, but not unmounted during cleanup. Call pid_ns_release_proc() during cleanup. Signed-off-by: Mike Galbraith Acked-by: Oleg Nesterov Reviewed-by: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Cyrill Gorcunov Cc: Louis Rilling Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b9372a0..687a15d5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1464,6 +1465,8 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: + if (unlikely(clone_flags & CLONE_NEWPID)) + pid_ns_release_proc(p->nsproxy->pid_ns); exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) -- cgit v1.1