From fe9161db2e6053da21e4649d77bbefaf3030b11d Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 1 Feb 2012 22:16:36 +0100 Subject: PM / Hibernate: Thaw kernel threads in SNAPSHOT_CREATE_IMAGE ioctl path In the SNAPSHOT_CREATE_IMAGE ioctl, if the call to hibernation_snapshot() fails, the frozen tasks are not thawed. And in the case of success, if we happen to exit due to a successful freezer test, all tasks (including those of userspace) are thawed, whereas actually we should have thawed only the kernel threads at that point. Fix both these issues. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki Cc: stable@vger.kernel.org --- kernel/power/user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index e5a21a8..3e10007 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -249,13 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (!error) { + if (error) { + thaw_kernel_threads(); + } else { error = put_user(in_suspend, (int __user *)arg); if (!error && !freezer_test_done) data->ready = 1; if (freezer_test_done) { freezer_test_done = false; - thaw_processes(); + thaw_kernel_threads(); } } break; -- cgit v1.1 From 55ca6140e9bb307efc97a9301a4f501de02a6fd6 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 3 Feb 2012 15:37:16 -0800 Subject: kprobes: fix a memory leak in function pre_handler_kretprobe() In function pre_handler_kretprobe(), the allocated kretprobe_instance object will get leaked if the entry_handler callback returns non-zero. This may cause all the preallocated kretprobe_instance objects exhausted. This issue can be reproduced by changing samples/kprobes/kretprobe_example.c to probe "mutex_unlock". And the fix is straightforward: just put the allocated kretprobe_instance object back onto the free_instances list. [akpm@linux-foundation.org: use raw_spin_lock/unlock] Signed-off-by: Jiang Liu Acked-by: Jim Keniston Acked-by: Ananth N Mavinakayanahalli Cc: Masami Hiramatsu Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 29f5b65..9788c0e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1673,8 +1673,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, ri->rp = rp; ri->task = current; - if (rp->entry_handler && rp->entry_handler(ri, regs)) + if (rp->entry_handler && rp->entry_handler(ri, regs)) { + raw_spin_lock_irqsave(&rp->lock, flags); + hlist_add_head(&ri->hlist, &rp->free_instances); + raw_spin_unlock_irqrestore(&rp->lock, flags); return 0; + } arch_prepare_kretprobe(ri, regs); -- cgit v1.1 From 379e0be812ab8a2a351e784b0c987788f5123090 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Fri, 3 Feb 2012 22:22:41 +0100 Subject: PM / Freezer: Thaw only kernel threads if freezing of kernel threads fails If freezing of kernel threads fails, we are expected to automatically thaw tasks in the error recovery path. However, at times, we encounter situations in which we would like the automatic error recovery path to thaw only the kernel threads, because we want to be able to do some more cleanup before we thaw userspace. Something like: error = freeze_kernel_threads(); if (error) { /* Do some cleanup */ /* Only then thaw userspace tasks*/ thaw_processes(); } An example of such a situation is where we freeze/thaw filesystems during suspend/hibernation. There, if freezing of kernel threads fails, we would like to thaw the frozen filesystems before thawing the userspace tasks. So, modify freeze_kernel_threads() to thaw only kernel threads in case of freezing failure. And change suspend_freeze_processes() accordingly. (At the same time, let us also get rid of the rather cryptic usage of the conditional operator (:?) in that function.) [rjw: In fact, this patch fixes a regression introduced during the 3.3 merge window, because without it thaw_processes() may be called before swsusp_free() in some situations and that may lead to massive memory allocation failures.] Signed-off-by: Srivatsa S. Bhat Acked-by: Tejun Heo Acked-by: Nigel Cunningham Signed-off-by: Rafael J. Wysocki --- kernel/power/power.h | 24 ++++++++++++++++++++++-- kernel/power/process.c | 7 +++++-- 2 files changed, 27 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 0c4defe..21724ee 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -231,8 +231,28 @@ extern int pm_test_level; #ifdef CONFIG_SUSPEND_FREEZER static inline int suspend_freeze_processes(void) { - int error = freeze_processes(); - return error ? : freeze_kernel_threads(); + int error; + + error = freeze_processes(); + + /* + * freeze_processes() automatically thaws every task if freezing + * fails. So we need not do anything extra upon error. + */ + if (error) + goto Finish; + + error = freeze_kernel_threads(); + + /* + * freeze_kernel_threads() thaws only kernel threads upon freezing + * failure. So we have to thaw the userspace tasks ourselves. + */ + if (error) + thaw_processes(); + + Finish: + return error; } static inline void suspend_thaw_processes(void) diff --git a/kernel/power/process.c b/kernel/power/process.c index eeca003..7e42645 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -143,7 +143,10 @@ int freeze_processes(void) /** * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. * - * On success, returns 0. On failure, -errno and system is fully thawed. + * On success, returns 0. On failure, -errno and only the kernel threads are + * thawed, so as to give a chance to the caller to do additional cleanups + * (if any) before thawing the userspace tasks. So, it is the responsibility + * of the caller to thaw the userspace tasks, when the time is right. */ int freeze_kernel_threads(void) { @@ -159,7 +162,7 @@ int freeze_kernel_threads(void) BUG_ON(in_atomic()); if (error) - thaw_processes(); + thaw_kernel_threads(); return error; } -- cgit v1.1 From 11a3122f6cf2d988a77eb8883d0fc49cd013a6d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Feb 2012 07:51:30 +0100 Subject: block: strip out locking optimization in put_io_context() put_io_context() performed a complex trylock dancing to avoid deferring ioc release to workqueue. It was also broken on UP because trylock was always assumed to succeed which resulted in unbalanced preemption count. While there are ways to fix the UP breakage, even the most pathological microbench (forced ioc allocation and tight fork/exit loop) fails to show any appreciable performance benefit of the optimization. Strip it out. If there turns out to be workloads which are affected by this change, simpler optimization from the discussion thread can be applied later. Signed-off-by: Tejun Heo LKML-Reference: <1328514611.21268.66.camel@sli10-conroe> Signed-off-by: Jens Axboe --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 051f090..c574aef 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -890,7 +890,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc, NULL); + put_io_context(new_ioc); } #endif return 0; -- cgit v1.1 From f39d47ff819ed52a2afbdbecbe35f23f7755f58d Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 7 Feb 2012 14:39:57 +0100 Subject: perf: Fix double start/stop in x86_pmu_start() The following patch fixes a bug introduced by the following commit: e050e3f0a71b ("perf: Fix broken interrupt rate throttling") The patch caused the following warning to pop up depending on the sampling frequency adjustments: ------------[ cut here ]------------ WARNING: at arch/x86/kernel/cpu/perf_event.c:995 x86_pmu_start+0x79/0xd4() It was caused by the following call sequence: perf_adjust_freq_unthr_context.part() { stop() if (delta > 0) { perf_adjust_period() { if (period > 8*...) { stop() ... start() } } } start() } Which caused a double start and a double stop, thus triggering the assert in x86_pmu_start(). The patch fixes the problem by avoiding the double calls. We pass a new argument to perf_adjust_period() to indicate whether or not the event is already stopped. We can't just remove the start/stop from that function because it's called from __perf_event_overflow where the event needs to be reloaded via a stop/start back-toback call. The patch reintroduces the assertion in x86_pmu_start() which was removed by commit: 84f2b9b ("perf: Remove deprecated WARN_ON_ONCE()") In this second version, we've added calls to disable/enable PMU during unthrottling or frequency adjustment based on bug report of spurious NMI interrupts from Eric Dumazet. Reported-and-tested-by: Eric Dumazet Signed-off-by: Stephane Eranian Acked-by: Peter Zijlstra Cc: markus@trippelsdorf.de Cc: paulus@samba.org Link: http://lkml.kernel.org/r/20120207133956.GA4932@quad [ Minor edits to the changelog and to the code ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ba36013..1b5c081 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2303,7 +2303,7 @@ do { \ static DEFINE_PER_CPU(int, perf_throttled_count); static DEFINE_PER_CPU(u64, perf_throttled_seq); -static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) { struct hw_perf_event *hwc = &event->hw; s64 period, sample_period; @@ -2322,9 +2322,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { - event->pmu->stop(event, PERF_EF_UPDATE); + if (disable) + event->pmu->stop(event, PERF_EF_UPDATE); + local64_set(&hwc->period_left, 0); - event->pmu->start(event, PERF_EF_RELOAD); + + if (disable) + event->pmu->start(event, PERF_EF_RELOAD); } } @@ -2350,6 +2354,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, return; raw_spin_lock(&ctx->lock); + perf_pmu_disable(ctx->pmu); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) @@ -2381,13 +2386,17 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, /* * restart the event * reload only if value has changed + * we have stopped the event so tell that + * to perf_adjust_period() to avoid stopping it + * twice. */ if (delta > 0) - perf_adjust_period(event, period, delta); + perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); } + perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } @@ -4562,7 +4571,7 @@ static int __perf_event_overflow(struct perf_event *event, hwc->freq_time_stamp = now; if (delta > 0 && delta < 2*TICK_NSEC) - perf_adjust_period(event, delta, hwc->last_period); + perf_adjust_period(event, delta, hwc->last_period, true); } /* -- cgit v1.1 From f6302f1bcd75a042df69866d98b8d775a668f8f1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 10 Feb 2012 09:03:58 +0100 Subject: relay: prevent integer overflow in relay_open() "subbuf_size" and "n_subbufs" come from the user and they need to be capped to prevent an integer overflow. Signed-off-by: Dan Carpenter Cc: stable@kernel.org Signed-off-by: Jens Axboe --- kernel/relay.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 4335e1d..ab56a17 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -164,10 +164,14 @@ depopulate: */ static struct rchan_buf *relay_create_buf(struct rchan *chan) { - struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); - if (!buf) + struct rchan_buf *buf; + + if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) return NULL; + buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); + if (!buf) + return NULL; buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); if (!buf->padding) goto free_buf; @@ -574,6 +578,8 @@ struct rchan *relay_open(const char *base_filename, if (!(subbuf_size && n_subbufs)) return NULL; + if (subbuf_size > UINT_MAX / n_subbufs) + return NULL; chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); if (!chan) -- cgit v1.1 From 10f296cbfe3b93188c41463fd7a53808ebdbcbe3 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 1 Feb 2012 10:33:11 +0800 Subject: module: make module param bint handle nul value Allow bint param accept nul values, just do same as bool param. Signed-off-by: Dave Young Cc: Rusty Russell Signed-off-by: Rusty Russell --- kernel/params.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 32ee043..4bc965d 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -97,7 +97,8 @@ static int parse_one(char *param, for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { /* No one handled NULL, so do it here. */ - if (!val && params[i].ops->set != param_set_bool) + if (!val && params[i].ops->set != param_set_bool + && params[i].ops->set != param_set_bint) return -EINVAL; pr_debug("They are equal! Calling %p\n", params[i].ops->set); -- cgit v1.1 From 074b85175a43a23fdbde60f55feea636e0bf0f85 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 8 Feb 2012 12:39:07 -0800 Subject: vfs: fix panic in __d_lookup() with high dentry hashtable counts When the number of dentry cache hash table entries gets too high (2147483648 entries), as happens by default on a 16TB system, use of a signed integer in the dcache_init() initialization loop prevents the dentry_hashtable from getting initialized, causing a panic in __d_lookup(). Fix this in dcache_init() and similar areas. Signed-off-by: Dimitri Sivanich Acked-by: David S. Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- kernel/pid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index ce8e00d..9f08dfa 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -543,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - int i, pidhash_size; + unsigned int i, pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, HASH_EARLY | HASH_SMALL, &pidhash_shift, NULL, 4096); - pidhash_size = 1 << pidhash_shift; + pidhash_size = 1U << pidhash_shift; for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); -- cgit v1.1 From ac5637611150281f398bb7a47e3fcb69a09e7803 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Feb 2012 17:58:03 +0100 Subject: genirq: Unmask oneshot irqs when thread was not woken When the primary handler of an interrupt which is marked IRQ_ONESHOT returns IRQ_HANDLED or IRQ_NONE, then the interrupt thread is not woken and the unmask logic of the interrupt line is never invoked. This keeps the interrupt masked forever. This was not noticed as most IRQ_ONESHOT users wake the thread unconditionally (usually because they cannot access the underlying device from hard interrupt context). Though this behaviour was nowhere documented and not necessarily intentional. Some drivers can avoid the thread wakeup in certain cases and run into the situation where the interrupt line s kept masked. Handle it gracefully. Reported-and-tested-by: Lothar Wassmann Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f7c543a..b742edc 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -330,6 +330,24 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_simple_irq); +/* + * Called unconditionally from handle_level_irq() and only for oneshot + * interrupts from handle_fasteoi_irq() + */ +static void cond_unmask_irq(struct irq_desc *desc) +{ + /* + * We need to unmask in the following cases: + * - Standard level irq (IRQF_ONESHOT is not set) + * - Oneshot irq which did not wake the thread (caused by a + * spurious interrupt or a primary handler handling it + * completely). + */ + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) + unmask_irq(desc); +} + /** * handle_level_irq - Level type irq handler * @irq: the interrupt number @@ -362,8 +380,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) handle_irq_event(desc); - if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) - unmask_irq(desc); + cond_unmask_irq(desc); + out_unlock: raw_spin_unlock(&desc->lock); } @@ -417,6 +435,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) preflow_handler(desc); handle_irq_event(desc); + if (desc->istate & IRQS_ONESHOT) + cond_unmask_irq(desc); + out_eoi: desc->irq_data.chip->irq_eoi(&desc->irq_data); out_unlock: -- cgit v1.1 From b4bc724e82e80478cba5fe9825b62e71ddf78757 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Feb 2012 11:57:52 +0100 Subject: genirq: Handle pending irqs in irq_startup() An interrupt might be pending when irq_startup() is called, but the startup code does not invoke the resend logic. In some cases this prevents the device from issuing another interrupt which renders the device non functional. Call the resend function in irq_startup() to keep things going. Reported-and-tested-by: Russell King Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 4 ++-- kernel/irq/chip.c | 17 ++++++++++------- kernel/irq/internals.h | 2 +- kernel/irq/manage.c | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 342d8f4..0119b9d 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -53,7 +53,7 @@ unsigned long probe_irq_on(void) if (desc->irq_data.chip->irq_set_type) desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); - irq_startup(desc); + irq_startup(desc, false); } raw_spin_unlock_irq(&desc->lock); } @@ -70,7 +70,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc)) + if (irq_startup(desc, false)) desc->istate |= IRQS_PENDING; } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b742edc..fb7db75 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -157,19 +157,22 @@ static void irq_state_set_masked(struct irq_desc *desc) irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); } -int irq_startup(struct irq_desc *desc) +int irq_startup(struct irq_desc *desc, bool resend) { + int ret = 0; + irq_state_clr_disabled(desc); desc->depth = 0; if (desc->irq_data.chip->irq_startup) { - int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); + ret = desc->irq_data.chip->irq_startup(&desc->irq_data); irq_state_clr_masked(desc); - return ret; + } else { + irq_enable(desc); } - - irq_enable(desc); - return 0; + if (resend) + check_irq_resend(desc, desc->irq_data.irq); + return ret; } void irq_shutdown(struct irq_desc *desc) @@ -646,7 +649,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); - irq_startup(desc); + irq_startup(desc, true); } out: irq_put_desc_busunlock(desc, flags); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b795231..40378ff 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -67,7 +67,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); -extern int irq_startup(struct irq_desc *desc); +extern int irq_startup(struct irq_desc *desc, bool resend); extern void irq_shutdown(struct irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a9a9dbe..32313c0 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1027,7 +1027,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->istate |= IRQS_ONESHOT; if (irq_settings_can_autoenable(desc)) - irq_startup(desc); + irq_startup(desc, true); else /* Undo nested disables: */ desc->depth = 1; -- cgit v1.1 From 1fd36adcd98c14d2fd97f545293c488775cb2823 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Feb 2012 17:49:54 +0000 Subject: Replace the fd_sets in struct fdtable with an array of unsigned longs Replace the fd_sets in struct fdtable with an array of unsigned longs and then use the standard non-atomic bit operations rather than the FD_* macros. This: (1) Removes the abuses of struct fd_set: (a) Since we don't want to allocate a full fd_set the vast majority of the time, we actually, in effect, just allocate a just-big-enough array of unsigned longs and cast it to an fd_set type - so why bother with the fd_set at all? (b) Some places outside of the core fdtable handling code (such as SELinux) want to look inside the array of unsigned longs hidden inside the fd_set struct for more efficient iteration over the entire set. (2) Eliminates the use of FD_*() macros in the kernel completely. (3) Permits the __FD_*() macros to be deleted entirely where not exposed to userspace. Signed-off-by: David Howells Link: http://lkml.kernel.org/r/20120216174954.23314.48147.stgit@warthog.procyon.org.uk Signed-off-by: H. Peter Anvin Cc: Al Viro --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 4b4042f..4db0200 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -473,7 +473,7 @@ static void close_files(struct files_struct * files) i = j * __NFDBITS; if (i >= fdt->max_fds) break; - set = fdt->open_fds->fds_bits[j++]; + set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); -- cgit v1.1 From 6684ba202b5ab2f36d574c72fe50c207d99b3e35 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 17:38:00 -0800 Subject: compat: Add helper functions to read/write struct timeval, timespec Add helper functions to read and write struct timeval and struct timespec from userspace. We already had helper functions for reading and writing struct compat_timespec; add a set of functions to do the same with struct timeval, and add a second suite of functions which can be sensitive to COMPAT_USE_64BIT_TIME and access either 32- or 64-bit time structures. This also exports these helper functions to modules. Rename the existing inlines for converting between struct compat_timeval and native struct timespec so we can have a saner naming convention for the exported functions. Suggested-by: Linus Torvalds Signed-off-by: H. Peter Anvin --- kernel/compat.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index f346ced..74ff849 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -31,11 +31,10 @@ #include /* - * Note that the native side is already converted to a timespec, because - * that's what we want anyway. + * Get/set struct timeval with struct timespec on the native side */ -static int compat_get_timeval(struct timespec *o, - struct compat_timeval __user *i) +static int compat_get_timeval_convert(struct timespec *o, + struct compat_timeval __user *i) { long usec; @@ -46,8 +45,8 @@ static int compat_get_timeval(struct timespec *o, return 0; } -static int compat_put_timeval(struct compat_timeval __user *o, - struct timeval *i) +static int compat_put_timeval_convert(struct compat_timeval __user *o, + struct timeval *i) { return (put_user(i->tv_sec, &o->tv_sec) || put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; @@ -117,7 +116,7 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, if (tv) { struct timeval ktv; do_gettimeofday(&ktv); - if (compat_put_timeval(tv, &ktv)) + if (compat_put_timeval_convert(tv, &ktv)) return -EFAULT; } if (tz) { @@ -135,7 +134,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, struct timezone ktz; if (tv) { - if (compat_get_timeval(&kts, tv)) + if (compat_get_timeval_convert(&kts, tv)) return -EFAULT; } if (tz) { @@ -146,12 +145,29 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); } +int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) +{ + return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || + __get_user(tv->tv_sec, &ctv->tv_sec) || + __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(get_compat_timeval); + +int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) +{ + return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || + __put_user(tv->tv_sec, &ctv->tv_sec) || + __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(put_compat_timeval); + int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } +EXPORT_SYMBOL_GPL(get_compat_timespec); int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) { @@ -161,6 +177,42 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user } EXPORT_SYMBOL_GPL(put_compat_timespec); +int compat_get_timeval(struct timeval *tv, const void __user *utv) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; + else + return get_compat_timeval(tv, utv); +} +EXPORT_SYMBOL_GPL(compat_get_timeval); + +int compat_put_timeval(const struct timeval *tv, void __user *utv) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; + else + return put_compat_timeval(tv, utv); +} +EXPORT_SYMBOL_GPL(compat_put_timeval); + +int compat_get_timespec(struct timespec *ts, const void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; + else + return get_compat_timespec(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_get_timespec); + +int compat_put_timespec(const struct timespec *ts, void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; + else + return put_compat_timespec(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_put_timespec); + static long compat_nanosleep_restart(struct restart_block *restart) { struct compat_timespec __user *rmtp; -- cgit v1.1 From 8c79a045fd590a26e81e75f5d8d4ec5c7d23e565 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Jan 2012 14:51:37 +0100 Subject: sched/events: Revert trace_sched_stat_sleeptime() Commit 1ac9bc69 ("sched/tracing: Add a new tracepoint for sleeptime") added a new sched:sched_stat_sleeptime tracepoint. It's broken: the first sample we get on a task might be bad because of a stale sleep_start value that wasn't reset at the last task switch because the tracepoint was not active. It also breaks the existing schedstat samples due to the side effects of: - se->statistics.sleep_start = 0; ... - se->statistics.block_start = 0; Nor do I see means to fix it without adding overhead to the scheduler fast path, which I'm not willing to for the sake of redundant instrumentation. Most importantly, sleep time information can already be constructed by tracing context switches and wakeups, and taking the timestamp difference between the schedule-out, the wakeup and the schedule-in. Signed-off-by: Peter Zijlstra Cc: Andrew Vagin Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-pc4c9qhl8q6vg3bs4j6k0rbd@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - kernel/sched/fair.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d..b342f57 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1932,7 +1932,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); - trace_sched_stat_sleeptime(current, rq->clock); fire_sched_in_preempt_notifiers(current); if (mm) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c6414f..aca16b8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1003,6 +1003,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.sleep_max)) se->statistics.sleep_max = delta; + se->statistics.sleep_start = 0; se->statistics.sum_sleep_runtime += delta; if (tsk) { @@ -1019,6 +1020,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.block_max)) se->statistics.block_max = delta; + se->statistics.block_start = 0; se->statistics.sum_sleep_runtime += delta; if (tsk) { -- cgit v1.1 From d80e731ecab420ddcb79ee9d0ac427acbc187b4b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 24 Feb 2012 20:07:11 +0100 Subject: epoll: introduce POLLFREE to flush ->signalfd_wqh before kfree() This patch is intentionally incomplete to simplify the review. It ignores ep_unregister_pollwait() which plays with the same wqh. See the next change. epoll assumes that the EPOLL_CTL_ADD'ed file controls everything f_op->poll() needs. In particular it assumes that the wait queue can't go away until eventpoll_release(). This is not true in case of signalfd, the task which does EPOLL_CTL_ADD uses its ->sighand which is not connected to the file. This patch adds the special event, POLLFREE, currently only for epoll. It expects that init_poll_funcptr()'ed hook should do the necessary cleanup. Perhaps it should be defined as EPOLLFREE in eventpoll. __cleanup_sighand() is changed to do wake_up_poll(POLLFREE) if ->signalfd_wqh is not empty, we add the new signalfd_cleanup() helper. ep_poll_callback(POLLFREE) simply does list_del_init(task_list). This make this poll entry inconsistent, but we don't care. If you share epoll fd which contains our sigfd with another process you should blame yourself. signalfd is "really special". I simply do not know how we can define the "right" semantics if it used with epoll. The main problem is, epoll calls signalfd_poll() once to establish the connection with the wait queue, after that signalfd_poll(NULL) returns the different/inconsistent results depending on who does EPOLL_CTL_MOD/signalfd_read/etc. IOW: apart from sigmask, signalfd has nothing to do with the file, it works with the current thread. In short: this patch is the hack which tries to fix the symptoms. It also assumes that nobody can take tasklist_lock under epoll locks, this seems to be true. Note: - we do not have wake_up_all_poll() but wake_up_poll() is fine, poll/epoll doesn't use WQ_FLAG_EXCLUSIVE. - signalfd_cleanup() uses POLLHUP along with POLLFREE, we need a couple of simple changes in eventpoll.c to make sure it can't be "lost". Reported-by: Maxime Bizon Cc: Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b77fd559..e2cd3e2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -935,8 +936,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_sighand(struct sighand_struct *sighand) { - if (atomic_dec_and_test(&sighand->count)) + if (atomic_dec_and_test(&sighand->count)) { + signalfd_cleanup(sighand); kmem_cache_free(sighand_cachep, sighand); + } } -- cgit v1.1 From 8f2f748b0656257153bcf0941df8d6060acc5ca6 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 23 Feb 2012 15:27:15 +0530 Subject: CPU hotplug, cpusets, suspend: Don't touch cpusets during suspend/resume Currently, during CPU hotplug, the cpuset callbacks modify the cpusets to reflect the state of the system, and this handling is asymmetric. That is, upon CPU offline, that CPU is removed from all cpusets. However when it comes back online, it is put back only to the root cpuset. This gives rise to a significant problem during suspend/resume. During suspend, we offline all non-boot cpus and during resume we online them back. Which means, after a resume, all cpusets (except the root cpuset) will be restricted to just one single CPU (the boot cpu). But the whole point of suspend/resume is to restore the system to a state which is as close as possible to how it was before suspend. So to fix this, don't touch cpusets during suspend/resume. That is, modify the cpuset-related CPU hotplug callback to just ignore CPU hotplug when it is initiated as part of the suspend/resume sequence. Reported-by: Prashanth Nageshappa Signed-off-by: Srivatsa S. Bhat Cc: Linus Torvalds Cc: Andrew Morton Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/4F460D7B.1020703@linux.vnet.ibm.com Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b342f57..33a0676 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6728,7 +6728,7 @@ int __init sched_create_sysfs_power_savings_entries(struct device *dev) static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { case CPU_ONLINE: case CPU_DOWN_FAILED: cpuset_update_active_cpus(); @@ -6741,7 +6741,7 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { case CPU_DOWN_PREPARE: cpuset_update_active_cpus(); return NOTIFY_OK; -- cgit v1.1 From 30ce2f7eef095d1b8d070740f1948629814fe3c7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 28 Feb 2012 10:19:38 +0900 Subject: perf/hwbp: Fix a possible memory leak If kzalloc() for TYPE_DATA failed on a given cpu, previous chunk of TYPE_INST will be leaked. Fix it. Thanks to Peter Zijlstra for suggesting this better solution. It should work as long as the initial value of the region is all 0's and that's the case of static (per-cpu) memory allocation. Signed-off-by: Namhyung Kim Acked-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1330391978-28070-1-git-send-email-namhyung.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index b7971d6..ee706ce 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -651,10 +651,10 @@ int __init init_hw_breakpoint(void) err_alloc: for_each_possible_cpu(err_cpu) { - if (err_cpu == cpu) - break; for (i = 0; i < TYPE_MAX; i++) kfree(per_cpu(nr_task_bp_pinned[i], cpu)); + if (err_cpu == cpu) + break; } return -ENOMEM; -- cgit v1.1