From 26fcaf60fe3861409eb4c455c5c0d0f00f599b08 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 7 Jan 2011 01:42:31 +0100 Subject: PM: Fix oops in suspend/hibernate code related to failing ioremap() When ioremap() fails (which might happen for some reason), we nicely oops in suspend_nvs_save() due to NULL dereference by memcpy() in there. Fail gracefully instead. Signed-off-by: Jiri Slaby Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- kernel/power/nvs.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c index 1836db6..57c6fab 100644 --- a/kernel/power/nvs.c +++ b/kernel/power/nvs.c @@ -105,7 +105,7 @@ int suspend_nvs_alloc(void) /** * suspend_nvs_save - save NVS memory regions */ -void suspend_nvs_save(void) +int suspend_nvs_save(void) { struct nvs_page *entry; @@ -114,8 +114,14 @@ void suspend_nvs_save(void) list_for_each_entry(entry, &nvs_list, node) if (entry->data) { entry->kaddr = ioremap(entry->phys_start, entry->size); + if (!entry->kaddr) { + suspend_nvs_free(); + return -ENOMEM; + } memcpy(entry->data, entry->kaddr, entry->size); } + + return 0; } /** -- cgit v1.1 From 976513dbfc1547c7b1822566923058655f0c32fd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 7 Jan 2011 01:43:44 +0100 Subject: PM / ACPI: Move NVS saving and restoring code to drivers/acpi The saving of the ACPI NVS area during hibernation and suspend and restoring it during the subsequent resume is entirely specific to ACPI, so move it to drivers/acpi and drop the CONFIG_SUSPEND_NVS configuration option which is redundant. Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- kernel/power/Kconfig | 5 -- kernel/power/Makefile | 1 - kernel/power/nvs.c | 142 -------------------------------------------------- 3 files changed, 148 deletions(-) delete mode 100644 kernel/power/nvs.c (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a5aff3e..2657299 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG depends on PM_ADVANCED_DEBUG default n -config SUSPEND_NVS - bool - config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE - select SUSPEND_NVS if HAS_IOMEM default y ---help--- Allow the system to enter sleep states in which main memory is @@ -140,7 +136,6 @@ config HIBERNATION depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE select LZO_COMPRESS select LZO_DECOMPRESS - select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f9063c6..120a158 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -10,6 +10,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o -obj-$(CONFIG_SUSPEND_NVS) += nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c deleted file mode 100644 index 57c6fab..0000000 --- a/kernel/power/nvs.c +++ /dev/null @@ -1,142 +0,0 @@ -/* - * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory - * - * Copyright (C) 2008,2009 Rafael J. Wysocki , Novell Inc. - * - * This file is released under the GPLv2. 
- */ - -#include -#include -#include -#include -#include -#include - -/* - * Platforms, like ACPI, may want us to save some memory used by them during - * suspend and to restore the contents of this memory during the subsequent - * resume. The code below implements a mechanism allowing us to do that. - */ - -struct nvs_page { - unsigned long phys_start; - unsigned int size; - void *kaddr; - void *data; - struct list_head node; -}; - -static LIST_HEAD(nvs_list); - -/** - * suspend_nvs_register - register platform NVS memory region to save - * @start - physical address of the region - * @size - size of the region - * - * The NVS region need not be page-aligned (both ends) and we arrange - * things so that the data from page-aligned addresses in this region will - * be copied into separate RAM pages. - */ -int suspend_nvs_register(unsigned long start, unsigned long size) -{ - struct nvs_page *entry, *next; - - while (size > 0) { - unsigned int nr_bytes; - - entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); - if (!entry) - goto Error; - - list_add_tail(&entry->node, &nvs_list); - entry->phys_start = start; - nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); - entry->size = (size < nr_bytes) ? size : nr_bytes; - - start += entry->size; - size -= entry->size; - } - return 0; - - Error: - list_for_each_entry_safe(entry, next, &nvs_list, node) { - list_del(&entry->node); - kfree(entry); - } - return -ENOMEM; -} - -/** - * suspend_nvs_free - free data pages allocated for saving NVS regions - */ -void suspend_nvs_free(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - free_page((unsigned long)entry->data); - entry->data = NULL; - if (entry->kaddr) { - iounmap(entry->kaddr); - entry->kaddr = NULL; - } - } -} - -/** - * suspend_nvs_alloc - allocate memory necessary for saving NVS regions - */ -int suspend_nvs_alloc(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) { - entry->data = (void *)__get_free_page(GFP_KERNEL); - if (!entry->data) { - suspend_nvs_free(); - return -ENOMEM; - } - } - return 0; -} - -/** - * suspend_nvs_save - save NVS memory regions - */ -int suspend_nvs_save(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Saving platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - entry->kaddr = ioremap(entry->phys_start, entry->size); - if (!entry->kaddr) { - suspend_nvs_free(); - return -ENOMEM; - } - memcpy(entry->data, entry->kaddr, entry->size); - } - - return 0; -} - -/** - * suspend_nvs_restore - restore NVS memory regions - * - * This function is going to be called with interrupts disabled, so it - * cannot iounmap the virtual addresses used to access the NVS region. - */ -void suspend_nvs_restore(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Restoring platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) - memcpy(entry->kaddr, entry->data, entry->size); -} -- cgit v1.1 From f123c98e7f168e949b283690693695f988332c3d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 6 Jan 2011 15:08:29 -0500 Subject: rtmutex: Fix comment about why new_owner can be NULL in wake_futex_pi() The comment about why rt_mutex_next_owner() can return NULL in wake_futex_pi() is not the normal case. Tracing the cause of why this occurs is more likely that waiter simply timedout. But because it originally caused contention with the futex, the owner will go into the kernel when it unlocks the lock. 
Then it will hit this code path and rt_mutex_next_owner() will return NULL. Cc: Thomas Gleixner Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/futex.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 3019b92..5696d38 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -791,10 +791,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* - * This happens when we have stolen the lock and the original - * pending owner did not enqueue itself back on the rt_mutex. - * Thats not a tragedy. We know that way, that a lock waiter - * is on the fly. We make the futex_q waiter the pending owner. + * It is possible that the next waiter (the one that brought + * this owner to the kernel) timed out and is no longer + * waiting on the lock. */ if (!new_owner) new_owner = this->task; -- cgit v1.1 From e159489baa717dbae70f9903770a6a4990865887 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 9 Jan 2011 23:32:15 +0100 Subject: workqueue: relax lockdep annotation on flush_work() Currently, the lockdep annotation in flush_work() requires exclusive access on the workqueue the target work is queued on and triggers a warning if a work is trying to flush another work on the same workqueue; however, this is no longer true as workqueues can now execute multiple works concurrently. This patch adds lock_map_acquire_read() and makes process_one_work() hold read access to the workqueue while executing a work, and makes start_flush_work() check for write access if the concurrency level is one or the workqueue has a rescuer (as only one execution resource - the rescuer - is guaranteed to be available under memory pressure), and read access if higher. This better represents what's going on and removes spurious lockdep warnings which are triggered by the fake dependency chain created through flush_work(). * Peter pointed out that flushing another work from a WQ_MEM_RECLAIM wq breaks the forward progress guarantee under memory pressure. The condition check was updated accordingly. Signed-off-by: Tejun Heo Reported-by: "Rafael J. Wysocki" Tested-by: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: stable@kernel.org --- kernel/workqueue.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8ee6ec8..930c239 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1840,7 +1840,7 @@ __acquires(&gcwq->lock) spin_unlock_irq(&gcwq->lock); work_clear_pending(work); - lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); trace_workqueue_execute_start(work); f(work); @@ -2384,8 +2384,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, insert_wq_barrier(cwq, barr, work, worker); spin_unlock_irq(&gcwq->lock); - lock_map_acquire(&cwq->wq->lockdep_map); + /* + * If @max_active is 1 or rescuer is in use, flushing another work + * item on the same workqueue may lead to deadlock. Make sure the + * flusher is not running on the same workqueue by verifying write + * access. 
+ */ + if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) + lock_map_acquire(&cwq->wq->lockdep_map); + else + lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_release(&cwq->wq->lockdep_map); + return true; already_gone: spin_unlock_irq(&gcwq->lock); -- cgit v1.1 From 42c025f3de9042d9c9abd9a6f6205d1a0f4bcadf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Jan 2011 15:58:49 +0100 Subject: workqueue: note the nested NOT_RUNNING test in worker_clr_flags() isn't a noop The nested NOT_RUNNING test in worker_clr_flags() is slightly misleading in that if NOT_RUNNING were a single flag the nested test would be always %true and thus noop. Add a comment noting that the test isn't a noop. Signed-off-by: Tejun Heo Cc: Hillf Danton Cc: Andrew Morton --- kernel/workqueue.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 930c239..11869fa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -768,7 +768,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) worker->flags &= ~flags; - /* if transitioning out of NOT_RUNNING, increment nr_running */ + /* + * If transitioning out of NOT_RUNNING, increment nr_running. Note + * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask + * of multiple flags, not a single flag. + */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(get_gcwq_nr_running(gcwq->cpu)); -- cgit v1.1 From 81e88fdc432a1552401d6e91a984dcccce72b8dc Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 12 Jan 2011 14:44:55 +0800 Subject: ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support Generic Hardware Error Source provides a way to report platform hardware errors (such as that from chipset). It works in so called "Firmware First" mode, that is, hardware errors are reported to firmware firstly, then reported to Linux by firmware. This way, some non-standard hardware error registers or non-standard hardware link can be checked by firmware to produce more valuable hardware error information for Linux. This patch adds POLL/IRQ/NMI notification types support. Because the memory area used to transfer hardware error information from BIOS to Linux can be determined only in NMI, IRQ or timer handler, but general ioremap can not be used in atomic context, so a special version of atomic ioremap is implemented for that. Known issue: - Error information can not be printed for recoverable errors notified via NMI, because printk is not NMI-safe. Will fix this via delay printing to IRQ context via irq_work or make printk NMI-safe. v2: - adjust printk format per comments. Signed-off-by: Huang Ying Reviewed-by: Andi Kleen Signed-off-by: Len Brown --- kernel/panic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 4c13b1a..991bb87 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,6 +34,7 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); int panic_timeout; +EXPORT_SYMBOL_GPL(panic_timeout); ATOMIC_NOTIFIER_HEAD(panic_notifier_list); -- cgit v1.1 From 5fdade95f26372154459347dfb9f60721d22cfc7 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 11 Jan 2011 12:18:12 -0500 Subject: time: Rename misnamed minsec argument of clocks_calc_mult_shift() The minsec argument to clocks_calc_mult_shift() is misnamed. 
It is used to clamp the magnitude of the mult factor so that a multiplication with any value in the given range won't overflow a 64-bit result. Let's rename it to match the actual usage. Signed-off-by: Nicolas Pitre Acked-by: John Stultz LKML-Reference: Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c18d7ef..8588abc 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); * @shift: pointer to shift variable * @from: frequency to convert from * @to: frequency to convert to - * @minsec: guaranteed runtime conversion range in seconds + * @maxsec: guaranteed runtime conversion range in seconds * * The function evaluates the shift/mult pair for the scaled math * operations of clocksources and clockevents. @@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock * event @to is the counter frequency and @from is NSEC_PER_SEC. * - * The @minsec conversion range argument controls the time frame in + * The @maxsec conversion range argument controls the time frame in * seconds which must be covered by the runtime conversion with the * calculated mult and shift factors. This guarantees that no 64bit * overflow happens when the input value of the conversion is * multiplied with the calculated mult factor. Larger ranges may * reduce the conversion accuracy by choosing smaller mult and shift * factors. */ void -clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) +clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) { u64 tmp; u32 sft, sftacc= 32; @@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) /* * Calculate the shift factor which is limiting the conversion * range: */ - tmp = ((u64)minsec * from) >> 32; + tmp = ((u64)maxsec * from) >> 32; while (tmp) { tmp >>=1; sftacc--; -- cgit v1.1 From afa14e7c553ebe45844d76208f66017a43abd0e2 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 11 Jan 2011 17:59:38 -0600 Subject: timekeeping: Make local variables static Signed-off-by: H Hartley Sweeten Cc: John Stultz LKML-Reference: <0D753D10438DA54287A00B027084269764CE0E54B7@AUSP01VMBX24.collaborationhost.net> Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5bb86da..eef7452 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -49,7 +49,7 @@ struct timekeeper { u32 mult; }; -struct timekeeper timekeeper; +static struct timekeeper timekeeper; /** * timekeeper_setup_internals - Set up internals to use clocksource clock. @@ -164,7 +164,7 @@ static struct timespec total_sleep_time; /* * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ -struct timespec raw_time; +static struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; -- cgit v1.1 From 6c9ae009b298753a3baf71298d676a68b5a10c8f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 13 Jan 2011 15:45:38 -0800 Subject: irq: use per_cpu kstat_irqs Use modern per_cpu API to increment {soft|hard}irq counters, and use per_cpu allocation for (struct irq_desc)->kstat_irqs instead of an array. This gives better SMP/NUMA locality and saves a few instructions per irq. 
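[Illustration, not part of the commit: the layout change is easy to model in userspace. The sketch below is plain pthreads C with made-up names; padded per-thread counters stand in for alloc_percpu() storage, and the fold-on-read loop mirrors what kstat_irqs() does. Packing the counters into one dense array instead - the old kstat_irqs layout - makes the threads false-share cache lines, which is the problem described next.]

#include <pthread.h>
#include <stdio.h>

#define NTHREADS 4
#define ITERS (1 << 24)

/* One counter per thread, each in its own cache line, like alloc_percpu(). */
static struct counter {
	unsigned long val;
	char pad[64 - sizeof(unsigned long)];
} counters[NTHREADS];

static void *worker(void *arg)
{
	struct counter *c = &counters[(long)arg];
	long i;

	for (i = 0; i < ITERS; i++)
		c->val++; /* cheap: no other thread writes this cache line */
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];
	unsigned long sum = 0;
	long i;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, worker, (void *)i);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);

	/* Readers pay instead: fold the per-thread values, as kstat_irqs() does. */
	for (i = 0; i < NTHREADS; i++)
		sum += counters[i].val;
	printf("total: %lu\n", sum);
	return 0;
}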
With small nr_cpu_ids values (8 for example), kstat_irqs was a small array (less than L1_CACHE_BYTES), potentially a source of false sharing. In the !CONFIG_SPARSE_IRQ case, remove the huge, NUMA/cache unfriendly kstat_irqs_all[NR_IRQS][NR_CPUS] array. Note: we still populate kstat_irqs for all possible irqs in early_irq_init(). We probably could use on-demand allocations. (Code included in alloc_descs()). The problem is that not all IRQs are used with a prior alloc_descs() call. kstat_irqs_this_cpu() is not used anymore, remove it. Signed-off-by: Eric Dumazet Reviewed-by: Christoph Lameter Cc: Ingo Molnar Cc: Andi Kleen Cc: Tejun Heo Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/irqdesc.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9988d03..282f202 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; } static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) { + int cpu; + desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; @@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_count = 0; desc->irqs_unhandled = 0; desc->name = NULL; - memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); + for_each_possible_cpu(cpu) + *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; desc_smp_init(desc, node); } @@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node) if (!desc) return NULL; /* allocate based on nr_cpu_ids */ - desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), - gfp, node); + desc->kstat_irqs = alloc_percpu(unsigned int); if (!desc->kstat_irqs) goto err_desc; @@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node) return desc; err_kstat: - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); err_desc: kfree(desc); return NULL; @@ -166,7 +168,7 @@ static void free_desc(unsigned int irq) mutex_unlock(&sparse_irq_lock); free_masks(desc); - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); kfree(desc); } @@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; -static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; int __init early_irq_init(void) { int count, i, node = first_online_node; @@ -250,7 +251,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - desc[i].kstat_irqs = kstat_irqs_all[i]; + /* TODO : do this allocation on-demand ... 
*/ + desc[i].kstat_irqs = alloc_percpu(unsigned int); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); @@ -275,6 +277,22 @@ static void free_desc(unsigned int irq) static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { +#if defined(CONFIG_KSTAT_IRQS_ONDEMAND) + struct irq_desc *desc; + unsigned int i; + + for (i = 0; i < cnt; i++) { + desc = irq_to_desc(start + i); + if (desc && !desc->kstat_irqs) { + unsigned int __percpu *stats = alloc_percpu(unsigned int); + + if (!stats) + return -1; + if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) + free_percpu(stats); + } + } +#endif return start; } #endif /* !CONFIG_SPARSE_IRQ */ @@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq) unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); - return desc ? desc->kstat_irqs[cpu] : 0; + + return desc && desc->kstat_irqs ? + *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } #ifdef CONFIG_GENERIC_HARDIRQS @@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq) int cpu; int sum = 0; - if (!desc) + if (!desc || !desc->kstat_irqs) return 0; for_each_possible_cpu(cpu) - sum += desc->kstat_irqs[cpu]; + sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; } #endif /* CONFIG_GENERIC_HARDIRQS */ -- cgit v1.1 From 43bb40c9e3aa51a3b038c9df2c9afb4d4685614d Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 13 Jan 2011 15:45:40 -0800 Subject: sched: remove long deprecated CLONE_STOPPED flag This warning was added in commit bdff746a3915 ("clone: prepare to recycle CLONE_STOPPED") three years ago. 2.6.26 came and went. As far as I know, no-one is actually using CLONE_STOPPED. Signed-off-by: Dave Jones Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d9b44f2..76a1fdd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1410,23 +1410,6 @@ long do_fork(unsigned long clone_flags, } /* - * We hope to recycle these flags after 2.6.26 - */ - if (unlikely(clone_flags & CLONE_STOPPED)) { - static int __read_mostly count = 100; - - if (count > 0 && printk_ratelimit()) { - char comm[TASK_COMM_LEN]; - - count--; - printk(KERN_INFO "fork(): process `%s' used deprecated " - "clone flags 0x%lx\n", - get_task_comm(comm, current), - clone_flags & CLONE_STOPPED); - } - } - - /* * When called from kernel_thread, don't do user tracing stuff. */ if (likely(user_mode(regs))) @@ -1464,16 +1447,7 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; - if (unlikely(clone_flags & CLONE_STOPPED)) { - /* - * We'll start up with an immediate SIGSTOP. - */ - sigaddset(&p->pending.signal, SIGSTOP); - set_tsk_thread_flag(p, TIF_SIGPENDING); - __set_task_state(p, TASK_STOPPED); - } else { - wake_up_new_task(p, clone_flags); - } + wake_up_new_task(p, clone_flags); tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); -- cgit v1.1 From dabb16f639820267b3850d804571c70bd93d4e07 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 13 Jan 2011 15:46:05 -0800 Subject: oom: allow a non-CAP_SYS_RESOURCE proces to oom_score_adj down We'd like to be able to oom_score_adj a process up/down as it enters/leaves the foreground. Currently, it is not possible to oom_adj down without CAP_SYS_RESOURCE. 
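[Illustration, not part of the commit: the interface involved is /proc/<pid>/oom_score_adj. A hedged userspace sketch of what the change enables - a process raising its own badness while backgrounded, then restoring an assumed inherited value of 0 on activation, without CAP_SYS_RESOURCE:]

#include <stdio.h>

/* Write value to our own oom_score_adj; returns 0 on success. */
static int set_self_oom_score_adj(int value)
{
	FILE *f = fopen("/proc/self/oom_score_adj", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", value);
	return fclose(f);
}

int main(void)
{
	set_self_oom_score_adj(500); /* backgrounded: preferred OOM victim */
	/* ... later, brought back to the foreground ... */
	return set_self_oom_score_adj(0); /* back to the fork-time value */
}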
This patch allows a task to decrease its oom_score_adj back to the value that a CAP_SYS_RESOURCE thread set it to, or to its inherited value at fork. Assuming the thread that forked it has an oom_score_adj of 0, each process could decrease it back from 0 upon activation, unless a CAP_SYS_RESOURCE thread elevated it to something higher. Alternatives considered: * a setuid binary * a daemon with CAP_SYS_RESOURCE Since you don't want all processes to be able to reduce their oom_adj, a setuid or daemon implementation would be complex. The alternatives also have much higher overhead. This patch was updated from the original patch based on feedback from David Rientjes. Signed-off-by: Mandeep Singh Baines Acked-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 76a1fdd..1499607 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -910,6 +910,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); -- cgit v1.1 From a5b338f2b0b1ff73ae20c66ab831201549eaec01 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:34 -0800 Subject: thp: update futex compound knowledge Futex code is smarter than most other gup_fast O_DIRECT code and knows about the compound internals. However, now doing a put_page(head_page) will not release the pin on the tail page taken by gup-fast, leading to all sorts of refcounting bugchecks. Getting a stable head_page is a little tricky. The initial page_head = page is there because if this is not a tail page it's also the page_head. Only in case this is a tail page is compound_head called; otherwise it's guaranteed unnecessary. And if it's a tail page, compound_head has to run atomically within the irq-disabled section set up around __get_user_pages_fast, before irqs are re-enabled. Otherwise ->first_page won't be a stable pointer. Disabling irqs before __get_user_pages_fast and releasing them after running compound_head is needed because if __get_user_pages_fast returns == 1, it means the huge pmd is established and cannot go away from under us. pmdp_splitting_flush_notify in __split_huge_page_splitting will have to wait for local_irq_enable before the IPI delivery can return. This means __split_huge_page_refcount can't be running from under us, and in turn when we run compound_head(page) we're not reading a dangling pointer from tailpage->first_page. Then after we get to a stable head page, we are always safe to call compound_lock, and after taking the compound lock on the head page we can finally re-check whether the page returned by gup-fast is still a tail page, in which case we're set and we didn't need to split the hugepage in order to take a futex on it. 
Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 55 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 3019b92..5207563 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page; + struct page *page, *page_head; int err; /* @@ -265,11 +265,46 @@ again: if (err < 0) return err; - page = compound_head(page); - lock_page(page); - if (!page->mapping) { - unlock_page(page); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + page_head = page; + if (unlikely(PageTail(page))) { put_page(page); + /* serialize against __split_huge_page_splitting() */ + local_irq_disable(); + if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { + page_head = compound_head(page); + /* + * page_head is valid pointer but we must pin + * it before taking the PG_lock and/or + * PG_compound_lock. The moment we re-enable + * irqs __split_huge_page_splitting() can + * return and the head page can be freed from + * under us. We can't take the PG_lock and/or + * PG_compound_lock on a page that could be + * freed from under us. + */ + if (page != page_head) { + get_page(page_head); + put_page(page); + } + local_irq_enable(); + } else { + local_irq_enable(); + goto again; + } + } +#else + page_head = compound_head(page); + if (page != page_head) { + get_page(page_head); + put_page(page); + } +#endif + + lock_page(page_head); + if (!page_head->mapping) { + unlock_page(page_head); + put_page(page_head); goto again; } @@ -280,20 +315,20 @@ again: * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page)) { + if (PageAnon(page_head)) { key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; } else { key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = page->mapping->host; - key->shared.pgoff = page->index; + key->shared.inode = page_head->mapping->host; + key->shared.pgoff = page_head->index; } get_futex_key_refs(key); - unlock_page(page); - put_page(page); + unlock_page(page_head); + put_page(page_head); return 0; } -- cgit v1.1 From e7a00c45f29c0155007aa150bf231a70fa470365 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:45 -0800 Subject: thp: add pmd_huge_pte to mm_struct This increases the size of the mm struct a bit, but it is needed to preallocate one pte for each hugepage so that split_huge_page will not require a fail path. Guarantee of success is a fundamental property of split_huge_page, to avoid decreasing swapping reliability and to avoid adding -ENOMEM fail paths that would otherwise force the hugepage-unaware VM code to learn rolling back in the middle of its pte mangling operations (if anything, we'd want it to learn to handle pmd_trans_huge natively rather than to be capable of rollback). When split_huge_page runs, a pte is needed for the split to succeed, to map the newly split regular pages with regular ptes. This way all existing VM code remains backwards compatible by just adding a split_huge_page* one-liner. The memory waste of those preallocated ptes is negligible and so it is worth it. 
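[Illustration, not part of the commit: the design principle - pay for a small allocation up front so a later, hard-to-unwind operation has no failure path - in miniature. Userspace C with made-up names, not the kernel implementation:]

#include <stdlib.h>

struct pgtable { int dummy; }; /* stand-in for the preallocated pte page */

struct mm {
	struct pgtable *spare; /* reserved at setup, like pmd_huge_pte */
};

/* Called early, where -ENOMEM is easy to report and unwind. */
static int mm_reserve(struct mm *mm)
{
	mm->spare = malloc(sizeof(*mm->spare));
	return mm->spare ? 0 : -1;
}

/* Called deep inside pte-mangling code: consumes the reserve and thus
 * cannot fail, mirroring the guarantee split_huge_page relies on. */
static struct pgtable *mm_take_spare(struct mm *mm)
{
	struct pgtable *p = mm->spare;

	mm->spare = NULL;
	return p;
}

int main(void)
{
	struct mm mm;

	if (mm_reserve(&mm))
		return 1; /* the only failure point is up front */
	free(mm_take_spare(&mm)); /* the "split" itself cannot fail */
	return 0;
}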
Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1499607..f78f50b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -529,6 +529,9 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); +#endif free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -669,6 +672,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->token_priority = 0; mm->last_interval = 0; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + mm->pmd_huge_pte = NULL; +#endif + if (!mm_init(mm, tsk)) goto fail_nomem; -- cgit v1.1 From ba76149f47d8c939efa0acc07a191237af900471 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:58 -0800 Subject: thp: khugepaged Add khugepaged to relocate fragmented pages into hugepages if new hugepages become available. (This is independent of the defrag logic that will have to make new hugepages available.) The fundamental reason why khugepaged is unavoidable is that some memory can be fragmented and not everything can be relocated. So when a virtual machine quits and releases gigabytes of hugepages, we want to use those freely available hugepages to create huge-pmds in the other virtual machines that may be running on fragmented memory, to maximize the CPU efficiency at all times. The scan is slow; it takes nearly zero cpu time, except when it copies data (in which case it means we definitely want to pay for that cpu time), so it seems a good tradeoff. In addition to hugepages being released by other processes releasing memory, we have the strong suspicion that the performance impact of potentially defragmenting hugepages during or before each page fault could lead to more performance inconsistency than allocating small pages at first and having them collapsed into large pages later... if they prove themselves to be long-lived mappings (the khugepaged scan is slow, so short-lived mappings have a low probability of running into khugepaged, compared to long-lived mappings). Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f78f50b..25e4291 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -330,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) retval = ksm_fork(mm, oldmm); if (retval) goto out; + retval = khugepaged_fork(mm, oldmm); + if (retval) + goto out; prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { @@ -546,6 +550,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { -- cgit v1.1 From 3ec762ad8be364c2fadfe0d6b2cc6d4d3b5e1b54 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 14 Jan 2011 11:34:34 +0800 Subject: cgroups: Fix a lockdep warning at cgroup removal Commit 2fd6b7f5 ("fs: dcache scale subdirs") forgot to annotate a dentry lock, which caused a lockdep warning. 
Reported-by: Valdis Kletnieks Signed-off-by: Li Zefan --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 51cddc1..a7837e2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -912,7 +912,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry) parent = dentry->d_parent; spin_lock(&parent->d_lock); - spin_lock(&dentry->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); list_del_init(&dentry->d_u.d_child); spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); -- cgit v1.1 From c072a388d59a1d48e36864d0e66f42d71745be1c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Jan 2011 02:33:47 -0800 Subject: rcu: demote SRCU_SYNCHRONIZE_DELAY from kernel-parameter status Because the adaptive synchronize_srcu_expedited() approach has worked very well in testing, remove the kernel parameter and replace it by a C-preprocessor macro. If someone finds problems with this approach, a more complex and aggressively adaptive approach might be required. Longer term, SRCU will be merged with the other RCU implementations, at which point synchronize_srcu_expedited() will be event driven, just as synchronize_sched_expedited() currently is. At that point, there will be no need for this adaptive approach. Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney --- kernel/srcu.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/srcu.c b/kernel/srcu.c index 98d8c1e..73ce23f 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -156,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx) EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). We spin for a fixed time period + * (defined below) to allow SRCU readers to exit their read-side critical + * sections. If there are still some readers after 10 microseconds, + * we repeatedly block for 1-millisecond time periods. This approach + * has done well in testing, so there is no need for a config parameter. + */ +#define SYNCHRONIZE_SRCU_READER_DELAY 10 + +/* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) @@ -207,11 +217,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) * will have finished executing. We initially give readers * an arbitrarily chosen 10 microseconds to get out of their * SRCU read-side critical sections, then loop waiting 1/HZ - * seconds per iteration. + * seconds per iteration. The 10-microsecond value has done + * very well in testing. */ if (srcu_readers_active_idx(sp, idx)) - udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY); + udelay(SYNCHRONIZE_SRCU_READER_DELAY); while (srcu_readers_active_idx(sp, idx)) schedule_timeout_interruptible(1); -- cgit v1.1 From b24efdfdf679cf9b05947c531971905fc727dd40 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Jan 2011 14:18:11 -0800 Subject: rcu: avoid pointless blocked-task warnings If the RCU callback-processing kthread has nothing to do, it parks in a wait_event(). If RCU remains idle for more than two minutes, the kernel complains about this. This commit changes from wait_event() to wait_event_interruptible() to prevent the kernel from complaining just because RCU is idle. Reported-by: Russell King Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Tested-by: Thomas Weber Tested-by: Russell King --- kernel/rcutiny.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 0344937..0c343b9 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -189,7 +189,8 @@ static int rcu_kthread(void *arg) unsigned long flags; for (;;) { - wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0); + wait_event_interruptible(rcu_kthread_wq, + have_rcu_kthread_work != 0); morework = rcu_boost(); local_irq_save(flags); work = have_rcu_kthread_work; -- cgit v1.1 From c72a04e34735ec3f19f4788b7f95017310b5e1eb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Jan 2011 05:31:45 +0000 Subject: cgroup_fs: fix cgroup use of simple_lookup() cgroup can't use simple_lookup(), since that'd override its desired ->d_op. Tested-by: Li Zefan Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5c5f4cc..ffb7bba 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); */ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); static const struct inode_operations cgroup_dir_inode_operations; @@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) iput(inode); } +static int cgroup_delete(const struct dentry *d) +{ + return 1; +} + static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1451,6 +1457,7 @@ static int cgroup_get_rootdir(struct super_block *sb) { static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, + .d_delete = cgroup_delete, }; struct inode *inode = @@ -2195,12 +2202,20 @@ static const struct file_operations cgroup_file_operations = { }; static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = simple_lookup, + .lookup = cgroup_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, }; +static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + d_add(dentry, NULL); + return NULL; +} + /* * Check if a file is a control file */ -- cgit v1.1 From 7f85803a26f304e698c673838aab06cc6d4d6e59 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 14 Jan 2011 17:49:02 +0800 Subject: tracing: Remove syscall_exit_fields There is no need for syscall_exit_fields as the syscall exit event class can already host the fields in its structure, like most other trace events do by default. Use that default behavior instead. Following this scheme, we don't need anymore to override the get_fields() callback of the syscall exit event class either. Hence both syscall_exit_fields and syscall_get_exit_fields() can be removed. 
Also changed some indentation to keep the following under 80 characters: ".fields = LIST_HEAD_INIT(event_class_syscall_exit.fields)," Acked-by: Frederic Weisbecker Signed-off-by: Lai Jiangshan LKML-Reference: <4D301C0E.8090408@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index bac752f..b706529 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event, static int syscall_enter_define_fields(struct ftrace_event_call *call); static int syscall_exit_define_fields(struct ftrace_event_call *call); -/* All syscall exit events have the same fields */ -static LIST_HEAD(syscall_exit_fields); - static struct list_head * syscall_get_enter_fields(struct ftrace_event_call *call) { @@ -34,34 +31,28 @@ syscall_get_enter_fields(struct ftrace_event_call *call) return &entry->enter_fields; } -static struct list_head * -syscall_get_exit_fields(struct ftrace_event_call *call) -{ - return &syscall_exit_fields; -} - struct trace_event_functions enter_syscall_print_funcs = { - .trace = print_syscall_enter, + .trace = print_syscall_enter, }; struct trace_event_functions exit_syscall_print_funcs = { - .trace = print_syscall_exit, + .trace = print_syscall_exit, }; struct ftrace_event_class event_class_syscall_enter = { - .system = "syscalls", - .reg = syscall_enter_register, - .define_fields = syscall_enter_define_fields, - .get_fields = syscall_get_enter_fields, - .raw_init = init_syscall_trace, + .system = "syscalls", + .reg = syscall_enter_register, + .define_fields = syscall_enter_define_fields, + .get_fields = syscall_get_enter_fields, + .raw_init = init_syscall_trace, }; struct ftrace_event_class event_class_syscall_exit = { - .system = "syscalls", - .reg = syscall_exit_register, - .define_fields = syscall_exit_define_fields, - .get_fields = syscall_get_exit_fields, - .raw_init = init_syscall_trace, + .system = "syscalls", + .reg = syscall_exit_register, + .define_fields = syscall_exit_define_fields, + .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), + .raw_init = init_syscall_trace, }; extern unsigned long __start_syscalls_metadata[]; -- cgit v1.1 From 977dda7c9b540f48b228174346d8b31542c1e99f Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 14 Jan 2011 17:57:50 -0800 Subject: sched: Update effective_load() to use global share weights Previously effective_load would approximate the global load weight present on a group taking advantage of: entity_weight = tg->shares ( lw / global_lw ), where entity_weight was provided by tg_shares_up. This worked (approximately) for an 'empty' (at tg level) cpu since we would place boost load representative of what a newly woken task would receive. However, now that load is instantaneously updated this assumption is no longer true and the load calculation is rather incorrect in this case. Fix this (and improve the general case) by re-writing effective_load to take advantage of the new shares distribution code. 
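[Illustration, not part of the commit: in symbols, the old approximation was entity_weight = tg->shares * (lw / global_lw). The toy below models a single (leaf) level of the rewritten loop with made-up numbers; the real function iterates up the group hierarchy and clamps against MIN_SHARES, both omitted here.]

#include <stdio.h>

/* One leaf-level step of the rewritten effective_load(): a task of
 * weight task_load wakes on a cpu whose cfs_rq currently weighs
 * queue_weight and contributes my_contrib to the group's global
 * load_weight tg_load. Returns the change in the group entity's
 * weight as seen one level up. */
static long leaf_step(long shares, long tg_load, long my_contrib,
		      long queue_weight, long task_load, long old_se_weight)
{
	/* global load with this cpu's stale contribution replaced by
	 * its instantaneous weight plus the waking task */
	long lw = tg_load - my_contrib + queue_weight + task_load;
	/* this cpu's new queue weight */
	long w = queue_weight + task_load;
	long new_se_weight = (lw > 0 && w < lw) ? w * shares / lw : shares;

	return new_se_weight - old_se_weight;
}

int main(void)
{
	/* shares=1024, global load 2048 (512 of it from this cpu, stale),
	 * current entity weight 256, waking task of weight 1024 */
	printf("delta: %ld\n", leaf_step(1024, 2048, 512, 512, 1024, 256));
	/* lw = 3072, w = 1536, new weight = 512, delta = 256 */
	return 0;
}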
Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110115015817.069769529@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c62ebae..414145c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1362,27 +1362,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; for_each_sched_entity(se) { - long S, rw, s, a, b; + long lw, w; - S = se->my_q->tg->shares; - s = se->load.weight; - rw = se->my_q->load.weight; + tg = se->my_q->tg; + w = se->my_q->load.weight; - a = S*(rw + wl); - b = S*rw + s*wg; + /* use this cpu's instantaneous contribution */ + lw = atomic_read(&tg->load_weight); + lw -= se->my_q->load_contribution; + lw += w + wg; - wl = s*(a-b); + wl += w; - if (likely(b)) - wl /= b; + if (lw > 0 && wl < lw) + wl = (wl * tg->shares) / lw; + else + wl = tg->shares; - /* - * Assume the group is already running and will - * thus already be accounted for in the weight. - * - * That is, moving shares between CPUs, does not - * alter the group weight. - */ + /* zero point is MIN_SHARES */ + if (wl < MIN_SHARES) + wl = MIN_SHARES; + wl -= se->load.weight; wg = 0; } -- cgit v1.1 From efe25c2c7b3a5d17b0c70987a758d8fe7af8e3d1 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Tue, 11 Jan 2011 15:41:54 +0530 Subject: sched: Reinstate group names in /proc/sched_debug Displaying of group names in /proc/sched_debug was dropped in autogroup patches. Add group names while displaying cfs_rq and tasks information. Signed-off-by: Bharata B Rao Signed-off-by: Peter Zijlstra LKML-Reference: <20110111101153.GE4772@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 1dfae3d..4d36f37 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -16,6 +16,8 @@ #include #include +static DEFINE_SPINLOCK(sched_debug_lock); + /* * This allows printing both to /proc/sched_debug and * to the console @@ -86,6 +88,23 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group } #endif +#ifdef CONFIG_CGROUP_SCHED +static char group_path[PATH_MAX]; + +static char *task_group_path(struct task_group *tg) +{ + /* + * May be NULL if the underlying cgroup isn't fully-created yet + */ + if (!tg->css.cgroup) { + group_path[0] = '\0'; + return group_path; + } + cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; +} +#endif + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { @@ -108,6 +127,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif +#ifdef CONFIG_CGROUP_SCHED + SEQ_printf(m, " %s", task_group_path(task_group(p))); +#endif SEQ_printf(m, "\n"); } @@ -144,7 +166,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; +#ifdef CONFIG_FAIR_GROUP_SCHED + SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); +#else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); +#endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -191,7 +217,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct 
cfs_rq *cfs_rq) void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { +#ifdef CONFIG_RT_GROUP_SCHED + SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); +#else SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); +#endif #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) @@ -212,6 +242,7 @@ extern __read_mostly int sched_clock_running; static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long flags; #ifdef CONFIG_X86 { @@ -266,10 +297,14 @@ static void print_cpu(struct seq_file *m, int cpu) #undef P #endif + spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); + rcu_read_lock(); print_rq(m, rq, cpu); + rcu_read_unlock(); + spin_unlock_irqrestore(&sched_debug_lock, flags); } static const char *sched_tunable_scaling_names[] = { -- cgit v1.1 From 8ecedd7a06d27a31dbb36fab88e2ba6e6edd43ca Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Tue, 11 Jan 2011 15:42:57 +0530 Subject: sched: Display autogroup names in /proc/sched_debug Add autogroup name to cfs_rq and tasks information to /proc/sched_debug. Signed-off-by: Bharata B Rao Signed-off-by: Peter Zijlstra LKML-Reference: <20110111101257.GF4772@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched_autogroup.c | 5 +++++ kernel/sched_debug.c | 3 +++ 2 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 32a723b..938d52f 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -231,6 +231,11 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) #ifdef CONFIG_SCHED_DEBUG static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) { + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (!enabled || !tg->autogroup) + return 0; + return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); } #endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4d36f37..e4d3725 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -93,6 +93,9 @@ static char group_path[PATH_MAX]; static char *task_group_path(struct task_group *tg) { + if (autogroup_path(tg, group_path, PATH_MAX)) + return group_path; + /* * May be NULL if the underlying cgroup isn't fully-created yet */ -- cgit v1.1 From f44937718ce3b8360f72f6c68c9481712517a867 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 13 Jan 2011 04:54:50 +0100 Subject: sched, autogroup: Fix CONFIG_RT_GROUP_SCHED sched_setscheduler() failure If CONFIG_RT_GROUP_SCHED is set, __sched_setscheduler() fails due to autogroup not allocating rt_runtime. Free unused/unusable rt_se and rt_rq, redirect RT tasks to the root task group, and tell __sched_setscheduler() that it's ok. Reported-and-tested-by: Bharata B Rao Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1294890890.8089.39.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- kernel/sched_autogroup.c | 27 +++++++++++++++++++++++++++ kernel/sched_autogroup.h | 4 ++++ 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a0eb094..6cbff6b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4871,7 +4871,8 @@ recheck: * assigned. 
*/ if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0) { + task_group(p)->rt_bandwidth.rt_runtime == 0 && + !task_group_is_autogroup(task_group(p))) { __task_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, flags); return -EPERM; diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 938d52f..9fb6562 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -27,6 +27,11 @@ static inline void autogroup_destroy(struct kref *kref) { struct autogroup *ag = container_of(kref, struct autogroup, kref); +#ifdef CONFIG_RT_GROUP_SCHED + /* We've redirected RT tasks to the root task group... */ + ag->tg->rt_se = NULL; + ag->tg->rt_rq = NULL; +#endif sched_destroy_group(ag->tg); } @@ -55,6 +60,10 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p) return ag; } +#ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg); +#endif + static inline struct autogroup *autogroup_create(void) { struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); @@ -72,6 +81,19 @@ static inline struct autogroup *autogroup_create(void) init_rwsem(&ag->lock); ag->id = atomic_inc_return(&autogroup_seq_nr); ag->tg = tg; +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Autogroup RT tasks are redirected to the root task group + * so we don't have to move tasks around upon policy change, + * or flail around trying to allocate bandwidth on the fly. + * A bandwidth exception in __sched_setscheduler() allows + * the policy change to proceed. Thereafter, task_group() + * returns &root_task_group, so zero bandwidth is required. + */ + free_rt_sched_group(tg); + tg->rt_se = root_task_group.rt_se; + tg->rt_rq = root_task_group.rt_rq; +#endif tg->autogroup = ag; return ag; @@ -106,6 +128,11 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) return true; } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return tg != &root_task_group && tg->autogroup; +} + static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 5358e24..7b859ff 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h @@ -15,6 +15,10 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg); static inline void autogroup_init(struct task_struct *init_task) { } static inline void autogroup_free(struct task_group *tg) { } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return 0; +} static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) -- cgit v1.1 From fce2097983d914ea8c2338efc6f6e9bb737f6ae4 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Fri, 14 Jan 2011 15:57:39 +0800 Subject: sched: Replace rq->bkl_count with rq->rq_sched_info.bkl_count Now rq->rq_sched_info.bkl_count is not used for rq, scroll rq->bkl_count into it. Thus we can save some space for rq. 
Signed-off-by: Yong Zhang Signed-off-by: Peter Zijlstra LKML-Reference: <1294991859-13246-1-git-send-email-yong.zhang0@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +---- kernel/sched_debug.c | 4 +++- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6cbff6b..0a169a8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -553,9 +553,6 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; - - /* BKL stats */ - unsigned int bkl_count; #endif }; @@ -3887,7 +3884,7 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); #ifdef CONFIG_SCHEDSTATS if (unlikely(prev->lock_depth >= 0)) { - schedstat_inc(this_rq(), bkl_count); + schedstat_inc(this_rq(), rq_sched_info.bkl_count); schedstat_inc(prev, sched_info.bkl_count); } #endif diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index e4d3725..eb6cb8e 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -296,9 +296,11 @@ static void print_cpu(struct seq_file *m, int cpu) P(ttwu_count); P(ttwu_local); - P(bkl_count); + SEQ_printf(m, " .%-30s: %d\n", "bkl_count", + rq->rq_sched_info.bkl_count); #undef P +#undef P64 #endif spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); -- cgit v1.1 From d7d8294415f0ce4254827d4a2a5ee88b00be52a8 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 5 Jan 2011 05:41:17 +0100 Subject: sched: Fix signed unsigned comparison in check_preempt_tick() Signed unsigned comparison may lead to superfluous resched if leftmost is right of the current task, wasting a few cycles, and inadvertently _lengthening_ the current task's slice. Reported-by: Venkatesh Pallipadi Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1294202477.9384.5.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 414145c..77e9166 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1062,6 +1062,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) struct sched_entity *se = __pick_next_entity(cfs_rq); s64 delta = curr->vruntime - se->vruntime; + if (delta < 0) + return; + if (delta > ideal_runtime) resched_task(rq_of(cfs_rq)->curr); } -- cgit v1.1 From c5ed5145591774bd9a2960ba4ca45a02fc70aad1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 17 Jan 2011 13:45:37 +0100 Subject: perf: Fix contexted inheritance Linus reported that the RCU lockdep annotation bits triggered for this rcu_dereference() because we're not holding rcu_read_lock(). Going over the code I cannot convince myself its correct: - holding a ref on the parent_ctx, doesn't avoid it being uncloned concurrently (as the comment says), so we can race with a free. - holding parent_ctx->mutex doesn't avoid the above free from taking place either, it would at best avoid parent_ctx from being freed. I.e. the warning is correct. To fix the bug, serialize against the unclone_ctx() call by extending the reach of the parent_ctx->lock. Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Paul E. 
McKenney LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index b782b7a..76be4c7 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6494,7 +6494,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn) raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 0; - raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); child_ctx = child->perf_event_ctxp[ctxn]; @@ -6502,12 +6501,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn) /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. - * Note that if the parent is a clone, it could get - * uncloned at any point, but that doesn't matter - * because the list of events and the generation - * count can't have changed since we took the mutex. + * + * Note that if the parent is a clone, the holding of + * parent_ctx->lock avoids it from being uncloned. */ - cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); + cloned_ctx = parent_ctx->parent_ctx; if (cloned_ctx) { child_ctx->parent_ctx = cloned_ctx; child_ctx->parent_gen = parent_ctx->parent_gen; @@ -6518,6 +6516,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) get_ctx(child_ctx->parent_ctx); } + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); -- cgit v1.1 From 22a4ec729017ba613337a28f306f94ba5023fe2e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 18 Jan 2011 17:10:08 +0100 Subject: perf: Find_get_context: fix the per-cpu-counter check If task == NULL, find_get_context() should always check that cpu is correct. Afaics, the bug was introduced by 38a81da2 "perf events: Clean up pid passing", but even before that commit "&& cpu != -1" was not exactly right, -ESRCH from find_task_by_vpid() is not accurate. Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Cc: Alan Stern Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Prasad Cc: Roland McGrath Cc: gregkh@suse.de Cc: stable@kernel.org LKML-Reference: <20110118161008.GB693@redhat.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 76be4c7..a962b19 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2228,7 +2228,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) unsigned long flags; int ctxn, err; - if (!task && cpu != -1) { + if (!task) { /* Must be root to operate on a CPU event: */ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); -- cgit v1.1 From 66832eb4baaaa9abe4c993ddf9113a79e39b9915 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 18 Jan 2011 17:10:32 +0100 Subject: perf: Validate cpu early in perf_event_alloc() Starting from perf_event_alloc()->perf_init_event(), the kernel assumes that event->cpu is either -1 or the valid CPU number. Change perf_event_alloc() to validate this argument early. This also means we can remove the similar check in find_get_context(). 
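[Illustration, not part of the commit: the added check relies on the classic unsigned-comparison idiom - casting a possibly negative int to unsigned lets a single compare reject both negative and too-large values; cpu == -1 (per-task events) is then allowed explicitly. A standalone demo:]

#include <stdio.h>

/* Negative ints convert to huge unsigned values, so one compare
 * handles both cpu < 0 and cpu >= nr. */
static int cpu_in_range(int cpu, unsigned int nr)
{
	return (unsigned int)cpu < nr;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       cpu_in_range(0, 8),   /* 1 */
	       cpu_in_range(7, 8),   /* 1 */
	       cpu_in_range(8, 8),   /* 0 */
	       cpu_in_range(-1, 8)); /* 0 */
	return 0;
}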
Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Cc: Alan Stern Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Prasad Cc: Roland McGrath Cc: gregkh@suse.de Cc: stable@kernel.org LKML-Reference: <20110118161032.GC693@redhat.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a962b19..67d9bd7 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2233,9 +2233,6 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); - if (cpu < 0 || cpu >= nr_cpumask_bits) - return ERR_PTR(-EINVAL); - /* * We could be clever and allow to attach a event to an * offline CPU and activate it when the CPU comes up, but @@ -5541,6 +5538,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct hw_perf_event *hwc; long err; + if ((unsigned)cpu >= nr_cpu_ids) { + if (!task || cpu != -1) + return ERR_PTR(-EINVAL); + } + event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return ERR_PTR(-ENOMEM); @@ -5589,7 +5591,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!overflow_handler && parent_event) overflow_handler = parent_event->overflow_handler; - + event->overflow_handler = overflow_handler; if (attr->disabled) -- cgit v1.1 From 068c5cc5ac7414a8e9eb7856b4bf3cc4d4744267 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Jan 2011 12:26:11 +0100 Subject: sched, cgroup: Use exit hook to avoid use-after-free crash By not notifying the controller of the on-exit move back to init_css_set, we fail to move the task out of the previous cgroup's cfs_rq. This leads to an opportunity for a cgroup-destroy to come in and free the cgroup (there are no active tasks left in it after all) to which the not-quite dead task is still enqueued. Reported-by: Miklos Vajna Fixed-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Cc: Mike Galbraith Signed-off-by: Ingo Molnar LKML-Reference: <1293206353.29444.205.camel@laptop> --- kernel/sched.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0a169a8..fa5272a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -606,6 +606,9 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; struct cgroup_subsys_state *css; + if (p->flags & PF_EXITING) + return &root_task_group; + css = task_subsys_state_check(p, cpu_cgroup_subsys_id, lockdep_is_held(&task_rq(p)->lock)); tg = container_of(css, struct task_group, css); @@ -8880,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } } +static void +cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. 
+ */ + if (!(task->flags & PF_EXITING)) + return; + + sched_move_task(task); +} + #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) @@ -8952,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { .destroy = cpu_cgroup_destroy, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, + .exit = cpu_cgroup_exit, .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, .early_init = 1, -- cgit v1.1 From 490da40d82b31c0562d3f5edb37810f492ca1c34 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 19 Jan 2011 10:51:44 +0800 Subject: blktrace: Don't output messages if NOTIFY isn't set. Now if we enable blktrace, cfq has too many messages output to the trace buffer. It is fine if we don't specify any action mask. But if I do like this: blktrace /dev/sdb -a issue -a complete -o - | blkparse -i - I only want to see 'D' and 'C', while with the following command dd if=/mnt/ocfs2/test of=/dev/null bs=4k count=1 iflag=direct I will get(with a 2.6.37 vanilla kernel): 8,16 0 0 0.000000000 0 m N cfq3805 alloced 8,16 0 0 0.000004126 0 m N cfq3805 insert_request 8,16 0 0 0.000004884 0 m N cfq3805 add_to_rr 8,16 0 0 0.000008417 0 m N cfq workload slice:300 8,16 0 0 0.000009557 0 m N cfq3805 set_active wl_prio:0 wl_type:2 8,16 0 0 0.000010640 0 m N cfq3805 fifo= (null) 8,16 0 0 0.000011193 0 m N cfq3805 dispatch_insert 8,16 0 0 0.000012221 0 m N cfq3805 dispatched a request 8,16 0 0 0.000012802 0 m N cfq3805 activate rq, drv=1 8,16 0 1 0.000013181 3805 D R 114759 + 8 [dd] 8,16 0 2 0.000164244 0 C R 114759 + 8 [0] 8,16 0 0 0.000167997 0 m N cfq3805 complete rqnoidle 0 8,16 0 0 0.000168782 0 m N cfq3805 set_slice=100 8,16 0 0 0.000169874 0 m N cfq3805 arm_idle: 8 group_idle: 0 8,16 0 0 0.000170189 0 m N cfq schedule dispatch 8,16 0 0 0.000397938 0 m N cfq3805 slice expired t=0 8,16 0 0 0.000399763 0 m N cfq3805 sl_used=1 disp=1 charge=1 iops=0 sect=8 8,16 0 0 0.000400227 0 m N cfq3805 del_from_rr 8,16 0 0 0.000400882 0 m N cfq3805 put_queue See, there are 19 lines while I only need 2. I don't think it is appropriate for a user. So this patch will disable any messages if the BLK_TC_NOTIFY isn't set. Now the output for the same command will look like: 8,16 0 1 0.000000000 4908 D R 114759 + 8 [dd] 8,16 0 2 0.000146827 0 C R 114759 + 8 [0] Yes, it is what I want to see. Cc: Steven Rostedt Cc: Jeff Moyer Signed-off-by: Tao Ma Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 153562d..d95721f 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -138,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) !blk_tracer_enabled)) return; + /* + * If the BLK_TC_NOTIFY action mask isn't set, don't send any note + * message to the trace. + */ + if (!(bt->act_mask & BLK_TC_NOTIFY)) + return; + local_irq_save(flags); buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); va_start(args, fmt); -- cgit v1.1 From dbe08d82ce3967ccdf459f7951d02589cf967300 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 19 Jan 2011 19:22:07 +0100 Subject: perf: Fix find_get_context() vs perf_event_exit_task() race find_get_context() must not install the new perf_event_context if the task has already passed perf_event_exit_task(). If nothing else, this means the memory leak. 
Initially ctx->refcount == 2; perf_event_exit_task_context() is supposed to participate and do the necessary put_ctx(). find_lively_task_by_vpid() checks PF_EXITING but this buys nothing: by the time we call find_get_context() the task can already be dead. To the point, cmpxchg() can succeed when the task has already done the last schedule(). Change find_get_context() to populate task->perf_event_ctxp[] under task->perf_event_mutex; this way we can trust PF_EXITING, because perf_event_exit_task() takes the same mutex. (A user-space sketch of this publish-under-mutex pattern appears at the end of this log.) Also, change perf_event_exit_task_context() to use rcu_dereference(). Probably this is not strictly needed, but with or without this change find_get_context() can race with setup_new_exec()->perf_event_exit_task(), and rcu_dereference() looks better. Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Cc: Alan Stern Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Prasad Cc: Roland McGrath LKML-Reference: <20110119182207.GB12183@redhat.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 84522c7..4ec55ef 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2201,13 +2201,6 @@ find_lively_task_by_vpid(pid_t vpid) if (!task) return ERR_PTR(-ESRCH); - /* - * Can't attach events to a dying task. - */ - err = -ESRCH; - if (task->flags & PF_EXITING) - goto errout; - /* Reuse ptrace permission checks for now. */ err = -EACCES; if (!ptrace_may_access(task, PTRACE_MODE_READ)) @@ -2268,14 +2261,27 @@ retry: get_ctx(ctx); - if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { - /* - * We raced with some other task; use - * the context they set. - */ + err = 0; + mutex_lock(&task->perf_event_mutex); + /* + * If it has already passed perf_event_exit_task(). + * we must see PF_EXITING, it takes this mutex too. + */ + if (task->flags & PF_EXITING) + err = -ESRCH; + else if (task->perf_event_ctxp[ctxn]) + err = -EAGAIN; + else + rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + mutex_unlock(&task->perf_event_mutex); + + if (unlikely(err)) { put_task_struct(task); kfree(ctx); - goto retry; + + if (err == -EAGAIN) + goto retry; + goto errout; } } @@ -6127,7 +6133,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * scheduled, so we are now safe from rescheduling changing * our context. */ - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = rcu_dereference(child->perf_event_ctxp[ctxn]); task_ctx_sched_out(child_ctx, EVENT_ALL); /* -- cgit v1.1 From 8550d7cb6ed6c89add49c3b6ad4c753ab8a3d7f9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 19 Jan 2011 19:22:28 +0100 Subject: perf: Fix perf_event_init_task()/perf_event_free_task() interaction perf_event_init_task() should clear child->perf_event_ctxp[] before anything else. Otherwise, if perf_event_init_context(perf_hw_context) fails, perf_event_free_task() can free perf_event_ctxp[perf_sw_context] copied from parent->perf_event_ctxp[] by dup_task_struct(). Also move the initialization of perf_event_mutex and perf_event_list from perf_event_init_context() to perf_event_init_task().
Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Cc: Alan Stern Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Prasad Cc: Roland McGrath LKML-Reference: <20110119182228.GC12183@redhat.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 4ec55ef..244ca3a 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6446,11 +6446,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn) unsigned long flags; int ret = 0; - child->perf_event_ctxp[ctxn] = NULL; - - mutex_init(&child->perf_event_mutex); - INIT_LIST_HEAD(&child->perf_event_list); - if (likely(!parent->perf_event_ctxp[ctxn])) return 0; @@ -6539,6 +6534,10 @@ int perf_event_init_task(struct task_struct *child) { int ctxn, ret; + memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); + mutex_init(&child->perf_event_mutex); + INIT_LIST_HEAD(&child->perf_event_list); + for_each_task_context_nr(ctxn) { ret = perf_event_init_context(child, ctxn); if (ret) -- cgit v1.1 From 2d0640b47da74cff7c11642c798d40de861ed524 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 18 Jan 2011 22:46:34 -0800 Subject: hrtimers: Notify hrtimer users of switches to NOHZ mode When NOHZ=y and high res timers are disabled (via cmdline or Kconfig) tick_nohz_switch_to_nohz() will notify the user about switching into NOHZ mode. Nothing is printed for the case where HIGH_RES_TIMERS=y. Fix this for the HIGH_RES_TIMERS=y case by duplicating the printk from the low res NOHZ path in the high res NOHZ path. This confused me since I was thinking 'dmesg | grep -i NOHZ' would tell me if NOHZ was enabled, but if I have hrtimers there is nothing. Signed-off-by: Stephen Boyd Acked-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <1295419594-13085-1-git-send-email-sboyd@codeaurora.org> Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e216e0..c55ea24 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -642,8 +642,7 @@ static void tick_nohz_switch_to_nohz(void) } local_irq_enable(); - printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", - smp_processor_id()); + printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); } /* @@ -795,8 +794,10 @@ void tick_setup_sched_timer(void) } #ifdef CONFIG_NO_HZ - if (tick_nohz_enabled) + if (tick_nohz_enabled) { ts->nohz_mode = NOHZ_MODE_HIGHRES; + printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); + } #endif } #endif /* HIGH_RES_TIMERS */ -- cgit v1.1 From 2ce802f62ba32a7d95748ac92bf351f76affb6ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 20 Jan 2011 12:06:35 +0100 Subject: lockdep: Move early boot local IRQ enable/disable status to init/main.c During early boot, local IRQ is disabled until IRQ subsystem is properly initialized. During this time, no one should enable local IRQ and some operations which usually are not allowed with IRQ disabled, e.g. operations which might sleep or require communications with other processors, are allowed. lockdep tracked this with early_boot_irqs_off/on() callbacks. As other subsystems need this information too, move it to init/main.c and make it generally available. 
While at it, invert the boolean to early_boot_irqs_disabled instead of enabled, so that it can be initialized to %false, with %true indicating the exceptional condition. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Acked-by: Pekka Enberg Cc: Linus Torvalds LKML-Reference: <20110120110635.GB6036@htj.dyndns.org> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 18 +----------------- kernel/trace/trace_irqsoff.c | 8 -------- 2 files changed, 1 insertion(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 42ba65d..0d2058d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) } /* - * Debugging helper: via this flag we know that we are in - * 'early bootup code', and will warn about any invalid irqs-on event: - */ -static int early_boot_irqs_enabled; - -void early_boot_irqs_off(void) -{ - early_boot_irqs_enabled = 0; -} - -void early_boot_irqs_on(void) -{ - early_boot_irqs_enabled = 1; -} - -/* * Hardirqs will be enabled: */ void trace_hardirqs_on_caller(unsigned long ip) @@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; - if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) + if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) return; if (unlikely(curr->hardirqs_enabled)) { diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5cf8c60..92b6e1e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) * Stubs: */ -void early_boot_irqs_off(void) -{ -} - -void early_boot_irqs_on(void) -{ -} - void trace_softirqs_on(unsigned long ip) { } -- cgit v1.1 From bd924e8cbd4b73ffb7d707a774c04f7e2cae88ed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 20 Jan 2011 12:07:13 +0100 Subject: smp: Allow on_each_cpu() to be called while early_boot_irqs_disabled is set percpu may end up calling vfree() during early boot which in turn may call on_each_cpu() for TLB flushes. What on_each_cpu() does can be done safely while IRQs are disabled during early boot, but it assumed it was always called with local IRQs enabled, which ended up enabling local IRQs prematurely during boot and triggering a couple of warnings. This patch updates on_each_cpu() and smp_call_function_many() so that on_each_cpu() can be used safely while early_boot_irqs_disabled is set. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Acked-by: Pekka Enberg Cc: Linus Torvalds LKML-Reference: <20110120110713.GC6036@htj.dyndns.org> Signed-off-by: Ingo Molnar Reported-by: Ingo Molnar --- kernel/smp.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 4ec30e0..4b83cd6 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -430,7 +430,7 @@ void smp_call_function_many(const struct cpumask *mask, * can't happen. */ WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress); + && !oops_in_progress && !early_boot_irqs_disabled); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); @@ -533,17 +533,20 @@ void ipi_call_unlock_irq(void) #endif /* USE_GENERIC_SMP_HELPERS */ /* - * Call a function on all processors + * Call a function on all processors. May be used during early boot while + * early_boot_irqs_disabled is set.
Use local_irq_save/restore() instead + * of local_irq_disable/enable(). */ int on_each_cpu(void (*func) (void *info), void *info, int wait) { + unsigned long flags; int ret = 0; preempt_disable(); ret = smp_call_function(func, info, wait); - local_irq_disable(); + local_irq_save(flags); func(info); - local_irq_enable(); + local_irq_restore(flags); preempt_enable(); return ret; } -- cgit v1.1 From 6dc19899958e420a931274b94019e267e2396d3e Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 20 Jan 2011 14:44:33 -0800 Subject: kernel/smp.c: fix smp_call_function_many() SMP race I noticed a failure where we hit the following WARN_ON in generic_smp_call_function_interrupt: if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) continue; data->csd.func(data->csd.info); refs = atomic_dec_return(&data->refs); WARN_ON(refs < 0); <------------------------- We atomically tested and cleared our bit in the cpumask, and yet the number of cpus left (ie refs) was 0. How can this be? It turns out commit 54fdade1c3332391948ec43530c02c4794a38172 ("generic-ipi: make struct call_function_data lockless") is at fault. It removes locking from smp_call_function_many and in doing so creates a rather complicated race. The problem comes about because: - The smp_call_function_many interrupt handler walks call_function.queue without any locking. - We reuse a percpu data structure in smp_call_function_many. - We do not wait for any RCU grace period before starting the next smp_call_function_many. Imagine a scenario where CPU A does two smp_call_functions back to back, and CPU B does an smp_call_function in between. We concentrate on how CPU C handles the calls: CPU A CPU B CPU C CPU D smp_call_function smp_call_function_interrupt walks call_function.queue sees data from CPU A on list smp_call_function smp_call_function_interrupt walks call_function.queue sees (stale) CPU A on list smp_call_function int clears last ref on A list_del_rcu, unlock smp_call_function reuses percpu *data A data->cpumask sees and clears bit in cpumask might be using old or new fn! decrements refs below 0 set data->refs (too late!) The important thing to note is since the interrupt handler walks a potentially stale call_function.queue without any locking, then another cpu can view the percpu *data structure at any time, even when the owner is in the process of initialising it. The following test case hits the WARN_ON 100% of the time on my PowerPC box (having 128 threads does help :) #include #include #define ITERATIONS 100 static void do_nothing_ipi(void *dummy) { } static void do_ipis(struct work_struct *dummy) { int i; for (i = 0; i < ITERATIONS; i++) smp_call_function(do_nothing_ipi, NULL, 1); printk(KERN_DEBUG "cpu %d finished\n", smp_processor_id()); } static struct work_struct work[NR_CPUS]; static int __init testcase_init(void) { int cpu; for_each_online_cpu(cpu) { INIT_WORK(&work[cpu], do_ipis); schedule_work_on(cpu, &work[cpu]); } return 0; } static void __exit testcase_exit(void) { } module_init(testcase_init) module_exit(testcase_exit) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Anton Blanchard"); I tried to fix it by ordering the read and the write of ->cpumask and ->refs. In doing so I missed a critical case but Paul McKenney was able to spot my bug thankfully :) To ensure we arent viewing previous iterations the interrupt handler needs to read ->refs then ->cpumask then ->refs _again_. Thanks to Milton Miller and Paul McKenney for helping to debug this issue. 
[miltonm@bga.com: add WARN_ON and BUG_ON, remove extra read of refs before initial read of mask that doesn't help (also noted by Peter Zijlstra), adjust comments, hopefully clarify scenario] [miltonm@bga.com: remove excess tests] Signed-off-by: Anton Blanchard Signed-off-by: Milton Miller Cc: Ingo Molnar Cc: "Paul E. McKenney" Cc: [2.6.32+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 4ec30e0..17c6e58 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -195,6 +195,24 @@ void generic_smp_call_function_interrupt(void) list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; + /* + * Since we walk the list without any locks, we might + * see an entry that was completed, removed from the + * list and is in the process of being reused. + * + * We must check that the cpu is in the cpumask before + * checking the refs, and both must be set before + * executing the callback on this cpu. + */ + + if (!cpumask_test_cpu(cpu, data->cpumask)) + continue; + + smp_rmb(); + + if (atomic_read(&data->refs) == 0) + continue; + if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) continue; @@ -203,6 +221,8 @@ void generic_smp_call_function_interrupt(void) refs = atomic_dec_return(&data->refs); WARN_ON(refs < 0); if (!refs) { + WARN_ON(!cpumask_empty(data->cpumask)); + raw_spin_lock(&call_function.lock); list_del_rcu(&data->csd.list); raw_spin_unlock(&call_function.lock); @@ -454,11 +474,21 @@ void smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); + BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); + + /* + * To ensure the interrupt handler gets an complete view + * we order the cpumask and refs writes and order the read + * of them in the interrupt handler. In addition we may + * only clear our own cpu bit from the mask. + */ + smp_wmb(); + atomic_set(&data->refs, cpumask_weight(data->cpumask)); raw_spin_lock_irqsave(&call_function.lock, flags); -- cgit v1.1 From 225c8e010f2d17a62aef131e24c6e7c111f36f9b Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Thu, 20 Jan 2011 14:44:34 -0800 Subject: kernel/smp.c: consolidate writes in smp_call_function_interrupt() We have to test the cpu mask in the interrupt handler before checking the refs, otherwise we can start to follow an entry before it's deleted and find it partially initialized for the next trip. Presently we also clear the cpumask bit before executing the called function, which implies getting write access to the line. After the function is called we then decrement refs, and if they go to zero we then unlock the structure. However, this implies getting write access to the call function data before and after the function is called. If we can assert that no smp_call_function execution function is allowed to enable interrupts, then we can move both writes to after the function is called, hopefully allowing both writes with one cache line bounce. On a 256 thread system with a kernel compiled for 1024 threads, the time to execute testcase in the "smp_call_function_many race" changelog was reduced by about 30-40ms out of about 545 ms. (A user-space sketch of the smp_wmb()/smp_rmb() pairing these two patches rely on appears at the end of this log.)
I decided to keep this as WARN because its now a buggy function, even though the stack trace is of no value -- a simple printk would give us the information needed. Raw data: Without patch: ipi_test startup took 1219366ns complete 539819014ns total 541038380ns ipi_test startup took 1695754ns complete 543439872ns total 545135626ns ipi_test startup took 7513568ns complete 539606362ns total 547119930ns ipi_test startup took 13304064ns complete 533898562ns total 547202626ns ipi_test startup took 8668192ns complete 544264074ns total 552932266ns ipi_test startup took 4977626ns complete 548862684ns total 553840310ns ipi_test startup took 2144486ns complete 541292318ns total 543436804ns ipi_test startup took 21245824ns complete 530280180ns total 551526004ns With patch: ipi_test startup took 5961748ns complete 500859628ns total 506821376ns ipi_test startup took 8975996ns complete 495098924ns total 504074920ns ipi_test startup took 19797750ns complete 492204740ns total 512002490ns ipi_test startup took 14824796ns complete 487495878ns total 502320674ns ipi_test startup took 11514882ns complete 494439372ns total 505954254ns ipi_test startup took 8288084ns complete 502570774ns total 510858858ns ipi_test startup took 6789954ns complete 493388112ns total 500178066ns #include #include #include /* sched clock */ #define ITERATIONS 100 static void do_nothing_ipi(void *dummy) { } static void do_ipis(struct work_struct *dummy) { int i; for (i = 0; i < ITERATIONS; i++) smp_call_function(do_nothing_ipi, NULL, 1); printk(KERN_DEBUG "cpu %d finished\n", smp_processor_id()); } static struct work_struct work[NR_CPUS]; static int __init testcase_init(void) { int cpu; u64 start, started, done; start = local_clock(); for_each_online_cpu(cpu) { INIT_WORK(&work[cpu], do_ipis); schedule_work_on(cpu, &work[cpu]); } started = local_clock(); for_each_online_cpu(cpu) flush_work(&work[cpu]); done = local_clock(); pr_info("ipi_test startup took %lldns complete %lldns total %lldns\n", started-start, done-started, done-start); return 0; } static void __exit testcase_exit(void) { } module_init(testcase_init) module_exit(testcase_exit) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Anton Blanchard"); Signed-off-by: Milton Miller Cc: Anton Blanchard Cc: Ingo Molnar Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 17c6e58..2fe66f7 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -194,6 +194,7 @@ void generic_smp_call_function_interrupt(void) */ list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; + void (*func) (void *info); /* * Since we walk the list without any locks, we might @@ -213,24 +214,32 @@ void generic_smp_call_function_interrupt(void) if (atomic_read(&data->refs) == 0) continue; - if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) - continue; - + func = data->csd.func; /* for later warn */ data->csd.func(data->csd.info); + /* + * If the cpu mask is not still set then it enabled interrupts, + * we took another smp interrupt, and executed the function + * twice on this cpu. In theory that copy decremented refs. 
+ */ + if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { + WARN(1, "%pS enabled interrupts and double executed\n", + func); + continue; + } + refs = atomic_dec_return(&data->refs); WARN_ON(refs < 0); - if (!refs) { - WARN_ON(!cpumask_empty(data->cpumask)); - - raw_spin_lock(&call_function.lock); - list_del_rcu(&data->csd.list); - raw_spin_unlock(&call_function.lock); - } if (refs) continue; + WARN_ON(!cpumask_empty(data->cpumask)); + + raw_spin_lock(&call_function.lock); + list_del_rcu(&data->csd.list); + raw_spin_unlock(&call_function.lock); + csd_unlock(&data->csd); } -- cgit v1.1 From 1c77ff22f539ceaa64ea43d6a26d867e84602cb7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Jan 2011 19:41:35 +0100 Subject: genirq: Remove __do_IRQ All architectures are finally converted. Remove the cruft. Signed-off-by: Thomas Gleixner Cc: Richard Henderson Cc: Mike Frysinger Cc: David Howells Cc: Tony Luck Cc: Greg Ungerer Cc: Michal Simek Acked-by: David Howells Cc: Kyle McMartin Acked-by: Benjamin Herrenschmidt Cc: Chen Liqin Cc: "David S. Miller" Cc: Chris Metcalf Cc: Jeff Dike --- kernel/irq/Kconfig | 3 -- kernel/irq/handle.c | 111 ---------------------------------------------------- 2 files changed, 114 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 31d766b..8e42fec 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -9,9 +9,6 @@ menu "IRQ subsystem" config GENERIC_HARDIRQS def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - # Select this to disable the deprecated stuff config GENERIC_HARDIRQS_NO_DEPRECATED def_bool n diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e2347eb..3540a71 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) return retval; } - -#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ - -#ifdef CONFIG_ENABLE_WARN_DEPRECATED -# warning __do_IRQ is deprecated. Please convert to proper flow handlers -#endif - -/** - * __do_IRQ - original all in one highlevel IRQ handler - * @irq: the interrupt number - * - * __do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). - * - * This is the original x86 implementation which is used for every - * interrupt type. - */ -unsigned int __do_IRQ(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; - unsigned int status; - - kstat_incr_irqs_this_cpu(irq, desc); - - if (CHECK_IRQ_PER_CPU(desc->status)) { - irqreturn_t action_ret; - - /* - * No locking required for CPU-local interrupts: - */ - if (desc->irq_data.chip->ack) - desc->irq_data.chip->ack(irq); - if (likely(!(desc->status & IRQ_DISABLED))) { - action_ret = handle_IRQ_event(irq, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - } - desc->irq_data.chip->end(irq); - return 1; - } - - raw_spin_lock(&desc->lock); - if (desc->irq_data.chip->ack) - desc->irq_data.chip->ack(irq); - /* - * REPLAY is when Linux resends an IRQ that was dropped earlier - * WAITING is used by probe to mark irqs that are being tested - */ - status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); - status |= IRQ_PENDING; /* we _want_ to handle it */ - - /* - * If the IRQ is disabled for whatever reason, we cannot - * use the action we have. 
- */ - action = NULL; - if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) { - action = desc->action; - status &= ~IRQ_PENDING; /* we commit to handling */ - status |= IRQ_INPROGRESS; /* we are handling it */ - } - desc->status = status; - - /* - * If there is no IRQ handler or it was disabled, exit early. - * Since we set PENDING, if another processor is handling - * a different instance of this same irq, the other processor - * will take care of it. - */ - if (unlikely(!action)) - goto out; - - /* - * Edge triggered interrupts need to remember - * pending events. - * This applies to any hw interrupts that allow a second - * instance of the same irq to arrive while we are in do_IRQ - * or in the handler. But the code here only handles the _second_ - * instance of the irq, not the third or fourth. So it is mostly - * useful for irq hardware that does not mask cleanly in an - * SMP environment. - */ - for (;;) { - irqreturn_t action_ret; - - raw_spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - if (likely(!(desc->status & IRQ_PENDING))) - break; - desc->status &= ~IRQ_PENDING; - } - desc->status &= ~IRQ_INPROGRESS; - -out: - /* - * The ->end() handler has to deal with interrupts which got - * disabled while the handler was running. - */ - desc->irq_data.chip->end(irq); - raw_spin_unlock(&desc->lock); - - return 1; -} -#endif -- cgit v1.1 From 547e9fd7d328af261f184bf66effc5033c886498 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Jan 2011 12:51:39 +0100 Subject: perf: Annotate cpuctx->ctx.mutex to avoid a lockdep splat Lockdep spotted: loop_1b_instruc/1899 is trying to acquire lock: (event_mutex){+.+.+.}, at: [] perf_trace_init+0x3b/0x2f7 but task is already holding lock: (&ctx->mutex){+.+.+.}, at: [] perf_event_init_context+0xc0/0x218 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&ctx->mutex){+.+.+.}: -> #2 (cpu_hotplug.lock){+.+.+.}: -> #1 (module_mutex){+.+...}: -> #0 (event_mutex){+.+.+.}: But because the deadlock would be cpuhotplug (cpu-event) vs fork (task-event) it cannot, in fact, happen. We can annotate this by giving the perf_event_context used for the cpuctx a different lock class from those used by tasks. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 244ca3a..c5fa717 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5380,6 +5380,8 @@ free_dev: goto out; } +static struct lock_class_key cpuctx_mutex; + int perf_pmu_register(struct pmu *pmu, char *name, int type) { int cpu, ret; @@ -5428,6 +5430,7 @@ skip_type: cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); __perf_event_init_context(&cpuctx->ctx); + lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; cpuctx->jiffies_interval = 1; -- cgit v1.1 From 806839b22cbda90176d7f8d421889bddd7826e93 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Jan 2011 18:45:47 +0100 Subject: perf: perf_event_exit_task_context: s/rcu_dereference/rcu_dereference_raw/ In theory, almost every user of task->child->perf_event_ctxp[] is wrong. find_get_context() can install the new context at any moment, we need read_barrier_depends(). 
dbe08d82ce3967ccdf459f7951d02589cf967300 "perf: Fix find_get_context() vs perf_event_exit_task() race" added rcu_dereference() into perf_event_exit_task_context() to make the precedent, but this makes __rcu_dereference_check() unhappy. Use rcu_dereference_raw() to shut up the warning. Reported-by: Ingo Molnar Signed-off-by: Oleg Nesterov Cc: acme@redhat.com Cc: paulus@samba.org Cc: stern@rowland.harvard.edu Cc: a.p.zijlstra@chello.nl Cc: fweisbec@gmail.com Cc: roland@redhat.com Cc: prasad@linux.vnet.ibm.com Cc: Paul E. McKenney LKML-Reference: <20110121174547.GA8796@redhat.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c5fa717..126a302 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6136,7 +6136,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * scheduled, so we are now safe from rescheduling changing * our context. */ - child_ctx = rcu_dereference(child->perf_event_ctxp[ctxn]); + child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); task_ctx_sched_out(child_ctx, EVENT_ALL); /* -- cgit v1.1 From e94965ed5beb23c6fabf7ed31f625e66d7ff28de Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 15 Dec 2010 14:00:19 -0800 Subject: module: show version information for built-in modules in sysfs Currently only drivers that are built as modules have their versions shown in /sys/module//version, but this information might also be useful for built-in drivers as well. This especially important for drivers that do not define any parameters - such drivers, if built-in, are completely invisible from userspace. This patch changes MODULE_VERSION() macro so that in case when we are compiling built-in module, version information is stored in a separate section. Kernel then uses this data to create 'version' sysfs attribute in the same fashion it creates attributes for module parameters. Signed-off-by: Dmitry Torokhov Signed-off-by: Rusty Russell --- kernel/params.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 08107d1..0da1411 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num) params[i].ops->free(params[i].arg); } -static void __init kernel_add_sysfs_param(const char *name, - struct kernel_param *kparam, - unsigned int name_skip) +static struct module_kobject * __init locate_module_kobject(const char *name) { struct module_kobject *mk; struct kobject *kobj; @@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name, kobj = kset_find_obj(module_kset, name); if (kobj) { - /* We already have one. Remove params so we can add more. */ mk = to_module_kobject(kobj); - /* We need to remove it before adding parameters. 
*/ - sysfs_remove_group(&mk->kobj, &mk->mp->grp); } else { mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); BUG_ON(!mk); @@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name, "%s", name); if (err) { kobject_put(&mk->kobj); - printk(KERN_ERR "Module '%s' failed add to sysfs, " - "error number %d\n", name, err); - printk(KERN_ERR "The system will be unstable now.\n"); - return; + printk(KERN_ERR + "Module '%s' failed add to sysfs, error number %d\n", + name, err); + printk(KERN_ERR + "The system will be unstable now.\n"); + return NULL; } - /* So that exit path is even. */ + + /* So that we hold reference in both cases. */ kobject_get(&mk->kobj); } + return mk; +} + +static void __init kernel_add_sysfs_param(const char *name, + struct kernel_param *kparam, + unsigned int name_skip) +{ + struct module_kobject *mk; + int err; + + mk = locate_module_kobject(name); + if (!mk) + return; + + /* We need to remove old parameters before adding more. */ + if (mk->mp) + sysfs_remove_group(&mk->kobj, &mk->mp->grp); + /* These should not fail at boot. */ err = add_sysfs_param(mk, kparam, kparam->name + name_skip); BUG_ON(err); @@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void) } } +ssize_t __modver_version_show(struct module_attribute *mattr, + struct module *mod, char *buf) +{ + struct module_version_attribute *vattr = + container_of(mattr, struct module_version_attribute, mattr); + + return sprintf(buf, "%s\n", vattr->version); +} + +extern struct module_version_attribute __start___modver[], __stop___modver[]; + +static void __init version_sysfs_builtin(void) +{ + const struct module_version_attribute *vattr; + struct module_kobject *mk; + int err; + + for (vattr = __start___modver; vattr < __stop___modver; vattr++) { + mk = locate_module_kobject(vattr->module_name); + if (mk) { + err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); + kobject_uevent(&mk->kobj, KOBJ_ADD); + kobject_put(&mk->kobj); + } + } +} /* module-related sysfs stuff */ @@ -875,6 +917,7 @@ static int __init param_sysfs_init(void) } module_sysfs_initialized = 1; + version_sysfs_builtin(); param_sysfs_builtin(); return 0; -- cgit v1.1 From 3ff6dcac735704824c1dff64dc6863c390d364cc Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Mon, 24 Jan 2011 15:33:52 +0800 Subject: sched: Fix poor interactivity on UP systems due to group scheduler nice tune bug Michael Witten and Christian Kujau reported that the autogroup scheduling feature hurts interactivity on their UP systems. It turns out that this is an older bug in the group scheduling code, and the wider appeal provided by the autogroup feature exposed it more prominently. When on UP with FAIR_GROUP_SCHED enabled, tuning shares only affects tg->shares but is not reflected in tg->se->load. The reason is that update_cfs_shares() does nothing on UP. So introduce update_cfs_shares() for UP && FAIR_GROUP_SCHED. This issue was found when autogroup scheduling was enabled, but it is an older bug that also exists on cgroup.cpu on UP.
Reported-and-Tested-by: Michael Witten Reported-and-Tested-by: Christian Kujau Signed-off-by: Yong Zhang Acked-by: Pekka Enberg Acked-by: Mike Galbraith Acked-by: Peter Zijlstra Cc: Linus Torvalds LKML-Reference: <20110124073352.GA24186@windriver.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 78 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 77e9166..3547699 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -699,7 +699,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->nr_running--; } -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_FAIR_GROUP_SCHED +# ifdef CONFIG_SMP static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, int global_update) { @@ -762,6 +763,51 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, + long weight_delta) +{ + long load_weight, load, shares; + + load = cfs_rq->load.weight + weight_delta; + + load_weight = atomic_read(&tg->load_weight); + load_weight -= cfs_rq->load_contribution; + load_weight += load; + + shares = (tg->shares * load); + if (load_weight) + shares /= load_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + if (shares > tg->shares) + shares = tg->shares; + + return shares; +} + +static void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } +} +# else /* CONFIG_SMP */ +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ +} + +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, + long weight_delta) +{ + return tg->shares; +} + +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ +} +# endif /* CONFIG_SMP */ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { @@ -782,7 +828,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) { struct task_group *tg; struct sched_entity *se; - long load_weight, load, shares; + long shares; if (!cfs_rq) return; @@ -791,32 +837,14 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) se = tg->se[cpu_of(rq_of(cfs_rq))]; if (!se) return; - - load = cfs_rq->load.weight + weight_delta; - - load_weight = atomic_read(&tg->load_weight); - load_weight -= cfs_rq->load_contribution; - load_weight += load; - - shares = (tg->shares * load); - if (load_weight) - shares /= load_weight; - - if (shares < MIN_SHARES) - shares = MIN_SHARES; - if (shares > tg->shares) - shares = tg->shares; +#ifndef CONFIG_SMP + if (likely(se->load.weight == tg->shares)) + return; +#endif + shares = calc_cfs_shares(cfs_rq, tg, weight_delta); reweight_entity(cfs_rq_of(se), se, shares); } - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); - } -} #else /* CONFIG_FAIR_GROUP_SCHED */ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { -- cgit v1.1 From 8c6a98b22b750c9eb52653ba643faa17db8d3881 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Mon, 24 Jan 2011 09:31:38 -0800 Subject: Input: sysrq - ensure sysrq_enabled and __sysrq_enabled are 
consistent Currently sysrq_enabled and __sysrq_enabled are initialised separately and inconsistently, leading to sysrq being actually enabled but reported as not enabled in sysfs. The first change to the sysfs configurable synchronises these two: static int __read_mostly sysrq_enabled = 1; static int __sysrq_enabled; Add a common define to carry the default for these, preventing them from becoming out of sync again. Default this to 1 to mirror previous behaviour. Signed-off-by: Andy Whitcroft Cc: stable@kernel.org Signed-off-by: Dmitry Torokhov --- kernel/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c33a1ed..3afce4d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -169,7 +169,8 @@ static int proc_taint(struct ctl_table *table, int write, #endif #ifdef CONFIG_MAGIC_SYSRQ -static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ +/* Note: sysrq code uses it's own private copy */ +static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; static int sysrq_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *lenp, -- cgit v1.1 From ac751efa6a0d70f2c9daef5c7e3a92270f5c2dff Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Tue, 25 Jan 2011 15:07:35 -0800 Subject: console: rename acquire/release_console_sem() to console_lock/unlock() The -rt patches change the console_semaphore to console_mutex. As a result, a quite large chunk of the patches changes all acquire/release_console_sem() to acquire/release_console_mutex(). This commit makes things use more neutral function names which don't make implications about the underlying lock. The only real change is the return value of console_trylock, which is inverted from try_acquire_console_sem(). This patch also paves the way to switching console_sem from a semaphore to a mutex. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: make console_trylock return 1 on success, per Geert] Signed-off-by: Torben Hohn Cc: Thomas Gleixner Cc: Greg KH Cc: Ingo Molnar Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 100 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 54 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 53d9a9e..2ddbdc7 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -97,7 +97,7 @@ static int console_locked, console_suspended; /* * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars * It is also used in interesting ways to provide interlocking in - * release_console_sem(). + * console_unlock();. */ static DEFINE_SPINLOCK(logbuf_lock); @@ -501,7 +501,7 @@ static void _call_console_drivers(unsigned start, /* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. - * The console_sem must be held. + * The console_lock must be held. */ static void call_console_drivers(unsigned start, unsigned end) { @@ -604,11 +604,11 @@ static int have_callable_console(void) * * This is printk(). It can be called from any context. We want it to work. * - * We try to grab the console_sem. If we succeed, it's easy - we log the output and + * We try to grab the console_lock. If we succeed, it's easy - we log the output and * call the console drivers. If we fail to get the semaphore we place the output * into the log buffer and return. The current holder of the console_sem will
The current holder of the console_sem will - * notice the new output in release_console_sem() and will send it to the - * consoles before releasing the semaphore. + * notice the new output in console_unlock(); and will send it to the + * consoles before releasing the lock. * * One effect of this deferred printing is that code which calls printk() and * then changes console_loglevel may break. This is because console_loglevel @@ -659,19 +659,19 @@ static inline int can_use_console(unsigned int cpu) /* * Try to get console ownership to actually show the kernel * messages from a 'printk'. Return true (and with the - * console_semaphore held, and 'console_locked' set) if it + * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. * * This gets called with the 'logbuf_lock' spinlock held and * interrupts disabled. It should return with 'lockbuf_lock' * released but interrupts still disabled. */ -static int acquire_console_semaphore_for_printk(unsigned int cpu) +static int console_trylock_for_printk(unsigned int cpu) __releases(&logbuf_lock) { int retval = 0; - if (!try_acquire_console_sem()) { + if (console_trylock()) { retval = 1; /* @@ -827,12 +827,12 @@ asmlinkage int vprintk(const char *fmt, va_list args) * actual magic (print out buffers, wake up klogd, * etc). * - * The acquire_console_semaphore_for_printk() function + * The console_trylock_for_printk() function * will release 'logbuf_lock' regardless of whether it * actually gets the semaphore or not. */ - if (acquire_console_semaphore_for_printk(this_cpu)) - release_console_sem(); + if (console_trylock_for_printk(this_cpu)) + console_unlock(); lockdep_on(); out_restore_irqs: @@ -993,7 +993,7 @@ void suspend_console(void) if (!console_suspend_enabled) return; printk("Suspending console(s) (use no_console_suspend to debug)\n"); - acquire_console_sem(); + console_lock(); console_suspended = 1; up(&console_sem); } @@ -1004,7 +1004,7 @@ void resume_console(void) return; down(&console_sem); console_suspended = 0; - release_console_sem(); + console_unlock(); } /** @@ -1027,21 +1027,21 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, case CPU_DYING: case CPU_DOWN_FAILED: case CPU_UP_CANCELED: - acquire_console_sem(); - release_console_sem(); + console_lock(); + console_unlock(); } return NOTIFY_OK; } /** - * acquire_console_sem - lock the console system for exclusive use. + * console_lock - lock the console system for exclusive use. * - * Acquires a semaphore which guarantees that the caller has + * Acquires a lock which guarantees that the caller has * exclusive access to the console system and the console_drivers list. * * Can sleep, returns nothing. */ -void acquire_console_sem(void) +void console_lock(void) { BUG_ON(in_interrupt()); down(&console_sem); @@ -1050,21 +1050,29 @@ void acquire_console_sem(void) console_locked = 1; console_may_schedule = 1; } -EXPORT_SYMBOL(acquire_console_sem); +EXPORT_SYMBOL(console_lock); -int try_acquire_console_sem(void) +/** + * console_trylock - try to lock the console system for exclusive use. + * + * Tried to acquire a lock which guarantees that the caller has + * exclusive access to the console system and the console_drivers list. + * + * returns 1 on success, and 0 on failure to acquire the lock. 
+ */ +int console_trylock(void) { if (down_trylock(&console_sem)) - return -1; + return 0; if (console_suspended) { up(&console_sem); - return -1; + return 0; } console_locked = 1; console_may_schedule = 0; - return 0; + return 1; } -EXPORT_SYMBOL(try_acquire_console_sem); +EXPORT_SYMBOL(console_trylock); int is_console_locked(void) { @@ -1095,20 +1103,20 @@ void wake_up_klogd(void) } /** - * release_console_sem - unlock the console system + * console_unlock - unlock the console system * - * Releases the semaphore which the caller holds on the console system + * Releases the console_lock which the caller holds on the console system * and the console driver list. * - * While the semaphore was held, console output may have been buffered - * by printk(). If this is the case, release_console_sem() emits - * the output prior to releasing the semaphore. + * While the console_lock was held, console output may have been buffered + * by printk(). If this is the case, console_unlock(); emits + * the output prior to releasing the lock. * * If there is output waiting for klogd, we wake it up. * - * release_console_sem() may be called from any context. + * console_unlock(); may be called from any context. */ -void release_console_sem(void) +void console_unlock(void) { unsigned long flags; unsigned _con_start, _log_end; @@ -1141,7 +1149,7 @@ void release_console_sem(void) if (wake_klogd) wake_up_klogd(); } -EXPORT_SYMBOL(release_console_sem); +EXPORT_SYMBOL(console_unlock); /** * console_conditional_schedule - yield the CPU if required @@ -1150,7 +1158,7 @@ EXPORT_SYMBOL(release_console_sem); * if this CPU should yield the CPU to another task, do * so here. * - * Must be called within acquire_console_sem(). + * Must be called within console_lock();. */ void __sched console_conditional_schedule(void) { @@ -1171,14 +1179,14 @@ void console_unblank(void) if (down_trylock(&console_sem) != 0) return; } else - acquire_console_sem(); + console_lock(); console_locked = 1; console_may_schedule = 0; for_each_console(c) if ((c->flags & CON_ENABLED) && c->unblank) c->unblank(); - release_console_sem(); + console_unlock(); } /* @@ -1189,7 +1197,7 @@ struct tty_driver *console_device(int *index) struct console *c; struct tty_driver *driver = NULL; - acquire_console_sem(); + console_lock(); for_each_console(c) { if (!c->device) continue; @@ -1197,7 +1205,7 @@ struct tty_driver *console_device(int *index) if (driver) break; } - release_console_sem(); + console_unlock(); return driver; } @@ -1208,17 +1216,17 @@ struct tty_driver *console_device(int *index) */ void console_stop(struct console *console) { - acquire_console_sem(); + console_lock(); console->flags &= ~CON_ENABLED; - release_console_sem(); + console_unlock(); } EXPORT_SYMBOL(console_stop); void console_start(struct console *console) { - acquire_console_sem(); + console_lock(); console->flags |= CON_ENABLED; - release_console_sem(); + console_unlock(); } EXPORT_SYMBOL(console_start); @@ -1340,7 +1348,7 @@ void register_console(struct console *newcon) * Put this console in the list - keep the * preferred driver at the head of the list. */ - acquire_console_sem(); + console_lock(); if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { newcon->next = console_drivers; console_drivers = newcon; @@ -1352,14 +1360,14 @@ void register_console(struct console *newcon) } if (newcon->flags & CON_PRINTBUFFER) { /* - * release_console_sem() will print out the buffered messages + * console_unlock(); will print out the buffered messages * for us. 
*/ spin_lock_irqsave(&logbuf_lock, flags); con_start = log_start; spin_unlock_irqrestore(&logbuf_lock, flags); } - release_console_sem(); + console_unlock(); console_sysfs_notify(); /* @@ -1396,7 +1404,7 @@ int unregister_console(struct console *console) return braille_unregister_console(console); #endif - acquire_console_sem(); + console_lock(); if (console_drivers == console) { console_drivers=console->next; res = 0; @@ -1418,7 +1426,7 @@ int unregister_console(struct console *console) if (console_drivers != NULL && console->flags & CON_CONSDEV) console_drivers->flags |= CON_CONSDEV; - release_console_sem(); + console_unlock(); console_sysfs_notify(); return res; } -- cgit v1.1 From e37b6a7b27b400c3aa488db8c6629a05095bc79c Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 21 Jan 2011 20:44:59 -0800 Subject: sched: Fix sign under-flows in wake_affine While care is taken around the zero-point in effective_load to not exceed the instantaneous rq->weight, it's still possible (e.g. using wake_idx != 0) for (load + effective_load) to underflow. In this case, comparing the unsigned values can result in incorrect balance decisions. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110122044851.734245014@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3547699..ccecfec 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1432,7 +1432,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { - unsigned long this_load, load; + s64 this_load, load; int idx, this_cpu, prev_cpu; unsigned long tl_per_task; struct task_group *tg; @@ -1471,8 +1471,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - if (this_load) { - unsigned long this_eff_load, prev_eff_load; + if (this_load > 0) { + s64 this_eff_load, prev_eff_load; this_eff_load = 100; this_eff_load *= power_of(prev_cpu); -- cgit v1.1 From b815f1963e47b9b69bb17e0588bd5af5b1114ae0 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 21 Jan 2011 20:45:00 -0800 Subject: sched: Fix/remove redundant cfs_rq checks Since updates are against an entity's queuing cfs_rq it's not possible to enter update_cfs_{shares,load} with a NULL cfs_rq. (Indeed, update_cfs_load would crash prior to the check if we did anyway since load is examined during the initializers). Also, in the update_cfs_load case there's no point in maintaining averages for rq->cfs_rq since we don't perform shares distribution at that level -- NULL check is replaced accordingly. Thanks to Dan Carpenter for pointing out the dereference before NULL check.
Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110122044851.825284940@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ccecfec..1997383 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -722,7 +722,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) u64 now, delta; unsigned long load = cfs_rq->load.weight; - if (!cfs_rq) + if (cfs_rq->tg == &root_task_group) return; now = rq_of(cfs_rq)->clock; @@ -830,9 +830,6 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) struct sched_entity *se; long shares; - if (!cfs_rq) - return; - tg = cfs_rq->tg; se = tg->se[cpu_of(rq_of(cfs_rq))]; if (!se) -- cgit v1.1 From 05ca62c6ca17f39b88fa956d5ebc1fa6e93ad5e3 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 21 Jan 2011 20:45:02 -0800 Subject: sched: Use rq->clock_task instead of rq->clock for correctly maintaining load averages The delta in clock_task is a more fair attribution of how much time a tg has been contributing load to the current cpu. While not really important it also means we're more in sync (by magnitude) with respect to periodic updates (since __update_curr deltas are clock_task based). Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110122044852.007092349@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1997383..0c26e2d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -725,7 +725,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) if (cfs_rq->tg == &root_task_group) return; - now = rq_of(cfs_rq)->clock; + now = rq_of(cfs_rq)->clock_task; delta = now - cfs_rq->load_stamp; /* truncate load history at 4 idle periods */ -- cgit v1.1 From 88d4f0db7fa8785859c1d637f9aac210932b6216 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Jan 2011 19:40:51 +0100 Subject: perf: Fix alloc_callchain_buffers() Commit 927c7a9e92c4 ("perf: Fix race in callchains") introduced a mismatch in the sizing of struct callchain_cpus_entries. nr_cpu_ids must be used instead of num_possible_cpus(), or we might get out of bound memory accesses on some machines. Signed-off-by: Eric Dumazet Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: David Miller Cc: Stephane Eranian CC: stable@kernel.org LKML-Reference: <1295980851.3588.351.camel@edumazet-laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 126a302..852ae8c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1999,8 +1999,7 @@ static int alloc_callchain_buffers(void) * accessed from NMI. Use a temporary manual per cpu allocation * until that gets sorted out. 
*/ - size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * - num_possible_cpus(); + size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); entries = kzalloc(size, GFP_KERNEL); if (!entries) -- cgit v1.1 From aa5bd67dcfdf9af34c7fa36ebc87d4e1f7e91873 Mon Sep 17 00:00:00 2001 From: Kacper Kornet Date: Sat, 29 Jan 2011 00:21:04 +0100 Subject: Fix prlimit64 for suid/sgid processes Since check_prlimit_permission always fails in the case of SUID/SGID processes, such processes are not able to read or set their own limits. This commit changes this by assuming that a process can always read/change its own limits. Signed-off-by: Kacper Kornet Acked-by: Jiri Slaby Signed-off-by: Linus Torvalds --- kernel/sys.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 31b71a2..18da702 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1385,7 +1385,8 @@ static int check_prlimit_permission(struct task_struct *task) const struct cred *cred = current_cred(), *tcred; tcred = __task_cred(task); - if ((cred->uid != tcred->euid || + if (current != task && + (cred->uid != tcred->euid || cred->uid != tcred->suid || cred->uid != tcred->uid || cred->gid != tcred->egid || -- cgit v1.1 From 4135038a582c20ffdadfcf6564852e0b72a20968 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Fri, 28 Jan 2011 11:00:31 -0500 Subject: watchdog: Fix broken nowatchdog logic Passing nowatchdog to the kernel disables two things: creation of watchdog threads AND initialization of the percpu watchdog_hrtimer. As hrtimers are initialized only at boot, it's not possible to enable the watchdog later - for me all watchdog threads started to eat 100% of CPU time, but they could just crash. Additionally, even if these threads did start properly, watchdog_disable_all_cpus was guarded by the no_watchdog check, so you couldn't disable the watchdog. To fix this, remove the no_watchdog variable and use the already existing watchdog_enabled variable.
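The interaction of the two flags is easiest to see as a timeline; an illustrative outline (not the actual file):

    /*
     * boot with "nowatchdog":
     *   no_watchdog = 1  -> watchdog threads are not created AND the
     *                       per-cpu watchdog_hrtimer is never initialized
     *
     * later, "sysctl kernel.watchdog=1":
     *   watchdog_enable() -> threads start, but depend on an hrtimer
     *                        that was never set up (CPU spin, or crash)
     *
     * then, "sysctl kernel.watchdog=0":
     *   watchdog_disable_all_cpus() returns early because no_watchdog
     *   is still set -> the watchdog cannot be disabled either
     */

With a single watchdog_enabled flag, as in the patch below, the boot parameters, the sysctl and the enable/disable paths all consult the same state.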
Signed-off-by: Marcin Slusarz [ removed another no_watchdog instance ] Signed-off-by: Don Zickus Cc: Stephane Eranian Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: LKML-Reference: <1296230433-6261-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d7ebdf4..d9961ea 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -27,7 +27,7 @@ #include #include -int watchdog_enabled; +int watchdog_enabled = 1; int __read_mostly softlockup_thresh = 60; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -43,9 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif -static int no_watchdog; - - /* boot commands */ /* * Should we panic when a soft-lockup or hard-lockup occurs: @@ -58,7 +55,7 @@ static int __init hardlockup_panic_setup(char *str) if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; else if (!strncmp(str, "0", 1)) - no_watchdog = 1; + watchdog_enabled = 0; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -77,7 +74,7 @@ __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { - no_watchdog = 1; + watchdog_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); @@ -85,7 +82,7 @@ __setup("nowatchdog", nowatchdog_setup); /* deprecated */ static int __init nosoftlockup_setup(char *str) { - no_watchdog = 1; + watchdog_enabled = 0; return 1; } __setup("nosoftlockup", nosoftlockup_setup); @@ -476,9 +473,6 @@ static void watchdog_disable_all_cpus(void) { int cpu; - if (no_watchdog) - return; - for_each_online_cpu(cpu) watchdog_disable(cpu); @@ -530,7 +524,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - err = watchdog_enable(hotcpu); + if (watchdog_enabled) + err = watchdog_enable(hotcpu); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: @@ -555,9 +550,6 @@ void __init lockup_detector_init(void) void *cpu = (void *)(long)smp_processor_id(); int err; - if (no_watchdog) - return; - err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); WARN_ON(notifier_to_errno(err)); -- cgit v1.1 From 397357666de6b5b6adb5fa99f9758ec8cf30ac34 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Fri, 28 Jan 2011 11:00:32 -0500 Subject: watchdog: Fix sysctl consistency If it was not possible to enable watchdog for any cpu, switch watchdog_enabled back to 0, because it's visible via kernel.watchdog sysctl. 
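The invariant being established is that the value read back from the sysctl reflects reality; in outline (illustrative, simplified from the diff that follows):

    /*
     * proc_dowatchdog_enabled() write path:
     *   watchdog_enable_all_cpus()
     *     watchdog_enabled = 0;
     *     for each online cpu:
     *       if (watchdog_enable(cpu) succeeded)
     *         watchdog_enabled = 1;
     *
     * So if every per-cpu enable fails, a later read of
     * /proc/sys/kernel/watchdog reports 0 instead of claiming
     * the watchdog is running.
     */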
Signed-off-by: Marcin Slusarz Signed-off-by: Don Zickus Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: LKML-Reference: <1296230433-6261-2-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d9961ea..c7e0049 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -429,9 +429,6 @@ static int watchdog_enable(int cpu) wake_up_process(p); } - /* if any cpu succeeds, watchdog is considered enabled for the system */ - watchdog_enabled = 1; - return 0; } @@ -459,12 +456,16 @@ static void watchdog_disable(int cpu) static void watchdog_enable_all_cpus(void) { int cpu; - int result = 0; + + watchdog_enabled = 0; for_each_online_cpu(cpu) - result += watchdog_enable(cpu); + if (!watchdog_enable(cpu)) + /* if any cpu succeeds, watchdog is considered + enabled for the system */ + watchdog_enabled = 1; - if (result) + if (!watchdog_enabled) printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); } -- cgit v1.1 From 9ffdc6c37df131f89d52001e0ef03091b158826f Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Fri, 28 Jan 2011 11:00:33 -0500 Subject: watchdog: Don't change watchdog state on read of sysctl Signed-off-by: Marcin Slusarz [ add {}'s to fix a warning ] Signed-off-by: Don Zickus Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: LKML-Reference: <1296230433-6261-3-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c7e0049..f37f974 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -493,10 +493,12 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write, { proc_dointvec(table, write, buffer, length, ppos); - if (watchdog_enabled) - watchdog_enable_all_cpus(); - else - watchdog_disable_all_cpus(); + if (write) { + if (watchdog_enabled) + watchdog_enable_all_cpus(); + else + watchdog_disable_all_cpus(); + } return 0; } -- cgit v1.1 From f1a06390d013244e721372b3f9b66e39b6429c71 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 28 Jan 2011 08:47:15 +0100 Subject: genirq: Prevent irq storm on migration move_native_irq() masks and unmasks the interrupt line unconditionally, but the interrupt line might be masked due to a threaded oneshot handler in progress. Unmasking the line in that case can lead to interrupt storms. Observed on PREEMPT_RT. Originally-from: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- kernel/irq/migration.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 1d25419..441fd62 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -56,6 +56,7 @@ void move_masked_irq(int irq) void move_native_irq(int irq) { struct irq_desc *desc = irq_to_desc(irq); + bool masked; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -63,8 +64,15 @@ void move_native_irq(int irq) if (unlikely(desc->status & IRQ_DISABLED)) return; - desc->irq_data.chip->irq_mask(&desc->irq_data); + /* + * Be careful vs. already masked interrupts. If this is a + * threaded interrupt with ONESHOT set, we can end up with an + * interrupt storm. 
+ */ + masked = desc->status & IRQ_MASKED; + if (!masked) + desc->irq_data.chip->irq_mask(&desc->irq_data); move_masked_irq(irq); - desc->irq_data.chip->irq_unmask(&desc->irq_data); + if (!masked) + desc->irq_data.chip->irq_unmask(&desc->irq_data); } - -- cgit v1.1 From e4a9ea5ee7c8812a7bf0c3fb725ceeaa3d4c2fcc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 09:15:30 -0500 Subject: tracing: Replace trace_event struct array with pointer array Currently the trace_event structures are placed in the _ftrace_events section, and at link time, the linker makes one large array of all the trace_event structures. On boot up, this array is read (much like the initcall sections) and the events are processed. The problem is that there is no guarantee that gcc will place complex structures nicely together in an array format. Two structures in the same file may be placed awkwardly, because gcc has no clue that they are supposed to be in an array. A hack was used previously to force the alignment to 4, to pack the structures together. But this caused alignment issues with other architectures (sparc). Instead of packing the structures into an array, the structures' addresses are now put into the _ftrace_events section. As pointers are always naturally aligned, gcc should always pack them tightly together (otherwise initcall, extable, etc. would also fail). By having the pointers to the structures in the section, we can still iterate the trace_events without causing unnecessary alignment problems with other architectures, or depending on the current behaviour of gcc that will likely change in the future just to tick us kernel developers off a little more. The _ftrace_events section is also moved into the .init.data section as it is now only needed at boot up. Suggested-by: David Miller Cc: Mathieu Desnoyers Acked-by: David S.
Miller Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 12 ++++++------ kernel/trace/trace_export.c | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 35fde09..5f499e04 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1284,7 +1284,7 @@ trace_create_file_ops(struct module *mod) static void trace_module_add_events(struct module *mod) { struct ftrace_module_file_ops *file_ops = NULL; - struct ftrace_event_call *call, *start, *end; + struct ftrace_event_call **call, **start, **end; start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; @@ -1297,7 +1297,7 @@ static void trace_module_add_events(struct module *mod) return; for_each_event(call, start, end) { - __trace_add_event_call(call, mod, + __trace_add_event_call(*call, mod, &file_ops->id, &file_ops->enable, &file_ops->filter, &file_ops->format); } @@ -1367,8 +1367,8 @@ static struct notifier_block trace_module_nb = { .priority = 0, }; -extern struct ftrace_event_call __start_ftrace_events[]; -extern struct ftrace_event_call __stop_ftrace_events[]; +extern struct ftrace_event_call *__start_ftrace_events[]; +extern struct ftrace_event_call *__stop_ftrace_events[]; static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; @@ -1384,7 +1384,7 @@ __setup("trace_event=", setup_trace_event); static __init int event_trace_init(void) { - struct ftrace_event_call *call; + struct ftrace_event_call **call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; @@ -1430,7 +1430,7 @@ static __init int event_trace_init(void) pr_warning("tracing: Failed to allocate common fields"); for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { - __trace_add_event_call(call, NULL, &ftrace_event_id_fops, + __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, &ftrace_enable_fops, &ftrace_event_filter_fops, &ftrace_event_format_fops); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4b74d71..bbeec31 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -161,13 +161,13 @@ struct ftrace_event_class event_class_ftrace_##call = { \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ }; \ \ -struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ +struct ftrace_event_call __used event_##call = { \ .name = #call, \ .event.type = etype, \ .class = &event_class_ftrace_##call, \ .print_fmt = print, \ }; \ +struct ftrace_event_call __used \ +__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #include "trace_entries.h" -- cgit v1.1 From 542e72fc90f5ed9eecb574f80f70868c7f296093 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 26 Jan 2011 15:38:35 +0100 Subject: perf: Fix reading in perf_event_read() It is quite possible for the event to have been disabled between perf_event_read() sending the IPI and the CPU servicing the IPI and calling __perf_event_read(), hence revalidate the state. 
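The race being closed is easiest to see as an interleaving; an illustrative timeline (not code from the patch):

    /*
     * CPU A: perf_event_read()          CPU B: event's CPU
     * --------------------------        ---------------------------
     * sees event->state == ACTIVE
     * sends IPI                 ----->
     *                                   event is disabled here,
     *                                   context may go inactive
     *                           ----->  __perf_event_read() runs
     *
     * Without re-checking ctx->is_active and event->state under
     * ctx->lock, the IPI handler would advance the time of an
     * inactive context and call pmu->read() on a disabled event.
     */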
Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 852ae8c..999835b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1901,11 +1901,12 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - update_context_time(ctx); + if (ctx->is_active) + update_context_time(ctx); update_event_times(event); + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); raw_spin_unlock(&ctx->lock); - - event->pmu->read(event); } static inline u64 perf_event_count(struct perf_event *event) -- cgit v1.1 From 06c3bc655697b19521901f9254eb0bbb2c67e7e8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Feb 2011 13:19:48 +0100 Subject: sched: Fix update_curr_rt() The call chain cpu_stopper_thread() -> migration_cpu_stop() -> __migrate_task() -> deactivate_task() -> dequeue_task() -> dequeue_task_rt() -> update_curr_rt() will call update_curr_rt() on rq->curr, which at that time is rq->stop. The problem is that rq->stop.prio matches an RT prio and thus update_curr_rt() falsely assumes it's a rt_sched_class task. Reported-Debugged-Tested-Acked-by: Thomas Gleixner Signed-off-by: Peter Zijlstra LKML-Reference: Cc: stable@kernel.org # .37 Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index c914ec7..ad62677 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -625,7 +625,7 @@ static void update_curr_rt(struct rq *rq) struct rt_rq *rt_rq = rt_rq_of_se(rt_se); u64 delta_exec; - if (!task_has_rt_policy(curr)) + if (curr->sched_class != &rt_sched_class) return; delta_exec = rq->clock_task - curr->se.exec_start; -- cgit v1.1 From 654986462939cd7ec18f276c6379a334dac106a7 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 26 Jan 2011 17:26:22 -0500 Subject: tracepoints: Fix section alignment using pointer array Make the tracepoints more robust, making them solid enough to handle compiler changes by not relying on anything based on compiler-specific behavior with respect to structure alignment. Implement an approach proposed by David Miller: use an array of const pointers to refer to the individual structures, and export this pointer array through the linker script rather than the structures per se. It will consume 32 extra bytes per tracepoint (24 for structure padding and 8 for the pointers), but is less likely to break due to compiler changes. History: commit 7e066fb8 ("tracepoints: add DECLARE_TRACE() and DEFINE_TRACE()") added the aligned(32) type and variable attribute to the tracepoint structures to deal with gcc happily aligning statically defined structures on 32-byte multiples. One attempt was to use an 8-byte alignment for tracepoint structures by applying both the variable and type attribute to tracepoint structure definitions and declarations. It worked fine with gcc 4.5.1, but broke with gcc 4.4.4 and 4.4.5. The reason is that the "aligned" attribute only specifies the _minimum_ alignment for a structure, leaving both the compiler and the linker free to align on larger multiples. Because tracepoint.c expects the structures to be placed as an array within each section, up-alignment causes NULL-pointer exceptions due to the extra unexpected padding. (this patch applies on top of -tip) Signed-off-by: Mathieu Desnoyers Acked-by: David S.
Miller LKML-Reference: <20110126222622.GA10794@Krystal> CC: Frederic Weisbecker CC: Ingo Molnar CC: Thomas Gleixner CC: Andrew Morton CC: Peter Zijlstra CC: Rusty Russell Signed-off-by: Steven Rostedt --- kernel/module.c | 16 ++++++++-------- kernel/tracepoint.c | 31 ++++++++++++++++--------------- 2 files changed, 24 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 34e00b7..efa290e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2460,9 +2460,9 @@ static void find_module_sections(struct module *mod, struct load_info *info) #endif #ifdef CONFIG_TRACEPOINTS - mod->tracepoints = section_objs(info, "__tracepoints", - sizeof(*mod->tracepoints), - &mod->num_tracepoints); + mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", + sizeof(*mod->tracepoints_ptrs), + &mod->num_tracepoints); #endif #ifdef HAVE_JUMP_LABEL mod->jump_entries = section_objs(info, "__jump_table", @@ -3393,7 +3393,7 @@ void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp, struct kernel_symbol *ks, - struct tracepoint *tp) + struct tracepoint * const *tp) { } EXPORT_SYMBOL(module_layout); @@ -3407,8 +3407,8 @@ void module_update_tracepoints(void) mutex_lock(&module_mutex); list_for_each_entry(mod, &modules, list) if (!mod->taints) - tracepoint_update_probe_range(mod->tracepoints, - mod->tracepoints + mod->num_tracepoints); + tracepoint_update_probe_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints); mutex_unlock(&module_mutex); } @@ -3432,8 +3432,8 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter) else if (iter_mod > iter->module) iter->tracepoint = NULL; found = tracepoint_get_iter_range(&iter->tracepoint, - iter_mod->tracepoints, - iter_mod->tracepoints + iter_mod->tracepoints_ptrs, + iter_mod->tracepoints_ptrs + iter_mod->num_tracepoints); if (found) { iter->module = iter_mod; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index e95ee7f..68187af 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -27,8 +27,8 @@ #include #include -extern struct tracepoint __start___tracepoints[]; -extern struct tracepoint __stop___tracepoints[]; +extern struct tracepoint * const __start___tracepoints_ptrs[]; +extern struct tracepoint * const __stop___tracepoints_ptrs[]; /* Set to 1 to enable tracepoint debug output */ static const int tracepoint_debug; @@ -298,10 +298,10 @@ static void disable_tracepoint(struct tracepoint *elem) * * Updates the probe callback corresponding to a range of tracepoints. 
*/ -void -tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) +void tracepoint_update_probe_range(struct tracepoint * const *begin, + struct tracepoint * const *end) { - struct tracepoint *iter; + struct tracepoint * const *iter; struct tracepoint_entry *mark_entry; if (!begin) @@ -309,12 +309,12 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) mutex_lock(&tracepoints_mutex); for (iter = begin; iter < end; iter++) { - mark_entry = get_tracepoint(iter->name); + mark_entry = get_tracepoint((*iter)->name); if (mark_entry) { - set_tracepoint(&mark_entry, iter, + set_tracepoint(&mark_entry, *iter, !!mark_entry->refcount); } else { - disable_tracepoint(iter); + disable_tracepoint(*iter); } } mutex_unlock(&tracepoints_mutex); @@ -326,8 +326,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) static void tracepoint_update_probes(void) { /* Core kernel tracepoints */ - tracepoint_update_probe_range(__start___tracepoints, - __stop___tracepoints); + tracepoint_update_probe_range(__start___tracepoints_ptrs, + __stop___tracepoints_ptrs); /* tracepoints in modules. */ module_update_tracepoints(); } @@ -514,8 +514,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); * Will return the first tracepoint in the range if the input tracepoint is * NULL. */ -int tracepoint_get_iter_range(struct tracepoint **tracepoint, - struct tracepoint *begin, struct tracepoint *end) +int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, + struct tracepoint * const *begin, struct tracepoint * const *end) { if (!*tracepoint && begin != end) { *tracepoint = begin; @@ -534,7 +534,8 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter) /* Core kernel tracepoints */ if (!iter->module) { found = tracepoint_get_iter_range(&iter->tracepoint, - __start___tracepoints, __stop___tracepoints); + __start___tracepoints_ptrs, + __stop___tracepoints_ptrs); if (found) goto end; } @@ -585,8 +586,8 @@ int tracepoint_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: case MODULE_STATE_GOING: - tracepoint_update_probe_range(mod->tracepoints, - mod->tracepoints + mod->num_tracepoints); + tracepoint_update_probe_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints); break; } return 0; -- cgit v1.1 From 3d56e331b6537671c66f1b510bed0f1e0331dfc8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Feb 2011 17:06:09 -0500 Subject: tracing: Replace syscall_meta_data struct array with pointer array Currently the syscall_meta structures for the syscall tracepoints are placed in the __syscall_metadata section, and at link time, the linker makes one large array of all these syscall metadata structures. On boot up, this array is read (much like the initcall sections) and the syscall data is processed. The problem is that there is no guarantee that gcc will place complex structures nicely together in an array format. Two structures in the same file may be placed awkwardly, because gcc has no clue that they are supposed to be in an array. A hack was used previously to force the alignment to 4, to pack the structures together. But this caused alignment issues with other architectures (sparc). Instead of packing the structures into an array, the structures' addresses are now put into the __syscall_metadata section. As pointers are always naturally aligned, gcc should always pack them tightly together (otherwise initcall, extable, etc. would also fail).
By having the pointers to the structures in the section, we can still iterate the syscall metadata without causing unnecessary alignment problems with other architectures, or depending on the current behaviour of gcc that will likely change in the future just to tick us kernel developers off a little more. The __syscall_metadata section is also moved into the .init.data section as it is now only needed at boot up. Suggested-by: David Miller Acked-by: David S. Miller Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b706529..5c9fe08 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -55,20 +55,21 @@ struct ftrace_event_class event_class_syscall_exit = { .raw_init = init_syscall_trace, }; -extern unsigned long __start_syscalls_metadata[]; -extern unsigned long __stop_syscalls_metadata[]; +extern struct syscall_metadata *__start_syscalls_metadata[]; +extern struct syscall_metadata *__stop_syscalls_metadata[]; static struct syscall_metadata **syscalls_metadata; -static struct syscall_metadata *find_syscall_meta(unsigned long syscall) +static __init struct syscall_metadata * +find_syscall_meta(unsigned long syscall) { - struct syscall_metadata *start; - struct syscall_metadata *stop; + struct syscall_metadata **start; + struct syscall_metadata **stop; char str[KSYM_SYMBOL_LEN]; - start = (struct syscall_metadata *)__start_syscalls_metadata; - stop = (struct syscall_metadata *)__stop_syscalls_metadata; + start = __start_syscalls_metadata; + stop = __stop_syscalls_metadata; kallsyms_lookup(syscall, NULL, NULL, NULL, str); for ( ; start < stop; start++) { @@ -78,8 +79,8 @@ static struct syscall_metadata *find_syscall_meta(unsigned long syscall) * with "SyS" instead of "sys", leading to an unwanted * mismatch. */ - if (start->name && !strcmp(start->name + 3, str + 3)) - return start; + if ((*start)->name && !strcmp((*start)->name + 3, str + 3)) + return *start; } return NULL; } -- cgit v1.1 From f266a5110d453b7987194460ac7edd31f1a5426c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Feb 2011 15:09:41 +0100 Subject: lockdep, timer: Fix del_timer_sync() annotation Calling local_bh_enable() will actually want to start processing softirqs, which isn't a good idea since this can get called with IRQs disabled. Cure this by using _local_bh_enable(), which doesn't start processing softirqs, and use raw_local_irq_save() to prevent any softirqs from happening without letting lockdep think IRQs are in fact disabled.
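A sketch of the kind of caller that makes the plain local_bh_enable() unsafe here (hypothetical driver code, not from the patch):

    spin_lock_irqsave(&dev->lock, flags);   /* hardirqs are now off */
    del_timer_sync(&dev->timer);            /* the CONFIG_LOCKDEP-only
                                             * annotation used to call
                                             * local_bh_enable(), which may
                                             * start running softirqs --
                                             * not allowed with IRQs off */
    spin_unlock_irqrestore(&dev->lock, flags);

_local_bh_enable() merely drops the softirq count without running pending softirqs, and the raw_local_irq_save()/raw_local_irq_restore() pair keeps softirqs out while bypassing lockdep's IRQ-state tracking, so lockdep is not misled about the interrupt state.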
Reported-by: Nick Bowler Signed-off-by: Peter Zijlstra Reviewed-by: Yong Zhang LKML-Reference: <20110203141548.039540914@chello.nl> Signed-off-by: Thomas Gleixner --- kernel/timer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 43ca993..d53ce66 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -969,10 +969,14 @@ EXPORT_SYMBOL(try_to_del_timer_sync); int del_timer_sync(struct timer_list *timer) { #ifdef CONFIG_LOCKDEP + unsigned long flags; + + raw_local_irq_save(flags); local_bh_disable(); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); - local_bh_enable(); + _local_bh_enable(); + raw_local_irq_restore(flags); #endif /* * don't use it in hardirq context, because it -- cgit v1.1 From 2edeaa34a6e3f2c43b667f6c4f7b27944b811695 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 7 Feb 2011 13:36:10 +0000 Subject: CRED: Fix BUG() upon security_cred_alloc_blank() failure In cred_alloc_blank() since 2.6.32, abort_creds(new) is called with new->security == NULL and new->magic == 0 when security_cred_alloc_blank() returns an error. As a result, BUG() will be triggered if SELinux is enabled or CONFIG_DEBUG_CREDENTIALS=y. If CONFIG_DEBUG_CREDENTIALS=y, BUG() is called from __invalid_creds() because cred->magic == 0. Failing that, BUG() is called from selinux_cred_free() because selinux_cred_free() is not expecting cred->security == NULL. This does not affect smack_cred_free(), tomoyo_cred_free() or apparmor_cred_free(). Fix these bugs by (1) setting new->magic before calling security_cred_alloc_blank() and (2) handling a NULL cred->security in creds_are_invalid() and selinux_cred_free(). Signed-off-by: Tetsuo Handa Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/cred.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 6a1aa00..fcf104b 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -252,13 +252,13 @@ struct cred *cred_alloc_blank(void) #endif atomic_set(&new->usage, 1); +#ifdef CONFIG_DEBUG_CREDENTIALS + new->magic = CRED_MAGIC; +#endif if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) goto error; -#ifdef CONFIG_DEBUG_CREDENTIALS - new->magic = CRED_MAGIC; -#endif return new; error: @@ -748,7 +748,11 @@ bool creds_are_invalid(const struct cred *cred) if (cred->magic != CRED_MAGIC) return true; #ifdef CONFIG_SECURITY_SELINUX - if (selinux_is_enabled()) { + /* + * cred->security == NULL if security_cred_alloc_blank() or + * security_prepare_creds() returned an error. + */ + if (selinux_is_enabled() && cred->security) { if ((unsigned long) cred->security < PAGE_SIZE) return true; if ((*(u32 *)cred->security & 0xffffff00) == -- cgit v1.1 From fb2b2a1d37f80cc818fd4487b510f4e11816e5e1 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 7 Feb 2011 13:36:16 +0000 Subject: CRED: Fix memory and refcount leaks upon security_prepare_creds() failure In prepare_kernel_cred() since 2.6.29, put_cred(new) is called without assigning new->usage when security_prepare_creds() returns an error. As a result, the memory for new and the refcounts for new->{user,group_info,tgcred} are leaked because put_cred(new) won't call __put_cred() unless old->usage == 1. Fix these leaks by assigning new->usage (and new->subscribers, which was added in 2.6.32) before calling security_prepare_creds().
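The general rule restored here is that a refcounted object must carry a valid count before any error path can drop it; a generic sketch with hypothetical helpers (setup(), put_obj() -- not the actual cred code):

    struct obj *obj_copy(const struct obj *old)
    {
            struct obj *new = kmalloc(sizeof(*new), GFP_KERNEL);

            if (!new)
                    return NULL;
            *new = *old;                    /* copies old's usage count too */
            atomic_set(&new->usage, 1);     /* reset before any put_obj(new) */

            if (setup(new) < 0) {           /* stand-in for the security hook */
                    put_obj(new);           /* 1 -> 0: really frees new */
                    return NULL;
            }
            return new;
    }

Without the atomic_set() ahead of the error path, put_obj(new) would see the copied count, skip the final free, and leak both the object and the references taken on its members.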
Signed-off-by: Tetsuo Handa Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/cred.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index fcf104b..3a9d6dd 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -657,6 +657,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) validate_creds(old); *new = *old; + atomic_set(&new->usage, 1); + set_cred_subscribers(new, 0); get_uid(new->user); get_group_info(new->group_info); @@ -674,8 +676,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (security_prepare_creds(new, old, GFP_KERNEL) < 0) goto error; - atomic_set(&new->usage, 1); - set_cred_subscribers(new, 0); put_cred(old); validate_creds(new); return new; -- cgit v1.1 From ee24aebffb75a7f940cf52c8cf6910947b3130c0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 10 Feb 2011 17:53:55 -0800 Subject: cap_syslog: accept CAP_SYS_ADMIN for now In commit ce6ada35bdf7 ("security: Define CAP_SYSLOG") Serge Hallyn introduced CAP_SYSLOG, but broke backwards compatibility by no longer accepting CAP_SYS_ADMIN as an override (it would cause a warning and then reject the operation). Re-instate CAP_SYS_ADMIN - but keeping the warning - as an acceptable capability until any legacy applications have been updated. There are apparently applications out there that drop all capabilities except for CAP_SYS_ADMIN in order to access the syslog. (This is a re-implementation of a patch by Serge, cleaning the logic up and making the code more readable) Acked-by: Serge Hallyn Reviewed-by: James Morris Signed-off-by: Linus Torvalds --- kernel/printk.c | 54 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 2ddbdc7..3623152 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -262,25 +262,47 @@ int dmesg_restrict = 1; int dmesg_restrict; #endif +static int syslog_action_restricted(int type) +{ + if (dmesg_restrict) + return 1; + /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ + return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; +} + +static int check_syslog_permissions(int type, bool from_file) +{ + /* + * If this is from /proc/kmsg and we've already opened it, then we've + * already done the capabilities checks at open time. + */ + if (from_file && type != SYSLOG_ACTION_OPEN) + return 0; + + if (syslog_action_restricted(type)) { + if (capable(CAP_SYSLOG)) + return 0; + /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ + if (capable(CAP_SYS_ADMIN)) { + WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " + "but no CAP_SYSLOG (deprecated).\n"); + return 0; + } + return -EPERM; + } + return 0; +} + int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; int do_clear = 0; char c; - int error = 0; + int error; - /* - * If this is from /proc/kmsg we only do the capabilities checks - * at open time. 
- */ - if (type == SYSLOG_ACTION_OPEN || !from_file) { - if (dmesg_restrict && !capable(CAP_SYSLOG)) - goto warn; /* switch to return -EPERM after 2.6.39 */ - if ((type != SYSLOG_ACTION_READ_ALL && - type != SYSLOG_ACTION_SIZE_BUFFER) && - !capable(CAP_SYSLOG)) - goto warn; /* switch to return -EPERM after 2.6.39 */ - } + error = check_syslog_permissions(type, from_file); + if (error) + goto out; error = security_syslog(type); if (error) @@ -423,12 +445,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) } out: return error; -warn: - /* remove after 2.6.39 */ - if (capable(CAP_SYS_ADMIN)) - WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " - "but no CAP_SYSLOG (deprecated and denied).\n"); - return -EPERM; } SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) -- cgit v1.1 From 6037b715d6fab139742c3df8851db4c823081561 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Wed, 9 Feb 2011 22:11:51 -0800 Subject: security: add cred argument to security_capable() Expand security_capable() to include cred, so that it can be used in a wider range of call sites. Signed-off-by: Chris Wright Acked-by: Serge Hallyn Signed-off-by: James Morris --- kernel/capability.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 2f05303..9e9385f 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -306,7 +306,7 @@ int capable(int cap) BUG(); } - if (security_capable(cap) == 0) { + if (security_capable(current_cred(), cap) == 0) { current->flags |= PF_SUPERPRIV; return 1; } -- cgit v1.1 From 01e05e9a90b8f4c3997ae0537e87720eb475e532 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 10 Feb 2011 15:01:22 -0800 Subject: ptrace: use safer wake up on ptrace_detach() The wake_up_process() call in ptrace_detach() is spurious and not interlocked with the tracee state. IOW, the tracee could be running or sleeping in any place in the kernel by the time wake_up_process() is called. This can lead to the tracee waking up unexpectedly which can be dangerous. The wake_up is spurious and should be removed but for now reduce its toxicity by only waking up if the tracee is in TRACED or STOPPED state. This bug can possibly be used as an attack vector. I don't think it will take too much effort to come up with an attack which triggers an oops somewhere. Most sleeps are wrapped in condition test loops and should be safe, but we have quite a number of places where sleep and wakeup conditions are expected to be interlocked. Although the window of opportunity is tiny, ptrace can be used by non-privileged users, and with some loading the window can definitely be extended and exploited. Signed-off-by: Tejun Heo Acked-by: Roland McGrath Acked-by: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 99bbaa3..1708b1e 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -313,7 +313,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) child->exit_code = data; dead = __ptrace_detach(current, child); if (!child->exit_state) - wake_up_process(child); + wake_up_state(child, TASK_TRACED | TASK_STOPPED); } write_unlock_irq(&tasklist_lock); -- cgit v1.1
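For reference, the semantic difference between the two wakeups, in brief (illustrative summary, simplified):

    /*
     * wake_up_process(p):
     *   passes a catch-all state mask to try_to_wake_up(), so p is
     *   woken out of whatever sleep it happens to be in, anywhere
     *   in the kernel.
     *
     * wake_up_state(p, TASK_TRACED | TASK_STOPPED):
     *   wakes p only if p->state matches the given mask, leaving an
     *   unrelated interruptible/uninterruptible sleep undisturbed.
     */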